abstractcore 2.9.1__py3-none-any.whl → 2.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +7 -27
- abstractcore/apps/extractor.py +33 -100
- abstractcore/apps/intent.py +19 -0
- abstractcore/apps/judge.py +20 -1
- abstractcore/apps/summarizer.py +20 -1
- abstractcore/architectures/detection.py +34 -1
- abstractcore/architectures/response_postprocessing.py +313 -0
- abstractcore/assets/architecture_formats.json +38 -8
- abstractcore/assets/model_capabilities.json +781 -160
- abstractcore/compression/__init__.py +1 -2
- abstractcore/compression/glyph_processor.py +6 -4
- abstractcore/config/main.py +31 -19
- abstractcore/config/manager.py +389 -11
- abstractcore/config/vision_config.py +5 -5
- abstractcore/core/interface.py +151 -3
- abstractcore/core/session.py +16 -10
- abstractcore/download.py +1 -1
- abstractcore/embeddings/manager.py +20 -6
- abstractcore/endpoint/__init__.py +2 -0
- abstractcore/endpoint/app.py +458 -0
- abstractcore/mcp/client.py +3 -1
- abstractcore/media/__init__.py +52 -17
- abstractcore/media/auto_handler.py +42 -22
- abstractcore/media/base.py +44 -1
- abstractcore/media/capabilities.py +12 -33
- abstractcore/media/enrichment.py +105 -0
- abstractcore/media/handlers/anthropic_handler.py +19 -28
- abstractcore/media/handlers/local_handler.py +124 -70
- abstractcore/media/handlers/openai_handler.py +19 -31
- abstractcore/media/processors/__init__.py +4 -2
- abstractcore/media/processors/audio_processor.py +57 -0
- abstractcore/media/processors/office_processor.py +8 -3
- abstractcore/media/processors/pdf_processor.py +46 -3
- abstractcore/media/processors/text_processor.py +22 -24
- abstractcore/media/processors/video_processor.py +58 -0
- abstractcore/media/types.py +97 -4
- abstractcore/media/utils/image_scaler.py +20 -2
- abstractcore/media/utils/video_frames.py +219 -0
- abstractcore/media/vision_fallback.py +136 -22
- abstractcore/processing/__init__.py +32 -3
- abstractcore/processing/basic_deepsearch.py +15 -10
- abstractcore/processing/basic_intent.py +3 -2
- abstractcore/processing/basic_judge.py +3 -2
- abstractcore/processing/basic_summarizer.py +1 -1
- abstractcore/providers/__init__.py +3 -1
- abstractcore/providers/anthropic_provider.py +95 -8
- abstractcore/providers/base.py +1516 -81
- abstractcore/providers/huggingface_provider.py +546 -69
- abstractcore/providers/lmstudio_provider.py +35 -923
- abstractcore/providers/mlx_provider.py +382 -35
- abstractcore/providers/model_capabilities.py +5 -1
- abstractcore/providers/ollama_provider.py +99 -15
- abstractcore/providers/openai_compatible_provider.py +406 -180
- abstractcore/providers/openai_provider.py +188 -44
- abstractcore/providers/openrouter_provider.py +76 -0
- abstractcore/providers/registry.py +61 -5
- abstractcore/providers/streaming.py +138 -33
- abstractcore/providers/vllm_provider.py +92 -817
- abstractcore/server/app.py +461 -13
- abstractcore/server/audio_endpoints.py +139 -0
- abstractcore/server/vision_endpoints.py +1319 -0
- abstractcore/structured/handler.py +316 -41
- abstractcore/tools/common_tools.py +5501 -2012
- abstractcore/tools/comms_tools.py +1641 -0
- abstractcore/tools/core.py +37 -7
- abstractcore/tools/handler.py +4 -9
- abstractcore/tools/parser.py +49 -2
- abstractcore/tools/tag_rewriter.py +2 -1
- abstractcore/tools/telegram_tdlib.py +407 -0
- abstractcore/tools/telegram_tools.py +261 -0
- abstractcore/utils/cli.py +1085 -72
- abstractcore/utils/token_utils.py +2 -0
- abstractcore/utils/truncation.py +29 -0
- abstractcore/utils/version.py +3 -4
- abstractcore/utils/vlm_token_calculator.py +12 -2
- abstractcore-2.11.2.dist-info/METADATA +562 -0
- abstractcore-2.11.2.dist-info/RECORD +133 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
- abstractcore-2.9.1.dist-info/METADATA +0 -1190
- abstractcore-2.9.1.dist-info/RECORD +0 -119
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/__init__.py
CHANGED
@@ -35,30 +35,16 @@ from .core.types import GenerateResponse, Message
 from .core.enums import ModelParameter, ModelCapability, MessageRole
 from .exceptions import ModelNotFoundError, ProviderAPIError, AuthenticationError
 
-#
-
-
-    _has_embeddings = True
-except ImportError:
-    _has_embeddings = False
+# Processing helpers (lightweight; do not import optional tool/media deps here).
+from .processing.basic_summarizer import BasicSummarizer, SummaryStyle, SummaryLength
+from .processing.basic_extractor import BasicExtractor
 
-#
-from .
-    _has_processing = True
-
-# Tools module (core functionality)
-from .tools import tool
+# Tools: the decorator is dependency-free (built-in tool library lives in abstractcore.tools.common_tools).
+from .tools.core import tool
 
 # Download module (core functionality)
 from .download import download_model, DownloadProgress, DownloadStatus
 
-# Compression module (optional import)
-try:
-    from .compression import GlyphConfig, CompressionOrchestrator
-    _has_compression = True
-except ImportError:
-    _has_compression = False
-
 __all__ = [
     'create_llm',
     'BasicSession',
@@ -76,11 +62,5 @@ __all__ = [
     'DownloadStatus',
 ]
 
-
-
-
-if _has_compression:
-    __all__.extend(['GlyphConfig', 'CompressionOrchestrator'])
-
-# Processing is core functionality
-__all__.extend(['BasicSummarizer', 'SummaryStyle', 'SummaryLength', 'BasicExtractor'])
+# Processing helpers are part of the default install.
+__all__.extend(['BasicSummarizer', 'SummaryStyle', 'SummaryLength', 'BasicExtractor'])
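The `__init__.py` change above drops the conditional `_has_embeddings`/`_has_compression` plumbing and imports the processing helpers and the `tool` decorator eagerly. A minimal sketch of the import surface this implies, assuming everything listed in `__all__` stays importable from the package root; the compression import in the comment is an assumption based on the removed re-export, not something this diff confirms:

```python
# Sketch only: import surface implied by the 2.11.x abstractcore/__init__.py above.
from abstractcore import create_llm, BasicSession            # core API (listed in __all__)
from abstractcore import BasicSummarizer, SummaryStyle, SummaryLength, BasicExtractor
from abstractcore import tool                                 # re-exported from abstractcore.tools.core

# Compression is no longer re-exported at the root, so (assumption) it would be
# imported from its subpackage when the optional extra is installed:
# from abstractcore.compression import GlyphConfig, CompressionOrchestrator
```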
abstractcore/apps/extractor.py
CHANGED
@@ -1,38 +1,10 @@
 #!/usr/bin/env python3
 """
-AbstractCore
-
-Usage:
-    python -m abstractcore.apps.extractor <file_path> [options]
-
-Options:
-    --focus <focus>               Specific focus area for extraction (e.g., "technology", "business", "medical")
-    --style <style>               Extraction style (structured, focused, minimal, comprehensive, default: structured)
-    --length <length>             Extraction depth (brief, standard, detailed, comprehensive, default: standard)
-    --entity-types <types>        Comma-separated entity types to focus on (person,organization,location,etc.)
-    --similarity-threshold <t>    Similarity threshold for entity deduplication (0.0-1.0, default: 0.85)
-    --format <format>             Output format (json-ld, json, yaml, triples, default: json-ld)
-    --output <output>             Output file path (optional, prints to console if not provided)
-    --chunk-size <size>           Chunk size in characters (default: 6000, max: 32000)
-    --provider <provider>         LLM provider (requires --model)
-    --model <model>               LLM model (requires --provider)
-    --no-embeddings               Disable semantic entity deduplication
-    --fast                        Use fast extraction (skip verification, larger chunks, no embeddings)
-    --iterate <number>            Number of refinement iterations (default: 1, finds missing entities and verifies relationships)
-    --minified                    Output minified JSON-LD (compact, no indentation)
-    --verbose                     Show detailed progress information
-    --timeout <seconds>           HTTP timeout for LLM providers (default: 300, increase for large models)
-    --max-tokens <tokens>         Maximum total tokens for LLM context (default: 32000)
-    --max-output-tokens <tokens>  Maximum tokens for LLM output generation (default: 8000)
-    --help                        Show this help message
+AbstractCore entity & relationship extraction CLI.
 
-
-
-
-    python -m abstractcore.apps.extractor data.md --entity-types person,organization --output kg.jsonld
-    python -m abstractcore.apps.extractor large.txt --fast --minified --verbose  # Fast, compact output
-    python -m abstractcore.apps.extractor report.txt --length detailed --provider openai --model gpt-4o-mini
-    python -m abstractcore.apps.extractor doc.txt --iterate 3 --verbose  # 3 refinement passes for higher quality
+Run:
+  - extractor <file> --help
+  - python -m abstractcore.apps.extractor <file> --help
 """
 
 import argparse
@@ -86,6 +58,25 @@ def read_file_content(file_path: str) -> str:
     if not file_path_obj.is_file():
         raise ValueError(f"Path is not a file: {file_path}")
 
+    # Use the Media system for non-text documents when available (PDF/Office).
+    rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+    if file_path_obj.suffix.lower() in rich_doc_exts:
+        try:
+            from ..media import process_file
+
+            media_content = process_file(str(file_path_obj))
+            content = getattr(media_content, "content", "")
+            if isinstance(content, bytes):
+                return content.decode("utf-8", errors="ignore")
+            return str(content or "")
+        except ImportError as e:
+            raise ImportError(
+                f"Reading {file_path_obj.suffix.lower()} files requires media dependencies. "
+                f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+            ) from e
+        except Exception as e:
+            raise Exception(f"Failed to extract content from {file_path}: {e}") from e
+
     # Try to read as text file
     try:
         # Try UTF-8 first
@@ -162,7 +153,7 @@ Examples:
   python -m abstractcore.apps.extractor document.pdf
   python -m abstractcore.apps.extractor report.txt --focus=technology --style=structured --verbose
   python -m abstractcore.apps.extractor data.md --entity-types=person,organization --output=kg.jsonld
-  python -m abstractcore.apps.extractor large.txt --length=detailed --
+  python -m abstractcore.apps.extractor large.txt --length=detailed --minified --verbose
   python -m abstractcore.apps.extractor doc.txt --iterate=3 --verbose  # Iterative refinement for quality
   python -m abstractcore.apps.extractor doc.txt --format=triples --verbose  # RDF triples output
   python -m abstractcore.apps.extractor doc.txt --format=triples --output=triples.txt  # Simple triples
@@ -179,20 +170,19 @@ Output options:
   - Default: Pretty-printed JSON with indentation
   - --minified: Compact JSON without indentation (smaller file size)
 
-Performance
-  -
-  - --
-  -
+Performance notes:
+  - Extraction is an LLM call; latency depends on provider/model and input size.
+  - For large files, increase --chunk-size to reduce the number of LLM calls (at the cost of context usage).
+  - Use --iterate for refinement (higher quality, more calls).
 
 Quality enhancement:
   - --iterate=N: Perform N refinement passes to find missing entities/relationships
   - Each iteration reviews the extraction to find gaps and verify relationship directionality
-  -
+  - Tip: Start with 1 (default), then increase if you need higher recall.
 
 Default model setup:
   - Requires Ollama: https://ollama.com/
   - Download model: ollama pull qwen3:4b-instruct-2507-q4_K_M
-  - For best performance: qwen3-coder:30b or gpt-oss:120b
   - Or use --provider and --model for other providers
 """
     )
@@ -226,13 +216,6 @@ Default model setup:
         help='Comma-separated entity types to focus on (person,organization,location,concept,event,technology,product,date,other)'
     )
 
-    parser.add_argument(
-        '--similarity-threshold',
-        type=float,
-        default=0.85,
-        help='Similarity threshold for entity deduplication (0.0-1.0, default: 0.85)'
-    )
-
     # Build format choices based on available dependencies
    format_choices = ['json-ld', 'triples', 'json']
     if YAML_AVAILABLE:
@@ -267,25 +250,6 @@ Default model setup:
         help='LLM model (requires --provider)'
     )
 
-    parser.add_argument(
-        '--no-embeddings',
-        action='store_true',
-        help='Disable semantic entity deduplication'
-    )
-
-    parser.add_argument(
-        '--mode',
-        choices=['fast', 'balanced', 'thorough'],
-        default='balanced',
-        help='Extraction mode: fast (2-3x faster), balanced (default), thorough (highest quality)'
-    )
-
-    parser.add_argument(
-        '--fast',
-        action='store_true',
-        help='Legacy flag: equivalent to --mode=fast (deprecated, use --mode instead)'
-    )
-
     parser.add_argument(
         '--iterate',
         type=int,
@@ -330,11 +294,6 @@ Default model setup:
     args = parser.parse_args()
 
     try:
-        # Validate similarity threshold
-        if not 0.0 <= args.similarity_threshold <= 1.0:
-            print("Error: Similarity threshold must be between 0.0 and 1.0")
-            sys.exit(1)
-
         # Validate chunk size
         if args.chunk_size < 1000:
             print("Error: Chunk size must be at least 1000 characters")
@@ -383,43 +342,18 @@ Default model setup:
         extraction_style = parse_extraction_style(args.style)
         extraction_length = parse_extraction_length(args.length)
 
-        # Determine extraction mode (handle legacy --fast flag)
-        extraction_mode = args.mode
-        if args.fast:
-            extraction_mode = "fast"
-
         # Initialize LLM and extractor
-        use_embeddings = not args.no_embeddings
-
         if args.provider and args.model:
-            # Custom provider/model with max_tokens adjusted for chunk size and provider limits
-            max_tokens = max(32000, args.chunk_size)
-
             # Adjust chunk size for provider-specific limits first
             adjusted_chunk_size = args.chunk_size
 
-            #
+            # Provider-specific safety: some models work better with smaller per-chunk payloads.
             if args.provider.lower() == "anthropic":
-                # Claude models have varying context limits
                 if "haiku" in args.model.lower():
-                    # Claude 3.5 Haiku: 200K tokens total
-                    max_tokens = min(max_tokens, 150000)  # Leave room for output
-                    max_output_tokens = 4000  # Conservative output limit
                     adjusted_chunk_size = min(args.chunk_size, 4000)  # Smaller chunks for Haiku
-                elif "sonnet" in args.model.lower():
-                    # Claude 3.5 Sonnet: 200K tokens
-                    max_tokens = min(max_tokens, 180000)
-                    max_output_tokens = 8000
-                else:
-                    # Claude 3 Opus or other: assume 200K
-                    max_tokens = min(max_tokens, 180000)
-                    max_output_tokens = 8000
-            else:
-                # Default for other providers
-                max_output_tokens = 8000
 
             if args.verbose:
-                print(f"Initializing BasicExtractor (
+                print(f"Initializing BasicExtractor ({args.provider}, {args.model}, {args.max_tokens} token context, {args.max_output_tokens} output tokens)...")
             if adjusted_chunk_size != args.chunk_size:
                 print(f"Adjusted chunk size from {args.chunk_size} to {adjusted_chunk_size} characters for {args.provider} compatibility")
 
@@ -435,7 +369,7 @@ Default model setup:
         else:
             # Default configuration
             if args.verbose:
-                print(f"Initializing BasicExtractor (
+                print(f"Initializing BasicExtractor (ollama, qwen3:4b-instruct-2507-q4_K_M, {args.max_tokens} token context, {args.max_output_tokens} output tokens)...")
 
             try:
                 extractor = BasicExtractor(
@@ -450,7 +384,6 @@ Default model setup:
                 print("\n🚀 Quick alternatives to get started:")
                 print("  - Use --provider and --model to specify an available provider")
                 print("  - Example: extractor document.txt --provider openai --model gpt-4o-mini")
-                print("  - For speed: extractor document.txt --mode=fast")
                 sys.exit(1)
 
         # Extract entities and relationships
@@ -604,4 +537,4 @@ Default model setup:
 
 
 if __name__ == "__main__":
-    main()
+    main()
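The `read_file_content` hunk above is the substantive change: the extractor (and, as the following sections show, intent, judge, and summarizer) now routes PDF/Office files through `abstractcore.media.process_file` before falling back to a plain-text read. A standalone sketch of that shared pattern, assuming only what the hunk itself shows (`process_file` accepts a path string and returns an object with a `.content` attribute); the helper name `read_document_text` is hypothetical and not part of the package:

```python
from pathlib import Path

RICH_DOC_EXTS = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}

def read_document_text(path: str) -> str:
    """Hypothetical helper mirroring the block the four CLI apps now embed."""
    p = Path(path)
    if p.suffix.lower() in RICH_DOC_EXTS:
        try:
            from abstractcore.media import process_file  # optional [media] extra
        except ImportError as e:
            raise ImportError(
                f'Reading {p.suffix.lower()} files requires: pip install "abstractcore[media]"'
            ) from e
        content = getattr(process_file(str(p)), "content", "")
        if isinstance(content, bytes):
            return content.decode("utf-8", errors="ignore")
        return str(content or "")
    # Plain-text fallback, UTF-8 first, mirroring the apps' existing behavior.
    return p.read_text(encoding="utf-8", errors="ignore")
```

Each app keeps its own copy of this block rather than sharing a helper, which is why the same +19-line hunk appears four times in this diff.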
abstractcore/apps/intent.py
CHANGED
@@ -127,6 +127,25 @@ def read_file_content(file_path: str) -> str:
     if not file_path_obj.is_file():
         raise ValueError(f"Path is not a file: {file_path}")
 
+    # Use the Media system for non-text documents when available (PDF/Office).
+    rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+    if file_path_obj.suffix.lower() in rich_doc_exts:
+        try:
+            from ..media import process_file
+
+            media_content = process_file(str(file_path_obj))
+            content = getattr(media_content, "content", "")
+            if isinstance(content, bytes):
+                return content.decode("utf-8", errors="ignore")
+            return str(content or "")
+        except ImportError as e:
+            raise ImportError(
+                f"Reading {file_path_obj.suffix.lower()} files requires media dependencies. "
+                f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+            ) from e
+        except Exception as e:
+            raise Exception(f"Failed to extract content from {file_path}: {e}") from e
+
     # Try to read as text file
     try:
         # Try UTF-8 first
abstractcore/apps/judge.py
CHANGED
@@ -72,6 +72,25 @@ def read_content(content_or_path: str) -> str:
     try:
         file_path = Path(content_or_path)
         if file_path.exists() and file_path.is_file():
+            # Use the Media system for non-text documents when available (PDF/Office).
+            rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+            if file_path.suffix.lower() in rich_doc_exts:
+                try:
+                    from ..media import process_file
+
+                    media_content = process_file(str(file_path))
+                    content = getattr(media_content, "content", "")
+                    if isinstance(content, bytes):
+                        return content.decode("utf-8", errors="ignore")
+                    return str(content or "")
+                except ImportError as e:
+                    raise ImportError(
+                        f"Reading {file_path.suffix.lower()} files requires media dependencies. "
+                        f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+                    ) from e
+                except Exception as e:
+                    raise Exception(f"Failed to extract content from {content_or_path}: {e}") from e
+
             # Try to read as text file
             try:
                 # Try UTF-8 first
@@ -628,4 +647,4 @@ Default model setup:
 
 
 if __name__ == "__main__":
-    main()
+    main()
abstractcore/apps/summarizer.py
CHANGED
@@ -90,6 +90,25 @@ def read_file_content(file_path: str) -> str:
     if not file_path_obj.is_file():
         raise ValueError(f"Path is not a file: {file_path}")
 
+    # Use the Media system for non-text documents when available (PDF/Office).
+    rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+    if file_path_obj.suffix.lower() in rich_doc_exts:
+        try:
+            from ..media import process_file
+
+            media_content = process_file(str(file_path_obj))
+            content = getattr(media_content, "content", "")
+            if isinstance(content, bytes):
+                return content.decode("utf-8", errors="ignore")
+            return str(content or "")
+        except ImportError as e:
+            raise ImportError(
+                f"Reading {file_path_obj.suffix.lower()} files requires media dependencies. "
+                f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+            ) from e
+        except Exception as e:
+            raise Exception(f"Failed to extract content from {file_path}: {e}") from e
+
     # Try to read as text file
     try:
         # Try UTF-8 first
@@ -468,4 +487,4 @@ Default model setup:
 
 
 if __name__ == "__main__":
-    main()
+    main()
abstractcore/architectures/detection.py
CHANGED

@@ -42,6 +42,7 @@ _KNOWN_PROVIDER_PREFIXES = {
     "ollama",
     "openai",
     "openai-compatible",
+    "openrouter",
     "together",
     "vllm",
 }
@@ -212,6 +213,30 @@ def resolve_model_alias(model_name: str, models: Dict[str, Any]) -> str:
             return s
         return s.split("/")[-1].strip()
 
+    def _colon_variants(name: str) -> List[str]:
+        """Generate best-effort candidates for Ollama-style `name:tag` ids."""
+        s = str(name or "").strip()
+        if not s or ":" not in s:
+            return []
+        head, rest = s.split(":", 1)
+        head = head.strip()
+        rest = rest.strip()
+        if not head or not rest:
+            return []
+        first = rest.split("-", 1)[0].strip()
+        out: List[str] = []
+        # Avoid over-mapping specialized Ollama names (e.g., `qwen3-coder:30b`) onto
+        # upstream base-model capability entries which may advertise much larger context
+        # windows than the local runtime is configured to support by default.
+        if "-" in head:
+            out.append(head)
+            return out
+        if first:
+            out.append(f"{head}:{first}")
+            out.append(f"{head}-{first}")
+        out.append(head)
+        return out
+
     def _candidates(*names: str) -> List[str]:
         out: List[str] = []
         for n in names:
@@ -219,9 +244,11 @@ def resolve_model_alias(model_name: str, models: Dict[str, Any]) -> str:
             if not s:
                 continue
             out.append(s)
+            out.extend(_colon_variants(s))
             t = _tail(s)
             if t and t != s:
                 out.append(t)
+                out.extend(_colon_variants(t))
         # Deduplicate while preserving order
         uniq: List[str] = []
         seen: set[str] = set()
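The new `_colon_variants` helper is easiest to follow with concrete ids. Below is a lightly condensed standalone copy of the function from the hunk above, with illustrative inputs (the asserts are examples for this writeup, not tests shipped in the package):

```python
from typing import List

def _colon_variants(name: str) -> List[str]:
    """Condensed copy of the helper added above, for illustration only."""
    s = str(name or "").strip()
    if not s or ":" not in s:
        return []
    head, rest = s.split(":", 1)
    head, rest = head.strip(), rest.strip()
    if not head or not rest:
        return []
    first = rest.split("-", 1)[0].strip()
    if "-" in head:
        # Specialized names such as `qwen3-coder:30b` only map to their own family.
        return [head]
    out: List[str] = [f"{head}:{first}", f"{head}-{first}"] if first else []
    out.append(head)
    return out

assert _colon_variants("qwen3:4b-instruct-2507-q4_K_M") == ["qwen3:4b", "qwen3-4b", "qwen3"]
assert _colon_variants("qwen3-coder:30b") == ["qwen3-coder"]
assert _colon_variants("gpt-4o-mini") == []  # no colon, no extra candidates
```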
@@ -378,7 +405,8 @@ def get_model_capabilities(model_name: str) -> Dict[str, Any]:
     except Exception:
         raw_name = ""
 
-
+    placeholder_names = {"default"}
+    if raw_name and raw_name.lower() not in placeholder_names and raw_name not in _default_capabilities_warning_cache:
         _default_capabilities_warning_cache.add(raw_name)
         logger.warning(
             "Model not found in model_capabilities.json; falling back to architecture defaults",
@@ -454,6 +482,11 @@ def supports_audio(model_name: str) -> bool:
 def supports_embeddings(model_name: str) -> bool:
     """Check if model supports embeddings."""
     capabilities = get_model_capabilities(model_name)
+    # Prefer explicit model metadata over name heuristics:
+    # - `model_type: "embedding"` is the canonical signal in `assets/model_capabilities.json`.
+    # - `embedding_support` is a legacy boolean (kept for backwards compatibility).
+    if capabilities.get("model_type") == "embedding":
+        return True
     return capabilities.get("embedding_support", False)
 
 
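The `supports_embeddings` change is a precedence rule: an explicit `model_type: "embedding"` entry wins, and the legacy `embedding_support` boolean is only consulted as a fallback. A minimal sketch of that rule (the capability dicts are hypothetical, not entries copied from `model_capabilities.json`):

```python
def supports_embeddings_from(capabilities: dict) -> bool:
    # Mirrors the check added above: explicit metadata first, legacy flag second.
    if capabilities.get("model_type") == "embedding":
        return True
    return capabilities.get("embedding_support", False)

assert supports_embeddings_from({"model_type": "embedding"}) is True
assert supports_embeddings_from({"embedding_support": True}) is True   # legacy flag still honored
assert supports_embeddings_from({"model_type": "chat"}) is False
```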