abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +12 -0
- abstractcore/apps/__main__.py +8 -1
- abstractcore/apps/deepsearch.py +644 -0
- abstractcore/apps/intent.py +614 -0
- abstractcore/architectures/detection.py +250 -4
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +583 -44
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +66 -1
- abstractcore/config/manager.py +111 -5
- abstractcore/core/session.py +105 -5
- abstractcore/events/__init__.py +1 -1
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/processing/__init__.py +5 -1
- abstractcore/processing/basic_deepsearch.py +2173 -0
- abstractcore/processing/basic_intent.py +690 -0
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +29 -2
- abstractcore/providers/base.py +279 -6
- abstractcore/providers/huggingface_provider.py +658 -27
- abstractcore/providers/lmstudio_provider.py +52 -2
- abstractcore/providers/mlx_provider.py +103 -4
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +44 -6
- abstractcore/providers/openai_provider.py +29 -2
- abstractcore/providers/registry.py +91 -19
- abstractcore/server/app.py +91 -81
- abstractcore/structured/handler.py +161 -1
- abstractcore/tools/common_tools.py +98 -3
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/cli.py +114 -1
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
- abstractcore-2.5.3.dist-info/RECORD +107 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
- abstractcore-2.5.0.dist-info/RECORD +0 -86
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
abstractcore/media/types.py
CHANGED
|
@@ -207,15 +207,51 @@ FILE_TYPE_MAPPINGS = {
|
|
|
207
207
|
'gif': MediaType.IMAGE, 'bmp': MediaType.IMAGE, 'tif': MediaType.IMAGE,
|
|
208
208
|
'tiff': MediaType.IMAGE, 'webp': MediaType.IMAGE, 'ico': MediaType.IMAGE,
|
|
209
209
|
|
|
210
|
-
# Documents
|
|
210
|
+
# Documents (binary formats that need special processing)
|
|
211
211
|
'pdf': MediaType.DOCUMENT, 'doc': MediaType.DOCUMENT, 'docx': MediaType.DOCUMENT,
|
|
212
212
|
'xls': MediaType.DOCUMENT, 'xlsx': MediaType.DOCUMENT, 'ppt': MediaType.DOCUMENT,
|
|
213
213
|
'pptx': MediaType.DOCUMENT, 'odt': MediaType.DOCUMENT, 'rtf': MediaType.DOCUMENT,
|
|
214
214
|
|
|
215
|
-
# Text formats
|
|
216
|
-
'txt': MediaType.TEXT, 'md': MediaType.TEXT, '
|
|
217
|
-
'
|
|
218
|
-
'
|
|
215
|
+
# Text formats - Common markup and data formats
|
|
216
|
+
'txt': MediaType.TEXT, 'md': MediaType.TEXT, 'markdown': MediaType.TEXT,
|
|
217
|
+
'csv': MediaType.TEXT, 'tsv': MediaType.TEXT,
|
|
218
|
+
'json': MediaType.TEXT, 'jsonl': MediaType.TEXT, 'ndjson': MediaType.TEXT,
|
|
219
|
+
'xml': MediaType.TEXT, 'html': MediaType.TEXT, 'htm': MediaType.TEXT,
|
|
220
|
+
'yaml': MediaType.TEXT, 'yml': MediaType.TEXT, 'toml': MediaType.TEXT,
|
|
221
|
+
'ini': MediaType.TEXT, 'cfg': MediaType.TEXT, 'conf': MediaType.TEXT,
|
|
222
|
+
|
|
223
|
+
# Text formats - Programming and scripting languages
|
|
224
|
+
'py': MediaType.TEXT, 'pyw': MediaType.TEXT, 'pyx': MediaType.TEXT,
|
|
225
|
+
'js': MediaType.TEXT, 'jsx': MediaType.TEXT, 'ts': MediaType.TEXT, 'tsx': MediaType.TEXT,
|
|
226
|
+
'java': MediaType.TEXT, 'kt': MediaType.TEXT, 'scala': MediaType.TEXT,
|
|
227
|
+
'c': MediaType.TEXT, 'cpp': MediaType.TEXT, 'cc': MediaType.TEXT, 'cxx': MediaType.TEXT,
|
|
228
|
+
'h': MediaType.TEXT, 'hpp': MediaType.TEXT, 'hxx': MediaType.TEXT,
|
|
229
|
+
'cs': MediaType.TEXT, 'go': MediaType.TEXT, 'rs': MediaType.TEXT, 'swift': MediaType.TEXT,
|
|
230
|
+
'rb': MediaType.TEXT, 'php': MediaType.TEXT, 'pl': MediaType.TEXT, 'pm': MediaType.TEXT,
|
|
231
|
+
'sh': MediaType.TEXT, 'bash': MediaType.TEXT, 'zsh': MediaType.TEXT, 'fish': MediaType.TEXT,
|
|
232
|
+
'r': MediaType.TEXT, 'R': MediaType.TEXT, 'rmd': MediaType.TEXT, 'Rmd': MediaType.TEXT,
|
|
233
|
+
'jl': MediaType.TEXT, 'matlab': MediaType.TEXT, 'm': MediaType.TEXT,
|
|
234
|
+
'sql': MediaType.TEXT, 'lua': MediaType.TEXT, 'vim': MediaType.TEXT,
|
|
235
|
+
'dart': MediaType.TEXT, 'ex': MediaType.TEXT, 'exs': MediaType.TEXT,
|
|
236
|
+
'erl': MediaType.TEXT, 'hrl': MediaType.TEXT, 'clj': MediaType.TEXT, 'cljs': MediaType.TEXT,
|
|
237
|
+
|
|
238
|
+
# Text formats - Notebooks and documentation
|
|
239
|
+
'ipynb': MediaType.TEXT, 'qmd': MediaType.TEXT, 'rst': MediaType.TEXT,
|
|
240
|
+
'tex': MediaType.TEXT, 'latex': MediaType.TEXT, 'bib': MediaType.TEXT,
|
|
241
|
+
'org': MediaType.TEXT, 'adoc': MediaType.TEXT, 'asciidoc': MediaType.TEXT,
|
|
242
|
+
|
|
243
|
+
# Text formats - Web and styles
|
|
244
|
+
'css': MediaType.TEXT, 'scss': MediaType.TEXT, 'sass': MediaType.TEXT, 'less': MediaType.TEXT,
|
|
245
|
+
'vue': MediaType.TEXT, 'svelte': MediaType.TEXT,
|
|
246
|
+
|
|
247
|
+
# Text formats - Build and config files
|
|
248
|
+
'gradle': MediaType.TEXT, 'cmake': MediaType.TEXT, 'make': MediaType.TEXT,
|
|
249
|
+
'dockerfile': MediaType.TEXT, 'containerfile': MediaType.TEXT,
|
|
250
|
+
'gitignore': MediaType.TEXT, 'gitattributes': MediaType.TEXT,
|
|
251
|
+
'env': MediaType.TEXT, 'properties': MediaType.TEXT,
|
|
252
|
+
|
|
253
|
+
# Text formats - Log and output files
|
|
254
|
+
'log': MediaType.TEXT, 'out': MediaType.TEXT, 'err': MediaType.TEXT,
|
|
219
255
|
|
|
220
256
|
# Audio
|
|
221
257
|
'mp3': MediaType.AUDIO, 'wav': MediaType.AUDIO, 'm4a': MediaType.AUDIO,
|
|
@@ -227,9 +263,66 @@ FILE_TYPE_MAPPINGS = {
|
|
|
227
263
|
}
|
|
228
264
|
|
|
229
265
|
|
|
266
|
+
def is_text_file(file_path: Union[str, Path]) -> bool:
|
|
267
|
+
"""
|
|
268
|
+
Detect if a file is text-based by attempting to read it.
|
|
269
|
+
|
|
270
|
+
This is a heuristic check that samples the beginning of the file
|
|
271
|
+
to determine if it contains text content.
|
|
272
|
+
|
|
273
|
+
Args:
|
|
274
|
+
file_path: Path to the file
|
|
275
|
+
|
|
276
|
+
Returns:
|
|
277
|
+
True if file appears to be text-based, False otherwise
|
|
278
|
+
"""
|
|
279
|
+
path = Path(file_path)
|
|
280
|
+
|
|
281
|
+
if not path.exists():
|
|
282
|
+
return False
|
|
283
|
+
|
|
284
|
+
# Check file size - avoid reading very large files
|
|
285
|
+
try:
|
|
286
|
+
file_size = path.stat().st_size
|
|
287
|
+
if file_size == 0:
|
|
288
|
+
return True # Empty files are text
|
|
289
|
+
|
|
290
|
+
# Sample first 8KB to detect if it's text
|
|
291
|
+
sample_size = min(8192, file_size)
|
|
292
|
+
|
|
293
|
+
with open(path, 'rb') as f:
|
|
294
|
+
sample = f.read(sample_size)
|
|
295
|
+
|
|
296
|
+
# Check for null bytes (strong indicator of binary)
|
|
297
|
+
if b'\x00' in sample:
|
|
298
|
+
return False
|
|
299
|
+
|
|
300
|
+
# Try to decode as UTF-8
|
|
301
|
+
try:
|
|
302
|
+
sample.decode('utf-8')
|
|
303
|
+
return True
|
|
304
|
+
except UnicodeDecodeError:
|
|
305
|
+
pass
|
|
306
|
+
|
|
307
|
+
# Try other common encodings
|
|
308
|
+
for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
|
|
309
|
+
try:
|
|
310
|
+
sample.decode(encoding)
|
|
311
|
+
return True
|
|
312
|
+
except (UnicodeDecodeError, LookupError):
|
|
313
|
+
continue
|
|
314
|
+
|
|
315
|
+
# If we can't decode it, it's probably binary
|
|
316
|
+
return False
|
|
317
|
+
|
|
318
|
+
except Exception:
|
|
319
|
+
# On any error, assume it's not text
|
|
320
|
+
return False
|
|
321
|
+
|
|
322
|
+
|
|
230
323
|
def detect_media_type(file_path: Union[str, Path]) -> MediaType:
|
|
231
324
|
"""
|
|
232
|
-
Detect the media type of a file based on its extension.
|
|
325
|
+
Detect the media type of a file based on its extension and content.
|
|
233
326
|
|
|
234
327
|
Args:
|
|
235
328
|
file_path: Path to the file
|
|
@@ -240,7 +333,71 @@ def detect_media_type(file_path: Union[str, Path]) -> MediaType:
|
|
|
240
333
|
path = Path(file_path)
|
|
241
334
|
extension = path.suffix.lower().lstrip('.')
|
|
242
335
|
|
|
243
|
-
|
|
336
|
+
# First check the known extension mappings
|
|
337
|
+
if extension in FILE_TYPE_MAPPINGS:
|
|
338
|
+
return FILE_TYPE_MAPPINGS[extension]
|
|
339
|
+
|
|
340
|
+
# For unknown extensions, try to detect if it's a text file
|
|
341
|
+
# This handles cases like .R, .Rmd, .ipynb, and any other text-based files
|
|
342
|
+
if is_text_file(path):
|
|
343
|
+
return MediaType.TEXT
|
|
344
|
+
|
|
345
|
+
# Fall back to DOCUMENT for binary files with unknown extensions
|
|
346
|
+
return MediaType.DOCUMENT
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def get_all_supported_extensions() -> Dict[str, List[str]]:
|
|
350
|
+
"""
|
|
351
|
+
Get all supported file extensions organized by media type.
|
|
352
|
+
|
|
353
|
+
This function provides programmatic access to all file extensions
|
|
354
|
+
that AbstractCore can process.
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
Dictionary mapping media type names to lists of supported extensions.
|
|
358
|
+
|
|
359
|
+
Example:
|
|
360
|
+
>>> from abstractcore.media.types import get_all_supported_extensions
|
|
361
|
+
>>> formats = get_all_supported_extensions()
|
|
362
|
+
>>> print(f"Text formats: {len(formats['text'])} extensions")
|
|
363
|
+
Text formats: 70+ extensions
|
|
364
|
+
>>> print(formats['text'][:5])
|
|
365
|
+
['txt', 'md', 'markdown', 'csv', 'tsv']
|
|
366
|
+
"""
|
|
367
|
+
result = {}
|
|
368
|
+
for ext, media_type in FILE_TYPE_MAPPINGS.items():
|
|
369
|
+
type_name = media_type.value
|
|
370
|
+
if type_name not in result:
|
|
371
|
+
result[type_name] = []
|
|
372
|
+
result[type_name].append(ext)
|
|
373
|
+
|
|
374
|
+
# Sort extensions within each type for consistency
|
|
375
|
+
for type_name in result:
|
|
376
|
+
result[type_name].sort()
|
|
377
|
+
|
|
378
|
+
return result
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def get_supported_extensions_by_type(media_type: MediaType) -> List[str]:
|
|
382
|
+
"""
|
|
383
|
+
Get all supported file extensions for a specific media type.
|
|
384
|
+
|
|
385
|
+
Args:
|
|
386
|
+
media_type: The MediaType to query
|
|
387
|
+
|
|
388
|
+
Returns:
|
|
389
|
+
List of file extensions (without dots) supported for this type
|
|
390
|
+
|
|
391
|
+
Example:
|
|
392
|
+
>>> from abstractcore.media.types import get_supported_extensions_by_type, MediaType
|
|
393
|
+
>>> text_exts = get_supported_extensions_by_type(MediaType.TEXT)
|
|
394
|
+
>>> 'r' in text_exts # R scripts
|
|
395
|
+
True
|
|
396
|
+
>>> 'ipynb' in text_exts # Jupyter notebooks
|
|
397
|
+
True
|
|
398
|
+
"""
|
|
399
|
+
extensions = [ext for ext, mt in FILE_TYPE_MAPPINGS.items() if mt == media_type]
|
|
400
|
+
return sorted(extensions)
|
|
244
401
|
|
|
245
402
|
|
|
246
403
|
def create_media_content(
|
|
@@ -8,9 +8,13 @@ demonstrating how to leverage the core infrastructure for real-world tasks.
|
|
|
8
8
|
from .basic_summarizer import BasicSummarizer, SummaryStyle, SummaryLength
|
|
9
9
|
from .basic_extractor import BasicExtractor
|
|
10
10
|
from .basic_judge import BasicJudge, JudgmentCriteria, Assessment, create_judge
|
|
11
|
+
from .basic_deepsearch import BasicDeepSearch, ResearchReport, ResearchFinding, ResearchPlan, ResearchSubTask
|
|
12
|
+
from .basic_intent import BasicIntentAnalyzer, IntentType, IntentDepth, IntentContext, IdentifiedIntent, IntentAnalysisOutput
|
|
11
13
|
|
|
12
14
|
__all__ = [
|
|
13
15
|
'BasicSummarizer', 'SummaryStyle', 'SummaryLength',
|
|
14
16
|
'BasicExtractor',
|
|
15
|
-
'BasicJudge', 'JudgmentCriteria', 'Assessment', 'create_judge'
|
|
17
|
+
'BasicJudge', 'JudgmentCriteria', 'Assessment', 'create_judge',
|
|
18
|
+
'BasicDeepSearch', 'ResearchReport', 'ResearchFinding', 'ResearchPlan', 'ResearchSubTask',
|
|
19
|
+
'BasicIntentAnalyzer', 'IntentType', 'IntentDepth', 'IntentContext', 'IdentifiedIntent', 'IntentAnalysisOutput'
|
|
16
20
|
]
|