abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
@@ -207,15 +207,51 @@ FILE_TYPE_MAPPINGS = {
207
207
  'gif': MediaType.IMAGE, 'bmp': MediaType.IMAGE, 'tif': MediaType.IMAGE,
208
208
  'tiff': MediaType.IMAGE, 'webp': MediaType.IMAGE, 'ico': MediaType.IMAGE,
209
209
 
210
- # Documents
210
+ # Documents (binary formats that need special processing)
211
211
  'pdf': MediaType.DOCUMENT, 'doc': MediaType.DOCUMENT, 'docx': MediaType.DOCUMENT,
212
212
  'xls': MediaType.DOCUMENT, 'xlsx': MediaType.DOCUMENT, 'ppt': MediaType.DOCUMENT,
213
213
  'pptx': MediaType.DOCUMENT, 'odt': MediaType.DOCUMENT, 'rtf': MediaType.DOCUMENT,
214
214
 
215
- # Text formats
216
- 'txt': MediaType.TEXT, 'md': MediaType.TEXT, 'csv': MediaType.TEXT,
217
- 'tsv': MediaType.TEXT, 'json': MediaType.TEXT, 'xml': MediaType.TEXT,
218
- 'html': MediaType.TEXT, 'htm': MediaType.TEXT,
215
+ # Text formats - Common markup and data formats
216
+ 'txt': MediaType.TEXT, 'md': MediaType.TEXT, 'markdown': MediaType.TEXT,
217
+ 'csv': MediaType.TEXT, 'tsv': MediaType.TEXT,
218
+ 'json': MediaType.TEXT, 'jsonl': MediaType.TEXT, 'ndjson': MediaType.TEXT,
219
+ 'xml': MediaType.TEXT, 'html': MediaType.TEXT, 'htm': MediaType.TEXT,
220
+ 'yaml': MediaType.TEXT, 'yml': MediaType.TEXT, 'toml': MediaType.TEXT,
221
+ 'ini': MediaType.TEXT, 'cfg': MediaType.TEXT, 'conf': MediaType.TEXT,
222
+
223
+ # Text formats - Programming and scripting languages
224
+ 'py': MediaType.TEXT, 'pyw': MediaType.TEXT, 'pyx': MediaType.TEXT,
225
+ 'js': MediaType.TEXT, 'jsx': MediaType.TEXT, 'ts': MediaType.TEXT, 'tsx': MediaType.TEXT,
226
+ 'java': MediaType.TEXT, 'kt': MediaType.TEXT, 'scala': MediaType.TEXT,
227
+ 'c': MediaType.TEXT, 'cpp': MediaType.TEXT, 'cc': MediaType.TEXT, 'cxx': MediaType.TEXT,
228
+ 'h': MediaType.TEXT, 'hpp': MediaType.TEXT, 'hxx': MediaType.TEXT,
229
+ 'cs': MediaType.TEXT, 'go': MediaType.TEXT, 'rs': MediaType.TEXT, 'swift': MediaType.TEXT,
230
+ 'rb': MediaType.TEXT, 'php': MediaType.TEXT, 'pl': MediaType.TEXT, 'pm': MediaType.TEXT,
231
+ 'sh': MediaType.TEXT, 'bash': MediaType.TEXT, 'zsh': MediaType.TEXT, 'fish': MediaType.TEXT,
232
+ 'r': MediaType.TEXT, 'R': MediaType.TEXT, 'rmd': MediaType.TEXT, 'Rmd': MediaType.TEXT,
233
+ 'jl': MediaType.TEXT, 'matlab': MediaType.TEXT, 'm': MediaType.TEXT,
234
+ 'sql': MediaType.TEXT, 'lua': MediaType.TEXT, 'vim': MediaType.TEXT,
235
+ 'dart': MediaType.TEXT, 'ex': MediaType.TEXT, 'exs': MediaType.TEXT,
236
+ 'erl': MediaType.TEXT, 'hrl': MediaType.TEXT, 'clj': MediaType.TEXT, 'cljs': MediaType.TEXT,
237
+
238
+ # Text formats - Notebooks and documentation
239
+ 'ipynb': MediaType.TEXT, 'qmd': MediaType.TEXT, 'rst': MediaType.TEXT,
240
+ 'tex': MediaType.TEXT, 'latex': MediaType.TEXT, 'bib': MediaType.TEXT,
241
+ 'org': MediaType.TEXT, 'adoc': MediaType.TEXT, 'asciidoc': MediaType.TEXT,
242
+
243
+ # Text formats - Web and styles
244
+ 'css': MediaType.TEXT, 'scss': MediaType.TEXT, 'sass': MediaType.TEXT, 'less': MediaType.TEXT,
245
+ 'vue': MediaType.TEXT, 'svelte': MediaType.TEXT,
246
+
247
+ # Text formats - Build and config files
248
+ 'gradle': MediaType.TEXT, 'cmake': MediaType.TEXT, 'make': MediaType.TEXT,
249
+ 'dockerfile': MediaType.TEXT, 'containerfile': MediaType.TEXT,
250
+ 'gitignore': MediaType.TEXT, 'gitattributes': MediaType.TEXT,
251
+ 'env': MediaType.TEXT, 'properties': MediaType.TEXT,
252
+
253
+ # Text formats - Log and output files
254
+ 'log': MediaType.TEXT, 'out': MediaType.TEXT, 'err': MediaType.TEXT,
219
255
 
220
256
  # Audio
221
257
  'mp3': MediaType.AUDIO, 'wav': MediaType.AUDIO, 'm4a': MediaType.AUDIO,
@@ -227,9 +263,66 @@ FILE_TYPE_MAPPINGS = {
227
263
  }
228
264
 
229
265
 
266
def is_text_file(file_path: Union[str, Path]) -> bool:
    """
    Detect if a file is text-based by sampling its first bytes.

    This is a heuristic check: at most the first 8 KB of the file are read
    and inspected, so the rest of the file is never validated.

    The sample is classified as follows:
      1. An empty file counts as text.
      2. Any NUL byte marks the file as binary.
      3. A sample that is valid UTF-8 is text. A multi-byte character cut
         off at the end of the sample is tolerated.
      4. Otherwise the sample is treated as a legacy single-byte encoding
         and accepted only if at most 30% of its bytes are control bytes
         that do not normally occur in text.

    Args:
        file_path: Path to the file

    Returns:
        True if file appears to be text-based, False otherwise. Missing
        paths, directories and unreadable files return False.
    """
    # Local import keeps this function self-contained.
    import codecs

    sample_limit = 8192
    max_control_ratio = 0.30
    # C0 control bytes that legitimately show up in text files:
    # BS, TAB, LF, VT, FF, CR and ESC (ANSI colour sequences in logs).
    text_controls = frozenset(b'\x08\t\n\x0b\x0c\r\x1b')

    path = Path(file_path)

    try:
        # is_file() rejects directories as well as missing paths; exists()
        # alone lets a directory through to the size/read logic.
        if not path.is_file():
            return False
        with open(path, 'rb') as f:
            sample = f.read(sample_limit)
    except (OSError, ValueError):
        # Best effort: anything we cannot read is not reported as text.
        return False

    if not sample:
        return True  # Empty files are text

    # Check for null bytes (strong indicator of binary)
    if b'\x00' in sample:
        return False

    # Try UTF-8 first. final=False makes the decoder buffer an incomplete
    # trailing sequence instead of raising, so a character split by the
    # sample boundary does not count as a decoding failure.
    try:
        codecs.getincrementaldecoder('utf-8')().decode(sample, final=False)
        return True
    except UnicodeDecodeError:
        pass

    # Decoding as latin-1 succeeds for every byte string, so it cannot be
    # used as a test. Measure the share of unexpected control bytes instead.
    control_count = sum(
        1 for byte in sample
        if (byte < 0x20 and byte not in text_controls) or byte == 0x7f
    )
    return control_count / len(sample) <= max_control_ratio
321
+
322
+
230
323
  def detect_media_type(file_path: Union[str, Path]) -> MediaType:
231
324
  """
232
- Detect the media type of a file based on its extension.
325
+ Detect the media type of a file based on its extension and content.
233
326
 
234
327
  Args:
235
328
  file_path: Path to the file
@@ -240,7 +333,71 @@ def detect_media_type(file_path: Union[str, Path]) -> MediaType:
240
333
  path = Path(file_path)
241
334
  extension = path.suffix.lower().lstrip('.')
242
335
 
243
- return FILE_TYPE_MAPPINGS.get(extension, MediaType.DOCUMENT)
336
+ # First check the known extension mappings
337
+ if extension in FILE_TYPE_MAPPINGS:
338
+ return FILE_TYPE_MAPPINGS[extension]
339
+
340
+ # For unknown extensions, try to detect if it's a text file
341
+ # This handles text-based files whose extension is not listed in FILE_TYPE_MAPPINGS
342
+ if is_text_file(path):
343
+ return MediaType.TEXT
344
+
345
+ # Fall back to DOCUMENT for binary files with unknown extensions
346
+ return MediaType.DOCUMENT
347
+
348
+
349
def get_all_supported_extensions() -> Dict[str, List[str]]:
    """
    Get all supported file extensions organized by media type.

    Every extension registered in FILE_TYPE_MAPPINGS is grouped under the
    ``value`` of the MediaType it maps to, and each group is sorted.

    Returns:
        Dictionary mapping media type names to sorted lists of extensions
        (without dots). Sorting uses plain string order, so upper-case
        entries such as 'R' come before lower-case ones.

    Example:
        (assumes MediaType.TEXT.value == 'text' -- confirm against the enum)

        >>> from abstractcore.media.types import get_all_supported_extensions
        >>> formats = get_all_supported_extensions()
        >>> 'md' in formats['text']
        True
        >>> formats['text'] == sorted(formats['text'])
        True
    """
    result: Dict[str, List[str]] = {}
    for ext, media_type in FILE_TYPE_MAPPINGS.items():
        # setdefault creates the per-type list on first use.
        result.setdefault(media_type.value, []).append(ext)

    # Sort extensions within each type for consistency
    for extensions in result.values():
        extensions.sort()

    return result
379
+
380
+
381
def get_supported_extensions_by_type(media_type: MediaType) -> List[str]:
    """
    List the file extensions registered for one media type.

    Args:
        media_type: The MediaType whose extensions should be returned

    Returns:
        Sorted list of the FILE_TYPE_MAPPINGS keys (extensions without
        dots) whose mapped type equals ``media_type``; empty when nothing
        is registered for that type.

    Example:
        >>> from abstractcore.media.types import get_supported_extensions_by_type, MediaType
        >>> text_exts = get_supported_extensions_by_type(MediaType.TEXT)
        >>> 'r' in text_exts  # R scripts
        True
        >>> 'ipynb' in text_exts  # Jupyter notebooks
        True
    """
    return sorted(
        extension
        for extension, mapped_type in FILE_TYPE_MAPPINGS.items()
        if mapped_type == media_type
    )
244
401
 
245
402
 
246
403
  def create_media_content(
@@ -8,9 +8,13 @@ demonstrating how to leverage the core infrastructure for real-world tasks.
8
8
  from .basic_summarizer import BasicSummarizer, SummaryStyle, SummaryLength
9
9
  from .basic_extractor import BasicExtractor
10
10
  from .basic_judge import BasicJudge, JudgmentCriteria, Assessment, create_judge
11
+ from .basic_deepsearch import BasicDeepSearch, ResearchReport, ResearchFinding, ResearchPlan, ResearchSubTask
12
+ from .basic_intent import BasicIntentAnalyzer, IntentType, IntentDepth, IntentContext, IdentifiedIntent, IntentAnalysisOutput
11
13
 
12
14
  __all__ = [
13
15
  'BasicSummarizer', 'SummaryStyle', 'SummaryLength',
14
16
  'BasicExtractor',
15
- 'BasicJudge', 'JudgmentCriteria', 'Assessment', 'create_judge'
17
+ 'BasicJudge', 'JudgmentCriteria', 'Assessment', 'create_judge',
18
+ 'BasicDeepSearch', 'ResearchReport', 'ResearchFinding', 'ResearchPlan', 'ResearchSubTask',
19
+ 'BasicIntentAnalyzer', 'IntentType', 'IntentDepth', 'IntentContext', 'IdentifiedIntent', 'IntentAnalysisOutput'
16
20
  ]