abstractcore 2.9.1__py3-none-any.whl → 2.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +7 -27
- abstractcore/apps/extractor.py +33 -100
- abstractcore/apps/intent.py +19 -0
- abstractcore/apps/judge.py +20 -1
- abstractcore/apps/summarizer.py +20 -1
- abstractcore/architectures/detection.py +34 -1
- abstractcore/architectures/response_postprocessing.py +313 -0
- abstractcore/assets/architecture_formats.json +38 -8
- abstractcore/assets/model_capabilities.json +781 -160
- abstractcore/compression/__init__.py +1 -2
- abstractcore/compression/glyph_processor.py +6 -4
- abstractcore/config/main.py +31 -19
- abstractcore/config/manager.py +389 -11
- abstractcore/config/vision_config.py +5 -5
- abstractcore/core/interface.py +151 -3
- abstractcore/core/session.py +16 -10
- abstractcore/download.py +1 -1
- abstractcore/embeddings/manager.py +20 -6
- abstractcore/endpoint/__init__.py +2 -0
- abstractcore/endpoint/app.py +458 -0
- abstractcore/mcp/client.py +3 -1
- abstractcore/media/__init__.py +52 -17
- abstractcore/media/auto_handler.py +42 -22
- abstractcore/media/base.py +44 -1
- abstractcore/media/capabilities.py +12 -33
- abstractcore/media/enrichment.py +105 -0
- abstractcore/media/handlers/anthropic_handler.py +19 -28
- abstractcore/media/handlers/local_handler.py +124 -70
- abstractcore/media/handlers/openai_handler.py +19 -31
- abstractcore/media/processors/__init__.py +4 -2
- abstractcore/media/processors/audio_processor.py +57 -0
- abstractcore/media/processors/office_processor.py +8 -3
- abstractcore/media/processors/pdf_processor.py +46 -3
- abstractcore/media/processors/text_processor.py +22 -24
- abstractcore/media/processors/video_processor.py +58 -0
- abstractcore/media/types.py +97 -4
- abstractcore/media/utils/image_scaler.py +20 -2
- abstractcore/media/utils/video_frames.py +219 -0
- abstractcore/media/vision_fallback.py +136 -22
- abstractcore/processing/__init__.py +32 -3
- abstractcore/processing/basic_deepsearch.py +15 -10
- abstractcore/processing/basic_intent.py +3 -2
- abstractcore/processing/basic_judge.py +3 -2
- abstractcore/processing/basic_summarizer.py +1 -1
- abstractcore/providers/__init__.py +3 -1
- abstractcore/providers/anthropic_provider.py +95 -8
- abstractcore/providers/base.py +1516 -81
- abstractcore/providers/huggingface_provider.py +546 -69
- abstractcore/providers/lmstudio_provider.py +35 -923
- abstractcore/providers/mlx_provider.py +382 -35
- abstractcore/providers/model_capabilities.py +5 -1
- abstractcore/providers/ollama_provider.py +99 -15
- abstractcore/providers/openai_compatible_provider.py +406 -180
- abstractcore/providers/openai_provider.py +188 -44
- abstractcore/providers/openrouter_provider.py +76 -0
- abstractcore/providers/registry.py +61 -5
- abstractcore/providers/streaming.py +138 -33
- abstractcore/providers/vllm_provider.py +92 -817
- abstractcore/server/app.py +461 -13
- abstractcore/server/audio_endpoints.py +139 -0
- abstractcore/server/vision_endpoints.py +1319 -0
- abstractcore/structured/handler.py +316 -41
- abstractcore/tools/common_tools.py +5501 -2012
- abstractcore/tools/comms_tools.py +1641 -0
- abstractcore/tools/core.py +37 -7
- abstractcore/tools/handler.py +4 -9
- abstractcore/tools/parser.py +49 -2
- abstractcore/tools/tag_rewriter.py +2 -1
- abstractcore/tools/telegram_tdlib.py +407 -0
- abstractcore/tools/telegram_tools.py +261 -0
- abstractcore/utils/cli.py +1085 -72
- abstractcore/utils/token_utils.py +2 -0
- abstractcore/utils/truncation.py +29 -0
- abstractcore/utils/version.py +3 -4
- abstractcore/utils/vlm_token_calculator.py +12 -2
- abstractcore-2.11.2.dist-info/METADATA +562 -0
- abstractcore-2.11.2.dist-info/RECORD +133 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
- abstractcore-2.9.1.dist-info/METADATA +0 -1190
- abstractcore-2.9.1.dist-info/RECORD +0 -119
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/__init__.py
CHANGED
@@ -35,30 +35,16 @@ from .core.types import GenerateResponse, Message
 from .core.enums import ModelParameter, ModelCapability, MessageRole
 from .exceptions import ModelNotFoundError, ProviderAPIError, AuthenticationError
 
-#
-
-
-    _has_embeddings = True
-except ImportError:
-    _has_embeddings = False
+# Processing helpers (lightweight; do not import optional tool/media deps here).
+from .processing.basic_summarizer import BasicSummarizer, SummaryStyle, SummaryLength
+from .processing.basic_extractor import BasicExtractor
 
-#
-from .
-    _has_processing = True
-
-# Tools module (core functionality)
-from .tools import tool
+# Tools: the decorator is dependency-free (built-in tool library lives in abstractcore.tools.common_tools).
+from .tools.core import tool
 
 # Download module (core functionality)
 from .download import download_model, DownloadProgress, DownloadStatus
 
-# Compression module (optional import)
-try:
-    from .compression import GlyphConfig, CompressionOrchestrator
-    _has_compression = True
-except ImportError:
-    _has_compression = False
-
 __all__ = [
     'create_llm',
     'BasicSession',
@@ -76,11 +62,5 @@ __all__ = [
     'DownloadStatus',
 ]
 
-
-
-
-if _has_compression:
-    __all__.extend(['GlyphConfig', 'CompressionOrchestrator'])
-
-# Processing is core functionality
-__all__.extend(['BasicSummarizer', 'SummaryStyle', 'SummaryLength', 'BasicExtractor'])
+# Processing helpers are part of the default install.
+__all__.extend(['BasicSummarizer', 'SummaryStyle', 'SummaryLength', 'BasicExtractor'])
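The `__init__.py` change above drops the conditional `_has_embeddings`/`_has_compression` plumbing and imports the processing helpers and the `tool` decorator eagerly. A minimal sketch of the import surface this implies, assuming everything listed in `__all__` stays importable from the package root; the compression import in the comment is an assumption based on the removed re-export, not something this diff confirms:

```python
# Sketch only: import surface implied by the 2.11.x abstractcore/__init__.py above.
from abstractcore import create_llm, BasicSession            # core API (listed in __all__)
from abstractcore import BasicSummarizer, SummaryStyle, SummaryLength, BasicExtractor
from abstractcore import tool                                 # re-exported from abstractcore.tools.core

# Compression is no longer re-exported at the root, so (assumption) it would be
# imported from its subpackage when the optional extra is installed:
# from abstractcore.compression import GlyphConfig, CompressionOrchestrator
```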
abstractcore/apps/extractor.py
CHANGED
@@ -1,38 +1,10 @@
 #!/usr/bin/env python3
 """
-AbstractCore
-
-Usage:
-    python -m abstractcore.apps.extractor <file_path> [options]
-
-Options:
-    --focus <focus>               Specific focus area for extraction (e.g., "technology", "business", "medical")
-    --style <style>               Extraction style (structured, focused, minimal, comprehensive, default: structured)
-    --length <length>             Extraction depth (brief, standard, detailed, comprehensive, default: standard)
-    --entity-types <types>        Comma-separated entity types to focus on (person,organization,location,etc.)
-    --similarity-threshold <t>    Similarity threshold for entity deduplication (0.0-1.0, default: 0.85)
-    --format <format>             Output format (json-ld, json, yaml, triples, default: json-ld)
-    --output <output>             Output file path (optional, prints to console if not provided)
-    --chunk-size <size>           Chunk size in characters (default: 6000, max: 32000)
-    --provider <provider>         LLM provider (requires --model)
-    --model <model>               LLM model (requires --provider)
-    --no-embeddings               Disable semantic entity deduplication
-    --fast                        Use fast extraction (skip verification, larger chunks, no embeddings)
-    --iterate <number>            Number of refinement iterations (default: 1, finds missing entities and verifies relationships)
-    --minified                    Output minified JSON-LD (compact, no indentation)
-    --verbose                     Show detailed progress information
-    --timeout <seconds>           HTTP timeout for LLM providers (default: 300, increase for large models)
-    --max-tokens <tokens>         Maximum total tokens for LLM context (default: 32000)
-    --max-output-tokens <tokens>  Maximum tokens for LLM output generation (default: 8000)
-    --help                        Show this help message
+AbstractCore entity & relationship extraction CLI.
 
-
-
-
-    python -m abstractcore.apps.extractor data.md --entity-types person,organization --output kg.jsonld
-    python -m abstractcore.apps.extractor large.txt --fast --minified --verbose  # Fast, compact output
-    python -m abstractcore.apps.extractor report.txt --length detailed --provider openai --model gpt-4o-mini
-    python -m abstractcore.apps.extractor doc.txt --iterate 3 --verbose  # 3 refinement passes for higher quality
+Run:
+  - extractor <file> --help
+  - python -m abstractcore.apps.extractor <file> --help
 """
 
 import argparse
@@ -86,6 +58,25 @@ def read_file_content(file_path: str) -> str:
     if not file_path_obj.is_file():
         raise ValueError(f"Path is not a file: {file_path}")
 
+    # Use the Media system for non-text documents when available (PDF/Office).
+    rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+    if file_path_obj.suffix.lower() in rich_doc_exts:
+        try:
+            from ..media import process_file
+
+            media_content = process_file(str(file_path_obj))
+            content = getattr(media_content, "content", "")
+            if isinstance(content, bytes):
+                return content.decode("utf-8", errors="ignore")
+            return str(content or "")
+        except ImportError as e:
+            raise ImportError(
+                f"Reading {file_path_obj.suffix.lower()} files requires media dependencies. "
+                f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+            ) from e
+        except Exception as e:
+            raise Exception(f"Failed to extract content from {file_path}: {e}") from e
+
     # Try to read as text file
     try:
         # Try UTF-8 first
@@ -162,7 +153,7 @@ Examples:
   python -m abstractcore.apps.extractor document.pdf
   python -m abstractcore.apps.extractor report.txt --focus=technology --style=structured --verbose
   python -m abstractcore.apps.extractor data.md --entity-types=person,organization --output=kg.jsonld
-  python -m abstractcore.apps.extractor large.txt --length=detailed --
+  python -m abstractcore.apps.extractor large.txt --length=detailed --minified --verbose
   python -m abstractcore.apps.extractor doc.txt --iterate=3 --verbose  # Iterative refinement for quality
   python -m abstractcore.apps.extractor doc.txt --format=triples --verbose  # RDF triples output
   python -m abstractcore.apps.extractor doc.txt --format=triples --output=triples.txt  # Simple triples
@@ -179,20 +170,19 @@ Output options:
   - Default: Pretty-printed JSON with indentation
   - --minified: Compact JSON without indentation (smaller file size)
 
-Performance
-  -
-  - --
-  -
+Performance notes:
+  - Extraction is an LLM call; latency depends on provider/model and input size.
+  - For large files, increase --chunk-size to reduce the number of LLM calls (at the cost of context usage).
+  - Use --iterate for refinement (higher quality, more calls).
 
 Quality enhancement:
   - --iterate=N: Perform N refinement passes to find missing entities/relationships
   - Each iteration reviews the extraction to find gaps and verify relationship directionality
-  -
+  - Tip: Start with 1 (default), then increase if you need higher recall.
 
 Default model setup:
   - Requires Ollama: https://ollama.com/
   - Download model: ollama pull qwen3:4b-instruct-2507-q4_K_M
-  - For best performance: qwen3-coder:30b or gpt-oss:120b
   - Or use --provider and --model for other providers
 """
     )
@@ -226,13 +216,6 @@ Default model setup:
         help='Comma-separated entity types to focus on (person,organization,location,concept,event,technology,product,date,other)'
     )
 
-    parser.add_argument(
-        '--similarity-threshold',
-        type=float,
-        default=0.85,
-        help='Similarity threshold for entity deduplication (0.0-1.0, default: 0.85)'
-    )
-
     # Build format choices based on available dependencies
    format_choices = ['json-ld', 'triples', 'json']
     if YAML_AVAILABLE:
@@ -267,25 +250,6 @@ Default model setup:
         help='LLM model (requires --provider)'
     )
 
-    parser.add_argument(
-        '--no-embeddings',
-        action='store_true',
-        help='Disable semantic entity deduplication'
-    )
-
-    parser.add_argument(
-        '--mode',
-        choices=['fast', 'balanced', 'thorough'],
-        default='balanced',
-        help='Extraction mode: fast (2-3x faster), balanced (default), thorough (highest quality)'
-    )
-
-    parser.add_argument(
-        '--fast',
-        action='store_true',
-        help='Legacy flag: equivalent to --mode=fast (deprecated, use --mode instead)'
-    )
-
     parser.add_argument(
         '--iterate',
         type=int,
@@ -330,11 +294,6 @@ Default model setup:
     args = parser.parse_args()
 
     try:
-        # Validate similarity threshold
-        if not 0.0 <= args.similarity_threshold <= 1.0:
-            print("Error: Similarity threshold must be between 0.0 and 1.0")
-            sys.exit(1)
-
         # Validate chunk size
         if args.chunk_size < 1000:
             print("Error: Chunk size must be at least 1000 characters")
@@ -383,43 +342,18 @@ Default model setup:
         extraction_style = parse_extraction_style(args.style)
         extraction_length = parse_extraction_length(args.length)
 
-        # Determine extraction mode (handle legacy --fast flag)
-        extraction_mode = args.mode
-        if args.fast:
-            extraction_mode = "fast"
-
         # Initialize LLM and extractor
-        use_embeddings = not args.no_embeddings
-
         if args.provider and args.model:
-            # Custom provider/model with max_tokens adjusted for chunk size and provider limits
-            max_tokens = max(32000, args.chunk_size)
-
             # Adjust chunk size for provider-specific limits first
             adjusted_chunk_size = args.chunk_size
 
-            #
+            # Provider-specific safety: some models work better with smaller per-chunk payloads.
             if args.provider.lower() == "anthropic":
-                # Claude models have varying context limits
                 if "haiku" in args.model.lower():
-                    # Claude 3.5 Haiku: 200K tokens total
-                    max_tokens = min(max_tokens, 150000)  # Leave room for output
-                    max_output_tokens = 4000  # Conservative output limit
                     adjusted_chunk_size = min(args.chunk_size, 4000)  # Smaller chunks for Haiku
-                elif "sonnet" in args.model.lower():
-                    # Claude 3.5 Sonnet: 200K tokens
-                    max_tokens = min(max_tokens, 180000)
-                    max_output_tokens = 8000
-                else:
-                    # Claude 3 Opus or other: assume 200K
-                    max_tokens = min(max_tokens, 180000)
-                    max_output_tokens = 8000
-            else:
-                # Default for other providers
-                max_output_tokens = 8000
 
             if args.verbose:
-                print(f"Initializing BasicExtractor (
+                print(f"Initializing BasicExtractor ({args.provider}, {args.model}, {args.max_tokens} token context, {args.max_output_tokens} output tokens)...")
             if adjusted_chunk_size != args.chunk_size:
                 print(f"Adjusted chunk size from {args.chunk_size} to {adjusted_chunk_size} characters for {args.provider} compatibility")
 
@@ -435,7 +369,7 @@ Default model setup:
         else:
             # Default configuration
             if args.verbose:
-                print(f"Initializing BasicExtractor (
+                print(f"Initializing BasicExtractor (ollama, qwen3:4b-instruct-2507-q4_K_M, {args.max_tokens} token context, {args.max_output_tokens} output tokens)...")
 
             try:
                 extractor = BasicExtractor(
@@ -450,7 +384,6 @@ Default model setup:
                 print("\n🚀 Quick alternatives to get started:")
                 print("  - Use --provider and --model to specify an available provider")
                 print("  - Example: extractor document.txt --provider openai --model gpt-4o-mini")
-                print("  - For speed: extractor document.txt --mode=fast")
                 sys.exit(1)
 
         # Extract entities and relationships
@@ -604,4 +537,4 @@ Default model setup:
 
 
 if __name__ == "__main__":
-    main()
+    main()
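The `read_file_content` hunk above is the substantive change: the extractor (and, as the following sections show, intent, judge, and summarizer) now routes PDF/Office files through `abstractcore.media.process_file` before falling back to a plain-text read. A standalone sketch of that shared pattern, assuming only what the hunk itself shows (`process_file` accepts a path string and returns an object with a `.content` attribute); the helper name `read_document_text` is hypothetical and not part of the package:

```python
from pathlib import Path

RICH_DOC_EXTS = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}

def read_document_text(path: str) -> str:
    """Hypothetical helper mirroring the block the four CLI apps now embed."""
    p = Path(path)
    if p.suffix.lower() in RICH_DOC_EXTS:
        try:
            from abstractcore.media import process_file  # optional [media] extra
        except ImportError as e:
            raise ImportError(
                f'Reading {p.suffix.lower()} files requires: pip install "abstractcore[media]"'
            ) from e
        content = getattr(process_file(str(p)), "content", "")
        if isinstance(content, bytes):
            return content.decode("utf-8", errors="ignore")
        return str(content or "")
    # Plain-text fallback, UTF-8 first, mirroring the apps' existing behavior.
    return p.read_text(encoding="utf-8", errors="ignore")
```

Each app keeps its own copy of this block rather than sharing a helper, which is why the same +19-line hunk appears four times in this diff.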
abstractcore/apps/intent.py
CHANGED
@@ -127,6 +127,25 @@ def read_file_content(file_path: str) -> str:
     if not file_path_obj.is_file():
         raise ValueError(f"Path is not a file: {file_path}")
 
+    # Use the Media system for non-text documents when available (PDF/Office).
+    rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+    if file_path_obj.suffix.lower() in rich_doc_exts:
+        try:
+            from ..media import process_file
+
+            media_content = process_file(str(file_path_obj))
+            content = getattr(media_content, "content", "")
+            if isinstance(content, bytes):
+                return content.decode("utf-8", errors="ignore")
+            return str(content or "")
+        except ImportError as e:
+            raise ImportError(
+                f"Reading {file_path_obj.suffix.lower()} files requires media dependencies. "
+                f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+            ) from e
+        except Exception as e:
+            raise Exception(f"Failed to extract content from {file_path}: {e}") from e
+
     # Try to read as text file
     try:
         # Try UTF-8 first
abstractcore/apps/judge.py
CHANGED
@@ -72,6 +72,25 @@ def read_content(content_or_path: str) -> str:
     try:
         file_path = Path(content_or_path)
         if file_path.exists() and file_path.is_file():
+            # Use the Media system for non-text documents when available (PDF/Office).
+            rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+            if file_path.suffix.lower() in rich_doc_exts:
+                try:
+                    from ..media import process_file
+
+                    media_content = process_file(str(file_path))
+                    content = getattr(media_content, "content", "")
+                    if isinstance(content, bytes):
+                        return content.decode("utf-8", errors="ignore")
+                    return str(content or "")
+                except ImportError as e:
+                    raise ImportError(
+                        f"Reading {file_path.suffix.lower()} files requires media dependencies. "
+                        f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+                    ) from e
+                except Exception as e:
+                    raise Exception(f"Failed to extract content from {content_or_path}: {e}") from e
+
             # Try to read as text file
             try:
                 # Try UTF-8 first
@@ -628,4 +647,4 @@ Default model setup:
 
 
 if __name__ == "__main__":
-    main()
+    main()
abstractcore/apps/summarizer.py
CHANGED
@@ -90,6 +90,25 @@ def read_file_content(file_path: str) -> str:
     if not file_path_obj.is_file():
         raise ValueError(f"Path is not a file: {file_path}")
 
+    # Use the Media system for non-text documents when available (PDF/Office).
+    rich_doc_exts = {'.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.rtf'}
+    if file_path_obj.suffix.lower() in rich_doc_exts:
+        try:
+            from ..media import process_file
+
+            media_content = process_file(str(file_path_obj))
+            content = getattr(media_content, "content", "")
+            if isinstance(content, bytes):
+                return content.decode("utf-8", errors="ignore")
+            return str(content or "")
+        except ImportError as e:
+            raise ImportError(
+                f"Reading {file_path_obj.suffix.lower()} files requires media dependencies. "
+                f"Install with: pip install \"abstractcore[media]\". Error: {e}"
+            ) from e
+        except Exception as e:
+            raise Exception(f"Failed to extract content from {file_path}: {e}") from e
+
     # Try to read as text file
     try:
         # Try UTF-8 first
@@ -468,4 +487,4 @@ Default model setup:
 
 
 if __name__ == "__main__":
-    main()
+    main()
abstractcore/architectures/detection.py
CHANGED

@@ -42,6 +42,7 @@ _KNOWN_PROVIDER_PREFIXES = {
     "ollama",
     "openai",
     "openai-compatible",
+    "openrouter",
     "together",
     "vllm",
 }
@@ -212,6 +213,30 @@ def resolve_model_alias(model_name: str, models: Dict[str, Any]) -> str:
             return s
         return s.split("/")[-1].strip()
 
+    def _colon_variants(name: str) -> List[str]:
+        """Generate best-effort candidates for Ollama-style `name:tag` ids."""
+        s = str(name or "").strip()
+        if not s or ":" not in s:
+            return []
+        head, rest = s.split(":", 1)
+        head = head.strip()
+        rest = rest.strip()
+        if not head or not rest:
+            return []
+        first = rest.split("-", 1)[0].strip()
+        out: List[str] = []
+        # Avoid over-mapping specialized Ollama names (e.g., `qwen3-coder:30b`) onto
+        # upstream base-model capability entries which may advertise much larger context
+        # windows than the local runtime is configured to support by default.
+        if "-" in head:
+            out.append(head)
+            return out
+        if first:
+            out.append(f"{head}:{first}")
+            out.append(f"{head}-{first}")
+        out.append(head)
+        return out
+
     def _candidates(*names: str) -> List[str]:
         out: List[str] = []
         for n in names:
@@ -219,9 +244,11 @@ def resolve_model_alias(model_name: str, models: Dict[str, Any]) -> str:
             if not s:
                 continue
             out.append(s)
+            out.extend(_colon_variants(s))
             t = _tail(s)
             if t and t != s:
                 out.append(t)
+                out.extend(_colon_variants(t))
         # Deduplicate while preserving order
         uniq: List[str] = []
         seen: set[str] = set()
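The new `_colon_variants` helper is easiest to follow with concrete ids. Below is a lightly condensed standalone copy of the function from the hunk above, with illustrative inputs (the asserts are examples for this writeup, not tests shipped in the package):

```python
from typing import List

def _colon_variants(name: str) -> List[str]:
    """Condensed copy of the helper added above, for illustration only."""
    s = str(name or "").strip()
    if not s or ":" not in s:
        return []
    head, rest = s.split(":", 1)
    head, rest = head.strip(), rest.strip()
    if not head or not rest:
        return []
    first = rest.split("-", 1)[0].strip()
    if "-" in head:
        # Specialized names such as `qwen3-coder:30b` only map to their own family.
        return [head]
    out: List[str] = [f"{head}:{first}", f"{head}-{first}"] if first else []
    out.append(head)
    return out

assert _colon_variants("qwen3:4b-instruct-2507-q4_K_M") == ["qwen3:4b", "qwen3-4b", "qwen3"]
assert _colon_variants("qwen3-coder:30b") == ["qwen3-coder"]
assert _colon_variants("gpt-4o-mini") == []  # no colon, no extra candidates
```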
@@ -378,7 +405,8 @@ def get_model_capabilities(model_name: str) -> Dict[str, Any]:
     except Exception:
         raw_name = ""
 
-
+    placeholder_names = {"default"}
+    if raw_name and raw_name.lower() not in placeholder_names and raw_name not in _default_capabilities_warning_cache:
         _default_capabilities_warning_cache.add(raw_name)
         logger.warning(
             "Model not found in model_capabilities.json; falling back to architecture defaults",
@@ -454,6 +482,11 @@ def supports_audio(model_name: str) -> bool:
 def supports_embeddings(model_name: str) -> bool:
     """Check if model supports embeddings."""
     capabilities = get_model_capabilities(model_name)
+    # Prefer explicit model metadata over name heuristics:
+    # - `model_type: "embedding"` is the canonical signal in `assets/model_capabilities.json`.
+    # - `embedding_support` is a legacy boolean (kept for backwards compatibility).
+    if capabilities.get("model_type") == "embedding":
+        return True
     return capabilities.get("embedding_support", False)
 
 
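The `supports_embeddings` change is a precedence rule: an explicit `model_type: "embedding"` entry wins, and the legacy `embedding_support` boolean is only consulted as a fallback. A minimal sketch of that rule (the capability dicts are hypothetical, not entries copied from `model_capabilities.json`):

```python
def supports_embeddings_from(capabilities: dict) -> bool:
    # Mirrors the check added above: explicit metadata first, legacy flag second.
    if capabilities.get("model_type") == "embedding":
        return True
    return capabilities.get("embedding_support", False)

assert supports_embeddings_from({"model_type": "embedding"}) is True
assert supports_embeddings_from({"embedding_support": True}) is True   # legacy flag still honored
assert supports_embeddings_from({"model_type": "chat"}) is False
```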