abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,25 @@
  "aliases": [],
  "max_tokens": 128000
  },
+ "gpt-4-turbo": {
+ "max_output_tokens": 4096,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "notes": "GPT-4 Turbo with vision capabilities",
+ "source": "OpenAI official docs 2025",
+ "canonical_name": "gpt-4-turbo",
+ "aliases": [
+ "gpt-4-turbo-preview"
+ ],
+ "max_tokens": 128000
+ },
  "gpt-4-turbo-with-vision": {
  "max_output_tokens": 4096,
  "tool_support": "native",
@@ -45,6 +64,18 @@
  "image_resolutions": [
  "variable"
  ],
+ "image_tokenization_method": "tile_based",
+ "base_image_tokens": 85,
+ "tokens_per_tile": 170,
+ "tile_size": "512x512",
+ "max_image_dimension": 2048,
+ "short_side_resize_target": 768,
+ "detail_levels": [
+ "low",
+ "high",
+ "auto"
+ ],
+ "low_detail_tokens": 85,
  "notes": "Multimodal omni model, 2x faster, half price, 5x higher rate limits (updated Nov 2024)",
  "source": "OpenAI official docs 2025",
  "canonical_name": "gpt-4o",
@@ -154,6 +185,12 @@
  "image_resolutions": [
  "up to 1568x1568"
  ],
+ "image_tokenization_method": "pixel_area_based",
+ "token_formula": "(width * height) / 750",
+ "pixel_divisor": 750,
+ "max_image_dimension": 1568,
+ "token_cap": 1600,
+ "min_dimension_warning": 200,
  "audio_support": false,
  "notes": "disable_parallel_tool_use option available",
  "source": "Anthropic official docs",
@@ -316,7 +353,7 @@
  "llama-3.2-1b": {
  "max_output_tokens": 2048,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -329,7 +366,7 @@
  "llama-3.2-3b": {
  "max_output_tokens": 2048,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -342,7 +379,7 @@
  "llama-3.2-11b-vision": {
  "max_output_tokens": 2048,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": true,
  "image_resolutions": [
@@ -358,7 +395,7 @@
  "llama-3.3-70b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -371,7 +408,7 @@
  "llama-3.1-8b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -384,7 +421,7 @@
  "llama-3.1-70b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -397,7 +434,7 @@
  "llama-3.1-405b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -426,7 +463,7 @@
  "qwen2.5-0.5b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -439,7 +476,7 @@
  "qwen2.5-1.5b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -452,7 +489,7 @@
  "qwen2.5-3b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -465,7 +502,7 @@
  "qwen2.5-7b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -478,7 +515,7 @@
  "qwen2.5-14b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -491,7 +528,7 @@
  "qwen2.5-32b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -504,7 +541,7 @@
  "qwen2.5-72b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -517,7 +554,7 @@
  "qwen3-0.6b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -531,7 +568,7 @@
  "qwen3-1.7b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -545,7 +582,7 @@
  "qwen3-4b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -559,7 +596,7 @@
  "qwen3-32b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -573,7 +610,7 @@
  "qwen3-30b-a3b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -584,10 +621,26 @@
  "aliases": [],
  "max_tokens": 40960
  },
+ "qwen3-30b-a3b-2507": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "native",
+ "parallel_tools": false,
+ "vision_support": false,
+ "audio_support": false,
+ "thinking_support": true,
+ "notes": "Qwen3-30B-A3B-Instruct-2507 with enhanced reasoning, coding, and mathematical skills. Supports up to 256K context, extendable to 1M tokens",
+ "source": "Alibaba Qwen3 2507 release",
+ "canonical_name": "qwen3-30b-a3b-2507",
+ "aliases": [
+ "qwen/qwen3-30b-a3b-2507"
+ ],
+ "max_tokens": 262144
+ },
  "qwen3-coder-30b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -600,7 +653,7 @@
  "qwen2-vl": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": true,
  "image_resolutions": [
@@ -643,7 +696,7 @@
  "phi-3-mini": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -656,7 +709,7 @@
  "phi-3-small": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -668,7 +721,7 @@
  "phi-3-medium": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -680,7 +733,7 @@
  "phi-3.5-mini": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -692,7 +745,7 @@
  "phi-3.5-moe": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -705,7 +758,7 @@
  "phi-3-vision": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": true,
  "image_resolutions": [
@@ -720,7 +773,7 @@
  "phi-4": {
  "max_output_tokens": 16000,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -733,7 +786,7 @@
  "mistral-7b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -877,6 +930,31 @@
  "max_image_resolution": "1120x1120",
  "image_patch_size": 14,
  "max_image_tokens": 6400,
+ "image_tokenization_method": "resolution_tier_based",
+ "supported_resolutions": [
+ [
+ 560,
+ 560
+ ],
+ [
+ 1120,
+ 560
+ ],
+ [
+ 560,
+ 1120
+ ],
+ [
+ 1120,
+ 1120
+ ]
+ ],
+ "base_tokens_per_resolution": {
+ "560x560": 1600,
+ "1120x560": 3200,
+ "560x1120": 3200,
+ "1120x1120": 6400
+ },
  "notes": "Llama 3.2 Vision 11B model with multimodal capabilities for visual recognition and reasoning",
  "source": "Meta AI Llama 3.2 release",
  "canonical_name": "llama3.2-vision:11b",
@@ -941,7 +1019,7 @@
  "gemma-2b": {
  "max_output_tokens": 8192,
  "tool_support": "none",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -953,7 +1031,7 @@
  "gemma-7b": {
  "max_output_tokens": 8192,
  "tool_support": "none",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1002,7 +1080,7 @@
  "codegemma": {
  "max_output_tokens": 8192,
  "tool_support": "none",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1033,7 +1111,7 @@
  "glm-4": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1085,7 +1163,7 @@
  "qwen3": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1255,13 +1333,18 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 4B dense model with 256K context, optimized for LMStudio",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 4B dense model with 256K context, optimized for LMStudio. Parameters: 4.83B. FP8 checkpoints available.",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-4b",
  "aliases": [
@@ -1278,13 +1361,18 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 8B dense model with 256K context, optimized for LMStudio",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 8B dense model with 256K context, optimized for LMStudio. Parameters: 8.77B. FP8 checkpoints available.",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-8b",
  "aliases": [
@@ -1301,19 +1389,24 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 30B MoE model (30.5B total/3.3B active), best performing vision model, 256K context",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 30B MoE model (30.5B total/3.3B active), best performing vision model, 128K context",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-30b",
  "aliases": [
  "qwen/qwen3-vl-30b"
  ],
- "max_tokens": 262144
+ "max_tokens": 131072
  },
  "qwen2.5-vl-7b": {
  "max_output_tokens": 8192,
@@ -1329,6 +1422,11 @@
  "image_patch_size": 14,
  "max_image_tokens": 16384,
  "pixel_grouping": "28x28",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 56,
+ "max_resolution": 3584,
+ "vision_encoder": "ViT-based",
  "notes": "Qwen2.5-VL 7B parameter vision model, 28x28 pixel patches, max 3584x3584 resolution",
  "source": "Alibaba official docs",
  "canonical_name": "qwen2.5-vl-7b",
@@ -1353,6 +1451,12 @@
  "vision_encoder": "SigLIP-400M",
  "image_tokens_per_image": 256,
  "adaptive_windowing": true,
+ "image_tokenization_method": "fixed_resolution",
+ "fixed_resolution": [
+ 896,
+ 896
+ ],
+ "preprocessing": "automatic_resize_and_crop",
  "notes": "Gemma3 4B parameter model with vision support, 896x896 fixed resolution with adaptive windowing",
  "source": "Google Gemma3 documentation 2025",
  "canonical_name": "gemma3-4b",
@@ -1547,6 +1651,7 @@
  "max_image_resolution": "768x768",
  "vision_encoder": "SigLIP2-so400m-patch14-384",
  "image_patch_size": 14,
+ "image_tokenization_method": "patch_based",
  "notes": "IBM Granite 3.2-Vision 2B model with SigLIP2 encoder, optimized for visual document understanding",
  "source": "IBM Granite 3.2 technical report arXiv:2502.09927",
  "canonical_name": "granite3.2-vision:2b",
@@ -1558,6 +1663,58 @@
  ],
  "max_tokens": 32768
  },
+ "gemini-2.5-flash": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": true,
+ "image_resolutions": [
+ "224x224",
+ "448x448",
+ "1024x1024"
+ ],
+ "max_image_resolution": "768x768",
+ "image_tokenization_method": "gemini_vision_encoder",
+ "thinking_support": true,
+ "thinking_budget": true,
+ "notes": "Optimized for speed and efficiency, suitable for high-volume, latency-sensitive tasks. Supports configurable thinking budgets",
+ "source": "Google AI official docs 2025",
+ "canonical_name": "gemini-2.5-flash",
+ "aliases": [
+ "gemini-2.5-flash-001"
+ ],
+ "max_tokens": 1000000
+ },
+ "gemini-2.5-pro": {
+ "max_output_tokens": 65536,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": true,
+ "image_resolutions": [
+ "224x224",
+ "448x448",
+ "1024x1024"
+ ],
+ "max_image_resolution": "768x768",
+ "image_tokenization_method": "gemini_vision_encoder",
+ "thinking_support": true,
+ "thinking_budget": true,
+ "notes": "Most advanced Gemini model for complex reasoning, coding, and mathematical problem-solving. Features Deep Think mode for enhanced reasoning",
+ "source": "Google AI official docs 2025",
+ "canonical_name": "gemini-2.5-pro",
+ "aliases": [
+ "gemini-2.5-pro-001"
+ ],
+ "max_tokens": 1048576
+ },
  "granite3.3:2b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
@@ -1587,6 +1744,321 @@
  "granite3.3-8b"
  ],
  "max_tokens": 32768
+ },
+ "embeddinggemma:300m": {
+ "max_output_tokens": 0,
+ "tool_support": "none",
+ "structured_output": "none",
+ "parallel_tools": false,
+ "vision_support": false,
+ "audio_support": false,
+ "notes": "Text embedding model, not for generation or vision",
+ "source": "Google Gemma documentation",
+ "canonical_name": "embeddinggemma:300m",
+ "aliases": [
+ "google/embeddinggemma-300m"
+ ],
+ "max_tokens": 0,
+ "model_type": "embedding"
+ },
+ "blip-image-captioning-base": {
+ "max_output_tokens": 512,
+ "tool_support": "none",
+ "structured_output": "none",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "224x224",
+ "384x384"
+ ],
+ "max_image_resolution": "384x384",
+ "vision_encoder": "ViT-B/16",
+ "image_patch_size": 16,
+ "image_tokenization_method": "patch_based",
+ "base_image_tokens": 577,
+ "notes": "Salesforce BLIP image captioning model, primarily for image-to-text tasks",
+ "source": "Salesforce BLIP documentation",
+ "canonical_name": "blip-image-captioning-base",
+ "aliases": [
+ "Salesforce/blip-image-captioning-base"
+ ],
+ "max_tokens": 512
+ },
+ "glyph": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "max_image_resolution": "variable",
+ "base_model": "GLM-4.1V-9B-Base",
+ "total_parameters": "10B",
+ "tensor_type": "BF16",
+ "image_tokenization_method": "visual_text_compression",
+ "optimized_for_glyph": true,
+ "text_image_processing": true,
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "conversation_template": {
+ "system_prefix": "<|system|>\n",
+ "system_suffix": "\n",
+ "user_prefix": "<|user|>\n",
+ "user_suffix": "\n",
+ "assistant_prefix": "<|assistant|>\n",
+ "assistant_suffix": "\n"
+ },
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "Glyph framework for scaling context windows via visual-text compression. Built on GLM-4.1V-9B-Base. Renders long text into images for VLM processing. Requires AutoModelForImageTextToText and AutoProcessor with trust_remote_code=True.",
+ "source": "HuggingFace zai-org/Glyph model card",
+ "canonical_name": "glyph",
+ "aliases": [
+ "zai-org/Glyph"
+ ],
+ "max_tokens": 131072,
+ "license": "MIT",
+ "arxiv": "2510.17800",
+ "repository": "https://github.com/thu-coai/Glyph"
+ },
+ "glm-4.1v-9b-base": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "max_image_resolution": "variable",
+ "total_parameters": "9B",
+ "base_model": "GLM-4-9B-0414",
+ "image_tokenization_method": "glm_vision_encoder",
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "GLM-4.1V 9B base model, backbone for Glyph visual-text compression framework",
+ "source": "HuggingFace zai-org/GLM-4.1V-9B-Base",
+ "canonical_name": "glm-4.1v-9b-base",
+ "aliases": [
+ "zai-org/GLM-4.1V-9B-Base"
+ ],
+ "max_tokens": 131072
+ },
+ "glm-4.1v-9b-thinking": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 4096x4096"
+ ],
+ "max_image_resolution": "4096x4096",
+ "total_parameters": "10B",
+ "base_model": "GLM-4-9B-0414",
+ "image_tokenization_method": "glm_vision_encoder",
+ "thinking_support": true,
+ "reasoning_paradigm": "chain_of_thought",
+ "adaptive_resolution": true,
+ "aspect_ratio_support": "arbitrary",
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "GLM-4.1V-9B-Thinking with Chain-of-Thought reasoning, 64K context, arbitrary aspect ratios up to 4K resolution. First reasoning-focused VLM in the series, matches 72B models on 18 benchmark tasks.",
+ "source": "HuggingFace zai-org/GLM-4.1V-9B-Thinking and GitHub zai-org/GLM-V",
+ "canonical_name": "glm-4.1v-9b-thinking",
+ "aliases": [
+ "zai-org/GLM-4.1V-9B-Thinking",
+ "glm-4.1v-thinking",
+ "glm4.1v-9b-thinking"
+ ],
+ "max_tokens": 65536,
+ "arxiv": "2507.01006"
+ },
+ "mistral-small-3.1-24b-instruct": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 2048x2048"
+ ],
+ "max_image_resolution": "2048x2048",
+ "image_tokenization_method": "mistral_vision_encoder",
+ "notes": "Mistral Small 3.1 with 24B parameters, 128K context, multimodal understanding. Released March 2025.",
+ "source": "Mistral AI documentation and HuggingFace",
+ "canonical_name": "mistral-small-3.1-24b-instruct",
+ "aliases": [
+ "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+ ],
+ "max_tokens": 131072,
+ "total_parameters": "24B",
+ "release_date": "2025-03-17"
+ },
+ "mistral-small-3.2-24b-instruct": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 2048x2048"
+ ],
+ "max_image_resolution": "2048x2048",
+ "image_tokenization_method": "mistral_vision_encoder",
+ "tensor_type": "BF16",
+ "gpu_memory_required": "55GB",
+ "notes": "Mistral Small 3.2 with 24B parameters, 128K context. Improved instruction following, reduced repetition, enhanced function calling. Released June 2025.",
+ "source": "HuggingFace mistralai/Mistral-Small-3.2-24B-Instruct-2506",
+ "canonical_name": "mistral-small-3.2-24b-instruct",
+ "aliases": [
+ "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
+ ],
+ "max_tokens": 131072,
+ "total_parameters": "24B",
+ "release_date": "2025-06-01"
+ },
+ "llama-4-scout": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "17B",
+ "total_parameters": "109B",
+ "experts": 16,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Scout with MoE architecture, 17B active/109B total parameters, 10M context window. Multimodal with early fusion. Released April 2025.",
+ "source": "Meta LLaMA 4 documentation and NVIDIA docs",
+ "canonical_name": "llama-4-scout",
+ "aliases": [
+ "llama4-17b-scout-16e-instruct",
+ "llama-4-17b-scout-16e-instruct"
+ ],
+ "max_tokens": 10000000,
+ "release_date": "2025-04-05",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "llama-4-maverick": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "17B",
+ "total_parameters": "400B",
+ "experts": 128,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Maverick with MoE architecture, 17B active/400B total parameters, 1M context window. Optimized for coding and reasoning. Released April 2025.",
+ "source": "Meta LLaMA 4 documentation and Oracle docs",
+ "canonical_name": "llama-4-maverick",
+ "aliases": [
+ "llama4-17b-maverick-128e-instruct"
+ ],
+ "max_tokens": 1000000,
+ "release_date": "2025-04-05",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "llama-4-behemoth": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "288B",
+ "total_parameters": "2T",
+ "experts": 16,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Behemoth teacher model with 288B active/2T total parameters. Designed for distilling performance into smaller models. Announced April 2025.",
+ "source": "Meta LLaMA 4 announcement and PromptHub",
+ "canonical_name": "llama-4-behemoth",
+ "aliases": [
+ "llama4-288b-behemoth-16e"
+ ],
+ "max_tokens": 1000000,
+ "release_date": "2025-04-05",
+ "status": "announced",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "minimax-m2": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": false,
+ "audio_support": false,
+ "video_support": false,
+ "thinking_support": true,
+ "architecture": "mixture_of_experts",
+ "active_parameters": "10B",
+ "total_parameters": "230B",
+ "thinking_paradigm": "interleaved_thinking",
+ "thinking_format": "<think>...</think>",
+ "notes": "MiniMax M2 MoE model optimized for coding and agentic workflows. Industry-leading 204K token context window. Uses interleaved thinking with <think> tags for reasoning. 10B active parameters from 230B total. Achieves strong performance on SWE-Bench and Terminal-Bench tasks. Supports complete tool calling for agent workflows.",
+ "source": "MiniMax official docs (minimax-m2.org, HuggingFace, GitHub)",
+ "canonical_name": "minimax-m2",
+ "aliases": [
+ "MiniMaxAI/MiniMax-M2",
+ "mlx-community/minimax-m2",
+ "mlx-community/MiniMax-M2",
+ "unsloth/MiniMax-M2-GGUF",
+ "minimax-m2-230b",
+ "minimax-m2-10b-active"
+ ],
+ "max_tokens": 208896,
+ "release_date": "2025-01",
+ "license": "Apache-2.0"
  }
  },
  "tool_support_levels": {
@@ -1605,6 +2077,73 @@
  "video_support": "Video processing capabilities",
  "fim_support": "Fill-in-the-middle code completion"
  },
+ "vlm_tokenization_research": {
+ "openai_gpt4v_formula": {
+ "step1": "Resize to fit 2048x2048 (preserve aspect ratio)",
+ "step2": "Resize shortest side to 768px",
+ "step3": "Calculate tiles: ceil(width/512) * ceil(height/512)",
+ "step4": "Total tokens = 85 + (tiles * 170)",
+ "low_detail": "Fixed 85 tokens regardless of size",
+ "research_source": "OpenAI official documentation + Image Tokenization research"
+ },
+ "anthropic_claude_formula": {
+ "formula": "min((width * height) / 750, 1600)",
+ "pixel_divisor": 750,
+ "token_cap": 1600,
+ "resize_trigger": "max(width, height) > 1568",
+ "warning_threshold": "min(width, height) < 200",
+ "research_source": "Anthropic Claude documentation + research analysis"
+ },
+ "google_gemini_formula": {
+ "small_image": "width <= 384 AND height <= 384 \u2192 258 tokens",
+ "large_image": "ceil(width/768) * ceil(height/768) * 258 tokens",
+ "small_threshold": 384,
+ "tile_size": 768,
+ "tokens_per_tile": 258,
+ "research_source": "Google Gemini documentation + research analysis"
+ },
+ "qwen_vl_adaptive_formula": {
+ "formula": "min(ceil(width/patch_size) * ceil(height/patch_size), max_tokens)",
+ "adaptive_resize": "Resize to fit within [min_res, max_res] range",
+ "patch_sizes": {
+ "qwen2.5": 14,
+ "qwen3": 16
+ },
+ "research_source": "Qwen-VL technical documentation + research"
+ },
+ "vision_transformer_baseline": {
+ "standard_patch_size": 16,
+ "formula": "tokens = (height * width) / (patch_size^2)",
+ "typical_range": [
+ 196,
+ 2048
+ ],
+ "research_source": "Vision Transformer foundational paper"
+ }
+ },
+ "generic_vision_model": {
+ "max_output_tokens": 4096,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1024x1024"
+ ],
+ "max_image_resolution": "1024x1024",
+ "image_patch_size": 16,
+ "max_image_tokens": 2048,
+ "image_tokenization_method": "patch_based",
+ "adaptive_resolution": false,
+ "vision_encoder": "generic_vit",
+ "notes": "Generic vision model fallback with conservative parameters that should work with most VLMs",
+ "source": "AbstractCore generic fallback",
+ "canonical_name": "generic_vision_model",
+ "aliases": [],
+ "max_tokens": 32768
+ },
  "default_capabilities": {
  "max_output_tokens": 4096,
  "tool_support": "none",