abstractcore 2.5.2__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. abstractcore/__init__.py +19 -1
  2. abstractcore/architectures/detection.py +252 -6
  3. abstractcore/assets/architecture_formats.json +14 -1
  4. abstractcore/assets/model_capabilities.json +533 -10
  5. abstractcore/compression/__init__.py +29 -0
  6. abstractcore/compression/analytics.py +420 -0
  7. abstractcore/compression/cache.py +250 -0
  8. abstractcore/compression/config.py +279 -0
  9. abstractcore/compression/exceptions.py +30 -0
  10. abstractcore/compression/glyph_processor.py +381 -0
  11. abstractcore/compression/optimizer.py +388 -0
  12. abstractcore/compression/orchestrator.py +380 -0
  13. abstractcore/compression/pil_text_renderer.py +818 -0
  14. abstractcore/compression/quality.py +226 -0
  15. abstractcore/compression/text_formatter.py +666 -0
  16. abstractcore/compression/vision_compressor.py +371 -0
  17. abstractcore/config/main.py +64 -0
  18. abstractcore/config/manager.py +100 -5
  19. abstractcore/core/retry.py +2 -2
  20. abstractcore/core/session.py +193 -7
  21. abstractcore/download.py +253 -0
  22. abstractcore/embeddings/manager.py +2 -2
  23. abstractcore/events/__init__.py +113 -2
  24. abstractcore/exceptions/__init__.py +49 -2
  25. abstractcore/media/auto_handler.py +312 -18
  26. abstractcore/media/handlers/local_handler.py +14 -2
  27. abstractcore/media/handlers/openai_handler.py +62 -3
  28. abstractcore/media/processors/__init__.py +11 -1
  29. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  30. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  31. abstractcore/media/processors/image_processor.py +7 -1
  32. abstractcore/media/processors/office_processor.py +2 -2
  33. abstractcore/media/processors/text_processor.py +18 -3
  34. abstractcore/media/types.py +164 -7
  35. abstractcore/media/utils/image_scaler.py +2 -2
  36. abstractcore/media/vision_fallback.py +2 -2
  37. abstractcore/providers/__init__.py +18 -0
  38. abstractcore/providers/anthropic_provider.py +228 -8
  39. abstractcore/providers/base.py +378 -11
  40. abstractcore/providers/huggingface_provider.py +563 -23
  41. abstractcore/providers/lmstudio_provider.py +284 -4
  42. abstractcore/providers/mlx_provider.py +27 -2
  43. abstractcore/providers/model_capabilities.py +352 -0
  44. abstractcore/providers/ollama_provider.py +282 -6
  45. abstractcore/providers/openai_provider.py +286 -8
  46. abstractcore/providers/registry.py +85 -13
  47. abstractcore/providers/streaming.py +2 -2
  48. abstractcore/server/app.py +91 -81
  49. abstractcore/tools/common_tools.py +2 -2
  50. abstractcore/tools/handler.py +2 -2
  51. abstractcore/tools/parser.py +2 -2
  52. abstractcore/tools/registry.py +2 -2
  53. abstractcore/tools/syntax_rewriter.py +2 -2
  54. abstractcore/tools/tag_rewriter.py +3 -3
  55. abstractcore/utils/__init__.py +4 -1
  56. abstractcore/utils/self_fixes.py +2 -2
  57. abstractcore/utils/trace_export.py +287 -0
  58. abstractcore/utils/version.py +1 -1
  59. abstractcore/utils/vlm_token_calculator.py +655 -0
  60. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/METADATA +207 -8
  61. abstractcore-2.6.0.dist-info/RECORD +108 -0
  62. abstractcore-2.5.2.dist-info/RECORD +0 -90
  63. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/WHEEL +0 -0
  64. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/entry_points.txt +0 -0
  65. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/licenses/LICENSE +0 -0
  66. {abstractcore-2.5.2.dist-info → abstractcore-2.6.0.dist-info}/top_level.txt +0 -0
abstractcore/assets/model_capabilities.json
@@ -13,6 +13,25 @@
  "aliases": [],
  "max_tokens": 128000
  },
+ "gpt-4-turbo": {
+ "max_output_tokens": 4096,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "notes": "GPT-4 Turbo with vision capabilities",
+ "source": "OpenAI official docs 2025",
+ "canonical_name": "gpt-4-turbo",
+ "aliases": [
+ "gpt-4-turbo-preview"
+ ],
+ "max_tokens": 128000
+ },
  "gpt-4-turbo-with-vision": {
  "max_output_tokens": 4096,
  "tool_support": "native",
@@ -45,6 +64,18 @@
  "image_resolutions": [
  "variable"
  ],
+ "image_tokenization_method": "tile_based",
+ "base_image_tokens": 85,
+ "tokens_per_tile": 170,
+ "tile_size": "512x512",
+ "max_image_dimension": 2048,
+ "short_side_resize_target": 768,
+ "detail_levels": [
+ "low",
+ "high",
+ "auto"
+ ],
+ "low_detail_tokens": 85,
  "notes": "Multimodal omni model, 2x faster, half price, 5x higher rate limits (updated Nov 2024)",
  "source": "OpenAI official docs 2025",
  "canonical_name": "gpt-4o",
@@ -154,6 +185,12 @@
  "image_resolutions": [
  "up to 1568x1568"
  ],
+ "image_tokenization_method": "pixel_area_based",
+ "token_formula": "(width * height) / 750",
+ "pixel_divisor": 750,
+ "max_image_dimension": 1568,
+ "token_cap": 1600,
+ "min_dimension_warning": 200,
  "audio_support": false,
  "notes": "disable_parallel_tool_use option available",
  "source": "Anthropic official docs",
@@ -893,6 +930,31 @@
  "max_image_resolution": "1120x1120",
  "image_patch_size": 14,
  "max_image_tokens": 6400,
+ "image_tokenization_method": "resolution_tier_based",
+ "supported_resolutions": [
+ [
+ 560,
+ 560
+ ],
+ [
+ 1120,
+ 560
+ ],
+ [
+ 560,
+ 1120
+ ],
+ [
+ 1120,
+ 1120
+ ]
+ ],
+ "base_tokens_per_resolution": {
+ "560x560": 1600,
+ "1120x560": 3200,
+ "560x1120": 3200,
+ "1120x1120": 6400
+ },
  "notes": "Llama 3.2 Vision 11B model with multimodal capabilities for visual recognition and reasoning",
  "source": "Meta AI Llama 3.2 release",
  "canonical_name": "llama3.2-vision:11b",
@@ -1271,13 +1333,18 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 4B dense model with 256K context, optimized for LMStudio",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 4B dense model with 256K context, optimized for LMStudio. Parameters: 4.83B. FP8 checkpoints available.",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-4b",
  "aliases": [
@@ -1294,13 +1361,18 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 8B dense model with 256K context, optimized for LMStudio",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 8B dense model with 256K context, optimized for LMStudio. Parameters: 8.77B. FP8 checkpoints available.",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-8b",
  "aliases": [
@@ -1317,19 +1389,24 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 30B MoE model (30.5B total/3.3B active), best performing vision model, 256K context",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 30B MoE model (30.5B total/3.3B active), best performing vision model, 128K context",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-30b",
  "aliases": [
  "qwen/qwen3-vl-30b"
  ],
- "max_tokens": 262144
+ "max_tokens": 131072
  },
  "qwen2.5-vl-7b": {
  "max_output_tokens": 8192,
@@ -1345,6 +1422,11 @@
  "image_patch_size": 14,
  "max_image_tokens": 16384,
  "pixel_grouping": "28x28",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 56,
+ "max_resolution": 3584,
+ "vision_encoder": "ViT-based",
  "notes": "Qwen2.5-VL 7B parameter vision model, 28x28 pixel patches, max 3584x3584 resolution",
  "source": "Alibaba official docs",
  "canonical_name": "qwen2.5-vl-7b",
@@ -1369,6 +1451,12 @@
  "vision_encoder": "SigLIP-400M",
  "image_tokens_per_image": 256,
  "adaptive_windowing": true,
+ "image_tokenization_method": "fixed_resolution",
+ "fixed_resolution": [
+ 896,
+ 896
+ ],
+ "preprocessing": "automatic_resize_and_crop",
  "notes": "Gemma3 4B parameter model with vision support, 896x896 fixed resolution with adaptive windowing",
  "source": "Google Gemma3 documentation 2025",
  "canonical_name": "gemma3-4b",
@@ -1563,6 +1651,7 @@
  "max_image_resolution": "768x768",
  "vision_encoder": "SigLIP2-so400m-patch14-384",
  "image_patch_size": 14,
+ "image_tokenization_method": "patch_based",
  "notes": "IBM Granite 3.2-Vision 2B model with SigLIP2 encoder, optimized for visual document understanding",
  "source": "IBM Granite 3.2 technical report arXiv:2502.09927",
  "canonical_name": "granite3.2-vision:2b",
@@ -1574,6 +1663,58 @@
  ],
  "max_tokens": 32768
  },
+ "gemini-2.5-flash": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": true,
+ "image_resolutions": [
+ "224x224",
+ "448x448",
+ "1024x1024"
+ ],
+ "max_image_resolution": "768x768",
+ "image_tokenization_method": "gemini_vision_encoder",
+ "thinking_support": true,
+ "thinking_budget": true,
+ "notes": "Optimized for speed and efficiency, suitable for high-volume, latency-sensitive tasks. Supports configurable thinking budgets",
+ "source": "Google AI official docs 2025",
+ "canonical_name": "gemini-2.5-flash",
+ "aliases": [
+ "gemini-2.5-flash-001"
+ ],
+ "max_tokens": 1000000
+ },
+ "gemini-2.5-pro": {
+ "max_output_tokens": 65536,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": true,
+ "image_resolutions": [
+ "224x224",
+ "448x448",
+ "1024x1024"
+ ],
+ "max_image_resolution": "768x768",
+ "image_tokenization_method": "gemini_vision_encoder",
+ "thinking_support": true,
+ "thinking_budget": true,
+ "notes": "Most advanced Gemini model for complex reasoning, coding, and mathematical problem-solving. Features Deep Think mode for enhanced reasoning",
+ "source": "Google AI official docs 2025",
+ "canonical_name": "gemini-2.5-pro",
+ "aliases": [
+ "gemini-2.5-pro-001"
+ ],
+ "max_tokens": 1048576
+ },
  "granite3.3:2b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
@@ -1603,6 +1744,321 @@
  "granite3.3-8b"
  ],
  "max_tokens": 32768
+ },
+ "embeddinggemma:300m": {
+ "max_output_tokens": 0,
+ "tool_support": "none",
+ "structured_output": "none",
+ "parallel_tools": false,
+ "vision_support": false,
+ "audio_support": false,
+ "notes": "Text embedding model, not for generation or vision",
+ "source": "Google Gemma documentation",
+ "canonical_name": "embeddinggemma:300m",
+ "aliases": [
+ "google/embeddinggemma-300m"
+ ],
+ "max_tokens": 0,
+ "model_type": "embedding"
+ },
+ "blip-image-captioning-base": {
+ "max_output_tokens": 512,
+ "tool_support": "none",
+ "structured_output": "none",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "224x224",
+ "384x384"
+ ],
+ "max_image_resolution": "384x384",
+ "vision_encoder": "ViT-B/16",
+ "image_patch_size": 16,
+ "image_tokenization_method": "patch_based",
+ "base_image_tokens": 577,
+ "notes": "Salesforce BLIP image captioning model, primarily for image-to-text tasks",
+ "source": "Salesforce BLIP documentation",
+ "canonical_name": "blip-image-captioning-base",
+ "aliases": [
+ "Salesforce/blip-image-captioning-base"
+ ],
+ "max_tokens": 512
+ },
+ "glyph": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "max_image_resolution": "variable",
+ "base_model": "GLM-4.1V-9B-Base",
+ "total_parameters": "10B",
+ "tensor_type": "BF16",
+ "image_tokenization_method": "visual_text_compression",
+ "optimized_for_glyph": true,
+ "text_image_processing": true,
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "conversation_template": {
+ "system_prefix": "<|system|>\n",
+ "system_suffix": "\n",
+ "user_prefix": "<|user|>\n",
+ "user_suffix": "\n",
+ "assistant_prefix": "<|assistant|>\n",
+ "assistant_suffix": "\n"
+ },
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "Glyph framework for scaling context windows via visual-text compression. Built on GLM-4.1V-9B-Base. Renders long text into images for VLM processing. Requires AutoModelForImageTextToText and AutoProcessor with trust_remote_code=True.",
+ "source": "HuggingFace zai-org/Glyph model card",
+ "canonical_name": "glyph",
+ "aliases": [
+ "zai-org/Glyph"
+ ],
+ "max_tokens": 131072,
+ "license": "MIT",
+ "arxiv": "2510.17800",
+ "repository": "https://github.com/thu-coai/Glyph"
+ },
+ "glm-4.1v-9b-base": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "max_image_resolution": "variable",
+ "total_parameters": "9B",
+ "base_model": "GLM-4-9B-0414",
+ "image_tokenization_method": "glm_vision_encoder",
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "GLM-4.1V 9B base model, backbone for Glyph visual-text compression framework",
+ "source": "HuggingFace zai-org/GLM-4.1V-9B-Base",
+ "canonical_name": "glm-4.1v-9b-base",
+ "aliases": [
+ "zai-org/GLM-4.1V-9B-Base"
+ ],
+ "max_tokens": 131072
+ },
+ "glm-4.1v-9b-thinking": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 4096x4096"
+ ],
+ "max_image_resolution": "4096x4096",
+ "total_parameters": "10B",
+ "base_model": "GLM-4-9B-0414",
+ "image_tokenization_method": "glm_vision_encoder",
+ "thinking_support": true,
+ "reasoning_paradigm": "chain_of_thought",
+ "adaptive_resolution": true,
+ "aspect_ratio_support": "arbitrary",
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "GLM-4.1V-9B-Thinking with Chain-of-Thought reasoning, 64K context, arbitrary aspect ratios up to 4K resolution. First reasoning-focused VLM in the series, matches 72B models on 18 benchmark tasks.",
+ "source": "HuggingFace zai-org/GLM-4.1V-9B-Thinking and GitHub zai-org/GLM-V",
+ "canonical_name": "glm-4.1v-9b-thinking",
+ "aliases": [
+ "zai-org/GLM-4.1V-9B-Thinking",
+ "glm-4.1v-thinking",
+ "glm4.1v-9b-thinking"
+ ],
+ "max_tokens": 65536,
+ "arxiv": "2507.01006"
+ },
+ "mistral-small-3.1-24b-instruct": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 2048x2048"
+ ],
+ "max_image_resolution": "2048x2048",
+ "image_tokenization_method": "mistral_vision_encoder",
+ "notes": "Mistral Small 3.1 with 24B parameters, 128K context, multimodal understanding. Released March 2025.",
+ "source": "Mistral AI documentation and HuggingFace",
+ "canonical_name": "mistral-small-3.1-24b-instruct",
+ "aliases": [
+ "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+ ],
+ "max_tokens": 131072,
+ "total_parameters": "24B",
+ "release_date": "2025-03-17"
+ },
+ "mistral-small-3.2-24b-instruct": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 2048x2048"
+ ],
+ "max_image_resolution": "2048x2048",
+ "image_tokenization_method": "mistral_vision_encoder",
+ "tensor_type": "BF16",
+ "gpu_memory_required": "55GB",
+ "notes": "Mistral Small 3.2 with 24B parameters, 128K context. Improved instruction following, reduced repetition, enhanced function calling. Released June 2025.",
+ "source": "HuggingFace mistralai/Mistral-Small-3.2-24B-Instruct-2506",
+ "canonical_name": "mistral-small-3.2-24b-instruct",
+ "aliases": [
+ "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
+ ],
+ "max_tokens": 131072,
+ "total_parameters": "24B",
+ "release_date": "2025-06-01"
+ },
+ "llama-4-scout": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "17B",
+ "total_parameters": "109B",
+ "experts": 16,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Scout with MoE architecture, 17B active/109B total parameters, 10M context window. Multimodal with early fusion. Released April 2025.",
+ "source": "Meta LLaMA 4 documentation and NVIDIA docs",
+ "canonical_name": "llama-4-scout",
+ "aliases": [
+ "llama4-17b-scout-16e-instruct",
+ "llama-4-17b-scout-16e-instruct"
+ ],
+ "max_tokens": 10000000,
+ "release_date": "2025-04-05",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "llama-4-maverick": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "17B",
+ "total_parameters": "400B",
+ "experts": 128,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Maverick with MoE architecture, 17B active/400B total parameters, 1M context window. Optimized for coding and reasoning. Released April 2025.",
+ "source": "Meta LLaMA 4 documentation and Oracle docs",
+ "canonical_name": "llama-4-maverick",
+ "aliases": [
+ "llama4-17b-maverick-128e-instruct"
+ ],
+ "max_tokens": 1000000,
+ "release_date": "2025-04-05",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "llama-4-behemoth": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "288B",
+ "total_parameters": "2T",
+ "experts": 16,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Behemoth teacher model with 288B active/2T total parameters. Designed for distilling performance into smaller models. Announced April 2025.",
+ "source": "Meta LLaMA 4 announcement and PromptHub",
+ "canonical_name": "llama-4-behemoth",
+ "aliases": [
+ "llama4-288b-behemoth-16e"
+ ],
+ "max_tokens": 1000000,
+ "release_date": "2025-04-05",
+ "status": "announced",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "minimax-m2": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": false,
+ "audio_support": false,
+ "video_support": false,
+ "thinking_support": true,
+ "architecture": "mixture_of_experts",
+ "active_parameters": "10B",
+ "total_parameters": "230B",
+ "thinking_paradigm": "interleaved_thinking",
+ "thinking_format": "<think>...</think>",
+ "notes": "MiniMax M2 MoE model optimized for coding and agentic workflows. Industry-leading 204K token context window. Uses interleaved thinking with <think> tags for reasoning. 10B active parameters from 230B total. Achieves strong performance on SWE-Bench and Terminal-Bench tasks. Supports complete tool calling for agent workflows.",
+ "source": "MiniMax official docs (minimax-m2.org, HuggingFace, GitHub)",
+ "canonical_name": "minimax-m2",
+ "aliases": [
+ "MiniMaxAI/MiniMax-M2",
+ "mlx-community/minimax-m2",
+ "mlx-community/MiniMax-M2",
+ "unsloth/MiniMax-M2-GGUF",
+ "minimax-m2-230b",
+ "minimax-m2-10b-active"
+ ],
+ "max_tokens": 208896,
+ "release_date": "2025-01",
+ "license": "Apache-2.0"
  }
  },
  "tool_support_levels": {
@@ -1621,6 +2077,73 @@
  "video_support": "Video processing capabilities",
  "fim_support": "Fill-in-the-middle code completion"
  },
+ "vlm_tokenization_research": {
+ "openai_gpt4v_formula": {
+ "step1": "Resize to fit 2048x2048 (preserve aspect ratio)",
+ "step2": "Resize shortest side to 768px",
+ "step3": "Calculate tiles: ceil(width/512) * ceil(height/512)",
+ "step4": "Total tokens = 85 + (tiles * 170)",
+ "low_detail": "Fixed 85 tokens regardless of size",
+ "research_source": "OpenAI official documentation + Image Tokenization research"
+ },
+ "anthropic_claude_formula": {
+ "formula": "min((width * height) / 750, 1600)",
+ "pixel_divisor": 750,
+ "token_cap": 1600,
+ "resize_trigger": "max(width, height) > 1568",
+ "warning_threshold": "min(width, height) < 200",
+ "research_source": "Anthropic Claude documentation + research analysis"
+ },
+ "google_gemini_formula": {
+ "small_image": "width <= 384 AND height <= 384 \u2192 258 tokens",
+ "large_image": "ceil(width/768) * ceil(height/768) * 258 tokens",
+ "small_threshold": 384,
+ "tile_size": 768,
+ "tokens_per_tile": 258,
+ "research_source": "Google Gemini documentation + research analysis"
+ },
+ "qwen_vl_adaptive_formula": {
+ "formula": "min(ceil(width/patch_size) * ceil(height/patch_size), max_tokens)",
+ "adaptive_resize": "Resize to fit within [min_res, max_res] range",
+ "patch_sizes": {
+ "qwen2.5": 14,
+ "qwen3": 16
+ },
+ "research_source": "Qwen-VL technical documentation + research"
+ },
+ "vision_transformer_baseline": {
+ "standard_patch_size": 16,
+ "formula": "tokens = (height * width) / (patch_size^2)",
+ "typical_range": [
+ 196,
+ 2048
+ ],
+ "research_source": "Vision Transformer foundational paper"
+ }
+ },
+ "generic_vision_model": {
+ "max_output_tokens": 4096,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1024x1024"
+ ],
+ "max_image_resolution": "1024x1024",
+ "image_patch_size": 16,
+ "max_image_tokens": 2048,
+ "image_tokenization_method": "patch_based",
+ "adaptive_resolution": false,
+ "vision_encoder": "generic_vit",
+ "notes": "Generic vision model fallback with conservative parameters that should work with most VLMs",
+ "source": "AbstractCore generic fallback",
+ "canonical_name": "generic_vision_model",
+ "aliases": [],
+ "max_tokens": 32768
+ },
  "default_capabilities": {
  "max_output_tokens": 4096,
  "tool_support": "none",
abstractcore/compression/__init__.py
@@ -0,0 +1,29 @@
+ """
+ Glyph visual-text compression system for AbstractCore.
+
+ This module provides visual-text compression capabilities that transform long textual
+ sequences into optimized images for processing by Vision-Language Models (VLMs),
+ achieving 3-4x token compression without accuracy loss.
+
+ Based on the Glyph framework by Z.ai/THU-COAI with AbstractCore-specific enhancements.
+ """
+
+ from .glyph_processor import GlyphProcessor
+ from .orchestrator import CompressionOrchestrator
+ from .config import GlyphConfig, RenderingConfig
+ from .quality import QualityValidator, CompressionStats
+ from .cache import CompressionCache
+ from .exceptions import CompressionError, CompressionQualityError
+
+ __all__ = [
+ 'GlyphProcessor',
+ 'CompressionOrchestrator',
+ 'GlyphConfig',
+ 'RenderingConfig',
+ 'QualityValidator',
+ 'CompressionStats',
+ 'CompressionCache',
+ 'CompressionError',
+ 'CompressionQualityError'
+ ]
+
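
For orientation on the new compression package: only the class names exported above are known from this diff. The constructor arguments and the compress() call in the sketch below are assumptions for illustration, not the released API.

from abstractcore.compression import GlyphConfig, GlyphProcessor

# Hypothetical usage -- argument shapes and the compress() method are assumed;
# only the imported classes come from the __init__.py shown above.
config = GlyphConfig()
processor = GlyphProcessor(config)
# compressed = processor.compress(long_document_text)  # expected ~3-4x token reduction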