abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,25 @@
  "aliases": [],
  "max_tokens": 128000
  },
+ "gpt-4-turbo": {
+ "max_output_tokens": 4096,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "notes": "GPT-4 Turbo with vision capabilities",
+ "source": "OpenAI official docs 2025",
+ "canonical_name": "gpt-4-turbo",
+ "aliases": [
+ "gpt-4-turbo-preview"
+ ],
+ "max_tokens": 128000
+ },
  "gpt-4-turbo-with-vision": {
  "max_output_tokens": 4096,
  "tool_support": "native",
@@ -45,6 +64,18 @@
  "image_resolutions": [
  "variable"
  ],
+ "image_tokenization_method": "tile_based",
+ "base_image_tokens": 85,
+ "tokens_per_tile": 170,
+ "tile_size": "512x512",
+ "max_image_dimension": 2048,
+ "short_side_resize_target": 768,
+ "detail_levels": [
+ "low",
+ "high",
+ "auto"
+ ],
+ "low_detail_tokens": 85,
  "notes": "Multimodal omni model, 2x faster, half price, 5x higher rate limits (updated Nov 2024)",
  "source": "OpenAI official docs 2025",
  "canonical_name": "gpt-4o",
@@ -154,6 +185,12 @@
  "image_resolutions": [
  "up to 1568x1568"
  ],
+ "image_tokenization_method": "pixel_area_based",
+ "token_formula": "(width * height) / 750",
+ "pixel_divisor": 750,
+ "max_image_dimension": 1568,
+ "token_cap": 1600,
+ "min_dimension_warning": 200,
  "audio_support": false,
  "notes": "disable_parallel_tool_use option available",
  "source": "Anthropic official docs",
@@ -316,7 +353,7 @@
  "llama-3.2-1b": {
  "max_output_tokens": 2048,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -329,7 +366,7 @@
  "llama-3.2-3b": {
  "max_output_tokens": 2048,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -342,7 +379,7 @@
  "llama-3.2-11b-vision": {
  "max_output_tokens": 2048,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": true,
  "image_resolutions": [
@@ -358,7 +395,7 @@
  "llama-3.3-70b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -371,7 +408,7 @@
  "llama-3.1-8b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -384,7 +421,7 @@
  "llama-3.1-70b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -397,7 +434,7 @@
  "llama-3.1-405b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -426,7 +463,7 @@
  "qwen2.5-0.5b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -439,7 +476,7 @@
  "qwen2.5-1.5b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -452,7 +489,7 @@
  "qwen2.5-3b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -465,7 +502,7 @@
  "qwen2.5-7b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -478,7 +515,7 @@
  "qwen2.5-14b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -491,7 +528,7 @@
  "qwen2.5-32b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -504,7 +541,7 @@
  "qwen2.5-72b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -517,7 +554,7 @@
  "qwen3-0.6b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -531,7 +568,7 @@
  "qwen3-1.7b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -545,7 +582,7 @@
  "qwen3-4b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -559,7 +596,7 @@
  "qwen3-32b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -573,7 +610,7 @@
  "qwen3-30b-a3b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -584,10 +621,26 @@
  "aliases": [],
  "max_tokens": 40960
  },
+ "qwen3-30b-a3b-2507": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "native",
+ "parallel_tools": false,
+ "vision_support": false,
+ "audio_support": false,
+ "thinking_support": true,
+ "notes": "Qwen3-30B-A3B-Instruct-2507 with enhanced reasoning, coding, and mathematical skills. Supports up to 256K context, extendable to 1M tokens",
+ "source": "Alibaba Qwen3 2507 release",
+ "canonical_name": "qwen3-30b-a3b-2507",
+ "aliases": [
+ "qwen/qwen3-30b-a3b-2507"
+ ],
+ "max_tokens": 262144
+ },
  "qwen3-coder-30b": {
  "max_output_tokens": 8192,
  "tool_support": "native",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": true,
  "vision_support": false,
  "audio_support": false,
@@ -600,7 +653,7 @@
  "qwen2-vl": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": true,
  "image_resolutions": [
@@ -643,7 +696,7 @@
  "phi-3-mini": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -656,7 +709,7 @@
  "phi-3-small": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -668,7 +721,7 @@
  "phi-3-medium": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -680,7 +733,7 @@
  "phi-3.5-mini": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -692,7 +745,7 @@
  "phi-3.5-moe": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -705,7 +758,7 @@
  "phi-3-vision": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": true,
  "image_resolutions": [
@@ -720,7 +773,7 @@
  "phi-4": {
  "max_output_tokens": 16000,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -733,7 +786,7 @@
  "mistral-7b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -877,6 +930,31 @@
  "max_image_resolution": "1120x1120",
  "image_patch_size": 14,
  "max_image_tokens": 6400,
+ "image_tokenization_method": "resolution_tier_based",
+ "supported_resolutions": [
+ [
+ 560,
+ 560
+ ],
+ [
+ 1120,
+ 560
+ ],
+ [
+ 560,
+ 1120
+ ],
+ [
+ 1120,
+ 1120
+ ]
+ ],
+ "base_tokens_per_resolution": {
+ "560x560": 1600,
+ "1120x560": 3200,
+ "560x1120": 3200,
+ "1120x1120": 6400
+ },
  "notes": "Llama 3.2 Vision 11B model with multimodal capabilities for visual recognition and reasoning",
  "source": "Meta AI Llama 3.2 release",
  "canonical_name": "llama3.2-vision:11b",
@@ -941,7 +1019,7 @@
  "gemma-2b": {
  "max_output_tokens": 8192,
  "tool_support": "none",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -953,7 +1031,7 @@
  "gemma-7b": {
  "max_output_tokens": 8192,
  "tool_support": "none",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1002,7 +1080,7 @@
  "codegemma": {
  "max_output_tokens": 8192,
  "tool_support": "none",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1033,7 +1111,7 @@
  "glm-4": {
  "max_output_tokens": 4096,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1085,7 +1163,7 @@
  "qwen3": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
- "structured_output": "prompted",
+ "structured_output": "native",
  "parallel_tools": false,
  "vision_support": false,
  "audio_support": false,
@@ -1255,13 +1333,18 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 4B dense model with 256K context, optimized for LMStudio",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 4B dense model with 256K context, optimized for LMStudio. Parameters: 4.83B. FP8 checkpoints available.",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-4b",
  "aliases": [
@@ -1278,13 +1361,18 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 8B dense model with 256K context, optimized for LMStudio",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 8B dense model with 256K context, optimized for LMStudio. Parameters: 8.77B. FP8 checkpoints available.",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-8b",
  "aliases": [
@@ -1301,19 +1389,24 @@
  "video_support": true,
  "audio_support": false,
  "image_resolutions": [
- "variable"
+ "64x64 to 4096x4096"
  ],
- "max_image_resolution": "variable",
+ "max_image_resolution": "4096x4096",
  "image_patch_size": 16,
  "max_image_tokens": 24576,
  "pixel_grouping": "32x32",
- "notes": "Qwen3-VL 30B MoE model (30.5B total/3.3B active), best performing vision model, 256K context",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 64,
+ "max_resolution": 4096,
+ "vision_encoder": "ViT-based",
+ "notes": "Qwen3-VL 30B MoE model (30.5B total/3.3B active), best performing vision model, 128K context",
  "source": "Alibaba Qwen3-VL technical report 2025",
  "canonical_name": "qwen3-vl-30b",
  "aliases": [
  "qwen/qwen3-vl-30b"
  ],
- "max_tokens": 262144
+ "max_tokens": 131072
  },
  "qwen2.5-vl-7b": {
  "max_output_tokens": 8192,
@@ -1329,6 +1422,11 @@
  "image_patch_size": 14,
  "max_image_tokens": 16384,
  "pixel_grouping": "28x28",
+ "image_tokenization_method": "patch_based_adaptive",
+ "adaptive_resolution": true,
+ "min_resolution": 56,
+ "max_resolution": 3584,
+ "vision_encoder": "ViT-based",
  "notes": "Qwen2.5-VL 7B parameter vision model, 28x28 pixel patches, max 3584x3584 resolution",
  "source": "Alibaba official docs",
  "canonical_name": "qwen2.5-vl-7b",
@@ -1353,6 +1451,12 @@
  "vision_encoder": "SigLIP-400M",
  "image_tokens_per_image": 256,
  "adaptive_windowing": true,
+ "image_tokenization_method": "fixed_resolution",
+ "fixed_resolution": [
+ 896,
+ 896
+ ],
+ "preprocessing": "automatic_resize_and_crop",
  "notes": "Gemma3 4B parameter model with vision support, 896x896 fixed resolution with adaptive windowing",
  "source": "Google Gemma3 documentation 2025",
  "canonical_name": "gemma3-4b",
@@ -1547,6 +1651,7 @@
  "max_image_resolution": "768x768",
  "vision_encoder": "SigLIP2-so400m-patch14-384",
  "image_patch_size": 14,
+ "image_tokenization_method": "patch_based",
  "notes": "IBM Granite 3.2-Vision 2B model with SigLIP2 encoder, optimized for visual document understanding",
  "source": "IBM Granite 3.2 technical report arXiv:2502.09927",
  "canonical_name": "granite3.2-vision:2b",
@@ -1558,6 +1663,58 @@
  ],
  "max_tokens": 32768
  },
+ "gemini-2.5-flash": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": true,
+ "image_resolutions": [
+ "224x224",
+ "448x448",
+ "1024x1024"
+ ],
+ "max_image_resolution": "768x768",
+ "image_tokenization_method": "gemini_vision_encoder",
+ "thinking_support": true,
+ "thinking_budget": true,
+ "notes": "Optimized for speed and efficiency, suitable for high-volume, latency-sensitive tasks. Supports configurable thinking budgets",
+ "source": "Google AI official docs 2025",
+ "canonical_name": "gemini-2.5-flash",
+ "aliases": [
+ "gemini-2.5-flash-001"
+ ],
+ "max_tokens": 1000000
+ },
+ "gemini-2.5-pro": {
+ "max_output_tokens": 65536,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "max_tools": -1,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": true,
+ "image_resolutions": [
+ "224x224",
+ "448x448",
+ "1024x1024"
+ ],
+ "max_image_resolution": "768x768",
+ "image_tokenization_method": "gemini_vision_encoder",
+ "thinking_support": true,
+ "thinking_budget": true,
+ "notes": "Most advanced Gemini model for complex reasoning, coding, and mathematical problem-solving. Features Deep Think mode for enhanced reasoning",
+ "source": "Google AI official docs 2025",
+ "canonical_name": "gemini-2.5-pro",
+ "aliases": [
+ "gemini-2.5-pro-001"
+ ],
+ "max_tokens": 1048576
+ },
  "granite3.3:2b": {
  "max_output_tokens": 8192,
  "tool_support": "prompted",
@@ -1587,6 +1744,321 @@
  "granite3.3-8b"
  ],
  "max_tokens": 32768
+ },
+ "embeddinggemma:300m": {
+ "max_output_tokens": 0,
+ "tool_support": "none",
+ "structured_output": "none",
+ "parallel_tools": false,
+ "vision_support": false,
+ "audio_support": false,
+ "notes": "Text embedding model, not for generation or vision",
+ "source": "Google Gemma documentation",
+ "canonical_name": "embeddinggemma:300m",
+ "aliases": [
+ "google/embeddinggemma-300m"
+ ],
+ "max_tokens": 0,
+ "model_type": "embedding"
+ },
+ "blip-image-captioning-base": {
+ "max_output_tokens": 512,
+ "tool_support": "none",
+ "structured_output": "none",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "224x224",
+ "384x384"
+ ],
+ "max_image_resolution": "384x384",
+ "vision_encoder": "ViT-B/16",
+ "image_patch_size": 16,
+ "image_tokenization_method": "patch_based",
+ "base_image_tokens": 577,
+ "notes": "Salesforce BLIP image captioning model, primarily for image-to-text tasks",
+ "source": "Salesforce BLIP documentation",
+ "canonical_name": "blip-image-captioning-base",
+ "aliases": [
+ "Salesforce/blip-image-captioning-base"
+ ],
+ "max_tokens": 512
+ },
+ "glyph": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "max_image_resolution": "variable",
+ "base_model": "GLM-4.1V-9B-Base",
+ "total_parameters": "10B",
+ "tensor_type": "BF16",
+ "image_tokenization_method": "visual_text_compression",
+ "optimized_for_glyph": true,
+ "text_image_processing": true,
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "conversation_template": {
+ "system_prefix": "<|system|>\n",
+ "system_suffix": "\n",
+ "user_prefix": "<|user|>\n",
+ "user_suffix": "\n",
+ "assistant_prefix": "<|assistant|>\n",
+ "assistant_suffix": "\n"
+ },
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "Glyph framework for scaling context windows via visual-text compression. Built on GLM-4.1V-9B-Base. Renders long text into images for VLM processing. Requires AutoModelForImageTextToText and AutoProcessor with trust_remote_code=True.",
+ "source": "HuggingFace zai-org/Glyph model card",
+ "canonical_name": "glyph",
+ "aliases": [
+ "zai-org/Glyph"
+ ],
+ "max_tokens": 131072,
+ "license": "MIT",
+ "arxiv": "2510.17800",
+ "repository": "https://github.com/thu-coai/Glyph"
+ },
+ "glm-4.1v-9b-base": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "variable"
+ ],
+ "max_image_resolution": "variable",
+ "total_parameters": "9B",
+ "base_model": "GLM-4-9B-0414",
+ "image_tokenization_method": "glm_vision_encoder",
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "GLM-4.1V 9B base model, backbone for Glyph visual-text compression framework",
+ "source": "HuggingFace zai-org/GLM-4.1V-9B-Base",
+ "canonical_name": "glm-4.1v-9b-base",
+ "aliases": [
+ "zai-org/GLM-4.1V-9B-Base"
+ ],
+ "max_tokens": 131072
+ },
+ "glm-4.1v-9b-thinking": {
+ "max_output_tokens": 8192,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 4096x4096"
+ ],
+ "max_image_resolution": "4096x4096",
+ "total_parameters": "10B",
+ "base_model": "GLM-4-9B-0414",
+ "image_tokenization_method": "glm_vision_encoder",
+ "thinking_support": true,
+ "reasoning_paradigm": "chain_of_thought",
+ "adaptive_resolution": true,
+ "aspect_ratio_support": "arbitrary",
+ "architecture": "glm4v",
+ "requires_processor": true,
+ "message_format": "glm_special_tokens",
+ "model_class": "AutoModelForImageTextToText",
+ "processor_class": "AutoProcessor",
+ "trust_remote_code": true,
+ "transformers_version_min": "4.57.1",
+ "notes": "GLM-4.1V-9B-Thinking with Chain-of-Thought reasoning, 64K context, arbitrary aspect ratios up to 4K resolution. First reasoning-focused VLM in the series, matches 72B models on 18 benchmark tasks.",
+ "source": "HuggingFace zai-org/GLM-4.1V-9B-Thinking and GitHub zai-org/GLM-V",
+ "canonical_name": "glm-4.1v-9b-thinking",
+ "aliases": [
+ "zai-org/GLM-4.1V-9B-Thinking",
+ "glm-4.1v-thinking",
+ "glm4.1v-9b-thinking"
+ ],
+ "max_tokens": 65536,
+ "arxiv": "2507.01006"
+ },
+ "mistral-small-3.1-24b-instruct": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 2048x2048"
+ ],
+ "max_image_resolution": "2048x2048",
+ "image_tokenization_method": "mistral_vision_encoder",
+ "notes": "Mistral Small 3.1 with 24B parameters, 128K context, multimodal understanding. Released March 2025.",
+ "source": "Mistral AI documentation and HuggingFace",
+ "canonical_name": "mistral-small-3.1-24b-instruct",
+ "aliases": [
+ "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+ ],
+ "max_tokens": 131072,
+ "total_parameters": "24B",
+ "release_date": "2025-03-17"
+ },
+ "mistral-small-3.2-24b-instruct": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 2048x2048"
+ ],
+ "max_image_resolution": "2048x2048",
+ "image_tokenization_method": "mistral_vision_encoder",
+ "tensor_type": "BF16",
+ "gpu_memory_required": "55GB",
+ "notes": "Mistral Small 3.2 with 24B parameters, 128K context. Improved instruction following, reduced repetition, enhanced function calling. Released June 2025.",
+ "source": "HuggingFace mistralai/Mistral-Small-3.2-24B-Instruct-2506",
+ "canonical_name": "mistral-small-3.2-24b-instruct",
+ "aliases": [
+ "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
+ ],
+ "max_tokens": 131072,
+ "total_parameters": "24B",
+ "release_date": "2025-06-01"
+ },
+ "llama-4-scout": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "17B",
+ "total_parameters": "109B",
+ "experts": 16,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Scout with MoE architecture, 17B active/109B total parameters, 10M context window. Multimodal with early fusion. Released April 2025.",
+ "source": "Meta LLaMA 4 documentation and NVIDIA docs",
+ "canonical_name": "llama-4-scout",
+ "aliases": [
+ "llama4-17b-scout-16e-instruct",
+ "llama-4-17b-scout-16e-instruct"
+ ],
+ "max_tokens": 10000000,
+ "release_date": "2025-04-05",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "llama-4-maverick": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "17B",
+ "total_parameters": "400B",
+ "experts": 128,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Maverick with MoE architecture, 17B active/400B total parameters, 1M context window. Optimized for coding and reasoning. Released April 2025.",
+ "source": "Meta LLaMA 4 documentation and Oracle docs",
+ "canonical_name": "llama-4-maverick",
+ "aliases": [
+ "llama4-17b-maverick-128e-instruct"
+ ],
+ "max_tokens": 1000000,
+ "release_date": "2025-04-05",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "llama-4-behemoth": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": true,
+ "audio_support": true,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1120x1120"
+ ],
+ "max_image_resolution": "1120x1120",
+ "architecture": "mixture_of_experts",
+ "active_parameters": "288B",
+ "total_parameters": "2T",
+ "experts": 16,
+ "image_tokenization_method": "resolution_tier_based",
+ "notes": "LLaMA 4 Behemoth teacher model with 288B active/2T total parameters. Designed for distilling performance into smaller models. Announced April 2025.",
+ "source": "Meta LLaMA 4 announcement and PromptHub",
+ "canonical_name": "llama-4-behemoth",
+ "aliases": [
+ "llama4-288b-behemoth-16e"
+ ],
+ "max_tokens": 1000000,
+ "release_date": "2025-04-05",
+ "status": "announced",
+ "image_patch_size": 14,
+ "max_image_tokens": 6400
+ },
+ "minimax-m2": {
+ "max_output_tokens": 8192,
+ "tool_support": "native",
+ "structured_output": "native",
+ "parallel_tools": true,
+ "vision_support": false,
+ "audio_support": false,
+ "video_support": false,
+ "thinking_support": true,
+ "architecture": "mixture_of_experts",
+ "active_parameters": "10B",
+ "total_parameters": "230B",
+ "thinking_paradigm": "interleaved_thinking",
+ "thinking_format": "<think>...</think>",
+ "notes": "MiniMax M2 MoE model optimized for coding and agentic workflows. Industry-leading 204K token context window. Uses interleaved thinking with <think> tags for reasoning. 10B active parameters from 230B total. Achieves strong performance on SWE-Bench and Terminal-Bench tasks. Supports complete tool calling for agent workflows.",
+ "source": "MiniMax official docs (minimax-m2.org, HuggingFace, GitHub)",
+ "canonical_name": "minimax-m2",
+ "aliases": [
+ "MiniMaxAI/MiniMax-M2",
+ "mlx-community/minimax-m2",
+ "mlx-community/MiniMax-M2",
+ "unsloth/MiniMax-M2-GGUF",
+ "minimax-m2-230b",
+ "minimax-m2-10b-active"
+ ],
+ "max_tokens": 208896,
+ "release_date": "2025-01",
+ "license": "Apache-2.0"
  }
  },
  "tool_support_levels": {
@@ -1605,6 +2077,73 @@
  "video_support": "Video processing capabilities",
  "fim_support": "Fill-in-the-middle code completion"
  },
+ "vlm_tokenization_research": {
+ "openai_gpt4v_formula": {
+ "step1": "Resize to fit 2048x2048 (preserve aspect ratio)",
+ "step2": "Resize shortest side to 768px",
+ "step3": "Calculate tiles: ceil(width/512) * ceil(height/512)",
+ "step4": "Total tokens = 85 + (tiles * 170)",
+ "low_detail": "Fixed 85 tokens regardless of size",
+ "research_source": "OpenAI official documentation + Image Tokenization research"
+ },
+ "anthropic_claude_formula": {
+ "formula": "min((width * height) / 750, 1600)",
+ "pixel_divisor": 750,
+ "token_cap": 1600,
+ "resize_trigger": "max(width, height) > 1568",
+ "warning_threshold": "min(width, height) < 200",
+ "research_source": "Anthropic Claude documentation + research analysis"
+ },
+ "google_gemini_formula": {
+ "small_image": "width <= 384 AND height <= 384 \u2192 258 tokens",
+ "large_image": "ceil(width/768) * ceil(height/768) * 258 tokens",
+ "small_threshold": 384,
+ "tile_size": 768,
+ "tokens_per_tile": 258,
+ "research_source": "Google Gemini documentation + research analysis"
+ },
+ "qwen_vl_adaptive_formula": {
+ "formula": "min(ceil(width/patch_size) * ceil(height/patch_size), max_tokens)",
+ "adaptive_resize": "Resize to fit within [min_res, max_res] range",
+ "patch_sizes": {
+ "qwen2.5": 14,
+ "qwen3": 16
+ },
+ "research_source": "Qwen-VL technical documentation + research"
+ },
+ "vision_transformer_baseline": {
+ "standard_patch_size": 16,
+ "formula": "tokens = (height * width) / (patch_size^2)",
+ "typical_range": [
+ 196,
+ 2048
+ ],
+ "research_source": "Vision Transformer foundational paper"
+ }
+ },
+ "generic_vision_model": {
+ "max_output_tokens": 4096,
+ "tool_support": "prompted",
+ "structured_output": "prompted",
+ "parallel_tools": false,
+ "vision_support": true,
+ "audio_support": false,
+ "video_support": false,
+ "image_resolutions": [
+ "up to 1024x1024"
+ ],
+ "max_image_resolution": "1024x1024",
+ "image_patch_size": 16,
+ "max_image_tokens": 2048,
+ "image_tokenization_method": "patch_based",
+ "adaptive_resolution": false,
+ "vision_encoder": "generic_vit",
+ "notes": "Generic vision model fallback with conservative parameters that should work with most VLMs",
+ "source": "AbstractCore generic fallback",
+ "canonical_name": "generic_vision_model",
+ "aliases": [],
+ "max_tokens": 32768
+ },
  "default_capabilities": {
  "max_output_tokens": 4096,
  "tool_support": "none",