sie-server 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (441)
  1. {sie_server-0.4.0 → sie_server-0.4.2}/Dockerfile.cpu +6 -12
  2. {sie_server-0.4.0 → sie_server-0.4.2}/Dockerfile.cuda12 +18 -16
  3. {sie_server-0.4.0 → sie_server-0.4.2}/PKG-INFO +3 -3
  4. {sie_server-0.4.0 → sie_server-0.4.2}/README.md +1 -1
  5. {sie_server-0.4.0 → sie_server-0.4.2}/bundles/default.yaml +2 -1
  6. {sie_server-0.4.0 → sie_server-0.4.2}/bundles/sglang-embedding.yaml +1 -1
  7. {sie_server-0.4.0 → sie_server-0.4.2}/bundles/sglang.yaml +21 -21
  8. sie_server-0.4.2/models/Marqo__marqo-fashionSigLIP.yaml +28 -0
  9. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-0.6B.yaml +20 -21
  10. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-4B-Instruct-2507.yaml +2 -3
  11. sie_server-0.4.2/models/Qwen__Qwen3.6-27B.yaml +308 -0
  12. {sie_server-0.4.0 → sie_server-0.4.2}/models/docling.yaml +1 -1
  13. sie_server-0.4.2/models/opendatalab__MinerU2.5-Pro-2604-1.2B.yaml +24 -0
  14. {sie_server-0.4.0 → sie_server-0.4.2}/openapi.json +1 -1
  15. {sie_server-0.4.0 → sie_server-0.4.2}/pyproject.toml +10 -6
  16. sie_server-0.4.2/scripts/generate_tokenize_fixture.py +203 -0
  17. sie_server-0.4.2/src/sie_server/__init__.py +9 -0
  18. sie_server-0.4.2/src/sie_server/_ipc_test_harness.py +356 -0
  19. sie_server-0.4.2/src/sie_server/adapter_call_loop.py +439 -0
  20. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_generation_base.py +2 -4
  21. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_utils.py +4 -1
  22. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/base.py +2 -5
  23. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -2
  24. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/clip/__init__.py +19 -6
  25. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colpali/__init__.py +18 -13
  26. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colqwen2/__init__.py +6 -4
  27. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colqwen3/__init__.py +72 -0
  28. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/docling/__init__.py +29 -8
  29. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/donut/__init__.py +0 -2
  30. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliner/__init__.py +0 -2
  31. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/glirel/__init__.py +0 -3
  32. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/glm_ocr/__init__.py +105 -0
  33. sie_server-0.4.2/src/sie_server/adapters/mineru_vl/__init__.py +434 -0
  34. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nemo_colembed/__init__.py +49 -1
  35. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/peft_lora_mixin.py +0 -2
  36. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/pytorch_embedding/__init__.py +17 -4
  37. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/_server.py +1 -0
  38. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/embedding.py +1 -3
  39. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/generation.py +11 -5
  40. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/siglip/__init__.py +3 -3
  41. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/encode.py +3 -3
  42. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/extract.py +10 -3
  43. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/health.py +0 -2
  44. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/helpers.py +1 -1
  45. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/openai_compat.py +1 -1
  46. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/serialization.py +1 -1
  47. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/ws.py +25 -9
  48. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/app/app_factory.py +56 -208
  49. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/cli.py +20 -0
  50. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/config/engine.py +79 -6
  51. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/config/model.py +4 -8
  52. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/adaptive_batching.py +205 -10
  53. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/batcher.py +9 -6
  54. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/deps.py +1 -45
  55. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/disk_cache.py +1 -1
  56. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/encode_pipeline.py +70 -2
  57. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/extract_cost.py +1 -2
  58. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/hot_reload.py +0 -2
  59. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/inference.py +2 -2
  60. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/inference_output.py +0 -3
  61. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/loader.py +21 -0
  62. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/memory.py +2 -2
  63. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/model_loader.py +7 -1
  64. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/pool_isolation.py +2 -5
  65. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/postprocessor.py +0 -3
  66. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/postprocessor_registry.py +2 -2
  67. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/prepared.py +21 -1
  68. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/__init__.py +0 -2
  69. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/base.py +1 -1
  70. sie_server-0.4.2/src/sie_server/core/preprocessor/text.py +495 -0
  71. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/vision.py +175 -0
  72. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor_registry.py +2 -1
  73. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/readiness.py +26 -3
  74. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/registry.py +10 -4
  75. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/timing.py +1 -1
  76. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/tokenizer.py +2 -18
  77. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/__init__.py +0 -2
  78. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/model_worker.py +167 -12
  79. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/types.py +47 -2
  80. sie_server-0.4.2/src/sie_server/ipc_server.py +679 -0
  81. sie_server-0.4.2/src/sie_server/ipc_types.py +514 -0
  82. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/__init__.py +0 -6
  83. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/gpu.py +0 -2
  84. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/metrics.py +53 -13
  85. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/prometheus.py +0 -2
  86. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/tracing.py +0 -1
  87. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/streaming.py +110 -30
  88. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/work_class_scheduler.py +4 -5
  89. sie_server-0.4.2/src/sie_server/queue_executor.py +1088 -0
  90. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/inputs.py +2 -2
  91. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/outputs.py +1 -1
  92. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_clip.py +52 -8
  93. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_docling.py +64 -2
  94. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_lora_integration.py +1 -1
  95. sie_server-0.4.2/tests/adapters/test_mineru_vl.py +380 -0
  96. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_pytorch_embedding_revision.py +34 -2
  97. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sentence_transformer.py +61 -0
  98. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sglang_generation.py +2 -2
  99. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_stablebridge_integration.py +1 -1
  100. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_visual_document.py +18 -3
  101. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_dtype.py +1 -1
  102. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_endpoint.py +1 -1
  103. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_timing.py +1 -1
  104. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_extract.py +26 -1
  105. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_generate.py +2 -6
  106. {sie_server-0.4.0 → sie_server-0.4.2}/tests/app/test_app_factory.py +173 -17
  107. {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_bundle_coverage.py +3 -6
  108. {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_config.py +9 -2
  109. {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_profile_backend_consistency.py +3 -12
  110. {sie_server-0.4.0 → sie_server-0.4.2}/tests/conftest.py +31 -22
  111. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_adaptive_batching.py +279 -3
  112. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_batcher.py +13 -11
  113. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_loader.py +79 -0
  114. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_lora_generation_exclusion.py +1 -1
  115. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_model_load_timeout.py +1 -1
  116. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_preprocessor.py +358 -0
  117. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_async.py +1 -1
  118. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_core.py +39 -1
  119. sie_server-0.4.2/tests/core/test_worker_passthrough.py +220 -0
  120. {sie_server-0.4.0 → sie_server-0.4.2}/tests/integration/test_chat_completions.py +2 -3
  121. {sie_server-0.4.0 → sie_server-0.4.2}/tests/integration/test_grammar_generate.py +1 -1
  122. {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_generation_metrics.py +4 -4
  123. {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_metrics.py +110 -44
  124. {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_trace_propagation.py +3 -3
  125. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_grammar_prewarm.py +2 -4
  126. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_streaming.py +14 -26
  127. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_work_class_scheduler.py +1 -1
  128. sie_server-0.4.2/tests/test_adapter_call_loop.py +295 -0
  129. {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_docker_integration.py +9 -9
  130. sie_server-0.4.2/tests/test_ipc_server.py +712 -0
  131. sie_server-0.4.2/tests/test_ipc_types_raw_output.py +162 -0
  132. sie_server-0.4.2/tests/test_model_yaml_filenames.py +35 -0
  133. sie_server-0.4.2/tests/test_parity_run_batch.py +332 -0
  134. sie_server-0.4.2/tests/test_queue_executor.py +724 -0
  135. sie_server-0.4.2/tests/test_queue_executor_stage1d.py +622 -0
  136. sie_server-0.4.2/tests/test_readiness.py +53 -0
  137. sie_server-0.4.2/tests/test_server_smoke.py +14 -0
  138. sie_server-0.4.2/tests/test_stage1d_byte_identity.py +393 -0
  139. {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_media_bytes.py +38 -11
  140. sie_server-0.4.0/src/sie_server/__init__.py +0 -3
  141. sie_server-0.4.0/src/sie_server/core/preprocessor/text.py +0 -268
  142. sie_server-0.4.0/src/sie_server/nats_pull_loop.py +0 -2458
  143. sie_server-0.4.0/src/sie_server/nats_subscriber.py +0 -231
  144. sie_server-0.4.0/tests/test_nats_pull_loop.py +0 -924
  145. sie_server-0.4.0/tests/test_nats_pull_loop_batching.py +0 -1291
  146. sie_server-0.4.0/tests/test_server_smoke.py +0 -8
  147. {sie_server-0.4.0 → sie_server-0.4.2}/.gitignore +0 -0
  148. {sie_server-0.4.0 → sie_server-0.4.2}/CONTRIBUTING.md +0 -0
  149. {sie_server-0.4.0 → sie_server-0.4.2}/LICENSE +0 -0
  150. {sie_server-0.4.0 → sie_server-0.4.2}/bundles/transformers5.yaml +0 -0
  151. {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
  152. {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +0 -0
  153. {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
  154. {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
  155. {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
  156. {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-m3.yaml +0 -0
  157. {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-reranker-base.yaml +0 -0
  158. {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-reranker-large.yaml +0 -0
  159. {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
  160. {sie_server-0.4.0 → sie_server-0.4.2}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
  161. {sie_server-0.4.0 → sie_server-0.4.2}/models/GritLM__GritLM-7B.yaml +0 -0
  162. {sie_server-0.4.0 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
  163. {sie_server-0.4.0 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
  164. {sie_server-0.4.0 → sie_server-0.4.2}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
  165. {sie_server-0.4.0 → sie_server-0.4.2}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +0 -0
  166. {sie_server-0.4.0 → sie_server-0.4.2}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
  167. {sie_server-0.4.0 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
  168. {sie_server-0.4.0 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
  169. {sie_server-0.4.0 → sie_server-0.4.2}/models/NeuML__gliner-bert-tiny.yaml +0 -0
  170. {sie_server-0.4.0 → sie_server-0.4.2}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
  171. {sie_server-0.4.0 → sie_server-0.4.2}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
  172. {sie_server-0.4.0 → sie_server-0.4.2}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
  173. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
  174. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-4B.yaml +0 -0
  175. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
  176. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
  177. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
  178. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
  179. {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3.5-4B.yaml +0 -0
  180. {sie_server-0.4.0 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-2_R.yaml +0 -0
  181. {sie_server-0.4.0 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-Mistral.yaml +0 -0
  182. {sie_server-0.4.0 → sie_server-0.4.2}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
  183. /sie_server-0.4.0/models/tomoroai__tomoro-colqwen3-embed-4b.yaml → /sie_server-0.4.2/models/TomoroAI__tomoro-colqwen3-embed-4b.yaml +0 -0
  184. {sie_server-0.4.0 → sie_server-0.4.2}/models/answerdotai__ModernBERT-base.yaml +0 -0
  185. {sie_server-0.4.0 → sie_server-0.4.2}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
  186. {sie_server-0.4.0 → sie_server-0.4.2}/models/colbert-ir__colbertv2.0.yaml +0 -0
  187. {sie_server-0.4.0 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
  188. {sie_server-0.4.0 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
  189. {sie_server-0.4.0 → sie_server-0.4.2}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
  190. {sie_server-0.4.0 → sie_server-0.4.2}/models/fastino__gliner2-base-v1.yaml +0 -0
  191. {sie_server-0.4.0 → sie_server-0.4.2}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
  192. {sie_server-0.4.0 → sie_server-0.4.2}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
  193. {sie_server-0.4.0 → sie_server-0.4.2}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
  194. {sie_server-0.4.0 → sie_server-0.4.2}/models/google__embeddinggemma-300m.yaml +0 -0
  195. {sie_server-0.4.0 → sie_server-0.4.2}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
  196. {sie_server-0.4.0 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-224.yaml +0 -0
  197. {sie_server-0.4.0 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-384.yaml +0 -0
  198. {sie_server-0.4.0 → sie_server-0.4.2}/models/google__siglip2-base-patch16-224.yaml +0 -0
  199. {sie_server-0.4.0 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
  200. {sie_server-0.4.0 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
  201. {sie_server-0.4.0 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
  202. {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-base-v2.yaml +0 -0
  203. {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-large-v2.yaml +0 -0
  204. {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-mistral-7b-instruct.yaml +0 -0
  205. {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-small-v2.yaml +0 -0
  206. {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
  207. {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large.yaml +0 -0
  208. {sie_server-0.4.0 → sie_server-0.4.2}/models/jackboyla__glirel-large-v0.yaml +0 -0
  209. {sie_server-0.4.0 → sie_server-0.4.2}/models/jinaai__jina-colbert-v2.yaml +0 -0
  210. {sie_server-0.4.0 → sie_server-0.4.2}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
  211. {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
  212. {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
  213. {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
  214. {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
  215. {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
  216. {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
  217. {sie_server-0.4.0 → sie_server-0.4.2}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
  218. {sie_server-0.4.0 → sie_server-0.4.2}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
  219. {sie_server-0.4.0 → sie_server-0.4.2}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
  220. {sie_server-0.4.0 → sie_server-0.4.2}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
  221. {sie_server-0.4.0 → sie_server-0.4.2}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
  222. {sie_server-0.4.0 → sie_server-0.4.2}/models/microsoft__Florence-2-base-ft.yaml +0 -0
  223. {sie_server-0.4.0 → sie_server-0.4.2}/models/microsoft__Florence-2-base.yaml +0 -0
  224. {sie_server-0.4.0 → sie_server-0.4.2}/models/microsoft__Florence-2-large.yaml +0 -0
  225. {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
  226. {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
  227. {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
  228. {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
  229. {sie_server-0.4.0 → sie_server-0.4.2}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
  230. {sie_server-0.4.0 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
  231. {sie_server-0.4.0 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
  232. {sie_server-0.4.0 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
  233. {sie_server-0.4.0 → sie_server-0.4.2}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
  234. {sie_server-0.4.0 → sie_server-0.4.2}/models/naver__splade-v3.yaml +0 -0
  235. {sie_server-0.4.0 → sie_server-0.4.2}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
  236. {sie_server-0.4.0 → sie_server-0.4.2}/models/numind__NuNER_Zero-span.yaml +0 -0
  237. {sie_server-0.4.0 → sie_server-0.4.2}/models/numind__NuNER_Zero.yaml +0 -0
  238. {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__NV-Embed-v2.yaml +0 -0
  239. {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
  240. {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
  241. {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
  242. {sie_server-0.4.0 → sie_server-0.4.2}/models/openai__clip-vit-base-patch32.yaml +0 -0
  243. {sie_server-0.4.0 → sie_server-0.4.2}/models/openai__clip-vit-large-patch14.yaml +0 -0
  244. {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
  245. {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
  246. {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
  247. {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
  248. {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
  249. {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
  250. {sie_server-0.4.0 → sie_server-0.4.2}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
  251. {sie_server-0.4.0 → sie_server-0.4.2}/models/rasyosef__splade-mini.yaml +0 -0
  252. {sie_server-0.4.0 → sie_server-0.4.2}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
  253. {sie_server-0.4.0 → sie_server-0.4.2}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
  254. {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_large-v2.1.yaml +0 -0
  255. {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_medium-v2.1.yaml +0 -0
  256. {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_multi-v2.1.yaml +0 -0
  257. {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
  258. {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_small-v2.1.yaml +0 -0
  259. {sie_server-0.4.0 → sie_server-0.4.2}/models/vidore__colpali-v1.3-hf.yaml +0 -0
  260. {sie_server-0.4.0 → sie_server-0.4.2}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
  261. {sie_server-0.4.0 → sie_server-0.4.2}/models/zai-org__GLM-OCR.yaml +0 -0
  262. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/__init__.py +0 -0
  263. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_base_adapter.py +0 -0
  264. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_flash_base.py +0 -0
  265. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_spec.py +0 -0
  266. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_types.py +0 -0
  267. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
  268. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
  269. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flag/__init__.py +0 -0
  270. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
  271. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
  272. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colbert/__init__.py +0 -0
  273. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
  274. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
  275. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
  276. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/errors.py +0 -0
  277. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/florence2/__init__.py +0 -0
  278. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliclass/__init__.py +0 -0
  279. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliner2/__init__.py +0 -0
  280. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
  281. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/grounding_dino/__init__.py +0 -0
  282. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
  283. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
  284. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/lighton_ocr/__init__.py +0 -0
  285. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
  286. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
  287. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
  288. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
  289. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
  290. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/owlv2/__init__.py +0 -0
  291. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/paddleocr_vl/__init__.py +0 -0
  292. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
  293. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
  294. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +0 -0
  295. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +0 -0
  296. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
  297. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
  298. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/__init__.py +0 -0
  299. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
  300. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
  301. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
  302. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/__init__.py +0 -0
  303. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/generate.py +0 -0
  304. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/metrics.py +0 -0
  305. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/models.py +0 -0
  306. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/openapi.py +0 -0
  307. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/options.py +0 -0
  308. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/root.py +0 -0
  309. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/score.py +0 -0
  310. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/validation.py +0 -0
  311. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/app/__init__.py +0 -0
  312. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/app/app_state_config.py +0 -0
  313. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/config/__init__.py +0 -0
  314. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/__init__.py +0 -0
  315. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/gpu_health.py +0 -0
  316. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/hf_env.py +0 -0
  317. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/load_errors.py +0 -0
  318. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/logging.py +0 -0
  319. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/oom.py +0 -0
  320. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/image.py +0 -0
  321. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/shutdown.py +0 -0
  322. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/text_tokens.py +0 -0
  323. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/watcher.py +0 -0
  324. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/__init__.py +0 -0
  325. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/base.py +0 -0
  326. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/encode.py +0 -0
  327. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/extract.py +0 -0
  328. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/score.py +0 -0
  329. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/oom_recovery.py +0 -0
  330. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/health/__init__.py +0 -0
  331. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/health/nats_publisher.py +0 -0
  332. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/health/saturation.py +0 -0
  333. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/main.py +0 -0
  334. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/telemetry.py +0 -0
  335. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/__init__.py +0 -0
  336. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/admission.py +0 -0
  337. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/base.py +0 -0
  338. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/grammar_cache.py +0 -0
  339. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/grammar_compile.py +0 -0
  340. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/tool_call_grammar.py +0 -0
  341. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/tool_call_parser.py +0 -0
  342. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/static/__init__.py +0 -0
  343. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/static/index.html +0 -0
  344. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/__init__.py +0 -0
  345. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/grammar.py +0 -0
  346. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/openapi.py +0 -0
  347. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/overflow_policy.py +0 -0
  348. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/requests.py +0 -0
  349. {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/responses.py +0 -0
  350. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/__init__.py +0 -0
  351. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_base.py +0 -0
  352. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_bge_m3.py +0 -0
  353. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_bge_m3_flash.py +0 -0
  354. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_colbert.py +0 -0
  355. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_docling_smoke.py +0 -0
  356. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_donut.py +0 -0
  357. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_factory_integration.py +0 -0
  358. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_flash_base.py +0 -0
  359. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_florence2.py +0 -0
  360. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
  361. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_glirel.py +0 -0
  362. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_glm_ocr.py +0 -0
  363. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_grounding_dino.py +0 -0
  364. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_gte_sparse.py +0 -0
  365. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
  366. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_lighton_ocr.py +0 -0
  367. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_lora.py +0 -0
  368. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_paddleocr_vl.py +0 -0
  369. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_runtime_options.py +0 -0
  370. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sglang.py +0 -0
  371. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_siglip.py +0 -0
  372. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sparse_aggregation.py +0 -0
  373. {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_stablebridge_pruner.py +0 -0
  374. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/__init__.py +0 -0
  375. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_json_schema.py +0 -0
  376. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_validation.py +0 -0
  377. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_extract_integration.py +0 -0
  378. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_extract_oom.py +0 -0
  379. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_health.py +0 -0
  380. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_models.py +0 -0
  381. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_openai_compat.py +0 -0
  382. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_score.py +0 -0
  383. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_version_header.py +0 -0
  384. {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_ws.py +0 -0
  385. {sie_server-0.4.0 → sie_server-0.4.2}/tests/app/__init__.py +0 -0
  386. {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/__init__.py +0 -0
  387. {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_model_prewarm_grammars.py +0 -0
  388. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/__init__.py +0 -0
  389. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_disk_cache.py +0 -0
  390. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_gpu_health.py +0 -0
  391. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_hot_reload.py +0 -0
  392. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_idle_evict.py +0 -0
  393. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_inference.py +0 -0
  394. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_logging.py +0 -0
  395. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_memory.py +0 -0
  396. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_oom_detection.py +0 -0
  397. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_pool_isolation.py +0 -0
  398. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_postprocessor.py +0 -0
  399. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_postprocessor_registry.py +0 -0
  400. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_prepared.py +0 -0
  401. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_preprocessor_registry.py +0 -0
  402. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_quantization.py +0 -0
  403. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_readiness.py +0 -0
  404. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_core.py +0 -0
  405. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_deps.py +0 -0
  406. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_failed_state.py +0 -0
  407. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_memory.py +0 -0
  408. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_multi_model.py +0 -0
  409. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_shutdown.py +0 -0
  410. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_timing.py +0 -0
  411. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_watcher.py +0 -0
  412. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_backpressure.py +0 -0
  413. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_extract.py +0 -0
  414. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_lora.py +0 -0
  415. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_options.py +0 -0
  416. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_score.py +0 -0
  417. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/worker/__init__.py +0 -0
  418. {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/worker/test_oom_recovery.py +0 -0
  419. {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/__init__.py +0 -0
  420. {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/test_nats_publisher.py +0 -0
  421. {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/test_saturation.py +0 -0
  422. {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/test_worker_id_consistency.py +0 -0
  423. {sie_server-0.4.0 → sie_server-0.4.2}/tests/integration/__init__.py +0 -0
  424. {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/__init__.py +0 -0
  425. {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_telemetry.py +0 -0
  426. {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_tracing.py +0 -0
  427. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/__init__.py +0 -0
  428. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_grammar_cache.py +0 -0
  429. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_grammar_compile.py +0 -0
  430. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_streaming_admission.py +0 -0
  431. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_streaming_integration.py +0 -0
  432. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_tool_call_grammar.py +0 -0
  433. {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_tool_call_parser.py +0 -0
  434. {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_all_models.py +0 -0
  435. {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_openapi_export.py +0 -0
  436. {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_sdk_integration.py +0 -0
  437. {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_sparse_integration.py +0 -0
  438. {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/__init__.py +0 -0
  439. {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_inputs.py +0 -0
  440. {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_inputs_json_decode.py +0 -0
  441. {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_types.py +0 -0
@@ -6,10 +6,9 @@
6
6
  # docker buildx build --platform linux/amd64,linux/arm64 -f packages/sie_server/Dockerfile.cpu -t sie-server:cpu .
7
7
 
8
8
  ARG BUNDLE=default
9
- ARG SIE_DEPS_IMAGE=
10
9
 
11
10
  # =============================================================================
12
- # Stage 1: Dependencies (pyproject.toml only, cached across code changes)
11
+ # Dependency image: pyproject-only cache seed
13
12
  # =============================================================================
14
13
  FROM python:3.12-slim-bookworm AS deps
15
14
 
@@ -58,9 +57,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
58
57
  -e ".[gpu-metrics]"
59
58
 
60
59
  # =============================================================================
61
- # Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
60
+ # Shared runtime base: source install and venv finalization
62
61
  # =============================================================================
63
- # Bundle-agnostic: all base-stage layers are shared across bundles of this
62
+ # Bundle-agnostic: all base image layers are shared across bundles of this
64
63
  # platform in local BuildKit cache and in content-addressed registry layers.
65
64
  FROM deps AS base
66
65
 
@@ -108,9 +107,9 @@ RUN set -eux; \
108
107
  find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
109
108
 
110
109
  # =============================================================================
111
- # Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
110
+ # Bundle dependency builder: bundle-specific deps
112
111
  # =============================================================================
113
- FROM base AS bundle_deps
112
+ FROM base AS builder
114
113
 
115
114
  ARG BUNDLE
116
115
 
@@ -151,12 +150,7 @@ RUN set -eux; \
151
150
  find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
152
151
 
153
152
  # =============================================================================
154
- # Stage 3b: Builder - optional trampoline to a prebuilt base image
155
- # =============================================================================
156
- FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
157
-
158
- # =============================================================================
159
- # Stage 4: Runtime
153
+ # Runtime image
160
154
  # =============================================================================
161
155
  FROM python:3.12-slim-bookworm AS runtime
162
156
 
@@ -1,15 +1,14 @@
1
1
  # syntax=docker/dockerfile:1
2
- # SIE Server - CUDA 12.4 Image
2
+ # SIE Server - CUDA 12 Image
3
3
  # Build from repo root:
4
4
  # docker build -f packages/sie_server/Dockerfile.cuda12 -t sie-server:cuda12-default .
5
5
  # docker build -f packages/sie_server/Dockerfile.cuda12 --build-arg BUNDLE=sglang -t sie-server:cuda12-sglang .
6
6
 
7
7
  ARG BUNDLE=default
8
8
  ARG UV_VERSION=0.9.28
9
- ARG SIE_DEPS_IMAGE=
10
9
 
11
10
  # =============================================================================
12
- # Stage 1: uv + standalone Python 3.12 (no deadsnakes PPA)
11
+ # Dependency image: uv and standalone Python 3.12
13
12
  # =============================================================================
14
13
  FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS deps
15
14
 
@@ -60,10 +59,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
60
59
  -e ".[gpu-metrics]"
61
60
 
62
61
  # =============================================================================
63
- # Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
62
+ # Shared CUDA base: source install and venv finalization
64
63
  # =============================================================================
65
64
  # Everything here is bundle-agnostic, so bundle-specific builds of a given
66
- # platform share every base-stage layer in local BuildKit cache and in
65
+ # platform share every base image layer in local BuildKit cache and in
67
66
  # content-addressed registry layers.
68
67
  FROM deps AS base
69
68
 
@@ -127,9 +126,9 @@ RUN set -eux; \
127
126
  find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
128
127
 
129
128
  # =============================================================================
130
- # Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
129
+ # Bundle dependency builder: bundle-specific deps
131
130
  # =============================================================================
132
- FROM base AS bundle_deps
131
+ FROM base AS builder
133
132
 
134
133
  ARG BUNDLE
135
134
 
@@ -174,16 +173,19 @@ RUN set -eux; \
174
173
  find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
175
174
 
176
175
  # =============================================================================
177
- # Stage 3b: Builder - optional trampoline to a prebuilt base image
176
+ # Runtime image
178
177
  # =============================================================================
179
- FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
180
-
181
- # =============================================================================
182
- # Stage 4: Runtime
183
- # =============================================================================
184
- # Use base CUDA image (not devel/runtime) — PyTorch wheels bundle CUDA libs,
185
- # cuDNN ships inside torch. Saves ~2GB vs `runtime` variant.
186
- FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime
178
+ # Runtime base selection is bundle-scoped. Most CUDA bundles stay on the
179
+ # smaller CUDA base runtime; SGLang-family bundles need the devel toolkit
180
+ # because flashinfer/tvm_ffi perform runtime JIT through nvcc on first decode.
181
+ FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-default
182
+ FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-transformers5
183
+ FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 AS runtime-sglang
184
+ ENV CUDA_HOME=/usr/local/cuda \
185
+ LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
186
+ FROM runtime-sglang AS runtime-sglang-embedding
187
+
188
+ FROM runtime-${BUNDLE} AS runtime
187
189
 
188
190
  ENV DEBIAN_FRONTEND=noninteractive
189
191
 
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sie-server
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Search Inference Engine - GPU inference server for search workloads
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
7
7
  Requires-Python: <3.13,>=3.12
8
+ Requires-Dist: blake3<1,>=0.4
8
9
  Requires-Dist: docling<3,>=2
9
10
  Requires-Dist: einops<1,>=0.8
10
11
  Requires-Dist: fastapi<1,>=0.115
@@ -17,7 +18,6 @@ Requires-Dist: loguru<1,>=0.7
17
18
  Requires-Dist: msgpack-numpy<1,>=0.4
18
19
  Requires-Dist: msgpack<2,>=1.1
19
20
  Requires-Dist: msgspec>=0.20.0
20
- Requires-Dist: nats-py<3,>=2.9
21
21
  Requires-Dist: numpy<3,>=2
22
22
  Requires-Dist: open-clip-torch>=2.24
23
23
  Requires-Dist: opencv-python-headless<5,>=4
@@ -26,7 +26,7 @@ Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
26
26
  Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
27
27
  Requires-Dist: opentelemetry-sdk<2,>=1.28
28
28
  Requires-Dist: packaging<25,>=24
29
- Requires-Dist: pillow<12,>=11
29
+ Requires-Dist: pillow>=12.2.0
30
30
  Requires-Dist: prometheus-client<1,>=0.21
31
31
  Requires-Dist: pydantic-settings<3,>=2.6
32
32
  Requires-Dist: pydantic<3,>=2.9
@@ -66,7 +66,7 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
66
66
 
67
67
  | Env var | Default | Effect |
68
68
  |--|--|--|
69
- | `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
69
+ | `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default because SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
70
70
 
71
71
  For nested settings (any field with `__`), the env-var format is
72
72
  `SIE_<TOP>__<NESTED>=value`. The complete schema is in
@@ -43,6 +43,7 @@ adapters:
43
43
  - sie_server.adapters.florence2
44
44
  - sie_server.adapters.docling
45
45
  - sie_server.adapters.paddleocr_vl
46
+ - sie_server.adapters.mineru_vl
46
47
  deps:
47
48
  # Most flash adapters; sentence_transformer needs >=4.57
48
49
  transformers: '>=4.57,<5'
@@ -78,5 +79,5 @@ deps:
78
79
  docling: '>=2,<3'
79
80
  # Flash Attention 2 — CUDA only, prebuilt wheel
80
81
  flash-attn:
81
- url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.0/flash_attn-2.7.4+cu128torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
82
+ url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.11/flash_attn-2.7.4+cu129torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
82
83
  marker: sys_platform == 'linux'
@@ -12,7 +12,7 @@ deps:
12
12
  # pip resolution drift on environments that already had a different
13
13
  # ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
14
14
  # future drift fails fast.
15
- sglang: '==0.5.10'
15
+ sglang: '==0.5.10.post1'
16
16
  xgrammar: '==0.1.32'
17
17
  outlines: '==0.1.11'
18
18
  llguidance: '>=0.7.11,<0.8.0'
@@ -6,29 +6,29 @@ deps:
6
6
  # SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
7
7
  # See: https://github.com/sgl-project/sglang/issues/4869
8
8
  #
9
- # Qwen3.5-4B compatibility M4 req2 Proj 5:
9
+ # Qwen3.5-4B + Qwen3.6-27B compatibility:
10
10
  #
11
- # ``sglang==0.5.10`` is the canonical target for Qwen3.5-4B on the
12
- # current L4 / A100-40GB / H100 fleet. Audited against
13
- # ``python/pyproject.toml@v0.5.10`` upstream (see
14
- # ``product/plans/qwen35-sglang-mtp-structured-outputs-findings.md``):
11
+ # ``sglang==0.5.10.post1`` is the canonical target for the Qwen3.x hybrid
12
+ # Gated-DeltaNet + Gated-Attention family on the current
13
+ # L4 / A100-40GB / H100 fleet. Qwen3.6-27B uses the same ``qwen3_5``
14
+ # model class shipped in 0.5.10 — the architecture (64 layers, hybrid
15
+ # Gated DeltaNet + Gated Attention, MTP/NEXTN) is identical, only the
16
+ # parameter count differs.
15
17
  #
16
- # * ships the ``qwen3_5`` model class (``models/qwen3_5.py``, 1724 LOC)
17
- # * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
18
- # * ``sglang-kernel==0.4.1`` wheel covers SM_80 / SM_89 / SM_90 / SM_100
19
- # via gencode (``CMakeLists.txt``: ``ENABLE_BELOW_SM90=ON`` default).
20
- # The runtime loader (``sgl_kernel/load_utils.py``) maps
21
- # compute_capability != 90 → ``sm100/`` subdir, which holds the
22
- # SM_80 / SM_89 / SM_100 build (precise math). H100 (CC=90) gets
23
- # the ``sm90/`` fast-math build.
24
- # * torch==2.9.1 (CUDA 12.8/12.9 wheels); ``cuda-python==12.9``.
25
- # **Not** CUDA 13 — that's an SGLang-main-only path which only
26
- # became relevant when looking at the dev branch.
18
+ # SGLang 0.5.10 was evaluated against CUDA 12.9 + Qwen3.6-27B on Modal
19
+ # H100 (2026-05-27): server boots, loads weights, but the bundled
20
+ # ``sglang/jit_kernel/csrc/elementwise/activation.cuh`` has a C++
21
+ # template bug (``select_kernel<true>(type)`` is parsed as a class-
22
+ # template substitution, not a function-template call) that the
23
+ # stricter ``nvcc`` shipped with CUDA 12.9 rejects at first activation.
24
+ # 0.5.11 is also dev-only on the sglang docs wheel index — not on
25
+ # PyPI. Park the 0.5.11 bump until upstream cuts a stable release with
26
+ # the JIT header fixed; 0.5.10.post1 covers Qwen3.6-27B today.
27
27
  #
28
- # Compat note: 0.5.12 wheel observed shipping only ``sm100/`` (no SM_80
29
- # cubin entry inside) — out of scope; we stay on 0.5.10 until upstream
30
- # ships multi-arch binaries again.
31
- sglang: '==0.5.10'
28
+ # * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
29
+ # * ``sgl_kernel`` covers SM_80 / SM_89 / SM_90 / SM_100 via gencode.
30
+ # * torch==2.9.1 (CUDA 12.9 wheels); ``cuda-python==12.9``.
31
+ sglang: '==0.5.10.post1'
32
32
  #
33
33
  # Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
34
34
  # internally to prevent silent pip resolution drift.
@@ -39,7 +39,7 @@ deps:
39
39
  # propagation (PR #20467). Kept available as the fallback backend.
40
40
  xgrammar: '==0.1.32'
41
41
  #
42
- # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10. We
42
+ # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10.post1. We
43
43
  # declare it explicitly at bundle level so the surface is visible.
44
44
  # ``outlines-core`` (a separate package) is a transitive of outlines
45
45
  # and intentionally NOT pinned here — pinning ``outlines-core`` directly
@@ -0,0 +1,28 @@
1
+ sie_id: Marqo/marqo-fashionSigLIP
2
+ hf_id: Marqo/marqo-fashionSigLIP
3
+ inputs:
4
+ text: true
5
+ image: true
6
+ audio: false
7
+ video: false
8
+ tasks:
9
+ encode:
10
+ dense:
11
+ dim: 768
12
+ sparse: null
13
+ multivector: null
14
+ score: null
15
+ extract: null
16
+ max_sequence_length: 64
17
+ profiles:
18
+ default:
19
+ max_batch_tokens: 16384
20
+ compute_precision: float16
21
+ adapter_path: sie_server.adapters.siglip:SiglipAdapter
22
+ adapter_options:
23
+ loadtime:
24
+ backend: open_clip
25
+ open_clip_model_id: hf-hub:Marqo/marqo-fashionSigLIP
26
+ dense_dim: 768
27
+ runtime:
28
+ normalize: true
@@ -6,40 +6,39 @@ inputs:
6
6
  audio: false
7
7
  video: false
8
8
  tasks:
9
- # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
10
- # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
11
- # bad — this is a transport benchmark target, not a production model.
9
+ # Small, fast generation model — a viable PROD pick for simple/short-prompt
10
+ # tasks (cheap + high throughput; weaker on long-context reasoning). Loads in
11
+ # ~30s. Also doubles as the transport/walking-skeleton benchmark target.
12
12
  #
13
- # Context / batch sizes are deliberately small (1024 vs the headroom an
14
- # L4 could nominally support) so the validation harness can co-resident
15
- # the worker's SGLang with a second SGLang for the baseline phase on
16
- # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
17
- # alone on a card it doesn't have to share.
13
+ # ``context_length`` is the standalone PROD serving value (4096): big enough
14
+ # to fit the full generation benchmark pack (casehold prompts reach ~1.8k
15
+ # tokens, gpqa ~1.3k) so the model is comparable to the rest of the fleet on
16
+ # every task, while KV stays trivial at this size (112 KB/token × 4096 ≈
17
+ # 0.46 GB). The validation/co-residency harness, which packs two SGLang
18
+ # instances onto a single 22 GiB L4, does NOT depend on this default — it
19
+ # passes an explicit ``--max-seq-length``/``--context-length`` (see
20
+ # tools/bench_generation_matrix.py + run_generation_smoke.py) and caps itself
21
+ # to 1024 for that case.
18
22
  #
19
- # Note on the three 1024s below: `context_length`, `max_sequence_length`,
20
- # and `max_batch_tokens` are NOT redundant — they're three independent
21
- # knobs (per-request context, SGLang --context-length, batcher cost
22
- # budget) that just happen to collide here because the model is tiny.
23
- # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
24
- # non-collapsed shape.
23
+ # ``context_length``, ``max_sequence_length``, and ``max_batch_tokens`` are
24
+ # three independent knobs (per-request context, SGLang --context-length,
25
+ # batcher cost budget); see sibling Qwen__Qwen3-4B-Instruct-2507.yaml.
25
26
  generate:
26
- context_length: 1024
27
+ context_length: 4096
27
28
  max_output_tokens: 1024
28
29
  capabilities:
29
30
  grammar: []
30
31
  streaming: true
31
32
  tools: false
32
- max_sequence_length: 1024
33
+ max_sequence_length: 4096
33
34
  # KV-cache memory math (Qwen3-0.6B, bf16):
34
35
  # layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
35
36
  # kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
36
- # The 0.6B is a transport benchmark target — context_length is held at
37
- # 1024 deliberately (see header comment) so the validation harness can
38
- # co-resident two SGLang instances on an L4. KV budgets per profile
39
- # scale with the deployment scenario rather than the GPU ceiling.
37
+ # At ctx=4096 a single request's KV is ~0.46 GB — negligible. The co-residency
38
+ # harness still caps context explicitly when it has to share a card.
40
39
  profiles:
41
40
  default:
42
- max_batch_tokens: 1024
41
+ max_batch_tokens: 4096
43
42
  compute_precision: bfloat16
44
43
  adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
45
44
  kv_budget_tokens: 8192
@@ -66,8 +66,7 @@ max_sequence_length: 32768
66
66
  # kv_budget_tokens is set to ~40% of theoretical max, matching the L4
67
67
  # baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
68
68
  # speculative side-cell, grammar/Outlines compile arena, fragmentation.
69
- # Final empirical validation (concurrency-16 OOM-boundary sweep) is
70
- # tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
69
+ # Final empirical validation should use concurrency and OOM-boundary sweeps.
71
70
  profiles:
72
71
  default:
73
72
  # max_batch_tokens is a generic engine knob; generation does not batch
@@ -93,7 +92,7 @@ profiles:
93
92
  top_p: 0.9
94
93
  stop_tokens:
95
94
  - "<|im_end|>"
96
- # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
95
+ # Analytical defaults for a100-40gb / h100. Production
97
96
  # capacity also grows: with 2-4× the KV budget the context window can be
98
97
  # widened proportionally so longer-context workloads (RAG with large
99
98
  # retrieved passages) fit comfortably. ``max_output_tokens`` doubles