sie-server 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442) hide show
  1. {sie_server-0.4.1 → sie_server-0.4.2}/Dockerfile.cpu +5 -5
  2. {sie_server-0.4.1 → sie_server-0.4.2}/Dockerfile.cuda12 +17 -9
  3. {sie_server-0.4.1 → sie_server-0.4.2}/PKG-INFO +3 -3
  4. {sie_server-0.4.1 → sie_server-0.4.2}/README.md +1 -1
  5. {sie_server-0.4.1 → sie_server-0.4.2}/bundles/default.yaml +1 -0
  6. sie_server-0.4.2/models/Marqo__marqo-fashionSigLIP.yaml +28 -0
  7. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-0.6B.yaml +20 -21
  8. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-4B-Instruct-2507.yaml +2 -3
  9. sie_server-0.4.2/models/Qwen__Qwen3.6-27B.yaml +308 -0
  10. {sie_server-0.4.1 → sie_server-0.4.2}/models/docling.yaml +1 -1
  11. sie_server-0.4.2/models/opendatalab__MinerU2.5-Pro-2604-1.2B.yaml +24 -0
  12. {sie_server-0.4.1 → sie_server-0.4.2}/openapi.json +1 -1
  13. {sie_server-0.4.1 → sie_server-0.4.2}/pyproject.toml +7 -4
  14. sie_server-0.4.2/scripts/generate_tokenize_fixture.py +203 -0
  15. sie_server-0.4.2/src/sie_server/__init__.py +9 -0
  16. sie_server-0.4.2/src/sie_server/_ipc_test_harness.py +356 -0
  17. sie_server-0.4.2/src/sie_server/adapter_call_loop.py +439 -0
  18. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_generation_base.py +2 -4
  19. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_utils.py +4 -1
  20. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/base.py +2 -5
  21. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -2
  22. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/clip/__init__.py +19 -6
  23. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colpali/__init__.py +18 -13
  24. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colqwen2/__init__.py +6 -4
  25. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colqwen3/__init__.py +72 -0
  26. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/docling/__init__.py +29 -8
  27. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/donut/__init__.py +0 -2
  28. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliner/__init__.py +0 -2
  29. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/glirel/__init__.py +0 -3
  30. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/glm_ocr/__init__.py +105 -0
  31. sie_server-0.4.2/src/sie_server/adapters/mineru_vl/__init__.py +434 -0
  32. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nemo_colembed/__init__.py +49 -1
  33. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/peft_lora_mixin.py +0 -2
  34. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/pytorch_embedding/__init__.py +17 -4
  35. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/_server.py +1 -0
  36. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/embedding.py +1 -3
  37. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/generation.py +11 -5
  38. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/siglip/__init__.py +3 -3
  39. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/encode.py +3 -3
  40. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/extract.py +10 -3
  41. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/health.py +0 -2
  42. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/helpers.py +1 -1
  43. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/openai_compat.py +1 -1
  44. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/serialization.py +1 -1
  45. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/ws.py +25 -9
  46. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/app/app_factory.py +56 -208
  47. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/cli.py +20 -0
  48. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/config/engine.py +79 -6
  49. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/config/model.py +4 -8
  50. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/adaptive_batching.py +205 -10
  51. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/batcher.py +9 -6
  52. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/deps.py +1 -45
  53. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/disk_cache.py +1 -1
  54. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/encode_pipeline.py +70 -2
  55. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/extract_cost.py +1 -2
  56. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/hot_reload.py +0 -2
  57. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/inference.py +2 -2
  58. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/inference_output.py +0 -3
  59. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/loader.py +21 -0
  60. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/memory.py +2 -2
  61. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/model_loader.py +7 -1
  62. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/pool_isolation.py +2 -5
  63. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/postprocessor.py +0 -3
  64. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/postprocessor_registry.py +2 -2
  65. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/prepared.py +21 -1
  66. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/__init__.py +0 -2
  67. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/base.py +1 -1
  68. sie_server-0.4.2/src/sie_server/core/preprocessor/text.py +495 -0
  69. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/vision.py +175 -0
  70. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor_registry.py +2 -1
  71. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/readiness.py +26 -3
  72. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/registry.py +10 -4
  73. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/timing.py +1 -1
  74. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/tokenizer.py +2 -18
  75. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/__init__.py +0 -2
  76. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/model_worker.py +167 -12
  77. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/types.py +47 -2
  78. sie_server-0.4.2/src/sie_server/ipc_server.py +679 -0
  79. sie_server-0.4.2/src/sie_server/ipc_types.py +514 -0
  80. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/__init__.py +0 -6
  81. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/gpu.py +0 -2
  82. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/metrics.py +53 -13
  83. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/prometheus.py +0 -2
  84. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/tracing.py +0 -1
  85. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/streaming.py +110 -30
  86. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/work_class_scheduler.py +4 -5
  87. sie_server-0.4.2/src/sie_server/queue_executor.py +1088 -0
  88. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/inputs.py +2 -2
  89. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/outputs.py +1 -1
  90. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_clip.py +52 -8
  91. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_docling.py +64 -2
  92. sie_server-0.4.2/tests/adapters/test_mineru_vl.py +380 -0
  93. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_pytorch_embedding_revision.py +34 -2
  94. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sentence_transformer.py +61 -0
  95. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sglang_generation.py +2 -2
  96. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_visual_document.py +18 -3
  97. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_dtype.py +1 -1
  98. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_endpoint.py +1 -1
  99. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_timing.py +1 -1
  100. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_extract.py +26 -1
  101. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_generate.py +2 -6
  102. {sie_server-0.4.1 → sie_server-0.4.2}/tests/app/test_app_factory.py +173 -17
  103. {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_bundle_coverage.py +3 -6
  104. {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_config.py +9 -2
  105. {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_profile_backend_consistency.py +3 -12
  106. {sie_server-0.4.1 → sie_server-0.4.2}/tests/conftest.py +29 -15
  107. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_adaptive_batching.py +279 -3
  108. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_batcher.py +13 -11
  109. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_loader.py +79 -0
  110. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_lora_generation_exclusion.py +1 -1
  111. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_model_load_timeout.py +1 -1
  112. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_preprocessor.py +358 -0
  113. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_async.py +1 -1
  114. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_core.py +39 -1
  115. sie_server-0.4.2/tests/core/test_worker_passthrough.py +220 -0
  116. {sie_server-0.4.1 → sie_server-0.4.2}/tests/integration/test_chat_completions.py +2 -3
  117. {sie_server-0.4.1 → sie_server-0.4.2}/tests/integration/test_grammar_generate.py +1 -1
  118. {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_generation_metrics.py +4 -4
  119. {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_metrics.py +110 -44
  120. {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_trace_propagation.py +3 -3
  121. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_grammar_prewarm.py +2 -4
  122. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_streaming.py +14 -26
  123. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_work_class_scheduler.py +1 -1
  124. sie_server-0.4.2/tests/test_adapter_call_loop.py +295 -0
  125. {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_docker_integration.py +9 -9
  126. sie_server-0.4.2/tests/test_ipc_server.py +712 -0
  127. sie_server-0.4.2/tests/test_ipc_types_raw_output.py +162 -0
  128. sie_server-0.4.2/tests/test_model_yaml_filenames.py +35 -0
  129. sie_server-0.4.2/tests/test_parity_run_batch.py +332 -0
  130. sie_server-0.4.2/tests/test_queue_executor.py +724 -0
  131. sie_server-0.4.2/tests/test_queue_executor_stage1d.py +622 -0
  132. sie_server-0.4.2/tests/test_readiness.py +53 -0
  133. sie_server-0.4.2/tests/test_server_smoke.py +14 -0
  134. sie_server-0.4.2/tests/test_stage1d_byte_identity.py +393 -0
  135. {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_media_bytes.py +38 -11
  136. sie_server-0.4.1/models/Qwen__Qwen3.6-27B.yaml +0 -196
  137. sie_server-0.4.1/src/sie_server/__init__.py +0 -3
  138. sie_server-0.4.1/src/sie_server/core/preprocessor/text.py +0 -268
  139. sie_server-0.4.1/src/sie_server/nats_pull_loop.py +0 -2532
  140. sie_server-0.4.1/src/sie_server/nats_subscriber.py +0 -231
  141. sie_server-0.4.1/tests/test_nats_pull_loop.py +0 -1122
  142. sie_server-0.4.1/tests/test_nats_pull_loop_batching.py +0 -1291
  143. sie_server-0.4.1/tests/test_server_smoke.py +0 -8
  144. {sie_server-0.4.1 → sie_server-0.4.2}/.gitignore +0 -0
  145. {sie_server-0.4.1 → sie_server-0.4.2}/CONTRIBUTING.md +0 -0
  146. {sie_server-0.4.1 → sie_server-0.4.2}/LICENSE +0 -0
  147. {sie_server-0.4.1 → sie_server-0.4.2}/bundles/sglang-embedding.yaml +0 -0
  148. {sie_server-0.4.1 → sie_server-0.4.2}/bundles/sglang.yaml +0 -0
  149. {sie_server-0.4.1 → sie_server-0.4.2}/bundles/transformers5.yaml +0 -0
  150. {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
  151. {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +0 -0
  152. {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
  153. {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
  154. {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
  155. {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-m3.yaml +0 -0
  156. {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-reranker-base.yaml +0 -0
  157. {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-reranker-large.yaml +0 -0
  158. {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
  159. {sie_server-0.4.1 → sie_server-0.4.2}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
  160. {sie_server-0.4.1 → sie_server-0.4.2}/models/GritLM__GritLM-7B.yaml +0 -0
  161. {sie_server-0.4.1 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
  162. {sie_server-0.4.1 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
  163. {sie_server-0.4.1 → sie_server-0.4.2}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
  164. {sie_server-0.4.1 → sie_server-0.4.2}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +0 -0
  165. {sie_server-0.4.1 → sie_server-0.4.2}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
  166. {sie_server-0.4.1 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
  167. {sie_server-0.4.1 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
  168. {sie_server-0.4.1 → sie_server-0.4.2}/models/NeuML__gliner-bert-tiny.yaml +0 -0
  169. {sie_server-0.4.1 → sie_server-0.4.2}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
  170. {sie_server-0.4.1 → sie_server-0.4.2}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
  171. {sie_server-0.4.1 → sie_server-0.4.2}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
  172. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
  173. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-4B.yaml +0 -0
  174. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
  175. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
  176. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
  177. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
  178. {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3.5-4B.yaml +0 -0
  179. {sie_server-0.4.1 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-2_R.yaml +0 -0
  180. {sie_server-0.4.1 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-Mistral.yaml +0 -0
  181. {sie_server-0.4.1 → sie_server-0.4.2}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
  182. /sie_server-0.4.1/models/tomoroai__tomoro-colqwen3-embed-4b.yaml → /sie_server-0.4.2/models/TomoroAI__tomoro-colqwen3-embed-4b.yaml +0 -0
  183. {sie_server-0.4.1 → sie_server-0.4.2}/models/answerdotai__ModernBERT-base.yaml +0 -0
  184. {sie_server-0.4.1 → sie_server-0.4.2}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
  185. {sie_server-0.4.1 → sie_server-0.4.2}/models/colbert-ir__colbertv2.0.yaml +0 -0
  186. {sie_server-0.4.1 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
  187. {sie_server-0.4.1 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
  188. {sie_server-0.4.1 → sie_server-0.4.2}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
  189. {sie_server-0.4.1 → sie_server-0.4.2}/models/fastino__gliner2-base-v1.yaml +0 -0
  190. {sie_server-0.4.1 → sie_server-0.4.2}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
  191. {sie_server-0.4.1 → sie_server-0.4.2}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
  192. {sie_server-0.4.1 → sie_server-0.4.2}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
  193. {sie_server-0.4.1 → sie_server-0.4.2}/models/google__embeddinggemma-300m.yaml +0 -0
  194. {sie_server-0.4.1 → sie_server-0.4.2}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
  195. {sie_server-0.4.1 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-224.yaml +0 -0
  196. {sie_server-0.4.1 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-384.yaml +0 -0
  197. {sie_server-0.4.1 → sie_server-0.4.2}/models/google__siglip2-base-patch16-224.yaml +0 -0
  198. {sie_server-0.4.1 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
  199. {sie_server-0.4.1 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
  200. {sie_server-0.4.1 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
  201. {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-base-v2.yaml +0 -0
  202. {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-large-v2.yaml +0 -0
  203. {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-mistral-7b-instruct.yaml +0 -0
  204. {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-small-v2.yaml +0 -0
  205. {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
  206. {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large.yaml +0 -0
  207. {sie_server-0.4.1 → sie_server-0.4.2}/models/jackboyla__glirel-large-v0.yaml +0 -0
  208. {sie_server-0.4.1 → sie_server-0.4.2}/models/jinaai__jina-colbert-v2.yaml +0 -0
  209. {sie_server-0.4.1 → sie_server-0.4.2}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
  210. {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
  211. {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
  212. {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
  213. {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
  214. {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
  215. {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
  216. {sie_server-0.4.1 → sie_server-0.4.2}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
  217. {sie_server-0.4.1 → sie_server-0.4.2}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
  218. {sie_server-0.4.1 → sie_server-0.4.2}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
  219. {sie_server-0.4.1 → sie_server-0.4.2}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
  220. {sie_server-0.4.1 → sie_server-0.4.2}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
  221. {sie_server-0.4.1 → sie_server-0.4.2}/models/microsoft__Florence-2-base-ft.yaml +0 -0
  222. {sie_server-0.4.1 → sie_server-0.4.2}/models/microsoft__Florence-2-base.yaml +0 -0
  223. {sie_server-0.4.1 → sie_server-0.4.2}/models/microsoft__Florence-2-large.yaml +0 -0
  224. {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
  225. {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
  226. {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
  227. {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
  228. {sie_server-0.4.1 → sie_server-0.4.2}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
  229. {sie_server-0.4.1 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
  230. {sie_server-0.4.1 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
  231. {sie_server-0.4.1 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
  232. {sie_server-0.4.1 → sie_server-0.4.2}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
  233. {sie_server-0.4.1 → sie_server-0.4.2}/models/naver__splade-v3.yaml +0 -0
  234. {sie_server-0.4.1 → sie_server-0.4.2}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
  235. {sie_server-0.4.1 → sie_server-0.4.2}/models/numind__NuNER_Zero-span.yaml +0 -0
  236. {sie_server-0.4.1 → sie_server-0.4.2}/models/numind__NuNER_Zero.yaml +0 -0
  237. {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__NV-Embed-v2.yaml +0 -0
  238. {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
  239. {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
  240. {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
  241. {sie_server-0.4.1 → sie_server-0.4.2}/models/openai__clip-vit-base-patch32.yaml +0 -0
  242. {sie_server-0.4.1 → sie_server-0.4.2}/models/openai__clip-vit-large-patch14.yaml +0 -0
  243. {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
  244. {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
  245. {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
  246. {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
  247. {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
  248. {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
  249. {sie_server-0.4.1 → sie_server-0.4.2}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
  250. {sie_server-0.4.1 → sie_server-0.4.2}/models/rasyosef__splade-mini.yaml +0 -0
  251. {sie_server-0.4.1 → sie_server-0.4.2}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
  252. {sie_server-0.4.1 → sie_server-0.4.2}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
  253. {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_large-v2.1.yaml +0 -0
  254. {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_medium-v2.1.yaml +0 -0
  255. {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_multi-v2.1.yaml +0 -0
  256. {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
  257. {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_small-v2.1.yaml +0 -0
  258. {sie_server-0.4.1 → sie_server-0.4.2}/models/vidore__colpali-v1.3-hf.yaml +0 -0
  259. {sie_server-0.4.1 → sie_server-0.4.2}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
  260. {sie_server-0.4.1 → sie_server-0.4.2}/models/zai-org__GLM-OCR.yaml +0 -0
  261. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/__init__.py +0 -0
  262. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_base_adapter.py +0 -0
  263. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_flash_base.py +0 -0
  264. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_spec.py +0 -0
  265. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_types.py +0 -0
  266. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
  267. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
  268. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flag/__init__.py +0 -0
  269. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
  270. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
  271. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colbert/__init__.py +0 -0
  272. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
  273. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
  274. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
  275. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/errors.py +0 -0
  276. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/florence2/__init__.py +0 -0
  277. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliclass/__init__.py +0 -0
  278. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliner2/__init__.py +0 -0
  279. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
  280. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/grounding_dino/__init__.py +0 -0
  281. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
  282. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
  283. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/lighton_ocr/__init__.py +0 -0
  284. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
  285. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
  286. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
  287. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
  288. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
  289. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/owlv2/__init__.py +0 -0
  290. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/paddleocr_vl/__init__.py +0 -0
  291. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
  292. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
  293. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +0 -0
  294. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +0 -0
  295. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
  296. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
  297. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/__init__.py +0 -0
  298. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
  299. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
  300. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
  301. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/__init__.py +0 -0
  302. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/generate.py +0 -0
  303. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/metrics.py +0 -0
  304. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/models.py +0 -0
  305. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/openapi.py +0 -0
  306. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/options.py +0 -0
  307. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/root.py +0 -0
  308. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/score.py +0 -0
  309. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/validation.py +0 -0
  310. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/app/__init__.py +0 -0
  311. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/app/app_state_config.py +0 -0
  312. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/config/__init__.py +0 -0
  313. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/__init__.py +0 -0
  314. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/gpu_health.py +0 -0
  315. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/hf_env.py +0 -0
  316. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/load_errors.py +0 -0
  317. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/logging.py +0 -0
  318. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/oom.py +0 -0
  319. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/image.py +0 -0
  320. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/shutdown.py +0 -0
  321. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/text_tokens.py +0 -0
  322. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/watcher.py +0 -0
  323. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/__init__.py +0 -0
  324. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/base.py +0 -0
  325. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/encode.py +0 -0
  326. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/extract.py +0 -0
  327. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/score.py +0 -0
  328. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/oom_recovery.py +0 -0
  329. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/health/__init__.py +0 -0
  330. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/health/nats_publisher.py +0 -0
  331. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/health/saturation.py +0 -0
  332. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/main.py +0 -0
  333. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/telemetry.py +0 -0
  334. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/__init__.py +0 -0
  335. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/admission.py +0 -0
  336. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/base.py +0 -0
  337. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/grammar_cache.py +0 -0
  338. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/grammar_compile.py +0 -0
  339. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/tool_call_grammar.py +0 -0
  340. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/tool_call_parser.py +0 -0
  341. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/static/__init__.py +0 -0
  342. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/static/index.html +0 -0
  343. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/__init__.py +0 -0
  344. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/grammar.py +0 -0
  345. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/openapi.py +0 -0
  346. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/overflow_policy.py +0 -0
  347. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/requests.py +0 -0
  348. {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/responses.py +0 -0
  349. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/__init__.py +0 -0
  350. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_base.py +0 -0
  351. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_bge_m3.py +0 -0
  352. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_bge_m3_flash.py +0 -0
  353. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_colbert.py +0 -0
  354. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_docling_smoke.py +0 -0
  355. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_donut.py +0 -0
  356. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_factory_integration.py +0 -0
  357. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_flash_base.py +0 -0
  358. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_florence2.py +0 -0
  359. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
  360. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_glirel.py +0 -0
  361. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_glm_ocr.py +0 -0
  362. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_grounding_dino.py +0 -0
  363. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_gte_sparse.py +0 -0
  364. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
  365. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_lighton_ocr.py +0 -0
  366. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_lora.py +0 -0
  367. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_lora_integration.py +0 -0
  368. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_paddleocr_vl.py +0 -0
  369. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_runtime_options.py +0 -0
  370. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sglang.py +0 -0
  371. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_siglip.py +0 -0
  372. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sparse_aggregation.py +0 -0
  373. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_stablebridge_integration.py +0 -0
  374. {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_stablebridge_pruner.py +0 -0
  375. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/__init__.py +0 -0
  376. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_json_schema.py +0 -0
  377. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_validation.py +0 -0
  378. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_extract_integration.py +0 -0
  379. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_extract_oom.py +0 -0
  380. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_health.py +0 -0
  381. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_models.py +0 -0
  382. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_openai_compat.py +0 -0
  383. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_score.py +0 -0
  384. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_version_header.py +0 -0
  385. {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_ws.py +0 -0
  386. {sie_server-0.4.1 → sie_server-0.4.2}/tests/app/__init__.py +0 -0
  387. {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/__init__.py +0 -0
  388. {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_model_prewarm_grammars.py +0 -0
  389. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/__init__.py +0 -0
  390. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_disk_cache.py +0 -0
  391. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_gpu_health.py +0 -0
  392. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_hot_reload.py +0 -0
  393. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_idle_evict.py +0 -0
  394. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_inference.py +0 -0
  395. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_logging.py +0 -0
  396. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_memory.py +0 -0
  397. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_oom_detection.py +0 -0
  398. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_pool_isolation.py +0 -0
  399. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_postprocessor.py +0 -0
  400. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_postprocessor_registry.py +0 -0
  401. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_prepared.py +0 -0
  402. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_preprocessor_registry.py +0 -0
  403. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_quantization.py +0 -0
  404. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_readiness.py +0 -0
  405. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_core.py +0 -0
  406. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_deps.py +0 -0
  407. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_failed_state.py +0 -0
  408. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_memory.py +0 -0
  409. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_multi_model.py +0 -0
  410. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_shutdown.py +0 -0
  411. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_timing.py +0 -0
  412. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_watcher.py +0 -0
  413. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_backpressure.py +0 -0
  414. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_extract.py +0 -0
  415. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_lora.py +0 -0
  416. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_options.py +0 -0
  417. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_score.py +0 -0
  418. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/worker/__init__.py +0 -0
  419. {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/worker/test_oom_recovery.py +0 -0
  420. {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/__init__.py +0 -0
  421. {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/test_nats_publisher.py +0 -0
  422. {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/test_saturation.py +0 -0
  423. {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/test_worker_id_consistency.py +0 -0
  424. {sie_server-0.4.1 → sie_server-0.4.2}/tests/integration/__init__.py +0 -0
  425. {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/__init__.py +0 -0
  426. {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_telemetry.py +0 -0
  427. {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_tracing.py +0 -0
  428. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/__init__.py +0 -0
  429. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_grammar_cache.py +0 -0
  430. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_grammar_compile.py +0 -0
  431. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_streaming_admission.py +0 -0
  432. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_streaming_integration.py +0 -0
  433. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_tool_call_grammar.py +0 -0
  434. {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_tool_call_parser.py +0 -0
  435. {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_all_models.py +0 -0
  436. {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_openapi_export.py +0 -0
  437. {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_sdk_integration.py +0 -0
  438. {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_sparse_integration.py +0 -0
  439. {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/__init__.py +0 -0
  440. {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_inputs.py +0 -0
  441. {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_inputs_json_decode.py +0 -0
  442. {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_types.py +0 -0
@@ -8,7 +8,7 @@
8
8
  ARG BUNDLE=default
9
9
 
10
10
  # =============================================================================
11
- # Stage 1: Dependencies (pyproject.toml only, cached across code changes)
11
+ # Dependency image: pyproject-only cache seed
12
12
  # =============================================================================
13
13
  FROM python:3.12-slim-bookworm AS deps
14
14
 
@@ -57,9 +57,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
57
57
  -e ".[gpu-metrics]"
58
58
 
59
59
  # =============================================================================
60
- # Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
60
+ # Shared runtime base: source install and venv finalization
61
61
  # =============================================================================
62
- # Bundle-agnostic: all base-stage layers are shared across bundles of this
62
+ # Bundle-agnostic: all base image layers are shared across bundles of this
63
63
  # platform in local BuildKit cache and in content-addressed registry layers.
64
64
  FROM deps AS base
65
65
 
@@ -107,7 +107,7 @@ RUN set -eux; \
107
107
  find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
108
108
 
109
109
  # =============================================================================
110
- # Stage 3: Builder - bundle-specific deps
110
+ # Bundle dependency builder: bundle-specific deps
111
111
  # =============================================================================
112
112
  FROM base AS builder
113
113
 
@@ -150,7 +150,7 @@ RUN set -eux; \
150
150
  find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
151
151
 
152
152
  # =============================================================================
153
- # Stage 4: Runtime
153
+ # Runtime image
154
154
  # =============================================================================
155
155
  FROM python:3.12-slim-bookworm AS runtime
156
156
 
@@ -1,5 +1,5 @@
1
1
  # syntax=docker/dockerfile:1
2
- # SIE Server - CUDA 12.4 Image
2
+ # SIE Server - CUDA 12 Image
3
3
  # Build from repo root:
4
4
  # docker build -f packages/sie_server/Dockerfile.cuda12 -t sie-server:cuda12-default .
5
5
  # docker build -f packages/sie_server/Dockerfile.cuda12 --build-arg BUNDLE=sglang -t sie-server:cuda12-sglang .
@@ -8,7 +8,7 @@ ARG BUNDLE=default
8
8
  ARG UV_VERSION=0.9.28
9
9
 
10
10
  # =============================================================================
11
- # Stage 1: uv + standalone Python 3.12 (no deadsnakes PPA)
11
+ # Dependency image: uv and standalone Python 3.12
12
12
  # =============================================================================
13
13
  FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS deps
14
14
 
@@ -59,10 +59,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
59
59
  -e ".[gpu-metrics]"
60
60
 
61
61
  # =============================================================================
62
- # Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
62
+ # Shared CUDA base: source install and venv finalization
63
63
  # =============================================================================
64
64
  # Everything here is bundle-agnostic, so bundle-specific builds of a given
65
- # platform share every base-stage layer in local BuildKit cache and in
65
+ # platform share every base image layer in local BuildKit cache and in
66
66
  # content-addressed registry layers.
67
67
  FROM deps AS base
68
68
 
@@ -126,7 +126,7 @@ RUN set -eux; \
126
126
  find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
127
127
 
128
128
  # =============================================================================
129
- # Stage 3: Builder - bundle-specific deps
129
+ # Bundle dependency builder: bundle-specific deps
130
130
  # =============================================================================
131
131
  FROM base AS builder
132
132
 
@@ -173,11 +173,19 @@ RUN set -eux; \
173
173
  find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
174
174
 
175
175
  # =============================================================================
176
- # Stage 4: Runtime
176
+ # Runtime image
177
177
  # =============================================================================
178
- # Use base CUDA image (not devel/runtime) — PyTorch wheels bundle CUDA libs,
179
- # cuDNN ships inside torch. Saves ~2GB vs `runtime` variant.
180
- FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime
178
+ # Runtime base selection is bundle-scoped. Most CUDA bundles stay on the
179
+ # smaller CUDA base runtime; SGLang-family bundles need the devel toolkit
180
+ # because flashinfer/tvm_ffi perform runtime JIT through nvcc on first decode.
181
+ FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-default
182
+ FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-transformers5
183
+ FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 AS runtime-sglang
184
+ ENV CUDA_HOME=/usr/local/cuda \
185
+ LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
186
+ FROM runtime-sglang AS runtime-sglang-embedding
187
+
188
+ FROM runtime-${BUNDLE} AS runtime
181
189
 
182
190
  ENV DEBIAN_FRONTEND=noninteractive
183
191
 
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sie-server
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Search Inference Engine - GPU inference server for search workloads
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
7
7
  Requires-Python: <3.13,>=3.12
8
+ Requires-Dist: blake3<1,>=0.4
8
9
  Requires-Dist: docling<3,>=2
9
10
  Requires-Dist: einops<1,>=0.8
10
11
  Requires-Dist: fastapi<1,>=0.115
@@ -17,7 +18,6 @@ Requires-Dist: loguru<1,>=0.7
17
18
  Requires-Dist: msgpack-numpy<1,>=0.4
18
19
  Requires-Dist: msgpack<2,>=1.1
19
20
  Requires-Dist: msgspec>=0.20.0
20
- Requires-Dist: nats-py<3,>=2.9
21
21
  Requires-Dist: numpy<3,>=2
22
22
  Requires-Dist: open-clip-torch>=2.24
23
23
  Requires-Dist: opencv-python-headless<5,>=4
@@ -26,7 +26,7 @@ Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
26
26
  Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
27
27
  Requires-Dist: opentelemetry-sdk<2,>=1.28
28
28
  Requires-Dist: packaging<25,>=24
29
- Requires-Dist: pillow<12,>=11
29
+ Requires-Dist: pillow>=12.2.0
30
30
  Requires-Dist: prometheus-client<1,>=0.21
31
31
  Requires-Dist: pydantic-settings<3,>=2.6
32
32
  Requires-Dist: pydantic<3,>=2.9
@@ -66,7 +66,7 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
66
66
 
67
67
  | Env var | Default | Effect |
68
68
  |--|--|--|
69
- | `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
69
+ | `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default because SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
70
70
 
71
71
  For nested settings (any field with `__`), the env-var format is
72
72
  `SIE_<TOP>__<NESTED>=value`. The complete schema is in
@@ -43,6 +43,7 @@ adapters:
43
43
  - sie_server.adapters.florence2
44
44
  - sie_server.adapters.docling
45
45
  - sie_server.adapters.paddleocr_vl
46
+ - sie_server.adapters.mineru_vl
46
47
  deps:
47
48
  # Most flash adapters; sentence_transformer needs >=4.57
48
49
  transformers: '>=4.57,<5'
@@ -0,0 +1,28 @@
1
+ sie_id: Marqo/marqo-fashionSigLIP
2
+ hf_id: Marqo/marqo-fashionSigLIP
3
+ inputs:
4
+ text: true
5
+ image: true
6
+ audio: false
7
+ video: false
8
+ tasks:
9
+ encode:
10
+ dense:
11
+ dim: 768
12
+ sparse: null
13
+ multivector: null
14
+ score: null
15
+ extract: null
16
+ max_sequence_length: 64
17
+ profiles:
18
+ default:
19
+ max_batch_tokens: 16384
20
+ compute_precision: float16
21
+ adapter_path: sie_server.adapters.siglip:SiglipAdapter
22
+ adapter_options:
23
+ loadtime:
24
+ backend: open_clip
25
+ open_clip_model_id: hf-hub:Marqo/marqo-fashionSigLIP
26
+ dense_dim: 768
27
+ runtime:
28
+ normalize: true
@@ -6,40 +6,39 @@ inputs:
6
6
  audio: false
7
7
  video: false
8
8
  tasks:
9
- # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
10
- # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
11
- # bad — this is a transport benchmark target, not a production model.
9
+ # Small, fast generation model — a viable PROD pick for simple/short-prompt
10
+ # tasks (cheap + high throughput; weaker on long-context reasoning). Loads in
11
+ # ~30s. Also doubles as the transport/walking-skeleton benchmark target.
12
12
  #
13
- # Context / batch sizes are deliberately small (1024 vs the headroom an
14
- # L4 could nominally support) so the validation harness can co-resident
15
- # the worker's SGLang with a second SGLang for the baseline phase on
16
- # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
17
- # alone on a card it doesn't have to share.
13
+ # ``context_length`` is the standalone PROD serving value (4096): big enough
14
+ # to fit the full generation benchmark pack (casehold prompts reach ~1.8k
15
+ # tokens, gpqa ~1.3k) so the model is comparable to the rest of the fleet on
16
+ # every task, while KV stays trivial at this size (112 KB/token 4096
17
+ # 0.46 GB). The validation/co-residency harness, which packs two SGLang
18
+ # instances onto a single 22 GiB L4, does NOT depend on this default — it
19
+ # passes an explicit ``--max-seq-length``/``--context-length`` (see
20
+ # tools/bench_generation_matrix.py + run_generation_smoke.py) and caps itself
21
+ # to 1024 for that case.
18
22
  #
19
- # Note on the three 1024s below: `context_length`, `max_sequence_length`,
20
- # and `max_batch_tokens` are NOT redundant — they're three independent
21
- # knobs (per-request context, SGLang --context-length, batcher cost
22
- # budget) that just happen to collide here because the model is tiny.
23
- # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
24
- # non-collapsed shape.
23
+ # ``context_length``, ``max_sequence_length``, and ``max_batch_tokens`` are
24
+ # three independent knobs (per-request context, SGLang --context-length,
25
+ # batcher cost budget); see sibling Qwen__Qwen3-4B-Instruct-2507.yaml.
25
26
  generate:
26
- context_length: 1024
27
+ context_length: 4096
27
28
  max_output_tokens: 1024
28
29
  capabilities:
29
30
  grammar: []
30
31
  streaming: true
31
32
  tools: false
32
- max_sequence_length: 1024
33
+ max_sequence_length: 4096
33
34
  # KV-cache memory math (Qwen3-0.6B, bf16):
34
35
  # layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
35
36
  # kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
36
- # The 0.6B is a transport benchmark target — context_length is held at
37
- # 1024 deliberately (see header comment) so the validation harness can
38
- # co-resident two SGLang instances on an L4. KV budgets per profile
39
- # scale with the deployment scenario rather than the GPU ceiling.
37
+ # At ctx=4096 a single request's KV is ~0.46 GB — negligible. The co-residency
38
+ # harness still caps context explicitly when it has to share a card.
40
39
  profiles:
41
40
  default:
42
- max_batch_tokens: 1024
41
+ max_batch_tokens: 4096
43
42
  compute_precision: bfloat16
44
43
  adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
45
44
  kv_budget_tokens: 8192
@@ -66,8 +66,7 @@ max_sequence_length: 32768
66
66
  # kv_budget_tokens is set to ~40% of theoretical max, matching the L4
67
67
  # baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
68
68
  # speculative side-cell, grammar/Outlines compile arena, fragmentation.
69
- # Final empirical validation (concurrency-16 OOM-boundary sweep) is
70
- # tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
69
+ # Final empirical validation should use concurrency and OOM-boundary sweeps.
71
70
  profiles:
72
71
  default:
73
72
  # max_batch_tokens is a generic engine knob; generation does not batch
@@ -93,7 +92,7 @@ profiles:
93
92
  top_p: 0.9
94
93
  stop_tokens:
95
94
  - "<|im_end|>"
96
- # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
95
+ # Analytical defaults for a100-40gb / h100. Production
97
96
  # capacity also grows: with 2-4× the KV budget the context window can be
98
97
  # widened proportionally so longer-context workloads (RAG with large
99
98
  # retrieved passages) fit comfortably. ``max_output_tokens`` doubles
@@ -0,0 +1,308 @@
1
+ sie_id: Qwen/Qwen3.6-27B
2
+ hf_id: Qwen/Qwen3.6-27B
3
+ inputs:
4
+ # Qwen3.6-27B is a unified vision-language model (Gated DeltaNet + Gated
5
+ # Attention with an integrated vision encoder; same hybrid family as
6
+ # Qwen3.5-4B but scaled to 64 layers / hidden_size=5120). The wire
7
+ # surface accepts text+image via the OpenAI chat-completions schema
8
+ # (``image_url`` content parts); video is documented by Qwen but not
9
+ # yet wired through the SIE gateway.
10
+ text: true
11
+ image: true
12
+ audio: false
13
+ video: false
14
+ tasks:
15
+ generate:
16
+ # Native context length is 262,144 tokens (YaRN extends to ~1M).
17
+ # Default to 4096 here — this matches the empirically-calibrated
18
+ # reference point in the profile comments below
19
+ # (mem_fraction_static=0.93, weight 51.05 GB + kvcache 11.63 GB).
20
+ # 8192 was the original optimistic default but SGLang's
21
+ # init_memory_pool refused to fit it inside the conservative
22
+ # mem_fraction envelope on H100-80GB even when bumped to 0.97 —
23
+ # NEXTN speculative-decoding draft KV pushes the total past the
24
+ # available headroom. Raise both context_length AND
25
+ # mem_fraction_static together if you need longer contexts;
26
+ # benchmarking and prod requests today fit comfortably in 4096.
27
+ context_length: 4096
28
+ max_output_tokens: 4096
29
+ capabilities:
30
+ # Same constraint as Qwen3.5-4B: SGLang's outlines_backend does
31
+ # not implement ebnf. xgrammar smoke would pass all three; flip
32
+ # ``grammar_backend: xgrammar`` and re-add ``"ebnf"`` here if a
33
+ # consumer needs it.
34
+ grammar: ["json_schema", "regex"]
35
+ streaming: true
36
+ tools: true
37
+ # Qwen3.6 emits ``<think>...</think>`` reasoning by default. We
38
+ # disable it for the OpenAI-compat path so visible output is the
39
+ # answer only. Operators wanting CoT can flip this profile-side.
40
+ chat_template_kwargs:
41
+ enable_thinking: false
42
+ prewarm_grammars:
43
+ # Bare pattern, NOT anchored — Outlines regexes are implicitly
44
+ # anchored and its FSM engine rejects ``^``/``$``. See the
45
+ # Qwen3.5-4B model card for the full back-story.
46
+ - name: yes_no
47
+ kind: regex
48
+ value: "(yes|no)"
49
+ - name: short_answer
50
+ kind: json_schema
51
+ value:
52
+ type: object
53
+ properties:
54
+ answer:
55
+ type: string
56
+ required: [answer]
57
+ max_sequence_length: 4096
58
+ # ── KV-cache math (placeholder, pending Modal calibration) ──
59
+ #
60
+ # Qwen3.6-27B layer breakdown (per model card / config.json):
61
+ # * 64 layers in a 16 × (3 × DeltaNet + 1 × GatedAttention) pattern
62
+ # * 16 KV-bearing Gated-Attention layers (4 KV heads × head_dim=256)
63
+ # * 48 recurrent Gated-DeltaNet layers — managed by SGLang's mamba
64
+ # scheduler under ``--mamba-scheduler-strategy extra_buffer``
65
+ #
66
+ # BF16 weights ≈ 27e9 × 2 B ≈ 54 GB before activations / KV.
67
+ # * L4 (22 GB) → infeasible
68
+ # * A100-40GB (40 GB) → infeasible BF16; would need FP8 or TP2
69
+ # * H100-80GB → primary target (single-GPU, BF16)
70
+ # * H100×2 (160 GB) → for context >32k or large concurrencies
71
+ #
72
+ # ``kv_budget_tokens`` is a conservative analytical placeholder pending
73
+ # the first /get_server_info dump from tools/smoke_qwen36_27b.py on
74
+ # Modal. Re-calibrate from the empirical token_capacity before relying
75
+ # on these numbers in production.
76
+ profiles:
77
+ # L4 / A100-40GB profiles intentionally omitted — Qwen3.6-27B's BF16
78
+ # weights (~54 GB) do not fit. Add an FP8 or TP2 profile if those
79
+ # tiers become required.
80
+ default:
81
+ max_batch_tokens: 16384
82
+ compute_precision: bfloat16
83
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
84
+ # Conservative H100-80GB placeholder pending the first
85
+ # /get_server_info dump from tools/smoke_qwen36_27b.py on Modal.
86
+ # Empirical reference point (2026-05-27 smoke, mem_fraction_static=0.93,
87
+ # context_length=4096, no-spec): SGLang reported
88
+ # ``weight=51.05 GB, kvcache=11.63 GB, token_capacity=190,543``.
89
+ # Sizing here at ~1/12 of that capacity to leave room for NEXTN
90
+ # draft KV + grammar/Outlines compile arena once spec is re-enabled.
91
+ # Re-calibrate after a smoke run on this profile's actual settings.
92
+ # Halved from 16384 to leave more H100-80GB headroom for the NEXTN
93
+ # draft activation arena. 8192 KV slots × ~64 KB/token ≈ 0.5 GB —
94
+ # plenty for the realistic per-request concurrency on this profile.
95
+ kv_budget_tokens: 8192
96
+ adapter_options:
97
+ loadtime:
98
+ # 0.95 paired with ``context_length: 4096`` + the *smaller*
99
+ # NEXTN draft below (num_steps=2 / num_draft_tokens=2 vs the
100
+ # 3/4 model-card default). The 0.93+default-draft cell still
101
+ # OOM'd ``init_memory_pool`` because verification batch grows
102
+ # with num_steps × num_draft_tokens; halving both shrinks the
103
+ # activation arena enough to fit.
104
+ mem_fraction_static: 0.95
105
+ served_model_name: Qwen/Qwen3.6-27B
106
+ disable_cuda_graph: true
107
+ attention_backend: triton
108
+ grammar_backend: outlines
109
+ reasoning_parser: qwen3
110
+ tool_call_parser: qwen3_coder
111
+ # MTP/NEXTN per the Qwen3.x model-card recipe (SGLang implements
112
+ # NEXTN under the EAGLE codepath; ``/server_info`` reports
113
+ # ``speculative_algorithm: EAGLE``). Smaller-draft variant
114
+ # (num_steps=2 / num_draft_tokens=2 vs the model-card 3/4) so
115
+ # the verification batch fits inside H100-80GB at ctx=4096 +
116
+ # mfs=0.95. The 3/4 default reliably OOM'd
117
+ # ``init_memory_pool`` even at mfs=0.97 — the trade is a
118
+ # slightly smaller speculative window for the ability to fit at
119
+ # all. Re-tune (or re-enable 3/4) once FP8 / TP=2 is wired up.
120
+ speculative:
121
+ enabled: true
122
+ algorithm: nextn
123
+ num_steps: 2
124
+ eagle_topk: 1
125
+ num_draft_tokens: 2
126
+ # ``--mamba-scheduler-strategy extra_buffer`` is the required
127
+ # pair-flag for NEXTN spec on the hybrid Gated-DeltaNet
128
+ # architecture. It is also harmless when spec is disabled, which is
129
+ # why the ``no-spec`` profile keeps the same flags.
130
+ extra_launch_args:
131
+ - "--mamba-scheduler-strategy"
132
+ - "extra_buffer"
133
+ - "--disable-overlap-schedule"
134
+ runtime:
135
+ first_chunk_timeout_s: 90
136
+ inter_chunk_timeout_s: 15
137
+ overall_timeout_s: 600
138
+ default_sampling:
139
+ temperature: 0.7
140
+ top_p: 0.8
141
+ presence_penalty: 1.5
142
+ stop_tokens:
143
+ - "<|im_end|>"
144
+ h100:
145
+ max_batch_tokens: 32768
146
+ compute_precision: bfloat16
147
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
148
+ # Same H100-80GB target as ``default``; this profile widens
149
+ # ``max_batch_tokens`` for batch-heavy workloads once calibration
150
+ # confirms KV headroom.
151
+ kv_budget_tokens: 32768
152
+ adapter_options:
153
+ loadtime:
154
+ # NOTE: 0.93 here vs 0.95 in ``default`` — see that profile's mfs comment for the sizing rationale.
155
+ mem_fraction_static: 0.93
156
+ served_model_name: Qwen/Qwen3.6-27B
157
+ disable_cuda_graph: true
158
+ attention_backend: triton
159
+ grammar_backend: outlines
160
+ reasoning_parser: qwen3
161
+ tool_call_parser: qwen3_coder
162
+ speculative:
163
+ enabled: true
164
+ algorithm: nextn
165
+ num_steps: 3
166
+ eagle_topk: 1
167
+ num_draft_tokens: 4
168
+ # ``--disable-overlap-schedule`` is the required pair-flag for
169
+ # NEXTN + mamba-scheduler ``extra_buffer`` on the hybrid Gated-
170
+ # DeltaNet architecture (same constraint as Qwen3.5-4B).
171
+ extra_launch_args:
172
+ - "--mamba-scheduler-strategy"
173
+ - "extra_buffer"
174
+ - "--disable-overlap-schedule"
175
+ runtime:
176
+ first_chunk_timeout_s: 90
177
+ inter_chunk_timeout_s: 15
178
+ overall_timeout_s: 600
179
+ default_sampling:
180
+ temperature: 0.7
181
+ top_p: 0.8
182
+ presence_penalty: 1.5
183
+ stop_tokens:
184
+ - "<|im_end|>"
185
+ # RTX PRO 6000 (96 GB GDDR7, Blackwell Server Edition, sm_120) profile.
186
+ # FP8-first for max throughput: ``--quantization fp8`` (SGLang online
187
+ # dynamic FP8 quant of the BF16 checkpoint) via the ``extra_launch_args``
188
+ # passthrough — ``compute_precision`` can only express the ``--dtype``
189
+ # axis (float16/bfloat16/float32), not the orthogonal ``--quantization``
190
+ # flag, so FP8 rides the same escape hatch already used for the mamba
191
+ # scheduler. FP8 halves weight memory (~54 → ~27 GB), which frees room
192
+ # for the *model-card* NEXTN 3/4 draft (num_steps=3 / num_draft_tokens=4)
193
+ # that OOM'd ``init_memory_pool`` on H100-80GB even at mfs=0.97. The +16 GB
194
+ # over H100 plus the FP8 weight saving is what makes 3/4 fit here.
195
+ #
196
+ # ACCURACY CONTRACT: FP8 is lossy. This profile is validated to within the
197
+ # Wilson 95% CI of the *BF16* baseline on all four generation tasks (see
198
+ # docs/adr/0001). If FP8 misses parity after bounded tuning, fall back to
199
+ # BF16 + NEXTN 3/4 (drop the ``--quantization fp8`` pair below; the 96 GB
200
+ # still fits the 3/4 draft in BF16). KV cache stays BF16 here — add
201
+ # ``--kv-cache-dtype fp8_e4m3`` only if memory/throughput needs it AND
202
+ # accuracy still holds (KV FP8 is usually the first thing to cost accuracy).
203
+ #
204
+ # Standalone block (no ``extends``): production ``resolve_profile`` does a
205
+ # full-replace of ``loadtime`` for extending profiles, so a partial child
206
+ # would drop inherited launch flags and desync the via-SIE path from the
207
+ # bare-SGLang bench control.
208
+ rtx-pro-6000:
209
+ max_batch_tokens: 32768
210
+ compute_precision: bfloat16
211
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
212
+ # FP8 weights (~27 GB) + 96 GB total leaves generous KV/draft headroom;
213
+ # start conservative and raise after the first /get_server_info dump on
214
+ # the actual RTX-PRO-6000 smoke.
215
+ kv_budget_tokens: 16384
216
+ adapter_options:
217
+ loadtime:
218
+ # 0.90 is a conservative starting point — FP8 weights free enough
219
+ # memory that the 3/4 NEXTN verification batch should fit with room
220
+ # to spare. Iterate upward (smoke ``--mem-fraction-static``) once the
221
+ # first boot confirms Blackwell sm_120 + FP8 GEMM kernels are present.
222
+ mem_fraction_static: 0.90
223
+ served_model_name: Qwen/Qwen3.6-27B
224
+ disable_cuda_graph: true
225
+ # triton attention matches the rest of the Qwen3.x family. Blackwell
226
+ # (sm_120) kernel coverage for triton + FP8 + NEXTN is the first thing
227
+ # the boot smoke verifies; switch to flashinfer here if triton lacks
228
+ # sm_120 coverage in the pinned SGLang build.
229
+ attention_backend: triton
230
+ grammar_backend: outlines
231
+ reasoning_parser: qwen3
232
+ tool_call_parser: qwen3_coder
233
+ # Model-card NEXTN 3/4 — restored here (vs ``default``'s conservative
234
+ # 2/2) because FP8 + 96 GB fits the larger verification batch that
235
+ # OOM'd on H100-80GB.
236
+ speculative:
237
+ enabled: true
238
+ algorithm: nextn
239
+ num_steps: 3
240
+ eagle_topk: 1
241
+ num_draft_tokens: 4
242
+ # ``--quantization fp8`` rides the passthrough (see header comment).
243
+ # The mamba-scheduler + overlap pair-flags are the required NEXTN
244
+ # companions on the hybrid Gated-DeltaNet architecture. List is the
245
+ # FULL set (production full-replaces ``extra_launch_args``, not merge).
246
+ extra_launch_args:
247
+ - "--quantization"
248
+ - "fp8"
249
+ - "--mamba-scheduler-strategy"
250
+ - "extra_buffer"
251
+ - "--disable-overlap-schedule"
252
+ runtime:
253
+ first_chunk_timeout_s: 90
254
+ inter_chunk_timeout_s: 15
255
+ overall_timeout_s: 600
256
+ # Qwen3.6-27B empty-response fix baked in: under greedy/low-temp the
257
+ # chat template emits EOS as the FIRST token on a large fraction of
258
+ # prompts (n=50 RTX PRO 6000 smoke: casehold 23/50, gpqa 29/50 came back
259
+ # EMPTY). The floor ``min_new_tokens>=1`` fixes it — validated on the
260
+ # 6000: min_tokens=10 → 0/50 empty on all four tasks, accuracy within
261
+ # Wilson 95% CI of the BF16 baseline. ``min_new_tokens`` is the
262
+ # SGLang-native key; the adapter merges this dict via ``setdefault``,
263
+ # so a request-supplied ``min_tokens`` still wins. NOTE: this only
264
+ # takes effect because ``runtime.default_sampling`` is now wired into
265
+ # the adapter (core/loader.py); before that fix a key here was a
266
+ # silent no-op and chat clients had to pass ``min_tokens`` themselves.
267
+ default_sampling:
268
+ temperature: 0.7
269
+ top_p: 0.8
270
+ presence_penalty: 1.5
271
+ min_new_tokens: 10
272
+ stop_tokens:
273
+ - "<|im_end|>"
274
+ # No-speculative baseline — for SIE-vs-raw-SGLang ablation cells so
275
+ # spec-decoding's contribution can be measured independently. Keeps
276
+ # the same ``extra_launch_args`` as ``default`` / ``h100`` so a config
277
+ # diff between them shows only the ``speculative`` block (the intent
278
+ # of the ablation), matching Qwen3.5-4B's convention.
279
+ no-spec:
280
+ max_batch_tokens: 32768
281
+ compute_precision: bfloat16
282
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
283
+ kv_budget_tokens: 65536
284
+ adapter_options:
285
+ loadtime:
286
+ mem_fraction_static: 0.85
287
+ served_model_name: Qwen/Qwen3.6-27B
288
+ disable_cuda_graph: true
289
+ attention_backend: triton
290
+ grammar_backend: outlines
291
+ reasoning_parser: qwen3
292
+ tool_call_parser: qwen3_coder
293
+ speculative:
294
+ enabled: false
295
+ extra_launch_args:
296
+ - "--mamba-scheduler-strategy"
297
+ - "extra_buffer"
298
+ - "--disable-overlap-schedule"
299
+ runtime:
300
+ first_chunk_timeout_s: 90
301
+ inter_chunk_timeout_s: 15
302
+ overall_timeout_s: 600
303
+ default_sampling:
304
+ temperature: 0.7
305
+ top_p: 0.8
306
+ presence_penalty: 1.5
307
+ stop_tokens:
308
+ - "<|im_end|>"
@@ -2,7 +2,7 @@ sie_id: docling
2
2
  package_backed: true
3
3
  inputs:
4
4
  text: false
5
- image: false
5
+ image: true
6
6
  audio: false
7
7
  video: false
8
8
  document: true
@@ -0,0 +1,24 @@
1
+ sie_id: opendatalab/MinerU2.5-Pro-2604-1.2B
2
+ hf_id: opendatalab/MinerU2.5-Pro-2604-1.2B
3
+ hf_revision: d3f5e08d073c21466bbabe21c71bb1e9c2e595da
4
+ inputs:
5
+ text: false
6
+ image: true
7
+ audio: false
8
+ video: false
9
+ tasks:
10
+ encode: null
11
+ score: null
12
+ extract: {}
13
+ profiles:
14
+ default:
15
+ max_batch_tokens: 16384
16
+ compute_precision: bfloat16
17
+ adapter_path: sie_server.adapters.mineru_vl:MinerUVLAdapter
18
+ adapter_options:
19
+ loadtime:
20
+ default_task: "[default]"
21
+ runtime:
22
+ task: "[default]"
23
+ max_new_tokens: 4096
24
+ num_beams: 1
@@ -3,7 +3,7 @@
3
3
  "info": {
4
4
  "title": "SIE Server",
5
5
  "description": "Search Inference Engine - GPU inference server for search workloads",
6
- "version": "0.4.1"
6
+ "version": "0.4.2"
7
7
  },
8
8
  "paths": {
9
9
  "/": {
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sie-server"
3
- version = "0.4.1"
3
+ version = "0.4.2"
4
4
  description = "Search Inference Engine - GPU inference server for search workloads"
5
5
  requires-python = ">=3.12,<3.13"
6
6
  license = { text = "Apache-2.0" }
@@ -39,7 +39,7 @@ dependencies = [
39
39
  # SigLIP (Marqo/marqo-ecommerce-embeddings-B native open_clip loader)
40
40
  "open-clip-torch>=2.24",
41
41
  # Image processing
42
- "pillow>=11,<12",
42
+ "pillow>=12.2.0",
43
43
  "numpy>=2,<3",
44
44
  "torchvision>=0.18,<1", # Required by some HF models (e.g., nvidia/llama-nemoretriever)
45
45
  # Config
@@ -50,8 +50,6 @@ dependencies = [
50
50
  "packaging>=24,<25",
51
51
  # Hot-reload
52
52
  "watchdog>=6,<7",
53
- # NATS pub/sub for config notifications
54
- "nats-py>=2.9,<3",
55
53
  # Observability
56
54
  "opentelemetry-api>=1.28,<2",
57
55
  "opentelemetry-sdk>=1.28,<2",
@@ -66,6 +64,11 @@ dependencies = [
66
64
  "msgspec>=0.20.0",
67
65
  # Async HTTP client (telemetry sender)
68
66
  "httpx>=0.28.1",
67
+ # BLAKE3 used to cross-check the worker-sidecar's `PreparedTokens`
68
+ # tokenizer_id. Tiny (<200KB, pure-Rust via PyO3), mandatory for
69
+ # the encode / score fast-path consumer — see
70
+ # `sie_server.core.preprocessor.text.TextPreprocessor`.
71
+ "blake3>=0.4,<1",
69
72
  ]
70
73
 
71
74
  [project.optional-dependencies]