sie-server 0.3.3__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (424)
  1. {sie_server-0.3.3 → sie_server-0.4.0}/Dockerfile.cpu +32 -5
  2. {sie_server-0.3.3 → sie_server-0.4.0}/Dockerfile.cuda12 +40 -5
  3. {sie_server-0.3.3 → sie_server-0.4.0}/PKG-INFO +2 -1
  4. {sie_server-0.3.3 → sie_server-0.4.0}/README.md +6 -0
  5. sie_server-0.4.0/bundles/sglang-embedding.yaml +18 -0
  6. sie_server-0.4.0/bundles/sglang.yaml +66 -0
  7. {sie_server-0.3.3 → sie_server-0.4.0}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +1 -1
  8. {sie_server-0.3.3 → sie_server-0.4.0}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +1 -1
  9. sie_server-0.4.0/models/Qwen__Qwen3-0.6B.yaml +119 -0
  10. sie_server-0.4.0/models/Qwen__Qwen3-4B-Instruct-2507.yaml +152 -0
  11. {sie_server-0.3.3 → sie_server-0.4.0}/models/Qwen__Qwen3-Embedding-4B.yaml +1 -1
  12. sie_server-0.4.0/models/Qwen__Qwen3.5-4B.yaml +261 -0
  13. {sie_server-0.3.3 → sie_server-0.4.0}/models/Salesforce__SFR-Embedding-2_R.yaml +1 -1
  14. {sie_server-0.3.3 → sie_server-0.4.0}/models/Salesforce__SFR-Embedding-Mistral.yaml +1 -1
  15. {sie_server-0.3.3 → sie_server-0.4.0}/models/intfloat__e5-mistral-7b-instruct.yaml +1 -1
  16. {sie_server-0.3.3 → sie_server-0.4.0}/openapi.json +22 -2
  17. {sie_server-0.3.3 → sie_server-0.4.0}/pyproject.toml +9 -2
  18. sie_server-0.4.0/src/sie_server/adapters/_generation_base.py +295 -0
  19. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/_spec.py +1 -1
  20. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/base.py +1 -1
  21. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3_flag/__init__.py +3 -0
  22. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/clip/__init__.py +2 -1
  23. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/colpali/__init__.py +2 -1
  24. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/colqwen2/__init__.py +2 -1
  25. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/colqwen3/__init__.py +2 -1
  26. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/donut/__init__.py +2 -1
  27. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/florence2/__init__.py +2 -1
  28. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/glm_ocr/__init__.py +2 -1
  29. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/grounding_dino/__init__.py +2 -2
  30. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/lighton_ocr/__init__.py +2 -1
  31. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/nemo_colembed/__init__.py +2 -1
  32. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/owlv2/__init__.py +2 -1
  33. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/paddleocr_vl/__init__.py +2 -1
  34. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/pytorch_embedding/__init__.py +16 -5
  35. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +3 -2
  36. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +2 -1
  37. sie_server-0.4.0/src/sie_server/adapters/sglang/_server.py +210 -0
  38. sie_server-0.3.3/src/sie_server/adapters/sglang/__init__.py → sie_server-0.4.0/src/sie_server/adapters/sglang/embedding.py +48 -154
  39. sie_server-0.4.0/src/sie_server/adapters/sglang/generation.py +1430 -0
  40. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/siglip/__init__.py +2 -1
  41. sie_server-0.4.0/src/sie_server/api/generate.py +540 -0
  42. sie_server-0.4.0/src/sie_server/api/health.py +79 -0
  43. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/ws.py +54 -5
  44. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/app/app_factory.py +93 -1
  45. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/cli.py +8 -1
  46. sie_server-0.4.0/src/sie_server/config/model.py +633 -0
  47. sie_server-0.4.0/src/sie_server/core/extract_cost.py +101 -0
  48. sie_server-0.4.0/src/sie_server/core/gpu_health.py +164 -0
  49. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/loader.py +30 -0
  50. sie_server-0.4.0/src/sie_server/core/pool_isolation.py +197 -0
  51. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/preprocessor/image.py +3 -2
  52. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/preprocessor/vision.py +11 -14
  53. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/registry.py +87 -0
  54. sie_server-0.4.0/src/sie_server/core/text_tokens.py +34 -0
  55. sie_server-0.4.0/src/sie_server/health/nats_publisher.py +148 -0
  56. sie_server-0.4.0/src/sie_server/health/saturation.py +87 -0
  57. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/nats_pull_loop.py +969 -60
  58. sie_server-0.4.0/src/sie_server/observability/metrics.py +773 -0
  59. sie_server-0.4.0/src/sie_server/processors/admission.py +78 -0
  60. sie_server-0.4.0/src/sie_server/processors/base.py +22 -0
  61. sie_server-0.4.0/src/sie_server/processors/grammar_cache.py +96 -0
  62. sie_server-0.4.0/src/sie_server/processors/grammar_compile.py +237 -0
  63. sie_server-0.4.0/src/sie_server/processors/streaming.py +3263 -0
  64. sie_server-0.4.0/src/sie_server/processors/tool_call_grammar.py +191 -0
  65. sie_server-0.4.0/src/sie_server/processors/tool_call_parser.py +706 -0
  66. sie_server-0.4.0/src/sie_server/processors/work_class_scheduler.py +281 -0
  67. sie_server-0.4.0/src/sie_server/types/grammar.py +130 -0
  68. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/inputs.py +68 -5
  69. sie_server-0.4.0/tests/adapters/test_pytorch_embedding_revision.py +77 -0
  70. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_runtime_options.py +3 -4
  71. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_sglang.py +58 -94
  72. sie_server-0.4.0/tests/adapters/test_sglang_generation.py +1081 -0
  73. sie_server-0.4.0/tests/api/test_generate.py +513 -0
  74. sie_server-0.4.0/tests/api/test_health.py +165 -0
  75. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_ws.py +36 -0
  76. {sie_server-0.3.3 → sie_server-0.4.0}/tests/app/test_app_factory.py +1 -0
  77. {sie_server-0.3.3 → sie_server-0.4.0}/tests/config/test_config.py +196 -0
  78. sie_server-0.4.0/tests/config/test_model_prewarm_grammars.py +141 -0
  79. sie_server-0.4.0/tests/config/test_profile_backend_consistency.py +104 -0
  80. sie_server-0.4.0/tests/core/test_gpu_health.py +153 -0
  81. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_loader.py +15 -0
  82. sie_server-0.4.0/tests/core/test_lora_generation_exclusion.py +255 -0
  83. sie_server-0.4.0/tests/core/test_pool_isolation.py +167 -0
  84. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_preprocessor.py +16 -1
  85. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_registry_async.py +52 -0
  86. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_worker_backpressure.py +3 -3
  87. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_worker_extract.py +1 -1
  88. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_worker_score.py +1 -1
  89. sie_server-0.4.0/tests/core/worker/__init__.py +0 -0
  90. sie_server-0.4.0/tests/health/__init__.py +0 -0
  91. sie_server-0.4.0/tests/health/test_nats_publisher.py +86 -0
  92. sie_server-0.4.0/tests/health/test_saturation.py +97 -0
  93. sie_server-0.4.0/tests/health/test_worker_id_consistency.py +100 -0
  94. sie_server-0.4.0/tests/integration/__init__.py +0 -0
  95. sie_server-0.4.0/tests/integration/test_chat_completions.py +205 -0
  96. sie_server-0.4.0/tests/integration/test_grammar_generate.py +231 -0
  97. sie_server-0.4.0/tests/observability/__init__.py +0 -0
  98. sie_server-0.4.0/tests/observability/test_generation_metrics.py +387 -0
  99. {sie_server-0.3.3 → sie_server-0.4.0}/tests/observability/test_metrics.py +4 -2
  100. sie_server-0.4.0/tests/observability/test_trace_propagation.py +250 -0
  101. sie_server-0.4.0/tests/processors/__init__.py +0 -0
  102. sie_server-0.4.0/tests/processors/test_grammar_cache.py +152 -0
  103. sie_server-0.4.0/tests/processors/test_grammar_compile.py +285 -0
  104. sie_server-0.4.0/tests/processors/test_grammar_prewarm.py +437 -0
  105. sie_server-0.4.0/tests/processors/test_streaming.py +2201 -0
  106. sie_server-0.4.0/tests/processors/test_streaming_admission.py +578 -0
  107. sie_server-0.4.0/tests/processors/test_streaming_integration.py +272 -0
  108. sie_server-0.4.0/tests/processors/test_tool_call_grammar.py +134 -0
  109. sie_server-0.4.0/tests/processors/test_tool_call_parser.py +602 -0
  110. sie_server-0.4.0/tests/processors/test_work_class_scheduler.py +148 -0
  111. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_docker_integration.py +5 -0
  112. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_nats_pull_loop.py +321 -1
  113. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_nats_pull_loop_batching.py +175 -0
  114. sie_server-0.4.0/tests/type_defs/__init__.py +0 -0
  115. sie_server-0.4.0/tests/type_defs/test_inputs_json_decode.py +95 -0
  116. sie_server-0.4.0/tests/type_defs/test_media_bytes.py +92 -0
  117. sie_server-0.3.3/Dockerfile.cuda11 +0 -217
  118. sie_server-0.3.3/bundles/sglang.yaml +0 -8
  119. sie_server-0.3.3/src/sie_server/api/health.py +0 -47
  120. sie_server-0.3.3/src/sie_server/config/model.py +0 -302
  121. sie_server-0.3.3/src/sie_server/core/extract_cost.py +0 -29
  122. sie_server-0.3.3/src/sie_server/observability/metrics.py +0 -369
  123. sie_server-0.3.3/tests/api/test_health.py +0 -45
  124. {sie_server-0.3.3 → sie_server-0.4.0}/.gitignore +0 -0
  125. {sie_server-0.3.3 → sie_server-0.4.0}/CONTRIBUTING.md +0 -0
  126. {sie_server-0.3.3 → sie_server-0.4.0}/LICENSE +0 -0
  127. {sie_server-0.3.3 → sie_server-0.4.0}/bundles/default.yaml +0 -0
  128. {sie_server-0.3.3 → sie_server-0.4.0}/bundles/transformers5.yaml +0 -0
  129. {sie_server-0.3.3 → sie_server-0.4.0}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
  130. {sie_server-0.3.3 → sie_server-0.4.0}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
  131. {sie_server-0.3.3 → sie_server-0.4.0}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
  132. {sie_server-0.3.3 → sie_server-0.4.0}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
  133. {sie_server-0.3.3 → sie_server-0.4.0}/models/BAAI__bge-m3.yaml +0 -0
  134. {sie_server-0.3.3 → sie_server-0.4.0}/models/BAAI__bge-reranker-base.yaml +0 -0
  135. {sie_server-0.3.3 → sie_server-0.4.0}/models/BAAI__bge-reranker-large.yaml +0 -0
  136. {sie_server-0.3.3 → sie_server-0.4.0}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
  137. {sie_server-0.3.3 → sie_server-0.4.0}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
  138. {sie_server-0.3.3 → sie_server-0.4.0}/models/GritLM__GritLM-7B.yaml +0 -0
  139. {sie_server-0.3.3 → sie_server-0.4.0}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
  140. {sie_server-0.3.3 → sie_server-0.4.0}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
  141. {sie_server-0.3.3 → sie_server-0.4.0}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
  142. {sie_server-0.3.3 → sie_server-0.4.0}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
  143. {sie_server-0.3.3 → sie_server-0.4.0}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
  144. {sie_server-0.3.3 → sie_server-0.4.0}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
  145. {sie_server-0.3.3 → sie_server-0.4.0}/models/NeuML__gliner-bert-tiny.yaml +0 -0
  146. {sie_server-0.3.3 → sie_server-0.4.0}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
  147. {sie_server-0.3.3 → sie_server-0.4.0}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
  148. {sie_server-0.3.3 → sie_server-0.4.0}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
  149. {sie_server-0.3.3 → sie_server-0.4.0}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
  150. {sie_server-0.3.3 → sie_server-0.4.0}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
  151. {sie_server-0.3.3 → sie_server-0.4.0}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
  152. {sie_server-0.3.3 → sie_server-0.4.0}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
  153. {sie_server-0.3.3 → sie_server-0.4.0}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
  154. {sie_server-0.3.3 → sie_server-0.4.0}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
  155. {sie_server-0.3.3 → sie_server-0.4.0}/models/answerdotai__ModernBERT-base.yaml +0 -0
  156. {sie_server-0.3.3 → sie_server-0.4.0}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
  157. {sie_server-0.3.3 → sie_server-0.4.0}/models/colbert-ir__colbertv2.0.yaml +0 -0
  158. {sie_server-0.3.3 → sie_server-0.4.0}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
  159. {sie_server-0.3.3 → sie_server-0.4.0}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
  160. {sie_server-0.3.3 → sie_server-0.4.0}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
  161. {sie_server-0.3.3 → sie_server-0.4.0}/models/docling.yaml +0 -0
  162. {sie_server-0.3.3 → sie_server-0.4.0}/models/fastino__gliner2-base-v1.yaml +0 -0
  163. {sie_server-0.3.3 → sie_server-0.4.0}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
  164. {sie_server-0.3.3 → sie_server-0.4.0}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
  165. {sie_server-0.3.3 → sie_server-0.4.0}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
  166. {sie_server-0.3.3 → sie_server-0.4.0}/models/google__embeddinggemma-300m.yaml +0 -0
  167. {sie_server-0.3.3 → sie_server-0.4.0}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
  168. {sie_server-0.3.3 → sie_server-0.4.0}/models/google__siglip-so400m-patch14-224.yaml +0 -0
  169. {sie_server-0.3.3 → sie_server-0.4.0}/models/google__siglip-so400m-patch14-384.yaml +0 -0
  170. {sie_server-0.3.3 → sie_server-0.4.0}/models/google__siglip2-base-patch16-224.yaml +0 -0
  171. {sie_server-0.3.3 → sie_server-0.4.0}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
  172. {sie_server-0.3.3 → sie_server-0.4.0}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
  173. {sie_server-0.3.3 → sie_server-0.4.0}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
  174. {sie_server-0.3.3 → sie_server-0.4.0}/models/intfloat__e5-base-v2.yaml +0 -0
  175. {sie_server-0.3.3 → sie_server-0.4.0}/models/intfloat__e5-large-v2.yaml +0 -0
  176. {sie_server-0.3.3 → sie_server-0.4.0}/models/intfloat__e5-small-v2.yaml +0 -0
  177. {sie_server-0.3.3 → sie_server-0.4.0}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
  178. {sie_server-0.3.3 → sie_server-0.4.0}/models/intfloat__multilingual-e5-large.yaml +0 -0
  179. {sie_server-0.3.3 → sie_server-0.4.0}/models/jackboyla__glirel-large-v0.yaml +0 -0
  180. {sie_server-0.3.3 → sie_server-0.4.0}/models/jinaai__jina-colbert-v2.yaml +0 -0
  181. {sie_server-0.3.3 → sie_server-0.4.0}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
  182. {sie_server-0.3.3 → sie_server-0.4.0}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
  183. {sie_server-0.3.3 → sie_server-0.4.0}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
  184. {sie_server-0.3.3 → sie_server-0.4.0}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
  185. {sie_server-0.3.3 → sie_server-0.4.0}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
  186. {sie_server-0.3.3 → sie_server-0.4.0}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
  187. {sie_server-0.3.3 → sie_server-0.4.0}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
  188. {sie_server-0.3.3 → sie_server-0.4.0}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
  189. {sie_server-0.3.3 → sie_server-0.4.0}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
  190. {sie_server-0.3.3 → sie_server-0.4.0}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
  191. {sie_server-0.3.3 → sie_server-0.4.0}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
  192. {sie_server-0.3.3 → sie_server-0.4.0}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
  193. {sie_server-0.3.3 → sie_server-0.4.0}/models/microsoft__Florence-2-base-ft.yaml +0 -0
  194. {sie_server-0.3.3 → sie_server-0.4.0}/models/microsoft__Florence-2-base.yaml +0 -0
  195. {sie_server-0.3.3 → sie_server-0.4.0}/models/microsoft__Florence-2-large.yaml +0 -0
  196. {sie_server-0.3.3 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
  197. {sie_server-0.3.3 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
  198. {sie_server-0.3.3 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
  199. {sie_server-0.3.3 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
  200. {sie_server-0.3.3 → sie_server-0.4.0}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
  201. {sie_server-0.3.3 → sie_server-0.4.0}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
  202. {sie_server-0.3.3 → sie_server-0.4.0}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
  203. {sie_server-0.3.3 → sie_server-0.4.0}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
  204. {sie_server-0.3.3 → sie_server-0.4.0}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
  205. {sie_server-0.3.3 → sie_server-0.4.0}/models/naver__splade-v3.yaml +0 -0
  206. {sie_server-0.3.3 → sie_server-0.4.0}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
  207. {sie_server-0.3.3 → sie_server-0.4.0}/models/numind__NuNER_Zero-span.yaml +0 -0
  208. {sie_server-0.3.3 → sie_server-0.4.0}/models/numind__NuNER_Zero.yaml +0 -0
  209. {sie_server-0.3.3 → sie_server-0.4.0}/models/nvidia__NV-Embed-v2.yaml +0 -0
  210. {sie_server-0.3.3 → sie_server-0.4.0}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
  211. {sie_server-0.3.3 → sie_server-0.4.0}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
  212. {sie_server-0.3.3 → sie_server-0.4.0}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
  213. {sie_server-0.3.3 → sie_server-0.4.0}/models/openai__clip-vit-base-patch32.yaml +0 -0
  214. {sie_server-0.3.3 → sie_server-0.4.0}/models/openai__clip-vit-large-patch14.yaml +0 -0
  215. {sie_server-0.3.3 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
  216. {sie_server-0.3.3 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
  217. {sie_server-0.3.3 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
  218. {sie_server-0.3.3 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
  219. {sie_server-0.3.3 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
  220. {sie_server-0.3.3 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
  221. {sie_server-0.3.3 → sie_server-0.4.0}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
  222. {sie_server-0.3.3 → sie_server-0.4.0}/models/rasyosef__splade-mini.yaml +0 -0
  223. {sie_server-0.3.3 → sie_server-0.4.0}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
  224. {sie_server-0.3.3 → sie_server-0.4.0}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
  225. {sie_server-0.3.3 → sie_server-0.4.0}/models/tomoroai__tomoro-colqwen3-embed-4b.yaml +0 -0
  226. {sie_server-0.3.3 → sie_server-0.4.0}/models/urchade__gliner_large-v2.1.yaml +0 -0
  227. {sie_server-0.3.3 → sie_server-0.4.0}/models/urchade__gliner_medium-v2.1.yaml +0 -0
  228. {sie_server-0.3.3 → sie_server-0.4.0}/models/urchade__gliner_multi-v2.1.yaml +0 -0
  229. {sie_server-0.3.3 → sie_server-0.4.0}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
  230. {sie_server-0.3.3 → sie_server-0.4.0}/models/urchade__gliner_small-v2.1.yaml +0 -0
  231. {sie_server-0.3.3 → sie_server-0.4.0}/models/vidore__colpali-v1.3-hf.yaml +0 -0
  232. {sie_server-0.3.3 → sie_server-0.4.0}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
  233. {sie_server-0.3.3 → sie_server-0.4.0}/models/zai-org__GLM-OCR.yaml +0 -0
  234. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/__init__.py +0 -0
  235. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/__init__.py +0 -0
  236. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/_base_adapter.py +0 -0
  237. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/_flash_base.py +0 -0
  238. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/_types.py +0 -0
  239. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/_utils.py +0 -0
  240. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
  241. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -0
  242. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
  243. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
  244. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
  245. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/colbert/__init__.py +0 -0
  246. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
  247. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
  248. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
  249. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/docling/__init__.py +0 -0
  250. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/errors.py +0 -0
  251. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/gliclass/__init__.py +0 -0
  252. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/gliner/__init__.py +0 -0
  253. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/gliner2/__init__.py +0 -0
  254. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
  255. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/glirel/__init__.py +0 -0
  256. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
  257. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
  258. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
  259. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
  260. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
  261. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
  262. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
  263. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/peft_lora_mixin.py +0 -0
  264. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
  265. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
  266. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
  267. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
  268. {sie_server-0.3.3/src/sie_server/app → sie_server-0.4.0/src/sie_server/adapters/sglang}/__init__.py +0 -0
  269. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
  270. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
  271. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
  272. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/__init__.py +0 -0
  273. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/encode.py +0 -0
  274. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/extract.py +0 -0
  275. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/helpers.py +0 -0
  276. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/metrics.py +0 -0
  277. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/models.py +0 -0
  278. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/openai_compat.py +0 -0
  279. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/openapi.py +0 -0
  280. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/options.py +0 -0
  281. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/root.py +0 -0
  282. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/score.py +0 -0
  283. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/serialization.py +0 -0
  284. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/api/validation.py +0 -0
  285. {sie_server-0.3.3/src/sie_server/config → sie_server-0.4.0/src/sie_server/app}/__init__.py +0 -0
  286. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/app/app_state_config.py +0 -0
  287. {sie_server-0.3.3/tests/adapters → sie_server-0.4.0/src/sie_server/config}/__init__.py +0 -0
  288. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/config/engine.py +0 -0
  289. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/__init__.py +0 -0
  290. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/adaptive_batching.py +0 -0
  291. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/batcher.py +0 -0
  292. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/deps.py +0 -0
  293. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/disk_cache.py +0 -0
  294. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/encode_pipeline.py +0 -0
  295. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/hf_env.py +0 -0
  296. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/hot_reload.py +0 -0
  297. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/inference.py +0 -0
  298. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/inference_output.py +0 -0
  299. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/load_errors.py +0 -0
  300. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/logging.py +0 -0
  301. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/memory.py +0 -0
  302. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/model_loader.py +0 -0
  303. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/oom.py +0 -0
  304. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/postprocessor.py +0 -0
  305. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/postprocessor_registry.py +0 -0
  306. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/prepared.py +0 -0
  307. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/preprocessor/__init__.py +0 -0
  308. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/preprocessor/base.py +0 -0
  309. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/preprocessor/text.py +0 -0
  310. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/preprocessor_registry.py +0 -0
  311. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/readiness.py +0 -0
  312. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/shutdown.py +0 -0
  313. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/timing.py +0 -0
  314. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/tokenizer.py +0 -0
  315. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/watcher.py +0 -0
  316. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/__init__.py +0 -0
  317. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/__init__.py +0 -0
  318. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/base.py +0 -0
  319. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/encode.py +0 -0
  320. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/extract.py +0 -0
  321. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/score.py +0 -0
  322. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/model_worker.py +0 -0
  323. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/oom_recovery.py +0 -0
  324. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/core/worker/types.py +0 -0
  325. {sie_server-0.3.3/tests/api → sie_server-0.4.0/src/sie_server/health}/__init__.py +0 -0
  326. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/main.py +0 -0
  327. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/nats_subscriber.py +0 -0
  328. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/observability/__init__.py +0 -0
  329. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/observability/gpu.py +0 -0
  330. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/observability/prometheus.py +0 -0
  331. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/observability/telemetry.py +0 -0
  332. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/observability/tracing.py +0 -0
  333. {sie_server-0.3.3/tests/app → sie_server-0.4.0/src/sie_server/processors}/__init__.py +0 -0
  334. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/static/__init__.py +0 -0
  335. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/static/index.html +0 -0
  336. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/__init__.py +0 -0
  337. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/openapi.py +0 -0
  338. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/outputs.py +0 -0
  339. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/overflow_policy.py +0 -0
  340. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/requests.py +0 -0
  341. {sie_server-0.3.3 → sie_server-0.4.0}/src/sie_server/types/responses.py +0 -0
  342. {sie_server-0.3.3/tests/config → sie_server-0.4.0/tests/adapters}/__init__.py +0 -0
  343. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_base.py +0 -0
  344. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_bge_m3.py +0 -0
  345. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_bge_m3_flash.py +0 -0
  346. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_clip.py +0 -0
  347. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_colbert.py +0 -0
  348. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_docling.py +0 -0
  349. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_docling_smoke.py +0 -0
  350. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_donut.py +0 -0
  351. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_factory_integration.py +0 -0
  352. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_flash_base.py +0 -0
  353. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_florence2.py +0 -0
  354. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
  355. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_glirel.py +0 -0
  356. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_glm_ocr.py +0 -0
  357. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_grounding_dino.py +0 -0
  358. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_gte_sparse.py +0 -0
  359. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
  360. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_lighton_ocr.py +0 -0
  361. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_lora.py +0 -0
  362. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_lora_integration.py +0 -0
  363. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_paddleocr_vl.py +0 -0
  364. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_sentence_transformer.py +0 -0
  365. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_siglip.py +0 -0
  366. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_sparse_aggregation.py +0 -0
  367. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_stablebridge_integration.py +0 -0
  368. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_stablebridge_pruner.py +0 -0
  369. {sie_server-0.3.3 → sie_server-0.4.0}/tests/adapters/test_visual_document.py +0 -0
  370. {sie_server-0.3.3/tests/core → sie_server-0.4.0/tests/api}/__init__.py +0 -0
  371. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_encode_dtype.py +0 -0
  372. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_encode_endpoint.py +0 -0
  373. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_encode_json_schema.py +0 -0
  374. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_encode_timing.py +0 -0
  375. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_encode_validation.py +0 -0
  376. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_extract.py +0 -0
  377. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_extract_integration.py +0 -0
  378. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_extract_oom.py +0 -0
  379. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_models.py +0 -0
  380. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_openai_compat.py +0 -0
  381. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_score.py +0 -0
  382. {sie_server-0.3.3 → sie_server-0.4.0}/tests/api/test_version_header.py +0 -0
  383. {sie_server-0.3.3/tests/core/worker → sie_server-0.4.0/tests/app}/__init__.py +0 -0
  384. {sie_server-0.3.3/tests/observability → sie_server-0.4.0/tests/config}/__init__.py +0 -0
  385. {sie_server-0.3.3 → sie_server-0.4.0}/tests/config/test_bundle_coverage.py +0 -0
  386. {sie_server-0.3.3 → sie_server-0.4.0}/tests/conftest.py +0 -0
  387. {sie_server-0.3.3/tests/type_defs → sie_server-0.4.0/tests/core}/__init__.py +0 -0
  388. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_adaptive_batching.py +0 -0
  389. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_batcher.py +0 -0
  390. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_disk_cache.py +0 -0
  391. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_hot_reload.py +0 -0
  392. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_idle_evict.py +0 -0
  393. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_inference.py +0 -0
  394. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_logging.py +0 -0
  395. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_memory.py +0 -0
  396. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_model_load_timeout.py +0 -0
  397. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_oom_detection.py +0 -0
  398. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_postprocessor.py +0 -0
  399. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_postprocessor_registry.py +0 -0
  400. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_prepared.py +0 -0
  401. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_preprocessor_registry.py +0 -0
  402. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_quantization.py +0 -0
  403. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_readiness.py +0 -0
  404. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_registry_core.py +0 -0
  405. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_registry_deps.py +0 -0
  406. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_registry_failed_state.py +0 -0
  407. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_registry_memory.py +0 -0
  408. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_registry_multi_model.py +0 -0
  409. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_shutdown.py +0 -0
  410. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_timing.py +0 -0
  411. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_watcher.py +0 -0
  412. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_worker_core.py +0 -0
  413. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_worker_lora.py +0 -0
  414. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/test_worker_options.py +0 -0
  415. {sie_server-0.3.3 → sie_server-0.4.0}/tests/core/worker/test_oom_recovery.py +0 -0
  416. {sie_server-0.3.3 → sie_server-0.4.0}/tests/observability/test_telemetry.py +0 -0
  417. {sie_server-0.3.3 → sie_server-0.4.0}/tests/observability/test_tracing.py +0 -0
  418. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_all_models.py +0 -0
  419. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_openapi_export.py +0 -0
  420. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_sdk_integration.py +0 -0
  421. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_server_smoke.py +0 -0
  422. {sie_server-0.3.3 → sie_server-0.4.0}/tests/test_sparse_integration.py +0 -0
  423. {sie_server-0.3.3 → sie_server-0.4.0}/tests/type_defs/test_inputs.py +0 -0
  424. {sie_server-0.3.3 → sie_server-0.4.0}/tests/type_defs/test_types.py +0 -0
@@ -6,6 +6,7 @@
6
6
  # docker buildx build --platform linux/amd64,linux/arm64 -f packages/sie_server/Dockerfile.cpu -t sie-server:cpu .
7
7
 
8
8
  ARG BUNDLE=default
9
+ ARG SIE_DEPS_IMAGE=
9
10
 
10
11
  # =============================================================================
11
12
  # Stage 1: Dependencies (pyproject.toml only, cached across code changes)
@@ -63,6 +64,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \
63
64
  # platform in local BuildKit cache and in content-addressed registry layers.
64
65
  FROM deps AS base
65
66
 
67
+ # Source-layer cache key — see Dockerfile.cuda12 for the full rationale.
68
+ # Tie source layers to a per-commit revision arg so a rebuild can't ship
69
+ # stale code via a reused source-COPY layer. Dependency layers in the
70
+ # ``deps`` stage above stay cached.
71
+ ARG SIE_SRC_REV=dev
72
+ RUN echo "sie source revision: ${SIE_SRC_REV}"
73
+
66
74
  COPY packages/sie_sdk/src /tmp/sie_sdk/src
67
75
  COPY packages/sie_server/src src/
68
76
  COPY packages/sie_server/bundles bundles/
@@ -100,9 +108,9 @@ RUN set -eux; \
100
108
  find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
101
109
 
102
110
  # =============================================================================
103
- # Stage 3: Builder - bundle-specific deps
111
+ # Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
104
112
  # =============================================================================
105
- FROM base AS builder
113
+ FROM base AS bundle_deps
106
114
 
107
115
  ARG BUNDLE
108
116
 
@@ -142,6 +150,11 @@ RUN set -eux; \
142
150
  fi; \
143
151
  find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
144
152
 
153
+ # =============================================================================
154
+ # Stage 3b: Builder - optional trampoline to a prebuilt base image
155
+ # =============================================================================
156
+ FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
157
+
145
158
  # =============================================================================
146
159
  # Stage 4: Runtime
147
160
  # =============================================================================
@@ -158,13 +171,25 @@ ENV DEBIAN_FRONTEND=noninteractive \
158
171
  # Only the shared libs torch + pillow + rtree actually dlopen at runtime.
159
172
  # libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
160
173
  # rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
174
+ # libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
175
+ # TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the opencv-python
176
+ # wheel unconditionally dlopens an X11 + libGL + glib chain at import even in
177
+ # headless usage. Without these, every docling extract crashes with
178
+ # "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
161
179
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
162
180
  --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
163
181
  apt-get update && apt-get install -y --no-install-recommends \
182
+ libgl1 \
183
+ libglib2.0-0 \
164
184
  libgomp1 \
185
+ libice6 \
165
186
  libjpeg62-turbo \
166
187
  libpng16-16 \
167
- libspatialindex-c6
188
+ libsm6 \
189
+ libspatialindex-c6 \
190
+ libx11-6 \
191
+ libxcb1 \
192
+ libxext6
168
193
 
169
194
  RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
170
195
 
@@ -176,8 +201,10 @@ WORKDIR /app
176
201
  # no /etc/passwd visible (the sie user exists in the runtime FS but --link
177
202
  # layers are created in isolation).
178
203
  COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
179
- COPY --link --from=base --chown=1000:1000 /app/src /app/src
180
- COPY --link --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
204
+ # Source trees are copied WITHOUT --link; see Dockerfile.cuda12 (linked cross-stage
205
+ # copies didn't reliably invalidate on source change).
206
+ COPY --from=base --chown=1000:1000 /app/src /app/src
207
+ COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
181
208
  COPY --link --from=base --chown=1000:1000 /app/models /app/models
182
209
  COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
183
210
  # Bundle-specific extras — last layer so shared layers above stay cached.
@@ -6,6 +6,7 @@
6
6
 
7
7
  ARG BUNDLE=default
8
8
  ARG UV_VERSION=0.9.28
9
+ ARG SIE_DEPS_IMAGE=
9
10
 
10
11
  # =============================================================================
11
12
  # Stage 1: uv + standalone Python 3.12 (no deadsnakes PPA)
@@ -66,6 +67,18 @@ RUN --mount=type=cache,target=/root/.cache/pip \
66
67
  # content-addressed registry layers.
67
68
  FROM deps AS base
68
69
 
70
+ # Source-layer cache key. A rebuild once shipped STALE code: buildx
71
+ # reused a cached source-COPY layer even though the .py files had
72
+ # changed, so the demo had to overlay patched files by hand. Tie the
73
+ # source layers to an explicit revision arg the CI passes per commit
74
+ # (``--build-arg SIE_SRC_REV=$(git rev-parse --short HEAD)``) so any
75
+ # commit forces these layers — and the editable reinstall below — to
76
+ # rebuild. The expensive dependency install lives in the ``deps`` stage
77
+ # ABOVE this line, so it stays cached. Bundles of the same commit share
78
+ # the same SIE_SRC_REV, so cross-bundle layer dedup is preserved.
79
+ ARG SIE_SRC_REV=dev
80
+ RUN echo "sie source revision: ${SIE_SRC_REV}"
81
+
69
82
  COPY packages/sie_sdk/src /tmp/sie_sdk/src
70
83
  COPY packages/sie_server/src src/
71
84
  COPY packages/sie_server/bundles bundles/
@@ -114,9 +127,9 @@ RUN set -eux; \
114
127
  find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
115
128
 
116
129
  # =============================================================================
117
- # Stage 3: Builder - bundle-specific deps
130
+ # Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
118
131
  # =============================================================================
119
- FROM base AS builder
132
+ FROM base AS bundle_deps
120
133
 
121
134
  ARG BUNDLE
122
135
 
@@ -160,6 +173,11 @@ RUN set -eux; \
160
173
  # Normalize mtimes so rebuilds of the same bundle produce identical layer bytes.
161
174
  find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
162
175
 
176
+ # =============================================================================
177
+ # Stage 3b: Builder - optional trampoline to a prebuilt base image
178
+ # =============================================================================
179
+ FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
180
+
163
181
  # =============================================================================
164
182
  # Stage 4: Runtime
165
183
  # =============================================================================
@@ -175,15 +193,27 @@ ENV DEBIAN_FRONTEND=noninteractive
175
193
  # libgomp1: torch OpenMP runtime.
176
194
  # libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
177
195
  # rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
196
+ # libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
197
+ # TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the
198
+ # opencv-python wheel unconditionally dlopens an X11 + libGL + glib chain at
199
+ # import even in headless usage. Without these, every docling extract crashes
200
+ # with "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
178
201
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
179
202
  --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
180
203
  apt-get update && apt-get install -y --no-install-recommends \
181
204
  ca-certificates \
182
205
  gcc \
183
206
  libc6-dev \
207
+ libgl1 \
208
+ libglib2.0-0 \
184
209
  libgomp1 \
210
+ libice6 \
185
211
  libnuma1 \
186
- libspatialindex-c6
212
+ libsm6 \
213
+ libspatialindex-c6 \
214
+ libx11-6 \
215
+ libxcb1 \
216
+ libxext6
187
217
 
188
218
  RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
189
219
 
@@ -211,8 +241,13 @@ RUN set -e; \
211
241
  # (the sie user is added in the runtime stage filesystem but --link layers
212
242
  # are created in isolation from the destination stage state).
213
243
  COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
214
- COPY --link --from=base --chown=1000:1000 /app/src /app/src
215
- COPY --link --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
244
+ # Source trees are copied WITHOUT --link. ``COPY --link --from=<stage>``
245
+ # layers are cached on a digest that, in the buildx versions this image
246
+ # was built with, did not reliably invalidate when the upstream source
247
+ # changed — the stale-code bug above. These trees are small, so dropping
248
+ # --link costs negligible dedup while guaranteeing edited code ships.
249
+ COPY --from=base --chown=1000:1000 /app/src /app/src
250
+ COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
216
251
  COPY --link --from=base --chown=1000:1000 /app/models /app/models
217
252
  COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
218
253
  # Bundle-specific extras — last layer so shared layers above stay cached.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sie-server
3
- Version: 0.3.3
3
+ Version: 0.4.0
4
4
  Summary: Search Inference Engine - GPU inference server for search workloads
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
@@ -20,6 +20,7 @@ Requires-Dist: msgspec>=0.20.0
20
20
  Requires-Dist: nats-py<3,>=2.9
21
21
  Requires-Dist: numpy<3,>=2
22
22
  Requires-Dist: open-clip-torch>=2.24
23
+ Requires-Dist: opencv-python-headless<5,>=4
23
24
  Requires-Dist: opentelemetry-api<2,>=1.28
24
25
  Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
25
26
  Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
@@ -62,6 +62,12 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
62
62
  | `SIE_DEFAULT_COMPUTE_PRECISION` | `float16` | One of `float16`, `bfloat16`, `float32`. |
63
63
  | `SIE_ATTENTION_BACKEND` | `auto` | One of `auto`, `flash_attention_2`, `sdpa`, `eager`. |
64
64
 
65
+ ### Diagnostics
66
+
67
+ | Env var | Default | Effect |
68
+ |--|--|--|
69
+ | `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
70
+
65
71
  For nested settings (any field with `__`), the env-var format is
66
72
  `SIE_<TOP>__<NESTED>=value`. The complete schema is in
67
73
  `packages/sie_server/src/sie_server/config/engine.py`.
@@ -0,0 +1,18 @@
1
+ name: sglang-embedding
2
+ priority: 21
3
+ adapters:
4
+ - sie_server.adapters.sglang.embedding
5
+ deps:
6
+ # Lockstep with ``bundles/sglang.yaml``. The dependency stack is
7
+ # identical (sglang's grammar backends are unavoidably pulled in even
8
+ # on embedding-only deployments), and the bundle split exists only for
9
+ # worker pool isolation. The previous revision pinned ``outlines-core``
10
+ # (a transitive of ``outlines``) instead of the actual top-level deps
11
+ # the generation bundle pins, which was both wrong and produced silent
12
+ # pip resolution drift on environments that already had a different
13
+ # ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
14
+ # future drift fails fast.
15
+ sglang: '==0.5.10'
16
+ xgrammar: '==0.1.32'
17
+ outlines: '==0.1.11'
18
+ llguidance: '>=0.7.11,<0.8.0'
@@ -0,0 +1,66 @@
1
+ name: sglang
2
+ priority: 20
3
+ adapters:
4
+ - sie_server.adapters.sglang.generation
5
+ deps:
6
+ # SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
7
+ # See: https://github.com/sgl-project/sglang/issues/4869
8
+ #
9
+ # Qwen3.5-4B compatibility — M4 req2 Proj 5:
10
+ #
11
+ # ``sglang==0.5.10`` is the canonical target for Qwen3.5-4B on the
12
+ # current L4 / A100-40GB / H100 fleet. Audited against
13
+ # ``python/pyproject.toml@v0.5.10`` upstream (see
14
+ # ``product/plans/qwen35-sglang-mtp-structured-outputs-findings.md``):
15
+ #
16
+ # * ships the ``qwen3_5`` model class (``models/qwen3_5.py``, 1724 LOC)
17
+ # * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
18
+ # * ``sglang-kernel==0.4.1`` wheel covers SM_80 / SM_89 / SM_90 / SM_100
19
+ # via gencode (``CMakeLists.txt``: ``ENABLE_BELOW_SM90=ON`` default).
20
+ # The runtime loader (``sgl_kernel/load_utils.py``) maps
21
+ # compute_capability != 90 → ``sm100/`` subdir, which holds the
22
+ # SM_80 / SM_89 / SM_100 build (precise math). H100 (CC=90) gets
23
+ # the ``sm90/`` fast-math build.
24
+ # * torch==2.9.1 (CUDA 12.8/12.9 wheels); ``cuda-python==12.9``.
25
+ # **Not** CUDA 13 — that's an SGLang-main-only path which only
26
+ # became relevant when looking at the dev branch.
27
+ #
28
+ # Compat note: 0.5.12 wheel observed shipping only ``sm100/`` (no SM_80
29
+ # cubin entry inside) — out of scope; we stay on 0.5.10 until upstream
30
+ # ships multi-arch binaries again.
31
+ sglang: '==0.5.10'
32
+ #
33
+ # Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
34
+ # internally to prevent silent pip resolution drift.
35
+ #
36
+ # XGrammar is SGLang 0.5.10's other supported grammar backend; SGLang pins
37
+ # ``xgrammar==0.1.32`` exactly. That release also brought the
38
+ # structured-output VRAM-leak fix (PR #20697) and grammar-error
39
+ # propagation (PR #20467). Kept available as the fallback backend.
40
+ xgrammar: '==0.1.32'
41
+ #
42
+ # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10. We
43
+ # declare it explicitly at bundle level so the surface is visible.
44
+ # ``outlines-core`` (a separate package) is a transitive of outlines
45
+ # and intentionally NOT pinned here — pinning ``outlines-core`` directly
46
+ # was a bug in the previous bundle revision (it does not give us any
47
+ # grammar functionality on its own).
48
+ #
49
+ # Outlines IS the active grammar backend for Qwen3.5 (partner
50
+ # requirement). Earlier revisions said "do not switch Qwen3.5 to
51
+ # outlines" because the worker-side ``compile_outlines`` preflight
52
+ # crashed with ``'TokenizersBackend' object has no attribute
53
+ # 'vocabulary'``: it passed the raw transformers==5.3.0 tokenizer
54
+ # (now a ``TokenizersBackend``) to Outlines' processor factories, which
55
+ # require an Outlines ``Tokenizer`` adapter exposing ``.vocabulary``.
56
+ # ``compile_outlines`` now wraps the tokenizer in Outlines'
57
+ # ``TransformerTokenizer`` first (the same wrap SGLang's
58
+ # ``OutlinesGrammarBackend`` does internally), so the mismatch is gone
59
+ # for json_schema/regex; ebnf is forwarded straight to SGLang. See
60
+ # ``processors/grammar_compile.py`` ("Tokenizer adapter").
61
+ outlines: '==0.1.11'
62
+ #
63
+ # llguidance is the third grammar backend (regex / json_schema / ebnf).
64
+ # Pinned to SGLang 0.5.10's compatible range. Kept available as the
65
+ # fallback if a future model regresses on xgrammar.
66
+ llguidance: '>=0.7.11,<0.8.0'
@@ -18,7 +18,7 @@ profiles:
18
18
  default:
19
19
  max_batch_tokens: 16384
20
20
  compute_precision: bfloat16
21
- adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
21
+ adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
22
22
  adapter_options:
23
23
  loadtime:
24
24
  mem_fraction_static: 0.85
@@ -18,7 +18,7 @@ profiles:
18
18
  default:
19
19
  max_batch_tokens: 8192
20
20
  compute_precision: bfloat16
21
- adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
21
+ adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
22
22
  adapter_options:
23
23
  loadtime:
24
24
  mem_fraction_static: 0.85
@@ -0,0 +1,119 @@
1
+ sie_id: Qwen/Qwen3-0.6B
2
+ hf_id: Qwen/Qwen3-0.6B
3
+ inputs:
4
+ text: true
5
+ image: false
6
+ audio: false
7
+ video: false
8
+ tasks:
9
+ # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
10
+ # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
11
+ # bad — this is a transport benchmark target, not a production model.
12
+ #
13
+ # Context / batch sizes are deliberately small (1024 vs the headroom an
14
+ # L4 could nominally support) so the validation harness can co-locate
15
+ # the worker's SGLang with a second SGLang for the baseline phase on
16
+ # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
17
+ # alone on a card it doesn't have to share.
18
+ #
19
+ # Note on the three 1024s below: `context_length`, `max_sequence_length`,
20
+ # and `max_batch_tokens` are NOT redundant — they're three independent
21
+ # knobs (per-request context, SGLang --context-length, batcher cost
22
+ # budget) that just happen to collide here because the model is tiny.
23
+ # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
24
+ # non-collapsed shape.
25
+ generate:
26
+ context_length: 1024
27
+ max_output_tokens: 1024
28
+ capabilities:
29
+ grammar: []
30
+ streaming: true
31
+ tools: false
32
+ max_sequence_length: 1024
33
+ # KV-cache memory math (Qwen3-0.6B, bf16):
34
+ # layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
35
+ # kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
36
+ # The 0.6B is a transport benchmark target — context_length is held at
37
+ # 1024 deliberately (see header comment) so the validation harness can
38
+ # co-locate two SGLang instances on an L4. KV budgets per profile
39
+ # scale with the deployment scenario rather than the GPU ceiling.
40
+ profiles:
41
+ default:
42
+ max_batch_tokens: 1024
43
+ compute_precision: bfloat16
44
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
45
+ kv_budget_tokens: 8192
46
+ adapter_options:
47
+ loadtime:
48
+ # 0.8 leaves headroom on a 22 GiB L4 for a second SGLang instance
49
+ # (the validation harness co-locates worker + baseline). If the
50
+ # 0.6B is the only model on the card, 0.9 is fine.
51
+ mem_fraction_static: 0.8
52
+ served_model_name: Qwen/Qwen3-0.6B
53
+ # Modal sandbox lacks flashinfer's JIT prerequisites; switch backends.
54
+ disable_cuda_graph: true
55
+ attention_backend: triton
56
+ runtime:
57
+ first_chunk_timeout_s: 30
58
+ inter_chunk_timeout_s: 10
59
+ # Aligned with the rest of the generate model fleet (300s).
60
+ # The previous 132s was an unexplained magic number that
61
+ # diverged from every other generate config; bumping to the
62
+ # fleet default keeps long-completion requests from hitting a
63
+ # premature overall-timeout on the 0.6B model.
64
+ overall_timeout_s: 300
65
+ default_sampling:
66
+ temperature: 0.0
67
+ top_p: 1.0
68
+ # Dedicated 0.6B deployments on a100/h100 don't co-locate a baseline,
69
+ # so mem_fraction_static returns to the standard 0.85 and the KV budget
70
+ # scales with the larger GPU. kv_budget_tokens stays well below the
71
+ # theoretical ceiling because the 0.6B's *context_length* (1024) caps
72
+ # per-request KV consumption — the budget really just sets the upper
73
+ # bound on concurrent in-flight sequences.
74
+ a100-40gb:
75
+ max_batch_tokens: 4096
76
+ compute_precision: bfloat16
77
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
78
+ kv_budget_tokens: 32768
79
+ adapter_options:
80
+ loadtime:
81
+ mem_fraction_static: 0.85
82
+ served_model_name: Qwen/Qwen3-0.6B
83
+ disable_cuda_graph: true
84
+ attention_backend: triton
85
+ runtime:
86
+ first_chunk_timeout_s: 30
87
+ inter_chunk_timeout_s: 10
88
+ # Aligned with the rest of the generate model fleet (300s).
89
+ # The previous 132s was an unexplained magic number that
90
+ # diverged from every other generate config; bumping to the
91
+ # fleet default keeps long-completion requests from hitting a
92
+ # premature overall-timeout on the 0.6B model.
93
+ overall_timeout_s: 300
94
+ default_sampling:
95
+ temperature: 0.0
96
+ top_p: 1.0
97
+ h100:
98
+ max_batch_tokens: 8192
99
+ compute_precision: bfloat16
100
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
101
+ kv_budget_tokens: 65536
102
+ adapter_options:
103
+ loadtime:
104
+ mem_fraction_static: 0.85
105
+ served_model_name: Qwen/Qwen3-0.6B
106
+ disable_cuda_graph: true
107
+ attention_backend: triton
108
+ runtime:
109
+ first_chunk_timeout_s: 30
110
+ inter_chunk_timeout_s: 10
111
+ # Aligned with the rest of the generate model fleet (300s).
112
+ # The previous 132s was an unexplained magic number that
113
+ # diverged from every other generate config; bumping to the
114
+ # fleet default keeps long-completion requests from hitting a
115
+ # premature overall-timeout on the 0.6B model.
116
+ overall_timeout_s: 300
117
+ default_sampling:
118
+ temperature: 0.0
119
+ top_p: 1.0
@@ -0,0 +1,152 @@
1
+ sie_id: Qwen/Qwen3-4B-Instruct-2507
2
+ hf_id: Qwen/Qwen3-4B-Instruct-2507
3
+ inputs:
4
+ text: true
5
+ image: false
6
+ audio: false
7
+ video: false
8
+ tasks:
9
+ # kv_budget_tokens now lives on profiles (below). The KV calibration
10
+ # follow-up publishes the tuned value; the placeholder here is conservative
11
+ # and assumes Qwen3-4B's ~150 KB/token KV footprint on an L4 (24 GB,
12
+ # mem_fraction_static=0.85).
13
+ generate:
14
+ context_length: 32768
15
+ max_output_tokens: 4096
16
+ capabilities:
17
+ # Outlines-backed JSON Schema, regex, and EBNF grammars are
18
+ # all supported by the SGLang adapter (Outlines and XGrammar
19
+ # both accept EBNF natively). The gateway gates requests on
20
+ # this exact list — adding a new ``grammar.kind`` variant
21
+ # requires both the gateway parser and this list to be updated.
22
+ grammar: ["json_schema", "regex", "ebnf"]
23
+ streaming: true
24
+ # Qwen3-4B-Instruct's chat template emits OpenAI-compatible
25
+ # ``<tool_call>{...}</tool_call>`` blocks when ``tools`` is
26
+ # present in the rendered messages; the worker's
27
+ # ``parse_tool_call_stream`` consumes those blocks and surfaces
28
+ # them on ``delta.tool_calls`` for SSE and on
29
+ # ``message.tool_calls`` for non-streaming requests.
30
+ tools: true
31
+ # Forwarded verbatim to ``tokenizer.apply_chat_template(**kwargs)`` when
32
+ # the worker renders an OpenAI-shaped ``messages`` request.
33
+ # Qwen3's chat template emits a ``<think>``/``</think>`` reasoning block
34
+ # unless this flag suppresses it.
35
+ chat_template_kwargs:
36
+ enable_thinking: false
37
+ # Schemas/regexes the worker pre-compiles at model load so the first
38
+ # request hitting them skips the Outlines compile (cold TTFT win).
39
+ # Failures here log + bump ``sie_worker_grammar_prewarm_total{outcome="failed"}``
40
+ # without blocking model load — add entries only for shapes you
41
+ # know are hot.
42
+ prewarm_grammars:
43
+ # Bare pattern, NOT anchored: this model uses the default Outlines
44
+ # grammar backend, and Outlines regexes are implicitly anchored —
45
+ # its FSM engine (interegular) rejects ``^``/``$`` with
46
+ # ``Unsupported``, which crashes SGLang's scheduler. Use ``(yes|no)``.
47
+ - name: yes_no
48
+ kind: regex
49
+ value: "(yes|no)"
50
+ - name: short_answer
51
+ kind: json_schema
52
+ value:
53
+ type: object
54
+ properties:
55
+ answer:
56
+ type: string
57
+ required: [answer]
58
+ max_sequence_length: 32768
59
+ # KV-cache memory math (Qwen3-4B-Instruct-2507, bf16):
60
+ # layers=36, kv_heads=8, head_dim=128, bytes_per_elem=2
61
+ # kv_bytes_per_token = 2 (k+v) × 36 × 8 × 128 × 2 = 147,456 B ≈ 144 KB
62
+ # Theoretical max KV tokens per GPU (assuming ~8 GB weights, mem_fraction_static=0.85):
63
+ # l4 (24 GB): (24 × 0.85 − 8) GB / 144 KB ≈ 90,000 tokens
64
+ # a100-40gb (40 GB): (40 × 0.85 − 8) GB / 144 KB ≈ 189,000 tokens
65
+ # h100 (80 GB): (80 × 0.85 − 8) GB / 144 KB ≈ 437,000 tokens
66
+ # kv_budget_tokens is set to ~40% of theoretical max, matching the L4
67
+ # baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
68
+ # speculative side-cell, grammar/Outlines compile arena, fragmentation.
69
+ # Final empirical validation (concurrency-16 OOM-boundary sweep) is
70
+ # tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
71
+ profiles:
72
+ default:
73
+ # max_batch_tokens is a generic engine knob; generation does not batch
74
+ # at the SIE layer (SGLang batches internally) but the validator
75
+ # requires the field to be set.
76
+ max_batch_tokens: 16384
77
+ compute_precision: bfloat16
78
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
79
+ # L4 baseline — empirically gated by the speculative-decoding + calibration work.
80
+ kv_budget_tokens: 32768
81
+ adapter_options:
82
+ loadtime:
83
+ mem_fraction_static: 0.85
84
+ served_model_name: Qwen/Qwen3-4B-Instruct-2507
85
+ # speculative decoding (MTP/EAGLE/NGRAM) intentionally absent;
86
+ # week-1 validation decides whether to promote a side-cell. See §4.9.
87
+ runtime:
88
+ first_chunk_timeout_s: 30
89
+ inter_chunk_timeout_s: 10
90
+ overall_timeout_s: 300
91
+ default_sampling:
92
+ temperature: 0.7
93
+ top_p: 0.9
94
+ stop_tokens:
95
+ - "<|im_end|>"
96
+ # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
97
+ # capacity also grows: with 2-4× the KV budget the context window can be
98
+ # widened proportionally so longer-context workloads (RAG with large
99
+ # retrieved passages) fit comfortably. ``max_output_tokens`` doubles
100
+ # to 8192/16384 respectively — beyond that, latency hurts more than
101
+ # quality helps for instruction-style chat traffic.
102
+ a100-40gb:
103
+ max_batch_tokens: 32768
104
+ compute_precision: bfloat16
105
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
106
+ # Empirically calibrated on Modal A100-SXM4-40GB
107
+ # (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
108
+ # SGLang's /server_info reports:
109
+ # weight=7.71 GB, kvcache=25.42 GB, graph=0.18 GB,
110
+ # token_capacity=185,081 tokens
111
+ # ``kv_budget_tokens`` sized for 4 concurrent admissions:
112
+ # 185,081 / 4 = 46,270 → round down to 45,056 for headroom.
113
+ # Re-calibrate if SGLang version or mem_fraction_static changes.
114
+ kv_budget_tokens: 45056
115
+ adapter_options:
116
+ loadtime:
117
+ mem_fraction_static: 0.85
118
+ served_model_name: Qwen/Qwen3-4B-Instruct-2507
119
+ runtime:
120
+ first_chunk_timeout_s: 30
121
+ inter_chunk_timeout_s: 10
122
+ overall_timeout_s: 300
123
+ default_sampling:
124
+ temperature: 0.7
125
+ top_p: 0.9
126
+ stop_tokens:
127
+ - "<|im_end|>"
128
+ h100:
129
+ max_batch_tokens: 65536
130
+ compute_precision: bfloat16
131
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
132
+ # Empirically calibrated on Modal H100 80GB HBM3
133
+ # (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
134
+ # SGLang's /server_info reports:
135
+ # weight=7.71 GB, kvcache=59.0 GB, graph=0.43 GB,
136
+ # token_capacity=429,645 tokens
137
+ # ``kv_budget_tokens`` sized for 4 concurrent admissions:
138
+ # 429,645 / 4 = 107,411 → round down to 106,496 for headroom.
139
+ kv_budget_tokens: 106496
140
+ adapter_options:
141
+ loadtime:
142
+ mem_fraction_static: 0.85
143
+ served_model_name: Qwen/Qwen3-4B-Instruct-2507
144
+ runtime:
145
+ first_chunk_timeout_s: 30
146
+ inter_chunk_timeout_s: 10
147
+ overall_timeout_s: 300
148
+ default_sampling:
149
+ temperature: 0.7
150
+ top_p: 0.9
151
+ stop_tokens:
152
+ - "<|im_end|>"
@@ -18,7 +18,7 @@ profiles:
18
18
  default:
19
19
  max_batch_tokens: 16384
20
20
  compute_precision: bfloat16
21
- adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
21
+ adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
22
22
  adapter_options:
23
23
  loadtime:
24
24
  mem_fraction_static: 0.85