sie-server 0.3.4__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (425):
  1. {sie_server-0.3.4 → sie_server-0.4.1}/Dockerfile.cpu +24 -3
  2. {sie_server-0.3.4 → sie_server-0.4.1}/Dockerfile.cuda12 +32 -3
  3. {sie_server-0.3.4 → sie_server-0.4.1}/PKG-INFO +1 -1
  4. {sie_server-0.3.4 → sie_server-0.4.1}/README.md +6 -0
  5. {sie_server-0.3.4 → sie_server-0.4.1}/bundles/default.yaml +1 -1
  6. sie_server-0.4.1/bundles/sglang-embedding.yaml +18 -0
  7. sie_server-0.4.1/bundles/sglang.yaml +66 -0
  8. {sie_server-0.3.4 → sie_server-0.4.1}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +1 -1
  9. {sie_server-0.3.4 → sie_server-0.4.1}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +1 -1
  10. sie_server-0.4.1/models/Qwen__Qwen3-0.6B.yaml +119 -0
  11. sie_server-0.4.1/models/Qwen__Qwen3-4B-Instruct-2507.yaml +152 -0
  12. {sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-Embedding-4B.yaml +1 -1
  13. sie_server-0.4.1/models/Qwen__Qwen3.5-4B.yaml +261 -0
  14. sie_server-0.4.1/models/Qwen__Qwen3.6-27B.yaml +196 -0
  15. {sie_server-0.3.4 → sie_server-0.4.1}/models/Salesforce__SFR-Embedding-2_R.yaml +1 -1
  16. {sie_server-0.3.4 → sie_server-0.4.1}/models/Salesforce__SFR-Embedding-Mistral.yaml +1 -1
  17. {sie_server-0.3.4 → sie_server-0.4.1}/models/intfloat__e5-mistral-7b-instruct.yaml +1 -1
  18. {sie_server-0.3.4 → sie_server-0.4.1}/openapi.json +22 -2
  19. {sie_server-0.3.4 → sie_server-0.4.1}/pyproject.toml +4 -3
  20. sie_server-0.4.1/src/sie_server/adapters/_generation_base.py +295 -0
  21. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/_spec.py +1 -1
  22. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/base.py +1 -1
  23. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/bge_m3_flag/__init__.py +3 -0
  24. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/clip/__init__.py +2 -1
  25. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/colpali/__init__.py +2 -1
  26. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/colqwen2/__init__.py +2 -1
  27. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/colqwen3/__init__.py +2 -1
  28. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/donut/__init__.py +2 -1
  29. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/florence2/__init__.py +2 -1
  30. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/glm_ocr/__init__.py +2 -1
  31. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/grounding_dino/__init__.py +2 -2
  32. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/lighton_ocr/__init__.py +2 -1
  33. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/nemo_colembed/__init__.py +2 -1
  34. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/owlv2/__init__.py +2 -1
  35. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/paddleocr_vl/__init__.py +2 -1
  36. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/pytorch_embedding/__init__.py +16 -5
  37. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +3 -2
  38. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +2 -1
  39. sie_server-0.4.1/src/sie_server/adapters/sglang/_server.py +210 -0
  40. sie_server-0.3.4/src/sie_server/adapters/sglang/__init__.py → sie_server-0.4.1/src/sie_server/adapters/sglang/embedding.py +48 -154
  41. sie_server-0.4.1/src/sie_server/adapters/sglang/generation.py +1430 -0
  42. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/siglip/__init__.py +2 -1
  43. sie_server-0.4.1/src/sie_server/api/generate.py +540 -0
  44. sie_server-0.4.1/src/sie_server/api/health.py +79 -0
  45. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/ws.py +54 -5
  46. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/app/app_factory.py +93 -1
  47. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/cli.py +8 -1
  48. sie_server-0.4.1/src/sie_server/config/model.py +633 -0
  49. sie_server-0.4.1/src/sie_server/core/extract_cost.py +101 -0
  50. sie_server-0.4.1/src/sie_server/core/gpu_health.py +164 -0
  51. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/loader.py +30 -0
  52. sie_server-0.4.1/src/sie_server/core/pool_isolation.py +197 -0
  53. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/preprocessor/image.py +3 -2
  54. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/preprocessor/vision.py +11 -14
  55. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/registry.py +87 -0
  56. sie_server-0.4.1/src/sie_server/core/text_tokens.py +34 -0
  57. sie_server-0.4.1/src/sie_server/health/nats_publisher.py +148 -0
  58. sie_server-0.4.1/src/sie_server/health/saturation.py +87 -0
  59. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/nats_pull_loop.py +1047 -64
  60. sie_server-0.4.1/src/sie_server/observability/metrics.py +773 -0
  61. sie_server-0.4.1/src/sie_server/processors/admission.py +78 -0
  62. sie_server-0.4.1/src/sie_server/processors/base.py +22 -0
  63. sie_server-0.4.1/src/sie_server/processors/grammar_cache.py +96 -0
  64. sie_server-0.4.1/src/sie_server/processors/grammar_compile.py +237 -0
  65. sie_server-0.4.1/src/sie_server/processors/streaming.py +3263 -0
  66. sie_server-0.4.1/src/sie_server/processors/tool_call_grammar.py +191 -0
  67. sie_server-0.4.1/src/sie_server/processors/tool_call_parser.py +706 -0
  68. sie_server-0.4.1/src/sie_server/processors/work_class_scheduler.py +281 -0
  69. sie_server-0.4.1/src/sie_server/types/grammar.py +130 -0
  70. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/inputs.py +68 -5
  71. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_lora_integration.py +1 -1
  72. sie_server-0.4.1/tests/adapters/test_pytorch_embedding_revision.py +77 -0
  73. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_runtime_options.py +3 -4
  74. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_sglang.py +58 -94
  75. sie_server-0.4.1/tests/adapters/test_sglang_generation.py +1081 -0
  76. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_stablebridge_integration.py +1 -1
  77. sie_server-0.4.1/tests/api/test_generate.py +513 -0
  78. sie_server-0.4.1/tests/api/test_health.py +165 -0
  79. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_ws.py +36 -0
  80. {sie_server-0.3.4 → sie_server-0.4.1}/tests/app/test_app_factory.py +1 -0
  81. {sie_server-0.3.4 → sie_server-0.4.1}/tests/config/test_config.py +196 -0
  82. sie_server-0.4.1/tests/config/test_model_prewarm_grammars.py +141 -0
  83. sie_server-0.4.1/tests/config/test_profile_backend_consistency.py +104 -0
  84. {sie_server-0.3.4 → sie_server-0.4.1}/tests/conftest.py +2 -7
  85. sie_server-0.4.1/tests/core/test_gpu_health.py +153 -0
  86. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_loader.py +15 -0
  87. sie_server-0.4.1/tests/core/test_lora_generation_exclusion.py +255 -0
  88. sie_server-0.4.1/tests/core/test_pool_isolation.py +167 -0
  89. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_preprocessor.py +16 -1
  90. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_registry_async.py +52 -0
  91. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_worker_backpressure.py +3 -3
  92. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_worker_extract.py +1 -1
  93. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_worker_score.py +1 -1
  94. sie_server-0.4.1/tests/core/worker/__init__.py +0 -0
  95. sie_server-0.4.1/tests/health/__init__.py +0 -0
  96. sie_server-0.4.1/tests/health/test_nats_publisher.py +86 -0
  97. sie_server-0.4.1/tests/health/test_saturation.py +97 -0
  98. sie_server-0.4.1/tests/health/test_worker_id_consistency.py +100 -0
  99. sie_server-0.4.1/tests/integration/__init__.py +0 -0
  100. sie_server-0.4.1/tests/integration/test_chat_completions.py +205 -0
  101. sie_server-0.4.1/tests/integration/test_grammar_generate.py +231 -0
  102. sie_server-0.4.1/tests/observability/__init__.py +0 -0
  103. sie_server-0.4.1/tests/observability/test_generation_metrics.py +387 -0
  104. {sie_server-0.3.4 → sie_server-0.4.1}/tests/observability/test_metrics.py +4 -2
  105. sie_server-0.4.1/tests/observability/test_trace_propagation.py +250 -0
  106. sie_server-0.4.1/tests/processors/__init__.py +0 -0
  107. sie_server-0.4.1/tests/processors/test_grammar_cache.py +152 -0
  108. sie_server-0.4.1/tests/processors/test_grammar_compile.py +285 -0
  109. sie_server-0.4.1/tests/processors/test_grammar_prewarm.py +437 -0
  110. sie_server-0.4.1/tests/processors/test_streaming.py +2201 -0
  111. sie_server-0.4.1/tests/processors/test_streaming_admission.py +578 -0
  112. sie_server-0.4.1/tests/processors/test_streaming_integration.py +272 -0
  113. sie_server-0.4.1/tests/processors/test_tool_call_grammar.py +134 -0
  114. sie_server-0.4.1/tests/processors/test_tool_call_parser.py +602 -0
  115. sie_server-0.4.1/tests/processors/test_work_class_scheduler.py +148 -0
  116. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_docker_integration.py +5 -0
  117. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_nats_pull_loop.py +520 -2
  118. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_nats_pull_loop_batching.py +175 -0
  119. sie_server-0.4.1/tests/type_defs/__init__.py +0 -0
  120. sie_server-0.4.1/tests/type_defs/test_inputs_json_decode.py +95 -0
  121. sie_server-0.4.1/tests/type_defs/test_media_bytes.py +92 -0
  122. sie_server-0.3.4/Dockerfile.cuda11 +0 -217
  123. sie_server-0.3.4/bundles/sglang.yaml +0 -8
  124. sie_server-0.3.4/src/sie_server/api/health.py +0 -47
  125. sie_server-0.3.4/src/sie_server/config/model.py +0 -302
  126. sie_server-0.3.4/src/sie_server/core/extract_cost.py +0 -29
  127. sie_server-0.3.4/src/sie_server/observability/metrics.py +0 -369
  128. sie_server-0.3.4/tests/api/test_health.py +0 -45
  129. {sie_server-0.3.4 → sie_server-0.4.1}/.gitignore +0 -0
  130. {sie_server-0.3.4 → sie_server-0.4.1}/CONTRIBUTING.md +0 -0
  131. {sie_server-0.3.4 → sie_server-0.4.1}/LICENSE +0 -0
  132. {sie_server-0.3.4 → sie_server-0.4.1}/bundles/transformers5.yaml +0 -0
  133. {sie_server-0.3.4 → sie_server-0.4.1}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
  134. {sie_server-0.3.4 → sie_server-0.4.1}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
  135. {sie_server-0.3.4 → sie_server-0.4.1}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
  136. {sie_server-0.3.4 → sie_server-0.4.1}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
  137. {sie_server-0.3.4 → sie_server-0.4.1}/models/BAAI__bge-m3.yaml +0 -0
  138. {sie_server-0.3.4 → sie_server-0.4.1}/models/BAAI__bge-reranker-base.yaml +0 -0
  139. {sie_server-0.3.4 → sie_server-0.4.1}/models/BAAI__bge-reranker-large.yaml +0 -0
  140. {sie_server-0.3.4 → sie_server-0.4.1}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
  141. {sie_server-0.3.4 → sie_server-0.4.1}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
  142. {sie_server-0.3.4 → sie_server-0.4.1}/models/GritLM__GritLM-7B.yaml +0 -0
  143. {sie_server-0.3.4 → sie_server-0.4.1}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
  144. {sie_server-0.3.4 → sie_server-0.4.1}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
  145. {sie_server-0.3.4 → sie_server-0.4.1}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
  146. {sie_server-0.3.4 → sie_server-0.4.1}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
  147. {sie_server-0.3.4 → sie_server-0.4.1}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
  148. {sie_server-0.3.4 → sie_server-0.4.1}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
  149. {sie_server-0.3.4 → sie_server-0.4.1}/models/NeuML__gliner-bert-tiny.yaml +0 -0
  150. {sie_server-0.3.4 → sie_server-0.4.1}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
  151. {sie_server-0.3.4 → sie_server-0.4.1}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
  152. {sie_server-0.3.4 → sie_server-0.4.1}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
  153. {sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
  154. {sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
  155. {sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
  156. {sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
  157. {sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
  158. {sie_server-0.3.4 → sie_server-0.4.1}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
  159. {sie_server-0.3.4 → sie_server-0.4.1}/models/answerdotai__ModernBERT-base.yaml +0 -0
  160. {sie_server-0.3.4 → sie_server-0.4.1}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
  161. {sie_server-0.3.4 → sie_server-0.4.1}/models/colbert-ir__colbertv2.0.yaml +0 -0
  162. {sie_server-0.3.4 → sie_server-0.4.1}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
  163. {sie_server-0.3.4 → sie_server-0.4.1}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
  164. {sie_server-0.3.4 → sie_server-0.4.1}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
  165. {sie_server-0.3.4 → sie_server-0.4.1}/models/docling.yaml +0 -0
  166. {sie_server-0.3.4 → sie_server-0.4.1}/models/fastino__gliner2-base-v1.yaml +0 -0
  167. {sie_server-0.3.4 → sie_server-0.4.1}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
  168. {sie_server-0.3.4 → sie_server-0.4.1}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
  169. {sie_server-0.3.4 → sie_server-0.4.1}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
  170. {sie_server-0.3.4 → sie_server-0.4.1}/models/google__embeddinggemma-300m.yaml +0 -0
  171. {sie_server-0.3.4 → sie_server-0.4.1}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
  172. {sie_server-0.3.4 → sie_server-0.4.1}/models/google__siglip-so400m-patch14-224.yaml +0 -0
  173. {sie_server-0.3.4 → sie_server-0.4.1}/models/google__siglip-so400m-patch14-384.yaml +0 -0
  174. {sie_server-0.3.4 → sie_server-0.4.1}/models/google__siglip2-base-patch16-224.yaml +0 -0
  175. {sie_server-0.3.4 → sie_server-0.4.1}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
  176. {sie_server-0.3.4 → sie_server-0.4.1}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
  177. {sie_server-0.3.4 → sie_server-0.4.1}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
  178. {sie_server-0.3.4 → sie_server-0.4.1}/models/intfloat__e5-base-v2.yaml +0 -0
  179. {sie_server-0.3.4 → sie_server-0.4.1}/models/intfloat__e5-large-v2.yaml +0 -0
  180. {sie_server-0.3.4 → sie_server-0.4.1}/models/intfloat__e5-small-v2.yaml +0 -0
  181. {sie_server-0.3.4 → sie_server-0.4.1}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
  182. {sie_server-0.3.4 → sie_server-0.4.1}/models/intfloat__multilingual-e5-large.yaml +0 -0
  183. {sie_server-0.3.4 → sie_server-0.4.1}/models/jackboyla__glirel-large-v0.yaml +0 -0
  184. {sie_server-0.3.4 → sie_server-0.4.1}/models/jinaai__jina-colbert-v2.yaml +0 -0
  185. {sie_server-0.3.4 → sie_server-0.4.1}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
  186. {sie_server-0.3.4 → sie_server-0.4.1}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
  187. {sie_server-0.3.4 → sie_server-0.4.1}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
  188. {sie_server-0.3.4 → sie_server-0.4.1}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
  189. {sie_server-0.3.4 → sie_server-0.4.1}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
  190. {sie_server-0.3.4 → sie_server-0.4.1}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
  191. {sie_server-0.3.4 → sie_server-0.4.1}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
  192. {sie_server-0.3.4 → sie_server-0.4.1}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
  193. {sie_server-0.3.4 → sie_server-0.4.1}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
  194. {sie_server-0.3.4 → sie_server-0.4.1}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
  195. {sie_server-0.3.4 → sie_server-0.4.1}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
  196. {sie_server-0.3.4 → sie_server-0.4.1}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
  197. {sie_server-0.3.4 → sie_server-0.4.1}/models/microsoft__Florence-2-base-ft.yaml +0 -0
  198. {sie_server-0.3.4 → sie_server-0.4.1}/models/microsoft__Florence-2-base.yaml +0 -0
  199. {sie_server-0.3.4 → sie_server-0.4.1}/models/microsoft__Florence-2-large.yaml +0 -0
  200. {sie_server-0.3.4 → sie_server-0.4.1}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
  201. {sie_server-0.3.4 → sie_server-0.4.1}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
  202. {sie_server-0.3.4 → sie_server-0.4.1}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
  203. {sie_server-0.3.4 → sie_server-0.4.1}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
  204. {sie_server-0.3.4 → sie_server-0.4.1}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
  205. {sie_server-0.3.4 → sie_server-0.4.1}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
  206. {sie_server-0.3.4 → sie_server-0.4.1}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
  207. {sie_server-0.3.4 → sie_server-0.4.1}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
  208. {sie_server-0.3.4 → sie_server-0.4.1}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
  209. {sie_server-0.3.4 → sie_server-0.4.1}/models/naver__splade-v3.yaml +0 -0
  210. {sie_server-0.3.4 → sie_server-0.4.1}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
  211. {sie_server-0.3.4 → sie_server-0.4.1}/models/numind__NuNER_Zero-span.yaml +0 -0
  212. {sie_server-0.3.4 → sie_server-0.4.1}/models/numind__NuNER_Zero.yaml +0 -0
  213. {sie_server-0.3.4 → sie_server-0.4.1}/models/nvidia__NV-Embed-v2.yaml +0 -0
  214. {sie_server-0.3.4 → sie_server-0.4.1}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
  215. {sie_server-0.3.4 → sie_server-0.4.1}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
  216. {sie_server-0.3.4 → sie_server-0.4.1}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
  217. {sie_server-0.3.4 → sie_server-0.4.1}/models/openai__clip-vit-base-patch32.yaml +0 -0
  218. {sie_server-0.3.4 → sie_server-0.4.1}/models/openai__clip-vit-large-patch14.yaml +0 -0
  219. {sie_server-0.3.4 → sie_server-0.4.1}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
  220. {sie_server-0.3.4 → sie_server-0.4.1}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
  221. {sie_server-0.3.4 → sie_server-0.4.1}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
  222. {sie_server-0.3.4 → sie_server-0.4.1}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
  223. {sie_server-0.3.4 → sie_server-0.4.1}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
  224. {sie_server-0.3.4 → sie_server-0.4.1}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
  225. {sie_server-0.3.4 → sie_server-0.4.1}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
  226. {sie_server-0.3.4 → sie_server-0.4.1}/models/rasyosef__splade-mini.yaml +0 -0
  227. {sie_server-0.3.4 → sie_server-0.4.1}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
  228. {sie_server-0.3.4 → sie_server-0.4.1}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
  229. {sie_server-0.3.4 → sie_server-0.4.1}/models/tomoroai__tomoro-colqwen3-embed-4b.yaml +0 -0
  230. {sie_server-0.3.4 → sie_server-0.4.1}/models/urchade__gliner_large-v2.1.yaml +0 -0
  231. {sie_server-0.3.4 → sie_server-0.4.1}/models/urchade__gliner_medium-v2.1.yaml +0 -0
  232. {sie_server-0.3.4 → sie_server-0.4.1}/models/urchade__gliner_multi-v2.1.yaml +0 -0
  233. {sie_server-0.3.4 → sie_server-0.4.1}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
  234. {sie_server-0.3.4 → sie_server-0.4.1}/models/urchade__gliner_small-v2.1.yaml +0 -0
  235. {sie_server-0.3.4 → sie_server-0.4.1}/models/vidore__colpali-v1.3-hf.yaml +0 -0
  236. {sie_server-0.3.4 → sie_server-0.4.1}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
  237. {sie_server-0.3.4 → sie_server-0.4.1}/models/zai-org__GLM-OCR.yaml +0 -0
  238. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/__init__.py +0 -0
  239. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/__init__.py +0 -0
  240. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/_base_adapter.py +0 -0
  241. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/_flash_base.py +0 -0
  242. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/_types.py +0 -0
  243. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/_utils.py +0 -0
  244. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
  245. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -0
  246. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
  247. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
  248. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
  249. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/colbert/__init__.py +0 -0
  250. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
  251. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
  252. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
  253. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/docling/__init__.py +0 -0
  254. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/errors.py +0 -0
  255. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/gliclass/__init__.py +0 -0
  256. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/gliner/__init__.py +0 -0
  257. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/gliner2/__init__.py +0 -0
  258. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
  259. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/glirel/__init__.py +0 -0
  260. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
  261. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
  262. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
  263. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
  264. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
  265. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
  266. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
  267. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/peft_lora_mixin.py +0 -0
  268. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
  269. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
  270. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
  271. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
  272. {sie_server-0.3.4/src/sie_server/app → sie_server-0.4.1/src/sie_server/adapters/sglang}/__init__.py +0 -0
  273. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
  274. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
  275. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
  276. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/__init__.py +0 -0
  277. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/encode.py +0 -0
  278. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/extract.py +0 -0
  279. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/helpers.py +0 -0
  280. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/metrics.py +0 -0
  281. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/models.py +0 -0
  282. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/openai_compat.py +0 -0
  283. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/openapi.py +0 -0
  284. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/options.py +0 -0
  285. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/root.py +0 -0
  286. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/score.py +0 -0
  287. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/serialization.py +0 -0
  288. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/api/validation.py +0 -0
  289. {sie_server-0.3.4/src/sie_server/config → sie_server-0.4.1/src/sie_server/app}/__init__.py +0 -0
  290. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/app/app_state_config.py +0 -0
  291. {sie_server-0.3.4/tests/adapters → sie_server-0.4.1/src/sie_server/config}/__init__.py +0 -0
  292. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/config/engine.py +0 -0
  293. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/__init__.py +0 -0
  294. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/adaptive_batching.py +0 -0
  295. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/batcher.py +0 -0
  296. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/deps.py +0 -0
  297. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/disk_cache.py +0 -0
  298. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/encode_pipeline.py +0 -0
  299. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/hf_env.py +0 -0
  300. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/hot_reload.py +0 -0
  301. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/inference.py +0 -0
  302. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/inference_output.py +0 -0
  303. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/load_errors.py +0 -0
  304. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/logging.py +0 -0
  305. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/memory.py +0 -0
  306. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/model_loader.py +0 -0
  307. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/oom.py +0 -0
  308. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/postprocessor.py +0 -0
  309. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/postprocessor_registry.py +0 -0
  310. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/prepared.py +0 -0
  311. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/preprocessor/__init__.py +0 -0
  312. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/preprocessor/base.py +0 -0
  313. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/preprocessor/text.py +0 -0
  314. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/preprocessor_registry.py +0 -0
  315. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/readiness.py +0 -0
  316. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/shutdown.py +0 -0
  317. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/timing.py +0 -0
  318. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/tokenizer.py +0 -0
  319. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/watcher.py +0 -0
  320. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/__init__.py +0 -0
  321. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/handlers/__init__.py +0 -0
  322. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/handlers/base.py +0 -0
  323. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/handlers/encode.py +0 -0
  324. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/handlers/extract.py +0 -0
  325. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/handlers/score.py +0 -0
  326. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/model_worker.py +0 -0
  327. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/oom_recovery.py +0 -0
  328. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/core/worker/types.py +0 -0
  329. {sie_server-0.3.4/tests/api → sie_server-0.4.1/src/sie_server/health}/__init__.py +0 -0
  330. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/main.py +0 -0
  331. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/nats_subscriber.py +0 -0
  332. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/observability/__init__.py +0 -0
  333. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/observability/gpu.py +0 -0
  334. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/observability/prometheus.py +0 -0
  335. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/observability/telemetry.py +0 -0
  336. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/observability/tracing.py +0 -0
  337. {sie_server-0.3.4/tests/app → sie_server-0.4.1/src/sie_server/processors}/__init__.py +0 -0
  338. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/static/__init__.py +0 -0
  339. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/static/index.html +0 -0
  340. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/__init__.py +0 -0
  341. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/openapi.py +0 -0
  342. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/outputs.py +0 -0
  343. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/overflow_policy.py +0 -0
  344. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/requests.py +0 -0
  345. {sie_server-0.3.4 → sie_server-0.4.1}/src/sie_server/types/responses.py +0 -0
  346. {sie_server-0.3.4/tests/config → sie_server-0.4.1/tests/adapters}/__init__.py +0 -0
  347. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_base.py +0 -0
  348. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_bge_m3.py +0 -0
  349. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_bge_m3_flash.py +0 -0
  350. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_clip.py +0 -0
  351. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_colbert.py +0 -0
  352. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_docling.py +0 -0
  353. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_docling_smoke.py +0 -0
  354. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_donut.py +0 -0
  355. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_factory_integration.py +0 -0
  356. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_flash_base.py +0 -0
  357. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_florence2.py +0 -0
  358. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
  359. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_glirel.py +0 -0
  360. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_glm_ocr.py +0 -0
  361. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_grounding_dino.py +0 -0
  362. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_gte_sparse.py +0 -0
  363. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
  364. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_lighton_ocr.py +0 -0
  365. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_lora.py +0 -0
  366. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_paddleocr_vl.py +0 -0
  367. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_sentence_transformer.py +0 -0
  368. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_siglip.py +0 -0
  369. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_sparse_aggregation.py +0 -0
  370. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_stablebridge_pruner.py +0 -0
  371. {sie_server-0.3.4 → sie_server-0.4.1}/tests/adapters/test_visual_document.py +0 -0
  372. {sie_server-0.3.4/tests/core → sie_server-0.4.1/tests/api}/__init__.py +0 -0
  373. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_encode_dtype.py +0 -0
  374. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_encode_endpoint.py +0 -0
  375. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_encode_json_schema.py +0 -0
  376. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_encode_timing.py +0 -0
  377. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_encode_validation.py +0 -0
  378. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_extract.py +0 -0
  379. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_extract_integration.py +0 -0
  380. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_extract_oom.py +0 -0
  381. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_models.py +0 -0
  382. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_openai_compat.py +0 -0
  383. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_score.py +0 -0
  384. {sie_server-0.3.4 → sie_server-0.4.1}/tests/api/test_version_header.py +0 -0
  385. {sie_server-0.3.4/tests/core/worker → sie_server-0.4.1/tests/app}/__init__.py +0 -0
  386. {sie_server-0.3.4/tests/observability → sie_server-0.4.1/tests/config}/__init__.py +0 -0
  387. {sie_server-0.3.4 → sie_server-0.4.1}/tests/config/test_bundle_coverage.py +0 -0
  388. {sie_server-0.3.4/tests/type_defs → sie_server-0.4.1/tests/core}/__init__.py +0 -0
  389. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_adaptive_batching.py +0 -0
  390. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_batcher.py +0 -0
  391. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_disk_cache.py +0 -0
  392. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_hot_reload.py +0 -0
  393. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_idle_evict.py +0 -0
  394. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_inference.py +0 -0
  395. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_logging.py +0 -0
  396. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_memory.py +0 -0
  397. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_model_load_timeout.py +0 -0
  398. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_oom_detection.py +0 -0
  399. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_postprocessor.py +0 -0
  400. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_postprocessor_registry.py +0 -0
  401. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_prepared.py +0 -0
  402. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_preprocessor_registry.py +0 -0
  403. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_quantization.py +0 -0
  404. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_readiness.py +0 -0
  405. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_registry_core.py +0 -0
  406. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_registry_deps.py +0 -0
  407. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_registry_failed_state.py +0 -0
  408. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_registry_memory.py +0 -0
  409. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_registry_multi_model.py +0 -0
  410. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_shutdown.py +0 -0
  411. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_timing.py +0 -0
  412. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_watcher.py +0 -0
  413. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_worker_core.py +0 -0
  414. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_worker_lora.py +0 -0
  415. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/test_worker_options.py +0 -0
  416. {sie_server-0.3.4 → sie_server-0.4.1}/tests/core/worker/test_oom_recovery.py +0 -0
  417. {sie_server-0.3.4 → sie_server-0.4.1}/tests/observability/test_telemetry.py +0 -0
  418. {sie_server-0.3.4 → sie_server-0.4.1}/tests/observability/test_tracing.py +0 -0
  419. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_all_models.py +0 -0
  420. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_openapi_export.py +0 -0
  421. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_sdk_integration.py +0 -0
  422. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_server_smoke.py +0 -0
  423. {sie_server-0.3.4 → sie_server-0.4.1}/tests/test_sparse_integration.py +0 -0
  424. {sie_server-0.3.4 → sie_server-0.4.1}/tests/type_defs/test_inputs.py +0 -0
  425. {sie_server-0.3.4 → sie_server-0.4.1}/tests/type_defs/test_types.py +0 -0
@@ -63,6 +63,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \
63
63
  # platform in local BuildKit cache and in content-addressed registry layers.
64
64
  FROM deps AS base
65
65
 
66
+ # Source-layer cache key — see Dockerfile.cuda12 for the full rationale.
67
+ # Tie source layers to a per-commit revision arg so a rebuild can't ship
68
+ # stale code via a reused source-COPY layer. Dependency layers in the
69
+ # ``deps`` stage above stay cached.
70
+ ARG SIE_SRC_REV=dev
71
+ RUN echo "sie source revision: ${SIE_SRC_REV}"
72
+
66
73
  COPY packages/sie_sdk/src /tmp/sie_sdk/src
67
74
  COPY packages/sie_server/src src/
68
75
  COPY packages/sie_server/bundles bundles/
@@ -158,13 +165,25 @@ ENV DEBIAN_FRONTEND=noninteractive \
158
165
  # Only the shared libs torch + pillow + rtree actually dlopen at runtime.
159
166
  # libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
160
167
  # rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
168
+ # libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
169
+ # TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the opencv-python
170
+ # wheel unconditionally dlopens an X11 + libGL + glib chain at import even in
171
+ # headless usage. Without these, every docling extract crashes with
172
+ # "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
161
173
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
162
174
  --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
163
175
  apt-get update && apt-get install -y --no-install-recommends \
176
+ libgl1 \
177
+ libglib2.0-0 \
164
178
  libgomp1 \
179
+ libice6 \
165
180
  libjpeg62-turbo \
166
181
  libpng16-16 \
167
- libspatialindex-c6
182
+ libsm6 \
183
+ libspatialindex-c6 \
184
+ libx11-6 \
185
+ libxcb1 \
186
+ libxext6
168
187
 
169
188
  RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
170
189
 
@@ -176,8 +195,10 @@ WORKDIR /app
176
195
  # no /etc/passwd visible (the sie user exists in the runtime FS but --link
177
196
  # layers are created in isolation).
178
197
  COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
179
- COPY --link --from=base --chown=1000:1000 /app/src /app/src
180
- COPY --link --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
198
+ # Source trees are copied WITHOUT --link; see Dockerfile.cuda12 (linked cross-stage
199
+ # copies didn't reliably invalidate on source change).
200
+ COPY --from=base --chown=1000:1000 /app/src /app/src
201
+ COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
181
202
  COPY --link --from=base --chown=1000:1000 /app/models /app/models
182
203
  COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
183
204
  # Bundle-specific extras — last layer so shared layers above stay cached.
@@ -66,6 +66,18 @@ RUN --mount=type=cache,target=/root/.cache/pip \
66
66
  # content-addressed registry layers.
67
67
  FROM deps AS base
68
68
 
69
+ # Source-layer cache key. A rebuild once shipped STALE code: buildx
70
+ # reused a cached source-COPY layer even though the .py files had
71
+ # changed, so the demo had to overlay patched files by hand. Tie the
72
+ # source layers to an explicit revision arg the CI passes per commit
73
+ # (``--build-arg SIE_SRC_REV=$(git rev-parse --short HEAD)``) so any
74
+ # commit forces these layers — and the editable reinstall below — to
75
+ # rebuild. The expensive dependency install lives in the ``deps`` stage
76
+ # ABOVE this line, so it stays cached. Bundles of the same commit share
77
+ # the same SIE_SRC_REV, so cross-bundle layer dedup is preserved.
78
+ ARG SIE_SRC_REV=dev
79
+ RUN echo "sie source revision: ${SIE_SRC_REV}"
80
+
69
81
  COPY packages/sie_sdk/src /tmp/sie_sdk/src
70
82
  COPY packages/sie_server/src src/
71
83
  COPY packages/sie_server/bundles bundles/
@@ -175,15 +187,27 @@ ENV DEBIAN_FRONTEND=noninteractive
175
187
  # libgomp1: torch OpenMP runtime.
176
188
  # libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
177
189
  # rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
190
+ # libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
191
+ # TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the
192
+ # opencv-python wheel unconditionally dlopens an X11 + libGL + glib chain at
193
+ # import even in headless usage. Without these, every docling extract crashes
194
+ # with "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
178
195
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
179
196
  --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
180
197
  apt-get update && apt-get install -y --no-install-recommends \
181
198
  ca-certificates \
182
199
  gcc \
183
200
  libc6-dev \
201
+ libgl1 \
202
+ libglib2.0-0 \
184
203
  libgomp1 \
204
+ libice6 \
185
205
  libnuma1 \
186
- libspatialindex-c6
206
+ libsm6 \
207
+ libspatialindex-c6 \
208
+ libx11-6 \
209
+ libxcb1 \
210
+ libxext6
187
211
 
188
212
  RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
189
213
 
@@ -211,8 +235,13 @@ RUN set -e; \
211
235
  # (the sie user is added in the runtime stage filesystem but --link layers
212
236
  # are created in isolation from the destination stage state).
213
237
  COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
214
- COPY --link --from=base --chown=1000:1000 /app/src /app/src
215
- COPY --link --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
238
+ # Source trees are copied WITHOUT --link. ``COPY --link --from=<stage>``
239
+ # layers are cached on a digest that, in the buildx versions this image
240
+ # was built with, did not reliably invalidate when the upstream source
241
+ # changed — the stale-code bug above. These trees are small, so dropping
242
+ # --link costs negligible dedup while guaranteeing edited code ships.
243
+ COPY --from=base --chown=1000:1000 /app/src /app/src
244
+ COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
216
245
  COPY --link --from=base --chown=1000:1000 /app/models /app/models
217
246
  COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
218
247
  # Bundle-specific extras — last layer so shared layers above stay cached.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sie-server
3
- Version: 0.3.4
3
+ Version: 0.4.1
4
4
  Summary: Search Inference Engine - GPU inference server for search workloads
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
@@ -62,6 +62,12 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
62
62
  | `SIE_DEFAULT_COMPUTE_PRECISION` | `float16` | One of `float16`, `bfloat16`, `float32`. |
63
63
  | `SIE_ATTENTION_BACKEND` | `auto` | One of `auto`, `flash_attention_2`, `sdpa`, `eager`. |
64
64
 
65
+ ### Diagnostics
66
+
67
+ | Env var | Default | Effect |
68
+ |--|--|--|
69
+ | `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
70
+
65
71
  For nested settings (any field with `__`), the env-var format is
66
72
  `SIE_<TOP>__<NESTED>=value`. The complete schema is in
67
73
  `packages/sie_server/src/sie_server/config/engine.py`.
@@ -78,5 +78,5 @@ deps:
78
78
  docling: '>=2,<3'
79
79
  # Flash Attention 2 — CUDA only, prebuilt wheel
80
80
  flash-attn:
81
- url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.0/flash_attn-2.7.4+cu128torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
81
+ url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.11/flash_attn-2.7.4+cu129torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
82
82
  marker: sys_platform == 'linux'
@@ -0,0 +1,18 @@
1
+ name: sglang-embedding
2
+ priority: 21
3
+ adapters:
4
+ - sie_server.adapters.sglang.embedding
5
+ deps:
6
+ # Lockstep with ``bundles/sglang.yaml``. The dependency stack is
7
+ # identical (sglang's grammar backends are unavoidably pulled in even
8
+ # on embedding-only deployments), and the bundle split exists only for
9
+ # worker pool isolation. The previous revision pinned ``outlines-core``
10
+ # (a transitive of ``outlines``) instead of the actual top-level deps
11
+ # the generation bundle pins, which was both wrong and produced silent
12
+ # pip resolution drift on environments that already had a different
13
+ # ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
14
+ # future drift fails fast.
15
+ sglang: '==0.5.10.post1'
16
+ xgrammar: '==0.1.32'
17
+ outlines: '==0.1.11'
18
+ llguidance: '>=0.7.11,<0.8.0'
@@ -0,0 +1,66 @@
1
+ name: sglang
2
+ priority: 20
3
+ adapters:
4
+ - sie_server.adapters.sglang.generation
5
+ deps:
6
+ # SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
7
+ # See: https://github.com/sgl-project/sglang/issues/4869
8
+ #
9
+ # Qwen3.5-4B + Qwen3.6-27B compatibility:
10
+ #
11
+ # ``sglang==0.5.10.post1`` is the canonical target for the Qwen3.x hybrid
12
+ # Gated-DeltaNet + Gated-Attention family on the current
13
+ # L4 / A100-40GB / H100 fleet. Qwen3.6-27B uses the same ``qwen3_5``
14
+ # model class shipped in 0.5.10 — the architecture (64 layers, hybrid
15
+ # Gated DeltaNet + Gated Attention, MTP/NEXTN) is identical, only the
16
+ # parameter count differs.
17
+ #
18
+ # SGLang 0.5.11 was evaluated against CUDA 12.9 + Qwen3.6-27B on Modal
19
+ # H100 (2026-05-27): server boots, loads weights, but the bundled
20
+ # ``sglang/jit_kernel/csrc/elementwise/activation.cuh`` has a C++
21
+ # template bug (``select_kernel<true>(type)`` is parsed as a class-
22
+ # template substitution, not a function-template call) that the
23
+ # stricter ``nvcc`` shipped with CUDA 12.9 rejects at first activation.
24
+ # 0.5.11 is also dev-only on the sglang docs wheel index — not on
25
+ # PyPI. Park the 0.5.11 bump until upstream cuts a stable release with
26
+ # the JIT header fixed; 0.5.10.post1 covers Qwen3.6-27B today.
27
+ #
28
+ # * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
29
+ # * ``sgl_kernel`` covers SM_80 / SM_89 / SM_90 / SM_100 via gencode.
30
+ # * torch==2.9.1 (CUDA 12.9 wheels); ``cuda-python==12.9``.
31
+ sglang: '==0.5.10.post1'
32
+ #
33
+ # Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
34
+ # internally to prevent silent pip resolution drift.
35
+ #
36
+ # XGrammar is SGLang 0.5.10's other supported grammar backend; SGLang pins
37
+ # ``xgrammar==0.1.32`` exactly. That release also brought the
38
+ # structured-output VRAM-leak fix (PR #20697) and grammar-error
39
+ # propagation (PR #20467). Kept available as the fallback backend.
40
+ xgrammar: '==0.1.32'
41
+ #
42
+ # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10.post1. We
43
+ # declare it explicitly at bundle level so the surface is visible.
44
+ # ``outlines-core`` (a separate package) is a transitive of outlines
45
+ # and intentionally NOT pinned here — pinning ``outlines-core`` directly
46
+ # was a bug in the previous bundle revision (it does not give us any
47
+ # grammar functionality on its own).
48
+ #
49
+ # Outlines IS the active grammar backend for Qwen3.5 (partner
50
+ # requirement). Earlier revisions said "do not switch Qwen3.5 to
51
+ # outlines" because the worker-side ``compile_outlines`` preflight
52
+ # crashed with ``'TokenizersBackend' object has no attribute
53
+ # 'vocabulary'``: it passed the raw transformers==5.3.0 tokenizer
54
+ # (now a ``TokenizersBackend``) to Outlines' processor factories, which
55
+ # require an Outlines ``Tokenizer`` adapter exposing ``.vocabulary``.
56
+ # ``compile_outlines`` now wraps the tokenizer in Outlines'
57
+ # ``TransformerTokenizer`` first (the same wrap SGLang's
58
+ # ``OutlinesGrammarBackend`` does internally), so the mismatch is gone
59
+ # for json_schema/regex; ebnf is forwarded straight to SGLang. See
60
+ # ``processors/grammar_compile.py`` ("Tokenizer adapter").
61
+ outlines: '==0.1.11'
62
+ #
63
+ # llguidance is the third grammar backend (regex / json_schema / ebnf).
64
+ # Pinned to SGLang 0.5.10's compatible range. Kept available as the
65
+ # fallback if a future model regresses on xgrammar.
66
+ llguidance: '>=0.7.11,<0.8.0'
@@ -18,7 +18,7 @@ profiles:
18
18
  default:
19
19
  max_batch_tokens: 16384
20
20
  compute_precision: bfloat16
21
- adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
21
+ adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
22
22
  adapter_options:
23
23
  loadtime:
24
24
  mem_fraction_static: 0.85
@@ -18,7 +18,7 @@ profiles:
18
18
  default:
19
19
  max_batch_tokens: 8192
20
20
  compute_precision: bfloat16
21
- adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
21
+ adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
22
22
  adapter_options:
23
23
  loadtime:
24
24
  mem_fraction_static: 0.85
@@ -0,0 +1,119 @@
1
+ sie_id: Qwen/Qwen3-0.6B
2
+ hf_id: Qwen/Qwen3-0.6B
3
+ inputs:
4
+ text: true
5
+ image: false
6
+ audio: false
7
+ video: false
8
+ tasks:
9
+ # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
10
+ # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
11
+ # bad — this is a transport benchmark target, not a production model.
12
+ #
13
+ # Context / batch sizes are deliberately small (1024 vs the headroom an
14
+ # L4 could nominally support) so the validation harness can co-locate
15
+ # the worker's SGLang with a second SGLang for the baseline phase on
16
+ # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
17
+ # alone on a card it doesn't have to share.
18
+ #
19
+ # Note on the three 1024s below: `context_length`, `max_sequence_length`,
20
+ # and `max_batch_tokens` are NOT redundant — they're three independent
21
+ # knobs (per-request context, SGLang --context-length, batcher cost
22
+ # budget) that just happen to collide here because the model is tiny.
23
+ # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
24
+ # non-collapsed shape.
25
+ generate:
26
+ context_length: 1024
27
+ max_output_tokens: 1024
28
+ capabilities:
29
+ grammar: []
30
+ streaming: true
31
+ tools: false
32
+ max_sequence_length: 1024
33
+ # KV-cache memory math (Qwen3-0.6B, bf16):
34
+ # layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
35
+ # kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
36
+ # The 0.6B is a transport benchmark target — context_length is held at
37
+ # 1024 deliberately (see header comment) so the validation harness can
38
+ # co-locate two SGLang instances on an L4. KV budgets per profile
39
+ # scale with the deployment scenario rather than the GPU ceiling.
40
+ profiles:
41
+ default:
42
+ max_batch_tokens: 1024
43
+ compute_precision: bfloat16
44
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
45
+ kv_budget_tokens: 8192
46
+ adapter_options:
47
+ loadtime:
48
+ # 0.8 leaves headroom on a 22 GiB L4 for a second SGLang instance
49
+ # (the validation harness co-locates worker + baseline). If the
50
+ # 0.6B is the only model on the card, 0.9 is fine.
51
+ mem_fraction_static: 0.8
52
+ served_model_name: Qwen/Qwen3-0.6B
53
+ # Modal sandbox lacks flashinfer's JIT prerequisites; switch backends.
54
+ disable_cuda_graph: true
55
+ attention_backend: triton
56
+ runtime:
57
+ first_chunk_timeout_s: 30
58
+ inter_chunk_timeout_s: 10
59
+ # Aligned with the rest of the generate model fleet (300s).
60
+ # The previous 132s was an unexplained magic number that
61
+ # diverged from every other generate config; bumping to the
62
+ # fleet default keeps long-completion requests from hitting a
63
+ # premature overall-timeout on the 0.6B model.
64
+ overall_timeout_s: 300
65
+ default_sampling:
66
+ temperature: 0.0
67
+ top_p: 1.0
68
+ # Dedicated 0.6B deployments on a100/h100 don't co-locate a baseline,
69
+ # so mem_fraction_static returns to the standard 0.85 and the KV budget
70
+ # scales with the larger GPU. kv_budget_tokens stays well below the
71
+ # theoretical ceiling because the 0.6B's *context_length* (1024) caps
72
+ # per-request KV consumption — the budget really just sets the upper
73
+ # bound on concurrent in-flight sequences.
74
+ a100-40gb:
75
+ max_batch_tokens: 4096
76
+ compute_precision: bfloat16
77
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
78
+ kv_budget_tokens: 32768
79
+ adapter_options:
80
+ loadtime:
81
+ mem_fraction_static: 0.85
82
+ served_model_name: Qwen/Qwen3-0.6B
83
+ disable_cuda_graph: true
84
+ attention_backend: triton
85
+ runtime:
86
+ first_chunk_timeout_s: 30
87
+ inter_chunk_timeout_s: 10
88
+ # Aligned with the rest of the generate model fleet (300s).
89
+ # The previous 132s was an unexplained magic number that
90
+ # diverged from every other generate config; bumping to the
91
+ # fleet default keeps long-completion requests from hitting a
92
+ # premature overall-timeout on the 0.6B model.
93
+ overall_timeout_s: 300
94
+ default_sampling:
95
+ temperature: 0.0
96
+ top_p: 1.0
97
+ h100:
98
+ max_batch_tokens: 8192
99
+ compute_precision: bfloat16
100
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
101
+ kv_budget_tokens: 65536
102
+ adapter_options:
103
+ loadtime:
104
+ mem_fraction_static: 0.85
105
+ served_model_name: Qwen/Qwen3-0.6B
106
+ disable_cuda_graph: true
107
+ attention_backend: triton
108
+ runtime:
109
+ first_chunk_timeout_s: 30
110
+ inter_chunk_timeout_s: 10
111
+ # Aligned with the rest of the generate model fleet (300s).
112
+ # The previous 132s was an unexplained magic number that
113
+ # diverged from every other generate config; bumping to the
114
+ # fleet default keeps long-completion requests from hitting a
115
+ # premature overall-timeout on the 0.6B model.
116
+ overall_timeout_s: 300
117
+ default_sampling:
118
+ temperature: 0.0
119
+ top_p: 1.0
@@ -0,0 +1,152 @@
1
+ sie_id: Qwen/Qwen3-4B-Instruct-2507
2
+ hf_id: Qwen/Qwen3-4B-Instruct-2507
3
+ inputs:
4
+ text: true
5
+ image: false
6
+ audio: false
7
+ video: false
8
+ tasks:
9
+ # kv_budget_tokens now lives on profiles (below). The KV calibration
10
+ # follow-up publishes the tuned value; the placeholder here is conservative
11
+ # and assumes Qwen3-4B's ~150 KB/token KV footprint on an L4 (24 GB,
12
+ # mem_fraction_static=0.85).
13
+ generate:
14
+ context_length: 32768
15
+ max_output_tokens: 4096
16
+ capabilities:
17
+ # Outlines-backed JSON Schema, regex, and EBNF grammars are
18
+ # all supported by the SGLang adapter (Outlines and XGrammar
19
+ # both accept EBNF natively). The gateway gates requests on
20
+ # this exact list — adding a new ``grammar.kind`` variant
21
+ # requires both the gateway parser and this list to be updated.
22
+ grammar: ["json_schema", "regex", "ebnf"]
23
+ streaming: true
24
+ # Qwen3-4B-Instruct's chat template emits OpenAI-compatible
25
+ # ``<tool_call>{...}</tool_call>`` blocks when ``tools`` is
26
+ # present in the rendered messages; the worker's
27
+ # ``parse_tool_call_stream`` consumes those blocks and surfaces
28
+ # them on ``delta.tool_calls`` for SSE and on
29
+ # ``message.tool_calls`` for non-streaming requests.
30
+ tools: true
31
+ # Forwarded verbatim to ``tokenizer.apply_chat_template(**kwargs)`` when
32
+ # the worker renders an OpenAI-shaped ``messages`` request.
33
+ # Qwen3's chat template emits a ``<think>``/``</think>`` reasoning block
34
+ # unless this flag suppresses it.
35
+ chat_template_kwargs:
36
+ enable_thinking: false
37
+ # Schemas/regexes the worker pre-compiles at model load so the first
38
+ # request hitting them skips the Outlines compile (cold TTFT win).
39
+ # Failures here log + bump ``sie_worker_grammar_prewarm_total{outcome="failed"}``
40
+ # without blocking model load — add entries only for shapes you
41
+ # know are hot.
42
+ prewarm_grammars:
43
+ # Bare pattern, NOT anchored: this model uses the default Outlines
44
+ # grammar backend, and Outlines regexes are implicitly anchored —
45
+ # its FSM engine (interegular) rejects ``^``/``$`` with
46
+ # ``Unsupported``, which crashes SGLang's scheduler. Use ``(yes|no)``.
47
+ - name: yes_no
48
+ kind: regex
49
+ value: "(yes|no)"
50
+ - name: short_answer
51
+ kind: json_schema
52
+ value:
53
+ type: object
54
+ properties:
55
+ answer:
56
+ type: string
57
+ required: [answer]
58
+ max_sequence_length: 32768
59
+ # KV-cache memory math (Qwen3-4B-Instruct-2507, bf16):
60
+ # layers=36, kv_heads=8, head_dim=128, bytes_per_elem=2
61
+ # kv_bytes_per_token = 2 (k+v) × 36 × 8 × 128 × 2 = 147,456 B ≈ 144 KB
62
+ # Theoretical max KV tokens per GPU (assuming ~8 GB weights, mem_fraction_static=0.85):
63
+ # l4 (24 GB): (24 × 0.85 − 8) GB / 144 KB ≈ 90,000 tokens
64
+ # a100-40gb (40 GB): (40 × 0.85 − 8) GB / 144 KB ≈ 189,000 tokens
65
+ # h100 (80 GB): (80 × 0.85 − 8) GB / 144 KB ≈ 437,000 tokens
66
+ # kv_budget_tokens is set to ~40% of theoretical max, matching the L4
67
+ # baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
68
+ # speculative side-cell, grammar/Outlines compile arena, fragmentation.
69
+ # Final empirical validation (concurrency-16 OOM-boundary sweep) is
70
+ # tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
71
+ profiles:
72
+ default:
73
+ # max_batch_tokens is a generic engine knob; generation does not batch
74
+ # at the SIE layer (SGLang batches internally) but the validator
75
+ # requires the field to be set.
76
+ max_batch_tokens: 16384
77
+ compute_precision: bfloat16
78
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
79
+ # L4 baseline — empirically gated by the speculative-decoding + calibration work.
80
+ kv_budget_tokens: 32768
81
+ adapter_options:
82
+ loadtime:
83
+ mem_fraction_static: 0.85
84
+ served_model_name: Qwen/Qwen3-4B-Instruct-2507
85
+ # speculative decoding (MTP/EAGLE/NGRAM) intentionally absent;
86
+ # week-1 validation decides whether to promote a side-cell. See §4.9.
87
+ runtime:
88
+ first_chunk_timeout_s: 30
89
+ inter_chunk_timeout_s: 10
90
+ overall_timeout_s: 300
91
+ default_sampling:
92
+ temperature: 0.7
93
+ top_p: 0.9
94
+ stop_tokens:
95
+ - "<|im_end|>"
96
+ # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
97
+ # capacity also grows: with 2-4× the KV budget the context window can be
98
+ # widened proportionally so longer-context workloads (RAG with large
99
+ # retrieved passages) fit comfortably. ``max_output_tokens`` doubles
100
+ # to 8192/16384 respectively — beyond that, latency hurts more than
101
+ # quality helps for instruction-style chat traffic.
102
+ a100-40gb:
103
+ max_batch_tokens: 32768
104
+ compute_precision: bfloat16
105
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
106
+ # Empirically calibrated on Modal A100-SXM4-40GB
107
+ # (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
108
+ # SGLang's /server_info reports:
109
+ # weight=7.71 GB, kvcache=25.42 GB, graph=0.18 GB,
110
+ # token_capacity=185,081 tokens
111
+ # ``kv_budget_tokens`` sized for 4 concurrent admissions:
112
+ # 185,081 / 4 = 46,270 → round down to 45,056 for headroom.
113
+ # Re-calibrate if SGLang version or mem_fraction_static changes.
114
+ kv_budget_tokens: 45056
115
+ adapter_options:
116
+ loadtime:
117
+ mem_fraction_static: 0.85
118
+ served_model_name: Qwen/Qwen3-4B-Instruct-2507
119
+ runtime:
120
+ first_chunk_timeout_s: 30
121
+ inter_chunk_timeout_s: 10
122
+ overall_timeout_s: 300
123
+ default_sampling:
124
+ temperature: 0.7
125
+ top_p: 0.9
126
+ stop_tokens:
127
+ - "<|im_end|>"
128
+ h100:
129
+ max_batch_tokens: 65536
130
+ compute_precision: bfloat16
131
+ adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
132
+ # Empirically calibrated on Modal H100 80GB HBM3
133
+ # (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
134
+ # SGLang's /server_info reports:
135
+ # weight=7.71 GB, kvcache=59.0 GB, graph=0.43 GB,
136
+ # token_capacity=429,645 tokens
137
+ # ``kv_budget_tokens`` sized for 4 concurrent admissions:
138
+ # 429,645 / 4 = 107,411 → round down to 106,496 for headroom.
139
+ kv_budget_tokens: 106496
140
+ adapter_options:
141
+ loadtime:
142
+ mem_fraction_static: 0.85
143
+ served_model_name: Qwen/Qwen3-4B-Instruct-2507
144
+ runtime:
145
+ first_chunk_timeout_s: 30
146
+ inter_chunk_timeout_s: 10
147
+ overall_timeout_s: 300
148
+ default_sampling:
149
+ temperature: 0.7
150
+ top_p: 0.9
151
+ stop_tokens:
152
+ - "<|im_end|>"
@@ -18,7 +18,7 @@ profiles:
18
18
  default:
19
19
  max_batch_tokens: 16384
20
20
  compute_precision: bfloat16
21
- adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
21
+ adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
22
22
  adapter_options:
23
23
  loadtime:
24
24
  mem_fraction_static: 0.85