sie-server 0.3.4__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sie_server-0.3.4 → sie_server-0.4.0}/Dockerfile.cpu +32 -5
- {sie_server-0.3.4 → sie_server-0.4.0}/Dockerfile.cuda12 +40 -5
- {sie_server-0.3.4 → sie_server-0.4.0}/PKG-INFO +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/README.md +6 -0
- sie_server-0.4.0/bundles/sglang-embedding.yaml +18 -0
- sie_server-0.4.0/bundles/sglang.yaml +66 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +1 -1
- sie_server-0.4.0/models/Qwen__Qwen3-0.6B.yaml +119 -0
- sie_server-0.4.0/models/Qwen__Qwen3-4B-Instruct-2507.yaml +152 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Qwen__Qwen3-Embedding-4B.yaml +1 -1
- sie_server-0.4.0/models/Qwen__Qwen3.5-4B.yaml +261 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Salesforce__SFR-Embedding-2_R.yaml +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Salesforce__SFR-Embedding-Mistral.yaml +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/models/intfloat__e5-mistral-7b-instruct.yaml +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/openapi.json +22 -2
- {sie_server-0.3.4 → sie_server-0.4.0}/pyproject.toml +1 -1
- sie_server-0.4.0/src/sie_server/adapters/_generation_base.py +295 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/_spec.py +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/base.py +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3_flag/__init__.py +3 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/clip/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/colpali/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/colqwen2/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/colqwen3/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/donut/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/florence2/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/glm_ocr/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/grounding_dino/__init__.py +2 -2
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/lighton_ocr/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/nemo_colembed/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/owlv2/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/paddleocr_vl/__init__.py +2 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/pytorch_embedding/__init__.py +16 -5
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +3 -2
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +2 -1
- sie_server-0.4.0/src/sie_server/adapters/sglang/_server.py +210 -0
- sie_server-0.3.4/src/sie_server/adapters/sglang/__init__.py → sie_server-0.4.0/src/sie_server/adapters/sglang/embedding.py +48 -154
- sie_server-0.4.0/src/sie_server/adapters/sglang/generation.py +1430 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/siglip/__init__.py +2 -1
- sie_server-0.4.0/src/sie_server/api/generate.py +540 -0
- sie_server-0.4.0/src/sie_server/api/health.py +79 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/ws.py +54 -5
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/app/app_factory.py +93 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/cli.py +8 -1
- sie_server-0.4.0/src/sie_server/config/model.py +633 -0
- sie_server-0.4.0/src/sie_server/core/extract_cost.py +101 -0
- sie_server-0.4.0/src/sie_server/core/gpu_health.py +164 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/loader.py +30 -0
- sie_server-0.4.0/src/sie_server/core/pool_isolation.py +197 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/preprocessor/image.py +3 -2
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/preprocessor/vision.py +11 -14
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/registry.py +87 -0
- sie_server-0.4.0/src/sie_server/core/text_tokens.py +34 -0
- sie_server-0.4.0/src/sie_server/health/nats_publisher.py +148 -0
- sie_server-0.4.0/src/sie_server/health/saturation.py +87 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/nats_pull_loop.py +969 -60
- sie_server-0.4.0/src/sie_server/observability/metrics.py +773 -0
- sie_server-0.4.0/src/sie_server/processors/admission.py +78 -0
- sie_server-0.4.0/src/sie_server/processors/base.py +22 -0
- sie_server-0.4.0/src/sie_server/processors/grammar_cache.py +96 -0
- sie_server-0.4.0/src/sie_server/processors/grammar_compile.py +237 -0
- sie_server-0.4.0/src/sie_server/processors/streaming.py +3263 -0
- sie_server-0.4.0/src/sie_server/processors/tool_call_grammar.py +191 -0
- sie_server-0.4.0/src/sie_server/processors/tool_call_parser.py +706 -0
- sie_server-0.4.0/src/sie_server/processors/work_class_scheduler.py +281 -0
- sie_server-0.4.0/src/sie_server/types/grammar.py +130 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/inputs.py +68 -5
- sie_server-0.4.0/tests/adapters/test_pytorch_embedding_revision.py +77 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_runtime_options.py +3 -4
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_sglang.py +58 -94
- sie_server-0.4.0/tests/adapters/test_sglang_generation.py +1081 -0
- sie_server-0.4.0/tests/api/test_generate.py +513 -0
- sie_server-0.4.0/tests/api/test_health.py +165 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_ws.py +36 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/app/test_app_factory.py +1 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/config/test_config.py +196 -0
- sie_server-0.4.0/tests/config/test_model_prewarm_grammars.py +141 -0
- sie_server-0.4.0/tests/config/test_profile_backend_consistency.py +104 -0
- sie_server-0.4.0/tests/core/test_gpu_health.py +153 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_loader.py +15 -0
- sie_server-0.4.0/tests/core/test_lora_generation_exclusion.py +255 -0
- sie_server-0.4.0/tests/core/test_pool_isolation.py +167 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_preprocessor.py +16 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_registry_async.py +52 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_worker_backpressure.py +3 -3
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_worker_extract.py +1 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_worker_score.py +1 -1
- sie_server-0.4.0/tests/core/worker/__init__.py +0 -0
- sie_server-0.4.0/tests/health/__init__.py +0 -0
- sie_server-0.4.0/tests/health/test_nats_publisher.py +86 -0
- sie_server-0.4.0/tests/health/test_saturation.py +97 -0
- sie_server-0.4.0/tests/health/test_worker_id_consistency.py +100 -0
- sie_server-0.4.0/tests/integration/__init__.py +0 -0
- sie_server-0.4.0/tests/integration/test_chat_completions.py +205 -0
- sie_server-0.4.0/tests/integration/test_grammar_generate.py +231 -0
- sie_server-0.4.0/tests/observability/__init__.py +0 -0
- sie_server-0.4.0/tests/observability/test_generation_metrics.py +387 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/observability/test_metrics.py +4 -2
- sie_server-0.4.0/tests/observability/test_trace_propagation.py +250 -0
- sie_server-0.4.0/tests/processors/__init__.py +0 -0
- sie_server-0.4.0/tests/processors/test_grammar_cache.py +152 -0
- sie_server-0.4.0/tests/processors/test_grammar_compile.py +285 -0
- sie_server-0.4.0/tests/processors/test_grammar_prewarm.py +437 -0
- sie_server-0.4.0/tests/processors/test_streaming.py +2201 -0
- sie_server-0.4.0/tests/processors/test_streaming_admission.py +578 -0
- sie_server-0.4.0/tests/processors/test_streaming_integration.py +272 -0
- sie_server-0.4.0/tests/processors/test_tool_call_grammar.py +134 -0
- sie_server-0.4.0/tests/processors/test_tool_call_parser.py +602 -0
- sie_server-0.4.0/tests/processors/test_work_class_scheduler.py +148 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_docker_integration.py +5 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_nats_pull_loop.py +321 -1
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_nats_pull_loop_batching.py +175 -0
- sie_server-0.4.0/tests/type_defs/__init__.py +0 -0
- sie_server-0.4.0/tests/type_defs/test_inputs_json_decode.py +95 -0
- sie_server-0.4.0/tests/type_defs/test_media_bytes.py +92 -0
- sie_server-0.3.4/Dockerfile.cuda11 +0 -217
- sie_server-0.3.4/bundles/sglang.yaml +0 -8
- sie_server-0.3.4/src/sie_server/api/health.py +0 -47
- sie_server-0.3.4/src/sie_server/config/model.py +0 -302
- sie_server-0.3.4/src/sie_server/core/extract_cost.py +0 -29
- sie_server-0.3.4/src/sie_server/observability/metrics.py +0 -369
- sie_server-0.3.4/tests/api/test_health.py +0 -45
- {sie_server-0.3.4 → sie_server-0.4.0}/.gitignore +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/CONTRIBUTING.md +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/LICENSE +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/bundles/default.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/bundles/transformers5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/BAAI__bge-m3.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/BAAI__bge-reranker-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/BAAI__bge-reranker-large.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/GritLM__GritLM-7B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/NeuML__gliner-bert-tiny.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/answerdotai__ModernBERT-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/colbert-ir__colbertv2.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/docling.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/fastino__gliner2-base-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/google__embeddinggemma-300m.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/google__siglip-so400m-patch14-224.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/google__siglip-so400m-patch14-384.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/google__siglip2-base-patch16-224.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/intfloat__e5-base-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/intfloat__e5-large-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/intfloat__e5-small-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/intfloat__multilingual-e5-large.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/jackboyla__glirel-large-v0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/jinaai__jina-colbert-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/microsoft__Florence-2-base-ft.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/microsoft__Florence-2-base.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/microsoft__Florence-2-large.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/naver__splade-v3.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/numind__NuNER_Zero-span.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/numind__NuNER_Zero.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/nvidia__NV-Embed-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/openai__clip-vit-base-patch32.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/openai__clip-vit-large-patch14.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/rasyosef__splade-mini.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/tomoroai__tomoro-colqwen3-embed-4b.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/urchade__gliner_large-v2.1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/urchade__gliner_medium-v2.1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/urchade__gliner_multi-v2.1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/urchade__gliner_small-v2.1.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/vidore__colpali-v1.3-hf.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/models/zai-org__GLM-OCR.yaml +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/_base_adapter.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/_flash_base.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/_types.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/_utils.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/colbert/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/docling/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/errors.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/gliclass/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/gliner/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/gliner2/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/glirel/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/peft_lora_mixin.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
- {sie_server-0.3.4/src/sie_server/app → sie_server-0.4.0/src/sie_server/adapters/sglang}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/encode.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/extract.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/helpers.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/metrics.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/models.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/openai_compat.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/openapi.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/options.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/root.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/score.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/serialization.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/api/validation.py +0 -0
- {sie_server-0.3.4/src/sie_server/config → sie_server-0.4.0/src/sie_server/app}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/app/app_state_config.py +0 -0
- {sie_server-0.3.4/tests/adapters → sie_server-0.4.0/src/sie_server/config}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/config/engine.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/adaptive_batching.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/batcher.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/deps.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/disk_cache.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/encode_pipeline.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/hf_env.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/hot_reload.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/inference.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/inference_output.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/load_errors.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/logging.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/memory.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/model_loader.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/oom.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/postprocessor.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/postprocessor_registry.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/prepared.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/preprocessor/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/preprocessor/base.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/preprocessor/text.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/preprocessor_registry.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/readiness.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/shutdown.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/timing.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/tokenizer.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/watcher.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/base.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/encode.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/extract.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/handlers/score.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/model_worker.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/oom_recovery.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/core/worker/types.py +0 -0
- {sie_server-0.3.4/tests/api → sie_server-0.4.0/src/sie_server/health}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/main.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/nats_subscriber.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/observability/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/observability/gpu.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/observability/prometheus.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/observability/telemetry.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/observability/tracing.py +0 -0
- {sie_server-0.3.4/tests/app → sie_server-0.4.0/src/sie_server/processors}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/static/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/static/index.html +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/openapi.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/outputs.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/overflow_policy.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/requests.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/src/sie_server/types/responses.py +0 -0
- {sie_server-0.3.4/tests/config → sie_server-0.4.0/tests/adapters}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_base.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_bge_m3.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_bge_m3_flash.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_clip.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_colbert.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_docling.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_docling_smoke.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_donut.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_factory_integration.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_flash_base.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_florence2.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_glirel.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_glm_ocr.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_grounding_dino.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_gte_sparse.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_lighton_ocr.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_lora.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_lora_integration.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_paddleocr_vl.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_sentence_transformer.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_siglip.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_sparse_aggregation.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_stablebridge_integration.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_stablebridge_pruner.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/adapters/test_visual_document.py +0 -0
- {sie_server-0.3.4/tests/core → sie_server-0.4.0/tests/api}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_encode_dtype.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_encode_endpoint.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_encode_json_schema.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_encode_timing.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_encode_validation.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_extract.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_extract_integration.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_extract_oom.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_models.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_openai_compat.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_score.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/api/test_version_header.py +0 -0
- {sie_server-0.3.4/tests/core/worker → sie_server-0.4.0/tests/app}/__init__.py +0 -0
- {sie_server-0.3.4/tests/observability → sie_server-0.4.0/tests/config}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/config/test_bundle_coverage.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/conftest.py +0 -0
- {sie_server-0.3.4/tests/type_defs → sie_server-0.4.0/tests/core}/__init__.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_adaptive_batching.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_batcher.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_disk_cache.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_hot_reload.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_idle_evict.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_inference.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_logging.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_memory.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_model_load_timeout.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_oom_detection.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_postprocessor.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_postprocessor_registry.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_prepared.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_preprocessor_registry.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_quantization.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_readiness.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_registry_core.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_registry_deps.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_registry_failed_state.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_registry_memory.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_registry_multi_model.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_shutdown.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_timing.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_watcher.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_worker_core.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_worker_lora.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/test_worker_options.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/core/worker/test_oom_recovery.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/observability/test_telemetry.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/observability/test_tracing.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_all_models.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_openapi_export.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_sdk_integration.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_server_smoke.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/test_sparse_integration.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/type_defs/test_inputs.py +0 -0
- {sie_server-0.3.4 → sie_server-0.4.0}/tests/type_defs/test_types.py +0 -0
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
# docker buildx build --platform linux/amd64,linux/arm64 -f packages/sie_server/Dockerfile.cpu -t sie-server:cpu .
|
|
7
7
|
|
|
8
8
|
ARG BUNDLE=default
|
|
9
|
+
ARG SIE_DEPS_IMAGE=
|
|
9
10
|
|
|
10
11
|
# =============================================================================
|
|
11
12
|
# Stage 1: Dependencies (pyproject.toml only, cached across code changes)
|
|
@@ -63,6 +64,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|
|
63
64
|
# platform in local BuildKit cache and in content-addressed registry layers.
|
|
64
65
|
FROM deps AS base
|
|
65
66
|
|
|
67
|
+
# Source-layer cache key — see Dockerfile.cuda12 for the full rationale.
|
|
68
|
+
# Tie source layers to a per-commit revision arg so a rebuild can't ship
|
|
69
|
+
# stale code via a reused source-COPY layer. Dependency layers in the
|
|
70
|
+
# ``deps`` stage above stay cached.
|
|
71
|
+
ARG SIE_SRC_REV=dev
|
|
72
|
+
RUN echo "sie source revision: ${SIE_SRC_REV}"
|
|
73
|
+
|
|
66
74
|
COPY packages/sie_sdk/src /tmp/sie_sdk/src
|
|
67
75
|
COPY packages/sie_server/src src/
|
|
68
76
|
COPY packages/sie_server/bundles bundles/
|
|
@@ -100,9 +108,9 @@ RUN set -eux; \
|
|
|
100
108
|
find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
101
109
|
|
|
102
110
|
# =============================================================================
|
|
103
|
-
# Stage
|
|
111
|
+
# Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
|
|
104
112
|
# =============================================================================
|
|
105
|
-
FROM base AS
|
|
113
|
+
FROM base AS bundle_deps
|
|
106
114
|
|
|
107
115
|
ARG BUNDLE
|
|
108
116
|
|
|
@@ -142,6 +150,11 @@ RUN set -eux; \
|
|
|
142
150
|
fi; \
|
|
143
151
|
find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
144
152
|
|
|
153
|
+
# =============================================================================
|
|
154
|
+
# Stage 3b: Builder - optional trampoline to a prebuilt base image
|
|
155
|
+
# =============================================================================
|
|
156
|
+
FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
|
|
157
|
+
|
|
145
158
|
# =============================================================================
|
|
146
159
|
# Stage 4: Runtime
|
|
147
160
|
# =============================================================================
|
|
@@ -158,13 +171,25 @@ ENV DEBIAN_FRONTEND=noninteractive \
|
|
|
158
171
|
# Only the shared libs torch + pillow + rtree actually dlopen at runtime.
|
|
159
172
|
# libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
|
|
160
173
|
# rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
|
|
174
|
+
# libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
|
|
175
|
+
# TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the opencv-python
|
|
176
|
+
# wheel unconditionally dlopens an X11 + libGL + glib chain at import even in
|
|
177
|
+
# headless usage. Without these, every docling extract crashes with
|
|
178
|
+
# "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
|
|
161
179
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
|
162
180
|
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
|
163
181
|
apt-get update && apt-get install -y --no-install-recommends \
|
|
182
|
+
libgl1 \
|
|
183
|
+
libglib2.0-0 \
|
|
164
184
|
libgomp1 \
|
|
185
|
+
libice6 \
|
|
165
186
|
libjpeg62-turbo \
|
|
166
187
|
libpng16-16 \
|
|
167
|
-
|
|
188
|
+
libsm6 \
|
|
189
|
+
libspatialindex-c6 \
|
|
190
|
+
libx11-6 \
|
|
191
|
+
libxcb1 \
|
|
192
|
+
libxext6
|
|
168
193
|
|
|
169
194
|
RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
|
|
170
195
|
|
|
@@ -176,8 +201,10 @@ WORKDIR /app
|
|
|
176
201
|
# no /etc/passwd visible (the sie user exists in the runtime FS but --link
|
|
177
202
|
# layers are created in isolation).
|
|
178
203
|
COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
|
|
179
|
-
|
|
180
|
-
|
|
204
|
+
# Source trees WITHOUT --link — see Dockerfile.cuda12 (linked cross-stage
|
|
205
|
+
# copies didn't reliably invalidate on source change).
|
|
206
|
+
COPY --from=base --chown=1000:1000 /app/src /app/src
|
|
207
|
+
COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
|
|
181
208
|
COPY --link --from=base --chown=1000:1000 /app/models /app/models
|
|
182
209
|
COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
|
|
183
210
|
# Bundle-specific extras — last layer so shared layers above stay cached.
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
ARG BUNDLE=default
|
|
8
8
|
ARG UV_VERSION=0.9.28
|
|
9
|
+
ARG SIE_DEPS_IMAGE=
|
|
9
10
|
|
|
10
11
|
# =============================================================================
|
|
11
12
|
# Stage 1: uv + standalone Python 3.12 (no deadsnakes PPA)
|
|
@@ -66,6 +67,18 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|
|
66
67
|
# content-addressed registry layers.
|
|
67
68
|
FROM deps AS base
|
|
68
69
|
|
|
70
|
+
# Source-layer cache key. A rebuild once shipped STALE code: buildx
|
|
71
|
+
# reused a cached source-COPY layer even though the .py files had
|
|
72
|
+
# changed, so the demo had to overlay patched files by hand. Tie the
|
|
73
|
+
# source layers to an explicit revision arg the CI passes per commit
|
|
74
|
+
# (``--build-arg SIE_SRC_REV=$(git rev-parse --short HEAD)``) so any
|
|
75
|
+
# commit forces these layers — and the editable reinstall below — to
|
|
76
|
+
# rebuild. The expensive dependency install lives in the ``deps`` stage
|
|
77
|
+
# ABOVE this line, so it stays cached. Bundles of the same commit share
|
|
78
|
+
# the same SIE_SRC_REV, so cross-bundle layer dedup is preserved.
|
|
79
|
+
ARG SIE_SRC_REV=dev
|
|
80
|
+
RUN echo "sie source revision: ${SIE_SRC_REV}"
|
|
81
|
+
|
|
69
82
|
COPY packages/sie_sdk/src /tmp/sie_sdk/src
|
|
70
83
|
COPY packages/sie_server/src src/
|
|
71
84
|
COPY packages/sie_server/bundles bundles/
|
|
@@ -114,9 +127,9 @@ RUN set -eux; \
|
|
|
114
127
|
find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
115
128
|
|
|
116
129
|
# =============================================================================
|
|
117
|
-
# Stage
|
|
130
|
+
# Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
|
|
118
131
|
# =============================================================================
|
|
119
|
-
FROM base AS
|
|
132
|
+
FROM base AS bundle_deps
|
|
120
133
|
|
|
121
134
|
ARG BUNDLE
|
|
122
135
|
|
|
@@ -160,6 +173,11 @@ RUN set -eux; \
|
|
|
160
173
|
# Normalize mtimes so rebuilds of the same bundle produce identical layer bytes.
|
|
161
174
|
find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
162
175
|
|
|
176
|
+
# =============================================================================
|
|
177
|
+
# Stage 3b: Builder - optional trampoline to a prebuilt base image
|
|
178
|
+
# =============================================================================
|
|
179
|
+
FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
|
|
180
|
+
|
|
163
181
|
# =============================================================================
|
|
164
182
|
# Stage 4: Runtime
|
|
165
183
|
# =============================================================================
|
|
@@ -175,15 +193,27 @@ ENV DEBIAN_FRONTEND=noninteractive
|
|
|
175
193
|
# libgomp1: torch OpenMP runtime.
|
|
176
194
|
# libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
|
|
177
195
|
# rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
|
|
196
|
+
# libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
|
|
197
|
+
# TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the
|
|
198
|
+
# opencv-python wheel unconditionally dlopens an X11 + libGL + glib chain at
|
|
199
|
+
# import even in headless usage. Without these, every docling extract crashes
|
|
200
|
+
# with "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
|
|
178
201
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
|
179
202
|
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
|
180
203
|
apt-get update && apt-get install -y --no-install-recommends \
|
|
181
204
|
ca-certificates \
|
|
182
205
|
gcc \
|
|
183
206
|
libc6-dev \
|
|
207
|
+
libgl1 \
|
|
208
|
+
libglib2.0-0 \
|
|
184
209
|
libgomp1 \
|
|
210
|
+
libice6 \
|
|
185
211
|
libnuma1 \
|
|
186
|
-
|
|
212
|
+
libsm6 \
|
|
213
|
+
libspatialindex-c6 \
|
|
214
|
+
libx11-6 \
|
|
215
|
+
libxcb1 \
|
|
216
|
+
libxext6
|
|
187
217
|
|
|
188
218
|
RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
|
|
189
219
|
|
|
@@ -211,8 +241,13 @@ RUN set -e; \
|
|
|
211
241
|
# (the sie user is added in the runtime stage filesystem but --link layers
|
|
212
242
|
# are created in isolation from the destination stage state).
|
|
213
243
|
COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
|
|
214
|
-
COPY --link --from
|
|
215
|
-
|
|
244
|
+
# Source trees are copied WITHOUT --link. ``COPY --link --from=<stage>``
|
|
245
|
+
# layers are cached on a digest that, in the buildx versions this image
|
|
246
|
+
# was built with, did not reliably invalidate when the upstream source
|
|
247
|
+
# changed — the stale-code bug above. These trees are small, so dropping
|
|
248
|
+
# --link costs negligible dedup while guaranteeing edited code ships.
|
|
249
|
+
COPY --from=base --chown=1000:1000 /app/src /app/src
|
|
250
|
+
COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
|
|
216
251
|
COPY --link --from=base --chown=1000:1000 /app/models /app/models
|
|
217
252
|
COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
|
|
218
253
|
# Bundle-specific extras — last layer so shared layers above stay cached.
|
|
@@ -62,6 +62,12 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
|
|
|
62
62
|
| `SIE_DEFAULT_COMPUTE_PRECISION` | `float16` | One of `float16`, `bfloat16`, `float32`. |
|
|
63
63
|
| `SIE_ATTENTION_BACKEND` | `auto` | One of `auto`, `flash_attention_2`, `sdpa`, `eager`. |
|
|
64
64
|
|
|
65
|
+
### Diagnostics
|
|
66
|
+
|
|
67
|
+
| Env var | Default | Effect |
|
|
68
|
+
|--|--|--|
|
|
69
|
+
| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
|
|
70
|
+
|
|
65
71
|
For nested settings (any field with `__`), the env-var format is
|
|
66
72
|
`SIE_<TOP>__<NESTED>=value`. The complete schema is in
|
|
67
73
|
`packages/sie_server/src/sie_server/config/engine.py`.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
name: sglang-embedding
|
|
2
|
+
priority: 21
|
|
3
|
+
adapters:
|
|
4
|
+
- sie_server.adapters.sglang.embedding
|
|
5
|
+
deps:
|
|
6
|
+
# Lockstep with ``bundles/sglang.yaml``. The dependency stack is
|
|
7
|
+
# identical (sglang's grammar backends are unavoidably pulled in even
|
|
8
|
+
# on embedding-only deployments), and the bundle split exists only for
|
|
9
|
+
# worker pool isolation. The previous revision pinned ``outlines-core``
|
|
10
|
+
# (a transitive of ``outlines``) instead of the actual top-level deps
|
|
11
|
+
# the generation bundle pins, which was both wrong and produced silent
|
|
12
|
+
# pip resolution drift on environments that already had a different
|
|
13
|
+
# ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
|
|
14
|
+
# future drift fails fast.
|
|
15
|
+
sglang: '==0.5.10'
|
|
16
|
+
xgrammar: '==0.1.32'
|
|
17
|
+
outlines: '==0.1.11'
|
|
18
|
+
llguidance: '>=0.7.11,<0.8.0'
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
name: sglang
|
|
2
|
+
priority: 20
|
|
3
|
+
adapters:
|
|
4
|
+
- sie_server.adapters.sglang.generation
|
|
5
|
+
deps:
|
|
6
|
+
# SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
|
|
7
|
+
# See: https://github.com/sgl-project/sglang/issues/4869
|
|
8
|
+
#
|
|
9
|
+
# Qwen3.5-4B compatibility — M4 req2 Proj 5:
|
|
10
|
+
#
|
|
11
|
+
# ``sglang==0.5.10`` is the canonical target for Qwen3.5-4B on the
|
|
12
|
+
# current L4 / A100-40GB / H100 fleet. Audited against
|
|
13
|
+
# ``python/pyproject.toml@v0.5.10`` upstream (see
|
|
14
|
+
# ``product/plans/qwen35-sglang-mtp-structured-outputs-findings.md``):
|
|
15
|
+
#
|
|
16
|
+
# * ships the ``qwen3_5`` model class (``models/qwen3_5.py``, 1724 LOC)
|
|
17
|
+
# * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
|
|
18
|
+
# * ``sglang-kernel==0.4.1`` wheel covers SM_80 / SM_89 / SM_90 / SM_100
|
|
19
|
+
# via gencode (``CMakeLists.txt``: ``ENABLE_BELOW_SM90=ON`` default).
|
|
20
|
+
# The runtime loader (``sgl_kernel/load_utils.py``) maps
|
|
21
|
+
# compute_capability != 90 → ``sm100/`` subdir, which holds the
|
|
22
|
+
# SM_80 / SM_89 / SM_100 build (precise math). H100 (CC=90) gets
|
|
23
|
+
# the ``sm90/`` fast-math build.
|
|
24
|
+
# * torch==2.9.1 (CUDA 12.8/12.9 wheels); ``cuda-python==12.9``.
|
|
25
|
+
# **Not** CUDA 13 — that's an SGLang-main-only path which only
|
|
26
|
+
# became relevant when looking at the dev branch.
|
|
27
|
+
#
|
|
28
|
+
# Compat note: 0.5.12 wheel observed shipping only ``sm100/`` (no SM_80
|
|
29
|
+
# cubin entry inside) — out of scope; we stay on 0.5.10 until upstream
|
|
30
|
+
# ships multi-arch binaries again.
|
|
31
|
+
sglang: '==0.5.10'
|
|
32
|
+
#
|
|
33
|
+
# Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
|
|
34
|
+
# internally to prevent silent pip resolution drift.
|
|
35
|
+
#
|
|
36
|
+
# XGrammar is SGLang 0.5.10's other supported grammar backend; it pins
|
|
37
|
+
# ``xgrammar==0.1.32`` exactly. That release also brought the
|
|
38
|
+
# structured-output VRAM-leak fix (PR #20697) and grammar-error
|
|
39
|
+
# propagation (PR #20467). Kept available as the fallback backend.
|
|
40
|
+
xgrammar: '==0.1.32'
|
|
41
|
+
#
|
|
42
|
+
# ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10. We
|
|
43
|
+
# declare it explicitly at bundle level so the surface is visible.
|
|
44
|
+
# ``outlines-core`` (a separate package) is a transitive of outlines
|
|
45
|
+
# and intentionally NOT pinned here — pinning ``outlines-core`` directly
|
|
46
|
+
# was a bug in the previous bundle revision (it does not give us any
|
|
47
|
+
# grammar functionality on its own).
|
|
48
|
+
#
|
|
49
|
+
# Outlines IS the active grammar backend for Qwen3.5 (partner
|
|
50
|
+
# requirement). Earlier revisions said "do not switch Qwen3.5 to
|
|
51
|
+
# outlines" because the worker-side ``compile_outlines`` preflight
|
|
52
|
+
# crashed with ``'TokenizersBackend' object has no attribute
|
|
53
|
+
# 'vocabulary'``: it passed the raw transformers==5.3.0 tokenizer
|
|
54
|
+
# (now a ``TokenizersBackend``) to Outlines' processor factories, which
|
|
55
|
+
# require an Outlines ``Tokenizer`` adapter exposing ``.vocabulary``.
|
|
56
|
+
# ``compile_outlines`` now wraps the tokenizer in Outlines'
|
|
57
|
+
# ``TransformerTokenizer`` first (the same wrap SGLang's
|
|
58
|
+
# ``OutlinesGrammarBackend`` does internally), so the mismatch is gone
|
|
59
|
+
# for json_schema/regex; ebnf is forwarded straight to SGLang. See
|
|
60
|
+
# ``processors/grammar_compile.py`` ("Tokenizer adapter").
|
|
61
|
+
outlines: '==0.1.11'
|
|
62
|
+
#
|
|
63
|
+
# llguidance is the third grammar backend (regex / json_schema / ebnf).
|
|
64
|
+
# Pinned to SGLang 0.5.10's compatible range. Kept available as the
|
|
65
|
+
# fallback if a future model regresses on xgrammar.
|
|
66
|
+
llguidance: '>=0.7.11,<0.8.0'
|
|
@@ -18,7 +18,7 @@ profiles:
|
|
|
18
18
|
default:
|
|
19
19
|
max_batch_tokens: 16384
|
|
20
20
|
compute_precision: bfloat16
|
|
21
|
-
adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
|
|
21
|
+
adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
|
|
22
22
|
adapter_options:
|
|
23
23
|
loadtime:
|
|
24
24
|
mem_fraction_static: 0.85
|
|
@@ -18,7 +18,7 @@ profiles:
|
|
|
18
18
|
default:
|
|
19
19
|
max_batch_tokens: 8192
|
|
20
20
|
compute_precision: bfloat16
|
|
21
|
-
adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
|
|
21
|
+
adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
|
|
22
22
|
adapter_options:
|
|
23
23
|
loadtime:
|
|
24
24
|
mem_fraction_static: 0.85
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
sie_id: Qwen/Qwen3-0.6B
|
|
2
|
+
hf_id: Qwen/Qwen3-0.6B
|
|
3
|
+
inputs:
|
|
4
|
+
text: true
|
|
5
|
+
image: false
|
|
6
|
+
audio: false
|
|
7
|
+
video: false
|
|
8
|
+
tasks:
|
|
9
|
+
# Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
|
|
10
|
+
# viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
|
|
11
|
+
# bad — this is a transport benchmark target, not a production model.
|
|
12
|
+
#
|
|
13
|
+
# Context / batch sizes are deliberately small (1024 vs the headroom an
|
|
14
|
+
# L4 could nominally support) so the validation harness can co-locate
|
|
15
|
+
# the worker's SGLang with a second SGLang for the baseline phase on
|
|
16
|
+
# a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
|
|
17
|
+
# alone on a card it doesn't have to share.
|
|
18
|
+
#
|
|
19
|
+
# Note on the three 1024s below: `context_length`, `max_sequence_length`,
|
|
20
|
+
# and `max_batch_tokens` are NOT redundant — they're three independent
|
|
21
|
+
# knobs (per-request context, SGLang --context-length, batcher cost
|
|
22
|
+
# budget) that just happen to collide here because the model is tiny.
|
|
23
|
+
# See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
|
|
24
|
+
# non-collapsed shape.
|
|
25
|
+
generate:
|
|
26
|
+
context_length: 1024
|
|
27
|
+
max_output_tokens: 1024
|
|
28
|
+
capabilities:
|
|
29
|
+
grammar: []
|
|
30
|
+
streaming: true
|
|
31
|
+
tools: false
|
|
32
|
+
max_sequence_length: 1024
|
|
33
|
+
# KV-cache memory math (Qwen3-0.6B, bf16):
|
|
34
|
+
# layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
|
|
35
|
+
# kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
|
|
36
|
+
# The 0.6B is a transport benchmark target — context_length is held at
|
|
37
|
+
# 1024 deliberately (see header comment) so the validation harness can
|
|
38
|
+
# co-locate two SGLang instances on an L4. KV budgets per profile
|
|
39
|
+
# scale with the deployment scenario rather than the GPU ceiling.
|
|
40
|
+
profiles:
|
|
41
|
+
default:
|
|
42
|
+
max_batch_tokens: 1024
|
|
43
|
+
compute_precision: bfloat16
|
|
44
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
45
|
+
kv_budget_tokens: 8192
|
|
46
|
+
adapter_options:
|
|
47
|
+
loadtime:
|
|
48
|
+
# 0.8 leaves headroom on a 22 GiB L4 for a second SGLang instance
|
|
49
|
+
# (the validation harness co-locates worker + baseline). If the
|
|
50
|
+
# 0.6B is the only model on the card, 0.9 is fine.
|
|
51
|
+
mem_fraction_static: 0.8
|
|
52
|
+
served_model_name: Qwen/Qwen3-0.6B
|
|
53
|
+
# Modal sandbox lacks flashinfer's JIT prerequisites; switch backends.
|
|
54
|
+
disable_cuda_graph: true
|
|
55
|
+
attention_backend: triton
|
|
56
|
+
runtime:
|
|
57
|
+
first_chunk_timeout_s: 30
|
|
58
|
+
inter_chunk_timeout_s: 10
|
|
59
|
+
# Aligned with the rest of the generate model fleet (300s).
|
|
60
|
+
# The previous 132s was an unexplained magic number that
|
|
61
|
+
# diverged from every other generate config; bumping to the
|
|
62
|
+
# fleet default keeps long-completion requests from hitting a
|
|
63
|
+
# premature overall-timeout on the 0.6B model.
|
|
64
|
+
overall_timeout_s: 300
|
|
65
|
+
default_sampling:
|
|
66
|
+
temperature: 0.0
|
|
67
|
+
top_p: 1.0
|
|
68
|
+
# Dedicated 0.6B deployments on a100/h100 don't co-locate a baseline,
|
|
69
|
+
# so mem_fraction_static returns to the standard 0.85 and the KV budget
|
|
70
|
+
# scales with the larger GPU. kv_budget_tokens stays well below the
|
|
71
|
+
# theoretical ceiling because the 0.6B's *context_length* (1024) caps
|
|
72
|
+
# per-request KV consumption — the budget really just sets the upper
|
|
73
|
+
# bound on concurrent in-flight sequences.
|
|
74
|
+
a100-40gb:
|
|
75
|
+
max_batch_tokens: 4096
|
|
76
|
+
compute_precision: bfloat16
|
|
77
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
78
|
+
kv_budget_tokens: 32768
|
|
79
|
+
adapter_options:
|
|
80
|
+
loadtime:
|
|
81
|
+
mem_fraction_static: 0.85
|
|
82
|
+
served_model_name: Qwen/Qwen3-0.6B
|
|
83
|
+
disable_cuda_graph: true
|
|
84
|
+
attention_backend: triton
|
|
85
|
+
runtime:
|
|
86
|
+
first_chunk_timeout_s: 30
|
|
87
|
+
inter_chunk_timeout_s: 10
|
|
88
|
+
# Aligned with the rest of the generate model fleet (300s).
|
|
89
|
+
# The previous 132s was an unexplained magic number that
|
|
90
|
+
# diverged from every other generate config; bumping to the
|
|
91
|
+
# fleet default keeps long-completion requests from hitting a
|
|
92
|
+
# premature overall-timeout on the 0.6B model.
|
|
93
|
+
overall_timeout_s: 300
|
|
94
|
+
default_sampling:
|
|
95
|
+
temperature: 0.0
|
|
96
|
+
top_p: 1.0
|
|
97
|
+
h100:
|
|
98
|
+
max_batch_tokens: 8192
|
|
99
|
+
compute_precision: bfloat16
|
|
100
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
101
|
+
kv_budget_tokens: 65536
|
|
102
|
+
adapter_options:
|
|
103
|
+
loadtime:
|
|
104
|
+
mem_fraction_static: 0.85
|
|
105
|
+
served_model_name: Qwen/Qwen3-0.6B
|
|
106
|
+
disable_cuda_graph: true
|
|
107
|
+
attention_backend: triton
|
|
108
|
+
runtime:
|
|
109
|
+
first_chunk_timeout_s: 30
|
|
110
|
+
inter_chunk_timeout_s: 10
|
|
111
|
+
# Aligned with the rest of the generate model fleet (300s).
|
|
112
|
+
# The previous 132s was an unexplained magic number that
|
|
113
|
+
# diverged from every other generate config; bumping to the
|
|
114
|
+
# fleet default keeps long-completion requests from hitting a
|
|
115
|
+
# premature overall-timeout on the 0.6B model.
|
|
116
|
+
overall_timeout_s: 300
|
|
117
|
+
default_sampling:
|
|
118
|
+
temperature: 0.0
|
|
119
|
+
top_p: 1.0
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
sie_id: Qwen/Qwen3-4B-Instruct-2507
|
|
2
|
+
hf_id: Qwen/Qwen3-4B-Instruct-2507
|
|
3
|
+
inputs:
|
|
4
|
+
text: true
|
|
5
|
+
image: false
|
|
6
|
+
audio: false
|
|
7
|
+
video: false
|
|
8
|
+
tasks:
|
|
9
|
+
# kv_budget_tokens now lives on profiles (below). The KV calibration
|
|
10
|
+
# follow-up publishes the tuned value; the placeholder here is conservative
|
|
11
|
+
# and assumes Qwen3-4B's ~150 KB/token KV footprint on an L4 (24 GB,
|
|
12
|
+
# mem_fraction_static=0.85).
|
|
13
|
+
generate:
|
|
14
|
+
context_length: 32768
|
|
15
|
+
max_output_tokens: 4096
|
|
16
|
+
capabilities:
|
|
17
|
+
# Outlines-backed JSON Schema, regex, and EBNF grammars are
|
|
18
|
+
# all supported by the SGLang adapter (Outlines and XGrammar
|
|
19
|
+
# both accept EBNF natively). The gateway gates requests on
|
|
20
|
+
# this exact list — adding a new ``grammar.kind`` variant
|
|
21
|
+
# requires both the gateway parser and this list to be updated.
|
|
22
|
+
grammar: ["json_schema", "regex", "ebnf"]
|
|
23
|
+
streaming: true
|
|
24
|
+
# Qwen3-4B-Instruct's chat template emits OpenAI-compatible
|
|
25
|
+
# ``<tool_call>{...}</tool_call>`` blocks when ``tools`` is
|
|
26
|
+
# present in the rendered messages; the worker's
|
|
27
|
+
# ``parse_tool_call_stream`` consumes those blocks and surfaces
|
|
28
|
+
# them on ``delta.tool_calls`` for SSE and on
|
|
29
|
+
# ``message.tool_calls`` for non-streaming requests.
|
|
30
|
+
tools: true
|
|
31
|
+
# Forwarded verbatim to ``tokenizer.apply_chat_template(**kwargs)`` when
|
|
32
|
+
# the worker renders an OpenAI-shaped ``messages`` request.
|
|
33
|
+
# Qwen3's chat template emits a ``<think>``/``</think>`` reasoning block
|
|
34
|
+
# unless this flag suppresses it.
|
|
35
|
+
chat_template_kwargs:
|
|
36
|
+
enable_thinking: false
|
|
37
|
+
# Schemas/regexes the worker pre-compiles at model load so the first
|
|
38
|
+
# request hitting them skips the Outlines compile (cold TTFT win).
|
|
39
|
+
# Failures here log + bump ``sie_worker_grammar_prewarm_total{outcome="failed"}``
|
|
40
|
+
# without blocking model load — add entries only for shapes you
|
|
41
|
+
# know are hot.
|
|
42
|
+
prewarm_grammars:
|
|
43
|
+
# Bare pattern, NOT anchored: this model uses the default Outlines
|
|
44
|
+
# grammar backend, and Outlines regexes are implicitly anchored —
|
|
45
|
+
# its FSM engine (interegular) rejects ``^``/``$`` with
|
|
46
|
+
# ``Unsupported``, which crashes SGLang's scheduler. Use ``(yes|no)``.
|
|
47
|
+
- name: yes_no
|
|
48
|
+
kind: regex
|
|
49
|
+
value: "(yes|no)"
|
|
50
|
+
- name: short_answer
|
|
51
|
+
kind: json_schema
|
|
52
|
+
value:
|
|
53
|
+
type: object
|
|
54
|
+
properties:
|
|
55
|
+
answer:
|
|
56
|
+
type: string
|
|
57
|
+
required: [answer]
|
|
58
|
+
max_sequence_length: 32768
|
|
59
|
+
# KV-cache memory math (Qwen3-4B-Instruct-2507, bf16):
|
|
60
|
+
# layers=36, kv_heads=8, head_dim=128, bytes_per_elem=2
|
|
61
|
+
# kv_bytes_per_token = 2 (k+v) × 36 × 8 × 128 × 2 = 147,456 B ≈ 144 KB
|
|
62
|
+
# Theoretical max KV tokens per GPU (assuming ~8 GB weights, mem_fraction_static=0.85):
|
|
63
|
+
# l4 (24 GB): (24 × 0.85 − 8) GB / 144 KB ≈ 90,000 tokens
|
|
64
|
+
# a100-40gb (40 GB): (40 × 0.85 − 8) GB / 144 KB ≈ 189,000 tokens
|
|
65
|
+
# h100 (80 GB): (80 × 0.85 − 8) GB / 144 KB ≈ 437,000 tokens
|
|
66
|
+
# kv_budget_tokens is set to ~40% of theoretical max, matching the L4
|
|
67
|
+
# baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
|
|
68
|
+
# speculative side-cell, grammar/Outlines compile arena, fragmentation.
|
|
69
|
+
# Final empirical validation (concurrency-16 OOM-boundary sweep) is
|
|
70
|
+
# tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
|
|
71
|
+
profiles:
|
|
72
|
+
default:
|
|
73
|
+
# max_batch_tokens is a generic engine knob; generation does not batch
|
|
74
|
+
# at the SIE layer (SGLang batches internally) but the validator
|
|
75
|
+
# requires the field to be set.
|
|
76
|
+
max_batch_tokens: 16384
|
|
77
|
+
compute_precision: bfloat16
|
|
78
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
79
|
+
# L4 baseline — empirically gated by the speculative-decoding + calibration work.
|
|
80
|
+
kv_budget_tokens: 32768
|
|
81
|
+
adapter_options:
|
|
82
|
+
loadtime:
|
|
83
|
+
mem_fraction_static: 0.85
|
|
84
|
+
served_model_name: Qwen/Qwen3-4B-Instruct-2507
|
|
85
|
+
# speculative decoding (MTP/EAGLE/NGRAM) intentionally absent;
|
|
86
|
+
# week-1 validation decides whether to promote a side-cell. See §4.9.
|
|
87
|
+
runtime:
|
|
88
|
+
first_chunk_timeout_s: 30
|
|
89
|
+
inter_chunk_timeout_s: 10
|
|
90
|
+
overall_timeout_s: 300
|
|
91
|
+
default_sampling:
|
|
92
|
+
temperature: 0.7
|
|
93
|
+
top_p: 0.9
|
|
94
|
+
stop_tokens:
|
|
95
|
+
- "<|im_end|>"
|
|
96
|
+
# M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
|
|
97
|
+
# capacity also grows: with 2-4× the KV budget the context window can be
|
|
98
|
+
# widened proportionally so longer-context workloads (RAG with large
|
|
99
|
+
# retrieved passages) fit comfortably. ``max_output_tokens`` doubles
|
|
100
|
+
# to 8192/16384 respectively — beyond that, latency hurts more than
|
|
101
|
+
# quality helps for instruction-style chat traffic.
|
|
102
|
+
a100-40gb:
|
|
103
|
+
max_batch_tokens: 32768
|
|
104
|
+
compute_precision: bfloat16
|
|
105
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
106
|
+
# Empirically calibrated on Modal A100-SXM4-40GB
|
|
107
|
+
# (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
|
|
108
|
+
# SGLang's /server_info reports:
|
|
109
|
+
# weight=7.71 GB, kvcache=25.42 GB, graph=0.18 GB,
|
|
110
|
+
# token_capacity=185,081 tokens
|
|
111
|
+
# ``kv_budget_tokens`` sized for 4 concurrent admissions:
|
|
112
|
+
# 185,081 / 4 = 46,270 → round down to 45,056 for headroom.
|
|
113
|
+
# Re-calibrate if SGLang version or mem_fraction_static changes.
|
|
114
|
+
kv_budget_tokens: 45056
|
|
115
|
+
adapter_options:
|
|
116
|
+
loadtime:
|
|
117
|
+
mem_fraction_static: 0.85
|
|
118
|
+
served_model_name: Qwen/Qwen3-4B-Instruct-2507
|
|
119
|
+
runtime:
|
|
120
|
+
first_chunk_timeout_s: 30
|
|
121
|
+
inter_chunk_timeout_s: 10
|
|
122
|
+
overall_timeout_s: 300
|
|
123
|
+
default_sampling:
|
|
124
|
+
temperature: 0.7
|
|
125
|
+
top_p: 0.9
|
|
126
|
+
stop_tokens:
|
|
127
|
+
- "<|im_end|>"
|
|
128
|
+
h100:
|
|
129
|
+
max_batch_tokens: 65536
|
|
130
|
+
compute_precision: bfloat16
|
|
131
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
132
|
+
# Empirically calibrated on Modal H100 80GB HBM3
|
|
133
|
+
# (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
|
|
134
|
+
# SGLang's /server_info reports:
|
|
135
|
+
# weight=7.71 GB, kvcache=59.0 GB, graph=0.43 GB,
|
|
136
|
+
# token_capacity=429,645 tokens
|
|
137
|
+
# ``kv_budget_tokens`` sized for 4 concurrent admissions:
|
|
138
|
+
# 429,645 / 4 = 107,411 → round down to 106,496 for headroom.
|
|
139
|
+
kv_budget_tokens: 106496
|
|
140
|
+
adapter_options:
|
|
141
|
+
loadtime:
|
|
142
|
+
mem_fraction_static: 0.85
|
|
143
|
+
served_model_name: Qwen/Qwen3-4B-Instruct-2507
|
|
144
|
+
runtime:
|
|
145
|
+
first_chunk_timeout_s: 30
|
|
146
|
+
inter_chunk_timeout_s: 10
|
|
147
|
+
overall_timeout_s: 300
|
|
148
|
+
default_sampling:
|
|
149
|
+
temperature: 0.7
|
|
150
|
+
top_p: 0.9
|
|
151
|
+
stop_tokens:
|
|
152
|
+
- "<|im_end|>"
|
|
@@ -18,7 +18,7 @@ profiles:
|
|
|
18
18
|
default:
|
|
19
19
|
max_batch_tokens: 16384
|
|
20
20
|
compute_precision: bfloat16
|
|
21
|
-
adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
|
|
21
|
+
adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
|
|
22
22
|
adapter_options:
|
|
23
23
|
loadtime:
|
|
24
24
|
mem_fraction_static: 0.85
|