sie-server 0.4.1__tar.gz → 0.4.2__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {sie_server-0.4.1 → sie_server-0.4.2}/Dockerfile.cpu +5 -5
- {sie_server-0.4.1 → sie_server-0.4.2}/Dockerfile.cuda12 +17 -9
- {sie_server-0.4.1 → sie_server-0.4.2}/PKG-INFO +3 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/README.md +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/bundles/default.yaml +1 -0
- sie_server-0.4.2/models/Marqo__marqo-fashionSigLIP.yaml +28 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-0.6B.yaml +20 -21
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-4B-Instruct-2507.yaml +2 -3
- sie_server-0.4.2/models/Qwen__Qwen3.6-27B.yaml +308 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/docling.yaml +1 -1
- sie_server-0.4.2/models/opendatalab__MinerU2.5-Pro-2604-1.2B.yaml +24 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/openapi.json +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/pyproject.toml +7 -4
- sie_server-0.4.2/scripts/generate_tokenize_fixture.py +203 -0
- sie_server-0.4.2/src/sie_server/__init__.py +9 -0
- sie_server-0.4.2/src/sie_server/_ipc_test_harness.py +356 -0
- sie_server-0.4.2/src/sie_server/adapter_call_loop.py +439 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_generation_base.py +2 -4
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_utils.py +4 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/base.py +2 -5
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/clip/__init__.py +19 -6
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colpali/__init__.py +18 -13
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colqwen2/__init__.py +6 -4
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colqwen3/__init__.py +72 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/docling/__init__.py +29 -8
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/donut/__init__.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliner/__init__.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/glirel/__init__.py +0 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/glm_ocr/__init__.py +105 -0
- sie_server-0.4.2/src/sie_server/adapters/mineru_vl/__init__.py +434 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nemo_colembed/__init__.py +49 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/peft_lora_mixin.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/pytorch_embedding/__init__.py +17 -4
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/_server.py +1 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/embedding.py +1 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/generation.py +11 -5
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/siglip/__init__.py +3 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/encode.py +3 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/extract.py +10 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/health.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/helpers.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/openai_compat.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/serialization.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/ws.py +25 -9
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/app/app_factory.py +56 -208
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/cli.py +20 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/config/engine.py +79 -6
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/config/model.py +4 -8
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/adaptive_batching.py +205 -10
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/batcher.py +9 -6
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/deps.py +1 -45
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/disk_cache.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/encode_pipeline.py +70 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/extract_cost.py +1 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/hot_reload.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/inference.py +2 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/inference_output.py +0 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/loader.py +21 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/memory.py +2 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/model_loader.py +7 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/pool_isolation.py +2 -5
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/postprocessor.py +0 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/postprocessor_registry.py +2 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/prepared.py +21 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/__init__.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/base.py +1 -1
- sie_server-0.4.2/src/sie_server/core/preprocessor/text.py +495 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/vision.py +175 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor_registry.py +2 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/readiness.py +26 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/registry.py +10 -4
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/timing.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/tokenizer.py +2 -18
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/__init__.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/model_worker.py +167 -12
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/types.py +47 -2
- sie_server-0.4.2/src/sie_server/ipc_server.py +679 -0
- sie_server-0.4.2/src/sie_server/ipc_types.py +514 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/__init__.py +0 -6
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/gpu.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/metrics.py +53 -13
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/prometheus.py +0 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/tracing.py +0 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/streaming.py +110 -30
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/work_class_scheduler.py +4 -5
- sie_server-0.4.2/src/sie_server/queue_executor.py +1088 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/inputs.py +2 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/outputs.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_clip.py +52 -8
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_docling.py +64 -2
- sie_server-0.4.2/tests/adapters/test_mineru_vl.py +380 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_pytorch_embedding_revision.py +34 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sentence_transformer.py +61 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sglang_generation.py +2 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_visual_document.py +18 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_dtype.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_endpoint.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_timing.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_extract.py +26 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_generate.py +2 -6
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/app/test_app_factory.py +173 -17
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_bundle_coverage.py +3 -6
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_config.py +9 -2
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_profile_backend_consistency.py +3 -12
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/conftest.py +29 -15
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_adaptive_batching.py +279 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_batcher.py +13 -11
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_loader.py +79 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_lora_generation_exclusion.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_model_load_timeout.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_preprocessor.py +358 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_async.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_core.py +39 -1
- sie_server-0.4.2/tests/core/test_worker_passthrough.py +220 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/integration/test_chat_completions.py +2 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/integration/test_grammar_generate.py +1 -1
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_generation_metrics.py +4 -4
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_metrics.py +110 -44
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_trace_propagation.py +3 -3
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_grammar_prewarm.py +2 -4
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_streaming.py +14 -26
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_work_class_scheduler.py +1 -1
- sie_server-0.4.2/tests/test_adapter_call_loop.py +295 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_docker_integration.py +9 -9
- sie_server-0.4.2/tests/test_ipc_server.py +712 -0
- sie_server-0.4.2/tests/test_ipc_types_raw_output.py +162 -0
- sie_server-0.4.2/tests/test_model_yaml_filenames.py +35 -0
- sie_server-0.4.2/tests/test_parity_run_batch.py +332 -0
- sie_server-0.4.2/tests/test_queue_executor.py +724 -0
- sie_server-0.4.2/tests/test_queue_executor_stage1d.py +622 -0
- sie_server-0.4.2/tests/test_readiness.py +53 -0
- sie_server-0.4.2/tests/test_server_smoke.py +14 -0
- sie_server-0.4.2/tests/test_stage1d_byte_identity.py +393 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_media_bytes.py +38 -11
- sie_server-0.4.1/models/Qwen__Qwen3.6-27B.yaml +0 -196
- sie_server-0.4.1/src/sie_server/__init__.py +0 -3
- sie_server-0.4.1/src/sie_server/core/preprocessor/text.py +0 -268
- sie_server-0.4.1/src/sie_server/nats_pull_loop.py +0 -2532
- sie_server-0.4.1/src/sie_server/nats_subscriber.py +0 -231
- sie_server-0.4.1/tests/test_nats_pull_loop.py +0 -1122
- sie_server-0.4.1/tests/test_nats_pull_loop_batching.py +0 -1291
- sie_server-0.4.1/tests/test_server_smoke.py +0 -8
- {sie_server-0.4.1 → sie_server-0.4.2}/.gitignore +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/CONTRIBUTING.md +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/LICENSE +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/bundles/sglang-embedding.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/bundles/sglang.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/bundles/transformers5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-m3.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-reranker-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-reranker-large.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/GritLM__GritLM-7B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/NeuML__gliner-bert-tiny.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-4B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3.5-4B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-2_R.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-Mistral.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
- /sie_server-0.4.1/models/tomoroai__tomoro-colqwen3-embed-4b.yaml → /sie_server-0.4.2/models/TomoroAI__tomoro-colqwen3-embed-4b.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/answerdotai__ModernBERT-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/colbert-ir__colbertv2.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/fastino__gliner2-base-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/google__embeddinggemma-300m.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-224.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-384.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/google__siglip2-base-patch16-224.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-base-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-large-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-mistral-7b-instruct.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__e5-small-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/jackboyla__glirel-large-v0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/jinaai__jina-colbert-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/microsoft__Florence-2-base-ft.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/microsoft__Florence-2-base.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/microsoft__Florence-2-large.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/naver__splade-v3.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/numind__NuNER_Zero-span.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/numind__NuNER_Zero.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__NV-Embed-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/openai__clip-vit-base-patch32.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/openai__clip-vit-large-patch14.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/rasyosef__splade-mini.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_large-v2.1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_medium-v2.1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_multi-v2.1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/urchade__gliner_small-v2.1.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/vidore__colpali-v1.3-hf.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/models/zai-org__GLM-OCR.yaml +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_base_adapter.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_flash_base.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_spec.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/_types.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flag/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colbert/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/errors.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/florence2/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliclass/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliner2/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/grounding_dino/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/lighton_ocr/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/owlv2/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/paddleocr_vl/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/sglang/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/generate.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/metrics.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/models.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/openapi.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/options.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/root.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/score.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/api/validation.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/app/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/app/app_state_config.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/config/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/gpu_health.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/hf_env.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/load_errors.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/logging.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/oom.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/preprocessor/image.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/shutdown.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/text_tokens.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/watcher.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/base.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/encode.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/extract.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/score.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/core/worker/oom_recovery.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/health/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/health/nats_publisher.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/health/saturation.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/main.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/observability/telemetry.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/admission.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/base.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/grammar_cache.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/grammar_compile.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/tool_call_grammar.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/processors/tool_call_parser.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/static/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/static/index.html +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/grammar.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/openapi.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/overflow_policy.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/requests.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/src/sie_server/types/responses.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_base.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_bge_m3.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_bge_m3_flash.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_colbert.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_docling_smoke.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_donut.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_factory_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_flash_base.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_florence2.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_glirel.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_glm_ocr.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_grounding_dino.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_gte_sparse.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_lighton_ocr.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_lora.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_lora_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_paddleocr_vl.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_runtime_options.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sglang.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_siglip.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_sparse_aggregation.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_stablebridge_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/adapters/test_stablebridge_pruner.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_json_schema.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_encode_validation.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_extract_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_extract_oom.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_health.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_models.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_openai_compat.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_score.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_version_header.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/api/test_ws.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/app/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/config/test_model_prewarm_grammars.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_disk_cache.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_gpu_health.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_hot_reload.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_idle_evict.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_inference.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_logging.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_memory.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_oom_detection.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_pool_isolation.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_postprocessor.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_postprocessor_registry.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_prepared.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_preprocessor_registry.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_quantization.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_readiness.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_core.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_deps.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_failed_state.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_memory.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_registry_multi_model.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_shutdown.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_timing.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_watcher.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_backpressure.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_extract.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_lora.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_options.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/test_worker_score.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/worker/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/core/worker/test_oom_recovery.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/test_nats_publisher.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/test_saturation.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/health/test_worker_id_consistency.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/integration/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_telemetry.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/observability/test_tracing.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_grammar_cache.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_grammar_compile.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_streaming_admission.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_streaming_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_tool_call_grammar.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/processors/test_tool_call_parser.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_all_models.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_openapi_export.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_sdk_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/test_sparse_integration.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/__init__.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_inputs.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_inputs_json_decode.py +0 -0
- {sie_server-0.4.1 → sie_server-0.4.2}/tests/type_defs/test_types.py +0 -0
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
ARG BUNDLE=default
|
|
9
9
|
|
|
10
10
|
# =============================================================================
|
|
11
|
-
#
|
|
11
|
+
# Dependency image: pyproject-only cache seed
|
|
12
12
|
# =============================================================================
|
|
13
13
|
FROM python:3.12-slim-bookworm AS deps
|
|
14
14
|
|
|
@@ -57,9 +57,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|
|
57
57
|
-e ".[gpu-metrics]"
|
|
58
58
|
|
|
59
59
|
# =============================================================================
|
|
60
|
-
#
|
|
60
|
+
# Shared runtime base: source install and venv finalization
|
|
61
61
|
# =============================================================================
|
|
62
|
-
# Bundle-agnostic: all base
|
|
62
|
+
# Bundle-agnostic: all base image layers are shared across bundles of this
|
|
63
63
|
# platform in local BuildKit cache and in content-addressed registry layers.
|
|
64
64
|
FROM deps AS base
|
|
65
65
|
|
|
@@ -107,7 +107,7 @@ RUN set -eux; \
|
|
|
107
107
|
find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
108
108
|
|
|
109
109
|
# =============================================================================
|
|
110
|
-
#
|
|
110
|
+
# Bundle dependency builder: bundle-specific deps
|
|
111
111
|
# =============================================================================
|
|
112
112
|
FROM base AS builder
|
|
113
113
|
|
|
@@ -150,7 +150,7 @@ RUN set -eux; \
|
|
|
150
150
|
find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
151
151
|
|
|
152
152
|
# =============================================================================
|
|
153
|
-
#
|
|
153
|
+
# Runtime image
|
|
154
154
|
# =============================================================================
|
|
155
155
|
FROM python:3.12-slim-bookworm AS runtime
|
|
156
156
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# syntax=docker/dockerfile:1
|
|
2
|
-
# SIE Server - CUDA 12
|
|
2
|
+
# SIE Server - CUDA 12 Image
|
|
3
3
|
# Build from repo root:
|
|
4
4
|
# docker build -f packages/sie_server/Dockerfile.cuda12 -t sie-server:cuda12-default .
|
|
5
5
|
# docker build -f packages/sie_server/Dockerfile.cuda12 --build-arg BUNDLE=sglang -t sie-server:cuda12-sglang .
|
|
@@ -8,7 +8,7 @@ ARG BUNDLE=default
|
|
|
8
8
|
ARG UV_VERSION=0.9.28
|
|
9
9
|
|
|
10
10
|
# =============================================================================
|
|
11
|
-
#
|
|
11
|
+
# Dependency image: uv and standalone Python 3.12
|
|
12
12
|
# =============================================================================
|
|
13
13
|
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS deps
|
|
14
14
|
|
|
@@ -59,10 +59,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|
|
59
59
|
-e ".[gpu-metrics]"
|
|
60
60
|
|
|
61
61
|
# =============================================================================
|
|
62
|
-
#
|
|
62
|
+
# Shared CUDA base: source install and venv finalization
|
|
63
63
|
# =============================================================================
|
|
64
64
|
# Everything here is bundle-agnostic, so bundle-specific builds of a given
|
|
65
|
-
# platform share every base
|
|
65
|
+
# platform share every base image layer in local BuildKit cache and in
|
|
66
66
|
# content-addressed registry layers.
|
|
67
67
|
FROM deps AS base
|
|
68
68
|
|
|
@@ -126,7 +126,7 @@ RUN set -eux; \
|
|
|
126
126
|
find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
127
127
|
|
|
128
128
|
# =============================================================================
|
|
129
|
-
#
|
|
129
|
+
# Bundle dependency builder: bundle-specific deps
|
|
130
130
|
# =============================================================================
|
|
131
131
|
FROM base AS builder
|
|
132
132
|
|
|
@@ -173,11 +173,19 @@ RUN set -eux; \
|
|
|
173
173
|
find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
174
174
|
|
|
175
175
|
# =============================================================================
|
|
176
|
-
#
|
|
176
|
+
# Runtime image
|
|
177
177
|
# =============================================================================
|
|
178
|
-
#
|
|
179
|
-
#
|
|
180
|
-
|
|
178
|
+
# Runtime base selection is bundle-scoped. Most CUDA bundles stay on the
|
|
179
|
+
# smaller CUDA base runtime; SGLang-family bundles need the devel toolkit
|
|
180
|
+
# because flashinfer/tvm_ffi perform runtime JIT through nvcc on first decode.
|
|
181
|
+
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-default
|
|
182
|
+
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-transformers5
|
|
183
|
+
FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 AS runtime-sglang
|
|
184
|
+
ENV CUDA_HOME=/usr/local/cuda \
|
|
185
|
+
LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
|
|
186
|
+
FROM runtime-sglang AS runtime-sglang-embedding
|
|
187
|
+
|
|
188
|
+
FROM runtime-${BUNDLE} AS runtime
|
|
181
189
|
|
|
182
190
|
ENV DEBIAN_FRONTEND=noninteractive
|
|
183
191
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sie-server
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Search Inference Engine - GPU inference server for search workloads
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: <3.13,>=3.12
|
|
8
|
+
Requires-Dist: blake3<1,>=0.4
|
|
8
9
|
Requires-Dist: docling<3,>=2
|
|
9
10
|
Requires-Dist: einops<1,>=0.8
|
|
10
11
|
Requires-Dist: fastapi<1,>=0.115
|
|
@@ -17,7 +18,6 @@ Requires-Dist: loguru<1,>=0.7
|
|
|
17
18
|
Requires-Dist: msgpack-numpy<1,>=0.4
|
|
18
19
|
Requires-Dist: msgpack<2,>=1.1
|
|
19
20
|
Requires-Dist: msgspec>=0.20.0
|
|
20
|
-
Requires-Dist: nats-py<3,>=2.9
|
|
21
21
|
Requires-Dist: numpy<3,>=2
|
|
22
22
|
Requires-Dist: open-clip-torch>=2.24
|
|
23
23
|
Requires-Dist: opencv-python-headless<5,>=4
|
|
@@ -26,7 +26,7 @@ Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
|
|
|
26
26
|
Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
|
|
27
27
|
Requires-Dist: opentelemetry-sdk<2,>=1.28
|
|
28
28
|
Requires-Dist: packaging<25,>=24
|
|
29
|
-
Requires-Dist: pillow
|
|
29
|
+
Requires-Dist: pillow>=12.2.0
|
|
30
30
|
Requires-Dist: prometheus-client<1,>=0.21
|
|
31
31
|
Requires-Dist: pydantic-settings<3,>=2.6
|
|
32
32
|
Requires-Dist: pydantic<3,>=2.9
|
|
@@ -66,7 +66,7 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
|
|
|
66
66
|
|
|
67
67
|
| Env var | Default | Effect |
|
|
68
68
|
|--|--|--|
|
|
69
|
-
| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default
|
|
69
|
+
| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default because SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
|
|
70
70
|
|
|
71
71
|
For nested settings (any field with `__`), the env-var format is
|
|
72
72
|
`SIE_<TOP>__<NESTED>=value`. The complete schema is in
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
sie_id: Marqo/marqo-fashionSigLIP
|
|
2
|
+
hf_id: Marqo/marqo-fashionSigLIP
|
|
3
|
+
inputs:
|
|
4
|
+
text: true
|
|
5
|
+
image: true
|
|
6
|
+
audio: false
|
|
7
|
+
video: false
|
|
8
|
+
tasks:
|
|
9
|
+
encode:
|
|
10
|
+
dense:
|
|
11
|
+
dim: 768
|
|
12
|
+
sparse: null
|
|
13
|
+
multivector: null
|
|
14
|
+
score: null
|
|
15
|
+
extract: null
|
|
16
|
+
max_sequence_length: 64
|
|
17
|
+
profiles:
|
|
18
|
+
default:
|
|
19
|
+
max_batch_tokens: 16384
|
|
20
|
+
compute_precision: float16
|
|
21
|
+
adapter_path: sie_server.adapters.siglip:SiglipAdapter
|
|
22
|
+
adapter_options:
|
|
23
|
+
loadtime:
|
|
24
|
+
backend: open_clip
|
|
25
|
+
open_clip_model_id: hf-hub:Marqo/marqo-fashionSigLIP
|
|
26
|
+
dense_dim: 768
|
|
27
|
+
runtime:
|
|
28
|
+
normalize: true
|
|
@@ -6,40 +6,39 @@ inputs:
|
|
|
6
6
|
audio: false
|
|
7
7
|
video: false
|
|
8
8
|
tasks:
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
9
|
+
# Small, fast generation model — a viable PROD pick for simple/short-prompt
|
|
10
|
+
# tasks (cheap + high throughput; weaker on long-context reasoning). Loads in
|
|
11
|
+
# ~30s. Also doubles as the transport/walking-skeleton benchmark target.
|
|
12
12
|
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
# the
|
|
16
|
-
#
|
|
17
|
-
#
|
|
13
|
+
# ``context_length`` is the standalone PROD serving value (4096): big enough
|
|
14
|
+
# to fit the full generation benchmark pack (casehold prompts reach ~1.8k
|
|
15
|
+
# tokens, gpqa ~1.3k) so the model is comparable to the rest of the fleet on
|
|
16
|
+
# every task, while KV stays trivial at this size (112 KB/token → 4096 ≈
|
|
17
|
+
# 0.46 GB). The validation/co-residency harness, which packs two SGLang
|
|
18
|
+
# instances onto a single 22 GiB L4, does NOT depend on this default — it
|
|
19
|
+
# passes an explicit ``--max-seq-length``/``--context-length`` (see
|
|
20
|
+
# tools/bench_generation_matrix.py + run_generation_smoke.py) and caps itself
|
|
21
|
+
# to 1024 for that case.
|
|
18
22
|
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
# budget) that just happen to collide here because the model is tiny.
|
|
23
|
-
# See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
|
|
24
|
-
# non-collapsed shape.
|
|
23
|
+
# ``context_length``, ``max_sequence_length``, and ``max_batch_tokens`` are
|
|
24
|
+
# three independent knobs (per-request context, SGLang --context-length,
|
|
25
|
+
# batcher cost budget); see sibling Qwen__Qwen3-4B-Instruct-2507.yaml.
|
|
25
26
|
generate:
|
|
26
|
-
context_length:
|
|
27
|
+
context_length: 4096
|
|
27
28
|
max_output_tokens: 1024
|
|
28
29
|
capabilities:
|
|
29
30
|
grammar: []
|
|
30
31
|
streaming: true
|
|
31
32
|
tools: false
|
|
32
|
-
max_sequence_length:
|
|
33
|
+
max_sequence_length: 4096
|
|
33
34
|
# KV-cache memory math (Qwen3-0.6B, bf16):
|
|
34
35
|
# layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
|
|
35
36
|
# kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
|
|
36
|
-
#
|
|
37
|
-
#
|
|
38
|
-
# co-resident two SGLang instances on an L4. KV budgets per profile
|
|
39
|
-
# scale with the deployment scenario rather than the GPU ceiling.
|
|
37
|
+
# At ctx=4096 a single request's KV is ~0.46 GB — negligible. The co-residency
|
|
38
|
+
# harness still caps context explicitly when it has to share a card.
|
|
40
39
|
profiles:
|
|
41
40
|
default:
|
|
42
|
-
max_batch_tokens:
|
|
41
|
+
max_batch_tokens: 4096
|
|
43
42
|
compute_precision: bfloat16
|
|
44
43
|
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
45
44
|
kv_budget_tokens: 8192
|
|
@@ -66,8 +66,7 @@ max_sequence_length: 32768
|
|
|
66
66
|
# kv_budget_tokens is set to ~40% of theoretical max, matching the L4
|
|
67
67
|
# baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
|
|
68
68
|
# speculative side-cell, grammar/Outlines compile arena, fragmentation.
|
|
69
|
-
# Final empirical validation
|
|
70
|
-
# tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
|
|
69
|
+
# Final empirical validation should use concurrency and OOM-boundary sweeps.
|
|
71
70
|
profiles:
|
|
72
71
|
default:
|
|
73
72
|
# max_batch_tokens is a generic engine knob; generation does not batch
|
|
@@ -93,7 +92,7 @@ profiles:
|
|
|
93
92
|
top_p: 0.9
|
|
94
93
|
stop_tokens:
|
|
95
94
|
- "<|im_end|>"
|
|
96
|
-
#
|
|
95
|
+
# Analytical defaults for a100-40gb / h100. Production
|
|
97
96
|
# capacity also grows: with 2-4× the KV budget the context window can be
|
|
98
97
|
# widened proportionally so longer-context workloads (RAG with large
|
|
99
98
|
# retrieved passages) fit comfortably. ``max_output_tokens`` doubles
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
sie_id: Qwen/Qwen3.6-27B
|
|
2
|
+
hf_id: Qwen/Qwen3.6-27B
|
|
3
|
+
inputs:
|
|
4
|
+
# Qwen3.6-27B is a unified vision-language model (Gated DeltaNet + Gated
|
|
5
|
+
# Attention with an integrated vision encoder; same hybrid family as
|
|
6
|
+
# Qwen3.5-4B but scaled to 64 layers / hidden_size=5120). The wire
|
|
7
|
+
# surface accepts text+image via the OpenAI chat-completions schema
|
|
8
|
+
# (``image_url`` content parts); video is documented by Qwen but not
|
|
9
|
+
# yet wired through the SIE gateway.
|
|
10
|
+
text: true
|
|
11
|
+
image: true
|
|
12
|
+
audio: false
|
|
13
|
+
video: false
|
|
14
|
+
tasks:
|
|
15
|
+
generate:
|
|
16
|
+
# Native context length is 262,144 tokens (YaRN extends to ~1M).
|
|
17
|
+
# Default to 4096 here — this matches the empirically-calibrated
|
|
18
|
+
# reference point in the profile comments below
|
|
19
|
+
# (mem_fraction_static=0.93, weight 51.05 GB + kvcache 11.63 GB).
|
|
20
|
+
# 8192 was the original optimistic default but SGLang's
|
|
21
|
+
# init_memory_pool refused to fit it inside the conservative
|
|
22
|
+
# mem_fraction envelope on H100-80GB even when bumped to 0.97 —
|
|
23
|
+
# NEXTN speculative-decoding draft KV pushes the total past the
|
|
24
|
+
# available headroom. Raise both context_length AND
|
|
25
|
+
# mem_fraction_static together if you need longer contexts;
|
|
26
|
+
# benchmarking and prod requests today fit comfortably in 4096.
|
|
27
|
+
context_length: 4096
|
|
28
|
+
max_output_tokens: 4096
|
|
29
|
+
capabilities:
|
|
30
|
+
# Same constraint as Qwen3.5-4B: SGLang's outlines_backend does
|
|
31
|
+
# not implement ebnf. xgrammar smoke would pass all three; flip
|
|
32
|
+
# ``grammar_backend: xgrammar`` and re-add ``"ebnf"`` here if a
|
|
33
|
+
# consumer needs it.
|
|
34
|
+
grammar: ["json_schema", "regex"]
|
|
35
|
+
streaming: true
|
|
36
|
+
tools: true
|
|
37
|
+
# Qwen3.6 emits ``<think>...</think>`` reasoning by default. We
|
|
38
|
+
# disable it for the OpenAI-compat path so visible output is the
|
|
39
|
+
# answer only. Operators wanting CoT can flip this profile-side.
|
|
40
|
+
chat_template_kwargs:
|
|
41
|
+
enable_thinking: false
|
|
42
|
+
prewarm_grammars:
|
|
43
|
+
# Bare pattern, NOT anchored — Outlines regexes are implicitly
|
|
44
|
+
# anchored and its FSM engine rejects ``^``/``$``. See the
|
|
45
|
+
# Qwen3.5-4B model card for the full back-story.
|
|
46
|
+
- name: yes_no
|
|
47
|
+
kind: regex
|
|
48
|
+
value: "(yes|no)"
|
|
49
|
+
- name: short_answer
|
|
50
|
+
kind: json_schema
|
|
51
|
+
value:
|
|
52
|
+
type: object
|
|
53
|
+
properties:
|
|
54
|
+
answer:
|
|
55
|
+
type: string
|
|
56
|
+
required: [answer]
|
|
57
|
+
max_sequence_length: 4096
|
|
58
|
+
# ── KV-cache math (placeholder, pending Modal calibration) ──
|
|
59
|
+
#
|
|
60
|
+
# Qwen3.6-27B layer breakdown (per model card / config.json):
|
|
61
|
+
# * 64 layers in a 16 × (3 × DeltaNet + 1 × GatedAttention) pattern
|
|
62
|
+
# * 16 KV-bearing Gated-Attention layers (4 KV heads × head_dim=256)
|
|
63
|
+
# * 48 recurrent Gated-DeltaNet layers — managed by SGLang's mamba
|
|
64
|
+
# scheduler under ``--mamba-scheduler-strategy extra_buffer``
|
|
65
|
+
#
|
|
66
|
+
# BF16 weights ≈ 27e9 × 2 B ≈ 54 GB before activations / KV.
|
|
67
|
+
# * L4 (22 GB) → infeasible
|
|
68
|
+
# * A100-40GB (40 GB) → infeasible BF16; would need FP8 or TP2
|
|
69
|
+
# * H100-80GB → primary target (single-GPU, BF16)
|
|
70
|
+
# * H100×2 (160 GB) → for context >32k or large concurrencies
|
|
71
|
+
#
|
|
72
|
+
# ``kv_budget_tokens`` is a conservative analytical placeholder pending
|
|
73
|
+
# the first /get_server_info dump from tools/smoke_qwen36_27b.py on
|
|
74
|
+
# Modal. Re-calibrate from the empirical token_capacity before relying
|
|
75
|
+
# on these numbers in production.
|
|
76
|
+
profiles:
|
|
77
|
+
# L4 / A100-40GB profiles intentionally omitted — Qwen3.6-27B's BF16
|
|
78
|
+
# weights (~54 GB) do not fit. Add an FP8 or TP2 profile if those
|
|
79
|
+
# tiers become required.
|
|
80
|
+
default:
|
|
81
|
+
max_batch_tokens: 16384
|
|
82
|
+
compute_precision: bfloat16
|
|
83
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
84
|
+
# Conservative H100-80GB placeholder pending the first
|
|
85
|
+
# /get_server_info dump from tools/smoke_qwen36_27b.py on Modal.
|
|
86
|
+
# Empirical reference point (2026-05-27 smoke, mem_fraction_static=0.93,
|
|
87
|
+
# context_length=4096, no-spec): SGLang reported
|
|
88
|
+
# ``weight=51.05 GB, kvcache=11.63 GB, token_capacity=190,543``.
|
|
89
|
+
# Originally sized at ~1/12 of that capacity (16384) to leave room for NEXTN
|
|
90
|
+
# draft KV + grammar/Outlines compile arena once spec is re-enabled.
|
|
91
|
+
# Re-calibrate after a smoke run on this profile's actual settings.
|
|
92
|
+
# Halved from 16384 to leave more H100-80GB headroom for the NEXTN
|
|
93
|
+
# draft activation arena. 8192 KV slots × ~64 KB/token ≈ 0.5 GB —
|
|
94
|
+
# plenty for the realistic per-request concurrency on this profile.
|
|
95
|
+
kv_budget_tokens: 8192
|
|
96
|
+
adapter_options:
|
|
97
|
+
loadtime:
|
|
98
|
+
# 0.95 paired with ``context_length: 4096`` + the *smaller*
|
|
99
|
+
# NEXTN draft below (num_steps=2 / num_draft_tokens=2 vs the
|
|
100
|
+
# 3/4 model-card default). The 0.93+default-draft cell still
|
|
101
|
+
# OOM'd ``init_memory_pool`` because verification batch grows
|
|
102
|
+
# with num_steps × num_draft_tokens; halving both shrinks the
|
|
103
|
+
# activation arena enough to fit.
|
|
104
|
+
mem_fraction_static: 0.95
|
|
105
|
+
served_model_name: Qwen/Qwen3.6-27B
|
|
106
|
+
disable_cuda_graph: true
|
|
107
|
+
attention_backend: triton
|
|
108
|
+
grammar_backend: outlines
|
|
109
|
+
reasoning_parser: qwen3
|
|
110
|
+
tool_call_parser: qwen3_coder
|
|
111
|
+
# MTP/NEXTN per the Qwen3.x model-card recipe (SGLang implements
|
|
112
|
+
# NEXTN under the EAGLE codepath; ``/server_info`` reports
|
|
113
|
+
# ``speculative_algorithm: EAGLE``). Smaller-draft variant
|
|
114
|
+
# (num_steps=2 / num_draft_tokens=2 vs the model-card 3/4) so
|
|
115
|
+
# the verification batch fits inside H100-80GB at ctx=4096 +
|
|
116
|
+
# mfs=0.95. The 3/4 default reliably OOM'd
|
|
117
|
+
# ``init_memory_pool`` even at mfs=0.97 — the trade is a
|
|
118
|
+
# slightly smaller speculative window for the ability to fit at
|
|
119
|
+
# all. Re-tune (or re-enable 3/4) once FP8 / TP=2 is wired up.
|
|
120
|
+
speculative:
|
|
121
|
+
enabled: true
|
|
122
|
+
algorithm: nextn
|
|
123
|
+
num_steps: 2
|
|
124
|
+
eagle_topk: 1
|
|
125
|
+
num_draft_tokens: 2
|
|
126
|
+
# ``--mamba-scheduler-strategy extra_buffer`` is the required
|
|
127
|
+
# pair-flag for NEXTN spec on the hybrid Gated-DeltaNet
|
|
128
|
+
# architecture. It is also harmless with spec disabled, so the
|
|
129
|
+
# ``no-spec`` profile keeps the same flags.
|
|
130
|
+
extra_launch_args:
|
|
131
|
+
- "--mamba-scheduler-strategy"
|
|
132
|
+
- "extra_buffer"
|
|
133
|
+
- "--disable-overlap-schedule"
|
|
134
|
+
runtime:
|
|
135
|
+
first_chunk_timeout_s: 90
|
|
136
|
+
inter_chunk_timeout_s: 15
|
|
137
|
+
overall_timeout_s: 600
|
|
138
|
+
default_sampling:
|
|
139
|
+
temperature: 0.7
|
|
140
|
+
top_p: 0.8
|
|
141
|
+
presence_penalty: 1.5
|
|
142
|
+
stop_tokens:
|
|
143
|
+
- "<|im_end|>"
|
|
144
|
+
h100:
|
|
145
|
+
max_batch_tokens: 32768
|
|
146
|
+
compute_precision: bfloat16
|
|
147
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
148
|
+
# Same H100-80GB target as ``default``; this profile widens
|
|
149
|
+
# ``max_batch_tokens`` for batch-heavy workloads once calibration
|
|
150
|
+
# confirms KV headroom.
|
|
151
|
+
kv_budget_tokens: 32768
|
|
152
|
+
adapter_options:
|
|
153
|
+
loadtime:
|
|
154
|
+
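# NOTE(review): 0.93 here vs 0.95 in ``default``, and this profile uses the 3/4 NEXTN draft that ``default``'s mfs comment reports OOMing at 0.93 — confirm.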
|
|
155
|
+
mem_fraction_static: 0.93
|
|
156
|
+
served_model_name: Qwen/Qwen3.6-27B
|
|
157
|
+
disable_cuda_graph: true
|
|
158
|
+
attention_backend: triton
|
|
159
|
+
grammar_backend: outlines
|
|
160
|
+
reasoning_parser: qwen3
|
|
161
|
+
tool_call_parser: qwen3_coder
|
|
162
|
+
speculative:
|
|
163
|
+
enabled: true
|
|
164
|
+
algorithm: nextn
|
|
165
|
+
num_steps: 3
|
|
166
|
+
eagle_topk: 1
|
|
167
|
+
num_draft_tokens: 4
|
|
168
|
+
# ``--disable-overlap-schedule`` is the required pair-flag for
|
|
169
|
+
# NEXTN + mamba-scheduler ``extra_buffer`` on the hybrid Gated-
|
|
170
|
+
# DeltaNet architecture (same constraint as Qwen3.5-4B).
|
|
171
|
+
extra_launch_args:
|
|
172
|
+
- "--mamba-scheduler-strategy"
|
|
173
|
+
- "extra_buffer"
|
|
174
|
+
- "--disable-overlap-schedule"
|
|
175
|
+
runtime:
|
|
176
|
+
first_chunk_timeout_s: 90
|
|
177
|
+
inter_chunk_timeout_s: 15
|
|
178
|
+
overall_timeout_s: 600
|
|
179
|
+
default_sampling:
|
|
180
|
+
temperature: 0.7
|
|
181
|
+
top_p: 0.8
|
|
182
|
+
presence_penalty: 1.5
|
|
183
|
+
stop_tokens:
|
|
184
|
+
- "<|im_end|>"
|
|
185
|
+
# RTX PRO 6000 (96 GB GDDR7, Blackwell Server Edition, sm_120) profile.
|
|
186
|
+
# FP8-first for max throughput: ``--quantization fp8`` (SGLang online
|
|
187
|
+
# dynamic FP8 quant of the BF16 checkpoint) via the ``extra_launch_args``
|
|
188
|
+
# passthrough — ``compute_precision`` can only express the ``--dtype``
|
|
189
|
+
# axis (float16/bfloat16/float32), not the orthogonal ``--quantization``
|
|
190
|
+
# flag, so FP8 rides the same escape hatch already used for the mamba
|
|
191
|
+
# scheduler. FP8 halves weight memory (~54 → ~27 GB), which frees room
|
|
192
|
+
# for the *model-card* NEXTN 3/4 draft (num_steps=3 / num_draft_tokens=4)
|
|
193
|
+
# that OOM'd ``init_memory_pool`` on H100-80GB even at mfs=0.97. The +16 GB
|
|
194
|
+
# over H100 plus the FP8 weight saving is what makes 3/4 fit here.
|
|
195
|
+
#
|
|
196
|
+
# ACCURACY CONTRACT: FP8 is lossy. This profile is validated to within the
|
|
197
|
+
# Wilson 95% CI of the *BF16* baseline on all four generation tasks (see
|
|
198
|
+
# docs/adr/0001). If FP8 misses parity after bounded tuning, fall back to
|
|
199
|
+
# BF16 + NEXTN 3/4 (drop the ``--quantization fp8`` pair below; the 96 GB
|
|
200
|
+
# still fits the 3/4 draft in BF16). KV cache stays BF16 here — add
|
|
201
|
+
# ``--kv-cache-dtype fp8_e4m3`` only if memory/throughput needs it AND
|
|
202
|
+
# accuracy still holds (KV FP8 is usually the first thing to cost accuracy).
|
|
203
|
+
#
|
|
204
|
+
# Standalone block (no ``extends``): production ``resolve_profile`` does a
|
|
205
|
+
# full-replace of ``loadtime`` for extending profiles, so a partial child
|
|
206
|
+
# would drop inherited launch flags and desync the via-SIE path from the
|
|
207
|
+
# bare-SGLang bench control.
|
|
208
|
+
rtx-pro-6000:
|
|
209
|
+
max_batch_tokens: 32768
|
|
210
|
+
compute_precision: bfloat16
|
|
211
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
212
|
+
# FP8 weights (~27 GB) + 96 GB total leaves generous KV/draft headroom;
|
|
213
|
+
# start conservative and raise after the first /get_server_info dump on
|
|
214
|
+
# the actual RTX-PRO-6000 smoke.
|
|
215
|
+
kv_budget_tokens: 16384
|
|
216
|
+
adapter_options:
|
|
217
|
+
loadtime:
|
|
218
|
+
# 0.90 is a conservative starting point — FP8 weights free enough
|
|
219
|
+
# memory that the 3/4 NEXTN verification batch should fit with room
|
|
220
|
+
# to spare. Iterate upward (smoke ``--mem-fraction-static``) once the
|
|
221
|
+
# first boot confirms Blackwell sm_120 + FP8 GEMM kernels are present.
|
|
222
|
+
mem_fraction_static: 0.90
|
|
223
|
+
served_model_name: Qwen/Qwen3.6-27B
|
|
224
|
+
disable_cuda_graph: true
|
|
225
|
+
# triton attention matches the rest of the Qwen3.x family. Blackwell
|
|
226
|
+
# (sm_120) kernel coverage for triton + FP8 + NEXTN is the first thing
|
|
227
|
+
# the boot smoke verifies; switch to flashinfer here if triton lacks
|
|
228
|
+
# sm_120 coverage in the pinned SGLang build.
|
|
229
|
+
attention_backend: triton
|
|
230
|
+
grammar_backend: outlines
|
|
231
|
+
reasoning_parser: qwen3
|
|
232
|
+
tool_call_parser: qwen3_coder
|
|
233
|
+
# Model-card NEXTN 3/4 — restored here (vs ``default``'s conservative
|
|
234
|
+
# 2/2) because FP8 + 96 GB fits the larger verification batch that
|
|
235
|
+
# OOM'd on H100-80GB.
|
|
236
|
+
speculative:
|
|
237
|
+
enabled: true
|
|
238
|
+
algorithm: nextn
|
|
239
|
+
num_steps: 3
|
|
240
|
+
eagle_topk: 1
|
|
241
|
+
num_draft_tokens: 4
|
|
242
|
+
# ``--quantization fp8`` rides the passthrough (see header comment).
|
|
243
|
+
# The mamba-scheduler + overlap pair-flags are the required NEXTN
|
|
244
|
+
# companions on the hybrid Gated-DeltaNet architecture. List is the
|
|
245
|
+
# FULL set (production full-replaces ``extra_launch_args``, not merge).
|
|
246
|
+
extra_launch_args:
|
|
247
|
+
- "--quantization"
|
|
248
|
+
- "fp8"
|
|
249
|
+
- "--mamba-scheduler-strategy"
|
|
250
|
+
- "extra_buffer"
|
|
251
|
+
- "--disable-overlap-schedule"
|
|
252
|
+
runtime:
|
|
253
|
+
first_chunk_timeout_s: 90
|
|
254
|
+
inter_chunk_timeout_s: 15
|
|
255
|
+
overall_timeout_s: 600
|
|
256
|
+
# Qwen3.6-27B empty-response fix baked in: under greedy/low-temp the
|
|
257
|
+
# chat template emits EOS as the FIRST token on a large fraction of
|
|
258
|
+
# prompts (n=50 RTX PRO 6000 smoke: casehold 23/50, gpqa 29/50 came back
|
|
259
|
+
# EMPTY). The floor ``min_new_tokens>=1`` fixes it — validated on the
|
|
260
|
+
# 6000: min_tokens=10 → 0/50 empty on all four tasks, accuracy within
|
|
261
|
+
# Wilson 95% CI of the BF16 baseline. ``min_new_tokens`` is the
|
|
262
|
+
# SGLang-native key; the adapter merges this dict via ``setdefault``,
|
|
263
|
+
# so a request-supplied ``min_tokens`` still wins. NOTE: this only
|
|
264
|
+
# takes effect because ``runtime.default_sampling`` is now wired into
|
|
265
|
+
# the adapter (core/loader.py); before that fix a key here was a
|
|
266
|
+
# silent no-op and chat clients had to pass ``min_tokens`` themselves.
|
|
267
|
+
default_sampling:
|
|
268
|
+
temperature: 0.7
|
|
269
|
+
top_p: 0.8
|
|
270
|
+
presence_penalty: 1.5
|
|
271
|
+
min_new_tokens: 10
|
|
272
|
+
stop_tokens:
|
|
273
|
+
- "<|im_end|>"
|
|
274
|
+
# No-speculative baseline — for SIE-vs-raw-SGLang ablation cells so
|
|
275
|
+
# spec-decoding's contribution can be measured independently. Keeps
|
|
276
|
+
# the same ``extra_launch_args`` as ``default`` / ``h100`` so a config
|
|
277
|
+
# diff between them shows only the ``speculative`` block (the intent
|
|
278
|
+
# of the ablation), matching Qwen3.5-4B's convention.
|
|
279
|
+
no-spec:
|
|
280
|
+
max_batch_tokens: 32768
|
|
281
|
+
compute_precision: bfloat16
|
|
282
|
+
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
283
|
+
kv_budget_tokens: 65536
|
|
284
|
+
adapter_options:
|
|
285
|
+
loadtime:
|
|
286
|
+
mem_fraction_static: 0.85
|
|
287
|
+
served_model_name: Qwen/Qwen3.6-27B
|
|
288
|
+
disable_cuda_graph: true
|
|
289
|
+
attention_backend: triton
|
|
290
|
+
grammar_backend: outlines
|
|
291
|
+
reasoning_parser: qwen3
|
|
292
|
+
tool_call_parser: qwen3_coder
|
|
293
|
+
speculative:
|
|
294
|
+
enabled: false
|
|
295
|
+
extra_launch_args:
|
|
296
|
+
- "--mamba-scheduler-strategy"
|
|
297
|
+
- "extra_buffer"
|
|
298
|
+
- "--disable-overlap-schedule"
|
|
299
|
+
runtime:
|
|
300
|
+
first_chunk_timeout_s: 90
|
|
301
|
+
inter_chunk_timeout_s: 15
|
|
302
|
+
overall_timeout_s: 600
|
|
303
|
+
default_sampling:
|
|
304
|
+
temperature: 0.7
|
|
305
|
+
top_p: 0.8
|
|
306
|
+
presence_penalty: 1.5
|
|
307
|
+
stop_tokens:
|
|
308
|
+
- "<|im_end|>"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
sie_id: opendatalab/MinerU2.5-Pro-2604-1.2B
|
|
2
|
+
hf_id: opendatalab/MinerU2.5-Pro-2604-1.2B
|
|
3
|
+
hf_revision: d3f5e08d073c21466bbabe21c71bb1e9c2e595da
|
|
4
|
+
inputs:
|
|
5
|
+
text: false
|
|
6
|
+
image: true
|
|
7
|
+
audio: false
|
|
8
|
+
video: false
|
|
9
|
+
tasks:
|
|
10
|
+
encode: null
|
|
11
|
+
score: null
|
|
12
|
+
extract: {}
|
|
13
|
+
profiles:
|
|
14
|
+
default:
|
|
15
|
+
max_batch_tokens: 16384
|
|
16
|
+
compute_precision: bfloat16
|
|
17
|
+
adapter_path: sie_server.adapters.mineru_vl:MinerUVLAdapter
|
|
18
|
+
adapter_options:
|
|
19
|
+
loadtime:
|
|
20
|
+
default_task: "[default]"
|
|
21
|
+
runtime:
|
|
22
|
+
task: "[default]"
|
|
23
|
+
max_new_tokens: 4096
|
|
24
|
+
num_beams: 1
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "sie-server"
|
|
3
|
-
version = "0.4.1"
|
|
3
|
+
version = "0.4.2"
|
|
4
4
|
description = "Search Inference Engine - GPU inference server for search workloads"
|
|
5
5
|
requires-python = ">=3.12,<3.13"
|
|
6
6
|
license = { text = "Apache-2.0" }
|
|
@@ -39,7 +39,7 @@ dependencies = [
|
|
|
39
39
|
# SigLIP (Marqo/marqo-ecommerce-embeddings-B native open_clip loader)
|
|
40
40
|
"open-clip-torch>=2.24",
|
|
41
41
|
# Image processing
|
|
42
|
-
"pillow>=
|
|
42
|
+
"pillow>=12.2.0",
|
|
43
43
|
"numpy>=2,<3",
|
|
44
44
|
"torchvision>=0.18,<1", # Required by some HF models (e.g., nvidia/llama-nemoretriever)
|
|
45
45
|
# Config
|
|
@@ -50,8 +50,6 @@ dependencies = [
|
|
|
50
50
|
"packaging>=24,<25",
|
|
51
51
|
# Hot-reload
|
|
52
52
|
"watchdog>=6,<7",
|
|
53
|
-
# NATS pub/sub for config notifications
|
|
54
|
-
"nats-py>=2.9,<3",
|
|
55
53
|
# Observability
|
|
56
54
|
"opentelemetry-api>=1.28,<2",
|
|
57
55
|
"opentelemetry-sdk>=1.28,<2",
|
|
@@ -66,6 +64,11 @@ dependencies = [
|
|
|
66
64
|
"msgspec>=0.20.0",
|
|
67
65
|
# Async HTTP client (telemetry sender)
|
|
68
66
|
"httpx>=0.28.1",
|
|
67
|
+
# BLAKE3 used to cross-check the worker-sidecar's `PreparedTokens`
|
|
68
|
+
# tokenizer_id. Tiny (<200KB, pure-Rust via PyO3), mandatory for
|
|
69
|
+
# the encode / score fast-path consumer — see
|
|
70
|
+
# `sie_server.core.preprocessor.text.TextPreprocessor`.
|
|
71
|
+
"blake3>=0.4,<1",
|
|
69
72
|
]
|
|
70
73
|
|
|
71
74
|
[project.optional-dependencies]
|