sie-server 0.4.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sie_server-0.4.0 → sie_server-0.4.2}/Dockerfile.cpu +6 -12
- {sie_server-0.4.0 → sie_server-0.4.2}/Dockerfile.cuda12 +18 -16
- {sie_server-0.4.0 → sie_server-0.4.2}/PKG-INFO +3 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/README.md +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/bundles/default.yaml +2 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/bundles/sglang-embedding.yaml +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/bundles/sglang.yaml +21 -21
- sie_server-0.4.2/models/Marqo__marqo-fashionSigLIP.yaml +28 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-0.6B.yaml +20 -21
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-4B-Instruct-2507.yaml +2 -3
- sie_server-0.4.2/models/Qwen__Qwen3.6-27B.yaml +308 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/docling.yaml +1 -1
- sie_server-0.4.2/models/opendatalab__MinerU2.5-Pro-2604-1.2B.yaml +24 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/openapi.json +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/pyproject.toml +10 -6
- sie_server-0.4.2/scripts/generate_tokenize_fixture.py +203 -0
- sie_server-0.4.2/src/sie_server/__init__.py +9 -0
- sie_server-0.4.2/src/sie_server/_ipc_test_harness.py +356 -0
- sie_server-0.4.2/src/sie_server/adapter_call_loop.py +439 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_generation_base.py +2 -4
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_utils.py +4 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/base.py +2 -5
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash_cross_encoder/__init__.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/clip/__init__.py +19 -6
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colpali/__init__.py +18 -13
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colqwen2/__init__.py +6 -4
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colqwen3/__init__.py +72 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/docling/__init__.py +29 -8
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/donut/__init__.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliner/__init__.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/glirel/__init__.py +0 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/glm_ocr/__init__.py +105 -0
- sie_server-0.4.2/src/sie_server/adapters/mineru_vl/__init__.py +434 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nemo_colembed/__init__.py +49 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/peft_lora_mixin.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/pytorch_embedding/__init__.py +17 -4
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/_server.py +1 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/embedding.py +1 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/generation.py +11 -5
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/siglip/__init__.py +3 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/encode.py +3 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/extract.py +10 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/health.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/helpers.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/openai_compat.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/serialization.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/ws.py +25 -9
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/app/app_factory.py +56 -208
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/cli.py +20 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/config/engine.py +79 -6
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/config/model.py +4 -8
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/adaptive_batching.py +205 -10
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/batcher.py +9 -6
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/deps.py +1 -45
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/disk_cache.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/encode_pipeline.py +70 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/extract_cost.py +1 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/hot_reload.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/inference.py +2 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/inference_output.py +0 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/loader.py +21 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/memory.py +2 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/model_loader.py +7 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/pool_isolation.py +2 -5
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/postprocessor.py +0 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/postprocessor_registry.py +2 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/prepared.py +21 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/__init__.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/base.py +1 -1
- sie_server-0.4.2/src/sie_server/core/preprocessor/text.py +495 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/vision.py +175 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor_registry.py +2 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/readiness.py +26 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/registry.py +10 -4
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/timing.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/tokenizer.py +2 -18
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/__init__.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/model_worker.py +167 -12
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/types.py +47 -2
- sie_server-0.4.2/src/sie_server/ipc_server.py +679 -0
- sie_server-0.4.2/src/sie_server/ipc_types.py +514 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/__init__.py +0 -6
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/gpu.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/metrics.py +53 -13
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/prometheus.py +0 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/tracing.py +0 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/streaming.py +110 -30
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/work_class_scheduler.py +4 -5
- sie_server-0.4.2/src/sie_server/queue_executor.py +1088 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/inputs.py +2 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/outputs.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_clip.py +52 -8
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_docling.py +64 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_lora_integration.py +1 -1
- sie_server-0.4.2/tests/adapters/test_mineru_vl.py +380 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_pytorch_embedding_revision.py +34 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sentence_transformer.py +61 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sglang_generation.py +2 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_stablebridge_integration.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_visual_document.py +18 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_dtype.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_endpoint.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_timing.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_extract.py +26 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_generate.py +2 -6
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/app/test_app_factory.py +173 -17
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_bundle_coverage.py +3 -6
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_config.py +9 -2
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_profile_backend_consistency.py +3 -12
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/conftest.py +31 -22
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_adaptive_batching.py +279 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_batcher.py +13 -11
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_loader.py +79 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_lora_generation_exclusion.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_model_load_timeout.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_preprocessor.py +358 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_async.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_core.py +39 -1
- sie_server-0.4.2/tests/core/test_worker_passthrough.py +220 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/integration/test_chat_completions.py +2 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/integration/test_grammar_generate.py +1 -1
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_generation_metrics.py +4 -4
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_metrics.py +110 -44
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_trace_propagation.py +3 -3
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_grammar_prewarm.py +2 -4
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_streaming.py +14 -26
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_work_class_scheduler.py +1 -1
- sie_server-0.4.2/tests/test_adapter_call_loop.py +295 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_docker_integration.py +9 -9
- sie_server-0.4.2/tests/test_ipc_server.py +712 -0
- sie_server-0.4.2/tests/test_ipc_types_raw_output.py +162 -0
- sie_server-0.4.2/tests/test_model_yaml_filenames.py +35 -0
- sie_server-0.4.2/tests/test_parity_run_batch.py +332 -0
- sie_server-0.4.2/tests/test_queue_executor.py +724 -0
- sie_server-0.4.2/tests/test_queue_executor_stage1d.py +622 -0
- sie_server-0.4.2/tests/test_readiness.py +53 -0
- sie_server-0.4.2/tests/test_server_smoke.py +14 -0
- sie_server-0.4.2/tests/test_stage1d_byte_identity.py +393 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_media_bytes.py +38 -11
- sie_server-0.4.0/src/sie_server/__init__.py +0 -3
- sie_server-0.4.0/src/sie_server/core/preprocessor/text.py +0 -268
- sie_server-0.4.0/src/sie_server/nats_pull_loop.py +0 -2458
- sie_server-0.4.0/src/sie_server/nats_subscriber.py +0 -231
- sie_server-0.4.0/tests/test_nats_pull_loop.py +0 -924
- sie_server-0.4.0/tests/test_nats_pull_loop_batching.py +0 -1291
- sie_server-0.4.0/tests/test_server_smoke.py +0 -8
- {sie_server-0.4.0 → sie_server-0.4.2}/.gitignore +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/CONTRIBUTING.md +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/LICENSE +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/bundles/transformers5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-1.5B-instruct.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-modernbert-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-multilingual-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Alibaba-NLP__gte-reranker-modernbert-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-m3.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-reranker-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-reranker-large.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/BAAI__bge-reranker-v2-m3.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/EmergentMethods__gliner_large_news-v2.1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/GritLM__GritLM-7B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/IDEA-Research__grounding-dino-tiny.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Ihor__gliner-biomed-large-v1.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Marqo__marqo-ecommerce-embeddings-B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-base-zeroshot-v2.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/MoritzLaurer__deberta-v3-large-zeroshot-v2.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/NeuML__gliner-bert-tiny.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/NovaSearch__stella_en_1.5B_v5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/NovaSearch__stella_en_400M_v5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/PaddlePaddle__PaddleOCR-VL-1.5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-0.6B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Embedding-4B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-0.6B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-Reranker-4B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Embedding-2B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-VL-Reranker-2B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3.5-4B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-2_R.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Salesforce__SFR-Embedding-Mistral.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/Snowflake__snowflake-arctic-embed-m-v2.0.yaml +0 -0
- /sie_server-0.4.0/models/tomoroai__tomoro-colqwen3-embed-4b.yaml → /sie_server-0.4.2/models/TomoroAI__tomoro-colqwen3-embed-4b.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/answerdotai__ModernBERT-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/answerdotai__answerai-colbert-small-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/colbert-ir__colbertv2.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-12-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/cross-encoder__ms-marco-MiniLM-L-6-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/cross-encoder__nli-deberta-v3-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/fastino__gliner2-base-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/gliner-community__gliner_large-v2.5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/gliner-community__gliner_medium-v2.5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/gliner-community__gliner_small-v2.5.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/google__embeddinggemma-300m.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/google__owlv2-base-patch16-ensemble.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-224.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/google__siglip-so400m-patch14-384.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/google__siglip2-base-patch16-224.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-30m-sparse.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-english-r2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/ibm-granite__granite-embedding-small-english-r2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-base-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-large-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-mistral-7b-instruct.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__e5-small-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large-instruct.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/intfloat__multilingual-e5-large.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/jackboyla__glirel-large-v0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/jinaai__jina-colbert-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/jinaai__jina-reranker-v2-base-multilingual.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-base-v1.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v1.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-large-v3.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliclass-small-v1.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__gliner-bi-base-v2.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/knowledgator__modern-gliner-bi-base-v1.0.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/laion__CLIP-ViT-B-32-laion2B-s34B-b79K.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/laion__CLIP-ViT-H-14-laion2B-s32B-b79K.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/lightonai__GTE-ModernColBERT-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/lightonai__LightOnOCR-2-1B.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/lightonai__Reason-ModernColBERT.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/microsoft__Florence-2-base-ft.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/microsoft__Florence-2-base.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/microsoft__Florence-2-large.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-colbert-large-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-edge-colbert-v0-32m.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-base-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/mixedbread-ai__mxbai-rerank-large-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/mynkchaudhry__Florence-2-FT-DocVQA.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-cord-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-docvqa.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/naver-clova-ix__donut-base-finetuned-rvlcdip.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/naver__splade-cocondenser-selfdistil.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/naver__splade-v3.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/nomic-ai__nomic-embed-text-v2-moe.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/numind__NuNER_Zero-span.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/numind__NuNER_Zero.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__NV-Embed-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__llama-embed-nemotron-8b.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__llama-nemoretriever-colembed-3b-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/nvidia__nemotron-colembed-vl-4b-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/openai__clip-vit-base-patch32.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/openai__clip-vit-large-patch14.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-distill.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v2-mini.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-distill.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-doc-v3-gte.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/opensearch-project__opensearch-neural-sparse-encoding-v2-distill.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/prithivida__Splade_PP_en_v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/rasyosef__splade-mini.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/sentence-transformers__all-MiniLM-L6-v2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/sugiv__stablebridge-pruner-highlighter.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_large-v2.1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_medium-v2.1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_multi-v2.1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_multi_pii-v1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/urchade__gliner_small-v2.1.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/vidore__colpali-v1.3-hf.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/vidore__colqwen2.5-v0.2.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/models/zai-org__GLM-OCR.yaml +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_base_adapter.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_flash_base.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_spec.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/_types.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bert_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flag/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/bge_m3_score_mixin.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colbert/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colbert_modernbert_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/colbert_rotary_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/cross_encoder/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/errors.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/florence2/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliclass/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliner2/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gliner_bi/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/grounding_dino/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/gte_sparse_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/jina_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/lighton_ocr/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/modernbert_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nli_classification_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/nomic_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/owlv2/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/paddleocr_vl/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen2_flash_cross_encoder/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_embedding/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/qwen3_vl_reranker/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/rope_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sentence_transformer/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/sglang/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/splade_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/stablebridge_pruner/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/adapters/xlm_roberta_flash/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/generate.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/metrics.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/models.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/openapi.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/options.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/root.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/score.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/api/validation.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/app/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/app/app_state_config.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/config/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/gpu_health.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/hf_env.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/load_errors.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/logging.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/oom.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/preprocessor/image.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/shutdown.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/text_tokens.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/watcher.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/base.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/encode.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/extract.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/handlers/score.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/core/worker/oom_recovery.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/health/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/health/nats_publisher.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/health/saturation.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/main.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/observability/telemetry.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/admission.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/base.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/grammar_cache.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/grammar_compile.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/tool_call_grammar.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/processors/tool_call_parser.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/static/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/static/index.html +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/grammar.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/openapi.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/overflow_policy.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/requests.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/src/sie_server/types/responses.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_base.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_bge_m3.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_bge_m3_flash.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_colbert.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_docling_smoke.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_donut.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_factory_integration.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_flash_base.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_florence2.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_gliclass_overflow_policy.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_glirel.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_glm_ocr.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_grounding_dino.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_gte_sparse.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_jina_flash_cross_encoder.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_lighton_ocr.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_lora.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_paddleocr_vl.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_runtime_options.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sglang.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_siglip.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_sparse_aggregation.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/adapters/test_stablebridge_pruner.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_json_schema.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_encode_validation.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_extract_integration.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_extract_oom.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_health.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_models.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_openai_compat.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_score.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_version_header.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/api/test_ws.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/app/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/config/test_model_prewarm_grammars.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_disk_cache.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_gpu_health.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_hot_reload.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_idle_evict.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_inference.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_logging.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_memory.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_oom_detection.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_pool_isolation.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_postprocessor.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_postprocessor_registry.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_prepared.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_preprocessor_registry.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_quantization.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_readiness.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_core.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_deps.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_failed_state.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_memory.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_registry_multi_model.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_shutdown.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_timing.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_watcher.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_backpressure.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_extract.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_lora.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_options.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/test_worker_score.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/worker/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/core/worker/test_oom_recovery.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/test_nats_publisher.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/test_saturation.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/health/test_worker_id_consistency.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/integration/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_telemetry.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/observability/test_tracing.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_grammar_cache.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_grammar_compile.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_streaming_admission.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_streaming_integration.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_tool_call_grammar.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/processors/test_tool_call_parser.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_all_models.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_openapi_export.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_sdk_integration.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/test_sparse_integration.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/__init__.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_inputs.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_inputs_json_decode.py +0 -0
- {sie_server-0.4.0 → sie_server-0.4.2}/tests/type_defs/test_types.py +0 -0
|
@@ -6,10 +6,9 @@
|
|
|
6
6
|
# docker buildx build --platform linux/amd64,linux/arm64 -f packages/sie_server/Dockerfile.cpu -t sie-server:cpu .
|
|
7
7
|
|
|
8
8
|
ARG BUNDLE=default
|
|
9
|
-
ARG SIE_DEPS_IMAGE=
|
|
10
9
|
|
|
11
10
|
# =============================================================================
|
|
12
|
-
#
|
|
11
|
+
# Dependency image: pyproject-only cache seed
|
|
13
12
|
# =============================================================================
|
|
14
13
|
FROM python:3.12-slim-bookworm AS deps
|
|
15
14
|
|
|
@@ -58,9 +57,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|
|
58
57
|
-e ".[gpu-metrics]"
|
|
59
58
|
|
|
60
59
|
# =============================================================================
|
|
61
|
-
#
|
|
60
|
+
# Shared runtime base: source install and venv finalization
|
|
62
61
|
# =============================================================================
|
|
63
|
-
# Bundle-agnostic: all base
|
|
62
|
+
# Bundle-agnostic: all base image layers are shared across bundles of this
|
|
64
63
|
# platform in local BuildKit cache and in content-addressed registry layers.
|
|
65
64
|
FROM deps AS base
|
|
66
65
|
|
|
@@ -108,9 +107,9 @@ RUN set -eux; \
|
|
|
108
107
|
find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
109
108
|
|
|
110
109
|
# =============================================================================
|
|
111
|
-
#
|
|
110
|
+
# Bundle dependency builder: bundle-specific deps
|
|
112
111
|
# =============================================================================
|
|
113
|
-
FROM base AS
|
|
112
|
+
FROM base AS builder
|
|
114
113
|
|
|
115
114
|
ARG BUNDLE
|
|
116
115
|
|
|
@@ -151,12 +150,7 @@ RUN set -eux; \
|
|
|
151
150
|
find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
152
151
|
|
|
153
152
|
# =============================================================================
|
|
154
|
-
#
|
|
155
|
-
# =============================================================================
|
|
156
|
-
FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
|
|
157
|
-
|
|
158
|
-
# =============================================================================
|
|
159
|
-
# Stage 4: Runtime
|
|
153
|
+
# Runtime image
|
|
160
154
|
# =============================================================================
|
|
161
155
|
FROM python:3.12-slim-bookworm AS runtime
|
|
162
156
|
|
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
# syntax=docker/dockerfile:1
|
|
2
|
-
# SIE Server - CUDA 12
|
|
2
|
+
# SIE Server - CUDA 12 Image
|
|
3
3
|
# Build from repo root:
|
|
4
4
|
# docker build -f packages/sie_server/Dockerfile.cuda12 -t sie-server:cuda12-default .
|
|
5
5
|
# docker build -f packages/sie_server/Dockerfile.cuda12 --build-arg BUNDLE=sglang -t sie-server:cuda12-sglang .
|
|
6
6
|
|
|
7
7
|
ARG BUNDLE=default
|
|
8
8
|
ARG UV_VERSION=0.9.28
|
|
9
|
-
ARG SIE_DEPS_IMAGE=
|
|
10
9
|
|
|
11
10
|
# =============================================================================
|
|
12
|
-
#
|
|
11
|
+
# Dependency image: uv and standalone Python 3.12
|
|
13
12
|
# =============================================================================
|
|
14
13
|
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS deps
|
|
15
14
|
|
|
@@ -60,10 +59,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|
|
60
59
|
-e ".[gpu-metrics]"
|
|
61
60
|
|
|
62
61
|
# =============================================================================
|
|
63
|
-
#
|
|
62
|
+
# Shared CUDA base: source install and venv finalization
|
|
64
63
|
# =============================================================================
|
|
65
64
|
# Everything here is bundle-agnostic, so bundle-specific builds of a given
|
|
66
|
-
# platform share every base
|
|
65
|
+
# platform share every base image layer in local BuildKit cache and in
|
|
67
66
|
# content-addressed registry layers.
|
|
68
67
|
FROM deps AS base
|
|
69
68
|
|
|
@@ -127,9 +126,9 @@ RUN set -eux; \
|
|
|
127
126
|
find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
128
127
|
|
|
129
128
|
# =============================================================================
|
|
130
|
-
#
|
|
129
|
+
# Bundle dependency builder: bundle-specific deps
|
|
131
130
|
# =============================================================================
|
|
132
|
-
FROM base AS
|
|
131
|
+
FROM base AS builder
|
|
133
132
|
|
|
134
133
|
ARG BUNDLE
|
|
135
134
|
|
|
@@ -174,16 +173,19 @@ RUN set -eux; \
|
|
|
174
173
|
find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
|
|
175
174
|
|
|
176
175
|
# =============================================================================
|
|
177
|
-
#
|
|
176
|
+
# Runtime image
|
|
178
177
|
# =============================================================================
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
#
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
178
|
+
# Runtime base selection is bundle-scoped. Most CUDA bundles stay on the
|
|
179
|
+
# smaller CUDA base runtime; SGLang-family bundles need the devel toolkit
|
|
180
|
+
# because flashinfer/tvm_ffi perform runtime JIT through nvcc on first decode.
|
|
181
|
+
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-default
|
|
182
|
+
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-transformers5
|
|
183
|
+
FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 AS runtime-sglang
|
|
184
|
+
ENV CUDA_HOME=/usr/local/cuda \
|
|
185
|
+
LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
|
|
186
|
+
FROM runtime-sglang AS runtime-sglang-embedding
|
|
187
|
+
|
|
188
|
+
FROM runtime-${BUNDLE} AS runtime
|
|
187
189
|
|
|
188
190
|
ENV DEBIAN_FRONTEND=noninteractive
|
|
189
191
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sie-server
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Search Inference Engine - GPU inference server for search workloads
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: <3.13,>=3.12
|
|
8
|
+
Requires-Dist: blake3<1,>=0.4
|
|
8
9
|
Requires-Dist: docling<3,>=2
|
|
9
10
|
Requires-Dist: einops<1,>=0.8
|
|
10
11
|
Requires-Dist: fastapi<1,>=0.115
|
|
@@ -17,7 +18,6 @@ Requires-Dist: loguru<1,>=0.7
|
|
|
17
18
|
Requires-Dist: msgpack-numpy<1,>=0.4
|
|
18
19
|
Requires-Dist: msgpack<2,>=1.1
|
|
19
20
|
Requires-Dist: msgspec>=0.20.0
|
|
20
|
-
Requires-Dist: nats-py<3,>=2.9
|
|
21
21
|
Requires-Dist: numpy<3,>=2
|
|
22
22
|
Requires-Dist: open-clip-torch>=2.24
|
|
23
23
|
Requires-Dist: opencv-python-headless<5,>=4
|
|
@@ -26,7 +26,7 @@ Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
|
|
|
26
26
|
Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
|
|
27
27
|
Requires-Dist: opentelemetry-sdk<2,>=1.28
|
|
28
28
|
Requires-Dist: packaging<25,>=24
|
|
29
|
-
Requires-Dist: pillow
|
|
29
|
+
Requires-Dist: pillow>=12.2.0
|
|
30
30
|
Requires-Dist: prometheus-client<1,>=0.21
|
|
31
31
|
Requires-Dist: pydantic-settings<3,>=2.6
|
|
32
32
|
Requires-Dist: pydantic<3,>=2.9
|
|
@@ -66,7 +66,7 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
|
|
|
66
66
|
|
|
67
67
|
| Env var | Default | Effect |
|
|
68
68
|
|--|--|--|
|
|
69
|
-
| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default
|
|
69
|
+
| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default because SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
|
|
70
70
|
|
|
71
71
|
For nested settings (any field with `__`), the env-var format is
|
|
72
72
|
`SIE_<TOP>__<NESTED>=value`. The complete schema is in
|
|
@@ -43,6 +43,7 @@ adapters:
|
|
|
43
43
|
- sie_server.adapters.florence2
|
|
44
44
|
- sie_server.adapters.docling
|
|
45
45
|
- sie_server.adapters.paddleocr_vl
|
|
46
|
+
- sie_server.adapters.mineru_vl
|
|
46
47
|
deps:
|
|
47
48
|
# Most flash adapters; sentence_transformer needs >=4.57
|
|
48
49
|
transformers: '>=4.57,<5'
|
|
@@ -78,5 +79,5 @@ deps:
|
|
|
78
79
|
docling: '>=2,<3'
|
|
79
80
|
# Flash Attention 2 — CUDA only, prebuilt wheel
|
|
80
81
|
flash-attn:
|
|
81
|
-
url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.
|
|
82
|
+
url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.11/flash_attn-2.7.4+cu129torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
|
|
82
83
|
marker: sys_platform == 'linux'
|
|
@@ -12,7 +12,7 @@ deps:
|
|
|
12
12
|
# pip resolution drift on environments that already had a different
|
|
13
13
|
# ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
|
|
14
14
|
# future drift fails fast.
|
|
15
|
-
sglang: '==0.5.10'
|
|
15
|
+
sglang: '==0.5.10.post1'
|
|
16
16
|
xgrammar: '==0.1.32'
|
|
17
17
|
outlines: '==0.1.11'
|
|
18
18
|
llguidance: '>=0.7.11,<0.8.0'
|
|
@@ -6,29 +6,29 @@ deps:
|
|
|
6
6
|
# SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
|
|
7
7
|
# See: https://github.com/sgl-project/sglang/issues/4869
|
|
8
8
|
#
|
|
9
|
-
# Qwen3.5-4B
|
|
9
|
+
# Qwen3.5-4B + Qwen3.6-27B compatibility:
|
|
10
10
|
#
|
|
11
|
-
# ``sglang==0.5.10`` is the canonical target for Qwen3.
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
11
|
+
# ``sglang==0.5.10.post1`` is the canonical target for the Qwen3.x hybrid
|
|
12
|
+
# Gated-DeltaNet + Gated-Attention family on the current
|
|
13
|
+
# L4 / A100-40GB / H100 fleet. Qwen3.6-27B uses the same ``qwen3_5``
|
|
14
|
+
# model class shipped in 0.5.10 — the architecture (64 layers, hybrid
|
|
15
|
+
# Gated DeltaNet + Gated Attention, MTP/NEXTN) is identical, only the
|
|
16
|
+
# parameter count differs.
|
|
15
17
|
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
# **Not** CUDA 13 — that's an SGLang-main-only path which only
|
|
26
|
-
# became relevant when looking at the dev branch.
|
|
18
|
+
# SGLang 0.5.10 was evaluated against CUDA 12.9 + Qwen3.6-27B on Modal
|
|
19
|
+
# H100 (2026-05-27): server boots, loads weights, but the bundled
|
|
20
|
+
# ``sglang/jit_kernel/csrc/elementwise/activation.cuh`` has a C++
|
|
21
|
+
# template bug (``select_kernel<true>(type)`` is parsed as a class-
|
|
22
|
+
# template substitution, not a function-template call) that the
|
|
23
|
+
# stricter ``nvcc`` shipped with CUDA 12.9 rejects at first activation.
|
|
24
|
+
# 0.5.11 is also dev-only on the sglang docs wheel index — not on
|
|
25
|
+
# PyPI. Park the 0.5.11 bump until upstream cuts a stable release with
|
|
26
|
+
# the JIT header fixed; 0.5.10.post1 covers Qwen3.6-27B today.
|
|
27
27
|
#
|
|
28
|
-
#
|
|
29
|
-
#
|
|
30
|
-
#
|
|
31
|
-
sglang: '==0.5.10'
|
|
28
|
+
# * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
|
|
29
|
+
# * ``sgl_kernel`` covers SM_80 / SM_89 / SM_90 / SM_100 via gencode.
|
|
30
|
+
# * torch==2.9.1 (CUDA 12.9 wheels); ``cuda-python==12.9``.
|
|
31
|
+
sglang: '==0.5.10.post1'
|
|
32
32
|
#
|
|
33
33
|
# Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
|
|
34
34
|
# internally to prevent silent pip resolution drift.
|
|
@@ -39,7 +39,7 @@ deps:
|
|
|
39
39
|
# propagation (PR #20467). Kept available as the fallback backend.
|
|
40
40
|
xgrammar: '==0.1.32'
|
|
41
41
|
#
|
|
42
|
-
# ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10. We
|
|
42
|
+
# ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10.post1. We
|
|
43
43
|
# declare it explicitly at bundle level so the surface is visible.
|
|
44
44
|
# ``outlines-core`` (a separate package) is a transitive of outlines
|
|
45
45
|
# and intentionally NOT pinned here — pinning ``outlines-core`` directly
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
sie_id: Marqo/marqo-fashionSigLIP
|
|
2
|
+
hf_id: Marqo/marqo-fashionSigLIP
|
|
3
|
+
inputs:
|
|
4
|
+
text: true
|
|
5
|
+
image: true
|
|
6
|
+
audio: false
|
|
7
|
+
video: false
|
|
8
|
+
tasks:
|
|
9
|
+
encode:
|
|
10
|
+
dense:
|
|
11
|
+
dim: 768
|
|
12
|
+
sparse: null
|
|
13
|
+
multivector: null
|
|
14
|
+
score: null
|
|
15
|
+
extract: null
|
|
16
|
+
max_sequence_length: 64
|
|
17
|
+
profiles:
|
|
18
|
+
default:
|
|
19
|
+
max_batch_tokens: 16384
|
|
20
|
+
compute_precision: float16
|
|
21
|
+
adapter_path: sie_server.adapters.siglip:SiglipAdapter
|
|
22
|
+
adapter_options:
|
|
23
|
+
loadtime:
|
|
24
|
+
backend: open_clip
|
|
25
|
+
open_clip_model_id: hf-hub:Marqo/marqo-fashionSigLIP
|
|
26
|
+
dense_dim: 768
|
|
27
|
+
runtime:
|
|
28
|
+
normalize: true
|
|
@@ -6,40 +6,39 @@ inputs:
|
|
|
6
6
|
audio: false
|
|
7
7
|
video: false
|
|
8
8
|
tasks:
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
9
|
+
# Small, fast generation model — a viable PROD pick for simple/short-prompt
|
|
10
|
+
# tasks (cheap + high throughput; weaker on long-context reasoning). Loads in
|
|
11
|
+
# ~30s. Also doubles as the transport/walking-skeleton benchmark target.
|
|
12
12
|
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
# the
|
|
16
|
-
#
|
|
17
|
-
#
|
|
13
|
+
# ``context_length`` is the standalone PROD serving value (4096): big enough
|
|
14
|
+
# to fit the full generation benchmark pack (casehold prompts reach ~1.8k
|
|
15
|
+
# tokens, gpqa ~1.3k) so the model is comparable to the rest of the fleet on
|
|
16
|
+
# every task, while KV stays trivial at this size (112 KB/token → 4096 ≈
|
|
17
|
+
# 0.46 GB). The validation/co-residency harness, which packs two SGLang
|
|
18
|
+
# instances onto a single 22 GiB L4, does NOT depend on this default — it
|
|
19
|
+
# passes an explicit ``--max-seq-length``/``--context-length`` (see
|
|
20
|
+
# tools/bench_generation_matrix.py + run_generation_smoke.py) and caps itself
|
|
21
|
+
# to 1024 for that case.
|
|
18
22
|
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
# budget) that just happen to collide here because the model is tiny.
|
|
23
|
-
# See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
|
|
24
|
-
# non-collapsed shape.
|
|
23
|
+
# ``context_length``, ``max_sequence_length``, and ``max_batch_tokens`` are
|
|
24
|
+
# three independent knobs (per-request context, SGLang --context-length,
|
|
25
|
+
# batcher cost budget); see sibling Qwen__Qwen3-4B-Instruct-2507.yaml.
|
|
25
26
|
generate:
|
|
26
|
-
context_length:
|
|
27
|
+
context_length: 4096
|
|
27
28
|
max_output_tokens: 1024
|
|
28
29
|
capabilities:
|
|
29
30
|
grammar: []
|
|
30
31
|
streaming: true
|
|
31
32
|
tools: false
|
|
32
|
-
max_sequence_length:
|
|
33
|
+
max_sequence_length: 4096
|
|
33
34
|
# KV-cache memory math (Qwen3-0.6B, bf16):
|
|
34
35
|
# layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
|
|
35
36
|
# kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
|
|
36
|
-
#
|
|
37
|
-
#
|
|
38
|
-
# co-resident two SGLang instances on an L4. KV budgets per profile
|
|
39
|
-
# scale with the deployment scenario rather than the GPU ceiling.
|
|
37
|
+
# At ctx=4096 a single request's KV is ~0.46 GB — negligible. The co-residency
|
|
38
|
+
# harness still caps context explicitly when it has to share a card.
|
|
40
39
|
profiles:
|
|
41
40
|
default:
|
|
42
|
-
max_batch_tokens:
|
|
41
|
+
max_batch_tokens: 4096
|
|
43
42
|
compute_precision: bfloat16
|
|
44
43
|
adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
|
|
45
44
|
kv_budget_tokens: 8192
|
|
@@ -66,8 +66,7 @@ max_sequence_length: 32768
|
|
|
66
66
|
# kv_budget_tokens is set to ~40% of theoretical max, matching the L4
|
|
67
67
|
# baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
|
|
68
68
|
# speculative side-cell, grammar/Outlines compile arena, fragmentation.
|
|
69
|
-
# Final empirical validation
|
|
70
|
-
# tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
|
|
69
|
+
# Final empirical validation should use concurrency and OOM-boundary sweeps.
|
|
71
70
|
profiles:
|
|
72
71
|
default:
|
|
73
72
|
# max_batch_tokens is a generic engine knob; generation does not batch
|
|
@@ -93,7 +92,7 @@ profiles:
|
|
|
93
92
|
top_p: 0.9
|
|
94
93
|
stop_tokens:
|
|
95
94
|
- "<|im_end|>"
|
|
96
|
-
#
|
|
95
|
+
# Analytical defaults for a100-40gb / h100. Production
|
|
97
96
|
# capacity also grows: with 2-4× the KV budget the context window can be
|
|
98
97
|
# widened proportionally so longer-context workloads (RAG with large
|
|
99
98
|
# retrieved passages) fit comfortably. ``max_output_tokens`` doubles
|