PyPI - sie-server - Versions diffs - 0.4.1__tar.gz → 0.4.2__tar.gz - Mend

sie-server 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (442) hide show

{sie_server-0.4.1 → sie_server-0.4.2}/Dockerfile.cpu RENAMED Viewed

@@ -8,7 +8,7 @@
 ARG BUNDLE=default
 # =============================================================================
-# Stage 1: Dependencies (pyproject.toml only, cached across code changes)
+# Dependency image: pyproject-only cache seed
 # =============================================================================
 FROM python:3.12-slim-bookworm AS deps
@@ -57,9 +57,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         -e ".[gpu-metrics]"
 # =============================================================================
-# Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
+# Shared runtime base: source install and venv finalization
 # =============================================================================
-# Bundle-agnostic: all base-stage layers are shared across bundles of this
+# Bundle-agnostic: all base image layers are shared across bundles of this
 # platform in local BuildKit cache and in content-addressed registry layers.
 FROM deps AS base
@@ -107,7 +107,7 @@ RUN set -eux; \
     find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 3: Builder - bundle-specific deps
+# Bundle dependency builder: bundle-specific deps
 # =============================================================================
 FROM base AS builder
@@ -150,7 +150,7 @@ RUN set -eux; \
     find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 4: Runtime
+# Runtime image
 # =============================================================================
 FROM python:3.12-slim-bookworm AS runtime

{sie_server-0.4.1 → sie_server-0.4.2}/Dockerfile.cuda12 RENAMED Viewed

@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1
-# SIE Server - CUDA 12.4 Image
+# SIE Server - CUDA 12 Image
 # Build from repo root:
 #   docker build -f packages/sie_server/Dockerfile.cuda12 -t sie-server:cuda12-default .
 #   docker build -f packages/sie_server/Dockerfile.cuda12 --build-arg BUNDLE=sglang -t sie-server:cuda12-sglang .
@@ -8,7 +8,7 @@ ARG BUNDLE=default
 ARG UV_VERSION=0.9.28
 # =============================================================================
-# Stage 1: uv + standalone Python 3.12 (no deadsnakes PPA)
+# Dependency image: uv and standalone Python 3.12
 # =============================================================================
 FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS deps
@@ -59,10 +59,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         -e ".[gpu-metrics]"
 # =============================================================================
-# Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
+# Shared CUDA base: source install and venv finalization
 # =============================================================================
 # Everything here is bundle-agnostic, so bundle-specific builds of a given
-# platform share every base-stage layer in local BuildKit cache and in
+# platform share every base image layer in local BuildKit cache and in
 # content-addressed registry layers.
 FROM deps AS base
@@ -126,7 +126,7 @@ RUN set -eux; \
     find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 3: Builder - bundle-specific deps
+# Bundle dependency builder: bundle-specific deps
 # =============================================================================
 FROM base AS builder
@@ -173,11 +173,19 @@ RUN set -eux; \
     find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 4: Runtime
+# Runtime image
 # =============================================================================
-# Use base CUDA image (not devel/runtime) — PyTorch wheels bundle CUDA libs,
-# cuDNN ships inside torch. Saves ~2GB vs `runtime` variant.
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime
+# Runtime base selection is bundle-scoped. Most CUDA bundles stay on the
+# smaller CUDA base runtime; SGLang-family bundles need the devel toolkit
+# because flashinfer/tvm_ffi perform runtime JIT through nvcc on first decode.
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-default
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-transformers5
+FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 AS runtime-sglang
+ENV CUDA_HOME=/usr/local/cuda \
+    LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+FROM runtime-sglang AS runtime-sglang-embedding
+FROM runtime-${BUNDLE} AS runtime
 ENV DEBIAN_FRONTEND=noninteractive

{sie_server-0.4.1 → sie_server-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.4
 Name: sie-server
-Version: 0.4.1
+Version: 0.4.2
 Summary: Search Inference Engine - GPU inference server for search workloads
 License: Apache-2.0
 License-File: LICENSE
 Requires-Python: <3.13,>=3.12
+Requires-Dist: blake3<1,>=0.4
 Requires-Dist: docling<3,>=2
 Requires-Dist: einops<1,>=0.8
 Requires-Dist: fastapi<1,>=0.115
@@ -17,7 +18,6 @@ Requires-Dist: loguru<1,>=0.7
 Requires-Dist: msgpack-numpy<1,>=0.4
 Requires-Dist: msgpack<2,>=1.1
 Requires-Dist: msgspec>=0.20.0
-Requires-Dist: nats-py<3,>=2.9
 Requires-Dist: numpy<3,>=2
 Requires-Dist: open-clip-torch>=2.24
 Requires-Dist: opencv-python-headless<5,>=4
@@ -26,7 +26,7 @@ Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
 Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
 Requires-Dist: opentelemetry-sdk<2,>=1.28
 Requires-Dist: packaging<25,>=24
-Requires-Dist: pillow<12,>=11
+Requires-Dist: pillow>=12.2.0
 Requires-Dist: prometheus-client<1,>=0.21
 Requires-Dist: pydantic-settings<3,>=2.6
 Requires-Dist: pydantic<3,>=2.9

{sie_server-0.4.1 → sie_server-0.4.2}/README.md RENAMED Viewed

@@ -66,7 +66,7 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
 | Env var | Default | Effect |
 |--|--|--|
-| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
+| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default because SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
 For nested settings (any field with `__`), the env-var format is
 `SIE_<TOP>__<NESTED>=value`. The complete schema is in

{sie_server-0.4.1 → sie_server-0.4.2}/bundles/default.yaml RENAMED Viewed

@@ -43,6 +43,7 @@ adapters:
 - sie_server.adapters.florence2
 - sie_server.adapters.docling
 - sie_server.adapters.paddleocr_vl
+- sie_server.adapters.mineru_vl
 deps:
   # Most flash adapters; sentence_transformer needs >=4.57
   transformers: '>=4.57,<5'

sie_server-0.4.2/models/Marqo__marqo-fashionSigLIP.yaml ADDED Viewed

@@ -0,0 +1,28 @@
+sie_id: Marqo/marqo-fashionSigLIP
+hf_id: Marqo/marqo-fashionSigLIP
+inputs:
+  text: true
+  image: true
+  audio: false
+  video: false
+tasks:
+  encode:
+    dense:
+      dim: 768
+    sparse: null
+    multivector: null
+  score: null
+  extract: null
+max_sequence_length: 64
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: float16
+    adapter_path: sie_server.adapters.siglip:SiglipAdapter
+    adapter_options:
+      loadtime:
+        backend: open_clip
+        open_clip_model_id: hf-hub:Marqo/marqo-fashionSigLIP
+        dense_dim: 768
+      runtime:
+        normalize: true

{sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-0.6B.yaml RENAMED Viewed

@@ -6,40 +6,39 @@ inputs:
   audio: false
   video: false
 tasks:
-  # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
-  # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
-  # bad — this is a transport benchmark target, not a production model.
+  # Small, fast generation model — a viable PROD pick for simple/short-prompt
+  # tasks (cheap + high throughput; weaker on long-context reasoning). Loads in
+  # ~30s. Also doubles as the transport/walking-skeleton benchmark target.
   #
-  # Context / batch sizes are deliberately small (1024 vs the headroom an
-  # L4 could nominally support) so the validation harness can co-resident
-  # the worker's SGLang with a second SGLang for the baseline phase on
-  # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
-  # alone on a card it doesn't have to share.
+  # ``context_length`` is the standalone PROD serving value (4096): big enough
+  # to fit the full generation benchmark pack (casehold prompts reach ~1.8k
+  # tokens, gpqa ~1.3k) so the model is comparable to the rest of the fleet on
+  # every task, while KV stays trivial at this size (112 KB/token → 4096 ≈
+  # 0.46 GB). The validation/co-residency harness, which packs two SGLang
+  # instances onto a single 22 GiB L4, does NOT depend on this default — it
+  # passes an explicit ``--max-seq-length``/``--context-length`` (see
+  # tools/bench_generation_matrix.py + run_generation_smoke.py) and caps itself
+  # to 1024 for that case.
   #
-  # Note on the three 1024s below: `context_length`, `max_sequence_length`,
-  # and `max_batch_tokens` are NOT redundant — they're three independent
-  # knobs (per-request context, SGLang --context-length, batcher cost
-  # budget) that just happen to collide here because the model is tiny.
-  # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
-  # non-collapsed shape.
+  # ``context_length``, ``max_sequence_length``, and ``max_batch_tokens`` are
+  # three independent knobs (per-request context, SGLang --context-length,
+  # batcher cost budget); see sibling Qwen__Qwen3-4B-Instruct-2507.yaml.
   generate:
-    context_length: 1024
+    context_length: 4096
     max_output_tokens: 1024
     capabilities:
       grammar: []
       streaming: true
       tools: false
-max_sequence_length: 1024
+max_sequence_length: 4096
 # KV-cache memory math (Qwen3-0.6B, bf16):
 #   layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
 #   kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
-# The 0.6B is a transport benchmark target — context_length is held at
-# 1024 deliberately (see header comment) so the validation harness can
-# co-resident two SGLang instances on an L4. KV budgets per profile
-# scale with the deployment scenario rather than the GPU ceiling.
+# At ctx=4096 a single request's KV is ~0.46 GB — negligible. The co-residency
+# harness still caps context explicitly when it has to share a card.
 profiles:
   default:
-    max_batch_tokens: 1024
+    max_batch_tokens: 4096
     compute_precision: bfloat16
     adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
     kv_budget_tokens: 8192

{sie_server-0.4.1 → sie_server-0.4.2}/models/Qwen__Qwen3-4B-Instruct-2507.yaml RENAMED Viewed

@@ -66,8 +66,7 @@ max_sequence_length: 32768
 # kv_budget_tokens is set to ~40% of theoretical max, matching the L4
 # baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
 # speculative side-cell, grammar/Outlines compile arena, fragmentation.
-# Final empirical validation (concurrency-16 OOM-boundary sweep) is
-# tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
+# Final empirical validation should use concurrency and OOM-boundary sweeps.
 profiles:
   default:
     # max_batch_tokens is a generic engine knob; generation does not batch
@@ -93,7 +92,7 @@ profiles:
           top_p: 0.9
         stop_tokens:
           - "<|im_end|>"
-  # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
+  # Analytical defaults for a100-40gb / h100. Production
   # capacity also grows: with 2-4× the KV budget the context window can be
   # widened proportionally so longer-context workloads (RAG with large
   # retrieved passages) fit comfortably. ``max_output_tokens`` doubles

sie_server-0.4.2/models/Qwen__Qwen3.6-27B.yaml ADDED Viewed

@@ -0,0 +1,308 @@
+sie_id: Qwen/Qwen3.6-27B
+hf_id: Qwen/Qwen3.6-27B
+inputs:
+  # Qwen3.6-27B is a unified vision-language model (Gated DeltaNet + Gated
+  # Attention with an integrated vision encoder; same hybrid family as
+  # Qwen3.5-4B but scaled to 64 layers / hidden_size=5120). The wire
+  # surface accepts text+image via the OpenAI chat-completions schema
+  # (``image_url`` content parts); video is documented by Qwen but not
+  # yet wired through the SIE gateway.
+  text: true
+  image: true
+  audio: false
+  video: false
+tasks:
+  generate:
+    # Native context length is 262,144 tokens (YaRN extends to ~1M).
+    # Default to 4096 here — this matches the empirically-calibrated
+    # reference point in the profile comments below
+    # (mem_fraction_static=0.93, weight 51.05 GB + kvcache 11.63 GB).
+    # 8192 was the original optimistic default but SGLang's
+    # init_memory_pool refused to fit it inside the conservative
+    # mem_fraction envelope on H100-80GB even when bumped to 0.97 —
+    # NEXTN speculative-decoding draft KV pushes the total past the
+    # available headroom. Raise both context_length AND
+    # mem_fraction_static together if you need longer contexts;
+    # benchmarking and prod requests today fit comfortably in 4096.
+    context_length: 4096
+    max_output_tokens: 4096
+    capabilities:
+      # Same constraint as Qwen3.5-4B: SGLang's outlines_backend does
+      # not implement ebnf. xgrammar smoke would pass all three; flip
+      # ``grammar_backend: xgrammar`` and re-add ``"ebnf"`` here if a
+      # consumer needs it.
+      grammar: ["json_schema", "regex"]
+      streaming: true
+      tools: true
+    # Qwen3.6 emits ``<think>...</think>`` reasoning by default. We
+    # disable it for the OpenAI-compat path so visible output is the
+    # answer only. Operators wanting CoT can flip this profile-side.
+    chat_template_kwargs:
+      enable_thinking: false
+    prewarm_grammars:
+      # Bare pattern, NOT anchored — Outlines regexes are implicitly
+      # anchored and its FSM engine rejects ``^``/``$``. See the
+      # Qwen3.5-4B model card for the full back-story.
+      - name: yes_no
+        kind: regex
+        value: "(yes|no)"
+      - name: short_answer
+        kind: json_schema
+        value:
+          type: object
+          properties:
+            answer:
+              type: string
+          required: [answer]
+max_sequence_length: 4096
+# ── KV-cache math (placeholder, pending Modal calibration) ──
+#
+# Qwen3.6-27B layer breakdown (per model card / config.json):
+#   * 64 layers in a 16 × (3 × DeltaNet + 1 × GatedAttention) pattern
+#   * 16 KV-bearing Gated-Attention layers (4 KV heads × head_dim=256)
+#   * 48 recurrent Gated-DeltaNet layers — managed by SGLang's mamba
+#     scheduler under ``--mamba-scheduler-strategy extra_buffer``
+#
+# BF16 weights ≈ 27e9 × 2 B ≈ 54 GB before activations / KV.
+#   * L4 (22 GB)        → infeasible
+#   * A100-40GB (40 GB) → infeasible BF16; would need FP8 or TP2
+#   * H100-80GB         → primary target (single-GPU, BF16)
+#   * H100×2 (160 GB)   → for context >32k or large concurrencies
+#
+# ``kv_budget_tokens`` is a conservative analytical placeholder pending
+# the first /get_server_info dump from tools/smoke_qwen36_27b.py on
+# Modal. Re-calibrate from the empirical token_capacity before relying
+# on these numbers in production.
+profiles:
+  # L4 / A100-40GB profiles intentionally omitted — Qwen3.6-27B's BF16
+  # weights (~54 GB) do not fit. Add an FP8 or TP2 profile if those
+  # tiers become required.
+  default:
+    max_batch_tokens: 16384
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    # Conservative H100-80GB placeholder pending the first
+    # /get_server_info dump from tools/smoke_qwen36_27b.py on Modal.
+    # Empirical reference point (2026-05-27 smoke, mem_fraction_static=0.93,
+    # context_length=4096, no-spec): SGLang reported
+    # ``weight=51.05 GB, kvcache=11.63 GB, token_capacity=190,543``.
+    # Initially sized at ~1/12 of that capacity (16384) to leave room for NEXTN
+    # draft KV + grammar/Outlines compile arena; now 8192 (~1/23), see below.
+    # Re-calibrate after a smoke run on this profile's actual settings.
+    # Halved from 16384 to leave more H100-80GB headroom for the NEXTN
+    # draft activation arena. 8192 KV slots × ~64 KB/token ≈ 0.5 GB —
+    # plenty for the realistic per-request concurrency on this profile.
+    kv_budget_tokens: 8192
+    adapter_options:
+      loadtime:
+        # 0.95 paired with ``context_length: 4096`` + the *smaller*
+        # NEXTN draft below (num_steps=2 / num_draft_tokens=2 vs the
+        # 3/4 model-card default). The 0.93+default-draft cell still
+        # OOM'd ``init_memory_pool`` because verification batch grows
+        # with num_steps × num_draft_tokens; halving both shrinks the
+        # activation arena enough to fit.
+        mem_fraction_static: 0.95
+        served_model_name: Qwen/Qwen3.6-27B
+        disable_cuda_graph: true
+        attention_backend: triton
+        grammar_backend: outlines
+        reasoning_parser: qwen3
+        tool_call_parser: qwen3_coder
+        # MTP/NEXTN per the Qwen3.x model-card recipe (SGLang implements
+        # NEXTN under the EAGLE codepath; ``/server_info`` reports
+        # ``speculative_algorithm: EAGLE``). Smaller-draft variant
+        # (num_steps=2 / num_draft_tokens=2 vs the model-card 3/4) so
+        # the verification batch fits inside H100-80GB at ctx=4096 +
+        # mfs=0.95. The 3/4 default reliably OOM'd
+        # ``init_memory_pool`` even at mfs=0.97 — the trade is a
+        # slightly smaller speculative window for the ability to fit at
+        # all. Re-tune (or re-enable 3/4) once FP8 / TP=2 is wired up.
+        speculative:
+          enabled: true
+          algorithm: nextn
+          num_steps: 2
+          eagle_topk: 1
+          num_draft_tokens: 2
+        # ``--mamba-scheduler-strategy extra_buffer`` (together with
+        # ``--disable-overlap-schedule``) is the required pair-flag for NEXTN
+        # spec on the hybrid Gated-DeltaNet architecture. It is also harmless
+        # with spec disabled; the ``no-spec`` profile keeps the same list.
+        extra_launch_args:
+          - "--mamba-scheduler-strategy"
+          - "extra_buffer"
+          - "--disable-overlap-schedule"
+      runtime:
+        first_chunk_timeout_s: 90
+        inter_chunk_timeout_s: 15
+        overall_timeout_s: 600
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.8
+          presence_penalty: 1.5
+        stop_tokens:
+          - "<|im_end|>"
+  h100:
+    max_batch_tokens: 32768
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    # Same H100-80GB target as ``default``; this profile widens
+    # ``max_batch_tokens`` for batch-heavy workloads once calibration
+    # confirms KV headroom.
+    kv_budget_tokens: 32768
+    adapter_options:
+      loadtime:
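+        # NOTE: 0.93 here vs 0.95 in ``default``, and this profile keeps the
+        # model-card 3/4 NEXTN draft — the combination that ``default``'s mfs
+        # comment reports as OOMing ``init_memory_pool``. Re-verify on hardware.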
+        mem_fraction_static: 0.93
+        served_model_name: Qwen/Qwen3.6-27B
+        disable_cuda_graph: true
+        attention_backend: triton
+        grammar_backend: outlines
+        reasoning_parser: qwen3
+        tool_call_parser: qwen3_coder
+        speculative:
+          enabled: true
+          algorithm: nextn
+          num_steps: 3
+          eagle_topk: 1
+          num_draft_tokens: 4
+        # ``--disable-overlap-schedule`` is the required pair-flag for
+        # NEXTN + mamba-scheduler ``extra_buffer`` on the hybrid Gated-
+        # DeltaNet architecture (same constraint as Qwen3.5-4B).
+        extra_launch_args:
+          - "--mamba-scheduler-strategy"
+          - "extra_buffer"
+          - "--disable-overlap-schedule"
+      runtime:
+        first_chunk_timeout_s: 90
+        inter_chunk_timeout_s: 15
+        overall_timeout_s: 600
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.8
+          presence_penalty: 1.5
+        stop_tokens:
+          - "<|im_end|>"
+  # RTX PRO 6000 (96 GB GDDR7, Blackwell Server Edition, sm_120) profile.
+  # FP8-first for max throughput: ``--quantization fp8`` (SGLang online
+  # dynamic FP8 quant of the BF16 checkpoint) via the ``extra_launch_args``
+  # passthrough — ``compute_precision`` can only express the ``--dtype``
+  # axis (float16/bfloat16/float32), not the orthogonal ``--quantization``
+  # flag, so FP8 rides the same escape hatch already used for the mamba
+  # scheduler. FP8 halves weight memory (~54 → ~27 GB), which frees room
+  # for the *model-card* NEXTN 3/4 draft (num_steps=3 / num_draft_tokens=4)
+  # that OOM'd ``init_memory_pool`` on H100-80GB even at mfs=0.97. The +16 GB
+  # over H100 plus the FP8 weight saving is what makes 3/4 fit here.
+  #
+  # ACCURACY CONTRACT: FP8 is lossy. This profile is validated to within the
+  # Wilson 95% CI of the *BF16* baseline on all four generation tasks (see
+  # docs/adr/0001). If FP8 misses parity after bounded tuning, fall back to
+  # BF16 + NEXTN 3/4 (drop the ``--quantization fp8`` pair below; the 96 GB
+  # still fits the 3/4 draft in BF16). KV cache stays BF16 here — add
+  # ``--kv-cache-dtype fp8_e4m3`` only if memory/throughput needs it AND
+  # accuracy still holds (KV FP8 is usually the first thing to cost accuracy).
+  #
+  # Standalone block (no ``extends``): production ``resolve_profile`` does a
+  # full-replace of ``loadtime`` for extending profiles, so a partial child
+  # would drop inherited launch flags and desync the via-SIE path from the
+  # bare-SGLang bench control.
+  rtx-pro-6000:
+    max_batch_tokens: 32768
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    # FP8 weights (~27 GB) + 96 GB total leaves generous KV/draft headroom;
+    # start conservative and raise after the first /get_server_info dump on
+    # the actual RTX-PRO-6000 smoke.
+    kv_budget_tokens: 16384
+    adapter_options:
+      loadtime:
+        # 0.90 is a conservative starting point — FP8 weights free enough
+        # memory that the 3/4 NEXTN verification batch should fit with room
+        # to spare. Iterate upward (smoke ``--mem-fraction-static``) once the
+        # first boot confirms Blackwell sm_120 + FP8 GEMM kernels are present.
+        mem_fraction_static: 0.90
+        served_model_name: Qwen/Qwen3.6-27B
+        disable_cuda_graph: true
+        # triton attention matches the rest of the Qwen3.x family. Blackwell
+        # (sm_120) kernel coverage for triton + FP8 + NEXTN is the first thing
+        # the boot smoke verifies; switch to flashinfer here if triton lacks
+        # sm_120 coverage in the pinned SGLang build.
+        attention_backend: triton
+        grammar_backend: outlines
+        reasoning_parser: qwen3
+        tool_call_parser: qwen3_coder
+        # Model-card NEXTN 3/4 — restored here (vs ``default``'s conservative
+        # 2/2) because FP8 + 96 GB fits the larger verification batch that
+        # OOM'd on H100-80GB.
+        speculative:
+          enabled: true
+          algorithm: nextn
+          num_steps: 3
+          eagle_topk: 1
+          num_draft_tokens: 4
+        # ``--quantization fp8`` rides the passthrough (see header comment).
+        # The mamba-scheduler + overlap pair-flags are the required NEXTN
+        # companions on the hybrid Gated-DeltaNet architecture. List is the
+        # FULL set (production full-replaces ``extra_launch_args``, not merge).
+        extra_launch_args:
+          - "--quantization"
+          - "fp8"
+          - "--mamba-scheduler-strategy"
+          - "extra_buffer"
+          - "--disable-overlap-schedule"
+      runtime:
+        first_chunk_timeout_s: 90
+        inter_chunk_timeout_s: 15
+        overall_timeout_s: 600
+        # Qwen3.6-27B empty-response fix baked in: under greedy/low-temp the
+        # chat template emits EOS as the FIRST token on a large fraction of
+        # prompts (n=50 6000 smoke: casehold 23/50, gpqa 29/50 came back
+        # EMPTY). The floor ``min_new_tokens>=1`` fixes it — validated on the
+        # 6000: min_tokens=10 → 0/50 empty on all four tasks, accuracy within
+        # Wilson 95% CI of the BF16 baseline. ``min_new_tokens`` is the
+        # SGLang-native key; the adapter merges this dict via ``setdefault``,
+        # so a request-supplied ``min_tokens`` still wins. NOTE: this only
+        # takes effect because ``runtime.default_sampling`` is now wired into
+        # the adapter (core/loader.py); before that fix a key here was a
+        # silent no-op and chat clients had to pass ``min_tokens`` themselves.
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.8
+          presence_penalty: 1.5
+          min_new_tokens: 10
+        stop_tokens:
+          - "<|im_end|>"
+  # No-speculative baseline — for SIE-vs-raw-SGLang ablation cells so
+  # spec-decoding's contribution can be measured independently. Keeps
+  # the same ``extra_launch_args`` as ``default`` / ``h100`` so the
+  # launch-flag diff between them is limited to the ``speculative`` block
+  # (the intent of the ablation), matching Qwen3.5-4B's convention. Note
+  # that ``mem_fraction_static`` and ``kv_budget_tokens`` also differ here.
+  no-spec:
+    max_batch_tokens: 32768
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    kv_budget_tokens: 65536
+    adapter_options:
+      loadtime:
+        mem_fraction_static: 0.85
+        served_model_name: Qwen/Qwen3.6-27B
+        disable_cuda_graph: true
+        attention_backend: triton
+        grammar_backend: outlines
+        reasoning_parser: qwen3
+        tool_call_parser: qwen3_coder
+        speculative:
+          enabled: false
+        extra_launch_args:
+          - "--mamba-scheduler-strategy"
+          - "extra_buffer"
+          - "--disable-overlap-schedule"
+      runtime:
+        first_chunk_timeout_s: 90
+        inter_chunk_timeout_s: 15
+        overall_timeout_s: 600
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.8
+          presence_penalty: 1.5
+        stop_tokens:
+          - "<|im_end|>"

{sie_server-0.4.1 → sie_server-0.4.2}/models/docling.yaml RENAMED Viewed

@@ -2,7 +2,7 @@ sie_id: docling
 package_backed: true
 inputs:
   text: false
-  image: false
+  image: true
   audio: false
   video: false
   document: true

sie_server-0.4.2/models/opendatalab__MinerU2.5-Pro-2604-1.2B.yaml ADDED Viewed

@@ -0,0 +1,24 @@
+sie_id: opendatalab/MinerU2.5-Pro-2604-1.2B
+hf_id: opendatalab/MinerU2.5-Pro-2604-1.2B
+hf_revision: d3f5e08d073c21466bbabe21c71bb1e9c2e595da
+inputs:
+  text: false
+  image: true
+  audio: false
+  video: false
+tasks:
+  encode: null
+  score: null
+  extract: {}
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.mineru_vl:MinerUVLAdapter
+    adapter_options:
+      loadtime:
+        default_task: "[default]"
+      runtime:
+        task: "[default]"
+        max_new_tokens: 4096
+        num_beams: 1

{sie_server-0.4.1 → sie_server-0.4.2}/openapi.json RENAMED Viewed

@@ -3,7 +3,7 @@
   "info": {
     "title": "SIE Server",
     "description": "Search Inference Engine - GPU inference server for search workloads",
-    "version": "0.4.1"
+    "version": "0.4.2"
   },
   "paths": {
     "/": {

{sie_server-0.4.1 → sie_server-0.4.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "sie-server"
-version = "0.4.1"
+version = "0.4.2"
 description = "Search Inference Engine - GPU inference server for search workloads"
 requires-python = ">=3.12,<3.13"
 license = { text = "Apache-2.0" }
@@ -39,7 +39,7 @@ dependencies = [
     # SigLIP (Marqo/marqo-ecommerce-embeddings-B native open_clip loader)
     "open-clip-torch>=2.24",
     # Image processing
-    "pillow>=11,<12",
+    "pillow>=12.2.0",
     "numpy>=2,<3",
     "torchvision>=0.18,<1", # Required by some HF models (e.g., nvidia/llama-nemoretriever)
     # Config
@@ -50,8 +50,6 @@ dependencies = [
     "packaging>=24,<25",
     # Hot-reload
     "watchdog>=6,<7",
-    # NATS pub/sub for config notifications
-    "nats-py>=2.9,<3",
     # Observability
     "opentelemetry-api>=1.28,<2",
     "opentelemetry-sdk>=1.28,<2",
@@ -66,6 +64,11 @@ dependencies = [
     "msgspec>=0.20.0",
     # Async HTTP client (telemetry sender)
     "httpx>=0.28.1",
+    # BLAKE3 used to cross-check the worker-sidecar's `PreparedTokens`
+    # tokenizer_id. Tiny (<200KB, pure-Rust via PyO3), mandatory for
+    # the encode / score fast-path consumer — see
+    # `sie_server.core.preprocessor.text.TextPreprocessor`.
+    "blake3>=0.4,<1",
 ]
 [project.optional-dependencies]

sie-server 0.4.1__tar.gz → 0.4.2__tar.gz

sie-server 0.4.1__tar.gz → 0.4.2__tar.gz