PyPI - sie-server - Versions diffs - 0.3.4__tar.gz → 0.4.1__tar.gz - Mend

sie-server 0.3.4__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (425) hide show

{sie_server-0.3.4 → sie_server-0.4.1}/Dockerfile.cpu RENAMED Viewed

@@ -63,6 +63,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # platform in local BuildKit cache and in content-addressed registry layers.
 FROM deps AS base
+# Source-layer cache key — see Dockerfile.cuda12 for the full rationale.
+# Tie source layers to a per-commit revision arg so a rebuild can't ship
+# stale code via a reused source-COPY layer. Dependency layers in the
+# ``deps`` stage above stay cached.
+ARG SIE_SRC_REV=dev
+RUN echo "sie source revision: ${SIE_SRC_REV}"
 COPY packages/sie_sdk/src /tmp/sie_sdk/src
 COPY packages/sie_server/src src/
 COPY packages/sie_server/bundles bundles/
@@ -158,13 +165,25 @@ ENV DEBIAN_FRONTEND=noninteractive \
 # Only the shared libs torch + pillow + rtree actually dlopen at runtime.
 # libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
 # rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
+# libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
+# TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the opencv-python
+# wheel unconditionally dlopens an X11 + libGL + glib chain at import even in
+# headless usage. Without these, every docling extract crashes with
+# "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 \
+    libglib2.0-0 \
     libgomp1 \
+    libice6 \
     libjpeg62-turbo \
     libpng16-16 \
-    libspatialindex-c6
+    libsm6 \
+    libspatialindex-c6 \
+    libx11-6 \
+    libxcb1 \
+    libxext6
 RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
@@ -176,8 +195,10 @@ WORKDIR /app
 # no /etc/passwd visible (the sie user exists in the runtime FS but --link
 # layers are created in isolation).
 COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
-COPY --link --from=base --chown=1000:1000 /app/src /app/src
-COPY --link --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
+# Source trees WITHOUT --link — see Dockerfile.cuda12 (linked cross-stage
+# copies didn't reliably invalidate on source change).
+COPY --from=base --chown=1000:1000 /app/src /app/src
+COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
 COPY --link --from=base --chown=1000:1000 /app/models /app/models
 COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
 # Bundle-specific extras — last layer so shared layers above stay cached.

{sie_server-0.3.4 → sie_server-0.4.1}/Dockerfile.cuda12 RENAMED Viewed

@@ -66,6 +66,18 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # content-addressed registry layers.
 FROM deps AS base
+# Source-layer cache key. A rebuild once shipped STALE code: buildx
+# reused a cached source-COPY layer even though the .py files had
+# changed, so the demo had to overlay patched files by hand. Tie the
+# source layers to an explicit revision arg the CI passes per commit
+# (``--build-arg SIE_SRC_REV=$(git rev-parse --short HEAD)``) so any
+# commit forces these layers — and the editable reinstall below — to
+# rebuild. The expensive dependency install lives in the ``deps`` stage
+# ABOVE this line, so it stays cached. Bundles of the same commit share
+# the same SIE_SRC_REV, so cross-bundle layer dedup is preserved.
+ARG SIE_SRC_REV=dev
+RUN echo "sie source revision: ${SIE_SRC_REV}"
 COPY packages/sie_sdk/src /tmp/sie_sdk/src
 COPY packages/sie_server/src src/
 COPY packages/sie_server/bundles bundles/
@@ -175,15 +187,27 @@ ENV DEBIAN_FRONTEND=noninteractive
 # libgomp1: torch OpenMP runtime.
 # libspatialindex-c6: rtree (docling dep) dlopens libspatialindex_c.so; the
 #   rtree==1.4.1 wheel only bundles the C++ core, not the C wrapper.
+# libgl1 libglib2.0-0 libice6 libsm6 libx11-6 libxcb1 libxext6: docling-ibm-models'
+#   TableFormer imports cv2 during DoclingAdapter.load() pre-warm; the
+#   opencv-python wheel unconditionally dlopens an X11 + libGL + glib chain at
+#   import even in headless usage. Without these, every docling extract crashes
+#   with "ImportError: libxcb.so.1: cannot open shared object file" (issue #1028).
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates \
     gcc \
     libc6-dev \
+    libgl1 \
+    libglib2.0-0 \
     libgomp1 \
+    libice6 \
     libnuma1 \
-    libspatialindex-c6
+    libsm6 \
+    libspatialindex-c6 \
+    libx11-6 \
+    libxcb1 \
+    libxext6
 RUN groupadd -g 1000 sie && useradd -u 1000 -g sie -m sie
@@ -211,8 +235,13 @@ RUN set -e; \
 # (the sie user is added in the runtime stage filesystem but --link layers
 # are created in isolation from the destination stage state).
 COPY --link --from=base --chown=1000:1000 /app/.venv /app/.venv
-COPY --link --from=base --chown=1000:1000 /app/src /app/src
-COPY --link --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
+# Source trees are copied WITHOUT --link. ``COPY --link --from=<stage>``
+# layers are cached on a digest that, in the buildx versions this image
+# was built with, did not reliably invalidate when the upstream source
+# changed — the stale-code bug above. These trees are small, so dropping
+# --link costs negligible dedup while guaranteeing edited code ships.
+COPY --from=base --chown=1000:1000 /app/src /app/src
+COPY --from=base --chown=1000:1000 /tmp/sie_sdk/src /tmp/sie_sdk/src
 COPY --link --from=base --chown=1000:1000 /app/models /app/models
 COPY --link --from=base --chown=1000:1000 /app/bundles /app/bundles
 # Bundle-specific extras — last layer so shared layers above stay cached.

{sie_server-0.3.4 → sie_server-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sie-server
-Version: 0.3.4
+Version: 0.4.1
 Summary: Search Inference Engine - GPU inference server for search workloads
 License: Apache-2.0
 License-File: LICENSE

{sie_server-0.3.4 → sie_server-0.4.1}/README.md RENAMED Viewed

@@ -62,6 +62,12 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
 | `SIE_DEFAULT_COMPUTE_PRECISION` | `float16` | One of `float16`, `bfloat16`, `float32`. |
 | `SIE_ATTENTION_BACKEND` | `auto` | One of `auto`, `flash_attention_2`, `sdpa`, `eager`. |
+### Diagnostics
+| Env var | Default | Effect |
+|--|--|--|
+| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
 For nested settings (any field with `__`), the env-var format is
 `SIE_<TOP>__<NESTED>=value`. The complete schema is in
 `packages/sie_server/src/sie_server/config/engine.py`.

{sie_server-0.3.4 → sie_server-0.4.1}/bundles/default.yaml RENAMED Viewed

@@ -78,5 +78,5 @@ deps:
   docling: '>=2,<3'
   # Flash Attention 2 — CUDA only, prebuilt wheel
   flash-attn:
-    url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.0/flash_attn-2.7.4+cu128torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
+    url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.11/flash_attn-2.7.4+cu129torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
     marker: sys_platform == 'linux'

sie_server-0.4.1/bundles/sglang-embedding.yaml ADDED Viewed

@@ -0,0 +1,18 @@
+name: sglang-embedding
+priority: 21
+adapters:
+- sie_server.adapters.sglang.embedding
+deps:
+  # Lockstep with ``bundles/sglang.yaml``. The dependency stack is
+  # identical (sglang's grammar backends are unavoidably pulled in even
+  # on embedding-only deployments), and the bundle split exists only for
+  # worker pool isolation. The previous revision pinned ``outlines-core``
+  # (a transitive of ``outlines``) instead of the actual top-level deps
+  # the generation bundle pins, which was both wrong and produced silent
+  # pip resolution drift on environments that already had a different
+  # ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
+  # future drift fails fast.
+  sglang: '==0.5.10.post1'
+  xgrammar: '==0.1.32'
+  outlines: '==0.1.11'
+  llguidance: '>=0.7.11,<0.8.0'

sie_server-0.4.1/bundles/sglang.yaml ADDED Viewed

@@ -0,0 +1,66 @@
+name: sglang
+priority: 20
+adapters:
+- sie_server.adapters.sglang.generation
+deps:
+  # SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
+  # See: https://github.com/sgl-project/sglang/issues/4869
+  #
+  # Qwen3.5-4B + Qwen3.6-27B compatibility:
+  #
+  # ``sglang==0.5.10.post1`` is the canonical target for the Qwen3.x hybrid
+  # Gated-DeltaNet + Gated-Attention family on the current
+  # L4 / A100-40GB / H100 fleet. Qwen3.6-27B uses the same ``qwen3_5``
+  # model class shipped in 0.5.10 — the architecture (64 layers, hybrid
+  # Gated DeltaNet + Gated Attention, MTP/NEXTN) is identical, only the
+  # parameter count differs.
+  #
+  # SGLang 0.5.11 was evaluated against CUDA 12.9 + Qwen3.6-27B on Modal
+  # H100 (2026-05-27): server boots, loads weights, but the bundled
+  # ``sglang/jit_kernel/csrc/elementwise/activation.cuh`` has a C++
+  # template bug (``select_kernel<true>(type)`` is parsed as a class-
+  # template substitution, not a function-template call) that the
+  # stricter ``nvcc`` shipped with CUDA 12.9 rejects at first activation.
+  # 0.5.11 is also dev-only on the sglang docs wheel index — not on
+  # PyPI. Park the 0.5.11 bump until upstream cuts a stable release with
+  # the JIT header fixed; 0.5.10.post1 covers Qwen3.6-27B today.
+  #
+  #   * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
+  #   * ``sgl_kernel`` covers SM_80 / SM_89 / SM_90 / SM_100 via gencode.
+  #   * torch==2.9.1 (CUDA 12.9 wheels); ``cuda-python==12.9``.
+  sglang: '==0.5.10.post1'
+  #
+  # Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
+  # internally to prevent silent pip resolution drift.
+  #
+  # XGrammar is SGLang 0.5.10's other supported grammar backend; it pins
+  # ``xgrammar==0.1.32`` exactly. That release also brought the
+  # structured-output VRAM-leak fix (PR #20697) and grammar-error
+  # propagation (PR #20467). Kept available as the fallback backend.
+  xgrammar: '==0.1.32'
+  #
+  # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10.post1. We
+  # declare it explicitly at bundle level so the surface is visible.
+  # ``outlines-core`` (a separate package) is a transitive of outlines
+  # and intentionally NOT pinned here — pinning ``outlines-core`` directly
+  # was a bug in the previous bundle revision (it does not give us any
+  # grammar functionality on its own).
+  #
+  # Outlines IS the active grammar backend for Qwen3.5 (partner
+  # requirement). Earlier revisions said "do not switch Qwen3.5 to
+  # outlines" because the worker-side ``compile_outlines`` preflight
+  # crashed with ``'TokenizersBackend' object has no attribute
+  # 'vocabulary'``: it passed the raw transformers==5.3.0 tokenizer
+  # (now a ``TokenizersBackend``) to Outlines' processor factories, which
+  # require an Outlines ``Tokenizer`` adapter exposing ``.vocabulary``.
+  # ``compile_outlines`` now wraps the tokenizer in Outlines'
+  # ``TransformerTokenizer`` first (the same wrap SGLang's
+  # ``OutlinesGrammarBackend`` does internally), so the mismatch is gone
+  # for json_schema/regex; ebnf is forwarded straight to SGLang. See
+  # ``processors/grammar_compile.py`` ("Tokenizer adapter").
+  outlines: '==0.1.11'
+  #
+  # llguidance is the third grammar backend (regex / json_schema / ebnf).
+  # Pinned to SGLang 0.5.10's compatible range. Kept available as the
+  # fallback if a future model regresses on xgrammar.
+  llguidance: '>=0.7.11,<0.8.0'

{sie_server-0.3.4 → sie_server-0.4.1}/models/Alibaba-NLP__gte-Qwen2-7B-instruct.yaml RENAMED Viewed

@@ -18,7 +18,7 @@ profiles:
   default:
     max_batch_tokens: 16384
     compute_precision: bfloat16
-    adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
+    adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
     adapter_options:
       loadtime:
         mem_fraction_static: 0.85

{sie_server-0.3.4 → sie_server-0.4.1}/models/Linq-AI-Research__Linq-Embed-Mistral.yaml RENAMED Viewed

@@ -18,7 +18,7 @@ profiles:
   default:
     max_batch_tokens: 8192
     compute_precision: bfloat16
-    adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
+    adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
     adapter_options:
       loadtime:
         mem_fraction_static: 0.85

sie_server-0.4.1/models/Qwen__Qwen3-0.6B.yaml ADDED Viewed

@@ -0,0 +1,119 @@
+sie_id: Qwen/Qwen3-0.6B
+hf_id: Qwen/Qwen3-0.6B
+inputs:
+  text: true
+  image: false
+  audio: false
+  video: false
+tasks:
+  # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
+  # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
+  # bad — this is a transport benchmark target, not a production model.
+  #
+  # Context / batch sizes are deliberately small (1024 vs the headroom an
+  # L4 could nominally support) so the validation harness can co-locate
+  # the worker's SGLang with a second SGLang for the baseline phase on
+  # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
+  # alone on a card it doesn't have to share.
+  #
+  # Note on the three 1024s below: `context_length`, `max_sequence_length`,
+  # and `max_batch_tokens` are NOT redundant — they're three independent
+  # knobs (per-request context, SGLang --context-length, batcher cost
+  # budget) that just happen to collide here because the model is tiny.
+  # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
+  # non-collapsed shape.
+  generate:
+    context_length: 1024
+    max_output_tokens: 1024
+    capabilities:
+      grammar: []
+      streaming: true
+      tools: false
+max_sequence_length: 1024
+# KV-cache memory math (Qwen3-0.6B, bf16):
+#   layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
+#   kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
+# The 0.6B is a transport benchmark target — context_length is held at
+# 1024 deliberately (see header comment) so the validation harness can
+# co-locate two SGLang instances on an L4. KV budgets per profile
+# scale with the deployment scenario rather than the GPU ceiling.
+profiles:
+  default:
+    max_batch_tokens: 1024
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    kv_budget_tokens: 8192
+    adapter_options:
+      loadtime:
+        # 0.8 leaves headroom on a 22 GiB L4 for a second SGLang instance
+        # (the validation harness co-locates worker + baseline). If the
+        # 0.6B is the only model on the card, 0.9 is fine.
+        mem_fraction_static: 0.8
+        served_model_name: Qwen/Qwen3-0.6B
+        # Modal sandbox lacks flashinfer's JIT prerequisites; switch backends.
+        disable_cuda_graph: true
+        attention_backend: triton
+      runtime:
+        first_chunk_timeout_s: 30
+        inter_chunk_timeout_s: 10
+        # Aligned with the rest of the generate model fleet (300s).
+        # The previous 132s was an unexplained magic number that
+        # diverged from every other generate config; bumping to the
+        # fleet default keeps long-completion requests from hitting a
+        # premature overall-timeout on the 0.6B model.
+        overall_timeout_s: 300
+        default_sampling:
+          temperature: 0.0
+          top_p: 1.0
+  # Dedicated 0.6B deployments on a100/h100 don't co-locate a baseline,
+  # so mem_fraction_static returns to the standard 0.85 and the KV budget
+  # scales with the larger GPU. kv_budget_tokens stays well below the
+  # theoretical ceiling because the 0.6B's *context_length* (1024) caps
+  # per-request KV consumption — the budget really just sets the upper
+  # bound on concurrent in-flight sequences.
+  a100-40gb:
+    max_batch_tokens: 4096
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    kv_budget_tokens: 32768
+    adapter_options:
+      loadtime:
+        mem_fraction_static: 0.85
+        served_model_name: Qwen/Qwen3-0.6B
+        disable_cuda_graph: true
+        attention_backend: triton
+      runtime:
+        first_chunk_timeout_s: 30
+        inter_chunk_timeout_s: 10
+        # Aligned with the rest of the generate model fleet (300s).
+        # The previous 132s was an unexplained magic number that
+        # diverged from every other generate config; bumping to the
+        # fleet default keeps long-completion requests from hitting a
+        # premature overall-timeout on the 0.6B model.
+        overall_timeout_s: 300
+        default_sampling:
+          temperature: 0.0
+          top_p: 1.0
+  h100:
+    max_batch_tokens: 8192
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    kv_budget_tokens: 65536
+    adapter_options:
+      loadtime:
+        mem_fraction_static: 0.85
+        served_model_name: Qwen/Qwen3-0.6B
+        disable_cuda_graph: true
+        attention_backend: triton
+      runtime:
+        first_chunk_timeout_s: 30
+        inter_chunk_timeout_s: 10
+        # Aligned with the rest of the generate model fleet (300s).
+        # The previous 132s was an unexplained magic number that
+        # diverged from every other generate config; bumping to the
+        # fleet default keeps long-completion requests from hitting a
+        # premature overall-timeout on the 0.6B model.
+        overall_timeout_s: 300
+        default_sampling:
+          temperature: 0.0
+          top_p: 1.0

sie_server-0.4.1/models/Qwen__Qwen3-4B-Instruct-2507.yaml ADDED Viewed

@@ -0,0 +1,152 @@
+sie_id: Qwen/Qwen3-4B-Instruct-2507
+hf_id: Qwen/Qwen3-4B-Instruct-2507
+inputs:
+  text: true
+  image: false
+  audio: false
+  video: false
+tasks:
+  # kv_budget_tokens now lives on profiles (below). The KV calibration
+  # follow-up publishes the tuned value; the placeholder here is conservative
+  # and assumes Qwen3-4B's ~150 KB/token KV footprint on an L4 (24 GB,
+  # mem_fraction_static=0.85).
+  generate:
+    context_length: 32768
+    max_output_tokens: 4096
+    capabilities:
+      # Outlines-backed JSON Schema, regex, and EBNF grammars are
+      # all supported by the SGLang adapter (Outlines and XGrammar
+      # both accept EBNF natively). The gateway gates requests on
+      # this exact list — adding a new ``grammar.kind`` variant
+      # requires both the gateway parser and this list to be updated.
+      grammar: ["json_schema", "regex", "ebnf"]
+      streaming: true
+      # Qwen3-4B-Instruct's chat template emits OpenAI-compatible
+      # ``<tool_call>{...}</tool_call>`` blocks when ``tools`` is
+      # present in the rendered messages; the worker's
+      # ``parse_tool_call_stream`` consumes those blocks and surfaces
+      # them on ``delta.tool_calls`` for SSE and on
+      # ``message.tool_calls`` for non-streaming requests.
+      tools: true
+    # Forwarded verbatim to ``tokenizer.apply_chat_template(**kwargs)`` when
+    # the worker renders an OpenAI-shaped ``messages`` request.
+    # Qwen3's chat template emits a ``<think>``/``</think>`` reasoning block
+    # unless this flag suppresses it.
+    chat_template_kwargs:
+      enable_thinking: false
+    # Schemas/regexes the worker pre-compiles at model load so the first
+    # request hitting them skips the Outlines compile (cold TTFT win).
+    # Failures here log + bump ``sie_worker_grammar_prewarm_total{outcome="failed"}``
+    # without blocking model load — add entries only for shapes you
+    # know are hot.
+    prewarm_grammars:
+      # Bare pattern, NOT anchored: this model uses the default Outlines
+      # grammar backend, and Outlines regexes are implicitly anchored —
+      # its FSM engine (interegular) rejects ``^``/``$`` with
+      # ``Unsupported``, which crashes SGLang's scheduler. Use ``(yes|no)``.
+      - name: yes_no
+        kind: regex
+        value: "(yes|no)"
+      - name: short_answer
+        kind: json_schema
+        value:
+          type: object
+          properties:
+            answer:
+              type: string
+          required: [answer]
+max_sequence_length: 32768
+# KV-cache memory math (Qwen3-4B-Instruct-2507, bf16):
+#   layers=36, kv_heads=8, head_dim=128, bytes_per_elem=2
+#   kv_bytes_per_token = 2 (k+v) × 36 × 8 × 128 × 2 = 147,456 B ≈ 144 KB
+# Theoretical max KV tokens per GPU (assuming ~8 GB weights, mem_fraction_static=0.85):
+#   l4         (24 GB):  (24 × 0.85 − 8) GB / 144 KB ≈   90,000 tokens
+#   a100-40gb  (40 GB):  (40 × 0.85 − 8) GB / 144 KB ≈  189,000 tokens
+#   h100       (80 GB):  (80 × 0.85 − 8) GB / 144 KB ≈  437,000 tokens
+# kv_budget_tokens is set to ~40% of theoretical max, matching the L4
+# baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
+# speculative side-cell, grammar/Outlines compile arena, fragmentation.
+# Final empirical validation (concurrency-16 OOM-boundary sweep) is
+# tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
+profiles:
+  default:
+    # max_batch_tokens is a generic engine knob; generation does not batch
+    # at the SIE layer (SGLang batches internally) but the validator
+    # requires the field to be set.
+    max_batch_tokens: 16384
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    # L4 baseline — empirically gated by the speculative-decoding + calibration work.
+    kv_budget_tokens: 32768
+    adapter_options:
+      loadtime:
+        mem_fraction_static: 0.85
+        served_model_name: Qwen/Qwen3-4B-Instruct-2507
+        # speculative decoding (MTP/EAGLE/NGRAM) intentionally absent;
+        # week-1 validation decides whether to promote a side-cell. See §4.9.
+      runtime:
+        first_chunk_timeout_s: 30
+        inter_chunk_timeout_s: 10
+        overall_timeout_s: 300
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.9
+        stop_tokens:
+          - "<|im_end|>"
+  # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
+  # capacity also grows: with 2-4× the KV budget the context window can be
+  # widened proportionally so longer-context workloads (RAG with large
+  # retrieved passages) fit comfortably. ``max_output_tokens`` doubles
+  # to 8192/16384 respectively — beyond that, latency hurts more than
+  # quality helps for instruction-style chat traffic.
+  a100-40gb:
+    max_batch_tokens: 32768
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    # Empirically calibrated on Modal A100-SXM4-40GB
+    # (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
+    # SGLang's /server_info reports:
+    #   weight=7.71 GB, kvcache=25.42 GB, graph=0.18 GB,
+    #   token_capacity=185,081 tokens
+    # ``kv_budget_tokens`` sized for 4 concurrent admissions:
+    #   185,081 / 4 = 46,270 → round down to 45,056 for headroom.
+    # Re-calibrate if SGLang version or mem_fraction_static changes.
+    kv_budget_tokens: 45056
+    adapter_options:
+      loadtime:
+        mem_fraction_static: 0.85
+        served_model_name: Qwen/Qwen3-4B-Instruct-2507
+      runtime:
+        first_chunk_timeout_s: 30
+        inter_chunk_timeout_s: 10
+        overall_timeout_s: 300
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.9
+        stop_tokens:
+          - "<|im_end|>"
+  h100:
+    max_batch_tokens: 65536
+    compute_precision: bfloat16
+    adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
+    # Empirically calibrated on Modal H100 80GB HBM3
+    # (sglang 0.5.9, mem_fraction_static=0.85, context_length=32768).
+    # SGLang's /server_info reports:
+    #   weight=7.71 GB, kvcache=59.0 GB, graph=0.43 GB,
+    #   token_capacity=429,645 tokens
+    # ``kv_budget_tokens`` sized for 4 concurrent admissions:
+    #   429,645 / 4 = 107,411 → round down to 106,496 for headroom.
+    kv_budget_tokens: 106496
+    adapter_options:
+      loadtime:
+        mem_fraction_static: 0.85
+        served_model_name: Qwen/Qwen3-4B-Instruct-2507
+      runtime:
+        first_chunk_timeout_s: 30
+        inter_chunk_timeout_s: 10
+        overall_timeout_s: 300
+        default_sampling:
+          temperature: 0.7
+          top_p: 0.9
+        stop_tokens:
+          - "<|im_end|>"

{sie_server-0.3.4 → sie_server-0.4.1}/models/Qwen__Qwen3-Embedding-4B.yaml RENAMED Viewed

@@ -18,7 +18,7 @@ profiles:
   default:
     max_batch_tokens: 16384
     compute_precision: bfloat16
-    adapter_path: sie_server.adapters.sglang:SGLangEmbeddingAdapter
+    adapter_path: sie_server.adapters.sglang.embedding:SGLangEmbeddingAdapter
     adapter_options:
       loadtime:
         mem_fraction_static: 0.85

sie-server 0.3.4__tar.gz → 0.4.1__tar.gz

sie-server 0.3.4__tar.gz → 0.4.1__tar.gz