PyPI - sie-server - Versions diffs - 0.4.0__tar.gz → 0.4.2__tar.gz - Mend

sie-server 0.4.0.tar.gz → 0.4.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (441) hide show

{sie_server-0.4.0 → sie_server-0.4.2}/Dockerfile.cpu RENAMED Viewed

@@ -6,10 +6,9 @@
 #   docker buildx build --platform linux/amd64,linux/arm64 -f packages/sie_server/Dockerfile.cpu -t sie-server:cpu .
 ARG BUNDLE=default
-ARG SIE_DEPS_IMAGE=
 # =============================================================================
-# Stage 1: Dependencies (pyproject.toml only, cached across code changes)
+# Dependency image: pyproject-only cache seed
 # =============================================================================
 FROM python:3.12-slim-bookworm AS deps
@@ -58,9 +57,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         -e ".[gpu-metrics]"
 # =============================================================================
-# Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
+# Shared runtime base: source install and venv finalization
 # =============================================================================
-# Bundle-agnostic: all base-stage layers are shared across bundles of this
+# Bundle-agnostic: all base image layers are shared across bundles of this
 # platform in local BuildKit cache and in content-addressed registry layers.
 FROM deps AS base
@@ -108,9 +107,9 @@ RUN set -eux; \
     find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
+# Bundle dependency builder: bundle-specific deps
 # =============================================================================
-FROM base AS bundle_deps
+FROM base AS builder
 ARG BUNDLE
@@ -151,12 +150,7 @@ RUN set -eux; \
     find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 3b: Builder - optional trampoline to a prebuilt base image
-# =============================================================================
-FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
-# =============================================================================
-# Stage 4: Runtime
+# Runtime image
 # =============================================================================
 FROM python:3.12-slim-bookworm AS runtime

{sie_server-0.4.0 → sie_server-0.4.2}/Dockerfile.cuda12 RENAMED Viewed

@@ -1,15 +1,14 @@
 # syntax=docker/dockerfile:1
-# SIE Server - CUDA 12.4 Image
+# SIE Server - CUDA 12 Image
 # Build from repo root:
 #   docker build -f packages/sie_server/Dockerfile.cuda12 -t sie-server:cuda12-default .
 #   docker build -f packages/sie_server/Dockerfile.cuda12 --build-arg BUNDLE=sglang -t sie-server:cuda12-sglang .
 ARG BUNDLE=default
 ARG UV_VERSION=0.9.28
-ARG SIE_DEPS_IMAGE=
 # =============================================================================
-# Stage 1: uv + standalone Python 3.12 (no deadsnakes PPA)
+# Dependency image: uv and standalone Python 3.12
 # =============================================================================
 FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS deps
@@ -60,10 +59,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         -e ".[gpu-metrics]"
 # =============================================================================
-# Stage 2: Base - source install + shared-venv finalization (no BUNDLE)
+# Shared CUDA base: source install and venv finalization
 # =============================================================================
 # Everything here is bundle-agnostic, so bundle-specific builds of a given
-# platform share every base-stage layer in local BuildKit cache and in
+# platform share every base image layer in local BuildKit cache and in
 # content-addressed registry layers.
 FROM deps AS base
@@ -127,9 +126,9 @@ RUN set -eux; \
     find /app/.venv -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 3a: Bundle-deps - bundle-specific deps install (heavy)
+# Bundle dependency builder: bundle-specific deps
 # =============================================================================
-FROM base AS bundle_deps
+FROM base AS builder
 ARG BUNDLE
@@ -174,16 +173,19 @@ RUN set -eux; \
     find /app/bundle-libs -exec touch -h -d @0 {} + 2>/dev/null || true
 # =============================================================================
-# Stage 3b: Builder - optional trampoline to a prebuilt base image
+# Runtime image
 # =============================================================================
-FROM ${SIE_DEPS_IMAGE:-bundle_deps} AS builder
-# =============================================================================
-# Stage 4: Runtime
-# =============================================================================
-# Use base CUDA image (not devel/runtime) — PyTorch wheels bundle CUDA libs,
-# cuDNN ships inside torch. Saves ~2GB vs `runtime` variant.
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime
+# Runtime base selection is bundle-scoped. Most CUDA bundles stay on the
+# smaller CUDA base runtime; SGLang-family bundles need the devel toolkit
+# because flashinfer/tvm_ffi perform runtime JIT through nvcc on first decode.
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-default
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS runtime-transformers5
+FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 AS runtime-sglang
+ENV CUDA_HOME=/usr/local/cuda \
+    LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+FROM runtime-sglang AS runtime-sglang-embedding
+FROM runtime-${BUNDLE} AS runtime
 ENV DEBIAN_FRONTEND=noninteractive

{sie_server-0.4.0 → sie_server-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.4
 Name: sie-server
-Version: 0.4.0
+Version: 0.4.2
 Summary: Search Inference Engine - GPU inference server for search workloads
 License: Apache-2.0
 License-File: LICENSE
 Requires-Python: <3.13,>=3.12
+Requires-Dist: blake3<1,>=0.4
 Requires-Dist: docling<3,>=2
 Requires-Dist: einops<1,>=0.8
 Requires-Dist: fastapi<1,>=0.115
@@ -17,7 +18,6 @@ Requires-Dist: loguru<1,>=0.7
 Requires-Dist: msgpack-numpy<1,>=0.4
 Requires-Dist: msgpack<2,>=1.1
 Requires-Dist: msgspec>=0.20.0
-Requires-Dist: nats-py<3,>=2.9
 Requires-Dist: numpy<3,>=2
 Requires-Dist: open-clip-torch>=2.24
 Requires-Dist: opencv-python-headless<5,>=4
@@ -26,7 +26,7 @@ Requires-Dist: opentelemetry-exporter-otlp<2,>=1.28
 Requires-Dist: opentelemetry-instrumentation-fastapi<1,>=0.49b0
 Requires-Dist: opentelemetry-sdk<2,>=1.28
 Requires-Dist: packaging<25,>=24
-Requires-Dist: pillow<12,>=11
+Requires-Dist: pillow>=12.2.0
 Requires-Dist: prometheus-client<1,>=0.21
 Requires-Dist: pydantic-settings<3,>=2.6
 Requires-Dist: pydantic<3,>=2.9

{sie_server-0.4.0 → sie_server-0.4.2}/README.md RENAMED Viewed

@@ -66,7 +66,7 @@ auto-retries; see `packages/sie_sdk/README.md` for client-side controls.
 | Env var | Default | Effect |
 |--|--|--|
-| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default per ADR-0002 — SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
+| `SIE_GRAMMAR_PREFLIGHT_DEBUG` | unset (off) | Enables the legacy worker-side Outlines preflight compile before each structured-output request. Off by default because SGLang is the production grammar authority. Use for diagnosing schema-rejection problems or slow compiles in a controlled environment; not recommended for production traffic. |
 For nested settings (any field with `__`), the env-var format is
 `SIE_<TOP>__<NESTED>=value`. The complete schema is in

{sie_server-0.4.0 → sie_server-0.4.2}/bundles/default.yaml RENAMED Viewed

@@ -43,6 +43,7 @@ adapters:
 - sie_server.adapters.florence2
 - sie_server.adapters.docling
 - sie_server.adapters.paddleocr_vl
+- sie_server.adapters.mineru_vl
 deps:
   # Most flash adapters; sentence_transformer needs >=4.57
   transformers: '>=4.57,<5'
@@ -78,5 +79,5 @@ deps:
   docling: '>=2,<3'
   # Flash Attention 2 — CUDA only, prebuilt wheel
   flash-attn:
-    url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.0/flash_attn-2.7.4+cu128torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
+    url: https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.11/flash_attn-2.7.4+cu129torch2.9-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
     marker: sys_platform == 'linux'

{sie_server-0.4.0 → sie_server-0.4.2}/bundles/sglang-embedding.yaml RENAMED Viewed

@@ -12,7 +12,7 @@ deps:
   # pip resolution drift on environments that already had a different
   # ``outlines`` installed. Asserted in ``tests/test_bundles.py`` so a
   # future drift fails fast.
-  sglang: '==0.5.10'
+  sglang: '==0.5.10.post1'
   xgrammar: '==0.1.32'
   outlines: '==0.1.11'
   llguidance: '>=0.7.11,<0.8.0'

{sie_server-0.4.0 → sie_server-0.4.2}/bundles/sglang.yaml RENAMED Viewed

@@ -6,29 +6,29 @@ deps:
   # SGLang 0.5.6+ includes all deps at base level (fixed from 0.4.x extras bug)
   # See: https://github.com/sgl-project/sglang/issues/4869
   #
-  # Qwen3.5-4B compatibility — M4 req2 Proj 5:
+  # Qwen3.5-4B + Qwen3.6-27B compatibility:
   #
-  # ``sglang==0.5.10`` is the canonical target for Qwen3.5-4B on the
-  # current L4 / A100-40GB / H100 fleet. Audited against
-  # ``python/pyproject.toml@v0.5.10`` upstream (see
-  # ``product/plans/qwen35-sglang-mtp-structured-outputs-findings.md``):
+  # ``sglang==0.5.10.post1`` is the canonical target for the Qwen3.x hybrid
+  # Gated-DeltaNet + Gated-Attention family on the current
+  # L4 / A100-40GB / H100 fleet. Qwen3.6-27B uses the same ``qwen3_5``
+  # model class shipped in 0.5.10 — the architecture (64 layers, hybrid
+  # Gated DeltaNet + Gated Attention, MTP/NEXTN) is identical, only the
+  # parameter count differs.
   #
-  #   * ships the ``qwen3_5`` model class (``models/qwen3_5.py``, 1724 LOC)
-  #   * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
-  #   * ``sglang-kernel==0.4.1`` wheel covers SM_80 / SM_89 / SM_90 / SM_100
-  #     via gencode (``CMakeLists.txt``: ``ENABLE_BELOW_SM90=ON`` default).
-  #     The runtime loader (``sgl_kernel/load_utils.py``) maps
-  #     compute_capability != 90 → ``sm100/`` subdir, which holds the
-  #     SM_80 / SM_89 / SM_100 build (precise math). H100 (CC=90) gets
-  #     the ``sm90/`` fast-math build.
-  #   * torch==2.9.1 (CUDA 12.8/12.9 wheels); ``cuda-python==12.9``.
-  #     **Not** CUDA 13 — that's an SGLang-main-only path which only
-  #     became relevant when looking at the dev branch.
+  # SGLang 0.5.10 was evaluated against CUDA 12.9 + Qwen3.6-27B on Modal
+  # H100 (2026-05-27): server boots, loads weights, but the bundled
+  # ``sglang/jit_kernel/csrc/elementwise/activation.cuh`` has a C++
+  # template bug (``select_kernel<true>(type)`` is parsed as a class-
+  # template substitution, not a function-template call) that the
+  # stricter ``nvcc`` shipped with CUDA 12.9 rejects at first activation.
+  # 0.5.11 is also dev-only on the sglang docs wheel index — not on
+  # PyPI. Park the 0.5.11 bump until upstream cuts a stable release with
+  # the JIT header fixed; 0.5.10.post1 covers Qwen3.6-27B today.
   #
-  # Compat note: 0.5.12 wheel observed shipping only ``sm100/`` (no SM_80
-  # cubin entry inside) — out of scope; we stay on 0.5.10 until upstream
-  # ships multi-arch binaries again.
-  sglang: '==0.5.10'
+  #   * grammar backends: ``xgrammar`` / ``outlines`` / ``llguidance`` / ``none``
+  #   * ``sgl_kernel`` covers SM_80 / SM_89 / SM_90 / SM_100 via gencode.
+  #   * torch==2.9.1 (CUDA 12.9 wheels); ``cuda-python==12.9``.
+  sglang: '==0.5.10.post1'
   #
   # Grammar backend deps — pinned to exactly what SGLang 0.5.10 imports
   # internally to prevent silent pip resolution drift.
@@ -39,7 +39,7 @@ deps:
   # propagation (PR #20467). Kept available as the fallback backend.
   xgrammar: '==0.1.32'
   #
-  # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10. We
+  # ``outlines==0.1.11`` is a hard transitive dep of sglang==0.5.10.post1. We
   # declare it explicitly at bundle level so the surface is visible.
   # ``outlines-core`` (a separate package) is a transitive of outlines
   # and intentionally NOT pinned here — pinning ``outlines-core`` directly

sie_server-0.4.2/models/Marqo__marqo-fashionSigLIP.yaml ADDED Viewed

@@ -0,0 +1,28 @@
+sie_id: Marqo/marqo-fashionSigLIP
+hf_id: Marqo/marqo-fashionSigLIP
+inputs:
+  text: true
+  image: true
+  audio: false
+  video: false
+tasks:
+  encode:
+    dense:
+      dim: 768
+    sparse: null
+    multivector: null
+  score: null
+  extract: null
+max_sequence_length: 64
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: float16
+    adapter_path: sie_server.adapters.siglip:SiglipAdapter
+    adapter_options:
+      loadtime:
+        backend: open_clip
+        open_clip_model_id: hf-hub:Marqo/marqo-fashionSigLIP
+        dense_dim: 768
+      runtime:
+        normalize: true

{sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-0.6B.yaml RENAMED Viewed

@@ -6,40 +6,39 @@ inputs:
   audio: false
   video: false
 tasks:
-  # Tiny generation model used for fast e2e iteration on Modal L4 (walking-skeleton
-  # viability bench + validation-harness SIE-only smoke). Loads in ~30s. Quality is
-  # bad — this is a transport benchmark target, not a production model.
+  # Small, fast generation model — a viable PROD pick for simple/short-prompt
+  # tasks (cheap + high throughput; weaker on long-context reasoning). Loads in
+  # ~30s. Also doubles as the transport/walking-skeleton benchmark target.
   #
-  # Context / batch sizes are deliberately small (1024 vs the headroom an
-  # L4 could nominally support) so the validation harness can co-resident
-  # the worker's SGLang with a second SGLang for the baseline phase on
-  # a single 22 GiB card without OOMing. Raise these if you ever run 0.6B
-  # alone on a card it doesn't have to share.
+  # ``context_length`` is the standalone PROD serving value (4096): big enough
+  # to fit the full generation benchmark pack (casehold prompts reach ~1.8k
+  # tokens, gpqa ~1.3k) so the model is comparable to the rest of the fleet on
+  # every task, while KV stays trivial at this size (112 KB/token → 4096 ≈
+  # 0.46 GB). The validation/co-residency harness, which packs two SGLang
+  # instances onto a single 22 GiB L4, does NOT depend on this default — it
+  # passes an explicit ``--max-seq-length``/``--context-length`` (see
+  # tools/bench_generation_matrix.py + run_generation_smoke.py) and caps itself
+  # to 1024 for that case.
   #
-  # Note on the three 1024s below: `context_length`, `max_sequence_length`,
-  # and `max_batch_tokens` are NOT redundant — they're three independent
-  # knobs (per-request context, SGLang --context-length, batcher cost
-  # budget) that just happen to collide here because the model is tiny.
-  # See sibling Qwen__Qwen3-4B-Instruct-2507.yaml for the canonical
-  # non-collapsed shape.
+  # ``context_length``, ``max_sequence_length``, and ``max_batch_tokens`` are
+  # three independent knobs (per-request context, SGLang --context-length,
+  # batcher cost budget); see sibling Qwen__Qwen3-4B-Instruct-2507.yaml.
   generate:
-    context_length: 1024
+    context_length: 4096
     max_output_tokens: 1024
     capabilities:
       grammar: []
       streaming: true
       tools: false
-max_sequence_length: 1024
+max_sequence_length: 4096
 # KV-cache memory math (Qwen3-0.6B, bf16):
 #   layers=28, kv_heads=8, head_dim=128, bytes_per_elem=2
 #   kv_bytes_per_token = 2 × 28 × 8 × 128 × 2 = 114,688 B ≈ 112 KB
-# The 0.6B is a transport benchmark target — context_length is held at
-# 1024 deliberately (see header comment) so the validation harness can
-# co-resident two SGLang instances on an L4. KV budgets per profile
-# scale with the deployment scenario rather than the GPU ceiling.
+# At ctx=4096 a single request's KV is ~0.46 GB — negligible. The co-residency
+# harness still caps context explicitly when it has to share a card.
 profiles:
   default:
-    max_batch_tokens: 1024
+    max_batch_tokens: 4096
     compute_precision: bfloat16
     adapter_path: sie_server.adapters.sglang.generation:SGLangGenerationAdapter
     kv_budget_tokens: 8192

{sie_server-0.4.0 → sie_server-0.4.2}/models/Qwen__Qwen3-4B-Instruct-2507.yaml RENAMED Viewed

@@ -66,8 +66,7 @@ max_sequence_length: 32768
 # kv_budget_tokens is set to ~40% of theoretical max, matching the L4
 # baseline ratio (32768/90000 ≈ 36%). The headroom absorbs batch growth,
 # speculative side-cell, grammar/Outlines compile arena, fragmentation.
-# Final empirical validation (concurrency-16 OOM-boundary sweep) is
-# tracked in product/plans/m4-req2-gpu-runbook.md §"#16/#19".
+# Final empirical validation should use concurrency and OOM-boundary sweeps.
 profiles:
   default:
     # max_batch_tokens is a generic engine knob; generation does not batch
@@ -93,7 +92,7 @@ profiles:
           top_p: 0.9
         stop_tokens:
           - "<|im_end|>"
-  # M5 audit #16/#19: analytical defaults for a100-40gb / h100. Production
+  # Analytical defaults for a100-40gb / h100. Production
   # capacity also grows: with 2-4× the KV budget the context window can be
   # widened proportionally so longer-context workloads (RAG with large
   # retrieved passages) fit comfortably. ``max_output_tokens`` doubles

sie-server 0.4.0__tar.gz → 0.4.2__tar.gz

sie-server 0.4.0.tar.gz → 0.4.2.tar.gz