PyPI - sie-server - Versions diffs - 0.6.2__tar.gz → 0.6.4__tar.gz - Mend

sie-server 0.6.2.tar.gz → 0.6.4.tar.gz

This diff shows the differences in content between two publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (443) hide show

{sie_server-0.6.2 → sie_server-0.6.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sie-server
-Version: 0.6.2
+Version: 0.6.4
 Summary: Search Inference Engine - GPU inference server for search workloads
 License: Apache-2.0
 License-File: LICENSE

sie_server-0.6.4/models/MoritzLaurer__ModernBERT-base-zeroshot-v2.0.yaml ADDED Viewed

@@ -0,0 +1,22 @@
+sie_id: MoritzLaurer/ModernBERT-base-zeroshot-v2.0
+hf_id: MoritzLaurer/ModernBERT-base-zeroshot-v2.0
+inputs:
+  text: true
+  image: false
+  audio: false
+  video: false
+tasks:
+  encode: null
+  score: null
+  extract: {}
+max_sequence_length: 512
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: null
+    adapter_path: sie_server.adapters.nli_classification_flash:NLIClassificationFlashAdapter
+    adapter_options:
+      loadtime: {}
+      runtime:
+        hypothesis_template: This text is about {}.
+        multi_label: false

sie_server-0.6.4/models/facebook__bart-large-mnli.yaml ADDED Viewed

@@ -0,0 +1,22 @@
+sie_id: facebook/bart-large-mnli
+hf_id: facebook/bart-large-mnli
+inputs:
+  text: true
+  image: false
+  audio: false
+  video: false
+tasks:
+  encode: null
+  score: null
+  extract: {}
+max_sequence_length: 512
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: null
+    adapter_path: sie_server.adapters.nli_classification_flash:NLIClassificationFlashAdapter
+    adapter_options:
+      loadtime: {}
+      runtime:
+        hypothesis_template: This text is about {}.
+        multi_label: false

sie_server-0.6.4/models/fastino__gliner2-large-v1.yaml ADDED Viewed

@@ -0,0 +1,20 @@
+sie_id: fastino/gliner2-large-v1
+hf_id: fastino/gliner2-large-v1
+inputs:
+  text: true
+  image: false
+  audio: false
+  video: false
+tasks:
+  encode: null
+  score: null
+  extract: {}
+max_sequence_length: 512
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: float16
+    adapter_path: sie_server.adapters.gliner2:GLiNER2Adapter
+    adapter_options:
+      loadtime: {}
+      runtime: {}

sie_server-0.6.4/models/google__owlv2-large-patch14-ensemble.yaml ADDED Viewed

@@ -0,0 +1,21 @@
+sie_id: google/owlv2-large-patch14-ensemble
+hf_id: google/owlv2-large-patch14-ensemble
+inputs:
+  text: false
+  image: true
+  audio: false
+  video: false
+tasks:
+  encode: null
+  score: null
+  extract: {}
+profiles:
+  default:
+    max_batch_tokens: 16384
+    compute_precision: float16
+    adapter_path: sie_server.adapters.owlv2:Owlv2Adapter
+    adapter_options:
+      loadtime:
+        score_threshold: 0.1
+      runtime:
+        score_threshold: 0.1

{sie_server-0.6.2 → sie_server-0.6.4}/openapi.json RENAMED Viewed

@@ -3,7 +3,7 @@
   "info": {
     "title": "SIE Server",
     "description": "Search Inference Engine - GPU inference server for search workloads",
-    "version": "0.6.2"
+    "version": "0.6.4"
   },
   "paths": {
     "/": {

{sie_server-0.6.2 → sie_server-0.6.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "sie-server"
-version = "0.6.2"
+version = "0.6.4"
 description = "Search Inference Engine - GPU inference server for search workloads"
 requires-python = ">=3.12,<3.13"
 license = { text = "Apache-2.0" }

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/_ipc_test_harness.py RENAMED Viewed

@@ -136,7 +136,7 @@ class _StubExecutor:
     async def process_extract_batch(self, req: ProcessExtractBatchRequest) -> BatchOutcome:
         await self._maybe_sleep()
-        return _canned_batch_outcome(req.items)
+        return _canned_extract_batch_outcome(req.items)
 class _FakeGenerateProcessor:
@@ -179,6 +179,51 @@ def _canned_batch_outcome(items: list[Any]) -> BatchOutcome:
     )
+def _extract_document_echo(item: Any) -> dict[str, Any]:
+    document = item.item.get("document") if isinstance(item.item, dict) else None
+    if not isinstance(document, dict):
+        return {
+            "present": False,
+            "data_is_bytes": False,
+            "data": b"",
+            "data_len": 0,
+            "format": None,
+        }
+    data = document.get("data")
+    data_is_bytes = isinstance(data, bytes | bytearray)
+    data_bytes = bytes(data) if data_is_bytes else b""
+    return {
+        "present": True,
+        "data_is_bytes": data_is_bytes,
+        "data": data_bytes,
+        "data_len": len(data_bytes),
+        "format": document.get("format"),
+    }
+def _canned_extract_batch_outcome(items: list[Any]) -> BatchOutcome:
+    outcomes: list[ItemOutcome] = []
+    for item in items:
+        payload = msgpack.packb(
+            {**_CANNED_RESULT, "extract_document": _extract_document_echo(item)},
+            use_bin_type=True,
+        )
+        outcomes.append(
+            ItemOutcome(
+                work_item_id=item.work_item_id,
+                request_id=item.request_id,
+                item_index=item.item_index,
+                disposition="publish_and_ack",
+                result_msgpack=payload,
+                inference_ms=0.1,
+                tokenization_ms=0.05,
+                postprocessing_ms=0.01,
+            )
+        )
+    return BatchOutcome(outcomes=outcomes)
 def _canned_batch_outcome_echoing_prepared_tokens(items: list[Any]) -> BatchOutcome:
     """Like :func:`_canned_batch_outcome` but folds each item's
     ``prepared_tokens`` presence / content into the per-item

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/adapters/_generation_base.py RENAMED Viewed

@@ -23,6 +23,7 @@ from typing import Any, ClassVar, Literal, cast
 from sie_server.adapters._spec import AdapterSpec
 from sie_server.adapters.base import ModelAdapter, ModelCapabilities, ModelDims
+from sie_server.types.inputs import ImageInput
 logger = logging.getLogger(__name__)
@@ -247,6 +248,7 @@ class GenerationAdapter(ModelAdapter):
         logit_bias: dict[str, float] | None = None,
         logprobs: bool = False,
         top_logprobs: int | None = None,
+        images: list[ImageInput] | None = None,
     ) -> AsyncIterator[GenerationChunk]:
         """Stream generation chunks from a prompt.
@@ -279,6 +281,13 @@ class GenerationAdapter(ModelAdapter):
                 with per-token log-probabilities.
             top_logprobs: How many alternates per position; only
                 consulted when ``logprobs`` is True.
+            images: Optional list of wire-format :class:`ImageInput`
+                entries for vision-language models. The ``prompt`` is
+                expected to already carry the model's image placeholder
+                tokens (rendered by the chat template upstream); the
+                adapter forwards the image bytes to the engine. ``None``
+                or empty for text-only generation. Text-only adapters may
+                ignore this argument.
         Yields:
             :class:`GenerationChunk` instances. At least one terminal

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/adapters/nemo_colembed/__init__.py RENAMED Viewed

@@ -346,6 +346,31 @@ class NemoColEmbedAdapter(BaseAdapter):
             num_image_token,
         )
+    def get_preprocessor(self) -> Any:
+        """Register BOTH a text and an image preprocessor for v1 (#1163).
+        v1 documents must take the conformant ``_encode_images_preprocessed`` path,
+        which requires an *image* preprocessor to be registered so the encode pipeline
+        produces a ``NemoColEmbedPayload`` (with ``pixel_values``) instead of a
+        passthrough ``ImagePayload``. Without it every doc batch falls back to the
+        model's ``forward_passages`` — which re-tiles each page inline on one thread,
+        ~3x slower than running the tiling upstream in the preprocessing thread pool.
+        But v1 *queries* (text) still go through ``model.forward_queries`` and rely on
+        the batched worker path; registering only the image preprocessor de-registers
+        the text one and routes queries to the unbatched direct-call path, which has
+        surfaced ``forward_queries`` failures. So we register both: the base
+        ``CharCountPreprocessor`` (text → worker-batched queries) and the
+        ``NemoColEmbedPreprocessor`` (image → conformant docs). ``model_loader``
+        registers each entry of the returned list by its ``modality``.
+        v2 (Qwen3-VL backbone) builds no ``_processor`` (``None``); it keeps just the
+        base text preprocessor and its native ``forward_images`` path (with #1055 fix).
+        """
+        if self._processor is None:
+            return super().get_preprocessor()
+        return [super().get_preprocessor(), self._processor]
     def encode(
         self,
         items: list[Item],
@@ -606,14 +631,25 @@ class NemoColEmbedAdapter(BaseAdapter):
             if self._normalize:
                 embeddings = functional.normalize(embeddings, p=2, dim=-1)
-            # Store results for this sub-batch (move to CPU immediately to free GPU memory)
+            # Store results for this sub-batch (move to CPU immediately to free GPU
+            # memory). Trim each item's left-padding rows before returning: the batch is
+            # left-padded, so padded positions are zeroed by the attention_mask above —
+            # but emitting them as zero vectors leaks 0-similarity rows into the late-
+            # interaction MaxSim (a 0-floor on every query token's max). Because the
+            # batcher pads inconsistently across docs, identical docs then score
+            # differently by batch and ranking is corrupted on variable-tile batches
+            # (#1163: Vidore3 Hr 0.6532 -> 0.5713). Keep only real tokens, matching the
+            # native forward_passages path (_unpack_embeddings drops zero rows likewise).
             for i in range(len(sub_batch_items)):
-                emb = embeddings[i].float().cpu().numpy()
+                keep = batch["attention_mask"][i].bool()
+                emb = embeddings[i][keep].float().cpu().numpy()
                 all_embeddings.append(emb)
-            # Clear GPU memory between sub-batches
-            del outputs, embeddings, batch
-            torch.cuda.empty_cache()
+            # Free this sub-batch's GPU tensors. NOTE: no per-sub-batch
+            # torch.cuda.empty_cache() — repeatedly releasing the allocator's cache and
+            # re-acquiring ~GB blocks fragments the pool and OOMs at scale on big GPUs
+            # (#1163). The sub-batch loop + immediate CPU offload already bound peak VRAM.
+            del outputs, embeddings, attention_mask, batch
         return EncodeOutput(
             multivector=all_embeddings,

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/adapters/owlv2/__init__.py RENAMED Viewed

@@ -189,7 +189,7 @@ class Owlv2Adapter(BaseAdapter):
         # Extract options once
         opts = options or {}
-        score_threshold = opts.get("score_threshold", self._score_threshold)
+        score_threshold = opts.get("score_threshold", opts.get("threshold", self._score_threshold))
         # Build text queries once (shared across batch)
         # OWL-v2 format: list of prompts per image

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/adapters/sglang/generation.py RENAMED Viewed

@@ -19,6 +19,7 @@ HTTP connection, which SGLang treats as a cancel signal. A best-effort
 from __future__ import annotations
 import asyncio
+import base64
 import contextlib
 import dataclasses
 import json
@@ -47,6 +48,7 @@ from sie_server.adapters._types import ERR_NOT_LOADED, ComputePrecision
 from sie_server.adapters.sglang import _server
 from sie_server.observability.metrics import GenerationStreamTimer
 from sie_server.types.grammar import GrammarSpec
+from sie_server.types.inputs import ImageInput, media_bytes
 logger = logging.getLogger(__name__)
@@ -91,6 +93,47 @@ def _resolve_read_timeout() -> float | None:
 _GENERATE_READ_TIMEOUT_S: float | None = _resolve_read_timeout()
+# Format hints we re-embed into the SGLang ``image_data`` MIME type. Anything
+# else falls back to ``jpeg`` (the engine sniffs the real format from bytes).
+_ALLOWED_IMAGE_FORMATS = frozenset({"png", "jpeg", "jpg", "webp", "gif"})
+def _encode_image_data(images: list[ImageInput] | None) -> list[str] | None:
+    """Translate wire ``ImageInput`` entries into SGLang ``image_data`` URIs.
+    SGLang's ``/generate`` accepts a top-level ``image_data`` field — a list of
+    images, each as a base64 string, an ``http(s)`` URL, or a local file path.
+    We emit ``data:image/<fmt>;base64,<...>`` data URIs so the format hint
+    travels with the bytes and SGLang's image loader can decode without
+    sniffing. Bytes are validated through :func:`media_bytes`, the single
+    enforcement point for the wire contract (raises :class:`InvalidMediaError`
+    on a non-bytes ``data``, e.g. an un-decoded base64 JSON string).
+    Returns ``None`` when there are no images so the request body stays
+    byte-identical to the text-only path — vision plumbing is inert for the
+    text-only models that share this adapter.
+    """
+    if not images:
+        return None
+    encoded: list[str] = []
+    for image in images:
+        raw = media_bytes(image, kind="image")
+        fmt = (image.get("format") or "jpeg").strip().lower() or "jpeg"
+        # Clamp the client-controlled format hint to a known set before
+        # re-embedding it in the data-URI MIME type — an arbitrary subtype
+        # would produce a malformed URI for SGLang's loader. The engine
+        # sniffs the real format from the bytes regardless, so an unknown
+        # hint safely falls back to jpeg.
+        if fmt not in _ALLOWED_IMAGE_FORMATS:
+            fmt = "jpeg"
+        elif fmt == "jpg":
+            # ``image/jpg`` is not a registered MIME type; normalise to jpeg.
+            fmt = "jpeg"
+        b64 = base64.b64encode(raw).decode("ascii")
+        encoded.append(f"data:image/{fmt};base64,{b64}")
+    return encoded
 def _tail_file(path: str, *, max_lines: int = 200) -> str:
     """Return the final lines from a startup log for diagnostics."""
     try:
@@ -764,9 +807,17 @@ class SGLangGenerationAdapter(GenerationAdapter):
         best_of: int | None = None,
         stream: bool = False,
         lora_path: str | None = None,
+        images: list[ImageInput] | None = None,
     ) -> AsyncIterator[GenerationChunk]:
         self._check_loaded()
+        # Vision input: encode any images into SGLang's top-level ``image_data``
+        # field once, then attach to whichever request body we build below. The
+        # ``prompt`` is expected to already carry the model's image placeholder
+        # tokens (the chat template renders them worker-side). ``None`` when
+        # there are no images, keeping the text-only request body unchanged.
+        image_data = _encode_image_data(images)
         # Guard verdict thresholding only runs on the single-candidate (n=1)
         # path, so reject multi-candidate sampling up front — otherwise a guard
         # request with n>1 / best_of>1 would silently return an UN-thresholded
@@ -892,6 +943,8 @@ class SGLangGenerationAdapter(GenerationAdapter):
             }
             if lora_path:
                 sbody["lora_path"] = lora_path
+            if image_data:
+                sbody["image_data"] = image_data
             if logprobs:
                 sbody["return_logprob"] = True
                 # Without this SGLang omits the decoded token TEXT from
@@ -1023,6 +1076,8 @@ class SGLangGenerationAdapter(GenerationAdapter):
             nbody: dict[str, Any] = {"text": prompt, "sampling_params": sp, "stream": False}
             if lora_path:
                 nbody["lora_path"] = lora_path
+            if image_data:
+                nbody["image_data"] = image_data
             if logprobs or rank:
                 nbody["return_logprob"] = True
                 # Surface decoded token text (see streaming body below) so the
@@ -1125,6 +1180,8 @@ class SGLangGenerationAdapter(GenerationAdapter):
         # verified on L4). Empirically applies the adapter in-batch per request.
         if lora_path:
             body["lora_path"] = lora_path
+        if image_data:
+            body["image_data"] = image_data
         # OpenAI ``logprobs`` → SGLang ``return_logprob`` (top-level body
         # flag, not under sampling_params). ``top_logprobs`` →
         # ``top_logprobs_num``. SGLang surfaces them under

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/app/app_state_config.py RENAMED Viewed

@@ -20,7 +20,7 @@ class AppStateConfig:
     """
     models_dir: Path | str | None = None
-    """Path to models directory (local path, s3://, or gs://). If None, registry starts empty."""
+    """Path to models directory (local path, s3://, gs://, abfs://, or abfss://). If None, registry starts empty."""
     device: str = "cpu"
     """Device to load models on (e.g., "cuda:0", "cpu", "mps")."""

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/cli.py RENAMED Viewed

@@ -220,12 +220,18 @@ def serve(
     host: str = typer.Option("0.0.0.0", "--host", help="Host to bind to"),  # noqa: S104 — intentional bind to all interfaces for server
     device: str = typer.Option("auto", "--device", "-d", help="Device to use (auto, cuda, mps, cpu)"),
     models_dir: str = typer.Option(
-        DEFAULT_MODELS_DIR, "--models-dir", help="Models directory (local path, s3://, or gs://)"
+        DEFAULT_MODELS_DIR,
+        "--models-dir",
+        help="Models directory (local path, s3://, gs://, abfs://, or abfss://)",
     ),
     bundle: str | None = typer.Option(None, "--bundle", "-b", help="Bundle name to load (from bundles/ dir)"),
     models: str | None = typer.Option(None, "--models", "-m", help="Comma-separated model names to load"),
     local_cache: str | None = typer.Option(None, "--local-cache", help="Local cache directory (default: HF_HOME)"),
-    cluster_cache: str | None = typer.Option(None, "--cluster-cache", help="Cluster cache URL (s3:// or gs://)"),
+    cluster_cache: str | None = typer.Option(
+        None,
+        "--cluster-cache",
+        help="Cluster cache URL (s3://, gs://, abfs://, or abfss://)",
+    ),
     hf_fallback: bool = typer.Option(True, "--hf-fallback/--no-hf-fallback", help="Enable HuggingFace Hub fallback"),
     reload: bool = typer.Option(default=False, help="Enable auto-reload for development"),
     tracing: bool = typer.Option(default=False, help="Enable OpenTelemetry tracing (exports to localhost:4317)"),

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/core/loader.py RENAMED Viewed

@@ -48,7 +48,7 @@ def load_model_configs(models_dir: Path | str) -> dict[str, ModelConfig]:
     """Load all model configs from a directory (local or cloud).
     Args:
-        models_dir: Path to the models directory (local path, s3://, or gs://).
+        models_dir: Path to the models directory (local path, s3://, gs://, abfs://, or abfss://).
     Returns:
         Dictionary mapping model names to their ModelConfig objects.
@@ -141,7 +141,7 @@ def _expand_profile_variants(configs: dict[str, ModelConfig]) -> None:
 def _load_configs_from_cloud(models_dir: str) -> dict[str, ModelConfig]:
-    """Load model configs from S3/GCS.
+    """Load model configs from cloud object storage.
     Discovers YAML files via LIST operation, downloads them to local cache, and parses them.
     Model configs are flat YAML files (e.g., gs://bucket/models/BAAI__bge-m3.yaml).

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/core/model_loader.py RENAMED Viewed

@@ -561,11 +561,16 @@ class ModelLoader:
         Returns:
             LoadedModel containing the loaded state.
         """
-        # Get preprocessor from adapter - all adapters implement get_preprocessor()
-        preprocessor = adapter.get_preprocessor()
-        # Register the preprocessor based on its modality
-        if preprocessor is not None:
+        # Get preprocessor(s) from adapter - all adapters implement get_preprocessor().
+        # Most return a single preprocessor; multi-modal adapters (e.g. NemoColEmbed v1,
+        # which needs a text preprocessor for queries AND an image preprocessor for
+        # documents) may return a list. Register each by its modality.
+        preprocessors = adapter.get_preprocessor()
+        if not isinstance(preprocessors, list):
+            preprocessors = [preprocessors]
+        for preprocessor in preprocessors:
+            if preprocessor is None:
+                continue
             modality = getattr(preprocessor, "modality", None)
             if modality == "text":
                 self._preprocessor_registry._register(name, preprocessor)

{sie_server-0.6.2 → sie_server-0.6.4}/src/sie_server/core/registry.py RENAMED Viewed

@@ -82,7 +82,7 @@ class ModelRegistry:
         """Initialize the registry.
         Args:
-            models_dir: Path to models directory (local path, s3://, or gs://).
+            models_dir: Path to models directory (local path, s3://, gs://, abfs://, or abfss://).
                        If None, registry starts empty and configs must be added manually.
             memory_config: Configuration for memory management. If None, uses defaults.
             drain_timeout_s: Timeout in seconds to wait for worker drain before unload.
@@ -1396,7 +1396,7 @@ class ModelRegistry:
             logger.debug("No models_dir, skipping hot reload")
             return
-        # Don't watch cloud URLs (s3://, gs://)
+        # Don't watch cloud URLs (s3://, gs://, abfs(s)://)
         if is_cloud_path(self._models_dir):
             logger.debug("Cloud models_dir, skipping hot reload (not supported)")
             return

sie-server 0.6.2__tar.gz → 0.6.4__tar.gz

sie-server 0.6.2.tar.gz → 0.6.4.tar.gz