fieldkit-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fieldkit/__init__.py ADDED
@@ -0,0 +1,10 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """fieldkit — verified-on-Spark patterns from the ai-field-notes blog.
+
+ See https://ainative.business/fieldkit/ for module-level docs.
+ """
+
+ from fieldkit._version import __version__
+
+ __all__ = ["__version__"]
fieldkit/_version.py ADDED
@@ -0,0 +1,9 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """Single source of truth for the fieldkit version.
+
+ `pyproject.toml`'s `[tool.hatch.version]` reads `__version__` from this file at
+ build time, so bumping it here is enough to bump the wheel version too.
+ """
+
+ __version__ = "0.1.0"
fieldkit/capabilities/__init__.py ADDED
@@ -0,0 +1,229 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """Typed Python facade over the project's Spark capabilities map.
+
+ The JSON at `data/spark-capabilities.json` is the project's grounding floor for
+ hardware envelope claims (KV-cache math, weight memory, in/out-envelope
+ signals, NIM/NeMo/TRT-LLM stack notes). This module exposes it as:
+
+ - `Capabilities.load()` — singleton typed view of the JSON
+ - `kv_cache_bytes(...)` — canonical KV cache equation from
+   `kv-cache-arithmetic-at-inference`
+ - `weight_bytes(...)` — parameter-bytes lookup from the rules-of-thumb table
+ - `practical_inference_envelope(...)` — string lookup over the envelope table
+
+ Read-only by design. The source-of-truth JSON lives at
+ `scripts/lib/spark-capabilities.json` in the parent repo; the package's copy
+ is kept in sync by `fieldkit/scripts/sync_capabilities.py`.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass, field
+ from importlib.resources import files
+ from typing import Any, ClassVar
+
+ __all__ = [
+     "Capabilities",
+     "Hardware",
+     "MemoryBudgetRulesOfThumb",
+     "StackEntry",
+     "kv_cache_bytes",
+     "weight_bytes",
+     "practical_inference_envelope",
+     "DTYPE_BYTES",
+     "UnknownDtype",
+     "UnknownEnvelope",
+ ]
+
+
+ DTYPE_BYTES: dict[str, float] = {
+     "fp32": 4.0,
+     "bf16": 2.0,
+     "fp16": 2.0,
+     "fp8": 1.0,
+     "int8": 1.0,
+     "int4": 0.5,
+     "nf4": 0.5,
+ }
+
+
+ class UnknownDtype(KeyError):
+     """Raised when a dtype string isn't in `DTYPE_BYTES`."""
+
+
+ class UnknownEnvelope(KeyError):
+     """Raised when `practical_inference_envelope` can't find the requested model size."""
+
+
+ @dataclass(frozen=True, slots=True)
+ class Hardware:
+     name: str
+     unified_memory_gb: int
+     memory_topology: str
+     compute_arch: str
+     supported_dtypes: tuple[str, ...]
+     interconnect_to_other_gpus: str
+
+
+ @dataclass(frozen=True, slots=True)
+ class MemoryBudgetRulesOfThumb:
+     param_bytes: dict[str, float]
+     training_overhead_multiplier: str
+     kv_cache_per_token_per_layer: str
+     practical_inference_envelope: dict[str, str]
+     practical_finetune_envelope: dict[str, str]
+
+
+ @dataclass(frozen=True, slots=True)
+ class StackEntry:
+     id: str
+     label: str
+     purpose: str
+     verified_in_articles: tuple[str, ...] = ()
+     known_limits: tuple[str, ...] = ()
+     fits_paper_shapes: tuple[str, ...] = ()
+     supported_models_at_spark_scale: tuple[str, ...] = ()
+
+
+ @dataclass(frozen=True, slots=True)
+ class Capabilities:
+     """Typed singleton view of `spark-capabilities.json`."""
+
+     schema: str
+     version: str
+     hardware: Hardware
+     memory_budget_rules_of_thumb: MemoryBudgetRulesOfThumb
+     stack: dict[str, StackEntry]
+     out_of_envelope_signals: tuple[str, ...]
+     in_envelope_signals: tuple[str, ...]
+     stage_routing_hints: dict[str, str]
+     series_routing_hints: dict[str, str]
+     raw: dict[str, Any] = field(repr=False)
+
+     _instance: ClassVar["Capabilities | None"] = None
+
+     @classmethod
+     def load(cls, *, refresh: bool = False) -> "Capabilities":
+         """Return the cached singleton; pass `refresh=True` to force a re-read."""
+         if cls._instance is None or refresh:
+             data = json.loads(_data_path().read_text(encoding="utf-8"))
+             cls._instance = cls._from_raw(data)
+         return cls._instance
+
+     @classmethod
+     def _from_raw(cls, raw: dict[str, Any]) -> "Capabilities":
+         hw = raw["hardware"]
+         rt = raw["memory_budget_rules_of_thumb"]
+         stack = {
+             key: StackEntry(
+                 id=entry.get("id", key),
+                 label=entry["label"],
+                 purpose=entry["purpose"],
+                 verified_in_articles=tuple(entry.get("verified_in_articles", [])),
+                 known_limits=tuple(entry.get("known_limits", [])),
+                 fits_paper_shapes=tuple(entry.get("fits_paper_shapes", [])),
+                 supported_models_at_spark_scale=tuple(
+                     entry.get("supported_models_at_spark_scale", [])
+                 ),
+             )
+             for key, entry in raw["stack"].items()
+         }
+         return cls(
+             schema=raw["$schema"],
+             version=raw["version"],
+             hardware=Hardware(
+                 name=hw["name"],
+                 unified_memory_gb=int(hw["unified_memory_gb"]),
+                 memory_topology=hw["memory_topology"],
+                 compute_arch=hw["compute_arch"],
+                 supported_dtypes=tuple(hw["supported_dtypes"]),
+                 interconnect_to_other_gpus=hw["interconnect_to_other_gpus"],
+             ),
+             memory_budget_rules_of_thumb=MemoryBudgetRulesOfThumb(
+                 param_bytes={k: float(v) for k, v in rt["param_bytes"].items()},
+                 training_overhead_multiplier=rt["training_overhead_multiplier"],
+                 kv_cache_per_token_per_layer=rt["kv_cache_per_token_per_layer"],
+                 practical_inference_envelope=dict(rt["practical_inference_envelope"]),
+                 practical_finetune_envelope=dict(rt["practical_finetune_envelope"]),
+             ),
+             stack=stack,
+             out_of_envelope_signals=tuple(raw["out_of_envelope_signals"]),
+             in_envelope_signals=tuple(raw["in_envelope_signals"]),
+             stage_routing_hints=dict(raw["stage_routing_hints"]),
+             series_routing_hints=dict(raw["series_routing_hints"]),
+             raw=raw,
+         )
+
+
+ def _data_path() -> Any:
+     return files("fieldkit.capabilities.data").joinpath("spark-capabilities.json")
+
+
+ def _dtype_bytes(dtype: str) -> float:
+     try:
+         return DTYPE_BYTES[dtype.lower()]
+     except KeyError as exc:
+         raise UnknownDtype(
+             f"unknown dtype {dtype!r}; known: {sorted(DTYPE_BYTES)}"
+         ) from exc
+
+
+ def kv_cache_bytes(
+     *,
+     hidden: int,
+     n_layers: int,
+     ctx: int,
+     batch: int,
+     dtype: str,
+ ) -> int:
+     """KV cache memory in bytes for one decoder, given KV-hidden size and shape.
+
+     Formula (from `kv-cache-arithmetic-at-inference`):
+
+         KV bytes = 2 × n_layers × kv_hidden × ctx × batch × bytes_per_dtype
+
+     The factor of 2 covers K and V (both stored). `hidden` here means the
+     *KV hidden size* — `n_kv_heads × head_dim`. For a non-GQA model that
+     equals the model's hidden size; for Llama 3.1 70B (8 KV heads × 128
+     head_dim) it's 1024, regardless of the 8192-dim model hidden size.
+
+     Returns bytes as an int (rounded down).
+     """
+     if min(hidden, n_layers, ctx, batch) <= 0:
+         raise ValueError("hidden, n_layers, ctx, batch must all be positive")
+     bpd = _dtype_bytes(dtype)
+     return int(2 * n_layers * hidden * ctx * batch * bpd)
+
+
+ def weight_bytes(*, params_b: float, dtype: str) -> int:
+     """Weight memory in bytes for `params_b` billion parameters at `dtype`.
+
+     `params_b` is in billions; `weight_bytes(params_b=70, dtype="bf16")` is
+     70e9 × 2 = 140 GB. Quantization dtypes (fp8, int8, nf4, int4) follow the
+     rules-of-thumb table in `spark-capabilities.json`.
+     """
+     if params_b <= 0:
+         raise ValueError("params_b must be positive (in billions)")
+     bpp = _dtype_bytes(dtype)
+     return int(params_b * 1e9 * bpp)
+
+
+ def practical_inference_envelope(model_size: str) -> str:
+     """Look up the rule-of-thumb envelope string for `model_size`.
+
+     Keys mirror the JSON's `practical_inference_envelope` dict — e.g.
+     `"8B params bf16"`, `"70B params fp8"`, `"405B+ params"`. Lookup is
+     case-insensitive and tolerates surrounding whitespace.
+
+     Raises `UnknownEnvelope` if no rule matches.
+     """
+     table = Capabilities.load().memory_budget_rules_of_thumb.practical_inference_envelope
+     needle = model_size.strip().lower()
+     for key, val in table.items():
+         if key.lower() == needle:
+             return val
+     raise UnknownEnvelope(
+         f"no envelope rule for {model_size!r}; known keys: {list(table)}"
+     )
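
The three helpers compose into a single feasibility check. A quick worked sketch, using only the functions defined above; the numbers follow the Llama 3.1 70B shape from the `kv_cache_bytes` docstring, and the snippet is illustrative rather than part of the wheel:

    from fieldkit.capabilities import kv_cache_bytes, weight_bytes, practical_inference_envelope

    # 70B params at fp8: 70e9 params × 1 byte per param = 70 GB of weights.
    wb = weight_bytes(params_b=70, dtype="fp8")

    # GQA keeps kv_hidden at 8 KV heads × 128 head_dim = 1024, so the KV cache is
    # 2 × 80 layers × 1024 × 8192 ctx × 1 batch × 1 byte ≈ 1.34 GB.
    kvb = kv_cache_bytes(hidden=1024, n_layers=80, ctx=8192, batch=1, dtype="fp8")

    print(f"{(wb + kvb) / 1e9:.1f} GB")                    # 71.3 GB of the 128 GB pool
    print(practical_inference_envelope("70B params fp8"))  # "~70 GB weights; leaves ~50 GB ..."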
fieldkit/capabilities/data/__init__.py ADDED
@@ -0,0 +1,8 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """Package-data namespace for fieldkit.capabilities.
+
+ Holds `spark-capabilities.json` so `importlib.resources` can locate it from
+ both wheel installs and editable installs. Synced from
+ `scripts/lib/spark-capabilities.json` by `fieldkit/scripts/sync_capabilities.py`.
+ """
fieldkit/capabilities/data/spark-capabilities.json ADDED
@@ -0,0 +1,163 @@
+ {
+   "$schema": "spark-capabilities-v1",
+   "version": "2026.05.01",
+   "hardware": {
+     "name": "NVIDIA DGX Spark (GB10 Grace Blackwell)",
+     "unified_memory_gb": 128,
+     "memory_topology": "unified — same 128 GB pool serves CPU and GPU; allocations from one shrink the budget of the other; OOM hangs the whole box (see HANDOFF.md 2026-04-22 incident)",
+     "compute_arch": "Blackwell SM_100",
+     "supported_dtypes": ["bf16", "fp16", "fp8", "int8", "int4 (via NIM/TRT-LLM)"],
+     "interconnect_to_other_gpus": "none — single-node only; multi-GPU requires the cloud envelope (see Looking Beyond Spark series)"
+   },
+   "memory_budget_rules_of_thumb": {
+     "param_bytes": { "fp32": 4, "bf16": 2, "fp16": 2, "fp8": 1, "int8": 1, "int4": 0.5 },
+     "training_overhead_multiplier": "~4× params for full fine-tune (params + grads + optimizer state + activations); ~1.5× for LoRA",
+     "kv_cache_per_token_per_layer": "2 × hidden_size × bytes_per_dtype (one for K, one for V); see kv-cache-arithmetic-at-inference article for the canonical walkthrough",
+     "practical_inference_envelope": {
+       "8B params bf16": "fits with room — ~16 GB weights + KV; 24.8 tok/s measured on NIM",
+       "70B params bf16": "marginal — 140 GB weights alone exceeds the unified pool; requires fp8/int4 quant or offload",
+       "70B params fp8": "~70 GB weights; leaves ~50 GB for KV + activations + system; tight but possible",
+       "70B params int4": "~35 GB weights; comfortable",
+       "405B+ params": "out of envelope at any common quant; belongs in Looking Beyond Spark"
+     },
+     "practical_finetune_envelope": {
+       "LoRA on 8B": "trivially fits; demonstrated across multiple Foundations + Fine-tuning articles",
+       "LoRA on 70B fp8": "borderline; requires careful batch + grad_accum tuning",
+       "Full fine-tune on 8B": "fits with bf16 + grad checkpointing + small batch",
+       "Full fine-tune on 70B": "out of envelope; needs cloud"
+     }
+   },
+   "stack": {
+     "nim": {
+       "id": "nim",
+       "label": "NVIDIA NIM (NVIDIA Inference Microservices)",
+       "purpose": "Containerized inference microservices for popular open weights with TensorRT-LLM optimization built in",
+       "supported_models_at_spark_scale": ["llama-3.1-8b-instruct", "llama-3.3-70b-instruct (fp8/int4)", "nemotron-super-49b", "qwen-2.5-7b-instruct", "nemotron-embed-1b-v2", "nemotron-reranker-1b"],
+       "verified_in_articles": ["nim-first-inference-dgx-spark", "naive-rag-on-spark", "trtllm-and-triton-on-spark"],
+       "known_limits": [
+         "8192-token context window default; chunking math must respect it (see project-spark-nim-context-window memory)",
+         "NIM_GPU_MEM_FRACTION caps the KV reservation; 0.5 default leaves headroom for the OS",
+         "Cold start can take minutes when pulling weights"
+       ],
+       "fits_paper_shapes": ["serving open-weight LLMs", "RAG generators", "reranker pipelines", "embedding endpoints"]
+     },
+     "nemo": {
+       "id": "nemo",
+       "label": "NVIDIA NeMo Framework",
+       "purpose": "Training, fine-tuning, distillation, curation across LLM/multimodal/speech",
+       "verified_in_articles": ["distilling-the-architect", "lora-on-spark", "fine-tune-memory-math"],
+       "known_limits": [
+         "/opt/venv pip trap on NeMo containers — installs go to /usr/local/lib unless you use /opt/venv/bin/python3 -m pip (see feedback memory)",
+         "Multi-node distributed training APIs assume a cluster; on Spark you're capped at single-node configs"
+       ],
+       "fits_paper_shapes": ["LoRA / PEFT", "instruction tuning", "knowledge distillation", "data curation pipelines"]
+     },
+     "tensorrt_llm": {
+       "id": "tensorrt-llm",
+       "label": "NVIDIA TensorRT-LLM",
+       "purpose": "Compiled inference engines with paged KV cache, fp8/int4 kernels, in-flight batching",
+       "verified_in_articles": ["trtllm-and-triton-on-spark", "kv-cache-arithmetic-at-inference"],
+       "known_limits": [
+         "Engine builds are model + dtype + max-context-length specific; rebuilding for new shapes is slow",
+         "fp8 quant requires Hopper+ (Blackwell on Spark satisfies)",
+         "Use `--use_paged_context_fmha` and `--use_fp8_context_fmha` for KV efficiency on Spark"
+       ],
+       "fits_paper_shapes": ["latency-critical inference", "long-context generation", "high-concurrency serving"]
+     },
+     "triton": {
+       "id": "triton",
+       "label": "NVIDIA Triton Inference Server",
+       "purpose": "Multi-model serving with batching, ensembles, custom backends",
+       "verified_in_articles": ["trtllm-and-triton-on-spark"],
+       "known_limits": ["Setup complexity higher than NIM for single-model use cases"],
+       "fits_paper_shapes": ["multi-model pipelines", "ensemble inference", "custom backends"]
+     },
+     "nemo_retriever": {
+       "id": "nemo-retriever",
+       "label": "NVIDIA NeMo Retriever (NIM-served embed + rerank)",
+       "purpose": "Embedding + reranker microservices tuned for retrieval pipelines",
+       "verified_in_articles": ["pgvector-on-spark", "rerank-fusion-retrieval-on-spark", "naive-rag-on-spark"],
+       "known_limits": [
+         "1B reranker is the practical default on Spark; larger rerankers cut into LLM headroom",
+         "pgvector retrieval ~70 ms top-5; reranker ~40 ms per call (measured)"
+       ],
+       "fits_paper_shapes": ["RAG", "dense retrieval", "rerank fusion", "hybrid search"]
+     },
+     "pgvector": {
+       "id": "pgvector",
+       "label": "pgvector (Postgres extension)",
+       "purpose": "Vector search inside Postgres; the project's default RAG store",
+       "verified_in_articles": ["pgvector-on-spark", "naive-rag-on-spark"],
+       "known_limits": ["IVFFlat / HNSW tradeoffs documented; HNSW preferred for low-latency"],
+       "fits_paper_shapes": ["RAG ingestion", "vector search", "hybrid SQL+vector queries"]
+     },
+     "nemoclaw": {
+       "id": "nemoclaw",
+       "label": "NVIDIA NemoClaw (OpenClaw-in-OpenShell sandboxed agents)",
+       "purpose": "Local sandboxed agent stack with Nemotron backing for tool-using agents",
+       "verified_in_articles": ["nemoclaw-vs-openclaw-dgx-spark", "autoresearch-agent-loop"],
+       "known_limits": [
+         "Pinned to Nemotron at ~22 tok/s baseline",
+         "k3s/CoreDNS quirks on Ubuntu 24.04 cgroup v2 (see nemoclaw-guru skill)"
+       ],
+       "fits_paper_shapes": ["tool-using agents", "code-execution agents", "multi-step workflows", "agentic experimentation harnesses"]
+     },
+     "nemo_guardrails": {
+       "id": "nemo-guardrails",
+       "label": "NVIDIA NeMo Guardrails",
+       "purpose": "Programmable safety + topical rails around LLM responses",
+       "verified_in_articles": ["guardrails-on-spark"],
+       "known_limits": [],
+       "fits_paper_shapes": ["safety filtering", "topical refusal", "input/output rails", "jailbreak defense studies"]
+     },
+     "openclaw": {
+       "id": "openclaw",
+       "label": "OpenClaw CLI (Ollama-integrated)",
+       "purpose": "Ollama-backed agent CLI for lightweight tool use",
+       "verified_in_articles": ["nemoclaw-vs-openclaw-dgx-spark"],
+       "known_limits": ["Ollama models stack — second concurrent model can OOM the box"],
+       "fits_paper_shapes": ["lightweight agents", "rapid prototyping"]
+     },
+     "ollama": {
+       "id": "ollama",
+       "label": "Ollama",
+       "purpose": "Local LLM runner; useful for quick model swaps but not the production path",
+       "known_limits": ["Multi-model loading exceeds 128 GB unified pool — hard hang risk (see project-spark-unified-memory-oom memory)"]
+     }
+   },
+   "out_of_envelope_signals": [
+     "Pretraining anything ≥ 1B from scratch (need a cluster, see Looking Beyond Spark)",
+     "Multi-node distributed training of any size",
+     "Mixture-of-Experts at large total-param counts paired with long context",
+     "Anything requiring B200 / H200 / multiple H100s simultaneously",
+     "Training datasets that exceed local NVMe budget"
+   ],
+   "in_envelope_signals": [
+     "Fine-tuning open-weight LLMs ≤ 70B with LoRA / QLoRA",
+     "Inference-time techniques: speculative decoding, prompt caching, structured generation, constrained decoding",
+     "RAG pipelines: chunking, embedding, retrieval, reranking, fusion, hybrid search",
+     "Agentic systems: tool use, multi-step planning, sandboxed execution",
+     "Distillation from a frontier model into a small student",
+     "Quantization studies (fp8, int4, AWQ, GPTQ) on existing weights",
+     "Long-context inference economics (KV cache, paged attention, attention sinks)",
+     "Observability and instrumentation of inference and agent traces"
+   ],
+   "stage_routing_hints": {
+     "foundations": "papers about install / setup / hardware / drivers / day-one onboarding",
+     "training": "pretraining or continued pretraining studies; loss/loss-curve dynamics",
+     "fine-tuning": "LoRA, PEFT, instruction-tuning, distillation",
+     "inference": "serving, throughput, latency, KV cache, paged attention, quant",
+     "deployment": "containerization, services, graceful degradation, multi-tenant",
+     "agentic": "tool use, multi-agent, sandboxes, planning, code execution",
+     "observability": "tracing, profiling, eval pipelines, judge models, telemetry",
+     "dev-tools": "Nsight, CUDA tooling, IDE integrations, debug workflows"
+   },
+   "series_routing_hints": {
+     "Foundations": "shared install / setup substrate; cross-cutting onboarding",
+     "Second Brain": "RAG over the user's own corpus; query-time generation",
+     "LLM Wiki": "compile-time synthesis; LLM maintains a linted knowledge base",
+     "Autoresearch": "autonomous experimentation; agent runs overnight loops, edits, measures, decides",
+     "Looking Beyond Spark": "arithmetic that extrapolates beyond the 128 GB envelope to H100/H200/B200/SuperPOD",
+     "Frontier Scout": "the meta-series — papers + the system that found them"
+   }
+ }
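
The `practical_finetune_envelope` rows follow from `param_bytes` plus `training_overhead_multiplier`. A back-of-envelope check, assuming the stated ~4× full fine-tune and ~1.5× LoRA multipliers apply to weight bytes at the training dtype (the helper below is a hypothetical sketch, not part of fieldkit):

    # Rules of thumb copied from the table above (bytes per parameter).
    PARAM_BYTES = {"fp32": 4, "bf16": 2, "fp16": 2, "fp8": 1, "int8": 1, "int4": 0.5}

    def finetune_gb(params_b: float, dtype: str, lora: bool) -> float:
        """Approximate peak training memory in GB: weights × overhead multiplier."""
        return params_b * PARAM_BYTES[dtype] * (1.5 if lora else 4.0)

    print(finetune_gb(8, "bf16", lora=False))   # 64.0  -> "fits with bf16 + grad checkpointing"
    print(finetune_gb(70, "fp8", lora=True))    # 105.0 -> "borderline" against the 128 GB pool
    print(finetune_gb(70, "bf16", lora=False))  # 560.0 -> "out of envelope; needs cloud"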
fieldkit/__main__.py ADDED
@@ -0,0 +1,255 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """`fieldkit` command-line entry point.
+
+ Wires Typer subcommands to the existing module APIs:
+
+     fieldkit version          — print the installed version
+     fieldkit envelope <size>  — practical inference envelope rule for a model size
+     fieldkit feasibility <id> — quick feasibility view from spark-capabilities.json
+     fieldkit bench rag        — drive Pipeline.ask through Bench against a tiny
+                                 in-memory corpus, print the latency report
+
+ The CLI is intentionally thin — every command is a ~20-line wrapper over the
+ public Python API. For real workloads, import `fieldkit.{capabilities,nim,rag,eval}`
+ directly instead.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import sys
+ from typing import Optional
+
+ import typer
+
+ from fieldkit import __version__
+ from fieldkit.capabilities import (
+     Capabilities,
+     UnknownDtype,
+     UnknownEnvelope,
+     kv_cache_bytes,
+     practical_inference_envelope,
+     weight_bytes,
+ )
+
+ app = typer.Typer(
+     name="fieldkit",
+     help="Verified-on-Spark patterns from the ai-field-notes blog.",
+     no_args_is_help=True,
+     add_completion=False,
+ )
+
+ bench_app = typer.Typer(
+     name="bench",
+     help="Run small benchmarks against Spark services.",
+     no_args_is_help=True,
+ )
+ app.add_typer(bench_app, name="bench")
+
+
+ @app.command("version")
+ def version_cmd() -> None:
+     """Print the installed fieldkit version."""
+     typer.echo(__version__)
+
+
+ @app.command("envelope")
+ def envelope_cmd(
+     size: str = typer.Argument(
+         ...,
+         help='Model size key — e.g. "8B params bf16", "70B params fp8", "405B+ params".',
+     ),
+ ) -> None:
+     """Look up a practical inference envelope rule from spark-capabilities.json."""
+     try:
+         rule = practical_inference_envelope(size)
+     except UnknownEnvelope as exc:
+         typer.echo(f"error: {exc}", err=True)
+         raise typer.Exit(code=2) from exc
+     typer.echo(rule)
+
+
+ @app.command("feasibility")
+ def feasibility_cmd(
+     model_id: str = typer.Argument(
+         ...,
+         help='Model id — e.g. "llama-3.1-8b", "llama-3.1-70b", "100B-bf16".',
+     ),
+     ctx: int = typer.Option(4096, "--ctx", help="Context length in tokens."),
+     batch: int = typer.Option(1, "--batch", help="Concurrency / batch size."),
+     dtype: str = typer.Option(
+         "fp16",
+         "--dtype",
+         help="Weights/KV dtype — fp32, bf16, fp16, fp8, int8, int4, nf4.",
+     ),
+ ) -> None:
+     """Quick weights + KV-cache feasibility view for a known shape.
+
+     Recognises a small built-in catalog of shapes (Llama 3.1 8B/70B, 100B
+     Nemotron-class). Prints weight bytes, KV bytes, and the practical
+     inference envelope string.
+     """
+     shapes: dict[str, dict[str, object]] = {
+         "llama-3.1-8b": {
+             "params_b": 8.0,
+             "kv_hidden": 8 * 128,
+             "n_layers": 32,
+             "envelope_key": "8B params bf16",
+         },
+         "llama-3.1-70b": {
+             "params_b": 70.0,
+             "kv_hidden": 8 * 128,
+             "n_layers": 80,
+             "envelope_key": "70B params fp8",
+         },
+         "100b-bf16": {
+             "params_b": 100.0,
+             "kv_hidden": 8 * 128,
+             "n_layers": 96,
+             "envelope_key": "70B params bf16",
+         },
+     }
+     key = model_id.strip().lower()
+     if key not in shapes:
+         typer.echo(
+             f"error: unknown model id {model_id!r}; known: {sorted(shapes)}",
+             err=True,
+         )
+         raise typer.Exit(code=2)
+     shape = shapes[key]
+     try:
+         wb = weight_bytes(params_b=float(shape["params_b"]), dtype=dtype)
+         kvb = kv_cache_bytes(
+             hidden=int(shape["kv_hidden"]),  # type: ignore[arg-type]
+             n_layers=int(shape["n_layers"]),  # type: ignore[arg-type]
+             ctx=ctx,
+             batch=batch,
+             dtype=dtype,
+         )
+     except UnknownDtype as exc:
+         typer.echo(f"error: {exc}", err=True)
+         raise typer.Exit(code=2) from exc
+
+     caps = Capabilities.load()
+     gb = lambda b: b / 10**9  # noqa: E731
+
+     typer.echo(f"model: {model_id}")
+     typer.echo(f"hardware: {caps.hardware.name} — {caps.hardware.unified_memory_gb} GB unified")
+     typer.echo(f"weights ({dtype}): {gb(wb):>7.1f} GB")
+     typer.echo(f"KV cache ({dtype}): {gb(kvb):>7.1f} GB (ctx={ctx}, batch={batch})")
+     typer.echo(f"weights + KV: {gb(wb + kvb):>7.1f} GB")
+     try:
+         rule = practical_inference_envelope(str(shape["envelope_key"]))
+         typer.echo(f"envelope rule: {rule}")
+     except UnknownEnvelope:
+         pass
+
+
+ @bench_app.command("rag")
+ def bench_rag_cmd(
+     embed_url: str = typer.Option(
+         os.environ.get("EMBED_BASE_URL", "http://localhost:8001/v1"),
+         "--embed-url",
+         envvar="EMBED_BASE_URL",
+     ),
+     nim_url: str = typer.Option(
+         os.environ.get("NIM_BASE_URL", "http://localhost:8000/v1"),
+         "--nim-url",
+         envvar="NIM_BASE_URL",
+     ),
+     nim_model: str = typer.Option(
+         os.environ.get("NIM_MODEL", "meta/llama-3.1-8b-instruct"),
+         "--nim-model",
+         envvar="NIM_MODEL",
+     ),
+     pgvector_dsn: str = typer.Option(
+         os.environ.get(
+             "PGVECTOR_DSN", "postgresql://spark:spark@localhost:5432/vectors"
+         ),
+         "--pgvector-dsn",
+         envvar="PGVECTOR_DSN",
+     ),
+     table: str = typer.Option(
+         "fieldkit_cli_bench_rag", "--table", help="pgvector table to use."
+     ),
+     out: Optional[str] = typer.Option(
+         None, "--out", help="Optional path to dump the bench JSON."
+     ),
+ ) -> None:
+     """Smoke-bench Pipeline.ask against a 3-doc in-memory corpus.
+
+     Requires the chat NIM, embed NIM, and pgvector to be reachable. Prints
+     a markdown latency report and (optionally) writes the full bench JSON
+     to disk.
+     """
+     # Imports are local so `fieldkit version` / `envelope` / `feasibility`
+     # don't pay the httpx / psycopg import cost.
+     from fieldkit.eval import Bench
+     from fieldkit.nim import NIMClient, wait_for_warm
+     from fieldkit.rag import Document, Pipeline
+
+     typer.echo(f"waiting for embed NIM at {embed_url} ...")
+     if not wait_for_warm(embed_url):
+         typer.echo("error: embed NIM not warm in time", err=True)
+         raise typer.Exit(code=1)
+     typer.echo(f"waiting for chat NIM at {nim_url} ...")
+     if not wait_for_warm(nim_url):
+         typer.echo("error: chat NIM not warm in time", err=True)
+         raise typer.Exit(code=1)
+
+     docs = [
+         Document(id=1, label="spark", text=(
+             "The DGX Spark is a personal AI computer with a GB10 Grace-Blackwell "
+             "superchip and 128 GB of unified memory shared between CPU and GPU."
+         )),
+         Document(id=2, label="spark", text=(
+             "Spark's unified memory means a single large model competes with the "
+             "OS and other processes for the same 128 GB pool."
+         )),
+         Document(id=3, label="distractor", text=(
+             "The 2004 Athens Olympics hosted 11099 athletes across 28 sports."
+         )),
+     ]
+     questions = [
+         "How much unified memory does the DGX Spark have?",
+         "What superchip powers the DGX Spark?",
+         "What does Spark's unified memory mean for large models?",
+         "Who won the 2020 US presidential election?",  # out-of-corpus
+     ]
+
+     with NIMClient(base_url=nim_url, model=nim_model) as gen, Pipeline(
+         embed_url=embed_url,
+         pgvector_dsn=pgvector_dsn,
+         generator=gen,
+         table=table,
+         chunk_tokens=400,
+     ) as pipe:
+         pipe.ensure_schema()
+         ingested = pipe.ingest(docs)
+         typer.echo(f"ingested {ingested} chunks into {table}")
+
+         bench = Bench(name="fieldkit-cli-bench-rag", metrics=[])
+         with bench:
+             bench.run(
+                 lambda q: pipe.ask(q, retrieve_k=3, rerank_k=2, max_tokens=96),
+                 questions,
+             )
+         typer.echo("")
+         typer.echo(bench.report())
+
+         if out:
+             from pathlib import Path
+
+             path = bench.dump(Path(out))
+             typer.echo(f"\nwrote {path}")
+
+
+ def main() -> None:
+     """Module-level entry point used by `python -m fieldkit`."""
+     app()
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     sys.exit(app() or 0)
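
For the library-first path the module docstring recommends, the same wiring `bench rag` performs can be done directly. A sketch under the command's own defaults, using only constructor and method signatures exercised above (the table name and corpus text are hypothetical placeholders):

    from fieldkit.nim import NIMClient, wait_for_warm
    from fieldkit.rag import Document, Pipeline

    assert wait_for_warm("http://localhost:8000/v1")  # chat NIM, the --nim-url default

    with NIMClient(
        base_url="http://localhost:8000/v1", model="meta/llama-3.1-8b-instruct"
    ) as gen, Pipeline(
        embed_url="http://localhost:8001/v1",
        pgvector_dsn="postgresql://spark:spark@localhost:5432/vectors",
        generator=gen,
        table="my_notes",  # hypothetical table name
        chunk_tokens=400,
    ) as pipe:
        pipe.ensure_schema()
        pipe.ingest([Document(id=1, label="notes", text="Your corpus text here.")])
        answer = pipe.ask("What do my notes say?", retrieve_k=3, rerank_k=2, max_tokens=96)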