fieldkit-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fieldkit/__init__.py ADDED
@@ -0,0 +1,10 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """fieldkit — verified-on-Spark patterns from the ai-field-notes blog.
+
+ See https://ainative.business/fieldkit/ for module-level docs.
+ """
+
+ from fieldkit._version import __version__
+
+ __all__ = ["__version__"]
fieldkit/_version.py ADDED
@@ -0,0 +1,9 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """Single source of truth for the fieldkit version.
+
+ `pyproject.toml`'s `[tool.hatch.version]` reads `__version__` from this file at
+ build time, so bumping it here is enough to bump the wheel version too.
+ """
+
+ __version__ = "0.1.0"
fieldkit/capabilities/__init__.py ADDED
@@ -0,0 +1,229 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """Typed Python facade over the project's Spark capabilities map.
+
+ The JSON at `data/spark-capabilities.json` is the project's grounding floor for
+ hardware envelope claims (KV-cache math, weight memory, in/out-envelope
+ signals, NIM/NeMo/TRT-LLM stack notes). This module exposes it as:
+
+ - `Capabilities.load()` — singleton typed view of the JSON
+ - `kv_cache_bytes(...)` — canonical KV cache equation from
+   `kv-cache-arithmetic-at-inference`
+ - `weight_bytes(...)` — parameter-bytes lookup from the rules-of-thumb table
+ - `practical_inference_envelope(...)` — string lookup over the envelope table
+
+ Read-only by design. The source-of-truth JSON lives at
+ `scripts/lib/spark-capabilities.json` in the parent repo; the package's copy
+ is kept in sync by `fieldkit/scripts/sync_capabilities.py`.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass, field
+ from importlib.resources import files
+ from typing import Any, ClassVar
+
+ __all__ = [
+     "Capabilities",
+     "Hardware",
+     "MemoryBudgetRulesOfThumb",
+     "StackEntry",
+     "kv_cache_bytes",
+     "weight_bytes",
+     "practical_inference_envelope",
+     "DTYPE_BYTES",
+     "UnknownDtype",
+     "UnknownEnvelope",
+ ]
+
+
+ DTYPE_BYTES: dict[str, float] = {
+     "fp32": 4.0,
+     "bf16": 2.0,
+     "fp16": 2.0,
+     "fp8": 1.0,
+     "int8": 1.0,
+     "int4": 0.5,
+     "nf4": 0.5,
+ }
+
+
+ class UnknownDtype(KeyError):
+     """Raised when a dtype string isn't in `DTYPE_BYTES`."""
+
+
+ class UnknownEnvelope(KeyError):
+     """Raised when `practical_inference_envelope` can't find the requested model size."""
+
+
+ @dataclass(frozen=True, slots=True)
+ class Hardware:
+     name: str
+     unified_memory_gb: int
+     memory_topology: str
+     compute_arch: str
+     supported_dtypes: tuple[str, ...]
+     interconnect_to_other_gpus: str
+
+
+ @dataclass(frozen=True, slots=True)
+ class MemoryBudgetRulesOfThumb:
+     param_bytes: dict[str, float]
+     training_overhead_multiplier: str
+     kv_cache_per_token_per_layer: str
+     practical_inference_envelope: dict[str, str]
+     practical_finetune_envelope: dict[str, str]
+
+
+ @dataclass(frozen=True, slots=True)
+ class StackEntry:
+     id: str
+     label: str
+     purpose: str
+     verified_in_articles: tuple[str, ...] = ()
+     known_limits: tuple[str, ...] = ()
+     fits_paper_shapes: tuple[str, ...] = ()
+     supported_models_at_spark_scale: tuple[str, ...] = ()
+
+
+ @dataclass(frozen=True, slots=True)
+ class Capabilities:
+     """Typed singleton view of `spark-capabilities.json`."""
+
+     schema: str
+     version: str
+     hardware: Hardware
+     memory_budget_rules_of_thumb: MemoryBudgetRulesOfThumb
+     stack: dict[str, StackEntry]
+     out_of_envelope_signals: tuple[str, ...]
+     in_envelope_signals: tuple[str, ...]
+     stage_routing_hints: dict[str, str]
+     series_routing_hints: dict[str, str]
+     raw: dict[str, Any] = field(repr=False)
+
+     _instance: ClassVar["Capabilities | None"] = None
+
+     @classmethod
+     def load(cls, *, refresh: bool = False) -> "Capabilities":
+         """Return the cached singleton; pass `refresh=True` to force a re-read."""
+         if cls._instance is None or refresh:
+             data = json.loads(_data_path().read_text(encoding="utf-8"))
+             cls._instance = cls._from_raw(data)
+         return cls._instance
+
+     @classmethod
+     def _from_raw(cls, raw: dict[str, Any]) -> "Capabilities":
+         hw = raw["hardware"]
+         rt = raw["memory_budget_rules_of_thumb"]
+         stack = {
+             key: StackEntry(
+                 id=entry.get("id", key),
+                 label=entry["label"],
+                 purpose=entry["purpose"],
+                 verified_in_articles=tuple(entry.get("verified_in_articles", [])),
+                 known_limits=tuple(entry.get("known_limits", [])),
+                 fits_paper_shapes=tuple(entry.get("fits_paper_shapes", [])),
+                 supported_models_at_spark_scale=tuple(
+                     entry.get("supported_models_at_spark_scale", [])
+                 ),
+             )
+             for key, entry in raw["stack"].items()
+         }
+         return cls(
+             schema=raw["$schema"],
+             version=raw["version"],
+             hardware=Hardware(
+                 name=hw["name"],
+                 unified_memory_gb=int(hw["unified_memory_gb"]),
+                 memory_topology=hw["memory_topology"],
+                 compute_arch=hw["compute_arch"],
+                 supported_dtypes=tuple(hw["supported_dtypes"]),
+                 interconnect_to_other_gpus=hw["interconnect_to_other_gpus"],
+             ),
+             memory_budget_rules_of_thumb=MemoryBudgetRulesOfThumb(
+                 param_bytes={k: float(v) for k, v in rt["param_bytes"].items()},
+                 training_overhead_multiplier=rt["training_overhead_multiplier"],
+                 kv_cache_per_token_per_layer=rt["kv_cache_per_token_per_layer"],
+                 practical_inference_envelope=dict(rt["practical_inference_envelope"]),
+                 practical_finetune_envelope=dict(rt["practical_finetune_envelope"]),
+             ),
+             stack=stack,
+             out_of_envelope_signals=tuple(raw["out_of_envelope_signals"]),
+             in_envelope_signals=tuple(raw["in_envelope_signals"]),
+             stage_routing_hints=dict(raw["stage_routing_hints"]),
+             series_routing_hints=dict(raw["series_routing_hints"]),
+             raw=raw,
+         )
+
+
+ def _data_path() -> Any:
+     return files("fieldkit.capabilities.data").joinpath("spark-capabilities.json")
+
+
+ def _dtype_bytes(dtype: str) -> float:
+     try:
+         return DTYPE_BYTES[dtype.lower()]
+     except KeyError as exc:
+         raise UnknownDtype(
+             f"unknown dtype {dtype!r}; known: {sorted(DTYPE_BYTES)}"
+         ) from exc
+
+
+ def kv_cache_bytes(
+     *,
+     hidden: int,
+     n_layers: int,
+     ctx: int,
+     batch: int,
+     dtype: str,
+ ) -> int:
+     """KV cache memory in bytes for one decoder, given KV-hidden size and shape.
+
+     Formula (from `kv-cache-arithmetic-at-inference`):
+
+         KV bytes = 2 × n_layers × kv_hidden × ctx × batch × bytes_per_dtype
+
+     The factor of 2 covers K and V (both stored). `hidden` here means the
+     *KV hidden size* — `n_kv_heads × head_dim`. For a non-GQA model that
+     equals the model's hidden size; for Llama 3.1 70B (8 KV heads × 128
+     head_dim) it's 1024, regardless of the 8192-dim model hidden size.
+
+     Returns bytes as an int (rounded down).
+     """
+     if min(hidden, n_layers, ctx, batch) <= 0:
+         raise ValueError("hidden, n_layers, ctx, batch must all be positive")
+     bpd = _dtype_bytes(dtype)
+     return int(2 * n_layers * hidden * ctx * batch * bpd)
+
+
+ def weight_bytes(*, params_b: float, dtype: str) -> int:
+     """Weight memory in bytes for `params_b` billion parameters at `dtype`.
+
+     `params_b` is in billions; `weight_bytes(params_b=70, dtype="bf16")` is
+     70e9 × 2 = 140 GB. Quantization dtypes (fp8, int8, nf4, int4) follow the
+     rules-of-thumb table in `spark-capabilities.json`.
+     """
+     if params_b <= 0:
+         raise ValueError("params_b must be positive (in billions)")
+     bpp = _dtype_bytes(dtype)
+     return int(params_b * 1e9 * bpp)
+
+
+ def practical_inference_envelope(model_size: str) -> str:
+     """Look up the rule-of-thumb envelope string for `model_size`.
+
+     Keys mirror the JSON's `practical_inference_envelope` dict — e.g.
+     `"8B params bf16"`, `"70B params fp8"`, `"405B+ params"`. Lookup is
+     case-insensitive and tolerates surrounding whitespace.
+
+     Raises `UnknownEnvelope` if no rule matches.
+     """
+     table = Capabilities.load().memory_budget_rules_of_thumb.practical_inference_envelope
+     needle = model_size.strip().lower()
+     for key, val in table.items():
+         if key.lower() == needle:
+             return val
+     raise UnknownEnvelope(
+         f"no envelope rule for {model_size!r}; known keys: {list(table)}"
+     )
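
The three helpers compose into a single feasibility check. A quick worked sketch, using only the functions defined above; the numbers follow the Llama 3.1 70B shape from the `kv_cache_bytes` docstring, and the snippet is illustrative rather than part of the wheel:

    from fieldkit.capabilities import kv_cache_bytes, weight_bytes, practical_inference_envelope

    # 70B params at fp8: 70e9 params × 1 byte per param = 70 GB of weights.
    wb = weight_bytes(params_b=70, dtype="fp8")

    # GQA keeps kv_hidden at 8 KV heads × 128 head_dim = 1024, so the KV cache is
    # 2 × 80 layers × 1024 × 8192 ctx × 1 batch × 1 byte ≈ 1.34 GB.
    kvb = kv_cache_bytes(hidden=1024, n_layers=80, ctx=8192, batch=1, dtype="fp8")

    print(f"{(wb + kvb) / 1e9:.1f} GB")                    # 71.3 GB of the 128 GB pool
    print(practical_inference_envelope("70B params fp8"))  # "~70 GB weights; leaves ~50 GB ..."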
fieldkit/capabilities/data/__init__.py ADDED
@@ -0,0 +1,8 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """Package-data namespace for fieldkit.capabilities.
+
+ Holds `spark-capabilities.json` so `importlib.resources` can locate it from
+ both wheel installs and editable installs. Synced from
+ `scripts/lib/spark-capabilities.json` by `fieldkit/scripts/sync_capabilities.py`.
+ """
fieldkit/capabilities/data/spark-capabilities.json ADDED
@@ -0,0 +1,163 @@
+ {
+   "$schema": "spark-capabilities-v1",
+   "version": "2026.05.01",
+   "hardware": {
+     "name": "NVIDIA DGX Spark (GB10 Grace Blackwell)",
+     "unified_memory_gb": 128,
+     "memory_topology": "unified — same 128 GB pool serves CPU and GPU; allocations from one shrink the budget of the other; OOM hangs the whole box (see HANDOFF.md 2026-04-22 incident)",
+     "compute_arch": "Blackwell SM_100",
+     "supported_dtypes": ["bf16", "fp16", "fp8", "int8", "int4 (via NIM/TRT-LLM)"],
+     "interconnect_to_other_gpus": "none — single-node only; multi-GPU requires the cloud envelope (see Looking Beyond Spark series)"
+   },
+   "memory_budget_rules_of_thumb": {
+     "param_bytes": { "fp32": 4, "bf16": 2, "fp16": 2, "fp8": 1, "int8": 1, "int4": 0.5 },
+     "training_overhead_multiplier": "~4× params for full fine-tune (params + grads + optimizer state + activations); ~1.5× for LoRA",
+     "kv_cache_per_token_per_layer": "2 × hidden_size × bytes_per_dtype (one for K, one for V); see kv-cache-arithmetic-at-inference article for the canonical walkthrough",
+     "practical_inference_envelope": {
+       "8B params bf16": "fits with room — ~16 GB weights + KV; 24.8 tok/s measured on NIM",
+       "70B params bf16": "marginal — 140 GB weights alone exceeds the unified pool; requires fp8/int4 quant or offload",
+       "70B params fp8": "~70 GB weights; leaves ~50 GB for KV + activations + system; tight but possible",
+       "70B params int4": "~35 GB weights; comfortable",
+       "405B+ params": "out of envelope at any common quant; belongs in Looking Beyond Spark"
+     },
+     "practical_finetune_envelope": {
+       "LoRA on 8B": "trivially fits; demonstrated across multiple Foundations + Fine-tuning articles",
+       "LoRA on 70B fp8": "borderline; requires careful batch + grad_accum tuning",
+       "Full fine-tune on 8B": "fits with bf16 + grad checkpointing + small batch",
+       "Full fine-tune on 70B": "out of envelope; needs cloud"
+     }
+   },
+   "stack": {
+     "nim": {
+       "id": "nim",
+       "label": "NVIDIA NIM (NVIDIA Inference Microservices)",
+       "purpose": "Containerized inference microservices for popular open weights with TensorRT-LLM optimization built in",
+       "supported_models_at_spark_scale": ["llama-3.1-8b-instruct", "llama-3.3-70b-instruct (fp8/int4)", "nemotron-super-49b", "qwen-2.5-7b-instruct", "nemotron-embed-1b-v2", "nemotron-reranker-1b"],
+       "verified_in_articles": ["nim-first-inference-dgx-spark", "naive-rag-on-spark", "trtllm-and-triton-on-spark"],
+       "known_limits": [
+         "8192-token context window default; chunking math must respect it (see project-spark-nim-context-window memory)",
+         "NIM_GPU_MEM_FRACTION caps the KV reservation; 0.5 default leaves headroom for the OS",
+         "Cold start can take minutes when pulling weights"
+       ],
+       "fits_paper_shapes": ["serving open-weight LLMs", "RAG generators", "reranker pipelines", "embedding endpoints"]
+     },
+     "nemo": {
+       "id": "nemo",
+       "label": "NVIDIA NeMo Framework",
+       "purpose": "Training, fine-tuning, distillation, curation across LLM/multimodal/speech",
+       "verified_in_articles": ["distilling-the-architect", "lora-on-spark", "fine-tune-memory-math"],
+       "known_limits": [
+         "/opt/venv pip trap on NeMo containers — installs go to /usr/local/lib unless you use /opt/venv/bin/python3 -m pip (see feedback memory)",
+         "Multi-node distributed training APIs assume a cluster; on Spark you're capped at single-node configs"
+       ],
+       "fits_paper_shapes": ["LoRA / PEFT", "instruction tuning", "knowledge distillation", "data curation pipelines"]
+     },
+     "tensorrt_llm": {
+       "id": "tensorrt-llm",
+       "label": "NVIDIA TensorRT-LLM",
+       "purpose": "Compiled inference engines with paged KV cache, fp8/int4 kernels, in-flight batching",
+       "verified_in_articles": ["trtllm-and-triton-on-spark", "kv-cache-arithmetic-at-inference"],
+       "known_limits": [
+         "Engine builds are model + dtype + max-context-length specific; rebuilding for new shapes is slow",
+         "fp8 quant requires Hopper+ (Blackwell on Spark satisfies)",
+         "Use `--use_paged_context_fmha` and `--use_fp8_context_fmha` for KV efficiency on Spark"
+       ],
+       "fits_paper_shapes": ["latency-critical inference", "long-context generation", "high-concurrency serving"]
+     },
+     "triton": {
+       "id": "triton",
+       "label": "NVIDIA Triton Inference Server",
+       "purpose": "Multi-model serving with batching, ensembles, custom backends",
+       "verified_in_articles": ["trtllm-and-triton-on-spark"],
+       "known_limits": ["Setup complexity higher than NIM for single-model use cases"],
+       "fits_paper_shapes": ["multi-model pipelines", "ensemble inference", "custom backends"]
+     },
+     "nemo_retriever": {
+       "id": "nemo-retriever",
+       "label": "NVIDIA NeMo Retriever (NIM-served embed + rerank)",
+       "purpose": "Embedding + reranker microservices tuned for retrieval pipelines",
+       "verified_in_articles": ["pgvector-on-spark", "rerank-fusion-retrieval-on-spark", "naive-rag-on-spark"],
+       "known_limits": [
+         "1B reranker is the practical default on Spark; larger rerankers cut into LLM headroom",
+         "pgvector retrieval ~70 ms top-5; reranker ~40 ms per call (measured)"
+       ],
+       "fits_paper_shapes": ["RAG", "dense retrieval", "rerank fusion", "hybrid search"]
+     },
+     "pgvector": {
+       "id": "pgvector",
+       "label": "pgvector (Postgres extension)",
+       "purpose": "Vector search inside Postgres; the project's default RAG store",
+       "verified_in_articles": ["pgvector-on-spark", "naive-rag-on-spark"],
+       "known_limits": ["IVFFlat / HNSW tradeoffs documented; HNSW preferred for low-latency"],
+       "fits_paper_shapes": ["RAG ingestion", "vector search", "hybrid SQL+vector queries"]
+     },
+     "nemoclaw": {
+       "id": "nemoclaw",
+       "label": "NVIDIA NemoClaw (OpenClaw-in-OpenShell sandboxed agents)",
+       "purpose": "Local sandboxed agent stack with Nemotron backing for tool-using agents",
+       "verified_in_articles": ["nemoclaw-vs-openclaw-dgx-spark", "autoresearch-agent-loop"],
+       "known_limits": [
+         "Pinned to Nemotron at ~22 tok/s baseline",
+         "k3s/CoreDNS quirks on Ubuntu 24.04 cgroup v2 (see nemoclaw-guru skill)"
+       ],
+       "fits_paper_shapes": ["tool-using agents", "code-execution agents", "multi-step workflows", "agentic experimentation harnesses"]
+     },
+     "nemo_guardrails": {
+       "id": "nemo-guardrails",
+       "label": "NVIDIA NeMo Guardrails",
+       "purpose": "Programmable safety + topical rails around LLM responses",
+       "verified_in_articles": ["guardrails-on-spark"],
+       "known_limits": [],
+       "fits_paper_shapes": ["safety filtering", "topical refusal", "input/output rails", "jailbreak defense studies"]
+     },
+     "openclaw": {
+       "id": "openclaw",
+       "label": "OpenClaw CLI (Ollama-integrated)",
+       "purpose": "Ollama-backed agent CLI for lightweight tool use",
+       "verified_in_articles": ["nemoclaw-vs-openclaw-dgx-spark"],
+       "known_limits": ["Ollama models stack — second concurrent model can OOM the box"],
+       "fits_paper_shapes": ["lightweight agents", "rapid prototyping"]
+     },
+     "ollama": {
+       "id": "ollama",
+       "label": "Ollama",
+       "purpose": "Local LLM runner; useful for quick model swaps but not the production path",
+       "known_limits": ["Multi-model loading exceeds 128 GB unified pool — hard hang risk (see project-spark-unified-memory-oom memory)"]
+     }
+   },
+   "out_of_envelope_signals": [
+     "Pretraining anything ≥ 1B from scratch (need a cluster, see Looking Beyond Spark)",
+     "Multi-node distributed training of any size",
+     "Mixture-of-Experts at large total-param counts paired with long context",
+     "Anything requiring B200 / H200 / multiple H100s simultaneously",
+     "Training datasets that exceed local NVMe budget"
+   ],
+   "in_envelope_signals": [
+     "Fine-tuning open-weight LLMs ≤ 70B with LoRA / QLoRA",
+     "Inference-time techniques: speculative decoding, prompt caching, structured generation, constrained decoding",
+     "RAG pipelines: chunking, embedding, retrieval, reranking, fusion, hybrid search",
+     "Agentic systems: tool use, multi-step planning, sandboxed execution",
+     "Distillation from a frontier model into a small student",
+     "Quantization studies (fp8, int4, AWQ, GPTQ) on existing weights",
+     "Long-context inference economics (KV cache, paged attention, attention sinks)",
+     "Observability and instrumentation of inference and agent traces"
+   ],
+   "stage_routing_hints": {
+     "foundations": "papers about install / setup / hardware / drivers / day-one onboarding",
+     "training": "pretraining or continued pretraining studies; loss/loss-curve dynamics",
+     "fine-tuning": "LoRA, PEFT, instruction-tuning, distillation",
+     "inference": "serving, throughput, latency, KV cache, paged attention, quant",
+     "deployment": "containerization, services, graceful degradation, multi-tenant",
+     "agentic": "tool use, multi-agent, sandboxes, planning, code execution",
+     "observability": "tracing, profiling, eval pipelines, judge models, telemetry",
+     "dev-tools": "Nsight, CUDA tooling, IDE integrations, debug workflows"
+   },
+   "series_routing_hints": {
+     "Foundations": "shared install / setup substrate; cross-cutting onboarding",
+     "Second Brain": "RAG over the user's own corpus; query-time generation",
+     "LLM Wiki": "compile-time synthesis; LLM maintains a linted knowledge base",
+     "Autoresearch": "autonomous experimentation; agent runs overnight loops, edits, measures, decides",
+     "Looking Beyond Spark": "arithmetic that extrapolates beyond the 128 GB envelope to H100/H200/B200/SuperPOD",
+     "Frontier Scout": "the meta-series — papers + the system that found them"
+   }
+ }
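
The `practical_finetune_envelope` rows follow from `param_bytes` plus `training_overhead_multiplier`. A back-of-envelope check, assuming the stated ~4× full fine-tune and ~1.5× LoRA multipliers apply to weight bytes at the training dtype (the helper below is a hypothetical sketch, not part of fieldkit):

    # Rules of thumb copied from the table above (bytes per parameter).
    PARAM_BYTES = {"fp32": 4, "bf16": 2, "fp16": 2, "fp8": 1, "int8": 1, "int4": 0.5}

    def finetune_gb(params_b: float, dtype: str, lora: bool) -> float:
        """Approximate peak training memory in GB: weights × overhead multiplier."""
        return params_b * PARAM_BYTES[dtype] * (1.5 if lora else 4.0)

    print(finetune_gb(8, "bf16", lora=False))   # 64.0  -> "fits with bf16 + grad checkpointing"
    print(finetune_gb(70, "fp8", lora=True))    # 105.0 -> "borderline" against the 128 GB pool
    print(finetune_gb(70, "bf16", lora=False))  # 560.0 -> "out of envelope; needs cloud"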
fieldkit/__main__.py ADDED
@@ -0,0 +1,255 @@
+ # Copyright 2026 Manav Sehgal
+ # SPDX-License-Identifier: Apache-2.0
+ """`fieldkit` command-line entry point.
+
+ Wires Typer subcommands to the existing module APIs:
+
+     fieldkit version          — print the installed version
+     fieldkit envelope <size>  — practical inference envelope rule for a model size
+     fieldkit feasibility <id> — quick feasibility view from spark-capabilities.json
+     fieldkit bench rag        — drive Pipeline.ask through Bench against a tiny
+                                 in-memory corpus, print the latency report
+
+ The CLI is intentionally thin — every command is a ~20-line wrapper over the
+ public Python API. For real workloads, import `fieldkit.{capabilities,nim,rag,eval}`
+ directly instead.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import sys
+ from typing import Optional
+
+ import typer
+
+ from fieldkit import __version__
+ from fieldkit.capabilities import (
+     Capabilities,
+     UnknownDtype,
+     UnknownEnvelope,
+     kv_cache_bytes,
+     practical_inference_envelope,
+     weight_bytes,
+ )
+
+ app = typer.Typer(
+     name="fieldkit",
+     help="Verified-on-Spark patterns from the ai-field-notes blog.",
+     no_args_is_help=True,
+     add_completion=False,
+ )
+
+ bench_app = typer.Typer(
+     name="bench",
+     help="Run small benchmarks against Spark services.",
+     no_args_is_help=True,
+ )
+ app.add_typer(bench_app, name="bench")
+
+
+ @app.command("version")
+ def version_cmd() -> None:
+     """Print the installed fieldkit version."""
+     typer.echo(__version__)
+
+
+ @app.command("envelope")
+ def envelope_cmd(
+     size: str = typer.Argument(
+         ...,
+         help='Model size key — e.g. "8B params bf16", "70B params fp8", "405B+ params".',
+     ),
+ ) -> None:
+     """Look up a practical inference envelope rule from spark-capabilities.json."""
+     try:
+         rule = practical_inference_envelope(size)
+     except UnknownEnvelope as exc:
+         typer.echo(f"error: {exc}", err=True)
+         raise typer.Exit(code=2) from exc
+     typer.echo(rule)
+
+
+ @app.command("feasibility")
+ def feasibility_cmd(
+     model_id: str = typer.Argument(
+         ...,
+         help='Model id — e.g. "llama-3.1-8b", "llama-3.1-70b", "100B-bf16".',
+     ),
+     ctx: int = typer.Option(4096, "--ctx", help="Context length in tokens."),
+     batch: int = typer.Option(1, "--batch", help="Concurrency / batch size."),
+     dtype: str = typer.Option(
+         "fp16",
+         "--dtype",
+         help="Weights/KV dtype — fp32, bf16, fp16, fp8, int8, int4, nf4.",
+     ),
+ ) -> None:
+     """Quick weights + KV-cache feasibility view for a known shape.
+
+     Recognises a small built-in catalog of shapes (Llama 3.1 8B/70B, 100B
+     Nemotron-class). Prints weight bytes, KV bytes, and the practical
+     inference envelope string.
+     """
+     shapes: dict[str, dict[str, object]] = {
+         "llama-3.1-8b": {
+             "params_b": 8.0,
+             "kv_hidden": 8 * 128,
+             "n_layers": 32,
+             "envelope_key": "8B params bf16",
+         },
+         "llama-3.1-70b": {
+             "params_b": 70.0,
+             "kv_hidden": 8 * 128,
+             "n_layers": 80,
+             "envelope_key": "70B params fp8",
+         },
+         "100b-bf16": {
+             "params_b": 100.0,
+             "kv_hidden": 8 * 128,
+             "n_layers": 96,
+             "envelope_key": "70B params bf16",
+         },
+     }
+     key = model_id.strip().lower()
+     if key not in shapes:
+         typer.echo(
+             f"error: unknown model id {model_id!r}; known: {sorted(shapes)}",
+             err=True,
+         )
+         raise typer.Exit(code=2)
+     shape = shapes[key]
+     try:
+         wb = weight_bytes(params_b=float(shape["params_b"]), dtype=dtype)
+         kvb = kv_cache_bytes(
+             hidden=int(shape["kv_hidden"]),  # type: ignore[arg-type]
+             n_layers=int(shape["n_layers"]),  # type: ignore[arg-type]
+             ctx=ctx,
+             batch=batch,
+             dtype=dtype,
+         )
+     except UnknownDtype as exc:
+         typer.echo(f"error: {exc}", err=True)
+         raise typer.Exit(code=2) from exc
+
+     caps = Capabilities.load()
+     gb = lambda b: b / 10**9  # noqa: E731
+
+     typer.echo(f"model: {model_id}")
+     typer.echo(f"hardware: {caps.hardware.name} — {caps.hardware.unified_memory_gb} GB unified")
+     typer.echo(f"weights ({dtype}): {gb(wb):>7.1f} GB")
+     typer.echo(f"KV cache ({dtype}): {gb(kvb):>7.1f} GB (ctx={ctx}, batch={batch})")
+     typer.echo(f"weights + KV: {gb(wb + kvb):>7.1f} GB")
+     try:
+         rule = practical_inference_envelope(str(shape["envelope_key"]))
+         typer.echo(f"envelope rule: {rule}")
+     except UnknownEnvelope:
+         pass
+
+
+ @bench_app.command("rag")
+ def bench_rag_cmd(
+     embed_url: str = typer.Option(
+         os.environ.get("EMBED_BASE_URL", "http://localhost:8001/v1"),
+         "--embed-url",
+         envvar="EMBED_BASE_URL",
+     ),
+     nim_url: str = typer.Option(
+         os.environ.get("NIM_BASE_URL", "http://localhost:8000/v1"),
+         "--nim-url",
+         envvar="NIM_BASE_URL",
+     ),
+     nim_model: str = typer.Option(
+         os.environ.get("NIM_MODEL", "meta/llama-3.1-8b-instruct"),
+         "--nim-model",
+         envvar="NIM_MODEL",
+     ),
+     pgvector_dsn: str = typer.Option(
+         os.environ.get(
+             "PGVECTOR_DSN", "postgresql://spark:spark@localhost:5432/vectors"
+         ),
+         "--pgvector-dsn",
+         envvar="PGVECTOR_DSN",
+     ),
+     table: str = typer.Option(
+         "fieldkit_cli_bench_rag", "--table", help="pgvector table to use."
+     ),
+     out: Optional[str] = typer.Option(
+         None, "--out", help="Optional path to dump the bench JSON."
+     ),
+ ) -> None:
+     """Smoke-bench Pipeline.ask against a 3-doc in-memory corpus.
+
+     Requires the chat NIM, embed NIM, and pgvector to be reachable. Prints
+     a markdown latency report and (optionally) writes the full bench JSON
+     to disk.
+     """
+     # Imports are local so `fieldkit version` / `envelope` / `feasibility`
+     # don't pay the httpx / psycopg import cost.
+     from fieldkit.eval import Bench
+     from fieldkit.nim import NIMClient, wait_for_warm
+     from fieldkit.rag import Document, Pipeline
+
+     typer.echo(f"waiting for embed NIM at {embed_url} ...")
+     if not wait_for_warm(embed_url):
+         typer.echo("error: embed NIM not warm in time", err=True)
+         raise typer.Exit(code=1)
+     typer.echo(f"waiting for chat NIM at {nim_url} ...")
+     if not wait_for_warm(nim_url):
+         typer.echo("error: chat NIM not warm in time", err=True)
+         raise typer.Exit(code=1)
+
+     docs = [
+         Document(id=1, label="spark", text=(
+             "The DGX Spark is a personal AI computer with a GB10 Grace-Blackwell "
+             "superchip and 128 GB of unified memory shared between CPU and GPU."
+         )),
+         Document(id=2, label="spark", text=(
+             "Spark's unified memory means a single large model competes with the "
+             "OS and other processes for the same 128 GB pool."
+         )),
+         Document(id=3, label="distractor", text=(
+             "The 2004 Athens Olympics hosted 11099 athletes across 28 sports."
+         )),
+     ]
+     questions = [
+         "How much unified memory does the DGX Spark have?",
+         "What superchip powers the DGX Spark?",
+         "What does Spark's unified memory mean for large models?",
+         "Who won the 2020 US presidential election?",  # out-of-corpus
+     ]
+
+     with NIMClient(base_url=nim_url, model=nim_model) as gen, Pipeline(
+         embed_url=embed_url,
+         pgvector_dsn=pgvector_dsn,
+         generator=gen,
+         table=table,
+         chunk_tokens=400,
+     ) as pipe:
+         pipe.ensure_schema()
+         ingested = pipe.ingest(docs)
+         typer.echo(f"ingested {ingested} chunks into {table}")
+
+         bench = Bench(name="fieldkit-cli-bench-rag", metrics=[])
+         with bench:
+             bench.run(
+                 lambda q: pipe.ask(q, retrieve_k=3, rerank_k=2, max_tokens=96),
+                 questions,
+             )
+         typer.echo("")
+         typer.echo(bench.report())
+
+         if out:
+             from pathlib import Path
+
+             path = bench.dump(Path(out))
+             typer.echo(f"\nwrote {path}")
+
+
+ def main() -> None:
+     """Module-level entry point used by `python -m fieldkit`."""
+     app()
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     sys.exit(app() or 0)
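
For the library-first path the module docstring recommends, the same wiring `bench rag` performs can be done directly. A sketch under the command's own defaults, using only constructor and method signatures exercised above (the table name and corpus text are hypothetical placeholders):

    from fieldkit.nim import NIMClient, wait_for_warm
    from fieldkit.rag import Document, Pipeline

    assert wait_for_warm("http://localhost:8000/v1")  # chat NIM, the --nim-url default

    with NIMClient(
        base_url="http://localhost:8000/v1", model="meta/llama-3.1-8b-instruct"
    ) as gen, Pipeline(
        embed_url="http://localhost:8001/v1",
        pgvector_dsn="postgresql://spark:spark@localhost:5432/vectors",
        generator=gen,
        table="my_notes",  # hypothetical table name
        chunk_tokens=400,
    ) as pipe:
        pipe.ensure_schema()
        pipe.ingest([Document(id=1, label="notes", text="Your corpus text here.")])
        answer = pipe.ask("What do my notes say?", retrieve_k=3, rerank_k=2, max_tokens=96)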