chunkshop 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. chunkshop/__init__.py +5 -0
  2. chunkshop/bakeoff/__init__.py +25 -0
  3. chunkshop/bakeoff/config.py +115 -0
  4. chunkshop/bakeoff/gold.py +40 -0
  5. chunkshop/bakeoff/keys.py +53 -0
  6. chunkshop/bakeoff/output.py +159 -0
  7. chunkshop/bakeoff/runner.py +228 -0
  8. chunkshop/bakeoff/score.py +41 -0
  9. chunkshop/chunkers/__init__.py +86 -0
  10. chunkshop/chunkers/_sentence_split.py +56 -0
  11. chunkshop/chunkers/_splitting.py +111 -0
  12. chunkshop/chunkers/_summarizer.py +73 -0
  13. chunkshop/chunkers/base.py +18 -0
  14. chunkshop/chunkers/fixed_overlap.py +35 -0
  15. chunkshop/chunkers/hierarchical_summary.py +118 -0
  16. chunkshop/chunkers/hierarchy.py +84 -0
  17. chunkshop/chunkers/neighbor_expand.py +28 -0
  18. chunkshop/chunkers/semantic.py +192 -0
  19. chunkshop/chunkers/sentence_aware.py +71 -0
  20. chunkshop/chunkers/summary_embed.py +33 -0
  21. chunkshop/cli.py +241 -0
  22. chunkshop/config.py +458 -0
  23. chunkshop/embedders/__init__.py +22 -0
  24. chunkshop/embedders/_registry.py +169 -0
  25. chunkshop/embedders/base.py +8 -0
  26. chunkshop/embedders/fastembed_provider.py +73 -0
  27. chunkshop/extractors/__init__.py +36 -0
  28. chunkshop/extractors/base.py +8 -0
  29. chunkshop/extractors/composite.py +44 -0
  30. chunkshop/extractors/keybert_phrases.py +38 -0
  31. chunkshop/extractors/lang_detect.py +51 -0
  32. chunkshop/extractors/none_provider.py +10 -0
  33. chunkshop/extractors/rake_keywords.py +29 -0
  34. chunkshop/extractors/result.py +19 -0
  35. chunkshop/extractors/spacy_entities.py +54 -0
  36. chunkshop/framers/__init__.py +34 -0
  37. chunkshop/framers/base.py +16 -0
  38. chunkshop/framers/heading_boundary.py +60 -0
  39. chunkshop/framers/identity.py +14 -0
  40. chunkshop/framers/jsonpath.py +66 -0
  41. chunkshop/framers/regex_boundary.py +52 -0
  42. chunkshop/orchestrator.py +138 -0
  43. chunkshop/pipeline.py +130 -0
  44. chunkshop/runner.py +139 -0
  45. chunkshop/sink.py +306 -0
  46. chunkshop/sources/__init__.py +39 -0
  47. chunkshop/sources/base.py +15 -0
  48. chunkshop/sources/files.py +33 -0
  49. chunkshop/sources/http.py +104 -0
  50. chunkshop/sources/json_corpus.py +26 -0
  51. chunkshop/sources/pg_table.py +75 -0
  52. chunkshop/sources/s3.py +65 -0
  53. chunkshop/summarizers/__init__.py +11 -0
  54. chunkshop/summarizers/lede.py +29 -0
  55. chunkshop/summarizers/sumy.py +67 -0
  56. chunkshop-0.3.0.dist-info/METADATA +426 -0
  57. chunkshop-0.3.0.dist-info/RECORD +60 -0
  58. chunkshop-0.3.0.dist-info/WHEEL +5 -0
  59. chunkshop-0.3.0.dist-info/entry_points.txt +2 -0
  60. chunkshop-0.3.0.dist-info/top_level.txt +1 -0
chunkshop/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Reusable ingestion tool: source -> chunker -> embedder -> extractor -> pgvector table."""
2
+ from chunkshop.config import CellConfig, load_config
3
+ from chunkshop.pipeline import Pipeline
4
+
5
+ __all__ = ["CellConfig", "load_config", "Pipeline"]
@@ -0,0 +1,25 @@
1
+ """chunkshop bakeoff: `chunkshop bakeoff` CLI subcommand + library surface.
2
+
3
+ Promotes the `scripts/bench_matrix.py` hacking tool into a user-facing,
4
+ config-driven chunker x embedder matrix evaluation that emits a leaderboard +
5
+ a runnable `recommended.yaml` cell.
6
+ """
7
+ from chunkshop.bakeoff.config import (
8
+ BakeoffConfig,
9
+ BakeoffResults,
10
+ BakeoffTargetConfig,
11
+ ComboResult,
12
+ GoldQuery,
13
+ MatrixConfig,
14
+ ScoringConfig,
15
+ )
16
+
17
+ __all__ = [
18
+ "BakeoffConfig",
19
+ "BakeoffResults",
20
+ "BakeoffTargetConfig",
21
+ "ComboResult",
22
+ "GoldQuery",
23
+ "MatrixConfig",
24
+ "ScoringConfig",
25
+ ]
@@ -0,0 +1,115 @@
1
+ """Pydantic config models for `chunkshop bakeoff` runs (SC-002, SC-003).
2
+
3
+ One `BakeoffConfig` = one matrix evaluation: a corpus, a set of gold queries,
4
+ and a cross-product of chunkers x embedders to rank with recall@k + MRR.
5
+ Mirrors the existing `chunkshop.config.CellConfig` conventions (pydantic v2,
6
+ `extra="forbid"`, discriminated unions) so typos in YAML fail at config-load,
7
+ not at runtime.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Optional, Union
12
+
13
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
14
+
15
+ from chunkshop.config import (
16
+ ChunkerConfig,
17
+ FastembedEmbedder,
18
+ FramerConfig,
19
+ IdentityFramerConfig,
20
+ RuntimeConfig,
21
+ SourceConfig,
22
+ )
23
+
24
+
25
class _Base(BaseModel):
    """Common parent for every bakeoff model declared in this module."""

    # `extra="forbid"` makes unknown keys a validation error, so a typo in the
    # YAML fails when the config is loaded rather than later at runtime.
    model_config = ConfigDict(extra="forbid")
27
+
28
+
29
class GoldQuery(_Base):
    """One user-authored evaluation query. Doc-level gold only for MVP."""

    # Free-text query; the runner embeds it and searches each combo table.
    query: str
    # The doc_id that should be retrieved; scoring compares it for equality
    # against the doc_ids returned by the top-K lookup.
    gold_doc_id: str
34
+
35
+
36
class MatrixConfig(_Base):
    """Cross-product axes. N embedders x M chunkers = N*M combos."""

    # Both axes must contain at least one entry (`min_length=1`), so the
    # cross-product is never empty for a validated config.
    embedders: list[FastembedEmbedder] = Field(..., min_length=1)
    chunkers: list[ChunkerConfig] = Field(..., min_length=1)
41
+
42
+
43
class BakeoffTargetConfig(_Base):
    """Where bakeoff tables land. One table per combo under `schema`."""

    # Name of the environment variable holding the Postgres DSN (the DSN
    # itself is never stored in the config).
    dsn_env: str
    # `schema` is a pydantic reserved name; expose as `schema_name` in Python,
    # accept `schema` in YAML via alias. Matches `TargetConfig`'s treatment.
    schema_name: str = Field(alias="schema")
50
+
51
+
52
class ScoringConfig(_Base):
    """Retrieval-metric config. `k` controls recall cutoffs; `top_k` is pgvector LIMIT.

    Recall cutoffs are evaluated on the `top_k` rows that were retrieved, so a
    cutoff larger than `top_k` can never see more than `top_k` results.
    """

    # Recall@k cutoffs; normalised to a sorted, de-duplicated list on load.
    k: list[int] = [1, 3, 5]
    # NOTE(review): not read by the bakeoff code visible here — confirm usage.
    include_mrr: bool = True
    # Passed straight to SQL `LIMIT`, so it must be strictly positive. Checked
    # here so a bad value fails at config-load instead of at query time.
    top_k: int = Field(default=5, gt=0)

    @field_validator("k")
    @classmethod
    def _ks_positive(cls, v: list[int]) -> list[int]:
        """Reject empty or non-positive cutoffs; return them sorted and unique."""
        if not v or any(k <= 0 for k in v):
            raise ValueError("scoring.k must be a non-empty list of positive ints")
        return sorted(set(v))
65
+
66
+
67
class BakeoffConfig(_Base):
    """Top-level bakeoff run config. One YAML = one factorial run."""

    # Run name; used as the prefix of per-combo cell names and in the outputs.
    name: str
    # Corpus to ingest; shared unchanged by every combo.
    source: SourceConfig
    # Framer applied before chunking; defaults to the identity framer config.
    framer: FramerConfig = Field(default_factory=IdentityFramerConfig)
    gold_queries: Union[str, list[GoldQuery]]  # path to YAML/JSON file OR inline list
    # Chunker x embedder axes to evaluate.
    matrix: MatrixConfig
    # DSN env var + schema that receive one table per combo.
    target: BakeoffTargetConfig
    scoring: ScoringConfig = Field(default_factory=ScoringConfig)
    # NOTE(review): not read by the runner/output code visible here —
    # presumably consumed by the CLI to choose where reports are written.
    output_dir: Optional[str] = None
    # Optional runtime overrides; the runner falls back to `RuntimeConfig()`.
    runtime: Optional[RuntimeConfig] = None
79
+
80
+
81
class ComboResult(_Base):
    """Scored outcome for one (chunker, embedder) combo."""

    # Identifier-safe keys; the combo table name is derived from these two.
    chunker_key: str
    embedder_key: str
    # Human-readable labels shown in the markdown report.
    chunker_label: str
    embedder_label: str
    # Table (under the target schema) that holds this combo's chunks.
    table: str
    # Row count of `table` after ingest.
    ingest_chunks: int
    # Wall-clock seconds spent in the ingest call for this combo.
    ingest_wall_seconds: float
    # Subset of ingest_wall_seconds spent inside the embedder. Lets the
    # leaderboard distinguish "this combo is slow because of the embedder"
    # from "this combo is slow because of the chunker / sink". 0.0 if the
    # embedder didn't track timing.
    ingest_embed_seconds: float = 0.0
    # Mean of each per-query metric (e.g. `recall_at_1`, `mrr`) across queries.
    aggregate: dict[str, float]
    # One entry per gold query: query, gold_doc_id, top_k hits and its scores.
    per_query: list[dict[str, Any]]
98
+
99
+
100
class BakeoffResults(_Base):
    """Full output of `run_bakeoff`. Round-trips through JSON via pydantic."""

    # Copied from `BakeoffConfig.name`.
    run_name: str
    # Local wall-clock timestamp captured just before ingest starts,
    # formatted as "%Y-%m-%d %H:%M:%S" (no timezone information).
    started_at: str
    # Best-effort corpus description (glob string for file sources, else type).
    corpus_label: str
    n_queries: int
    n_combos: int
    combos: list[ComboResult]
    # Plain-dict copy of the gold queries used (`query`, `gold_doc_id`).
    gold_queries: list[dict[str, str]]
    # Wall time per unique embedder spent embedding all gold queries during
    # the scoring phase. Indicative of query-time latency at production
    # scale: the value scaled by your expected QPS predicts CPU cost.
    # Keys are embedder_key (same as ComboResult.embedder_key); values are
    # seconds for embedding all `n_queries` queries in one batch.
    query_embed_seconds_by_embedder: dict[str, float] = {}
@@ -0,0 +1,40 @@
1
+ """Gold-query loader for bakeoff configs (SC-003).
2
+
3
+ `BakeoffConfig.gold_queries` is `str | list[GoldQuery]` — a path to a YAML/JSON
4
+ file on disk, or an inline list already validated by pydantic. This module
5
+ resolves either shape to a concrete `list[GoldQuery]`.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Union
12
+
13
+ import yaml
14
+
15
+ from chunkshop.bakeoff.config import GoldQuery
16
+
17
+
18
def load_gold_queries(spec: Union[str, list[GoldQuery]]) -> list[GoldQuery]:
    """Turn a `gold_queries` spec into a concrete list of `GoldQuery`.

    An inline list is handed back untouched (pydantic validated it already).
    A string is read as a UTF-8 file path: a `.json` suffix (any case) is
    parsed as JSON, every other suffix as YAML. The parsed document must be a
    list; each item is validated with `GoldQuery.model_validate`.

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: the file parsed to something other than a list.
    """
    if isinstance(spec, list):
        return spec

    path = Path(spec)
    if not path.exists():
        raise FileNotFoundError(f"gold_queries file not found: {path}")

    text = path.read_text(encoding="utf-8")
    # Pick the parser from the suffix; only the selected one is looked up.
    parse = json.loads if path.suffix.lower() == ".json" else yaml.safe_load
    raw = parse(text)

    if isinstance(raw, list):
        return [GoldQuery.model_validate(item) for item in raw]
    raise ValueError(
        f"gold_queries file must be a YAML/JSON list; got {type(raw).__name__}"
    )
@@ -0,0 +1,53 @@
1
+ """Deterministic chunker + embedder key derivation for combo table names (SC-004).
2
+
3
+ The bakeoff runner writes one Postgres table per combo: `{chunker_key}__{embedder_key}`
4
+ under the target schema. Table names must be lowercase-underscore idents so
5
+ pgvector operators + the sink's regex-allowlisted identifier validator both
6
+ accept them. Derivations here are pure functions — same config in, same key out.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ from chunkshop.config import (
13
+ ChunkerConfig,
14
+ FastembedEmbedder,
15
+ FixedOverlapChunker,
16
+ HierarchyChunker,
17
+ NeighborExpandChunker,
18
+ SentenceAwareChunker,
19
+ )
20
+
21
# Matches every run of characters that is not a lowercase letter or digit.
_ID_SAFE = re.compile(r"[^a-z0-9]+")


def embedder_key(cfg: FastembedEmbedder) -> str:
    """Derive an identifier-safe key from the embedder's model name.

    Keeps only the part after the last `/` (dropping any org prefix),
    lowercases it, collapses each run of non-alphanumerics to a single `_`
    and trims leading/trailing underscores.

    `Xenova/bge-base-en-v1.5-int8` -> `bge_base_en_v1_5_int8`.
    """
    _, _, short_name = cfg.model_name.rpartition("/")
    collapsed = _ID_SAFE.sub("_", short_name.lower())
    return collapsed.strip("_")
31
+
32
+
33
def chunker_key(cfg: ChunkerConfig) -> str:
    """Return the deterministic table-name fragment for a chunker config.

    `hierarchy` and `sentence_aware` map to fixed names. `fixed_overlap`
    embeds its window/step so (w=300,s=150) and (w=500,s=100) stay distinct.
    `neighbor_expand` embeds its window and recurses into `base`, so both the
    wrapper and the wrapped strategy appear in the key.

    Raises:
        ValueError: `cfg` is not one of the chunker types handled here.
    """
    # Parameter-free chunkers: the key is just a constant name.
    fixed_names = (
        (HierarchyChunker, "hierarchy"),
        (SentenceAwareChunker, "sentence_aware"),
    )
    for chunker_type, name in fixed_names:
        if isinstance(cfg, chunker_type):
            return name

    if isinstance(cfg, FixedOverlapChunker):
        return f"fixed_overlap_w{cfg.window_words}_s{cfg.step_words}"

    if isinstance(cfg, NeighborExpandChunker):
        inner = chunker_key(cfg.base)
        return f"neighbor_expand_w{cfg.window}_over_{inner}"

    raise ValueError(f"unknown chunker type for key derivation: {type(cfg).__name__}")
49
+
50
+
51
def combo_table(chunker: ChunkerConfig, embedder: FastembedEmbedder) -> str:
    """Return the combo's table name: chunker key and embedder key joined by `__`."""
    parts = (chunker_key(chunker), embedder_key(embedder))
    return "__".join(parts)
@@ -0,0 +1,159 @@
1
+ """Output writers for bakeoff runs (SC-006, SC-007, SC-008).
2
+
3
+ Three files land in `out_dir`:
4
+ - `results.json` — raw BakeoffResults, round-trips via pydantic.
5
+ - `report.md` — human-readable leaderboard + statistical-power honesty note.
6
+ - `recommended.yaml` — the top-MRR combo rendered as a runnable chunkshop
7
+ `CellConfig`. User points `source` at their real corpus and runs `chunkshop
8
+ ingest` with it.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+
14
+ import yaml
15
+
16
+ from chunkshop.bakeoff.config import BakeoffConfig, BakeoffResults
17
+ from chunkshop.bakeoff.keys import chunker_key, embedder_key
18
+
19
+
20
def write_results_json(results: BakeoffResults, out_dir: Path) -> Path:
    """Dump raw BakeoffResults to `{out_dir}/results.json` and return that path.

    The file is always written as UTF-8 so non-ASCII query text serialises the
    same way on every platform instead of depending on the locale's default
    encoding. `out_dir` must already exist.
    """
    out = out_dir / "results.json"
    out.write_text(results.model_dump_json(indent=2), encoding="utf-8")
    return out
25
+
26
+
27
def _md_cell(value: object) -> str:
    """Render `value` so it cannot break out of a single markdown table cell."""
    text = str(value)
    # A raw newline ends the table row and a raw pipe starts a new column.
    text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    return text.replace("|", "\\|")


def write_report_md(
    cfg: BakeoffConfig, results: BakeoffResults, out_dir: Path
) -> Path:
    """Render a markdown leaderboard sorted by MRR desc + honesty note.

    Writes `{out_dir}/report.md` (UTF-8) and returns its path. Sections, in
    order: run header, leaderboard, per-query top-1 detail, query-time
    embedding cost (only when timings were recorded), statistical-power note.
    Query text and doc ids are escaped so `|` or newlines in user data cannot
    corrupt the tables. `out_dir` must already exist.
    """
    ranked = sorted(results.combos, key=lambda c: -c.aggregate.get("mrr", 0))
    ks = cfg.scoring.k
    # Guard the per-query divisions below against a zero-query run.
    n = max(results.n_queries, 1)

    header_cols = " | ".join(f"r@{k}" for k in ks)
    # Columns: # | Chunker | Embedder | r@k... | MRR | chunks | ingest_s | embed_s
    sep_cells = "|".join(["---"] * (len(ks) + 7))

    lines = [
        f"# Bakeoff report: {results.run_name}",
        "",
        f"- Run: {results.started_at}",
        f"- Corpus: {results.corpus_label}",
        f"- Queries: {results.n_queries}",
        f"- Combos: {results.n_combos}",
        "",
        "## Leaderboard (sorted by MRR)",
        "",
        f"| # | Chunker | Embedder | {header_cols} | MRR | chunks | ingest_s | embed_s |",
        f"|{sep_cells}|",
    ]
    for i, c in enumerate(ranked, start=1):
        rk = [f"{c.aggregate.get(f'recall_at_{k}', 0):.3f}" for k in ks]
        mrr = f"{c.aggregate.get('mrr', 0):.3f}"
        lines.append(
            f"| {i} | `{c.chunker_label}` | `{c.embedder_label}` | "
            + " | ".join(rk)
            + f" | {mrr} | {c.ingest_chunks} | {c.ingest_wall_seconds:.2f} | "
            f"{c.ingest_embed_seconds:.2f} |"
        )

    # Per-query detail: top-1 doc_id per combo, one row per (combo, query).
    lines += [
        "",
        "## Per-query detail (top-1 hit per combo)",
        "",
        "| Chunker | Embedder | Query | Gold | Top-1 | MRR |",
        "|---|---|---|---|---|---|",
    ]
    for c in ranked:
        for pq in c.per_query:
            # "-" marks a query for which nothing was retrieved.
            top1 = pq["top_k"][0]["doc_id"] if pq.get("top_k") else "-"
            lines.append(
                f"| `{c.chunker_label}` | `{c.embedder_label}` | "
                f"{_md_cell(pq['query'])} | `{_md_cell(pq['gold_doc_id'])}` | "
                f"`{_md_cell(top1)}` | {pq.get('mrr', 0):.3f} |"
            )

    # Query-time embedding cost. Each unique embedder embeds all gold
    # queries once during scoring; the wall time is a proxy for the
    # production query-time latency you'd see at scale.
    if results.query_embed_seconds_by_embedder:
        lines += [
            "",
            "## Query-time embedding cost",
            "",
            f"Wall time to embed all {results.n_queries} gold queries, per "
            "unique embedder. At production scale this scales by your "
            "expected QPS — useful for choosing between a slower-but-better "
            "embedder and a faster-but-worse one.",
            "",
            "| Embedder | total_s | per_query_ms |",
            "|---|---|---|",
        ]
        # Fastest embedder first.
        for k, total in sorted(
            results.query_embed_seconds_by_embedder.items(), key=lambda kv: kv[1]
        ):
            per_q_ms = (total / n) * 1000.0
            lines.append(f"| `{k}` | {total:.3f} | {per_q_ms:.1f} |")

    # Honesty note scaled to n_queries. Load-bearing — never drop this.
    lines += [
        "",
        "## Statistical power",
        "",
        f"{results.n_queries} queries means one query flipping moves aggregate recall by "
        f"{1 / n:.3f}. Combos within ~{2 / n:.2f} of the leader are not reliably "
        "distinguishable. Re-run with more queries or a larger corpus before treating "
        "the leaderboard as a tournament result.",
        "",
    ]
    out = out_dir / "report.md"
    out.write_text("\n".join(lines), encoding="utf-8")
    return out
115
+
116
+
117
def write_recommended_yaml(
    cfg: BakeoffConfig, results: BakeoffResults, out_dir: Path
) -> Path:
    """Render the top-MRR combo as a runnable CellConfig YAML.

    Picks the combo with the highest aggregate MRR (the first one wins a tie),
    looks up the matching chunker/embedder configs in `cfg.matrix` by key, and
    writes `{out_dir}/recommended.yaml` (UTF-8). Returns the written path.

    Round-trip through `CellConfig.model_validate` is covered by
    test_bakeoff_output::test_recommended_yaml_parses_as_cell_config.

    Raises:
        ValueError: `results` has no combos, or the winning combo's keys do
            not match any chunker/embedder in `cfg.matrix`.
    """
    if not results.combos:
        raise ValueError(
            f"cannot write recommended.yaml for bakeoff {results.run_name!r}: "
            "results contain no combos"
        )
    # `max` returns the first maximal element, matching a stable sort by MRR desc.
    top = max(results.combos, key=lambda c: c.aggregate.get("mrr", 0))

    winner_chunker = next(
        (c for c in cfg.matrix.chunkers if chunker_key(c) == top.chunker_key), None
    )
    if winner_chunker is None:
        raise ValueError(
            f"no chunker in cfg.matrix matches winning chunker_key {top.chunker_key!r}"
        )
    winner_embedder = next(
        (e for e in cfg.matrix.embedders if embedder_key(e) == top.embedder_key), None
    )
    if winner_embedder is None:
        raise ValueError(
            f"no embedder in cfg.matrix matches winning embedder_key {top.embedder_key!r}"
        )

    # Use pydantic's `by_alias=True` so `schema_name` dumps as `schema`
    # (matches the YAML/chunkshop convention and round-trips through
    # TargetConfig.model_validate).
    recommended = {
        "# NOTE": (
            f"Top combo from bakeoff '{results.run_name}' "
            f"(MRR={top.aggregate.get('mrr', 0):.3f}, "
            f"r@1={top.aggregate.get('recall_at_1', 0):.3f}). "
            "Point `source` at your real corpus before running `chunkshop ingest`."
        ),
        "cell_name": f"{results.run_name}_recommended",
        "source": cfg.source.model_dump(exclude_none=True, by_alias=True),
        "framer": cfg.framer.model_dump(exclude_none=True, by_alias=True),
        "chunker": winner_chunker.model_dump(exclude_none=True, by_alias=True),
        "embedder": winner_embedder.model_dump(exclude_none=True, by_alias=True),
        "target": {
            "dsn_env": cfg.target.dsn_env,
            "schema": cfg.target.schema_name,
            "table": f"{results.run_name}_production",
            "mode": "overwrite",
        },
    }
    out = out_dir / "recommended.yaml"
    # sort_keys=False keeps the note first and the sections in pipeline order.
    out.write_text(yaml.safe_dump(recommended, sort_keys=False), encoding="utf-8")
    return out
@@ -0,0 +1,228 @@
1
+ """`run_bakeoff(cfg: BakeoffConfig) -> BakeoffResults` (SC-004, SC-005, SC-006).
2
+
3
+ Serial cross-product over `matrix.embedders x matrix.chunkers`. For each combo:
4
+ 1. Build a `CellConfig` and call `runner.run_cell` — reuses the full
5
+ source -> framer -> chunker -> embedder -> sink pipeline.
6
+ 2. Count chunks that landed in the combo's table.
7
+ Then, once per embedder (not per combo), embed all gold queries in a batch so
8
+ N combos sharing an embedder share the same query vectors. For each combo run
9
+ pgvector top-K against its table, score with `score.score_query`, aggregate,
10
+ and return a `BakeoffResults`.
11
+
12
+ Design decisions:
13
+ - Serial, not parallel. MVP. Parallelism via subprocess orchestrator is an ASK
14
+ FIRST item in the brief.
15
+ - DSN env var is read, not passed. Caller (CLI) sets `os.environ[dsn_env] = dsn`.
16
+ - Schema creation happens naturally via `run_cell -> sink.create_table -> advisory
17
+ lock`. No pre-creation here.
18
+ - Matrix size > 50 is caller's responsibility. CLI checks + prompts; runner
19
+ runs whatever the config says.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import os
24
+ import time
25
+ from typing import Any
26
+
27
+ import numpy as np
28
+ import psycopg
29
+ from psycopg import sql
30
+
31
+ from chunkshop.bakeoff.config import (
32
+ BakeoffConfig,
33
+ BakeoffResults,
34
+ ComboResult,
35
+ )
36
+ from chunkshop.bakeoff.gold import load_gold_queries
37
+ from chunkshop.bakeoff.keys import chunker_key, combo_table, embedder_key
38
+ from chunkshop.bakeoff.score import aggregate_scores, score_query
39
+ from chunkshop.config import (
40
+ CellConfig,
41
+ NoneExtractor,
42
+ RuntimeConfig,
43
+ TargetConfig,
44
+ )
45
+ from chunkshop.embedders import load_embedder
46
+ from chunkshop.runner import run_cell
47
+
48
+
49
+ def _chunker_label(cfg) -> str:
50
+ """Human-readable chunker label for report tables."""
51
+ t = getattr(cfg, "type", type(cfg).__name__)
52
+ if t == "neighbor_expand":
53
+ base = _chunker_label(cfg.base)
54
+ return f"neighbor_expand(window={cfg.window}, base={base})"
55
+ if t == "fixed_overlap":
56
+ return f"fixed_overlap(window_words={cfg.window_words}, step_words={cfg.step_words})"
57
+ return t
58
+
59
+
60
def _count_chunks(dsn: str, schema: str, table: str) -> int:
    """Return `COUNT(*)` for `schema.table`, using a fresh connection.

    Schema and table are interpolated as quoted SQL identifiers rather than
    as plain strings.
    """
    statement = sql.SQL("SELECT COUNT(*) FROM {}.{}").format(
        sql.Identifier(schema), sql.Identifier(table)
    )
    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(statement)
        row = cur.fetchone()
        return row[0]
68
+
69
+
70
def _query_top_k(
    dsn: str,
    schema: str,
    table: str,
    query_vec: np.ndarray,
    k: int,
) -> list[tuple[str, int]]:
    """pgvector top-K lookup. Returns [(doc_id, seq_num), ...] ordered by cosine distance.

    The query vector is sent as a `[v1,v2,...]` text literal (8 decimal places
    per component) and cast to `vector` server-side; `k` becomes the `LIMIT`.
    A new connection is opened for every call.
    """
    components = ",".join(f"{x:.8f}" for x in query_vec.tolist())
    vec_literal = "[" + components + "]"
    statement = sql.SQL(
        "SELECT doc_id, seq_num FROM {}.{} "
        "ORDER BY embedding <=> %s::vector LIMIT %s"
    ).format(sql.Identifier(schema), sql.Identifier(table))

    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(statement, (vec_literal, k))
        hits = []
        for row in cur.fetchall():
            hits.append((row[0], row[1]))
        return hits
88
+
89
+
90
def _build_cell_cfg(
    bakeoff: BakeoffConfig,
    chunker_cfg,
    embedder_cfg,
    table: str,
) -> CellConfig:
    """Assemble the `CellConfig` that ingests one (chunker, embedder) combo.

    Source and framer come from the bakeoff config. The extractor is always
    `NoneExtractor`. The target reuses the bakeoff DSN env var and schema,
    writes to `table` in overwrite mode and disables the HNSW index. Runtime
    falls back to a default `RuntimeConfig()` when the bakeoff sets none.
    """
    cell_name = (
        f"{bakeoff.name}__{chunker_key(chunker_cfg)}__{embedder_key(embedder_cfg)}"
    )
    target = TargetConfig(
        dsn_env=bakeoff.target.dsn_env,
        schema=bakeoff.target.schema_name,
        table=table,
        mode="overwrite",
        hnsw=False,
    )
    runtime = bakeoff.runtime or RuntimeConfig()
    return CellConfig(
        cell_name=cell_name,
        source=bakeoff.source,
        framer=bakeoff.framer,
        chunker=chunker_cfg,
        embedder=embedder_cfg,
        extractor=NoneExtractor(),
        target=target,
        runtime=runtime,
    )
114
+
115
+
116
+ def _corpus_label(bakeoff: BakeoffConfig) -> str:
117
+ """Best-effort human label for the corpus. `files.glob` -> the glob string; else source type."""
118
+ src = bakeoff.source
119
+ if getattr(src, "type", None) == "files":
120
+ return getattr(src, "glob", "files")
121
+ return getattr(src, "type", "unknown")
122
+
123
+
124
def run_bakeoff(cfg: BakeoffConfig) -> BakeoffResults:
    """Execute every (chunker, embedder) combo, score against gold, return results.

    Caller must set `os.environ[cfg.target.dsn_env]` before calling — the sink
    reads the DSN from there.

    Phases:
      1. Ingest each combo serially into its own table and count its chunks.
      2. Embed all gold queries once per unique embedder key.
      3. Run a top-K lookup per (combo, query), score it, and aggregate.

    Raises:
        RuntimeError: the DSN env var is unset; two *different* matrix entries
            derive the same combo table name (the later ingest would
            overwrite the earlier one and share its query vectors); or an
            ingest reports an error.
    """
    if cfg.target.dsn_env not in os.environ:
        raise RuntimeError(
            f"DSN env var {cfg.target.dsn_env!r} is not set. The CLI sets it from "
            "--dsn / $CHUNKSHOP_DSN before calling run_bakeoff."
        )
    dsn = os.environ[cfg.target.dsn_env]
    schema = cfg.target.schema_name

    gold = load_gold_queries(cfg.gold_queries)
    combos_in = [(c, e) for c in cfg.matrix.chunkers for e in cfg.matrix.embedders]

    # Table names are derived from lossy keys (e.g. the embedder's org prefix
    # is dropped). Fail before any ingest if two distinct configs would land
    # in the same table. Exact duplicates are harmless and still allowed.
    owner_by_table: dict[str, tuple[Any, Any]] = {}
    for c, e in combos_in:
        table = combo_table(c, e)
        owner = owner_by_table.setdefault(table, (c, e))
        if owner != (c, e):
            raise RuntimeError(
                f"matrix entries collide on table {table!r}: two different "
                "chunker/embedder configs derive the same key, so one combo "
                "would overwrite the other. Remove or change one of them."
            )

    started_at = time.strftime("%Y-%m-%d %H:%M:%S")

    # ----- Phase 1: ingest every combo serially -----
    ingest_meta: list[dict[str, Any]] = []
    for c, e in combos_in:
        table = combo_table(c, e)
        cell_cfg = _build_cell_cfg(cfg, c, e, table)
        t0 = time.time()
        res = run_cell(cell_cfg)
        wall = time.time() - t0
        if res.error:
            raise RuntimeError(
                f"ingest failed for combo {table}: {res.error}"
            )
        n_chunks = _count_chunks(dsn, schema, table)
        ingest_meta.append(
            {"chunker": c, "embedder": e, "table": table,
             "chunks": n_chunks,
             "wall_seconds": round(wall, 2),
             # Subset of wall_seconds: just the embedder's portion.
             "embed_seconds": round(getattr(res, "embed_seconds", 0.0), 2)}
        )

    # ----- Phase 2: embed gold queries once per embedder (not once per combo) -----
    # Two combos sharing an embedder share the same query vectors.
    # Capture per-embedder query-embed wall time — that's a proxy for
    # production query-time latency at scale.
    query_vecs_by_emb_key: dict[str, np.ndarray] = {}
    query_embed_seconds_by_emb_key: dict[str, float] = {}
    for e in cfg.matrix.embedders:
        k = embedder_key(e)
        if k in query_vecs_by_emb_key:
            continue
        embedder = load_embedder(e)
        t_qe = time.perf_counter()
        vecs = embedder.embed([g.query for g in gold])
        query_embed_seconds_by_emb_key[k] = round(time.perf_counter() - t_qe, 3)
        query_vecs_by_emb_key[k] = vecs

    # ----- Phase 3: score every combo -----
    combo_results: list[ComboResult] = []
    for meta in ingest_meta:
        c = meta["chunker"]
        e = meta["embedder"]
        ck = chunker_key(c)
        ek = embedder_key(e)
        table = meta["table"]
        vecs = query_vecs_by_emb_key[ek]

        per_query: list[dict[str, Any]] = []
        per_query_scores: list[dict[str, float]] = []
        for i, g in enumerate(gold):
            top = _query_top_k(dsn, schema, table, vecs[i], k=cfg.scoring.top_k)
            doc_ids = [t[0] for t in top]
            s = score_query(doc_ids, g.gold_doc_id, cfg.scoring.k)
            per_query_scores.append(s)
            per_query.append({
                "query": g.query,
                "gold_doc_id": g.gold_doc_id,
                "top_k": [{"doc_id": t[0], "seq_num": t[1]} for t in top],
                **s,
            })

        agg = aggregate_scores(per_query_scores)

        combo_results.append(ComboResult(
            chunker_key=ck,
            embedder_key=ek,
            chunker_label=_chunker_label(c),
            embedder_label=e.model_name,
            table=table,
            ingest_chunks=meta["chunks"],
            ingest_wall_seconds=meta["wall_seconds"],
            ingest_embed_seconds=meta["embed_seconds"],
            aggregate=agg,
            per_query=per_query,
        ))

    return BakeoffResults(
        run_name=cfg.name,
        started_at=started_at,
        corpus_label=_corpus_label(cfg),
        n_queries=len(gold),
        n_combos=len(combo_results),
        combos=combo_results,
        gold_queries=[{"query": g.query, "gold_doc_id": g.gold_doc_id} for g in gold],
        query_embed_seconds_by_embedder=query_embed_seconds_by_emb_key,
    )
@@ -0,0 +1,41 @@
1
+ """Pure scoring functions for bakeoff evaluation (SC-005).
2
+
3
+ Given a ranked list of retrieved doc_ids for a query and a single gold doc_id,
4
+ compute recall@k for each k in `k_values` and MRR. Aggregation across queries
5
+ is a simple arithmetic mean. No external deps — kept here for easy unit test.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Iterable
10
+
11
+
12
def score_query(
    ranked_doc_ids: list[str],
    gold_doc_id: str,
    k_values: Iterable[int],
) -> dict[str, float]:
    """Score a single query's ranked results against its gold doc_id.

    The result holds one `recall_at_<k>` entry per cutoff (1 when the gold id
    is among the first `k` results, otherwise 0) followed by `mrr`, the
    reciprocal rank of the first gold hit (0.0 when it never appears). MRR
    looks at the whole list — slice to top-K beforehand for a bounded MRR.
    """
    scores: dict[str, float] = {
        f"recall_at_{k}": 1 if gold_doc_id in ranked_doc_ids[:k] else 0
        for k in k_values
    }
    # 1-based rank of the first matching doc_id, or None when there is no hit.
    first_rank = next(
        (
            rank
            for rank, doc_id in enumerate(ranked_doc_ids, start=1)
            if doc_id == gold_doc_id
        ),
        None,
    )
    scores["mrr"] = 0.0 if first_rank is None else 1.0 / first_rank
    return scores
33
+
34
+
35
def aggregate_scores(per_query: list[dict[str, float]]) -> dict[str, float]:
    """Average every metric over all queries; an empty list yields `{}`.

    The metric names are taken from the first entry, so every entry is
    expected to carry those same keys.
    """
    if not per_query:
        return {}
    count = len(per_query)
    means: dict[str, float] = {}
    for metric in per_query[0]:
        total = sum(row[metric] for row in per_query)
        means[metric] = total / count
    return means