chunkshop 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunkshop/__init__.py +5 -0
- chunkshop/bakeoff/__init__.py +25 -0
- chunkshop/bakeoff/config.py +115 -0
- chunkshop/bakeoff/gold.py +40 -0
- chunkshop/bakeoff/keys.py +53 -0
- chunkshop/bakeoff/output.py +159 -0
- chunkshop/bakeoff/runner.py +228 -0
- chunkshop/bakeoff/score.py +41 -0
- chunkshop/chunkers/__init__.py +86 -0
- chunkshop/chunkers/_sentence_split.py +56 -0
- chunkshop/chunkers/_splitting.py +111 -0
- chunkshop/chunkers/_summarizer.py +73 -0
- chunkshop/chunkers/base.py +18 -0
- chunkshop/chunkers/fixed_overlap.py +35 -0
- chunkshop/chunkers/hierarchical_summary.py +118 -0
- chunkshop/chunkers/hierarchy.py +84 -0
- chunkshop/chunkers/neighbor_expand.py +28 -0
- chunkshop/chunkers/semantic.py +192 -0
- chunkshop/chunkers/sentence_aware.py +71 -0
- chunkshop/chunkers/summary_embed.py +33 -0
- chunkshop/cli.py +241 -0
- chunkshop/config.py +458 -0
- chunkshop/embedders/__init__.py +22 -0
- chunkshop/embedders/_registry.py +169 -0
- chunkshop/embedders/base.py +8 -0
- chunkshop/embedders/fastembed_provider.py +73 -0
- chunkshop/extractors/__init__.py +36 -0
- chunkshop/extractors/base.py +8 -0
- chunkshop/extractors/composite.py +44 -0
- chunkshop/extractors/keybert_phrases.py +38 -0
- chunkshop/extractors/lang_detect.py +51 -0
- chunkshop/extractors/none_provider.py +10 -0
- chunkshop/extractors/rake_keywords.py +29 -0
- chunkshop/extractors/result.py +19 -0
- chunkshop/extractors/spacy_entities.py +54 -0
- chunkshop/framers/__init__.py +34 -0
- chunkshop/framers/base.py +16 -0
- chunkshop/framers/heading_boundary.py +60 -0
- chunkshop/framers/identity.py +14 -0
- chunkshop/framers/jsonpath.py +66 -0
- chunkshop/framers/regex_boundary.py +52 -0
- chunkshop/orchestrator.py +138 -0
- chunkshop/pipeline.py +130 -0
- chunkshop/runner.py +139 -0
- chunkshop/sink.py +306 -0
- chunkshop/sources/__init__.py +39 -0
- chunkshop/sources/base.py +15 -0
- chunkshop/sources/files.py +33 -0
- chunkshop/sources/http.py +104 -0
- chunkshop/sources/json_corpus.py +26 -0
- chunkshop/sources/pg_table.py +75 -0
- chunkshop/sources/s3.py +65 -0
- chunkshop/summarizers/__init__.py +11 -0
- chunkshop/summarizers/lede.py +29 -0
- chunkshop/summarizers/sumy.py +67 -0
- chunkshop-0.3.0.dist-info/METADATA +426 -0
- chunkshop-0.3.0.dist-info/RECORD +60 -0
- chunkshop-0.3.0.dist-info/WHEEL +5 -0
- chunkshop-0.3.0.dist-info/entry_points.txt +2 -0
- chunkshop-0.3.0.dist-info/top_level.txt +1 -0
chunkshop/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""chunkshop bakeoff: `chunkshop bakeoff` CLI subcommand + library surface.
|
|
2
|
+
|
|
3
|
+
Promotes the `scripts/bench_matrix.py` hacking tool into a user-facing,
|
|
4
|
+
config-driven chunker x embedder matrix evaluation that emits a leaderboard +
|
|
5
|
+
a runnable `recommended.yaml` cell.
|
|
6
|
+
"""
|
|
7
|
+
from chunkshop.bakeoff.config import (
|
|
8
|
+
BakeoffConfig,
|
|
9
|
+
BakeoffResults,
|
|
10
|
+
BakeoffTargetConfig,
|
|
11
|
+
ComboResult,
|
|
12
|
+
GoldQuery,
|
|
13
|
+
MatrixConfig,
|
|
14
|
+
ScoringConfig,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"BakeoffConfig",
|
|
19
|
+
"BakeoffResults",
|
|
20
|
+
"BakeoffTargetConfig",
|
|
21
|
+
"ComboResult",
|
|
22
|
+
"GoldQuery",
|
|
23
|
+
"MatrixConfig",
|
|
24
|
+
"ScoringConfig",
|
|
25
|
+
]
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Pydantic config models for `chunkshop bakeoff` runs (SC-002, SC-003).
|
|
2
|
+
|
|
3
|
+
One `BakeoffConfig` = one matrix evaluation: a corpus, a set of gold queries,
|
|
4
|
+
and a cross-product of chunkers x embedders to rank with recall@k + MRR.
|
|
5
|
+
Mirrors the existing `chunkshop.config.CellConfig` conventions (pydantic v2,
|
|
6
|
+
`extra="forbid"`, discriminated unions) so typos in YAML fail at config-load,
|
|
7
|
+
not at runtime.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, Optional, Union
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
14
|
+
|
|
15
|
+
from chunkshop.config import (
|
|
16
|
+
ChunkerConfig,
|
|
17
|
+
FastembedEmbedder,
|
|
18
|
+
FramerConfig,
|
|
19
|
+
IdentityFramerConfig,
|
|
20
|
+
RuntimeConfig,
|
|
21
|
+
SourceConfig,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _Base(BaseModel):
    # Common parent for every bakeoff model. `extra="forbid"` makes pydantic
    # reject keys that are not declared on the model.
    model_config = ConfigDict(extra="forbid")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class GoldQuery(_Base):
    """A single hand-written evaluation query paired with its gold document.

    Gold is tracked at document level only for the MVP.
    """

    # Text that gets embedded and searched with.
    query: str
    # Identifier of the document the query is expected to retrieve.
    gold_doc_id: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class MatrixConfig(_Base):
    """The two axes of the evaluation grid.

    Every embedder is paired with every chunker, so N embedders and M
    chunkers produce N*M combos.
    """

    # Both axes must contain at least one entry (`min_length=1`).
    embedders: list[FastembedEmbedder] = Field(..., min_length=1)
    chunkers: list[ChunkerConfig] = Field(..., min_length=1)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class BakeoffTargetConfig(_Base):
    """Destination for bakeoff output: one table per combo inside `schema`."""

    # Name of the environment variable that holds the DSN.
    dsn_env: str
    # Python attribute is `schema_name` because `schema` is a pydantic
    # reserved name; YAML still spells it `schema` thanks to the alias.
    # Same treatment as `TargetConfig`.
    schema_name: str = Field(alias="schema")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ScoringConfig(_Base):
    """Retrieval-metric settings.

    `k` lists the recall@k cutoffs to report; `top_k` is the pgvector LIMIT
    used when fetching ranked results for each query.
    """

    k: list[int] = [1, 3, 5]
    include_mrr: bool = True
    # Must be strictly positive so a bad value is rejected when the config is
    # loaded instead of surfacing later as an invalid/empty SQL LIMIT.
    top_k: int = Field(default=5, gt=0)

    @field_validator("k")
    @classmethod
    def _ks_positive(cls, v: list[int]) -> list[int]:
        """Reject empty or non-positive cutoffs; return them de-duplicated, ascending.

        Raises:
            ValueError: if `v` is empty or contains a value <= 0.
        """
        if not v or any(k <= 0 for k in v):
            raise ValueError("scoring.k must be a non-empty list of positive ints")
        return sorted(set(v))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class BakeoffConfig(_Base):
    """Root model of a bakeoff YAML file; each file describes one factorial run."""

    name: str
    source: SourceConfig
    # Defaults to an `IdentityFramerConfig` when the YAML omits `framer`.
    framer: FramerConfig = Field(default_factory=IdentityFramerConfig)
    # Either a path to a YAML/JSON file or the queries written inline.
    gold_queries: Union[str, list[GoldQuery]]
    matrix: MatrixConfig
    target: BakeoffTargetConfig
    scoring: ScoringConfig = Field(default_factory=ScoringConfig)
    output_dir: Optional[str] = None
    runtime: Optional[RuntimeConfig] = None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ComboResult(_Base):
    """Ingest statistics and scores recorded for one (chunker, embedder) pair."""

    chunker_key: str
    embedder_key: str
    chunker_label: str
    embedder_label: str
    table: str
    ingest_chunks: int
    ingest_wall_seconds: float
    # Portion of `ingest_wall_seconds` that was spent in the embedder. With it
    # the leaderboard can tell an embedder-bound combo apart from one that is
    # slow in the chunker or the sink. Stays 0.0 when the embedder did not
    # report timing.
    ingest_embed_seconds: float = 0.0
    aggregate: dict[str, float]
    per_query: list[dict[str, Any]]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class BakeoffResults(_Base):
    """Complete output of `run_bakeoff`; serialisable to and from JSON via pydantic."""

    run_name: str
    started_at: str
    corpus_label: str
    n_queries: int
    n_combos: int
    combos: list[ComboResult]
    gold_queries: list[dict[str, str]]
    # Seconds each unique embedder needed to embed all `n_queries` gold
    # queries in one batch during scoring. Keys are `embedder_key` values
    # (the same ones stored on `ComboResult.embedder_key`). The figure is an
    # indication of query-time latency: scaled by expected QPS it predicts
    # CPU cost at production scale.
    # A factory is used (rather than a literal `{}`) to match the
    # `Field(default_factory=...)` convention used by the other models here.
    query_embed_seconds_by_embedder: dict[str, float] = Field(default_factory=dict)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Gold-query loader for bakeoff configs (SC-003).
|
|
2
|
+
|
|
3
|
+
`BakeoffConfig.gold_queries` is `str | list[GoldQuery]` — a path to a YAML/JSON
|
|
4
|
+
file on disk, or an inline list already validated by pydantic. This module
|
|
5
|
+
resolves either shape to a concrete `list[GoldQuery]`.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Union
|
|
12
|
+
|
|
13
|
+
import yaml
|
|
14
|
+
|
|
15
|
+
from chunkshop.bakeoff.config import GoldQuery
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_gold_queries(spec: Union[str, list[GoldQuery]]) -> list[GoldQuery]:
    """Turn a `gold_queries` spec into a concrete list of `GoldQuery`.

    A list is handed back untouched (pydantic already validated it). A string
    is read as a file path: a `.json` suffix is parsed as JSON, every other
    suffix as YAML, and each element is passed through
    `GoldQuery.model_validate`.

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: the parsed document is not a list.
    """
    if isinstance(spec, list):
        return spec

    gold_path = Path(spec)
    if not gold_path.exists():
        raise FileNotFoundError(f"gold_queries file not found: {gold_path}")

    content = gold_path.read_text(encoding="utf-8")
    is_json = gold_path.suffix.lower() == ".json"
    parsed = json.loads(content) if is_json else yaml.safe_load(content)

    if isinstance(parsed, list):
        return [GoldQuery.model_validate(entry) for entry in parsed]
    raise ValueError(
        f"gold_queries file must be a YAML/JSON list; got {type(parsed).__name__}"
    )
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Deterministic chunker + embedder key derivation for combo table names (SC-004).
|
|
2
|
+
|
|
3
|
+
The bakeoff runner writes one Postgres table per combo: `{chunker_key}__{embedder_key}`
|
|
4
|
+
under the target schema. Table names must be lowercase-underscore idents so
|
|
5
|
+
pgvector operators + the sink's regex-allowlisted identifier validator both
|
|
6
|
+
accept them. Derivations here are pure functions — same config in, same key out.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from chunkshop.config import (
|
|
13
|
+
ChunkerConfig,
|
|
14
|
+
FastembedEmbedder,
|
|
15
|
+
FixedOverlapChunker,
|
|
16
|
+
HierarchyChunker,
|
|
17
|
+
NeighborExpandChunker,
|
|
18
|
+
SentenceAwareChunker,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Matches every run of characters that is not a lowercase letter or digit.
_ID_SAFE = re.compile(r"[^a-z0-9]+")


def embedder_key(cfg: FastembedEmbedder) -> str:
    """Derive an identifier-safe key from the embedder's model name.

    Only the part after the last `/` is kept; it is lowercased, every run of
    other characters becomes one underscore, and edge underscores are removed.

    `Xenova/bge-base-en-v1.5-int8` -> `bge_base_en_v1_5_int8`.
    """
    _, _, tail = cfg.model_name.rpartition("/")
    sanitized = _ID_SAFE.sub("_", tail.lower())
    return sanitized.strip("_")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def chunker_key(cfg: ChunkerConfig) -> str:
    """Return the deterministic key for a chunker config.

    Behaviour-changing parameters are folded into the key: `fixed_overlap`
    carries its window/step so (w=300,s=150) and (w=500,s=100) stay distinct,
    and `neighbor_expand` embeds the key of its `base` chunker alongside its
    own window.

    Raises:
        ValueError: `cfg` is not one of the chunker types handled here.
    """
    if isinstance(cfg, HierarchyChunker):
        key = "hierarchy"
    elif isinstance(cfg, SentenceAwareChunker):
        key = "sentence_aware"
    elif isinstance(cfg, FixedOverlapChunker):
        key = f"fixed_overlap_w{cfg.window_words}_s{cfg.step_words}"
    elif isinstance(cfg, NeighborExpandChunker):
        outer_window = cfg.window
        # Recurse so the wrapped strategy is part of the identifier too.
        inner_key = chunker_key(cfg.base)
        key = f"neighbor_expand_w{outer_window}_over_{inner_key}"
    else:
        raise ValueError(f"unknown chunker type for key derivation: {type(cfg).__name__}")
    return key
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def combo_table(chunker: ChunkerConfig, embedder: FastembedEmbedder) -> str:
    """Return the table name for a combo, i.e. `{chunker_key}__{embedder_key}`."""
    left = chunker_key(chunker)
    right = embedder_key(embedder)
    return "__".join((left, right))
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Output writers for bakeoff runs (SC-006, SC-007, SC-008).
|
|
2
|
+
|
|
3
|
+
Three files land in `out_dir`:
|
|
4
|
+
- `results.json` — raw BakeoffResults, round-trips via pydantic.
|
|
5
|
+
- `report.md` — human-readable leaderboard + statistical-power honesty note.
|
|
6
|
+
- `recommended.yaml` — the top-MRR combo rendered as a runnable chunkshop
|
|
7
|
+
`CellConfig`. User points `source` at their real corpus and runs `chunkshop
|
|
8
|
+
ingest` with it.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
from chunkshop.bakeoff.config import BakeoffConfig, BakeoffResults
|
|
17
|
+
from chunkshop.bakeoff.keys import chunker_key, embedder_key
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def write_results_json(results: BakeoffResults, out_dir: Path) -> Path:
    """Write the raw `BakeoffResults` to `{out_dir}/results.json`.

    The file is always written as UTF-8 so query text containing non-ASCII
    characters does not depend on the platform's default encoding.

    Returns:
        Path of the file that was written.
    """
    out = out_dir / "results.json"
    out.write_text(results.model_dump_json(indent=2), encoding="utf-8")
    return out
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _md_cell(value: object) -> str:
    """Render `value` so it cannot break out of a single markdown table cell."""
    text = str(value).replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    return text.replace("|", "\\|")


def write_report_md(
    cfg: BakeoffConfig, results: BakeoffResults, out_dir: Path
) -> Path:
    """Render `{out_dir}/report.md`: leaderboard sorted by MRR desc + honesty note.

    Sections written, in order: run summary, leaderboard, per-query top-1
    detail, query-time embedding cost (only when timings were recorded), and
    the statistical-power note.

    Free-text cells (query text, doc ids) are escaped so a `|` or a line break
    in the data cannot corrupt the table, and the file is written as UTF-8.

    Returns:
        Path of the file that was written.
    """
    ranked = sorted(results.combos, key=lambda c: -c.aggregate.get("mrr", 0))

    header_cols = " | ".join(f"r@{k}" for k in cfg.scoring.k)
    # Columns: # | Chunker | Embedder | r@k... | MRR | chunks | ingest_s | embed_s
    # -> 7 fixed columns plus one per recall cutoff.
    sep_cells = "|".join(["---"] * (len(cfg.scoring.k) + 7))

    lines = [
        f"# Bakeoff report: {results.run_name}",
        "",
        f"- Run: {results.started_at}",
        f"- Corpus: {results.corpus_label}",
        f"- Queries: {results.n_queries}",
        f"- Combos: {results.n_combos}",
        "",
        "## Leaderboard (sorted by MRR)",
        "",
        f"| # | Chunker | Embedder | {header_cols} | MRR | chunks | ingest_s | embed_s |",
        f"|{sep_cells}|",
    ]
    for i, c in enumerate(ranked, start=1):
        rk = [f"{c.aggregate.get(f'recall_at_{k}', 0):.3f}" for k in cfg.scoring.k]
        mrr = f"{c.aggregate.get('mrr', 0):.3f}"
        lines.append(
            f"| {i} | `{c.chunker_label}` | `{c.embedder_label}` | "
            + " | ".join(rk)
            + f" | {mrr} | {c.ingest_chunks} | {c.ingest_wall_seconds:.2f} | "
            f"{c.ingest_embed_seconds:.2f} |"
        )

    # Per-query detail: top-1 doc_id per combo, one row per (combo, query).
    lines += [
        "",
        "## Per-query detail (top-1 hit per combo)",
        "",
        "| Chunker | Embedder | Query | Gold | Top-1 | MRR |",
        "|---|---|---|---|---|---|",
    ]
    for c in ranked:
        for pq in c.per_query:
            # "-" marks a query for which nothing was retrieved.
            top1 = pq["top_k"][0]["doc_id"] if pq.get("top_k") else "-"
            lines.append(
                f"| `{c.chunker_label}` | `{c.embedder_label}` | "
                f"{_md_cell(pq['query'])} | `{_md_cell(pq['gold_doc_id'])}` | "
                f"`{_md_cell(top1)}` | {pq.get('mrr', 0):.3f} |"
            )

    # Guard against division by zero when there are no queries.
    n = max(results.n_queries, 1)

    # Query-time embedding cost. Each unique embedder embeds all gold
    # queries once during scoring; the wall time is a proxy for the
    # production query-time latency you'd see at scale.
    if results.query_embed_seconds_by_embedder:
        lines += [
            "",
            "## Query-time embedding cost",
            "",
            f"Wall time to embed all {results.n_queries} gold queries, per "
            "unique embedder. At production scale this scales by your "
            "expected QPS — useful for choosing between a slower-but-better "
            "embedder and a faster-but-worse one.",
            "",
            "| Embedder | total_s | per_query_ms |",
            "|---|---|---|",
        ]
        # Fastest embedder first.
        for k, total in sorted(
            results.query_embed_seconds_by_embedder.items(), key=lambda kv: kv[1]
        ):
            per_q_ms = (total / n) * 1000.0
            lines.append(f"| `{k}` | {total:.3f} | {per_q_ms:.1f} |")

    # Honesty note scaled to n_queries. Load-bearing — never drop this.
    lines += [
        "",
        "## Statistical power",
        "",
        f"{results.n_queries} queries means one query flipping moves aggregate recall by "
        f"{1 / n:.3f}. Combos within ~{2 / n:.2f} of the leader are not reliably "
        "distinguishable. Re-run with more queries or a larger corpus before treating "
        "the leaderboard as a tournament result.",
        "",
    ]
    out = out_dir / "report.md"
    out.write_text("\n".join(lines), encoding="utf-8")
    return out
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def write_recommended_yaml(
    cfg: BakeoffConfig, results: BakeoffResults, out_dir: Path
) -> Path:
    """Write the highest-MRR combo to `{out_dir}/recommended.yaml` as a CellConfig.

    Round-trip through `CellConfig.model_validate` is covered by
    test_bakeoff_output::test_recommended_yaml_parses_as_cell_config.
    """
    by_mrr_desc = sorted(results.combos, key=lambda c: -c.aggregate.get("mrr", 0))
    best = by_mrr_desc[0]

    # Map the winning keys back to the config objects that produced them.
    winner_chunker = next(
        candidate
        for candidate in cfg.matrix.chunkers
        if chunker_key(candidate) == best.chunker_key
    )
    winner_embedder = next(
        candidate
        for candidate in cfg.matrix.embedders
        if embedder_key(candidate) == best.embedder_key
    )

    best_mrr = best.aggregate.get("mrr", 0)
    best_r1 = best.aggregate.get("recall_at_1", 0)
    note = (
        f"Top combo from bakeoff '{results.run_name}' "
        f"(MRR={best_mrr:.3f}, "
        f"r@1={best_r1:.3f}). "
        "Point `source` at your real corpus before running `chunkshop ingest`."
    )

    # `by_alias=True` makes aliased fields dump under their YAML names
    # (e.g. `schema_name` -> `schema`), matching the chunkshop convention.
    dump_opts = {"exclude_none": True, "by_alias": True}
    target_section = {
        "dsn_env": cfg.target.dsn_env,
        "schema": cfg.target.schema_name,
        "table": f"{results.run_name}_production",
        "mode": "overwrite",
    }

    # Insertion order is the order written to disk (`sort_keys=False`).
    # NOTE(review): "# NOTE" is an ordinary mapping key here, not a YAML
    # comment — confirm the consumer tolerates it.
    recommended = {
        "# NOTE": note,
        "cell_name": f"{results.run_name}_recommended",
        "source": cfg.source.model_dump(**dump_opts),
        "framer": cfg.framer.model_dump(**dump_opts),
        "chunker": winner_chunker.model_dump(**dump_opts),
        "embedder": winner_embedder.model_dump(**dump_opts),
        "target": target_section,
    }

    out = out_dir / "recommended.yaml"
    out.write_text(yaml.safe_dump(recommended, sort_keys=False))
    return out
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""`run_bakeoff(cfg: BakeoffConfig) -> BakeoffResults` (SC-004, SC-005, SC-006).
|
|
2
|
+
|
|
3
|
+
Serial cross-product over `matrix.embedders x matrix.chunkers`. For each combo:
|
|
4
|
+
1. Build a `CellConfig` and call `runner.run_cell` — reuses the full
|
|
5
|
+
source -> framer -> chunker -> embedder -> sink pipeline.
|
|
6
|
+
2. Count chunks that landed in the combo's table.
|
|
7
|
+
Then, once per embedder (not per combo), embed all gold queries in a batch so
|
|
8
|
+
N combos sharing an embedder share the same query vectors. For each combo run
|
|
9
|
+
pgvector top-K against its table, score with `score.score_query`, aggregate,
|
|
10
|
+
and return a `BakeoffResults`.
|
|
11
|
+
|
|
12
|
+
Design decisions:
|
|
13
|
+
- Serial, not parallel. MVP. Parallelism via subprocess orchestrator is an ASK
|
|
14
|
+
FIRST item in the brief.
|
|
15
|
+
- DSN env var is read, not passed. Caller (CLI) sets `os.environ[dsn_env] = dsn`.
|
|
16
|
+
- Schema creation happens naturally via `run_cell -> sink.create_table -> advisory
|
|
17
|
+
lock`. No pre-creation here.
|
|
18
|
+
- Matrix size > 50 is caller's responsibility. CLI checks + prompts; runner
|
|
19
|
+
runs whatever the config says.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import os
|
|
24
|
+
import time
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
import psycopg
|
|
29
|
+
from psycopg import sql
|
|
30
|
+
|
|
31
|
+
from chunkshop.bakeoff.config import (
|
|
32
|
+
BakeoffConfig,
|
|
33
|
+
BakeoffResults,
|
|
34
|
+
ComboResult,
|
|
35
|
+
)
|
|
36
|
+
from chunkshop.bakeoff.gold import load_gold_queries
|
|
37
|
+
from chunkshop.bakeoff.keys import chunker_key, combo_table, embedder_key
|
|
38
|
+
from chunkshop.bakeoff.score import aggregate_scores, score_query
|
|
39
|
+
from chunkshop.config import (
|
|
40
|
+
CellConfig,
|
|
41
|
+
NoneExtractor,
|
|
42
|
+
RuntimeConfig,
|
|
43
|
+
TargetConfig,
|
|
44
|
+
)
|
|
45
|
+
from chunkshop.embedders import load_embedder
|
|
46
|
+
from chunkshop.runner import run_cell
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _chunker_label(cfg) -> str:
    """Build the chunker text shown in report tables.

    Uses `cfg.type` when present, otherwise the class name. `fixed_overlap`
    and `neighbor_expand` additionally show their parameters; the latter
    nests the label of its `base` chunker.
    """
    kind = getattr(cfg, "type", type(cfg).__name__)
    if kind == "fixed_overlap":
        return f"fixed_overlap(window_words={cfg.window_words}, step_words={cfg.step_words})"
    if kind == "neighbor_expand":
        inner_label = _chunker_label(cfg.base)
        return f"neighbor_expand(window={cfg.window}, base={inner_label})"
    return kind
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _count_chunks(dsn: str, schema: str, table: str) -> int:
    """Return `COUNT(*)` for `schema.table`, using a fresh connection to `dsn`."""
    # Identifiers are composed via `psycopg.sql`, not string formatting.
    count_stmt = sql.SQL("SELECT COUNT(*) FROM {}.{}").format(
        sql.Identifier(schema), sql.Identifier(table)
    )
    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(count_stmt)
        row = cur.fetchone()
        return row[0]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _query_top_k(
    dsn: str,
    schema: str,
    table: str,
    query_vec: np.ndarray,
    k: int,
) -> list[tuple[str, int]]:
    """Run a pgvector nearest-neighbour query against `schema.table`.

    Rows are ordered by the `<=>` distance between `embedding` and
    `query_vec`, limited to `k`. Returns `[(doc_id, seq_num), ...]` in that
    order.
    """
    # pgvector text literal: "[v1,v2,...]" with 8 decimal places per component.
    components = [f"{x:.8f}" for x in query_vec.tolist()]
    vec_str = "[" + ",".join(components) + "]"

    stmt = sql.SQL(
        "SELECT doc_id, seq_num FROM {}.{} "
        "ORDER BY embedding <=> %s::vector LIMIT %s"
    ).format(sql.Identifier(schema), sql.Identifier(table))

    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(stmt, (vec_str, k))
        rows = cur.fetchall()
        return [(row[0], row[1]) for row in rows]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _build_cell_cfg(
    bakeoff: BakeoffConfig,
    chunker_cfg,
    embedder_cfg,
    table: str,
) -> CellConfig:
    """Assemble the `CellConfig` used to ingest one (chunker, embedder) combo.

    Source and framer come from the bakeoff config; the target reuses its DSN
    env var and schema and writes to `table` in overwrite mode with
    `hnsw=False`. No extractor is run (`NoneExtractor`).
    """
    runtime = bakeoff.runtime or RuntimeConfig()
    ck = chunker_key(chunker_cfg)
    ek = embedder_key(embedder_cfg)

    target = TargetConfig(
        dsn_env=bakeoff.target.dsn_env,
        schema=bakeoff.target.schema_name,
        table=table,
        mode="overwrite",
        hnsw=False,
    )
    return CellConfig(
        cell_name=f"{bakeoff.name}__{ck}__{ek}",
        source=bakeoff.source,
        framer=bakeoff.framer,
        chunker=chunker_cfg,
        embedder=embedder_cfg,
        extractor=NoneExtractor(),
        target=target,
        runtime=runtime,
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _corpus_label(bakeoff: BakeoffConfig) -> str:
    """Pick a human-readable corpus label.

    A `files` source is labelled with its glob (or "files" if it has none);
    any other source is labelled with its `type`, or "unknown" if absent.
    """
    source = bakeoff.source
    is_files_source = getattr(source, "type", None) == "files"
    if not is_files_source:
        return getattr(source, "type", "unknown")
    return getattr(source, "glob", "files")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def run_bakeoff(cfg: BakeoffConfig) -> BakeoffResults:
    """Execute every (chunker, embedder) combo, score against gold, return results.

    Runs three serial phases: ingest each combo into its own table, embed the
    gold queries once per unique embedder key, then query and score each
    combo's table.

    Caller must set `os.environ[cfg.target.dsn_env]` before calling — the sink
    reads the DSN from there.

    Raises:
        RuntimeError: the DSN env var is unset, or ingest of any combo
            reported an error.
    """
    if cfg.target.dsn_env not in os.environ:
        raise RuntimeError(
            f"DSN env var {cfg.target.dsn_env!r} is not set. The CLI sets it from "
            "--dsn / $CHUNKSHOP_DSN before calling run_bakeoff."
        )
    dsn = os.environ[cfg.target.dsn_env]
    schema = cfg.target.schema_name

    gold = load_gold_queries(cfg.gold_queries)
    combos_in = [(c, e) for c in cfg.matrix.chunkers for e in cfg.matrix.embedders]

    started_at = time.strftime("%Y-%m-%d %H:%M:%S")

    # ----- Phase 1: ingest every combo serially -----
    ingest_meta: list[dict[str, Any]] = []
    for c, e in combos_in:
        table = combo_table(c, e)
        cell_cfg = _build_cell_cfg(cfg, c, e, table)
        # perf_counter (not time.time) so the measured interval is immune to
        # wall-clock adjustments and consistent with the Phase 2 timing below.
        t0 = time.perf_counter()
        res = run_cell(cell_cfg)
        wall = time.perf_counter() - t0
        if res.error:
            raise RuntimeError(
                f"ingest failed for combo {table}: {res.error}"
            )
        n_chunks = _count_chunks(dsn, schema, table)
        ingest_meta.append(
            {"chunker": c, "embedder": e, "table": table,
             "chunks": n_chunks,
             "wall_seconds": round(wall, 2),
             # Subset of wall_seconds: just the embedder's portion.
             # Falls back to 0.0 when the result carries no `embed_seconds`.
             "embed_seconds": round(getattr(res, "embed_seconds", 0.0), 2)}
        )

    # ----- Phase 2: embed gold queries once per embedder (not once per combo) -----
    # Two combos sharing an embedder share the same query vectors.
    # Capture per-embedder query-embed wall time — that's a proxy for
    # production query-time latency at scale.
    query_texts = [g.query for g in gold]
    query_vecs_by_emb_key: dict[str, np.ndarray] = {}
    query_embed_seconds_by_emb_key: dict[str, float] = {}
    for e in cfg.matrix.embedders:
        k = embedder_key(e)
        if k in query_vecs_by_emb_key:
            continue
        embedder = load_embedder(e)
        t_qe = time.perf_counter()
        vecs = embedder.embed(query_texts)
        query_embed_seconds_by_emb_key[k] = round(time.perf_counter() - t_qe, 3)
        query_vecs_by_emb_key[k] = vecs

    # ----- Phase 3: score every combo -----
    # NOTE(review): retrieval is limited to `scoring.top_k` rows, so any
    # recall cutoff in `scoring.k` larger than `top_k` is scored on a
    # truncated list — confirm whether configs should enforce top_k >= max(k).
    combo_results: list[ComboResult] = []
    for meta in ingest_meta:
        c = meta["chunker"]
        e = meta["embedder"]
        ck = chunker_key(c)
        ek = embedder_key(e)
        table = meta["table"]
        vecs = query_vecs_by_emb_key[ek]

        per_query: list[dict[str, Any]] = []
        per_query_scores: list[dict[str, float]] = []
        for i, g in enumerate(gold):
            # vecs[i] is the embedding of gold[i] (same order as query_texts).
            top = _query_top_k(dsn, schema, table, vecs[i], k=cfg.scoring.top_k)
            doc_ids = [t[0] for t in top]
            s = score_query(doc_ids, g.gold_doc_id, cfg.scoring.k)
            per_query_scores.append(s)
            per_query.append({
                "query": g.query,
                "gold_doc_id": g.gold_doc_id,
                "top_k": [{"doc_id": t[0], "seq_num": t[1]} for t in top],
                **s,
            })

        agg = aggregate_scores(per_query_scores)

        combo_results.append(ComboResult(
            chunker_key=ck,
            embedder_key=ek,
            chunker_label=_chunker_label(c),
            embedder_label=e.model_name,
            table=table,
            ingest_chunks=meta["chunks"],
            ingest_wall_seconds=meta["wall_seconds"],
            ingest_embed_seconds=meta["embed_seconds"],
            aggregate=agg,
            per_query=per_query,
        ))

    return BakeoffResults(
        run_name=cfg.name,
        started_at=started_at,
        corpus_label=_corpus_label(cfg),
        n_queries=len(gold),
        n_combos=len(combo_results),
        combos=combo_results,
        gold_queries=[{"query": g.query, "gold_doc_id": g.gold_doc_id} for g in gold],
        query_embed_seconds_by_embedder=query_embed_seconds_by_emb_key,
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Pure scoring functions for bakeoff evaluation (SC-005).
|
|
2
|
+
|
|
3
|
+
Given a ranked list of retrieved doc_ids for a query and a single gold doc_id,
|
|
4
|
+
compute recall@k for each k in `k_values` and MRR. Aggregation across queries
|
|
5
|
+
is a simple arithmetic mean. No external deps — kept here for easy unit test.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Iterable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def score_query(
    ranked_doc_ids: list[str],
    gold_doc_id: str,
    k_values: Iterable[int],
) -> dict[str, float]:
    """Compute recall@k and MRR for one query with a single gold document.

    The result has one `recall_at_K` entry (0 or 1) per value in `k_values`,
    followed by `mrr`: the reciprocal of the 1-based position of the first
    gold hit, or 0.0 when the gold document is absent. MRR looks at the whole
    list, so callers wanting a bounded MRR should slice to top-K first.
    """
    scores: dict[str, float] = {
        f"recall_at_{cutoff}": 1 if gold_doc_id in ranked_doc_ids[:cutoff] else 0
        for cutoff in k_values
    }
    # First matching position wins; default covers "gold never retrieved".
    scores["mrr"] = next(
        (
            1.0 / position
            for position, doc_id in enumerate(ranked_doc_ids, start=1)
            if doc_id == gold_doc_id
        ),
        0.0,
    )
    return scores
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def aggregate_scores(per_query: list[dict[str, float]]) -> dict[str, float]:
    """Average every metric over all queries; an empty list yields `{}`.

    The metric names are taken from the first entry.
    """
    if not per_query:
        return {}

    count = len(per_query)
    means: dict[str, float] = {}
    for metric in per_query[0].keys():
        total = sum(row[metric] for row in per_query)
        means[metric] = total / count
    return means
|