chunkshop 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. chunkshop-0.3.0/PKG-INFO +426 -0
  2. chunkshop-0.3.0/README.md +373 -0
  3. chunkshop-0.3.0/pyproject.toml +59 -0
  4. chunkshop-0.3.0/setup.cfg +4 -0
  5. chunkshop-0.3.0/src/chunkshop/__init__.py +5 -0
  6. chunkshop-0.3.0/src/chunkshop/bakeoff/__init__.py +25 -0
  7. chunkshop-0.3.0/src/chunkshop/bakeoff/config.py +115 -0
  8. chunkshop-0.3.0/src/chunkshop/bakeoff/gold.py +40 -0
  9. chunkshop-0.3.0/src/chunkshop/bakeoff/keys.py +53 -0
  10. chunkshop-0.3.0/src/chunkshop/bakeoff/output.py +159 -0
  11. chunkshop-0.3.0/src/chunkshop/bakeoff/runner.py +228 -0
  12. chunkshop-0.3.0/src/chunkshop/bakeoff/score.py +41 -0
  13. chunkshop-0.3.0/src/chunkshop/chunkers/__init__.py +86 -0
  14. chunkshop-0.3.0/src/chunkshop/chunkers/_sentence_split.py +56 -0
  15. chunkshop-0.3.0/src/chunkshop/chunkers/_splitting.py +111 -0
  16. chunkshop-0.3.0/src/chunkshop/chunkers/_summarizer.py +73 -0
  17. chunkshop-0.3.0/src/chunkshop/chunkers/base.py +18 -0
  18. chunkshop-0.3.0/src/chunkshop/chunkers/fixed_overlap.py +35 -0
  19. chunkshop-0.3.0/src/chunkshop/chunkers/hierarchical_summary.py +118 -0
  20. chunkshop-0.3.0/src/chunkshop/chunkers/hierarchy.py +84 -0
  21. chunkshop-0.3.0/src/chunkshop/chunkers/neighbor_expand.py +28 -0
  22. chunkshop-0.3.0/src/chunkshop/chunkers/semantic.py +192 -0
  23. chunkshop-0.3.0/src/chunkshop/chunkers/sentence_aware.py +71 -0
  24. chunkshop-0.3.0/src/chunkshop/chunkers/summary_embed.py +33 -0
  25. chunkshop-0.3.0/src/chunkshop/cli.py +241 -0
  26. chunkshop-0.3.0/src/chunkshop/config.py +458 -0
  27. chunkshop-0.3.0/src/chunkshop/embedders/__init__.py +22 -0
  28. chunkshop-0.3.0/src/chunkshop/embedders/_registry.py +169 -0
  29. chunkshop-0.3.0/src/chunkshop/embedders/base.py +8 -0
  30. chunkshop-0.3.0/src/chunkshop/embedders/fastembed_provider.py +73 -0
  31. chunkshop-0.3.0/src/chunkshop/extractors/__init__.py +36 -0
  32. chunkshop-0.3.0/src/chunkshop/extractors/base.py +8 -0
  33. chunkshop-0.3.0/src/chunkshop/extractors/composite.py +44 -0
  34. chunkshop-0.3.0/src/chunkshop/extractors/keybert_phrases.py +38 -0
  35. chunkshop-0.3.0/src/chunkshop/extractors/lang_detect.py +51 -0
  36. chunkshop-0.3.0/src/chunkshop/extractors/none_provider.py +10 -0
  37. chunkshop-0.3.0/src/chunkshop/extractors/rake_keywords.py +29 -0
  38. chunkshop-0.3.0/src/chunkshop/extractors/result.py +19 -0
  39. chunkshop-0.3.0/src/chunkshop/extractors/spacy_entities.py +54 -0
  40. chunkshop-0.3.0/src/chunkshop/framers/__init__.py +34 -0
  41. chunkshop-0.3.0/src/chunkshop/framers/base.py +16 -0
  42. chunkshop-0.3.0/src/chunkshop/framers/heading_boundary.py +60 -0
  43. chunkshop-0.3.0/src/chunkshop/framers/identity.py +14 -0
  44. chunkshop-0.3.0/src/chunkshop/framers/jsonpath.py +66 -0
  45. chunkshop-0.3.0/src/chunkshop/framers/regex_boundary.py +52 -0
  46. chunkshop-0.3.0/src/chunkshop/orchestrator.py +138 -0
  47. chunkshop-0.3.0/src/chunkshop/pipeline.py +130 -0
  48. chunkshop-0.3.0/src/chunkshop/runner.py +139 -0
  49. chunkshop-0.3.0/src/chunkshop/sink.py +306 -0
  50. chunkshop-0.3.0/src/chunkshop/sources/__init__.py +39 -0
  51. chunkshop-0.3.0/src/chunkshop/sources/base.py +15 -0
  52. chunkshop-0.3.0/src/chunkshop/sources/files.py +33 -0
  53. chunkshop-0.3.0/src/chunkshop/sources/http.py +104 -0
  54. chunkshop-0.3.0/src/chunkshop/sources/json_corpus.py +26 -0
  55. chunkshop-0.3.0/src/chunkshop/sources/pg_table.py +75 -0
  56. chunkshop-0.3.0/src/chunkshop/sources/s3.py +65 -0
  57. chunkshop-0.3.0/src/chunkshop/summarizers/__init__.py +11 -0
  58. chunkshop-0.3.0/src/chunkshop/summarizers/lede.py +29 -0
  59. chunkshop-0.3.0/src/chunkshop/summarizers/sumy.py +67 -0
  60. chunkshop-0.3.0/src/chunkshop.egg-info/PKG-INFO +426 -0
  61. chunkshop-0.3.0/src/chunkshop.egg-info/SOURCES.txt +63 -0
  62. chunkshop-0.3.0/src/chunkshop.egg-info/dependency_links.txt +1 -0
  63. chunkshop-0.3.0/src/chunkshop.egg-info/entry_points.txt +2 -0
  64. chunkshop-0.3.0/src/chunkshop.egg-info/requires.txt +45 -0
  65. chunkshop-0.3.0/src/chunkshop.egg-info/top_level.txt +1 -0
@@ -0,0 +1,426 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunkshop
3
+ Version: 0.3.0
4
+ Summary: Standalone ingest-to-pgvector: source → chunker → embedder → extractor → table. int8 by default.
5
+ Author-email: The Yonk <matt@theyonk.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/yonk-labs/chunkshop
8
+ Project-URL: Issues, https://github.com/yonk-labs/chunkshop/issues
9
+ Project-URL: Repository, https://github.com/yonk-labs/chunkshop.git
10
+ Keywords: rag,pgvector,embeddings,chunking,ingest
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Database
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: click>=8.1
20
+ Requires-Dist: psycopg[binary]>=3.2
21
+ Requires-Dist: pydantic>=2.7
22
+ Requires-Dist: pyyaml>=6.0
23
+ Requires-Dist: numpy>=1.26
24
+ Requires-Dist: fastembed>=0.3
25
+ Provides-Extra: extractors
26
+ Requires-Dist: rake-nltk>=1.0.6; extra == "extractors"
27
+ Requires-Dist: nltk>=3.8; extra == "extractors"
28
+ Provides-Extra: keybert
29
+ Requires-Dist: keybert>=0.8; extra == "keybert"
30
+ Requires-Dist: sentence-transformers>=3.0; extra == "keybert"
31
+ Provides-Extra: spacy
32
+ Requires-Dist: spacy>=3.7; extra == "spacy"
33
+ Provides-Extra: lang
34
+ Requires-Dist: langdetect>=1.0.9; extra == "lang"
35
+ Provides-Extra: nlp
36
+ Requires-Dist: keybert>=0.8; extra == "nlp"
37
+ Requires-Dist: sentence-transformers>=3.0; extra == "nlp"
38
+ Requires-Dist: spacy>=3.7; extra == "nlp"
39
+ Requires-Dist: langdetect>=1.0.9; extra == "nlp"
40
+ Provides-Extra: quantize
41
+ Requires-Dist: onnx>=1.14; extra == "quantize"
42
+ Provides-Extra: lede
43
+ Requires-Dist: lede>=0.3; extra == "lede"
44
+ Provides-Extra: sumy
45
+ Requires-Dist: sumy>=0.11; extra == "sumy"
46
+ Provides-Extra: summarize
47
+ Provides-Extra: s3
48
+ Requires-Dist: boto3>=1.28; extra == "s3"
49
+ Provides-Extra: dev
50
+ Requires-Dist: pytest>=8; extra == "dev"
51
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
52
+ Requires-Dist: onnx>=1.14; extra == "dev"
53
+
54
+ # chunkshop (Python)
55
+
56
+ Reference implementation of the chunkshop ingest tool. v0.3.0, alpha.
57
+
58
+ **New here?** Start with the [**end-to-end tutorial**](../docs/tutorial.md) — a guided
59
+ walkthrough from zero (no Postgres) to a running semantic query.
60
+
61
+ This file is the field-by-field reference: every CLI flag, every YAML field, the
62
+ troubleshooting table. Use it alongside the tutorial once you know what you're doing.
63
+
64
+ For the high-level shape and mermaid diagram, see the [top-level README](../README.md).
65
+
66
+ ## Install
67
+
68
+ From source (recommended while alpha):
69
+
70
+ ```bash
71
+ cd chunkshop/python
72
+ uv sync --extra dev
73
+ ```
74
+
75
+ As a path dependency from another project:
76
+
77
+ ```toml
78
+ [tool.uv.sources]
79
+ chunkshop = { path = "../chunkshop/python", editable = true }
80
+ ```
81
+
82
+ Optional extras:
83
+
84
+ | Extra | What you get |
85
+ |--------------|----------------------------------------------------------------------|
86
+ | `extractors` | `rake-nltk` + `nltk` for the RAKE extractor. |
87
+ | `keybert` | `keybert` + `sentence-transformers` for the `keybert_phrases` extractor. |
88
+ | `spacy` | `spacy` for the `spacy_entities` NER extractor. |
89
+ | `lang` | `langdetect` for the `lang_detect` extractor. |
90
+ | `nlp` | Umbrella: `keybert` + `spacy` + `lang` in one install. |
91
+ | `lede` | Sibling `extractive_summary` repo as a path dep — enables `summary_embed` with `lede.tfidf.summarize`. |
92
+ | `sumy` | `sumy` + NLTK corpora for the sumy adapter shim (`chunkshop.summarizers.sumy`). |
93
+ | `quantize` | `onnx` for on-the-fly quantization scratch. |
94
+ | `dev` | `pytest`, `pytest-asyncio`, `onnx`. |
95
+
96
+ Python ≥ 3.12 required.
97
+
98
+ ## Prerequisites
99
+
100
+ - **Postgres ≥ 14** with the `pgvector` extension installed
101
+ (`CREATE EXTENSION vector;` must succeed in your target DB).
102
+ - **Disk space for model cache** in `~/.cache/fastembed/` — ~85 MB for int8 `bge-base`,
103
+ ~550 MB for `nomic`.
104
+ - **An env var holding your DSN.** The target config references it by name, not by value.
105
+
106
+ ## Quick run
107
+
108
+ ```bash
109
+ export CHUNKSHOP_DSN="postgresql://postgres:postgres@localhost:5432/mydb"
110
+
111
+ # Point at the sample corpus in docs/samples/ for a real end-to-end run:
112
+ chunkshop ingest --config ../docs/samples/sample.yaml
113
+
114
+ # Or copy the template and edit it:
115
+ cp src/chunkshop/configs/example-files-to-bge.yaml my-cell.yaml
116
+ chunkshop ingest --config my-cell.yaml
117
+ ```
118
+
119
+ Success looks like:
120
+
121
+ ```json
122
+ {
123
+ "cell_name": "example_files",
124
+ "docs_processed": 47,
125
+ "chunks_written": 312,
126
+ "wall_seconds": 18.4,
127
+ "error": null
128
+ }
129
+ ```
130
+
131
+ ## CLI
132
+
133
+ Three subcommands: `ingest` (one cell), `orchestrate` (many cells in parallel), and `bakeoff` (score a chunker × embedder matrix against gold queries).
134
+
135
+ ### `chunkshop ingest`
136
+
137
+ Runs one YAML end-to-end.
138
+
139
+ ```
140
+ chunkshop ingest --config PATH [--doc-limit N] [--log PATH] [--omp-threads N]
141
+ ```
142
+
143
+ | Flag | YAML override | Purpose |
144
+ |-----------------|------------------------|-----------------------------------------|
145
+ | `-c, --config` | — | Required. Path to YAML. |
146
+ | `--doc-limit` | `runtime.doc_limit` | Smoke-test mode; stop after N docs. |
147
+ | `--log` | `runtime.log_path` | Append stdout log lines to this file. |
148
+ | `--omp-threads` | `runtime.omp_num_threads` | Cap BLAS/OMP threads before ORT loads. |
149
+
150
+ Exit code: `0` on success, `1` if the cell errored. Stdout = a JSON summary.
151
+
152
+ ### `chunkshop orchestrate`
153
+
154
+ Runs N cells in parallel as subprocesses.
155
+
156
+ ```
157
+ chunkshop orchestrate (--config-dir DIR | --config PATH [--config PATH ...])
158
+ [--concurrency N]
159
+ [--checkpoints "60,120,300,600"]
160
+ [--timeout SECONDS]
161
+ [--smoke | --full]
162
+ ```
163
+
164
+ | Flag | Default | Purpose |
165
+ |------------------|-----------------|---------------------------------------------------------------------|
166
+ | `-d, --config-dir` | — | Run every `*.yaml`/`*.yml` in the directory. |
167
+ | `-c, --config` | — | Explicit path; repeatable. Mutually exclusive with `--config-dir`. |
168
+ | `--concurrency` | `4` | Max parallel cells (subprocess pool size). |
169
+ | `--checkpoints` | `60,120,300,600`| Seconds at which to print a status report. |
170
+ | `--timeout` | `7200` (2h) | Overall wall limit; survivors get SIGTERM to their process group. |
171
+ | `--smoke` | off | Force `doc_limit=1` + `concurrency=1`. Useful for "does it crash". |
172
+
173
+ Stdout = checkpoint reports during the run, JSON summary at the end.
174
+
175
+ ### `chunkshop bakeoff`
176
+
177
+ Runs a chunker × embedder matrix against a corpus with hand-written gold
178
+ queries, scores recall@k + MRR per combo, writes a leaderboard + a
179
+ runnable `recommended.yaml`. Config-driven — the matrix lives in YAML,
180
+ not on the command line.
181
+
182
+ ```
183
+ chunkshop bakeoff --config PATH [--dsn DSN] [--yes] [--keep-schema]
184
+ ```
185
+
186
+ | Flag | Default | Purpose |
187
+ |-----------------|-------------------------|---------------------------------------------------------------|
188
+ | `--config` | — | Path to the bakeoff YAML. Required. |
189
+ | `--dsn` | `$CHUNKSHOP_DSN` | Postgres DSN. Required (env var or flag). |
190
+ | `--yes` | off | Bypass the >50-cell matrix confirmation prompt. |
191
+ | `--keep-schema` | off | Keep the bakeoff schema after run — useful for debugging. |
192
+
193
+ Outputs land in `skill-output/bakeoff/{name}/`:
194
+ - `results.json` — raw per-combo + per-query data.
195
+ - `report.md` — leaderboard sorted by MRR, per-query detail,
196
+ statistical-power caveat.
197
+ - `recommended.yaml` — top combo pre-filled as a runnable
198
+ `chunkshop ingest` cell.
199
+
200
+ Full walkthrough: [`../docs/tutorial-bakeoff.md`](../docs/tutorial-bakeoff.md).
201
+ Recipe card: [`../docs/quickstart-bakeoff.md`](../docs/quickstart-bakeoff.md).
202
+
203
+ ## YAML reference
204
+
205
+ Every cell config has five sections plus an optional `runtime`. Extra keys are rejected
206
+ (`extra="forbid"` in pydantic), so typos fail loudly.
207
+
208
+ ```yaml
209
+ cell_name: my_cell
210
+ source: { ... }
211
+ chunker: { ... }
212
+ embedder: { ... }
213
+ extractor: { ... } # optional, defaults to {type: none}
214
+ target: { ... }
215
+ runtime: { ... } # optional, sensible defaults below
216
+ ```
217
+
218
+ ### `source`
219
+
220
+ | `type` | Required fields | Optional fields |
221
+ |----------------|----------------------------------------|--------------------------------------------------------------------------|
222
+ | `files` | `glob` | `id_from: path \| stem \| sha1` (default `stem`), `encoding` (`utf-8`) |
223
+ | `json_corpus` | `path` | `documents_key` (`documents`), `id_field` (`id`), `content_field` (`content`), `title_field` (`title`) |
224
+ | `pg_table` | `dsn_env`, `schema`, `table`, `id_column`, `content_column` | `title_column`, `where` |
225
+ | `http` | `urls` or `sitemap` | — (stub today) |
226
+ | `s3` | `bucket` | `prefix` (stub today) |
227
+
228
+ ### `chunker`
229
+
230
+ Seven chunkers in three families. Pick one per cell.
231
+
232
+ **Structural** — split on headings, paragraphs, or word counts:
233
+
234
+ | `type` | Required | Defaults |
235
+ |-------------------|---------------------------|----------------------------------------------|
236
+ | `sentence_aware` | — | `doc_type: prose` (or `code`), `max_chars: 2000`, `min_chars: 200` |
237
+ | `fixed_overlap` | — | `window_words: 300`, `step_words: 150` |
238
+ | `hierarchy` | — | `prefix_heading: true`, `min_section_chars: 100`, `max_chars: 2000` |
239
+ | `neighbor_expand` | `base:` (nested chunker) | `window: 1` |
240
+
241
+ **Semantic** — splits on embedding-drift boundaries (no heading needed):
242
+
243
+ | `type` | Required | Defaults |
244
+ |------------|----------|--------------------------------------------------------------------|
245
+ | `semantic` | — | `boundary_model: "sentence-transformers/all-MiniLM-L6-v2-int8"`, `breakpoint_percentile: 95`, `min_sentences_per_chunk: 3`, `max_chunk_chars: 2000`, `sentence_splitter: "naive"` |
246
+
247
+ Pass `boundary_model: "same"` to reuse the cell's main embedder (trades
248
+ speed for memory). See [`../docs/tutorial-semantic.md`](../docs/tutorial-semantic.md).
249
+
250
+ **Summary-layer** — wrap any base chunker and change what gets embedded
251
+ vs. what gets stored (`summary_embed`) or emit fine+coarse rows linked by
252
+ `group_id` (`hierarchical_summary`):
253
+
254
+ | `type` | Required | Defaults |
255
+ |--------------------------|------------------------------------|--------------------------------------|
256
+ | `summary_embed` | `base:`, `summarizer:` | — |
257
+ | `hierarchical_summary` | `base:`, `summarizer:`, `grouping:` | `grouping: {strategy: fixed_n, n: 5}` |
258
+
259
+ The `summarizer` config is a discriminated union: `{mode: external, field: ...}`
260
+ pulls a pre-computed summary from a source document metadata field; `{mode:
261
+ callable, module: "lede.tfidf", function: "summarize", kwargs: {...}}`
262
+ imports lazily at first use; `{mode: passthrough}` reuses the raw chunk as
263
+ the summary (baseline). See [`../docs/summaries.md`](../docs/summaries.md)
264
+ and [`../docs/tutorial-summaries.md`](../docs/tutorial-summaries.md).
265
+
266
+ Full per-chunker guidance: [`../docs/chunkers.md`](../docs/chunkers.md).
267
+
268
+ ### `embedder`
269
+
270
+ Only `fastembed` today.
271
+
272
+ | Field | Required | Default | Notes |
273
+ |--------------|----------|---------|----------------------------------------------------------|
274
+ | `type` | yes | — | Literal `fastembed`. |
275
+ | `model_name` | yes | — | e.g. `Xenova/bge-base-en-v1.5-int8`. See [embedders.md](../docs/embedders.md). |
276
+ | `dim` | yes | — | Must match the model. Mismatch fails loudly at first embed. |
277
+ | `batch_size` | no | `64` | Per-call batch to `fastembed.embed`. |
278
+ | `threads` | no | `None` | `None` = auto (bad on shared boxes). Set to 4 typically. |
279
+
280
+ ### `extractor`
281
+
282
+ | `type` | Fields |
283
+ |-------------------|-------------------------------------------|
284
+ | `none` | — (default) |
285
+ | `rake_keywords` | `top_k: 10`, `min_chars: 3` (defaults) |
286
+
287
+ RAKE downloads NLTK corpora (`stopwords`, `punkt`, `punkt_tab`) on first use to `~/nltk_data/`.
288
+
289
+ ### `target`
290
+
291
+ | Field | Required | Default | Notes |
292
+ |--------------------|----------------------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
293
+ | `dsn_env` | no | `AGE_BAKEOFF_PGRG_DSN` | Name of the env var holding your DSN. **Override this** to `CHUNKSHOP_DSN` in your configs. |
294
+ | `schema` | yes | — | Lowercase ident; must match `^[a-z_][a-z0-9_]*$`. Created if missing. |
295
+ | `table` | yes | — | Same ident rule. |
296
+ | `mode` | no | `overwrite` | One of `overwrite`, `append`, `create_if_missing`. See [`../docs/tutorial-multi-source.md`](../docs/tutorial-multi-source.md). |
297
+ | `source_tag` | when `mode=append` | `null` | Ident-safe tag written to every row's `source` column. Required for `append`; optional (but recommended) for `overwrite`/`create_if_missing`. |
298
+ | `promote_metadata` | no | `[]` | List of `{path, type}` pairs lifting jsonb metadata paths into typed columns. `path` is lowercased + `.` → `__` for the column name. |
299
+ | `force_overwrite` | no | `false` | Bypasses the "refuse to drop a table that holds rows from a foreign `source_tag`" safety check in `overwrite` mode. |
300
+ | `overwrite` | no (soft-deprecated) | `false` | Legacy boolean. Still honored when `mode=overwrite` (acts as the DROP+CREATE switch). Prefer the new `mode` field for new configs. |
301
+ | `hnsw` | no | `true` | `false` for tiny test tables where HNSW is slower than seq scan. |
302
+
303
+ ### Multi-source ingest
304
+
305
+ Multiple cells can write to the same table by tagging each cell's rows with a `source_tag`.
306
+ Cell A creates the table with `mode: create_if_missing`, Cell B appends with `mode: append`
307
+ and its own tag. Queries filter or group by the `source` column. See
308
+ [`../docs/tutorial-multi-source.md`](../docs/tutorial-multi-source.md) for the end-to-end walkthrough.
309
+
310
+ ```yaml
311
+ target:
312
+   dsn_env: CHUNKSHOP_DSN
313
+   schema: mydata
314
+   table: all_docs
315
+   mode: append
316
+   source_tag: support_tickets
317
+ ```
318
+
319
+ ### `runtime`
320
+
321
+ | Field | Default | Notes |
322
+ |---------------------|---------|------------------------------------------------------------------|
323
+ | `omp_num_threads` | `1` | Sets `OMP/MKL/OPENBLAS/NUMEXPR` env vars before ORT loads. |
324
+ | `doc_limit` | `null` | Stop after N docs. Smoke-test lever. |
325
+ | `log_path` | `null` | Mirror stdout heartbeats to this file. Parent dirs auto-created. |
326
+ | `heartbeat_every` | `25` | Log a progress line every N docs. |
327
+
328
+ ## Environment variables
329
+
330
+ | Var | When chunkshop reads it |
331
+ |----------------------------------------|------------------------------------------------------------|
332
+ | `$<target.dsn_env>` (default `AGE_BAKEOFF_PGRG_DSN`) | At sink construction; must be a valid libpq DSN. |
333
+ | `OMP_NUM_THREADS` and friends | Set by `runner` before any numpy/ORT import. |
334
+ | `HF_HOME` / `HF_HUB_CACHE` | Respected by fastembed's downloader if you've moved the cache. |
335
+
336
+ ## Troubleshooting
337
+
338
+ ### "no files matched glob: /path/**/*.md"
339
+
340
+ Your `source.glob` didn't match anything. Test it in a shell first:
341
+
342
+ ```bash
343
+ ls /path/**/*.md | head
344
+ ```
345
+
346
+ Note that chunkshop uses Python's `glob.glob(..., recursive=True)` — `**` only matches across
347
+ directories when it's its own path component (`/foo/**/*.md`, not `/foo/**.md`).
348
+
349
+ ### "relation already exists" on second run
350
+
351
+ `target.overwrite` is `false` by default. Either flip it to `true` (drops + recreates) or
352
+ drop the table yourself. The `ON CONFLICT DO UPDATE` in the writer will also happily upsert
353
+ into an existing table.
354
+
355
+ ### "model X produced dim Y, config says dim=Z"
356
+
357
+ Your YAML's `embedder.dim` doesn't match the model's output. Look up the right dim in
358
+ [`../docs/embedders.md`](../docs/embedders.md) — `bge-small`=384, `bge-base`=768,
359
+ `nomic`=768.
360
+
361
+ ### "CREATE EXTENSION IF NOT EXISTS vector" fails with permission denied
362
+
363
+ Your DB role can't create extensions. Ask a superuser to run it once per database:
364
+
365
+ ```sql
366
+ CREATE EXTENSION IF NOT EXISTS vector;
367
+ ```
368
+
369
+ Then re-run chunkshop — the sink's `CREATE EXTENSION IF NOT EXISTS` will be a no-op.
370
+
371
+ ### "table/schema must match ^[a-z_][a-z0-9_]*$"
372
+
373
+ chunkshop refuses to interpolate mixed-case or quoted identifiers — SQL injection safety via
374
+ allowlist. Lowercase your `schema` and `table`.
375
+
376
+ ### Ingest is slow and my CPU fans are loud
377
+
378
+ Three knobs. Pick one:
379
+
380
+ - Drop `embedder.batch_size` from 64 to 32 — less memory pressure, slower per-doc.
381
+ - Set `embedder.threads: 4` (or 2) — caps ORT's worker pool.
382
+ - If running under `orchestrate`, reduce `--concurrency`.
383
+
384
+ See the thread-tuning table in [`../docs/embedders.md`](../docs/embedders.md).
385
+
386
+ ### First run hangs on "downloading model"
387
+
388
+ Fastembed is pulling the ONNX from HuggingFace. A hang here usually means a network problem or an HF outage. Check
389
+ `curl -sI https://huggingface.co/` and your proxy settings. The file lands in
390
+ `~/.cache/fastembed/<model-name>/`.
391
+
392
+ ### nltk errors on first `rake_keywords` run
393
+
394
+ The extractor downloads `stopwords`, `punkt`, `punkt_tab` into `~/nltk_data/` on first use.
395
+ Behind a strict firewall? Pre-download once:
396
+
397
+ ```python
398
+ import nltk
399
+ for r in ("stopwords", "punkt", "punkt_tab"):
400
+     nltk.download(r)
401
+ ```
402
+
403
+ ## Using chunkshop as a library
404
+
405
+ ```python
406
+ from chunkshop import load_config
407
+ from chunkshop.runner import run_cell
408
+
409
+ cfg = load_config("my-cell.yaml")
410
+ result = run_cell(cfg)
411
+ print(result.docs_processed, result.chunks_written, result.wall_seconds)
412
+ ```
413
+
414
+ Or skip the YAML and build a `CellConfig` directly — every section is a plain pydantic
415
+ model.
416
+
417
+ ## Tests
418
+
419
+ ```bash
420
+ cd python
421
+ uv run pytest
422
+ ```
423
+
424
+ Most tests are offline. `test_embedder_fastembed.py` and `test_int8_registry.py` download the
425
+ int8 `bge-base` model on first run and cache it — budget ~85 MB + a few seconds the first
426
+ time.