fancychunk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. fancychunk-0.1.0/.github/workflows/ci.yml +44 -0
  2. fancychunk-0.1.0/.github/workflows/release.yml +58 -0
  3. fancychunk-0.1.0/.gitignore +35 -0
  4. fancychunk-0.1.0/CHANGELOG.md +74 -0
  5. fancychunk-0.1.0/LICENSE +21 -0
  6. fancychunk-0.1.0/PKG-INFO +300 -0
  7. fancychunk-0.1.0/README.md +279 -0
  8. fancychunk-0.1.0/bench_qwen3.py +84 -0
  9. fancychunk-0.1.0/docs/specs/00-pipeline-overview.md +168 -0
  10. fancychunk-0.1.0/docs/specs/01-sentence-splitting.md +338 -0
  11. fancychunk-0.1.0/docs/specs/02-chunklet-grouping.md +347 -0
  12. fancychunk-0.1.0/docs/specs/03-semantic-chunking.md +362 -0
  13. fancychunk-0.1.0/docs/specs/04-late-chunking.md +340 -0
  14. fancychunk-0.1.0/docs/specs/05-contextual-headings.md +221 -0
  15. fancychunk-0.1.0/docs/specs/README.md +56 -0
  16. fancychunk-0.1.0/docs/specs/acceptance/checklist.md +235 -0
  17. fancychunk-0.1.0/docs/specs/contracts/public-api.md +182 -0
  18. fancychunk-0.1.0/docs/specs/test-vectors/01-sentence-splitting.md +234 -0
  19. fancychunk-0.1.0/docs/specs/test-vectors/02-chunklet-grouping.md +214 -0
  20. fancychunk-0.1.0/docs/specs/test-vectors/03-semantic-chunking.md +204 -0
  21. fancychunk-0.1.0/docs/specs/test-vectors/04-late-chunking.md +187 -0
  22. fancychunk-0.1.0/docs/specs/test-vectors/05-contextual-headings.md +180 -0
  23. fancychunk-0.1.0/examples/embedders/README.md +94 -0
  24. fancychunk-0.1.0/examples/embedders/huggingface_offsets.py +142 -0
  25. fancychunk-0.1.0/examples/embedders/qwen3_mlx.py +140 -0
  26. fancychunk-0.1.0/examples/embedders/remote_http.py +127 -0
  27. fancychunk-0.1.0/pyproject.toml +55 -0
  28. fancychunk-0.1.0/src/fancychunk/__init__.py +57 -0
  29. fancychunk-0.1.0/src/fancychunk/_constants.py +30 -0
  30. fancychunk-0.1.0/src/fancychunk/_markdown.py +169 -0
  31. fancychunk-0.1.0/src/fancychunk/_segmenter.py +117 -0
  32. fancychunk-0.1.0/src/fancychunk/_telemetry.py +52 -0
  33. fancychunk-0.1.0/src/fancychunk/_typing.py +13 -0
  34. fancychunk-0.1.0/src/fancychunk/chunklets.py +310 -0
  35. fancychunk-0.1.0/src/fancychunk/chunks.py +256 -0
  36. fancychunk-0.1.0/src/fancychunk/errors.py +59 -0
  37. fancychunk-0.1.0/src/fancychunk/headings.py +90 -0
  38. fancychunk-0.1.0/src/fancychunk/late_chunking.py +343 -0
  39. fancychunk-0.1.0/src/fancychunk/py.typed +0 -0
  40. fancychunk-0.1.0/src/fancychunk/sentences.py +263 -0
  41. fancychunk-0.1.0/tests/__init__.py +0 -0
  42. fancychunk-0.1.0/tests/_fake_embedder.py +106 -0
  43. fancychunk-0.1.0/tests/conftest.py +28 -0
  44. fancychunk-0.1.0/tests/test_bert.py +136 -0
  45. fancychunk-0.1.0/tests/test_chunklets.py +123 -0
  46. fancychunk-0.1.0/tests/test_chunks.py +181 -0
  47. fancychunk-0.1.0/tests/test_headings.py +92 -0
  48. fancychunk-0.1.0/tests/test_late_chunking.py +137 -0
  49. fancychunk-0.1.0/tests/test_pipeline.py +80 -0
  50. fancychunk-0.1.0/tests/test_sat.py +31 -0
  51. fancychunk-0.1.0/tests/test_sentences.py +176 -0
  52. fancychunk-0.1.0/tests/test_telemetry.py +141 -0
@@ -0,0 +1,44 @@
1
+ name: ci
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: ["3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Cache pip
25
+ uses: actions/cache@v4
26
+ with:
27
+ path: ~/.cache/pip
28
+ key: pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
29
+ restore-keys: pip-${{ matrix.python-version }}-
30
+
31
+ - name: Install package + dev tools
32
+ run: |
33
+ python -m pip install --upgrade pip
34
+ pip install -e .
35
+ pip install pytest pyright "opentelemetry-sdk>=1.20"
36
+
37
+ - name: Type-check (pyright strict)
38
+ run: pyright src/fancychunk
39
+
40
+ - name: Test (without SaT — segmenter swapped to punctuation via conftest)
41
+ run: pytest -q
42
+
43
+ # SaT model is 408 MB. Don't fetch it in CI by default; a manual
44
+ # workflow run can flip ``FANCYCHUNK_TEST_USE_SAT=1`` if needed.
@@ -0,0 +1,58 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*.*.*"
7
+
8
+ # PyPI Trusted Publishing: this workflow's identity (repo + workflow
9
+ # file + environment) is what PyPI authorizes, no API token stored
10
+ # anywhere. Configure once at
11
+ # https://pypi.org/manage/account/publishing/ — see README §Releases.
12
+
13
+ jobs:
14
+ build:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+
24
+ - name: Install build tooling
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install build
28
+
29
+ - name: Build sdist + wheel
30
+ run: python -m build
31
+
32
+ - name: Show artifacts
33
+ run: ls -lh dist/
34
+
35
+ - name: Upload artifacts
36
+ uses: actions/upload-artifact@v4
37
+ with:
38
+ name: dist
39
+ path: dist/
40
+
41
+ publish:
42
+ needs: build
43
+ runs-on: ubuntu-latest
44
+ environment: pypi
45
+ permissions:
46
+ id-token: write # required for Trusted Publishing
47
+ steps:
48
+ - uses: actions/download-artifact@v4
49
+ with:
50
+ name: dist
51
+ path: dist/
52
+
53
+ - name: Publish to PyPI
54
+ uses: pypa/gh-action-pypi-publish@release/v1
55
+ # No ``with:`` token field — Trusted Publishing uses the
56
+ # workflow's OIDC identity instead. PyPI must have the
57
+ # corresponding "trusted publisher" configured pointing at
58
+ # this repo + workflow file + environment.
@@ -0,0 +1,35 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ .pytest_cache/
11
+ .ruff_cache/
12
+ .mypy_cache/
13
+ .pyright_cache/
14
+
15
+ # Virtual environments
16
+ .venv/
17
+ venv/
18
+ env/
19
+
20
+ # uv
21
+ uv.lock
22
+
23
+ # pyenv (developer-local; pyproject.toml's requires-python is the
24
+ # project-wide source of truth).
25
+ .python-version
26
+
27
+ # IDE
28
+ .vscode/
29
+ .idea/
30
+ *.swp
31
+ *.swo
32
+
33
+ # OS
34
+ .DS_Store
35
+ Thumbs.db
@@ -0,0 +1,74 @@
1
+ # Changelog
2
+
3
+ All notable changes to fancychunk are recorded here. The format
4
+ follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
5
+ the project follows [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [Unreleased]
8
+
9
+ ### Changed (breaking — pre-1.0)
10
+ - `embed_with_late_chunking` now takes a `SegmentEmbedder` instead of
11
+ `TokenLevelEmbedder`. The new protocol is two methods + one
12
+ attribute (`n_ctx`, `count_tokens`, `embed_segment`) — replacing
13
+ the four-method `tokenize` / `detokenize` / `embed` / `n_ctx`
14
+ contract.
15
+ - Tokenization, special-token policy, and sentence-to-token
16
+ alignment are now the embedder's concern, not the library's. The
17
+ sentinel-token method (with `⊕` default), sentinel discovery,
18
+ and the `sentinel` keyword argument are removed.
19
+ - New `examples/embedders/` directory with reference adapters for
20
+ MLX (`qwen3_mlx.py`), HuggingFace transformers
21
+ (`huggingface_offsets.py`), and a remote HTTP service
22
+ (`remote_http.py`), each runnable.
23
+
24
+ ### Spec changes
25
+ - SPEC-CHUNK-420 rewritten: per-sentence alignment is the embedder's
26
+ responsibility; the library's contract is that
27
+ `sum(per_sentence_counts) == matrix_row_count`.
28
+ - SPEC-CHUNK-421 removed (sentinel character requirements are no
29
+ longer normative — implementations that adopt the sentinel method
30
+ test it against their own tokenizer).
31
+ - SPEC-CHUNK-412 simplified to four steps; the largest-remainder
32
+ safety net stays as the absorber for count drift between
33
+ budget-planning and the actual joined-input tokenization.
34
+ - TV-407 (sentinel collision detection) removed; TV-408 rewritten
35
+ for the new protocol.
36
+
37
+ ### Added
38
+ - Initial implementation of every pipeline stage in the spec:
39
+ - Stage 1 `split_sentences` (SaT-backed default segmenter via
40
+ `wtpsplit-lite`, lazy-loaded; punctuation fallback; heading
41
+ override; whitespace-trailing pass; vectorised boundary-score DP).
42
+ - Stage 2 `split_chunklets` (markdown-it-driven boundary
43
+ probabilities, statement-count function, vectorised
44
+ minimum-cost DP).
45
+ - Stage 3 `split_chunks` (unit-norm + discourse-vector projection,
46
+ rescaled cosine similarity, heading-aware modification recognising
47
+ both ATX and Setext, vectorised DP under covering constraint).
48
+ - Stage 4 `embed_with_late_chunking` (greedy segment construction
49
+ with backward preamble, embedder-supplied sentence alignment, largest-remainder
50
+ safety net).
51
+ - Stage 5 `heading_paths` (Markdown heading stack with reset
52
+ semantics).
53
+ - Public exception hierarchy (`fancychunk.FancyChunkError` and
54
+ subclasses).
55
+ - OpenTelemetry tracing for every public entry point. Spans are
56
+ zero-cost no-ops when no SDK is configured; span names are
57
+ `fancychunk.<function>` and attributes use the
58
+ `fancychunk.<key>` namespace.
59
+ - 88-test pytest suite covering every remaining test vector plus
60
+ cross-stage invariants and the inspection-only SPEC-CHUNK IDs
61
+ (101, 110, 113, 116). Tests use the punctuation segmenter by
62
+ default; set `FANCYCHUNK_TEST_USE_SAT=1` to exercise the real
63
+ model.
64
+ - GitHub Actions CI workflow runs pyright (strict mode) and pytest
65
+ against Python 3.12 and 3.13.
66
+
67
+ ### Removed
68
+ - Four test vectors that contradicted the spec's own cost math
69
+ (TV-113 as originally written, TV-205's downstream-partition
70
+ property, TV-209, TV-306). TV-113 was rewritten to test genuine
71
+ infeasibility; TV-205 was rewritten to test the upstream
72
+ probability mapping.
73
+
74
+ [Unreleased]: https://github.com/anthropics/fancychunk/compare/HEAD
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 fancychunk authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,300 @@
1
+ Metadata-Version: 2.4
2
+ Name: fancychunk
3
+ Version: 0.1.0
4
+ Summary: Text chunking for retrieval-augmented generation.
5
+ Author: fancychunk authors
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Topic :: Text Processing :: Linguistic
15
+ Requires-Python: >=3.12
16
+ Requires-Dist: markdown-it-py>=3.0
17
+ Requires-Dist: numpy>=1.26
18
+ Requires-Dist: opentelemetry-api>=1.20
19
+ Requires-Dist: wtpsplit-lite>=0.1
20
+ Description-Content-Type: text/markdown
21
+
22
+ # fancychunk
23
+
24
+ A small, focused library for splitting text documents into semantically
25
+ coherent chunks suitable for retrieval-augmented generation.
26
+
27
+ > **Status:** initial implementation. The full specification lives in
28
+ > [`docs/specs/`](docs/specs/README.md); the public API in
29
+ > [`docs/specs/contracts/public-api.md`](docs/specs/contracts/public-api.md);
30
+ > the test vectors in
31
+ > [`docs/specs/test-vectors/`](docs/specs/test-vectors/). The
32
+ > implementation lives in [`src/fancychunk/`](src/fancychunk/) and
33
+ > covers the three required pipeline stages plus the two optional
34
+ > helpers (`embed_with_late_chunking`, `heading_paths`).
35
+
36
+ ## Quick start
37
+
38
+ ```python
39
+ import numpy as np
40
+ from fancychunk import (
41
+ split_sentences,
42
+ split_chunklets,
43
+ split_chunks,
44
+ heading_paths,
45
+ )
46
+
47
+ doc = open("README.md").read()
48
+ sentences = split_sentences(doc, max_len=2048)
49
+ chunklets = split_chunklets(sentences, max_size=2048)
50
+
51
+ # Caller supplies the embedding matrix; embedding is not part of
52
+ # fancychunk's core pipeline. Any deterministic embedder works.
53
+ embeddings = my_embedder(chunklets)
54
+ chunks, chunk_embeddings = split_chunks(chunklets, embeddings, max_size=2048)
55
+ paths = heading_paths(chunks)
56
+ ```
57
+
58
+ ## Late chunking — bring your own embedder
59
+
60
+ `embed_with_late_chunking` is an optional stage that improves
61
+ retrieval quality on documents with anaphoric references ("it",
62
+ "this method", "the algorithm") by giving each sentence an embedding
63
+ computed in the context of its neighbours. It gains about 4 MTEB
64
+ points on retrieval benchmarks vs. naive per-chunklet embedding, at
65
+ the price of ~30% more compute.
66
+
67
+ **The library doesn't ship any embedding model.** It owns the
68
+ algorithm — segment planning with backward preamble, mean-pool per
69
+ sentence, preamble discard, optional L2 normalize — and delegates
70
+ everything tokenizer-specific to a caller-supplied
71
+ [`SegmentEmbedder`](docs/specs/04-late-chunking.md#embedder-contract).
72
+ The contract is two methods and one attribute:
73
+
74
+ ```python
75
+ class SegmentEmbedder(Protocol):
76
+ n_ctx: int
77
+ def count_tokens(self, sentences: list[str]) -> list[int]: ...
78
+ def embed_segment(
79
+ self, sentences: list[str]
80
+ ) -> tuple[NDArray, list[int]]: ...
81
+ ```
82
+
83
+ Adapters for three deployment shapes ship as runnable examples:
84
+
85
+ | File | Backend | Best for |
86
+ |---|---|---|
87
+ | [`examples/embedders/qwen3_mlx.py`](examples/embedders/qwen3_mlx.py) | MLX + Qwen3-Embedding | Apple Silicon; offline / batch |
88
+ | [`examples/embedders/huggingface_offsets.py`](examples/embedders/huggingface_offsets.py) | HuggingFace transformers | Any platform; recommended default |
89
+ | [`examples/embedders/remote_http.py`](examples/embedders/remote_http.py) | HTTP client + local tokenizer | When the GPU lives on another machine |
90
+
91
+ See [`examples/embedders/README.md`](examples/embedders/README.md)
92
+ for guidance on picking an alignment method (offset-based vs.
93
+ sentinel-token), handling special tokens, and writing your own
94
+ adapter — typically ~20 lines of glue.
95
+
96
+ Wire it into the pipeline between stages 2 and 3:
97
+
98
+ ```python
99
+ from examples.embedders.huggingface_offsets import HFOffsetEmbedder
100
+ from fancychunk import (
101
+ embed_with_late_chunking,
102
+ split_chunklets,
103
+ split_chunks,
104
+ split_sentences,
105
+ )
106
+
107
+ embedder = HFOffsetEmbedder("BAAI/bge-m3")
108
+
109
+ sentences = split_sentences(doc, max_len=2048)
110
+ chunklets = split_chunklets(sentences, max_size=2048)
111
+
112
+ # Per-sentence embeddings with surrounding context.
113
+ sentence_embeddings = embed_with_late_chunking(sentences, embedder)
114
+
115
+ # Aggregate to per-chunklet (mean-pool over the sentences inside
116
+ # each chunklet — the caller's responsibility).
117
+ chunklet_embeddings = aggregate_to_chunklets(
118
+ sentence_embeddings, sentences, chunklets
119
+ )
120
+
121
+ chunks, _ = split_chunks(chunklets, chunklet_embeddings, max_size=2048)
122
+ ```
123
+
124
+ ## Observability
125
+
126
+ Every public stage emits an OpenTelemetry span with attributes that
127
+ describe input/output sizes and the option choices that affected the
128
+ outcome. The library depends only on `opentelemetry-api`; spans are
129
+ zero-cost no-ops until the host application configures an SDK and
130
+ exporter.
131
+
132
+ Span names are `fancychunk.<function>` (e.g.
133
+ `fancychunk.split_sentences`). Attribute keys use the
134
+ `fancychunk.<key>` namespace:
135
+
136
+ | Stage | Attribute keys |
137
+ |---|---|
138
+ | `split_sentences` | `document.length`, `min_len`, `max_len`, `segmenter`, `sentences.count`, `short_circuit` |
139
+ | `split_chunklets` | `sentences.count`, `max_size`, `custom_costs`, `chunklets.count`, `short_circuit` |
140
+ | `split_chunks` | `chunklets.count`, `max_size`, `embedding.dim`, `chunks.count`, `short_circuit` |
141
+ | `embed_with_late_chunking` | `sentences.count`, `embedder`, `embedder.n_ctx`, `budget`, `preamble_budget`, `preamble_fraction`, `normalize`, `segments.count`, `embedding.dim` |
142
+ | `heading_paths` | `chunks.count`, `paths.non_empty` |
143
+
144
+ To see them locally, install `opentelemetry-sdk` and configure a
145
+ console exporter:
146
+
147
+ ```python
148
+ from opentelemetry import trace
149
+ from opentelemetry.sdk.trace import TracerProvider
150
+ from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
151
+
152
+ provider = TracerProvider()
153
+ provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
154
+ trace.set_tracer_provider(provider)
155
+
156
+ # subsequent fancychunk calls now emit spans to stdout
157
+ ```
158
+
159
+ The library also exposes a standard `logging.Logger` at
160
+ `fancychunk` (currently silent by default; future versions may add
161
+ INFO-level breadcrumbs at stage transitions).
162
+
163
+ ## What it does
164
+
165
+ Given a Markdown document, fancychunk partitions it into chunks where
166
+ each chunk:
167
+
168
+ - Respects sentence and paragraph boundaries.
169
+ - Targets a configurable maximum size.
170
+ - Begins at a structurally meaningful point (heading, paragraph start).
171
+ - Groups together semantically related material, splitting where the
172
+ topic shifts.
173
+
174
+ Optionally:
175
+
176
+ - When paired with a token-level embedding model, fancychunk can
177
+ produce *per-sentence* embeddings that incorporate surrounding-
178
+ document context ("late chunking"). The caller aggregates them to
179
+ per-chunklet level (typically by mean-pool over the sentences in
180
+ each chunklet) before passing them to the semantic-chunking stage.
181
+ - For each chunk, fancychunk can compute the Markdown heading path
182
+ that was in scope at the chunk's start, suitable for prepending as
183
+ embedding context.
184
+
185
+ ## What it does *not* do
186
+
187
+ - It does not parse PDFs, Word documents, or HTML. Input is Markdown.
188
+ - It does not embed text in the core three-stage pipeline. Embedding
189
+ is the caller's responsibility; fancychunk consumes pre-computed
190
+ chunklet embeddings for the semantic-chunking stage. (The optional
191
+ `embed_with_late_chunking` helper does invoke an embedder, but it
192
+ is opt-in and requires the caller to supply one.)
193
+ - It does not store, index, or retrieve. Output is a list of strings.
194
+ - It does not generate. There is no LLM in the loop.
195
+
196
+ ## How to read the specs
197
+
198
+ The specs in [`docs/specs/`](docs/specs/) are behavioral, not
199
+ prescriptive about implementation. A spec line says *what* a function
200
+ must do, not *how* to do it. Implementations are free to choose
201
+ tools, algorithms, libraries, and internal architecture.
202
+
203
+ Specs are numbered. SPEC-CHUNK-NNN identifiers within each spec
204
+ correspond to a single testable behavior; the
205
+ [acceptance checklist](docs/specs/acceptance/checklist.md) tracks every
206
+ ID.
207
+
208
+ ## Repo layout
209
+
210
+ ```
211
+ fancychunk/
212
+ ├── README.md # This file
213
+ ├── LICENSE # MIT
214
+ ├── pyproject.toml # Package metadata + runtime deps
215
+ ├── docs/specs/
216
+ │ ├── README.md # Glossary and reading order
217
+ │ ├── 00-pipeline-overview.md # End-to-end data flow
218
+ │ ├── 01-sentence-splitting.md # Stage 1
219
+ │ ├── 02-chunklet-grouping.md # Stage 2
220
+ │ ├── 03-semantic-chunking.md # Stage 3
221
+ │ ├── 04-late-chunking.md # Optional embed strategy
222
+ │ ├── 05-contextual-headings.md # Optional helper
223
+ │ ├── contracts/ # Public API signatures
224
+ │ ├── test-vectors/ # Concrete input → expected output pairs
225
+ │ └── acceptance/ # Pass/fail criteria
226
+ ├── src/fancychunk/ # Implementation
227
+ │ ├── sentences.py # Stage 1 — sentence splitting
228
+ │ ├── chunklets.py # Stage 2 — chunklet grouping
229
+ │ ├── chunks.py # Stage 3 — semantic chunking
230
+ │ ├── late_chunking.py # Stage 4 — late chunking (optional)
231
+ │ ├── headings.py # Stage 5 — heading paths (optional)
232
+ │ ├── _markdown.py # Markdown-it heading + opener helpers
233
+ │ ├── _segmenter.py # SaT default + punctuation fallback
234
+ │ ├── _constants.py # Named constants from the specs
235
+ │ └── errors.py # Exception hierarchy
236
+ └── tests/ # pytest suite covering every TV-*
237
+ ```
238
+
239
+ ## Production readiness
240
+
241
+ This is an alpha release (`0.1.x`). The behaviour the public API
242
+ documents is fully spec-conforming and locked in by the 88-test
243
+ suite; what's *not* yet promised:
244
+
245
+ - **API stability.** Names and defaults are unlikely to change but
246
+ aren't yet contract-stable. SemVer applies once the version hits
247
+ `1.0.0`.
248
+ - **SaT model on first run.** The default segmenter downloads ~408 MB
249
+ of weights from Hugging Face on first call. For production
250
+ deployment, either pre-warm the cache during image build or pass
251
+ `segmenter=punctuation_segmenter` if you can tolerate its quality.
252
+ - **Thread safety.** The module-level SaT singleton and markdown-it
253
+ parser are reentrant for *read*; the library doesn't synchronise.
254
+ Concurrent calls from multiple threads work because every operation
255
+ is read-only. Concurrent first-time SaT loading from multiple threads
256
+ may load the model twice (harmless but wasteful) — pre-warm if
257
+ this matters.
258
+ - **No global state writes.** No caches, no temp files, no logging
259
+ side effects. The library does not call `logging.basicConfig` and
260
+ attaches no handlers.
261
+ - **Determinism.** Cross-run reproducibility is guaranteed for every
262
+ stage given a deterministic segmenter / embedder (see
263
+ SPEC-CHUNK-901 in the specs).
264
+
265
+ CI runs `pyright` in strict mode and `pytest` against Python 3.12
266
+ and 3.13 on every push. Tests use the lightweight punctuation
267
+ segmenter so CI doesn't need the SaT weights; set
268
+ `FANCYCHUNK_TEST_USE_SAT=1` to exercise the real model.
269
+
270
+ ## Releases
271
+
272
+ Pushing a tag of the form `vX.Y.Z` (cut from `main`) triggers the release workflow
273
+ (`.github/workflows/release.yml`), which builds `sdist` + `wheel` and
274
+ publishes to PyPI via [Trusted Publishing](https://docs.pypi.org/trusted-publishers/)
275
+ — no API tokens stored anywhere. The first publish has to be done
276
+ manually (to reserve the project name on PyPI); subsequent releases
277
+ ride the workflow.
278
+
279
+ To cut a release:
280
+
281
+ ```bash
282
+ # 1. Update the version (single source of truth is pyproject.toml).
283
+ # 2. Update CHANGELOG.md.
284
+ # 3. Tag and push:
285
+ git tag -a v0.1.1 -m "Describe the release"
286
+ git push origin v0.1.1
287
+ ```
288
+
289
+ The `release` workflow takes over from there.
290
+
291
+ ## Acknowledgments
292
+
293
+ The three-stage pipeline (sentence → chunklet → chunk), the
294
+ late-chunking strategy, and the contextual-headings helper are
295
+ inspired by the chunking pipeline in
296
+ [raglite](https://github.com/superlinear-ai/raglite). Specific
297
+ techniques cite their originators inline in the specs: the SaT
298
+ segmenter, Greg Kamradt's "5 levels" taxonomy, Arora et al.'s
299
+ discourse-vector technique, the Weaviate / Jina late-chunking work,
300
+ and Dan Stites's contextual-headings post.