fancychunk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fancychunk-0.1.0/.github/workflows/ci.yml +44 -0
- fancychunk-0.1.0/.github/workflows/release.yml +58 -0
- fancychunk-0.1.0/.gitignore +35 -0
- fancychunk-0.1.0/CHANGELOG.md +74 -0
- fancychunk-0.1.0/LICENSE +21 -0
- fancychunk-0.1.0/PKG-INFO +300 -0
- fancychunk-0.1.0/README.md +279 -0
- fancychunk-0.1.0/bench_qwen3.py +84 -0
- fancychunk-0.1.0/docs/specs/00-pipeline-overview.md +168 -0
- fancychunk-0.1.0/docs/specs/01-sentence-splitting.md +338 -0
- fancychunk-0.1.0/docs/specs/02-chunklet-grouping.md +347 -0
- fancychunk-0.1.0/docs/specs/03-semantic-chunking.md +362 -0
- fancychunk-0.1.0/docs/specs/04-late-chunking.md +340 -0
- fancychunk-0.1.0/docs/specs/05-contextual-headings.md +221 -0
- fancychunk-0.1.0/docs/specs/README.md +56 -0
- fancychunk-0.1.0/docs/specs/acceptance/checklist.md +235 -0
- fancychunk-0.1.0/docs/specs/contracts/public-api.md +182 -0
- fancychunk-0.1.0/docs/specs/test-vectors/01-sentence-splitting.md +234 -0
- fancychunk-0.1.0/docs/specs/test-vectors/02-chunklet-grouping.md +214 -0
- fancychunk-0.1.0/docs/specs/test-vectors/03-semantic-chunking.md +204 -0
- fancychunk-0.1.0/docs/specs/test-vectors/04-late-chunking.md +187 -0
- fancychunk-0.1.0/docs/specs/test-vectors/05-contextual-headings.md +180 -0
- fancychunk-0.1.0/examples/embedders/README.md +94 -0
- fancychunk-0.1.0/examples/embedders/huggingface_offsets.py +142 -0
- fancychunk-0.1.0/examples/embedders/qwen3_mlx.py +140 -0
- fancychunk-0.1.0/examples/embedders/remote_http.py +127 -0
- fancychunk-0.1.0/pyproject.toml +55 -0
- fancychunk-0.1.0/src/fancychunk/__init__.py +57 -0
- fancychunk-0.1.0/src/fancychunk/_constants.py +30 -0
- fancychunk-0.1.0/src/fancychunk/_markdown.py +169 -0
- fancychunk-0.1.0/src/fancychunk/_segmenter.py +117 -0
- fancychunk-0.1.0/src/fancychunk/_telemetry.py +52 -0
- fancychunk-0.1.0/src/fancychunk/_typing.py +13 -0
- fancychunk-0.1.0/src/fancychunk/chunklets.py +310 -0
- fancychunk-0.1.0/src/fancychunk/chunks.py +256 -0
- fancychunk-0.1.0/src/fancychunk/errors.py +59 -0
- fancychunk-0.1.0/src/fancychunk/headings.py +90 -0
- fancychunk-0.1.0/src/fancychunk/late_chunking.py +343 -0
- fancychunk-0.1.0/src/fancychunk/py.typed +0 -0
- fancychunk-0.1.0/src/fancychunk/sentences.py +263 -0
- fancychunk-0.1.0/tests/__init__.py +0 -0
- fancychunk-0.1.0/tests/_fake_embedder.py +106 -0
- fancychunk-0.1.0/tests/conftest.py +28 -0
- fancychunk-0.1.0/tests/test_bert.py +136 -0
- fancychunk-0.1.0/tests/test_chunklets.py +123 -0
- fancychunk-0.1.0/tests/test_chunks.py +181 -0
- fancychunk-0.1.0/tests/test_headings.py +92 -0
- fancychunk-0.1.0/tests/test_late_chunking.py +137 -0
- fancychunk-0.1.0/tests/test_pipeline.py +80 -0
- fancychunk-0.1.0/tests/test_sat.py +31 -0
- fancychunk-0.1.0/tests/test_sentences.py +176 -0
- fancychunk-0.1.0/tests/test_telemetry.py +141 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
fail-fast: false
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Cache pip
|
|
25
|
+
uses: actions/cache@v4
|
|
26
|
+
with:
|
|
27
|
+
path: ~/.cache/pip
|
|
28
|
+
key: pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
|
|
29
|
+
restore-keys: pip-${{ matrix.python-version }}-
|
|
30
|
+
|
|
31
|
+
- name: Install package + dev tools
|
|
32
|
+
run: |
|
|
33
|
+
python -m pip install --upgrade pip
|
|
34
|
+
pip install -e .
|
|
35
|
+
pip install pytest pyright "opentelemetry-sdk>=1.20"
|
|
36
|
+
|
|
37
|
+
- name: Type-check (pyright strict)
|
|
38
|
+
run: pyright src/fancychunk
|
|
39
|
+
|
|
40
|
+
- name: Test (without SaT — segmenter swapped to punctuation via conftest)
|
|
41
|
+
run: pytest -q
|
|
42
|
+
|
|
43
|
+
# SaT model is 408 MB. Don't fetch it in CI by default; a manual
|
|
44
|
+
# workflow run can flip ``FANCYCHUNK_TEST_USE_SAT=1`` if needed.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*.*.*"
|
|
7
|
+
|
|
8
|
+
# PyPI Trusted Publishing: this workflow's identity (repo + workflow
|
|
9
|
+
# file + environment) is what PyPI authorizes, no API token stored
|
|
10
|
+
# anywhere. Configure once at
|
|
11
|
+
# https://pypi.org/manage/account/publishing/ — see README §Releases.
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
build:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
|
|
24
|
+
- name: Install build tooling
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install build
|
|
28
|
+
|
|
29
|
+
- name: Build sdist + wheel
|
|
30
|
+
run: python -m build
|
|
31
|
+
|
|
32
|
+
- name: Show artifacts
|
|
33
|
+
run: ls -lh dist/
|
|
34
|
+
|
|
35
|
+
- name: Upload artifacts
|
|
36
|
+
uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: dist
|
|
39
|
+
path: dist/
|
|
40
|
+
|
|
41
|
+
publish:
|
|
42
|
+
needs: build
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
environment: pypi
|
|
45
|
+
permissions:
|
|
46
|
+
id-token: write # required for Trusted Publishing
|
|
47
|
+
steps:
|
|
48
|
+
- uses: actions/download-artifact@v4
|
|
49
|
+
with:
|
|
50
|
+
name: dist
|
|
51
|
+
path: dist/
|
|
52
|
+
|
|
53
|
+
- name: Publish to PyPI
|
|
54
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
55
|
+
# No ``with:`` token field — Trusted Publishing uses the
|
|
56
|
+
# workflow's OIDC identity instead. PyPI must have the
|
|
57
|
+
# corresponding "trusted publisher" configured pointing at
|
|
58
|
+
# this repo + workflow file + environment.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
.pytest_cache/
|
|
11
|
+
.ruff_cache/
|
|
12
|
+
.mypy_cache/
|
|
13
|
+
.pyright_cache/
|
|
14
|
+
|
|
15
|
+
# Virtual environments
|
|
16
|
+
.venv/
|
|
17
|
+
venv/
|
|
18
|
+
env/
|
|
19
|
+
|
|
20
|
+
# uv
|
|
21
|
+
uv.lock
|
|
22
|
+
|
|
23
|
+
# pyenv (developer-local; pyproject.toml's requires-python is the
|
|
24
|
+
# project-wide source of truth).
|
|
25
|
+
.python-version
|
|
26
|
+
|
|
27
|
+
# IDE
|
|
28
|
+
.vscode/
|
|
29
|
+
.idea/
|
|
30
|
+
*.swp
|
|
31
|
+
*.swo
|
|
32
|
+
|
|
33
|
+
# OS
|
|
34
|
+
.DS_Store
|
|
35
|
+
Thumbs.db
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to fancychunk are recorded here. The format
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
|
|
5
|
+
the project follows [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
### Changed (breaking — pre-1.0)
|
|
10
|
+
- `embed_with_late_chunking` now takes a `SegmentEmbedder` instead of
|
|
11
|
+
`TokenLevelEmbedder`. The new protocol is two methods + one
|
|
12
|
+
attribute (`n_ctx`, `count_tokens`, `embed_segment`) — replacing
|
|
13
|
+
the four-method `tokenize` / `detokenize` / `embed` / `n_ctx`
|
|
14
|
+
contract.
|
|
15
|
+
- Tokenization, special-token policy, and sentence-to-token
|
|
16
|
+
alignment are now the embedder's concern, not the library's. The
|
|
17
|
+
sentinel-token method (with `⊕` default), sentinel discovery,
|
|
18
|
+
and the `sentinel` keyword argument are removed.
|
|
19
|
+
- New `examples/embedders/` directory with reference adapters for
|
|
20
|
+
MLX (`qwen3_mlx.py`), HuggingFace transformers
|
|
21
|
+
(`huggingface_offsets.py`), and a remote HTTP service
|
|
22
|
+
(`remote_http.py`), each runnable.
|
|
23
|
+
|
|
24
|
+
### Spec changes
|
|
25
|
+
- SPEC-CHUNK-420 rewritten: per-sentence alignment is the embedder's
|
|
26
|
+
responsibility; the library's contract is that
|
|
27
|
+
`sum(per_sentence_counts) == matrix_row_count`.
|
|
28
|
+
- SPEC-CHUNK-421 removed (sentinel character requirements are no
|
|
29
|
+
longer normative — implementations that adopt the sentinel method
|
|
30
|
+
test it against their own tokenizer).
|
|
31
|
+
- SPEC-CHUNK-412 simplified to four steps; the largest-remainder
|
|
32
|
+
safety net stays as the absorber for count drift between
|
|
33
|
+
budget-planning and the actual joined-input tokenization.
|
|
34
|
+
- TV-407 (sentinel collision detection) removed; TV-408 rewritten
|
|
35
|
+
for the new protocol.
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
- Initial implementation of every pipeline stage in the spec:
|
|
39
|
+
- Stage 1 `split_sentences` (SaT-backed default segmenter via
|
|
40
|
+
`wtpsplit-lite`, lazy-loaded; punctuation fallback; heading
|
|
41
|
+
override; whitespace-trailing pass; vectorised boundary-score DP).
|
|
42
|
+
- Stage 2 `split_chunklets` (markdown-it-driven boundary
|
|
43
|
+
probabilities, statement-count function, vectorised
|
|
44
|
+
minimum-cost DP).
|
|
45
|
+
- Stage 3 `split_chunks` (unit-norm + discourse-vector projection,
|
|
46
|
+
rescaled cosine similarity, heading-aware modification recognising
|
|
47
|
+
both ATX and Setext, vectorised DP under covering constraint).
|
|
48
|
+
- Stage 4 `embed_with_late_chunking` (greedy segment construction
|
|
49
|
+
with backward preamble, sentinel-token alignment, largest-remainder
|
|
50
|
+
safety net).
|
|
51
|
+
- Stage 5 `heading_paths` (Markdown heading stack with reset
|
|
52
|
+
semantics).
|
|
53
|
+
- Public exception hierarchy (`fancychunk.FancyChunkError` and
|
|
54
|
+
subclasses).
|
|
55
|
+
- OpenTelemetry tracing for every public entry point. Spans are
|
|
56
|
+
zero-cost no-ops when no SDK is configured; span names are
|
|
57
|
+
`fancychunk.<function>` and attributes use the
|
|
58
|
+
`fancychunk.<key>` namespace.
|
|
59
|
+
- 88-test pytest suite covering every remaining test vector plus
|
|
60
|
+
cross-stage invariants and the inspection-only SPEC-CHUNK IDs
|
|
61
|
+
(101, 110, 113, 116). Tests use the punctuation segmenter by
|
|
62
|
+
default; set `FANCYCHUNK_TEST_USE_SAT=1` to exercise the real
|
|
63
|
+
model.
|
|
64
|
+
- GitHub Actions CI workflow runs pyright (strict mode) and pytest
|
|
65
|
+
against Python 3.12 and 3.13.
|
|
66
|
+
|
|
67
|
+
### Removed
|
|
68
|
+
- Four test vectors that contradicted the spec's own cost math
|
|
69
|
+
(TV-113 as originally written, TV-205's downstream-partition
|
|
70
|
+
property, TV-209, TV-306). TV-113 was rewritten to test genuine
|
|
71
|
+
infeasibility; TV-205 was rewritten to test the upstream
|
|
72
|
+
probability mapping.
|
|
73
|
+
|
|
74
|
+
[Unreleased]: https://github.com/anthropics/fancychunk/compare/HEAD
|
fancychunk-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 fancychunk authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fancychunk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Text chunking for retrieval-augmented generation.
|
|
5
|
+
Author: fancychunk authors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Requires-Dist: markdown-it-py>=3.0
|
|
17
|
+
Requires-Dist: numpy>=1.26
|
|
18
|
+
Requires-Dist: opentelemetry-api>=1.20
|
|
19
|
+
Requires-Dist: wtpsplit-lite>=0.1
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# fancychunk
|
|
23
|
+
|
|
24
|
+
A small, focused library for splitting text documents into semantically
|
|
25
|
+
coherent chunks suitable for retrieval-augmented generation.
|
|
26
|
+
|
|
27
|
+
> **Status:** initial implementation. The full specification lives in
|
|
28
|
+
> [`docs/specs/`](docs/specs/README.md); the public API in
|
|
29
|
+
> [`docs/specs/contracts/public-api.md`](docs/specs/contracts/public-api.md);
|
|
30
|
+
> the test vectors in
|
|
31
|
+
> [`docs/specs/test-vectors/`](docs/specs/test-vectors/). The
|
|
32
|
+
> implementation lives in [`src/fancychunk/`](src/fancychunk/) and
|
|
33
|
+
> covers the three required pipeline stages plus the two optional
|
|
34
|
+
> helpers (`embed_with_late_chunking`, `heading_paths`).
|
|
35
|
+
|
|
36
|
+
## Quick start
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import numpy as np
|
|
40
|
+
from fancychunk import (
|
|
41
|
+
split_sentences,
|
|
42
|
+
split_chunklets,
|
|
43
|
+
split_chunks,
|
|
44
|
+
heading_paths,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
doc = open("README.md").read()
|
|
48
|
+
sentences = split_sentences(doc, max_len=2048)
|
|
49
|
+
chunklets = split_chunklets(sentences, max_size=2048)
|
|
50
|
+
|
|
51
|
+
# Caller supplies the embedding matrix; embedding is not part of
|
|
52
|
+
# fancychunk's core pipeline. Any deterministic embedder works.
|
|
53
|
+
embeddings = my_embedder(chunklets)
|
|
54
|
+
chunks, chunk_embeddings = split_chunks(chunklets, embeddings, max_size=2048)
|
|
55
|
+
paths = heading_paths(chunks)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Late chunking — bring your own embedder
|
|
59
|
+
|
|
60
|
+
`embed_with_late_chunking` is an optional stage that improves
|
|
61
|
+
retrieval quality on documents with anaphoric references ("it",
|
|
62
|
+
"this method", "the algorithm") by giving each sentence an embedding
|
|
63
|
+
computed in the context of its neighbours. It gains about 4 MTEB
|
|
64
|
+
points on retrieval benchmarks vs. naive per-chunklet embedding, at
|
|
65
|
+
the price of ~30% more compute.
|
|
66
|
+
|
|
67
|
+
**The library doesn't ship any embedding model.** It owns the
|
|
68
|
+
algorithm — segment planning with backward preamble, mean-pool per
|
|
69
|
+
sentence, preamble discard, optional L2 normalize — and delegates
|
|
70
|
+
everything tokenizer-specific to a caller-supplied
|
|
71
|
+
[`SegmentEmbedder`](docs/specs/04-late-chunking.md#embedder-contract).
|
|
72
|
+
The contract is two methods and one attribute:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
class SegmentEmbedder(Protocol):
|
|
76
|
+
n_ctx: int
|
|
77
|
+
def count_tokens(self, sentences: list[str]) -> list[int]: ...
|
|
78
|
+
def embed_segment(
|
|
79
|
+
self, sentences: list[str]
|
|
80
|
+
) -> tuple[NDArray, list[int]]: ...
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Adapters for three deployment shapes ship as runnable examples:
|
|
84
|
+
|
|
85
|
+
| File | Backend | Best for |
|
|
86
|
+
|---|---|---|
|
|
87
|
+
| [`examples/embedders/qwen3_mlx.py`](examples/embedders/qwen3_mlx.py) | MLX + Qwen3-Embedding | Apple Silicon; offline / batch |
|
|
88
|
+
| [`examples/embedders/huggingface_offsets.py`](examples/embedders/huggingface_offsets.py) | HuggingFace transformers | Any platform; recommended default |
|
|
89
|
+
| [`examples/embedders/remote_http.py`](examples/embedders/remote_http.py) | HTTP client + local tokenizer | When the GPU lives on another machine |
|
|
90
|
+
|
|
91
|
+
See [`examples/embedders/README.md`](examples/embedders/README.md)
|
|
92
|
+
for guidance on picking an alignment method (offset-based vs.
|
|
93
|
+
sentinel-token), handling special tokens, and writing your own
|
|
94
|
+
adapter — typically ~20 lines of glue.
|
|
95
|
+
|
|
96
|
+
Wire it into the pipeline between stages 2 and 3:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from examples.embedders.huggingface_offsets import HFOffsetEmbedder
|
|
100
|
+
from fancychunk import (
|
|
101
|
+
embed_with_late_chunking,
|
|
102
|
+
split_chunklets,
|
|
103
|
+
split_chunks,
|
|
104
|
+
split_sentences,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
embedder = HFOffsetEmbedder("BAAI/bge-m3")
|
|
108
|
+
|
|
109
|
+
sentences = split_sentences(doc, max_len=2048)
|
|
110
|
+
chunklets = split_chunklets(sentences, max_size=2048)
|
|
111
|
+
|
|
112
|
+
# Per-sentence embeddings with surrounding context.
|
|
113
|
+
sentence_embeddings = embed_with_late_chunking(sentences, embedder)
|
|
114
|
+
|
|
115
|
+
# Aggregate to per-chunklet (mean-pool over the sentences inside
|
|
116
|
+
# each chunklet — the caller's responsibility).
|
|
117
|
+
chunklet_embeddings = aggregate_to_chunklets(
|
|
118
|
+
sentence_embeddings, sentences, chunklets
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
chunks, _ = split_chunks(chunklets, chunklet_embeddings, max_size=2048)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Observability
|
|
125
|
+
|
|
126
|
+
Every public stage emits an OpenTelemetry span with attributes that
|
|
127
|
+
describe input/output sizes and the option choices that affected the
|
|
128
|
+
outcome. The library depends only on `opentelemetry-api`; spans are
|
|
129
|
+
zero-cost no-ops until the host application configures an SDK and
|
|
130
|
+
exporter.
|
|
131
|
+
|
|
132
|
+
Span names are `fancychunk.<function>` (e.g.
|
|
133
|
+
`fancychunk.split_sentences`). Attribute keys use the
|
|
134
|
+
`fancychunk.<key>` namespace:
|
|
135
|
+
|
|
136
|
+
| Stage | Attribute keys |
|
|
137
|
+
|---|---|
|
|
138
|
+
| `split_sentences` | `document.length`, `min_len`, `max_len`, `segmenter`, `sentences.count`, `short_circuit` |
|
|
139
|
+
| `split_chunklets` | `sentences.count`, `max_size`, `custom_costs`, `chunklets.count`, `short_circuit` |
|
|
140
|
+
| `split_chunks` | `chunklets.count`, `max_size`, `embedding.dim`, `chunks.count`, `short_circuit` |
|
|
141
|
+
| `embed_with_late_chunking` | `sentences.count`, `embedder`, `embedder.n_ctx`, `budget`, `preamble_budget`, `preamble_fraction`, `normalize`, `segments.count`, `embedding.dim` |
|
|
142
|
+
| `heading_paths` | `chunks.count`, `paths.non_empty` |
|
|
143
|
+
|
|
144
|
+
To see them locally, install `opentelemetry-sdk` and configure a
|
|
145
|
+
console exporter:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from opentelemetry import trace
|
|
149
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
150
|
+
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
|
|
151
|
+
|
|
152
|
+
provider = TracerProvider()
|
|
153
|
+
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
|
|
154
|
+
trace.set_tracer_provider(provider)
|
|
155
|
+
|
|
156
|
+
# subsequent fancychunk calls now emit spans to stdout
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
The library also exposes a standard `logging.Logger` at
|
|
160
|
+
`fancychunk` (currently silent by default; future versions may add
|
|
161
|
+
INFO-level breadcrumbs at stage transitions).
|
|
162
|
+
|
|
163
|
+
## What it does
|
|
164
|
+
|
|
165
|
+
Given a Markdown document, fancychunk partitions it into chunks where
|
|
166
|
+
each chunk:
|
|
167
|
+
|
|
168
|
+
- Respects sentence and paragraph boundaries.
|
|
169
|
+
- Targets a configurable maximum size.
|
|
170
|
+
- Begins at a structurally meaningful point (heading, paragraph start).
|
|
171
|
+
- Groups together semantically related material, splitting where the
|
|
172
|
+
topic shifts.
|
|
173
|
+
|
|
174
|
+
Optionally:
|
|
175
|
+
|
|
176
|
+
- When paired with a token-level embedding model, fancychunk can
|
|
177
|
+
produce *per-sentence* embeddings that incorporate surrounding-
|
|
178
|
+
document context ("late chunking"). The caller aggregates them to
|
|
179
|
+
per-chunklet level (typically by mean-pool over the sentences in
|
|
180
|
+
each chunklet) before passing them to the semantic-chunking stage.
|
|
181
|
+
- For each chunk, fancychunk can compute the Markdown heading path
|
|
182
|
+
that was in scope at the chunk's start, suitable for prepending as
|
|
183
|
+
embedding context.
|
|
184
|
+
|
|
185
|
+
## What it does *not* do
|
|
186
|
+
|
|
187
|
+
- It does not parse PDFs, Word documents, or HTML. Input is Markdown.
|
|
188
|
+
- It does not embed text in the core three-stage pipeline. Embedding
|
|
189
|
+
is the caller's responsibility; fancychunk consumes pre-computed
|
|
190
|
+
chunklet embeddings for the semantic-chunking stage. (The optional
|
|
191
|
+
`embed_with_late_chunking` helper does invoke an embedder, but it
|
|
192
|
+
is opt-in and requires the caller to supply one.)
|
|
193
|
+
- It does not store, index, or retrieve. Output is a list of strings.
|
|
194
|
+
- It does not generate. There is no LLM in the loop.
|
|
195
|
+
|
|
196
|
+
## How to read the specs
|
|
197
|
+
|
|
198
|
+
The specs in [`docs/specs/`](docs/specs/) are behavioral, not
|
|
199
|
+
prescriptive about implementation. A spec line says *what* a function
|
|
200
|
+
must do, not *how* to do it. Implementations are free to choose
|
|
201
|
+
tools, algorithms, libraries, and internal architecture.
|
|
202
|
+
|
|
203
|
+
Specs are numbered. SPEC-CHUNK-NNN identifiers within each spec
|
|
204
|
+
correspond to a single testable behavior; the
|
|
205
|
+
[acceptance checklist](docs/specs/acceptance/checklist.md) tracks every
|
|
206
|
+
ID.
|
|
207
|
+
|
|
208
|
+
## Repo layout
|
|
209
|
+
|
|
210
|
+
```
|
|
211
|
+
fancychunk/
|
|
212
|
+
├── README.md # This file
|
|
213
|
+
├── LICENSE # MIT
|
|
214
|
+
├── pyproject.toml # Package metadata + runtime deps
|
|
215
|
+
├── docs/specs/
|
|
216
|
+
│ ├── README.md # Glossary and reading order
|
|
217
|
+
│ ├── 00-pipeline-overview.md # End-to-end data flow
|
|
218
|
+
│ ├── 01-sentence-splitting.md # Stage 1
|
|
219
|
+
│ ├── 02-chunklet-grouping.md # Stage 2
|
|
220
|
+
│ ├── 03-semantic-chunking.md # Stage 3
|
|
221
|
+
│ ├── 04-late-chunking.md # Optional embed strategy
|
|
222
|
+
│ ├── 05-contextual-headings.md # Optional helper
|
|
223
|
+
│ ├── contracts/ # Public API signatures
|
|
224
|
+
│ ├── test-vectors/ # Concrete input → expected output pairs
|
|
225
|
+
│ └── acceptance/ # Pass/fail criteria
|
|
226
|
+
├── src/fancychunk/ # Implementation
|
|
227
|
+
│ ├── sentences.py # Stage 1 — sentence splitting
|
|
228
|
+
│ ├── chunklets.py # Stage 2 — chunklet grouping
|
|
229
|
+
│ ├── chunks.py # Stage 3 — semantic chunking
|
|
230
|
+
│ ├── late_chunking.py # Stage 4 — late chunking (optional)
|
|
231
|
+
│ ├── headings.py # Stage 5 — heading paths (optional)
|
|
232
|
+
│ ├── _markdown.py # Markdown-it heading + opener helpers
|
|
233
|
+
│ ├── _segmenter.py # SaT default + punctuation fallback
|
|
234
|
+
│ ├── _constants.py # Named constants from the specs
|
|
235
|
+
│ └── errors.py # Exception hierarchy
|
|
236
|
+
└── tests/ # pytest suite covering every TV-*
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## Production readiness
|
|
240
|
+
|
|
241
|
+
This is an alpha release (`0.1.x`). The behaviour the public API
|
|
242
|
+
documents is fully spec-conforming and locked in by the 88-test
|
|
243
|
+
suite. Caveats and operational characteristics to be aware of:
|
|
244
|
+
|
|
245
|
+
- **API stability.** Names and defaults are unlikely to change but
|
|
246
|
+
aren't yet contract-stable. SemVer applies once the version hits
|
|
247
|
+
`1.0.0`.
|
|
248
|
+
- **SaT model on first run.** The default segmenter downloads ~408 MB
|
|
249
|
+
of weights from Hugging Face on first call. For production
|
|
250
|
+
deployment, either pre-warm the cache during image build or pass
|
|
251
|
+
`segmenter=punctuation_segmenter` if you can tolerate its quality.
|
|
252
|
+
- **Thread safety.** The module-level SaT singleton and markdown-it
|
|
253
|
+
parser are reentrant for *read*; the library doesn't synchronise.
|
|
254
|
+
Concurrent calls from multiple threads work because every operation
|
|
255
|
+
is read-only. Concurrent first-time SaT loading from multiple threads
|
|
256
|
+
may load the model twice (harmless but wasteful) — pre-warm if
|
|
257
|
+
this matters.
|
|
258
|
+
- **No global state writes.** No caches, no temp files, no logging
|
|
259
|
+
side effects. The library does not call `logging.basicConfig` and
|
|
260
|
+
attaches no handlers.
|
|
261
|
+
- **Determinism.** Cross-run reproducibility is guaranteed for every
|
|
262
|
+
stage given a deterministic segmenter / embedder (see
|
|
263
|
+
SPEC-CHUNK-901 in the specs).
|
|
264
|
+
|
|
265
|
+
CI runs `pyright` in strict mode and `pytest` against Python 3.12
|
|
266
|
+
and 3.13 on every push to `main` and on every pull request. Tests use the lightweight punctuation
|
|
267
|
+
segmenter so CI doesn't need the SaT weights; set
|
|
268
|
+
`FANCYCHUNK_TEST_USE_SAT=1` to exercise the real model.
|
|
269
|
+
|
|
270
|
+
## Releases
|
|
271
|
+
|
|
272
|
+
Tags of the form `vX.Y.Z` on `main` trigger the release workflow
|
|
273
|
+
(`.github/workflows/release.yml`), which builds `sdist` + `wheel` and
|
|
274
|
+
publishes to PyPI via [Trusted Publishing](https://docs.pypi.org/trusted-publishers/)
|
|
275
|
+
— no API tokens stored anywhere. The first publish has to be done
|
|
276
|
+
manually (to reserve the project name on PyPI); subsequent releases
|
|
277
|
+
ride the workflow.
|
|
278
|
+
|
|
279
|
+
To cut a release:
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
# 1. Update the version (single source of truth is pyproject.toml).
|
|
283
|
+
# 2. Update CHANGELOG.md.
|
|
284
|
+
# 3. Tag and push:
|
|
285
|
+
git tag -a v0.1.1 -m "Describe the release"
|
|
286
|
+
git push origin v0.1.1
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
The `release` workflow takes over from there.
|
|
290
|
+
|
|
291
|
+
## Acknowledgments
|
|
292
|
+
|
|
293
|
+
The three-stage pipeline (sentence → chunklet → chunk), the
|
|
294
|
+
late-chunking strategy, and the contextual-headings helper are
|
|
295
|
+
inspired by the chunking pipeline in
|
|
296
|
+
[raglite](https://github.com/superlinear-ai/raglite). Specific
|
|
297
|
+
techniques cite their originators inline in the specs: the SaT
|
|
298
|
+
segmenter, Greg Kamradt's "5 levels" taxonomy, Arora et al.'s
|
|
299
|
+
discourse-vector technique, the Weaviate / Jina late-chunking work,
|
|
300
|
+
and Dan Stites's contextual-headings post.
|