doclighter 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pratyush
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,245 @@
1
+ Metadata-Version: 2.4
2
+ Name: doclighter
3
+ Version: 0.1.0
4
+ Summary: A semantic Ctrl+F that paints your document with a relevance gradient.
5
+ Author-email: Pratyush <pratyush272@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/pratyush272/doclighter
8
+ Project-URL: Repository, https://github.com/pratyush272/doclighter
9
+ Project-URL: Issues, https://github.com/pratyush272/doclighter/issues
10
+ Keywords: semantic-search,embeddings,visualization,rag,nlp,pdf
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Classifier: Topic :: Text Processing :: Indexing
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: numpy>=1.21
27
+ Requires-Dist: sentence-transformers>=2.2
28
+ Requires-Dist: pypdf>=4.0
29
+ Requires-Dist: requests>=2.25
30
+ Provides-Extra: quantize
31
+ Requires-Dist: faiss-cpu>=1.7; extra == "quantize"
32
+ Provides-Extra: streamlit
33
+ Requires-Dist: streamlit>=1.28; extra == "streamlit"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=7; extra == "dev"
36
+ Requires-Dist: pytest-cov>=4; extra == "dev"
37
+ Requires-Dist: ruff>=0.1; extra == "dev"
38
+ Provides-Extra: all
39
+ Requires-Dist: doclighter[dev,quantize,streamlit]; extra == "all"
40
+ Dynamic: license-file
41
+
42
+ # doclighter
43
+
44
+ [![tests](https://github.com/pratyush272/doclighter/actions/workflows/test.yml/badge.svg)](https://github.com/pratyush272/doclighter/actions/workflows/test.yml)
45
+ [![PyPI](https://img.shields.io/pypi/v/doclighter.svg)](https://pypi.org/project/doclighter/)
46
+ [![Python](https://img.shields.io/pypi/pyversions/doclighter.svg)](https://pypi.org/project/doclighter/)
47
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
48
+
49
+ **A semantic Ctrl+F that paints your document with a relevance gradient.**
50
+
51
+ `doclighter` is what you reach for when you need to *see* where a topic lives in a document — not be told an answer. It embeds your document at fine granularity, then projects query relevance back onto every word as a heatmap. No LLM, no hallucination, no top-K cliff.
52
+
53
+ ```python
54
+ from doclighter import Doclighter
55
+
56
+ doc = Doclighter.from_pdf("contract.pdf")
57
+ result = doc.search("termination clauses")
58
+ result # In Jupyter: renders the whole document, color-coded by relevance
59
+ ```
60
+
61
+ ## Why this exists
62
+
63
+ Traditional RAG hands an LLM the top 3–10 chunks and asks it to generate an answer. That's great when you trust the answer and don't need to read the source. But sometimes you *need to read the source* — legal review, paper skimming, contract diffing, due diligence — and you want a tool that helps you *navigate* a long document, not summarize it away.
64
+
65
+ `doclighter` is for that. It treats the whole document as the output, and re-colors it by semantic relevance to your query. Hot regions deserve your attention; cold regions you can skim past.
66
+
67
+ It's deterministic, fast (sub-100ms per query after indexing), and shows you the long tail — including the case where your query *doesn't* match anything (everything stays cold blue, which is itself useful information that RAG hides).
68
+
69
+ ## Install
70
+
71
+ ```bash
72
+ pip install doclighter
73
+ ```
74
+
75
+ Optional extras:
76
+ ```bash
77
+ pip install "doclighter[quantize]" # FAISS SQ8 index for very large docs
78
+ pip install "doclighter[streamlit]" # for the interactive demo app
79
+ pip install "doclighter[dev]" # for contributors
80
+ ```
81
+
82
+ ## Quickstart
83
+
84
+ ### Load a document
85
+
86
+ ```python
87
+ from doclighter import Doclighter
88
+
89
+ # From a PDF on disk
90
+ doc = Doclighter.from_pdf("contract.pdf")
91
+
92
+ # From a PDF URL
93
+ doc = Doclighter.from_url("https://example.com/contract.pdf")
94
+
95
+ # From raw text
96
+ doc = Doclighter.from_text(open("paper.txt").read())
97
+ ```
98
+
99
+ The first call downloads the default embedding model (~80 MB MiniLM) and embeds your document. For a ~10K word doc this takes ~25 seconds. Subsequent searches reuse the index.
100
+
101
+ ### Search
102
+
103
+ ```python
104
+ result = doc.search("termination clauses")
105
+
106
+ result.word_scores # numpy array, shape (n_words,), values in [0, 1]
107
+ result.top_chunks(k=10) # list of (chunk_text, score, (start, end))
108
+ result.elapsed_ms # ~10-50ms for typical docs
109
+ result.to_html() # HTML string for display anywhere
110
+ ```
111
+
112
+ In Jupyter, just put `result` on the last line of a cell — it renders the heatmap inline.
113
+
114
+ ### Zoom: the `decay_sigma` knob
115
+
116
+ The differentiating feature. `decay_sigma` controls how far semantic warmth spreads from a matched region:
117
+
118
+ ```python
119
+ narrow = doc.search("termination", decay_sigma=5.0) # sharp word-level highlights
120
+ broad = doc.search("termination", decay_sigma=80.0) # broad thematic regions
121
+ ```
122
+
123
+ Same index, no re-embedding. Drag the σ slider in the Streamlit demo to feel what this does.
124
+
125
+ ### Multi-query
126
+
127
+ ```python
128
+ result = doc.search(
129
+ ["termination", "indemnification", "labour wages"],
130
+     multi_query_aggregate="max",   # or "sum" to favor regions matching multiple queries
131
+ )
132
+ ```
133
+
134
+ ### Save / load the index
135
+
136
+ Embedding is the slow step. Save once, reuse:
137
+
138
+ ```python
139
+ doc.save("contract.idx")
140
+ doc = Doclighter.load("contract.idx")
141
+ ```
142
+
143
+ ### Bring your own embedder
144
+
145
+ Any callable mapping `list[str] -> np.ndarray` of shape `(N, dim)` works:
146
+
147
+ ```python
148
+ from sentence_transformers import SentenceTransformer
149
+
150
+ bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
151
+ doc = Doclighter.from_text(text, embedder=bge.encode)
152
+ ```
153
+
154
+ ## Streamlit demo
155
+
156
+ ```bash
157
+ pip install "doclighter[streamlit]"
158
+ streamlit run examples/streamlit_app.py
159
+ ```
160
+
161
+ A working UI with PDF upload, query box, σ slider, and live re-rendering.
162
+
163
+ ## How it works
164
+
165
+ 1. **Chunk** the document into small rolling windows (default: 12 words, 50% overlap).
166
+ 2. **Embed** each chunk with sentence-transformers (default: `all-MiniLM-L6-v2`).
167
+ 3. **Score** each chunk against your query via cosine similarity.
168
+ 4. **Project** chunk scores back onto every word via exponential proximity decay:
169
+
170
+ ```
171
+ word_score[w] = max over chunks c of raw[c] × exp(-distance(w, c) / sigma)
172
+ ```
173
+
174
+ 5. **Render** the document as colored HTML — words inherit warmth from their nearest semantically matched chunk.
175
+
176
+ Step 4 is the interesting one. Max-aggregation (rather than sum) means a word's color reflects its single strongest semantic neighbor — visually intuitive and resistant to "many lukewarm chunks add up to red" noise.
177
+
178
+ ## How this compares to RAG
179
+
180
+ `doclighter` and RAG solve different problems:
181
+
182
+ | | RAG | doclighter |
183
+ |---|---|---|
184
+ | Output | LLM-generated answer | Document, recolored |
185
+ | Best for | "What's the answer?" | "Where in the doc?" |
186
+ | When query has no answer | LLM hedges / hallucinates | Document stays cold (honest) |
187
+ | Hides chunk boundaries | No — top-K cliff | Yes — gradient smooths over |
188
+ | Cost per query | LLM tokens | Free (one matmul) |
189
+ | Determinism | Sampling-dependent | Fully deterministic |
190
+
191
+ They're complementary. Use RAG when you trust the LLM with the question; use `doclighter` when you need to read the source yourself.
192
+
193
+ The algorithmic kernel is conceptually related to [ColBERT](https://github.com/stanford-futuredata/ColBERT)'s MaxSim late-interaction, but applied to *visualization* rather than ranking, and at the word level rather than the token level.
194
+
195
+ ## API reference
196
+
197
+ ### `Doclighter(text, **kwargs)`
198
+
199
+ | Parameter | Default | Description |
200
+ |---|---|---|
201
+ | `text` | required | The document as a string |
202
+ | `chunk_size` | `12` | Words per rolling window |
203
+ | `chunk_overlap` | `0.5` | Fraction overlap between windows |
204
+ | `embedder` | `None` | Custom embedder callable (default: MiniLM) |
205
+ | `embedding_model` | `"all-MiniLM-L6-v2"` | sentence-transformers model name |
206
+ | `decay_sigma` | `20.0` | Default proximity decay scale (words) |
207
+ | `quantize` | `False` | Use SQ8 FAISS index instead of flat exact |
208
+ | `quantize_rerank_k` | `200` | If quantize=True, rerank top-K with exact |
209
+
210
+ Alternate constructors: `Doclighter.from_text(...)`, `Doclighter.from_pdf(path, ...)`, `Doclighter.from_url(url, ...)`.
211
+
212
+ ### `doc.search(query, **kwargs) -> SearchResult`
213
+
214
+ | Parameter | Default | Description |
215
+ |---|---|---|
216
+ | `query` | required | A string, or list of strings for multi-query |
217
+ | `decay_sigma` | `None` | Override the doc's default sigma |
218
+ | `multi_query_aggregate` | `"max"` | `"max"`, `"mean"`, or `"sum"` for multi-query |
219
+
220
+ ### `SearchResult`
221
+
222
+ | Attribute / method | Returns |
223
+ |---|---|
224
+ | `.word_scores` | `np.ndarray` of shape `(n_words,)`, in `[0, 1]` |
225
+ | `.chunk_scores` | `np.ndarray` of raw per-chunk cosine scores |
226
+ | `.top_chunks(k=10)` | `list[(text, score, (start, end))]` |
227
+ | `.to_html(**kwargs)` | HTML string of the heatmap-colored document |
228
+ | `.elapsed_ms` | Search latency |
229
+
230
+ ## Development
231
+
232
+ ```bash
233
+ git clone https://github.com/pratyush272/doclighter
234
+ cd doclighter
235
+ pip install -e ".[dev]"
236
+ pytest
237
+ ```
238
+
239
+ ## License
240
+
241
+ MIT. See [LICENSE](LICENSE).
242
+
243
+ ## Citation / acknowledgement
244
+
245
+ If you use `doclighter` in research, a link back is appreciated. The proximity-decay scoring idea borrows from passage-retrieval literature; max-aggregation over fine-grained matches is in spirit closest to ColBERT.
@@ -0,0 +1,204 @@
1
+ # doclighter
2
+
3
+ [![tests](https://github.com/pratyush272/doclighter/actions/workflows/test.yml/badge.svg)](https://github.com/pratyush272/doclighter/actions/workflows/test.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/doclighter.svg)](https://pypi.org/project/doclighter/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/doclighter.svg)](https://pypi.org/project/doclighter/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+
8
+ **A semantic Ctrl+F that paints your document with a relevance gradient.**
9
+
10
+ `doclighter` is what you reach for when you need to *see* where a topic lives in a document — not be told an answer. It embeds your document at fine granularity, then projects query relevance back onto every word as a heatmap. No LLM, no hallucination, no top-K cliff.
11
+
12
+ ```python
13
+ from doclighter import Doclighter
14
+
15
+ doc = Doclighter.from_pdf("contract.pdf")
16
+ result = doc.search("termination clauses")
17
+ result # In Jupyter: renders the whole document, color-coded by relevance
18
+ ```
19
+
20
+ ## Why this exists
21
+
22
+ Traditional RAG hands an LLM the top 3–10 chunks and asks it to generate an answer. That's great when you trust the answer and don't need to read the source. But sometimes you *need to read the source* — legal review, paper skimming, contract diffing, due diligence — and you want a tool that helps you *navigate* a long document, not summarize it away.
23
+
24
+ `doclighter` is for that. It treats the whole document as the output, and re-colors it by semantic relevance to your query. Hot regions deserve your attention; cold regions you can skim past.
25
+
26
+ It's deterministic, fast (sub-100ms per query after indexing), and shows you the long tail — including the case where your query *doesn't* match anything (everything stays cold blue, which is itself useful information that RAG hides).
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install doclighter
32
+ ```
33
+
34
+ Optional extras:
35
+ ```bash
36
+ pip install "doclighter[quantize]" # FAISS SQ8 index for very large docs
37
+ pip install "doclighter[streamlit]" # for the interactive demo app
38
+ pip install "doclighter[dev]" # for contributors
39
+ ```
40
+
41
+ ## Quickstart
42
+
43
+ ### Load a document
44
+
45
+ ```python
46
+ from doclighter import Doclighter
47
+
48
+ # From a PDF on disk
49
+ doc = Doclighter.from_pdf("contract.pdf")
50
+
51
+ # From a PDF URL
52
+ doc = Doclighter.from_url("https://example.com/contract.pdf")
53
+
54
+ # From raw text
55
+ doc = Doclighter.from_text(open("paper.txt").read())
56
+ ```
57
+
58
+ The first call downloads the default embedding model (~80 MB MiniLM) and embeds your document. For a ~10K word doc this takes ~25 seconds. Subsequent searches reuse the index.
59
+
60
+ ### Search
61
+
62
+ ```python
63
+ result = doc.search("termination clauses")
64
+
65
+ result.word_scores # numpy array, shape (n_words,), values in [0, 1]
66
+ result.top_chunks(k=10) # list of (chunk_text, score, (start, end))
67
+ result.elapsed_ms # ~10-50ms for typical docs
68
+ result.to_html() # HTML string for display anywhere
69
+ ```
70
+
71
+ In Jupyter, just put `result` on the last line of a cell — it renders the heatmap inline.
72
+
73
+ ### Zoom: the `decay_sigma` knob
74
+
75
+ The differentiating feature. `decay_sigma` controls how far semantic warmth spreads from a matched region:
76
+
77
+ ```python
78
+ narrow = doc.search("termination", decay_sigma=5.0) # sharp word-level highlights
79
+ broad = doc.search("termination", decay_sigma=80.0) # broad thematic regions
80
+ ```
81
+
82
+ Same index, no re-embedding. Drag the σ slider in the Streamlit demo to feel what this does.
83
+
84
+ ### Multi-query
85
+
86
+ ```python
87
+ result = doc.search(
88
+ ["termination", "indemnification", "labour wages"],
89
+     multi_query_aggregate="max",   # or "sum" to favor regions matching multiple queries
90
+ )
91
+ ```
92
+
93
+ ### Save / load the index
94
+
95
+ Embedding is the slow step. Save once, reuse:
96
+
97
+ ```python
98
+ doc.save("contract.idx")
99
+ doc = Doclighter.load("contract.idx")
100
+ ```
101
+
102
+ ### Bring your own embedder
103
+
104
+ Any callable mapping `list[str] -> np.ndarray` of shape `(N, dim)` works:
105
+
106
+ ```python
107
+ from sentence_transformers import SentenceTransformer
108
+
109
+ bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
110
+ doc = Doclighter.from_text(text, embedder=bge.encode)
111
+ ```
112
+
113
+ ## Streamlit demo
114
+
115
+ ```bash
116
+ pip install "doclighter[streamlit]"
117
+ streamlit run examples/streamlit_app.py
118
+ ```
119
+
120
+ A working UI with PDF upload, query box, σ slider, and live re-rendering.
121
+
122
+ ## How it works
123
+
124
+ 1. **Chunk** the document into small rolling windows (default: 12 words, 50% overlap).
125
+ 2. **Embed** each chunk with sentence-transformers (default: `all-MiniLM-L6-v2`).
126
+ 3. **Score** each chunk against your query via cosine similarity.
127
+ 4. **Project** chunk scores back onto every word via exponential proximity decay:
128
+
129
+ ```
130
+ word_score[w] = max over chunks c of raw[c] × exp(-distance(w, c) / sigma)
131
+ ```
132
+
133
+ 5. **Render** the document as colored HTML — words inherit warmth from their nearest semantically matched chunk.
134
+
135
+ Step 4 is the interesting one. Max-aggregation (rather than sum) means a word's color reflects its single strongest semantic neighbor — visually intuitive and resistant to "many lukewarm chunks add up to red" noise.
136
+
137
+ ## How this compares to RAG
138
+
139
+ `doclighter` and RAG solve different problems:
140
+
141
+ | | RAG | doclighter |
142
+ |---|---|---|
143
+ | Output | LLM-generated answer | Document, recolored |
144
+ | Best for | "What's the answer?" | "Where in the doc?" |
145
+ | When query has no answer | LLM hedges / hallucinates | Document stays cold (honest) |
146
+ | Hides chunk boundaries | No — top-K cliff | Yes — gradient smooths over |
147
+ | Cost per query | LLM tokens | Free (one matmul) |
148
+ | Determinism | Sampling-dependent | Fully deterministic |
149
+
150
+ They're complementary. Use RAG when you trust the LLM with the question; use `doclighter` when you need to read the source yourself.
151
+
152
+ The algorithmic kernel is conceptually related to [ColBERT](https://github.com/stanford-futuredata/ColBERT)'s MaxSim late-interaction, but applied to *visualization* rather than ranking, and at the word level rather than the token level.
153
+
154
+ ## API reference
155
+
156
+ ### `Doclighter(text, **kwargs)`
157
+
158
+ | Parameter | Default | Description |
159
+ |---|---|---|
160
+ | `text` | required | The document as a string |
161
+ | `chunk_size` | `12` | Words per rolling window |
162
+ | `chunk_overlap` | `0.5` | Fraction overlap between windows |
163
+ | `embedder` | `None` | Custom embedder callable (default: MiniLM) |
164
+ | `embedding_model` | `"all-MiniLM-L6-v2"` | sentence-transformers model name |
165
+ | `decay_sigma` | `20.0` | Default proximity decay scale (words) |
166
+ | `quantize` | `False` | Use SQ8 FAISS index instead of flat exact |
167
+ | `quantize_rerank_k` | `200` | If quantize=True, rerank top-K with exact |
168
+
169
+ Alternate constructors: `Doclighter.from_text(...)`, `Doclighter.from_pdf(path, ...)`, `Doclighter.from_url(url, ...)`.
170
+
171
+ ### `doc.search(query, **kwargs) -> SearchResult`
172
+
173
+ | Parameter | Default | Description |
174
+ |---|---|---|
175
+ | `query` | required | A string, or list of strings for multi-query |
176
+ | `decay_sigma` | `None` | Override the doc's default sigma |
177
+ | `multi_query_aggregate` | `"max"` | `"max"`, `"mean"`, or `"sum"` for multi-query |
178
+
179
+ ### `SearchResult`
180
+
181
+ | Attribute / method | Returns |
182
+ |---|---|
183
+ | `.word_scores` | `np.ndarray` of shape `(n_words,)`, in `[0, 1]` |
184
+ | `.chunk_scores` | `np.ndarray` of raw per-chunk cosine scores |
185
+ | `.top_chunks(k=10)` | `list[(text, score, (start, end))]` |
186
+ | `.to_html(**kwargs)` | HTML string of the heatmap-colored document |
187
+ | `.elapsed_ms` | Search latency |
188
+
189
+ ## Development
190
+
191
+ ```bash
192
+ git clone https://github.com/pratyush272/doclighter
193
+ cd doclighter
194
+ pip install -e ".[dev]"
195
+ pytest
196
+ ```
197
+
198
+ ## License
199
+
200
+ MIT. See [LICENSE](LICENSE).
201
+
202
+ ## Citation / acknowledgement
203
+
204
+ If you use `doclighter` in research, a link back is appreciated. The proximity-decay scoring idea borrows from passage-retrieval literature; max-aggregation over fine-grained matches is in spirit closest to ColBERT.
@@ -0,0 +1,21 @@
1
+ """Doclighter — a semantic Ctrl+F that paints your document with relevance.
2
+
3
+ See https://github.com/pratyush272/doclighter for docs.
4
+ """
5
+ from .core import Doclighter, SearchResult
6
+ from .chunking import Chunk, make_chunks
7
+ from .scoring import word_heatmap, aggregate_multi_query
8
+ from .render import render_html, score_to_hex
9
+
10
# Package version string.
__version__ = "0.1.0"

# Public API of the package: exactly the names imported above from the
# core, chunking, scoring and render submodules.
__all__ = [
    "Doclighter",
    "SearchResult",
    "Chunk",
    "make_chunks",
    "word_heatmap",
    "aggregate_multi_query",
    "render_html",
    "score_to_hex",
]
@@ -0,0 +1,58 @@
1
+ """Rolling window chunking.
2
+
3
+ Small word-window chunks (default 12 words, 50% overlap) are the unit of
4
+ semantic match. This is deliberately finer than typical RAG chunking
5
+ (256-1024 tokens) because Doclighter is a visualization tool, not a
6
+ context-window filler — fine chunks give fine spatial resolution.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from typing import List
12
+
13
+
14
@dataclass(frozen=True)
class Chunk:
    """One rolling window taken from the document's word list.

    Instances are immutable (``frozen=True``). ``start`` and ``end`` form a
    half-open range ``[start, end)`` of word indices.
    """

    # Text of the window.
    text: str
    # Index of the first word in the window (inclusive).
    start: int
    # Index one past the last word in the window (exclusive).
    end: int
21
+
22
+
23
def make_chunks(
    words: List[str],
    chunk_size: int = 12,
    overlap: float = 0.5,
) -> List[Chunk]:
    """Split a word list into rolling windows.

    Parameters
    ----------
    words : list of str
        Tokenized document (typically ``text.split()``).
    chunk_size : int
        Words per window. Default 12 — small enough that semantic units rarely
        get cut, large enough that MiniLM produces a useful embedding.
    overlap : float
        Fraction of overlap between consecutive windows, in [0, 1).
        Default 0.5 means 50% overlap (step = chunk_size / 2).

    Returns
    -------
    list of Chunk
        Windows in document order. The final window may hold fewer than
        ``chunk_size`` words. Empty when ``words`` is empty.

    Raises
    ------
    ValueError
        If ``overlap`` is outside [0, 1) or ``chunk_size`` is less than 1.
    """
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must be in [0, 1), got {overlap}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Binary floating point makes e.g. 10 * (1 - 0.8) == 1.9999999999999996;
    # truncating that directly would give a step of 1 instead of 2. Rounding
    # off the noise first keeps the intended floor semantics (1.5 -> 1).
    step = max(1, int(round(chunk_size * (1 - overlap), 9)))

    n_words = len(words)
    chunks: List[Chunk] = []
    for i in range(0, n_words, step):
        # i < n_words is guaranteed by range(), so the window is never empty.
        window = words[i : i + chunk_size]
        chunks.append(Chunk(text=" ".join(window), start=i, end=i + len(window)))
        if i + chunk_size >= n_words:
            break  # this window already reaches the end of the document
    return chunks