doclighter 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doclighter-0.1.0/LICENSE +21 -0
- doclighter-0.1.0/PKG-INFO +245 -0
- doclighter-0.1.0/README.md +204 -0
- doclighter-0.1.0/doclighter/__init__.py +21 -0
- doclighter-0.1.0/doclighter/chunking.py +58 -0
- doclighter-0.1.0/doclighter/core.py +297 -0
- doclighter-0.1.0/doclighter/embedding.py +57 -0
- doclighter-0.1.0/doclighter/extract.py +38 -0
- doclighter-0.1.0/doclighter/index.py +72 -0
- doclighter-0.1.0/doclighter/render.py +124 -0
- doclighter-0.1.0/doclighter/scoring.py +118 -0
- doclighter-0.1.0/doclighter.egg-info/PKG-INFO +245 -0
- doclighter-0.1.0/doclighter.egg-info/SOURCES.txt +20 -0
- doclighter-0.1.0/doclighter.egg-info/dependency_links.txt +1 -0
- doclighter-0.1.0/doclighter.egg-info/requires.txt +18 -0
- doclighter-0.1.0/doclighter.egg-info/top_level.txt +1 -0
- doclighter-0.1.0/pyproject.toml +66 -0
- doclighter-0.1.0/setup.cfg +4 -0
- doclighter-0.1.0/tests/test_chunking.py +57 -0
- doclighter-0.1.0/tests/test_integration.py +129 -0
- doclighter-0.1.0/tests/test_render.py +72 -0
- doclighter-0.1.0/tests/test_scoring.py +132 -0
doclighter-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pratyush
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doclighter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A semantic Ctrl+F that paints your document with a relevance gradient.
|
|
5
|
+
Author-email: Pratyush <pratyush272@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/pratyush272/doclighter
|
|
8
|
+
Project-URL: Repository, https://github.com/pratyush272/doclighter
|
|
9
|
+
Project-URL: Issues, https://github.com/pratyush272/doclighter/issues
|
|
10
|
+
Keywords: semantic-search,embeddings,visualization,rag,nlp,pdf
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: numpy>=1.21
|
|
27
|
+
Requires-Dist: sentence-transformers>=2.2
|
|
28
|
+
Requires-Dist: pypdf>=4.0
|
|
29
|
+
Requires-Dist: requests>=2.25
|
|
30
|
+
Provides-Extra: quantize
|
|
31
|
+
Requires-Dist: faiss-cpu>=1.7; extra == "quantize"
|
|
32
|
+
Provides-Extra: streamlit
|
|
33
|
+
Requires-Dist: streamlit>=1.28; extra == "streamlit"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
38
|
+
Provides-Extra: all
|
|
39
|
+
Requires-Dist: doclighter[dev,quantize,streamlit]; extra == "all"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# doclighter
|
|
43
|
+
|
|
44
|
+
[CI tests](https://github.com/pratyush272/doclighter/actions/workflows/test.yml)
|
|
45
|
+
[PyPI package](https://pypi.org/project/doclighter/)
|
|
46
|
+
[Supported Python versions](https://pypi.org/project/doclighter/)
|
|
47
|
+
[MIT License](LICENSE)
|
|
48
|
+
|
|
49
|
+
**A semantic Ctrl+F that paints your document with a relevance gradient.**
|
|
50
|
+
|
|
51
|
+
`doclighter` is what you reach for when you need to *see* where a topic lives in a document — not be told an answer. It embeds your document at fine granularity, then projects query relevance back onto every word as a heatmap. No LLM, no hallucination, no top-K cliff.
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from doclighter import Doclighter
|
|
55
|
+
|
|
56
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
57
|
+
result = doc.search("termination clauses")
|
|
58
|
+
result # In Jupyter: renders the whole document, color-coded by relevance
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Why this exists
|
|
62
|
+
|
|
63
|
+
Traditional RAG hands an LLM the top 3–10 chunks and asks it to generate an answer. That's great when you trust the answer and don't need to read the source. But sometimes you *need to read the source* — legal review, paper skimming, contract diffing, due diligence — and you want a tool that helps you *navigate* a long document, not summarize it away.
|
|
64
|
+
|
|
65
|
+
`doclighter` is for that. It treats the whole document as the output, and re-colors it by semantic relevance to your query. Hot regions deserve your attention; cold regions you can skim past.
|
|
66
|
+
|
|
67
|
+
It's deterministic, fast (sub-100ms per query after indexing), and shows you the long tail — including the case where your query *doesn't* match anything (everything stays cold blue, which is itself useful information that RAG hides).
|
|
68
|
+
|
|
69
|
+
## Install
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install doclighter
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Optional extras:
|
|
76
|
+
```bash
|
|
77
|
+
pip install "doclighter[quantize]" # FAISS SQ8 index for very large docs
|
|
78
|
+
pip install "doclighter[streamlit]" # for the interactive demo app
|
|
79
|
+
pip install "doclighter[dev]" # for contributors
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Quickstart
|
|
83
|
+
|
|
84
|
+
### Load a document
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from doclighter import Doclighter
|
|
88
|
+
|
|
89
|
+
# From a PDF on disk
|
|
90
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
91
|
+
|
|
92
|
+
# From a PDF URL
|
|
93
|
+
doc = Doclighter.from_url("https://example.com/contract.pdf")
|
|
94
|
+
|
|
95
|
+
# From raw text
|
|
96
|
+
doc = Doclighter.from_text(open("paper.txt").read())
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The first call downloads the default embedding model (~80 MB MiniLM) and embeds your document. For a ~10K word doc this takes ~25 seconds. Subsequent searches reuse the index.
|
|
100
|
+
|
|
101
|
+
### Search
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
result = doc.search("termination clauses")
|
|
105
|
+
|
|
106
|
+
result.word_scores # numpy array, shape (n_words,), values in [0, 1]
|
|
107
|
+
result.top_chunks(k=10) # list of (chunk_text, score, (start, end))
|
|
108
|
+
result.elapsed_ms # ~10-50ms for typical docs
|
|
109
|
+
result.to_html() # HTML string for display anywhere
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
In Jupyter, just put `result` on the last line of a cell — it renders the heatmap inline.
|
|
113
|
+
|
|
114
|
+
### Zoom: the `decay_sigma` knob
|
|
115
|
+
|
|
116
|
+
The differentiating feature. `decay_sigma` controls how far semantic warmth spreads from a matched region:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
narrow = doc.search("termination", decay_sigma=5.0) # sharp word-level highlights
|
|
120
|
+
broad = doc.search("termination", decay_sigma=80.0) # broad thematic regions
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Same index, no re-embedding. Drag the σ slider in the Streamlit demo to feel what this does.
|
|
124
|
+
|
|
125
|
+
### Multi-query
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
result = doc.search(
|
|
129
|
+
["termination", "indemnification", "labour wages"],
|
|
130
|
+
multi_query_aggregate="max", # or "sum" to favor regions matching multiple queries
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Save / load the index
|
|
135
|
+
|
|
136
|
+
Embedding is the slow step. Save once, reuse:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
doc.save("contract.idx")
|
|
140
|
+
doc = Doclighter.load("contract.idx")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Bring your own embedder
|
|
144
|
+
|
|
145
|
+
Any callable mapping `list[str] -> np.ndarray` of shape `(N, dim)` works:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from sentence_transformers import SentenceTransformer
|
|
149
|
+
|
|
150
|
+
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
|
151
|
+
doc = Doclighter.from_text(text, embedder=bge.encode)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Streamlit demo
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pip install "doclighter[streamlit]"
|
|
158
|
+
streamlit run examples/streamlit_app.py
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
A working UI with PDF upload, query box, σ slider, and live re-rendering.
|
|
162
|
+
|
|
163
|
+
## How it works
|
|
164
|
+
|
|
165
|
+
1. **Chunk** the document into small rolling windows (default: 12 words, 50% overlap).
|
|
166
|
+
2. **Embed** each chunk with sentence-transformers (default: `all-MiniLM-L6-v2`).
|
|
167
|
+
3. **Score** each chunk against your query via cosine similarity.
|
|
168
|
+
4. **Project** chunk scores back onto every word via exponential proximity decay:
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
word_score[w] = max over chunks c of raw[c] × exp(-distance(w, c) / sigma)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
5. **Render** the document as colored HTML — words inherit warmth from their nearest semantically matched chunk.
|
|
175
|
+
|
|
176
|
+
Step 4 is the interesting one. Max-aggregation (rather than sum) means a word's color reflects its single strongest semantic neighbor — visually intuitive and resistant to "many lukewarm chunks add up to red" noise.
|
|
177
|
+
|
|
178
|
+
## How this compares to RAG
|
|
179
|
+
|
|
180
|
+
`doclighter` and RAG solve different problems:
|
|
181
|
+
|
|
182
|
+
| | RAG | doclighter |
|
|
183
|
+
|---|---|---|
|
|
184
|
+
| Output | LLM-generated answer | Document, recolored |
|
|
185
|
+
| Best for | "What's the answer?" | "Where in the doc?" |
|
|
186
|
+
| When query has no answer | LLM hedges / hallucinates | Document stays cold (honest) |
|
|
187
|
+
| Hides chunk boundaries | No — top-K cliff | Yes — the gradient smooths over them |
|
|
188
|
+
| Cost per query | LLM tokens | Free (one matmul) |
|
|
189
|
+
| Determinism | Sampling-dependent | Fully deterministic |
|
|
190
|
+
|
|
191
|
+
They're complementary. Use RAG when you trust the LLM with the question; use `doclighter` when you need to read the source yourself.
|
|
192
|
+
|
|
193
|
+
The algorithmic kernel is conceptually related to [ColBERT](https://github.com/stanford-futuredata/ColBERT)'s MaxSim late-interaction, but applied to *visualization* rather than ranking, and at the word-level rather than the token-level.
|
|
194
|
+
|
|
195
|
+
## API reference
|
|
196
|
+
|
|
197
|
+
### `Doclighter(text, **kwargs)`
|
|
198
|
+
|
|
199
|
+
| Parameter | Default | Description |
|
|
200
|
+
|---|---|---|
|
|
201
|
+
| `text` | required | The document as a string |
|
|
202
|
+
| `chunk_size` | `12` | Words per rolling window |
|
|
203
|
+
| `chunk_overlap` | `0.5` | Fraction overlap between windows |
|
|
204
|
+
| `embedder` | `None` | Custom embedder callable (default: MiniLM) |
|
|
205
|
+
| `embedding_model` | `"all-MiniLM-L6-v2"` | sentence-transformers model name |
|
|
206
|
+
| `decay_sigma` | `20.0` | Default proximity decay scale (words) |
|
|
207
|
+
| `quantize` | `False` | Use SQ8 FAISS index instead of flat exact |
|
|
208
|
+
| `quantize_rerank_k` | `200` | If quantize=True, rerank top-K with exact |
|
|
209
|
+
|
|
210
|
+
Alternate constructors: `Doclighter.from_text(...)`, `Doclighter.from_pdf(path, ...)`, `Doclighter.from_url(url, ...)`.
|
|
211
|
+
|
|
212
|
+
### `doc.search(query, **kwargs) -> SearchResult`
|
|
213
|
+
|
|
214
|
+
| Parameter | Default | Description |
|
|
215
|
+
|---|---|---|
|
|
216
|
+
| `query` | required | A string, or list of strings for multi-query |
|
|
217
|
+
| `decay_sigma` | `None` | Override the doc's default sigma |
|
|
218
|
+
| `multi_query_aggregate` | `"max"` | `"max"`, `"mean"`, or `"sum"` for multi-query |
|
|
219
|
+
|
|
220
|
+
### `SearchResult`
|
|
221
|
+
|
|
222
|
+
| Attribute / method | Returns |
|
|
223
|
+
|---|---|
|
|
224
|
+
| `.word_scores` | `np.ndarray` of shape `(n_words,)`, in `[0, 1]` |
|
|
225
|
+
| `.chunk_scores` | `np.ndarray` of raw per-chunk cosine scores |
|
|
226
|
+
| `.top_chunks(k=10)` | `list[(text, score, (start, end))]` |
|
|
227
|
+
| `.to_html(**kwargs)` | HTML string of the heatmap-colored document |
|
|
228
|
+
| `.elapsed_ms` | Search latency |
|
|
229
|
+
|
|
230
|
+
## Development
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
git clone https://github.com/pratyush272/doclighter
|
|
234
|
+
cd doclighter
|
|
235
|
+
pip install -e ".[dev]"
|
|
236
|
+
pytest
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## License
|
|
240
|
+
|
|
241
|
+
MIT. See [LICENSE](LICENSE).
|
|
242
|
+
|
|
243
|
+
## Citation / acknowledgement
|
|
244
|
+
|
|
245
|
+
If you use `doclighter` in research, a link back is appreciated. The proximity-decay scoring idea borrows from passage-retrieval literature; max-aggregation over fine-grained matches is in spirit closest to ColBERT.
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# doclighter
|
|
2
|
+
|
|
3
|
+
[CI tests](https://github.com/pratyush272/doclighter/actions/workflows/test.yml)
|
|
4
|
+
[PyPI package](https://pypi.org/project/doclighter/)
|
|
5
|
+
[Supported Python versions](https://pypi.org/project/doclighter/)
|
|
6
|
+
[MIT License](LICENSE)
|
|
7
|
+
|
|
8
|
+
**A semantic Ctrl+F that paints your document with a relevance gradient.**
|
|
9
|
+
|
|
10
|
+
`doclighter` is what you reach for when you need to *see* where a topic lives in a document — not be told an answer. It embeds your document at fine granularity, then projects query relevance back onto every word as a heatmap. No LLM, no hallucination, no top-K cliff.
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from doclighter import Doclighter
|
|
14
|
+
|
|
15
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
16
|
+
result = doc.search("termination clauses")
|
|
17
|
+
result # In Jupyter: renders the whole document, color-coded by relevance
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Why this exists
|
|
21
|
+
|
|
22
|
+
Traditional RAG hands an LLM the top 3–10 chunks and asks it to generate an answer. That's great when you trust the answer and don't need to read the source. But sometimes you *need to read the source* — legal review, paper skimming, contract diffing, due diligence — and you want a tool that helps you *navigate* a long document, not summarize it away.
|
|
23
|
+
|
|
24
|
+
`doclighter` is for that. It treats the whole document as the output, and re-colors it by semantic relevance to your query. Hot regions deserve your attention; cold regions you can skim past.
|
|
25
|
+
|
|
26
|
+
It's deterministic, fast (sub-100ms per query after indexing), and shows you the long tail — including the case where your query *doesn't* match anything (everything stays cold blue, which is itself useful information that RAG hides).
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install doclighter
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Optional extras:
|
|
35
|
+
```bash
|
|
36
|
+
pip install "doclighter[quantize]" # FAISS SQ8 index for very large docs
|
|
37
|
+
pip install "doclighter[streamlit]" # for the interactive demo app
|
|
38
|
+
pip install "doclighter[dev]" # for contributors
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quickstart
|
|
42
|
+
|
|
43
|
+
### Load a document
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from doclighter import Doclighter
|
|
47
|
+
|
|
48
|
+
# From a PDF on disk
|
|
49
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
50
|
+
|
|
51
|
+
# From a PDF URL
|
|
52
|
+
doc = Doclighter.from_url("https://example.com/contract.pdf")
|
|
53
|
+
|
|
54
|
+
# From raw text
|
|
55
|
+
doc = Doclighter.from_text(open("paper.txt").read())
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The first call downloads the default embedding model (~80 MB MiniLM) and embeds your document. For a ~10K word doc this takes ~25 seconds. Subsequent searches reuse the index.
|
|
59
|
+
|
|
60
|
+
### Search
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
result = doc.search("termination clauses")
|
|
64
|
+
|
|
65
|
+
result.word_scores # numpy array, shape (n_words,), values in [0, 1]
|
|
66
|
+
result.top_chunks(k=10) # list of (chunk_text, score, (start, end))
|
|
67
|
+
result.elapsed_ms # ~10-50ms for typical docs
|
|
68
|
+
result.to_html() # HTML string for display anywhere
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
In Jupyter, just put `result` on the last line of a cell — it renders the heatmap inline.
|
|
72
|
+
|
|
73
|
+
### Zoom: the `decay_sigma` knob
|
|
74
|
+
|
|
75
|
+
The differentiating feature. `decay_sigma` controls how far semantic warmth spreads from a matched region:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
narrow = doc.search("termination", decay_sigma=5.0) # sharp word-level highlights
|
|
79
|
+
broad = doc.search("termination", decay_sigma=80.0) # broad thematic regions
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Same index, no re-embedding. Drag the σ slider in the Streamlit demo to feel what this does.
|
|
83
|
+
|
|
84
|
+
### Multi-query
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
result = doc.search(
|
|
88
|
+
["termination", "indemnification", "labour wages"],
|
|
89
|
+
multi_query_aggregate="max", # or "sum" to favor regions matching multiple queries
|
|
90
|
+
)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Save / load the index
|
|
94
|
+
|
|
95
|
+
Embedding is the slow step. Save once, reuse:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
doc.save("contract.idx")
|
|
99
|
+
doc = Doclighter.load("contract.idx")
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Bring your own embedder
|
|
103
|
+
|
|
104
|
+
Any callable mapping `list[str] -> np.ndarray` of shape `(N, dim)` works:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from sentence_transformers import SentenceTransformer
|
|
108
|
+
|
|
109
|
+
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
|
110
|
+
doc = Doclighter.from_text(text, embedder=bge.encode)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Streamlit demo
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
pip install "doclighter[streamlit]"
|
|
117
|
+
streamlit run examples/streamlit_app.py
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
A working UI with PDF upload, query box, σ slider, and live re-rendering.
|
|
121
|
+
|
|
122
|
+
## How it works
|
|
123
|
+
|
|
124
|
+
1. **Chunk** the document into small rolling windows (default: 12 words, 50% overlap).
|
|
125
|
+
2. **Embed** each chunk with sentence-transformers (default: `all-MiniLM-L6-v2`).
|
|
126
|
+
3. **Score** each chunk against your query via cosine similarity.
|
|
127
|
+
4. **Project** chunk scores back onto every word via exponential proximity decay:
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
word_score[w] = max over chunks c of raw[c] × exp(-distance(w, c) / sigma)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
5. **Render** the document as colored HTML — words inherit warmth from their nearest semantically matched chunk.
|
|
134
|
+
|
|
135
|
+
Step 4 is the interesting one. Max-aggregation (rather than sum) means a word's color reflects its single strongest semantic neighbor — visually intuitive and resistant to "many lukewarm chunks add up to red" noise.
|
|
136
|
+
|
|
137
|
+
## How this compares to RAG
|
|
138
|
+
|
|
139
|
+
`doclighter` and RAG solve different problems:
|
|
140
|
+
|
|
141
|
+
| | RAG | doclighter |
|
|
142
|
+
|---|---|---|
|
|
143
|
+
| Output | LLM-generated answer | Document, recolored |
|
|
144
|
+
| Best for | "What's the answer?" | "Where in the doc?" |
|
|
145
|
+
| When query has no answer | LLM hedges / hallucinates | Document stays cold (honest) |
|
|
146
|
+
| Hides chunk boundaries | No — top-K cliff | Yes — the gradient smooths over them |
|
|
147
|
+
| Cost per query | LLM tokens | Free (one matmul) |
|
|
148
|
+
| Determinism | Sampling-dependent | Fully deterministic |
|
|
149
|
+
|
|
150
|
+
They're complementary. Use RAG when you trust the LLM with the question; use `doclighter` when you need to read the source yourself.
|
|
151
|
+
|
|
152
|
+
The algorithmic kernel is conceptually related to [ColBERT](https://github.com/stanford-futuredata/ColBERT)'s MaxSim late-interaction, but applied to *visualization* rather than ranking, and at the word-level rather than the token-level.
|
|
153
|
+
|
|
154
|
+
## API reference
|
|
155
|
+
|
|
156
|
+
### `Doclighter(text, **kwargs)`
|
|
157
|
+
|
|
158
|
+
| Parameter | Default | Description |
|
|
159
|
+
|---|---|---|
|
|
160
|
+
| `text` | required | The document as a string |
|
|
161
|
+
| `chunk_size` | `12` | Words per rolling window |
|
|
162
|
+
| `chunk_overlap` | `0.5` | Fraction overlap between windows |
|
|
163
|
+
| `embedder` | `None` | Custom embedder callable (default: MiniLM) |
|
|
164
|
+
| `embedding_model` | `"all-MiniLM-L6-v2"` | sentence-transformers model name |
|
|
165
|
+
| `decay_sigma` | `20.0` | Default proximity decay scale (words) |
|
|
166
|
+
| `quantize` | `False` | Use SQ8 FAISS index instead of flat exact |
|
|
167
|
+
| `quantize_rerank_k` | `200` | If quantize=True, rerank top-K with exact |
|
|
168
|
+
|
|
169
|
+
Alternate constructors: `Doclighter.from_text(...)`, `Doclighter.from_pdf(path, ...)`, `Doclighter.from_url(url, ...)`.
|
|
170
|
+
|
|
171
|
+
### `doc.search(query, **kwargs) -> SearchResult`
|
|
172
|
+
|
|
173
|
+
| Parameter | Default | Description |
|
|
174
|
+
|---|---|---|
|
|
175
|
+
| `query` | required | A string, or list of strings for multi-query |
|
|
176
|
+
| `decay_sigma` | `None` | Override the doc's default sigma |
|
|
177
|
+
| `multi_query_aggregate` | `"max"` | `"max"`, `"mean"`, or `"sum"` for multi-query |
|
|
178
|
+
|
|
179
|
+
### `SearchResult`
|
|
180
|
+
|
|
181
|
+
| Attribute / method | Returns |
|
|
182
|
+
|---|---|
|
|
183
|
+
| `.word_scores` | `np.ndarray` of shape `(n_words,)`, in `[0, 1]` |
|
|
184
|
+
| `.chunk_scores` | `np.ndarray` of raw per-chunk cosine scores |
|
|
185
|
+
| `.top_chunks(k=10)` | `list[(text, score, (start, end))]` |
|
|
186
|
+
| `.to_html(**kwargs)` | HTML string of the heatmap-colored document |
|
|
187
|
+
| `.elapsed_ms` | Search latency |
|
|
188
|
+
|
|
189
|
+
## Development
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
git clone https://github.com/pratyush272/doclighter
|
|
193
|
+
cd doclighter
|
|
194
|
+
pip install -e ".[dev]"
|
|
195
|
+
pytest
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## License
|
|
199
|
+
|
|
200
|
+
MIT. See [LICENSE](LICENSE).
|
|
201
|
+
|
|
202
|
+
## Citation / acknowledgement
|
|
203
|
+
|
|
204
|
+
If you use `doclighter` in research, a link back is appreciated. The proximity-decay scoring idea borrows from passage-retrieval literature; max-aggregation over fine-grained matches is in spirit closest to ColBERT.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Doclighter — a semantic Ctrl+F that paints your document with relevance.
|
|
2
|
+
|
|
3
|
+
See https://github.com/pratyush272/doclighter for docs.
|
|
4
|
+
"""
|
|
5
|
+
from .core import Doclighter, SearchResult
|
|
6
|
+
from .chunking import Chunk, make_chunks
|
|
7
|
+
from .scoring import word_heatmap, aggregate_multi_query
|
|
8
|
+
from .render import render_html, score_to_hex
|
|
9
|
+
|
|
10
|
+
# Package version string exposed as ``doclighter.__version__``.
__version__ = "0.1.0"

# Public API of the package: the names exported by ``from doclighter import *``.
# Each entry is imported at the top of this module from .core, .chunking,
# .scoring or .render.
__all__ = [
    "Doclighter",
    "SearchResult",
    "Chunk",
    "make_chunks",
    "word_heatmap",
    "aggregate_multi_query",
    "render_html",
    "score_to_hex",
]
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Rolling window chunking.
|
|
2
|
+
|
|
3
|
+
Small word-window chunks (default 12 words, 50% overlap) are the unit of
|
|
4
|
+
semantic match. This is deliberately finer than typical RAG chunking
|
|
5
|
+
(256-1024 tokens) because Doclighter is a visualization tool, not a
|
|
6
|
+
context-window filler — fine chunks give fine spatial resolution.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class Chunk:
    """A single rolling window over the document word list.

    Instances are immutable (``frozen=True``). ``start`` and ``end`` are
    indices into the word list the chunk was cut from and form a half-open
    range ``[start, end)``.
    """

    text: str  # the window's words as one string (make_chunks joins them with " ")
    start: int  # inclusive word index
    end: int  # exclusive word index
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def make_chunks(
    words: List[str],
    chunk_size: int = 12,
    overlap: float = 0.5,
) -> List[Chunk]:
    """Split a word list into rolling windows.

    Parameters
    ----------
    words : list of str
        Tokenized document (typically ``text.split()``).
    chunk_size : int
        Words per window. Default 12 — small enough that semantic units rarely
        get cut, large enough that MiniLM produces a useful embedding.
    overlap : float
        Fraction of overlap between consecutive windows, in [0, 1).
        Default 0.5 means 50% overlap (step = chunk_size / 2).

    Returns
    -------
    list of Chunk
        Windows in document order. Each chunk covers the half-open word range
        ``[start, end)``; the last chunk may be shorter than ``chunk_size``.
        An empty ``words`` list yields an empty result.

    Raises
    ------
    ValueError
        If ``overlap`` is outside [0, 1) or ``chunk_size`` is less than 1.
    """
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must be in [0, 1), got {overlap}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # ``1 - overlap`` is often inexact in binary floating point (for example
    # 1 - 0.9 == 0.09999999999999998), so a product that should be a whole
    # number can land just below it and be truncated one step too low. The
    # small tolerance restores the intended integer before truncation.
    # The step is never allowed to drop below one word, so the loop advances.
    step = max(1, int(chunk_size * (1 - overlap) + 1e-9))

    n_words = len(words)
    chunks: List[Chunk] = []
    for start in range(0, n_words, step):
        # ``start`` is always < n_words here, so the window is never empty.
        window = words[start : start + chunk_size]
        chunks.append(
            Chunk(text=" ".join(window), start=start, end=start + len(window))
        )
        if start + chunk_size >= n_words:
            break  # this window already reaches the end of the document
    return chunks
|