doclighter 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doclighter/__init__.py +21 -0
- doclighter/chunking.py +58 -0
- doclighter/core.py +297 -0
- doclighter/embedding.py +57 -0
- doclighter/extract.py +38 -0
- doclighter/index.py +72 -0
- doclighter/render.py +124 -0
- doclighter/scoring.py +118 -0
- doclighter-0.1.0.dist-info/METADATA +245 -0
- doclighter-0.1.0.dist-info/RECORD +13 -0
- doclighter-0.1.0.dist-info/WHEEL +5 -0
- doclighter-0.1.0.dist-info/licenses/LICENSE +21 -0
- doclighter-0.1.0.dist-info/top_level.txt +1 -0
doclighter/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Doclighter — a semantic Ctrl+F that paints your document with relevance.
|
|
2
|
+
|
|
3
|
+
See https://github.com/pratyush272/doclighter for docs.
|
|
4
|
+
"""
|
|
5
|
+
from .core import Doclighter, SearchResult
|
|
6
|
+
from .chunking import Chunk, make_chunks
|
|
7
|
+
from .scoring import word_heatmap, aggregate_multi_query
|
|
8
|
+
from .render import render_html, score_to_hex
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"Doclighter",
|
|
14
|
+
"SearchResult",
|
|
15
|
+
"Chunk",
|
|
16
|
+
"make_chunks",
|
|
17
|
+
"word_heatmap",
|
|
18
|
+
"aggregate_multi_query",
|
|
19
|
+
"render_html",
|
|
20
|
+
"score_to_hex",
|
|
21
|
+
]
|
doclighter/chunking.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Rolling window chunking.
|
|
2
|
+
|
|
3
|
+
Small word-window chunks (default 12 words, 50% overlap) are the unit of
|
|
4
|
+
semantic match. This is deliberately finer than typical RAG chunking
|
|
5
|
+
(256-1024 tokens) because Doclighter is a visualization tool, not a
|
|
6
|
+
context-window filler — fine chunks give fine spatial resolution.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class Chunk:
    """A single rolling window over the document word list."""

    text: str  # the window's words joined by single spaces
    start: int  # inclusive word index
    end: int  # exclusive word index


def make_chunks(
    words: List[str],
    chunk_size: int = 12,
    overlap: float = 0.5,
) -> List[Chunk]:
    """Split a word list into rolling windows.

    Parameters
    ----------
    words : list of str
        Tokenized document (typically ``text.split()``).
    chunk_size : int
        Words per window. Default 12 — small enough that semantic units rarely
        get cut, large enough that MiniLM produces a useful embedding.
    overlap : float
        Fraction of overlap between consecutive windows, in [0, 1).
        Default 0.5 means 50% overlap (step = chunk_size / 2).

    Returns
    -------
    list of Chunk
        Windows in document order. The final window may be shorter than
        ``chunk_size``. An empty ``words`` list yields an empty list.

    Raises
    ------
    ValueError
        If ``overlap`` is outside [0, 1) or ``chunk_size`` is below 1.
    """
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must be in [0, 1), got {overlap}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # The small epsilon absorbs binary floating-point error before truncation:
    # 20 * (1 - 0.9) evaluates to 1.9999999999999996, which would otherwise
    # truncate to a step of 1 instead of the intended 2.
    step = max(1, int(chunk_size * (1 - overlap) + 1e-9))
    n_words = len(words)
    chunks: List[Chunk] = []
    for start in range(0, n_words, step):
        # start < n_words is guaranteed by range(), so the window is never empty.
        window = words[start : start + chunk_size]
        chunks.append(
            Chunk(text=" ".join(window), start=start, end=start + len(window))
        )
        if start + chunk_size >= n_words:
            break  # this window already reaches the end of the document
    return chunks
|
doclighter/core.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""The main Doclighter API.
|
|
2
|
+
|
|
3
|
+
Typical usage::
|
|
4
|
+
|
|
5
|
+
from doclighter import Doclighter
|
|
6
|
+
|
|
7
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
8
|
+
result = doc.search("termination clauses")
|
|
9
|
+
|
|
10
|
+
# In Jupyter:
|
|
11
|
+
from IPython.display import HTML, display
|
|
12
|
+
display(HTML(result.to_html()))
|
|
13
|
+
|
|
14
|
+
# Anywhere else:
|
|
15
|
+
print(result.top_chunks(k=5))
|
|
16
|
+
scores = result.word_scores # numpy array, shape (n_words,)
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import pickle
|
|
21
|
+
import time
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Callable, List, Optional, Sequence, Tuple, Union
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
|
|
28
|
+
from . import extract
|
|
29
|
+
from .chunking import Chunk, make_chunks
|
|
30
|
+
from .embedding import Embedder, default_embedder, wrap_callable
|
|
31
|
+
from .index import FlatIndex, QuantizedIndex
|
|
32
|
+
from .render import render_html
|
|
33
|
+
from .scoring import aggregate_multi_query, word_heatmap
|
|
34
|
+
|
|
35
|
+
PathLike = Union[str, Path]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(eq=False)
class SearchResult:
    """The output of a single Doclighter search.

    Equality is identity-based (``eq=False``): the generated dataclass
    ``__eq__`` would compare the numpy array fields with ``==`` and raise
    ``ValueError`` ("truth value of an array is ambiguous") whenever two
    results were compared.

    Attributes
    ----------
    query : str or list of str
        The query that produced this result. List form for multi-query searches.
    word_scores : np.ndarray, shape (n_words,)
        Per-word relevance in [0, 1]. The thing you'd visualize.
    chunk_scores : np.ndarray, shape (n_chunks,)
        Raw cosine similarity per chunk (before proximity smoothing).
    elapsed_ms : float
        Search latency, including all postprocessing.
    decay_sigma : float
        The sigma used for this search (recorded for reproducibility).
    """

    query: Union[str, List[str]]
    word_scores: np.ndarray
    chunk_scores: np.ndarray
    elapsed_ms: float
    decay_sigma: float
    _words: List[str] = field(repr=False)
    _chunks: "List[Chunk]" = field(repr=False)

    def top_chunks(self, k: int = 10) -> List[Tuple[str, float, Tuple[int, int]]]:
        """Top-k chunks by raw semantic score.

        Returns list of ``(chunk_text, score, (start_word, end_word))``,
        highest score first. Ties keep document order. ``k <= 0`` returns
        an empty list.
        """
        if k <= 0:
            # A negative k would otherwise slice "all but the last |k|" entries.
            return []
        # Stable sort keeps equal-scored chunks in document order.
        order = np.argsort(-np.asarray(self.chunk_scores), kind="stable")[:k]
        ranked = []
        for idx in order:
            chunk = self._chunks[idx]
            ranked.append(
                (chunk.text, float(self.chunk_scores[idx]), (chunk.start, chunk.end))
            )
        return ranked

    def to_html(self, **render_kwargs) -> str:
        """Render the heatmap as an HTML fragment (string).

        Extra kwargs are forwarded to ``doclighter.render.render_html``
        (e.g. ``max_height_px``, ``quantize_levels``).
        """
        # Multi-query results show their sub-queries joined with " | ".
        query_str = self.query if isinstance(self.query, str) else " | ".join(self.query)
        return render_html(
            words=self._words,
            word_scores=self.word_scores,
            query=query_str,
            decay_sigma=self.decay_sigma,
            elapsed_ms=self.elapsed_ms,
            **render_kwargs,
        )

    def _repr_html_(self) -> str:
        """Jupyter automatically calls this for rich display."""
        return self.to_html()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class Doclighter:
    """A searchable, visualizable view of a single document.

    Build once (embedding the document is the slow step), then search many
    times — each search is sub-100ms on typical documents.

    Parameters are mostly set at build time; ``decay_sigma`` can be overridden
    per-search, which lets you toggle between fine word-level highlights
    (small sigma) and broad thematic regions (large sigma) without re-indexing.
    """

    def __init__(
        self,
        text: str,
        *,
        chunk_size: int = 12,
        chunk_overlap: float = 0.5,
        embedder: "Optional[Union[Embedder, Callable]]" = None,
        embedding_model: str = "all-MiniLM-L6-v2",
        decay_sigma: float = 20.0,
        quantize: bool = False,
        quantize_rerank_k: int = 200,
    ):
        """Chunk, embed and index ``text``.

        Parameters
        ----------
        text : str
            The document. Tokenized with ``str.split()``.
        chunk_size, chunk_overlap
            Forwarded to ``make_chunks``.
        embedder : callable, optional
            Custom embedding callable; wrapped with ``wrap_callable``.
            When omitted, ``default_embedder(embedding_model)`` is used.
        embedding_model : str
            Model name handed to ``default_embedder``; also recorded on the
            instance and written by ``save``.
        decay_sigma : float
            Default proximity-decay scale for ``search``; must be > 0.
        quantize : bool
            Use ``QuantizedIndex`` instead of ``FlatIndex``.
        quantize_rerank_k : int
            ``rerank_k`` passed to ``QuantizedIndex``.

        Raises
        ------
        ValueError
            If ``text`` is empty/whitespace, ``decay_sigma`` is not positive,
            or no chunks are produced.
        TypeError
            If ``embedder`` is given but is not callable.
        """
        if not text or not text.strip():
            raise ValueError("text is empty — nothing to index")
        # exp(-d / sigma) is undefined for sigma == 0 and grows with distance
        # for sigma < 0, so reject both up front. `not x > 0` also catches NaN.
        if not decay_sigma > 0:
            raise ValueError(f"decay_sigma must be > 0, got {decay_sigma}")

        self.text = text
        self.words = text.split()
        self.chunks = make_chunks(self.words, chunk_size=chunk_size, overlap=chunk_overlap)
        if not self.chunks:
            raise ValueError("No chunks produced — document too short?")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.decay_sigma = decay_sigma  # default; per-search override allowed
        self.embedding_model_name = embedding_model

        # Embedder: user-supplied callable or default MiniLM
        if embedder is None:
            self._embedder = default_embedder(embedding_model)
        elif callable(embedder):
            self._embedder = wrap_callable(embedder)
        else:
            raise TypeError(f"embedder must be callable, got {type(embedder)}")

        # Embed all chunks (the slow step)
        chunk_texts = [c.text for c in self.chunks]
        self.chunk_embeddings = self._embedder(chunk_texts)

        # Build index
        if quantize:
            self.index = QuantizedIndex(self.chunk_embeddings, rerank_k=quantize_rerank_k)
        else:
            self.index = FlatIndex(self.chunk_embeddings)

    # ---------- alternate constructors ----------

    @classmethod
    def from_text(cls, text: str, **kwargs) -> "Doclighter":
        """Build from a raw string. Same as the default constructor."""
        return cls(text, **kwargs)

    @classmethod
    def from_pdf(cls, path: "PathLike", **kwargs) -> "Doclighter":
        """Build from a PDF file on disk."""
        return cls(extract.from_pdf_path(path), **kwargs)

    @classmethod
    def from_url(cls, url: str, timeout: int = 30, **kwargs) -> "Doclighter":
        """Build from a PDF served over HTTP(S)."""
        return cls(extract.from_url(url, timeout=timeout), **kwargs)

    # ---------- search ----------

    def search(
        self,
        query: Union[str, Sequence[str]],
        *,
        decay_sigma: Optional[float] = None,
        multi_query_aggregate: str = "max",
    ) -> "SearchResult":
        """Search the document and return a SearchResult.

        Parameters
        ----------
        query : str or sequence of str
            A single query, or multiple sub-queries (e.g.
            ``["termination", "indemnification"]``). Multi-query mode runs
            each sub-query and combines the heatmaps.
        decay_sigma : float, optional
            Override the proximity decay scale for this search. Smaller =
            sharper word-level highlights; larger = broader regions.
            Defaults to the value set at construction.
        multi_query_aggregate : {"max", "mean", "sum"}
            How to combine sub-query heatmaps. See ``scoring.aggregate_multi_query``.

        Returns
        -------
        SearchResult

        Raises
        ------
        ValueError
            If the effective ``decay_sigma`` is not positive, or ``query`` is
            an empty sequence.
        """
        t0 = time.perf_counter()
        sigma = self.decay_sigma if decay_sigma is None else decay_sigma
        if not sigma > 0:
            raise ValueError(f"decay_sigma must be > 0, got {sigma}")

        is_multi = not isinstance(query, str)
        queries = list(query) if is_multi else [query]
        if not queries:
            # Fail before the embedder is asked to encode an empty batch.
            raise ValueError("query must contain at least one sub-query")
        q_embs = self._embedder(queries)  # shape (n_queries, dim)

        heatmaps = []
        all_chunk_scores = []
        for q_emb in q_embs:
            chunk_scores = self.index.all_scores(q_emb)
            all_chunk_scores.append(chunk_scores)
            heatmaps.append(
                word_heatmap(
                    chunk_scores=chunk_scores,
                    chunks=self.chunks,
                    n_words=len(self.words),
                    decay_sigma=sigma,
                )
            )

        if is_multi:
            final_heatmap = aggregate_multi_query(heatmaps, mode=multi_query_aggregate)
            # For chunk_scores, use the max across queries (most informative single number)
            final_chunk_scores = np.max(np.stack(all_chunk_scores), axis=0)
        else:
            final_heatmap = heatmaps[0]
            final_chunk_scores = all_chunk_scores[0]

        elapsed_ms = (time.perf_counter() - t0) * 1000
        return SearchResult(
            # Store a real list for multi-query searches (tuples/generators
            # are normalized), matching SearchResult's documented field type.
            query=queries if is_multi else query,
            word_scores=final_heatmap,
            chunk_scores=final_chunk_scores,
            elapsed_ms=elapsed_ms,
            decay_sigma=sigma,
            _words=self.words,
            _chunks=self.chunks,
        )

    # ---------- persistence ----------

    def save(self, path: "PathLike") -> None:
        """Save the indexed document to disk (skip re-embedding next time).

        Note: this saves embeddings and chunk metadata, not the embedder
        itself — when you load, you'll need to be able to embed query
        strings, so the same model must be available. The index type
        (flat vs. quantized) is not stored either; choose it again in ``load``.
        """
        state = {
            "text": self.text,
            "words": self.words,
            "chunks": self.chunks,
            "chunk_embeddings": self.chunk_embeddings,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "decay_sigma": self.decay_sigma,
            "embedding_model_name": self.embedding_model_name,
        }
        with open(path, "wb") as f:
            pickle.dump(state, f)

    @classmethod
    def load(
        cls,
        path: "PathLike",
        *,
        embedder: Optional[Callable] = None,
        quantize: bool = False,
        quantize_rerank_k: int = 200,
    ) -> "Doclighter":
        """Load a saved index. Re-instantiates the embedder for query encoding.

        SECURITY: the file is read with ``pickle``, which can execute
        arbitrary code while unpickling. Only load files you created yourself
        or otherwise trust.

        Parameters
        ----------
        path : str or Path
            File previously written by ``save``.
        embedder : callable, optional
            Custom embedding callable for queries; defaults to
            ``default_embedder`` with the saved model name.
        quantize : bool
            Rebuild the index as ``QuantizedIndex`` instead of ``FlatIndex``.
        quantize_rerank_k : int
            ``rerank_k`` for the quantized index (same default as ``__init__``).

        Raises
        ------
        TypeError
            If ``embedder`` is given but is not callable.
        """
        # Same contract as __init__; checked before touching the file.
        if embedder is not None and not callable(embedder):
            raise TypeError(f"embedder must be callable, got {type(embedder)}")

        with open(path, "rb") as f:
            state = pickle.load(f)  # NOTE: trusted input only, see docstring

        # Bypass __init__ so the document is not re-chunked or re-embedded.
        obj = cls.__new__(cls)
        obj.text = state["text"]
        obj.words = state["words"]
        obj.chunks = state["chunks"]
        obj.chunk_embeddings = state["chunk_embeddings"]
        obj.chunk_size = state["chunk_size"]
        obj.chunk_overlap = state["chunk_overlap"]
        obj.decay_sigma = state["decay_sigma"]
        obj.embedding_model_name = state["embedding_model_name"]

        if embedder is None:
            obj._embedder = default_embedder(obj.embedding_model_name)
        else:
            obj._embedder = wrap_callable(embedder)

        if quantize:
            obj.index = QuantizedIndex(obj.chunk_embeddings, rerank_k=quantize_rerank_k)
        else:
            obj.index = FlatIndex(obj.chunk_embeddings)
        return obj

    # ---------- diagnostics ----------

    def __repr__(self) -> str:
        return (
            f"Doclighter(words={len(self.words)}, chunks={len(self.chunks)}, "
            f"chunk_size={self.chunk_size}, model={self.embedding_model_name!r})"
        )
|
doclighter/embedding.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Embedding backend.
|
|
2
|
+
|
|
3
|
+
Default: sentence-transformers ``all-MiniLM-L6-v2`` (384-dim, fast, ~80MB).
|
|
4
|
+
Users can pass any callable matching ``Embedder`` (e.g. their own model,
|
|
5
|
+
an API call, or a different sentence-transformers checkpoint).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Callable, List, Protocol
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Embedder(Protocol):
    """Structural type for embedding callables.

    Anything callable with a list of texts that returns an ``(N, dim)``
    float32 numpy array satisfies this protocol.

    The returned vectors should be L2-normalized: Doclighter scores with an
    inner product and treats it as cosine similarity, which only holds for
    unit-norm vectors.
    """

    def __call__(self, texts: List[str]) -> np.ndarray:
        ...
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def default_embedder(model_name: str = "all-MiniLM-L6-v2") -> Embedder:
    """Return an embedder backed by a sentence-transformers model.

    The ``SentenceTransformer`` is constructed once, when this factory is
    called, and captured by the returned closure, so repeated calls to the
    embedder reuse the same model object. Encoding uses a fixed batch size
    of 64 and asks the model for normalized embeddings; wrap your own
    callable if you need different settings.
    """
    # Imported lazily so the package can be imported without the dependency.
    from sentence_transformers import SentenceTransformer

    st_model = SentenceTransformer(model_name)

    def encode(texts: List[str]) -> np.ndarray:
        # Only show a progress bar for larger batches (more than 100 texts).
        want_progress = len(texts) > 100
        vectors = st_model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=64,
            convert_to_numpy=True,
            show_progress_bar=want_progress,
        )
        return vectors.astype("float32")

    return encode
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def wrap_callable(fn: Callable[[List[str]], np.ndarray]) -> "Embedder":
    """Wrap a user-supplied callable, ensuring float32 + L2 normalization.

    Parameters
    ----------
    fn : callable
        Takes a list of texts and returns something convertible to a 2-D
        numeric array with one row per text.

    Returns
    -------
    Embedder
        A callable that returns a float32 array of shape ``(len(texts), dim)``
        whose rows have unit L2 norm (all-zero rows are left as zeros).

    Raises
    ------
    ValueError
        Raised by the returned callable when ``fn`` yields a non-2-D result
        or a row count different from the number of input texts.
    """

    def encode(texts: List[str]) -> np.ndarray:
        out = np.asarray(fn(texts), dtype="float32")
        if out.ndim != 2:
            raise ValueError(f"Embedder must return 2D array, got shape {out.shape}")
        # One row per text is what keeps embeddings aligned with chunks and
        # queries; a mismatch would otherwise surface later as wrong scores.
        if out.shape[0] != len(texts):
            raise ValueError(
                f"Embedder must return one row per text: got {out.shape[0]} rows "
                f"for {len(texts)} texts"
            )
        norms = np.linalg.norm(out, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid 0/0; zero vectors stay zero
        return out / norms

    return encode
|
doclighter/extract.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Text loaders for Doclighter.
|
|
2
|
+
|
|
3
|
+
Supports PDFs (local path or URL) and raw text. Kept deliberately small —
|
|
4
|
+
users with exotic formats (docx, html) should convert upstream and pass text.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Union
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def from_pdf_bytes(data: bytes) -> str:
    """Extract text from PDF bytes using pypdf.

    Parameters
    ----------
    data : bytes
        Raw contents of a PDF file.

    Returns
    -------
    str
        The text of every page that yielded any text, joined with newlines.

    Raises
    ------
    RuntimeError
        If no page yields any text (for example an image-only scan).
    """
    from pypdf import PdfReader  # local import keeps base install lean

    reader = PdfReader(BytesIO(data))
    # Extract each page exactly once: text extraction is the expensive step,
    # and the original comprehension ran it twice per page (filter + value).
    extracted = (page.extract_text() for page in reader.pages)
    pages = [page_text for page_text in extracted if page_text]
    if not pages:
        raise RuntimeError(
            "No extractable text in PDF. It may be image-only — "
            "run OCR (e.g. pytesseract) and pass the result to Doclighter.from_text()."
        )
    return "\n".join(pages)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def from_pdf_path(path: Union[str, Path]) -> str:
    """Load the PDF at ``path`` from disk and return its extracted text.

    The file is read fully into memory and handed to ``from_pdf_bytes``.
    """
    pdf_bytes = Path(path).read_bytes()
    return from_pdf_bytes(pdf_bytes)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def from_url(url: str, timeout: int = 30) -> str:
    """Fetch a PDF over HTTP(S) and return its extracted text.

    ``timeout`` is passed straight to ``requests.get``. The response is
    checked with ``raise_for_status()`` before its body is handed to
    ``from_pdf_bytes``.
    """
    import requests  # deferred so the dependency is only needed for URL loading

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    pdf_bytes = response.content
    return from_pdf_bytes(pdf_bytes)
|
doclighter/index.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Vector index.
|
|
2
|
+
|
|
3
|
+
Default: flat exact inner-product (fast enough for documents up to ~50K
|
|
4
|
+
chunks, which covers virtually all single-document use cases — a 500-page
|
|
5
|
+
book at 12-word chunks is ~30K chunks).
|
|
6
|
+
|
|
7
|
+
Opt-in: SQ8 scalar-quantized index for very large docs or memory-constrained
|
|
8
|
+
environments. Trades a tiny bit of ranking precision for 4x memory savings.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Tuple
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FlatIndex:
    """Exact inner-product search. Trivial, fast, no dependencies beyond numpy."""

    def __init__(self, embeddings: np.ndarray):
        """Store a float32 copy of ``embeddings``.

        Parameters
        ----------
        embeddings : array-like, shape (n_vectors, dim)
            One row per indexed vector.

        Raises
        ------
        ValueError
            If ``embeddings`` is not 2-D.
        """
        matrix = np.array(embeddings, dtype="float32")  # always a private copy
        if matrix.ndim != 2:
            raise ValueError(
                f"embeddings must be 2D (n_vectors, dim), got shape {matrix.shape}"
            )
        self.embeddings = matrix
        self.ntotal = matrix.shape[0]
        self.dim = matrix.shape[1]

    def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return (scores, indices), both shape (k,), sorted descending.

        ``k`` is capped at the number of indexed vectors. A non-positive
        ``k`` (or an empty index) returns two empty arrays instead of
        mis-slicing the partition.
        """
        scores = self.all_scores(query)
        k = min(int(k), self.ntotal)
        if k <= 0:
            return scores[:0], np.empty(0, dtype=np.intp)
        # argpartition is O(n); full sort only on the top-k slice
        top_idx = np.argpartition(-scores, k - 1)[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]
        return scores[top_idx], top_idx

    def all_scores(self, query: np.ndarray) -> np.ndarray:
        """Exact scores for every vector — used for full word-heatmap rendering."""
        return self.embeddings @ query.reshape(-1)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class QuantizedIndex:
    """SQ8 + exact rerank. ~4x smaller in memory, near-identical ranking.

    Only useful for very large documents; for the default visualization use
    case, FlatIndex is simpler and the speed difference is invisible.
    """

    def __init__(self, embeddings: np.ndarray, rerank_k: int = 200):
        """Train and fill an 8-bit scalar-quantized faiss index.

        Parameters
        ----------
        embeddings : np.ndarray, shape (n_vectors, dim)
            Vectors to index; a float32 copy is kept for exact reranking.
        rerank_k : int
            Minimum number of quantized candidates fetched per search before
            the exact rerank.
        """
        import faiss  # local import — only required for this path

        self.embeddings = embeddings.astype("float32")  # kept for exact rerank
        self.ntotal = embeddings.shape[0]
        self.dim = embeddings.shape[1]
        self.rerank_k = rerank_k

        self._sq = faiss.IndexScalarQuantizer(
            self.dim, faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_INNER_PRODUCT
        )
        self._sq.train(self.embeddings)
        self._sq.add(self.embeddings)

    def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return (scores, indices) of the top-k vectors, sorted descending.

        Candidates come from the quantized index and are re-scored exactly
        against the stored float32 embeddings. ``k`` is capped at the number
        of indexed vectors; a non-positive ``k`` returns two empty arrays.
        """
        k = min(int(k), self.ntotal)
        if k <= 0:
            # Without this guard a negative k would slice "all but the last
            # |k|" candidates below.
            return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int64)

        q = query.reshape(1, -1).astype("float32")
        cand_k = min(max(k, self.rerank_k), self.ntotal)
        _, cand_idx = self._sq.search(q, cand_k)
        cand_idx = cand_idx[0]
        # faiss marks unfilled result slots with -1; drop them so they cannot
        # silently address the last embedding row.
        cand_idx = cand_idx[cand_idx >= 0]
        # Exact rerank
        exact_scores = self.embeddings[cand_idx] @ q[0]
        order = np.argsort(-exact_scores)[:k]
        return exact_scores[order], cand_idx[order]

    def all_scores(self, query: np.ndarray) -> np.ndarray:
        """Exact scores everywhere — we keep float32 embeddings for this."""
        return self.embeddings @ query.reshape(-1)
|
doclighter/render.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""HTML rendering of word-level heatmaps.
|
|
2
|
+
|
|
3
|
+
The renderer is deliberately decoupled from Jupyter — ``to_html()`` returns
|
|
4
|
+
a string. Jupyter users wrap with ``display(HTML(...))``; Streamlit users
|
|
5
|
+
pass to ``st.markdown(..., unsafe_allow_html=True)``; FastAPI users return
|
|
6
|
+
it from a route.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import html
|
|
11
|
+
from typing import List, Tuple
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
# Smooth gradient: grey -> indigo -> cyan -> yellow -> orange -> red
# Tuples are (score_threshold, (r, g, b)) — Tailwind-esque palette that
# reads well on both light and dark backgrounds.
# Ordered from hottest to coldest; score_to_hex relies on this order.
_COLOR_STOPS: List[Tuple[float, Tuple[int, int, int]]] = [
    (1.00, (220, 38, 38)),  # red-600 — core match
    (0.80, (249, 115, 22)),  # orange-500
    (0.60, (234, 179, 8)),  # yellow-500
    (0.40, (6, 182, 212)),  # cyan-500
    (0.20, (79, 70, 229)),  # indigo-600
    (0.00, (156, 163, 175)),  # gray-400 — unrelated
]


def score_to_hex(score: float) -> str:
    """Interpolate a score in [0, 1] to a hex color along the gradient.

    Parameters
    ----------
    score : float
        Relevance score. Values outside [0, 1] are clamped; NaN is treated
        as 0 (coldest color) rather than being clamped to the hottest one.

    Returns
    -------
    str
        Color as ``#rrggbb``, linearly interpolated between the two
        neighbouring entries of ``_COLOR_STOPS`` (channels truncated to int).
    """
    # NaN is the only value unequal to itself. Every comparison with NaN is
    # False, so min(1.0, nan) would return 1.0 and paint NaN as a core match.
    if score != score:
        s = 0.0
    else:
        s = max(0.0, min(1.0, score))

    for (hi_s, hi_c), (lo_s, lo_c) in zip(_COLOR_STOPS, _COLOR_STOPS[1:]):
        if s < lo_s:
            continue
        # Position of s inside the [lo_s, hi_s] segment, 0 at the cold end.
        t = (s - lo_s) / (hi_s - lo_s) if hi_s > lo_s else 0.0
        r, g, b = (int(lo + t * (hi - lo)) for lo, hi in zip(lo_c, hi_c))
        return f"#{r:02x}{g:02x}{b:02x}"
    return "#9ca3af"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _legend_html(n_words: int, decay_sigma: float, elapsed_ms: float) -> str:
    """Build the legend strip shown above the rendered document.

    The strip contains one colored square per reference score (1.0 down to
    0.0 in steps of 0.2), a "hot → cold" hint, and a summary of the search
    latency, the word count and the sigma used.
    """
    reference_scores = (1.0, 0.8, 0.6, 0.4, 0.2, 0.0)
    squares = [
        f'<span style="color:{score_to_hex(s)}">■</span>' for s in reference_scores
    ]
    swatches = " → ".join(squares)

    pieces = [
        '<div style="font-family:monospace;margin-bottom:8px;font-size:0.9em">',
        f'<b>Relevance:</b> {swatches} ',
        '<span style="color:#666">(hot → cold)</span>',
        f' <span style="color:#888;font-size:0.85em">',
        f'({elapsed_ms:.1f} ms · {n_words} words · σ={decay_sigma})</span>',
        '</div>',
    ]
    return "".join(pieces)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def render_html(
    words: List[str],
    word_scores: np.ndarray,
    query: str,
    decay_sigma: float,
    elapsed_ms: float,
    quantize_levels: int = 64,
    max_height_px: int = 540,
) -> str:
    """Render a heatmap-colored document as an HTML string.

    Parameters
    ----------
    words : list of str
        The full document, tokenized.
    word_scores : np.ndarray, shape (n_words,), values in [0, 1]
        Per-word relevance. NaN is rendered as 0 and out-of-range values are
        clamped to [0, 1].
    query : str
        Shown in the result header.
    decay_sigma : float
        Shown in the legend (informational).
    elapsed_ms : float
        Search latency in milliseconds, shown in the legend.
    quantize_levels : int
        Group consecutive words of similar score into a single <span> to
        keep HTML compact. Higher = finer color gradient, larger HTML.
        Must be >= 1.
    max_height_px : int
        Scroll the document container if it exceeds this height.

    Returns
    -------
    str
        A self-contained HTML fragment. Safe to embed; words are escaped.

    Raises
    ------
    ValueError
        If ``quantize_levels`` is below 1, or ``word_scores`` has fewer
        entries than ``words``.
    """
    if quantize_levels < 1:
        raise ValueError(f"quantize_levels must be >= 1, got {quantize_levels}")
    scores = np.asarray(word_scores, dtype=np.float64).reshape(-1)
    if scores.size < len(words):
        raise ValueError(
            f"word_scores has {scores.size} entries but there are {len(words)} words"
        )

    # Sanitize before quantizing: int() on NaN/inf would raise, and the color
    # scale is only defined on [0, 1].
    scores = np.clip(np.nan_to_num(scores[: len(words)], nan=0.0), 0.0, 1.0)
    # Quantize scores so consecutive words of similar warmth share a span.
    # Truncation toward zero matches int(score * quantize_levels).
    levels = (scores * quantize_levels).astype(np.int64).tolist()

    spans: List[str] = []

    def flush(level: int, run: List[str]) -> None:
        """Emit one colored <span> for a run of words sharing ``level``."""
        col = score_to_hex(level / quantize_levels)
        spans.append(
            f'<span style="color:{col}">{html.escape(" ".join(run))}</span>'
        )

    run_level = None
    run_words: List[str] = []
    for word, level in zip(words, levels):
        if run_words and level == run_level:
            run_words.append(word)
            continue
        if run_words:
            flush(run_level, run_words)
        run_level, run_words = level, [word]
    if run_words:
        flush(run_level, run_words)  # trailing run

    header = f'<h4 style="margin-bottom:4px">Results for: <em>{html.escape(query)}</em></h4>'
    legend = _legend_html(len(words), decay_sigma, elapsed_ms)
    doc = (
        f'<div style="font-family:Georgia,serif;font-size:0.95em;line-height:1.8;'
        f'white-space:pre-wrap;border:1px solid #ddd;border-radius:6px;'
        f'padding:16px;max-height:{max_height_px}px;overflow-y:auto">'
        + " ".join(spans)
        + '</div>'
    )
    return header + legend + doc
|
doclighter/scoring.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Proximity-decayed word-level scoring.
|
|
2
|
+
|
|
3
|
+
This is the algorithmic heart of Doclighter. Given per-chunk semantic scores,
|
|
4
|
+
we compute a per-word relevance score by letting each chunk's score "radiate"
|
|
5
|
+
outward with exponential decay:
|
|
6
|
+
|
|
7
|
+
word_score[w] = max over chunks c of raw[c] * exp(-distance(w, c) / sigma)
|
|
8
|
+
|
|
9
|
+
Distance is measured in words to the nearest edge of the chunk (0 if inside).
|
|
10
|
+
Max-aggregation (rather than sum) means a word's color reflects its single
|
|
11
|
+
strongest semantic neighbor, which matches the visual intuition: a word is
|
|
12
|
+
"hot" if any nearby region is hot, not because many lukewarm regions sum up.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import List
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from .chunking import Chunk
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def word_heatmap(
    chunk_scores: np.ndarray,
    chunks: "List[Chunk]",
    n_words: int,
    decay_sigma: float = 20.0,
    threshold_frac: float = 0.05,
) -> np.ndarray:
    """Spread chunk scores into per-word scores via exponential decay.

    Parameters
    ----------
    chunk_scores : np.ndarray, shape (n_chunks,)
        Semantic similarity score per chunk. Negatives are clipped to 0
        (irrelevant regions shouldn't radiate warmth).
    chunks : list of Chunk
        Word-span metadata for each chunk.
    n_words : int
        Total number of words in the document.
    decay_sigma : float
        Decay length scale in words; must be > 0. exp(-d/sigma): at d=sigma,
        weight = 0.37; at d=2*sigma, weight = 0.14. Small sigma (~5) = sharp
        word-level highlights. Large sigma (~100) = broad thematic regions.
    threshold_frac : float
        Chunks scoring below ``threshold_frac * max_score`` are skipped to
        save work.

    Returns
    -------
    np.ndarray, shape (n_words,), values in [0, 1]
        Per-word relevance score, normalized so max = 1. All zeros when there
        are no chunks or no chunk has a positive score.

    Raises
    ------
    ValueError
        If ``decay_sigma`` is not positive, or ``chunk_scores`` and ``chunks``
        differ in length.
    """
    # sigma == 0 divides by zero (NaN inside chunks) and sigma < 0 makes the
    # influence grow with distance; `not x > 0` also rejects NaN.
    if not decay_sigma > 0:
        raise ValueError(f"decay_sigma must be > 0, got {decay_sigma}")

    raw = np.maximum(np.asarray(chunk_scores), 0.0).astype(np.float32)
    if raw.size != len(chunks):
        raise ValueError(
            f"chunk_scores has {raw.size} entries but there are {len(chunks)} chunks"
        )

    word_scores = np.zeros(n_words, dtype=np.float32)
    # .max() raises on empty arrays, so handle the degenerate shapes first.
    if n_words == 0 or raw.size == 0:
        return word_scores
    peak = float(raw.max())
    if peak == 0:
        return word_scores

    threshold = peak * threshold_frac
    word_idx = np.arange(n_words, dtype=np.float32)

    for score, chunk in zip(raw, chunks):
        if score < threshold:
            continue
        # Distance from each word to nearest edge of this chunk
        # (0 inside the chunk, positive outside). `end` is exclusive, so the
        # last word inside the chunk is at index end - 1.
        dist = np.where(
            word_idx < chunk.start,
            chunk.start - word_idx,
            np.where(word_idx >= chunk.end, word_idx - (chunk.end - 1), 0.0),
        )
        influence = score * np.exp(-dist / decay_sigma)
        # Max-aggregation: a word takes its single strongest neighbor.
        np.maximum(word_scores, influence, out=word_scores)

    mx = word_scores.max()
    if mx > 0:
        word_scores /= mx
    return word_scores
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def aggregate_multi_query(
    per_query_heatmaps: List[np.ndarray],
    mode: str = "max",
) -> np.ndarray:
    """Merge the word-level heatmaps of several sub-queries into one.

    Parameters
    ----------
    per_query_heatmaps : list of np.ndarray
        One heatmap per sub-query. They are stacked along a new leading
        axis, so they must all share one shape; the intended input is
        shape (n_words,) with values in [0, 1].
    mode : {"max", "mean", "sum"}
        Reduction applied across the sub-query axis.
        "max"  — each word keeps its strongest single match (default).
        "mean" — each word gets the average over the sub-queries.
        "sum"  — each word gets the total over the sub-queries.
        The reduced map is rescaled to a peak of 1 in every mode, so
        "mean" and "sum" yield the same output up to floating-point
        rounding.

    Returns
    -------
    np.ndarray
        float32 array with the shape of a single input heatmap. It is
        divided by its maximum whenever that maximum is positive; an
        all-zero result is returned as is.

    Raises
    ------
    ValueError
        If ``per_query_heatmaps`` is empty, or ``mode`` is not one of
        the supported names.
    """
    if not per_query_heatmaps:
        raise ValueError("per_query_heatmaps must be non-empty")

    # Axis 0 indexes the sub-query, axis 1 the word.
    layers = np.stack(per_query_heatmaps, axis=0)

    reducers = (
        ("max", np.max),
        ("mean", np.mean),
        ("sum", np.sum),
    )
    for label, reduce_fn in reducers:
        if mode == label:
            merged = reduce_fn(layers, axis=0)
            break
    else:
        # No reducer matched the requested mode.
        raise ValueError(f"mode must be 'max', 'mean', or 'sum'; got {mode!r}")

    # Rescale so the hottest word sits at exactly 1; skip when nothing matched
    # to avoid dividing by zero.
    peak = merged.max()
    if peak > 0:
        merged = merged / peak
    return merged.astype(np.float32)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doclighter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A semantic Ctrl+F that paints your document with a relevance gradient.
|
|
5
|
+
Author-email: Pratyush <pratyush272@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/pratyush272/doclighter
|
|
8
|
+
Project-URL: Repository, https://github.com/pratyush272/doclighter
|
|
9
|
+
Project-URL: Issues, https://github.com/pratyush272/doclighter/issues
|
|
10
|
+
Keywords: semantic-search,embeddings,visualization,rag,nlp,pdf
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: numpy>=1.21
|
|
27
|
+
Requires-Dist: sentence-transformers>=2.2
|
|
28
|
+
Requires-Dist: pypdf>=4.0
|
|
29
|
+
Requires-Dist: requests>=2.25
|
|
30
|
+
Provides-Extra: quantize
|
|
31
|
+
Requires-Dist: faiss-cpu>=1.7; extra == "quantize"
|
|
32
|
+
Provides-Extra: streamlit
|
|
33
|
+
Requires-Dist: streamlit>=1.28; extra == "streamlit"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
38
|
+
Provides-Extra: all
|
|
39
|
+
Requires-Dist: doclighter[dev,quantize,streamlit]; extra == "all"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# doclighter
|
|
43
|
+
|
|
44
|
+
[![Tests](https://github.com/pratyush272/doclighter/actions/workflows/test.yml/badge.svg)](https://github.com/pratyush272/doclighter/actions/workflows/test.yml)
|
|
45
|
+
[![PyPI version](https://img.shields.io/pypi/v/doclighter.svg)](https://pypi.org/project/doclighter/)
|
|
46
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/doclighter.svg)](https://pypi.org/project/doclighter/)
|
|
47
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
48
|
+
|
|
49
|
+
**A semantic Ctrl+F that paints your document with a relevance gradient.**
|
|
50
|
+
|
|
51
|
+
`doclighter` is what you reach for when you need to *see* where a topic lives in a document — not be told an answer. It embeds your document at fine granularity, then projects query relevance back onto every word as a heatmap. No LLM, no hallucination, no top-K cliff.
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from doclighter import Doclighter
|
|
55
|
+
|
|
56
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
57
|
+
result = doc.search("termination clauses")
|
|
58
|
+
result # In Jupyter: renders the whole document, color-coded by relevance
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Why this exists
|
|
62
|
+
|
|
63
|
+
Traditional RAG hands an LLM the top 3–10 chunks and asks it to generate an answer. That's great when you trust the answer and don't need to read the source. But sometimes you *need to read the source* — legal review, paper skimming, contract diffing, due diligence — and you want a tool that helps you *navigate* a long document, not summarize it away.
|
|
64
|
+
|
|
65
|
+
`doclighter` is for that. It treats the whole document as the output, and re-colors it by semantic relevance to your query. Hot regions deserve your attention; cold regions you can skim past.
|
|
66
|
+
|
|
67
|
+
It's deterministic, fast (sub-100ms per query after indexing), and shows you the long tail — including the case where your query *doesn't* match anything (everything stays cold blue, which is itself useful information that RAG hides).
|
|
68
|
+
|
|
69
|
+
## Install
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install doclighter
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Optional extras:
|
|
76
|
+
```bash
|
|
77
|
+
pip install "doclighter[quantize]" # FAISS SQ8 index for very large docs
|
|
78
|
+
pip install "doclighter[streamlit]" # for the interactive demo app
|
|
79
|
+
pip install "doclighter[dev]" # for contributors
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Quickstart
|
|
83
|
+
|
|
84
|
+
### Load a document
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from doclighter import Doclighter
|
|
88
|
+
|
|
89
|
+
# From a PDF on disk
|
|
90
|
+
doc = Doclighter.from_pdf("contract.pdf")
|
|
91
|
+
|
|
92
|
+
# From a PDF URL
|
|
93
|
+
doc = Doclighter.from_url("https://example.com/contract.pdf")
|
|
94
|
+
|
|
95
|
+
# From raw text
|
|
96
|
+
doc = Doclighter.from_text(open("paper.txt").read())
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The first call downloads the default embedding model (~80 MB MiniLM) and embeds your document. For a ~10K word doc this takes ~25 seconds. Subsequent searches reuse the index.
|
|
100
|
+
|
|
101
|
+
### Search
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
result = doc.search("termination clauses")
|
|
105
|
+
|
|
106
|
+
result.word_scores # numpy array, shape (n_words,), values in [0, 1]
|
|
107
|
+
result.top_chunks(k=10) # list of (chunk_text, score, (start, end))
|
|
108
|
+
result.elapsed_ms # ~10-50ms for typical docs
|
|
109
|
+
result.to_html() # HTML string for display anywhere
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
In Jupyter, just put `result` on the last line of a cell — it renders the heatmap inline.
|
|
113
|
+
|
|
114
|
+
### Zoom: the `decay_sigma` knob
|
|
115
|
+
|
|
116
|
+
The differentiating feature. `decay_sigma` controls how far semantic warmth spreads from a matched region:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
narrow = doc.search("termination", decay_sigma=5.0) # sharp word-level highlights
|
|
120
|
+
broad = doc.search("termination", decay_sigma=80.0) # broad thematic regions
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Same index, no re-embedding. Drag the σ slider in the Streamlit demo to feel what this does.
|
|
124
|
+
|
|
125
|
+
### Multi-query
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
result = doc.search(
|
|
129
|
+
["termination", "indemnification", "labour wages"],
|
|
130
|
+
multi_query_aggregate="max", # or "sum" to favor regions matching multiple sub-queries
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Save / load the index
|
|
135
|
+
|
|
136
|
+
Embedding is the slow step. Save once, reuse:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
doc.save("contract.idx")
|
|
140
|
+
doc = Doclighter.load("contract.idx")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Bring your own embedder
|
|
144
|
+
|
|
145
|
+
Any callable mapping `list[str] -> np.ndarray` of shape `(N, dim)` works:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from sentence_transformers import SentenceTransformer
|
|
149
|
+
|
|
150
|
+
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
|
151
|
+
doc = Doclighter.from_text(text, embedder=bge.encode)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Streamlit demo
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pip install "doclighter[streamlit]"
|
|
158
|
+
streamlit run examples/streamlit_app.py
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
A working UI with PDF upload, query box, σ slider, and live re-rendering.
|
|
162
|
+
|
|
163
|
+
## How it works
|
|
164
|
+
|
|
165
|
+
1. **Chunk** the document into small rolling windows (default: 12 words, 50% overlap).
|
|
166
|
+
2. **Embed** each chunk with sentence-transformers (default: `all-MiniLM-L6-v2`).
|
|
167
|
+
3. **Score** each chunk against your query via cosine similarity.
|
|
168
|
+
4. **Project** chunk scores back onto every word via exponential proximity decay:
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
word_score[w] = max over chunks c of raw[c] × exp(-distance(w, c) / sigma)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
5. **Render** the document as colored HTML — words inherit warmth from their nearest semantically matched chunk.
|
|
175
|
+
|
|
176
|
+
Step 4 is the interesting one. Max-aggregation (rather than sum) means a word's color reflects its single strongest semantic neighbor — visually intuitive and resistant to "many lukewarm chunks add up to red" noise.
|
|
177
|
+
|
|
178
|
+
## How this compares to RAG
|
|
179
|
+
|
|
180
|
+
`doclighter` and RAG solve different problems:
|
|
181
|
+
|
|
182
|
+
| | RAG | doclighter |
|
|
183
|
+
|---|---|---|
|
|
184
|
+
| Output | LLM-generated answer | Document, recolored |
|
|
185
|
+
| Best for | "What's the answer?" | "Where in the doc?" |
|
|
186
|
+
| When query has no answer | LLM hedges / hallucinates | Document stays cold (honest) |
|
|
187
|
+
| Hides chunk boundaries | No — top-K cliff | Yes — gradient smooths over |
|
|
188
|
+
| Cost per query | LLM tokens | Free (one matmul) |
|
|
189
|
+
| Determinism | Sampling-dependent | Fully deterministic |
|
|
190
|
+
|
|
191
|
+
They're complementary. Use RAG when you trust the LLM with the question; use `doclighter` when you need to read the source yourself.
|
|
192
|
+
|
|
193
|
+
The algorithmic kernel is conceptually related to [ColBERT](https://github.com/stanford-futuredata/ColBERT)'s MaxSim late-interaction, but applied to *visualization* rather than ranking, and at the word level rather than the token level.
|
|
194
|
+
|
|
195
|
+
## API reference
|
|
196
|
+
|
|
197
|
+
### `Doclighter(text, **kwargs)`
|
|
198
|
+
|
|
199
|
+
| Parameter | Default | Description |
|
|
200
|
+
|---|---|---|
|
|
201
|
+
| `text` | required | The document as a string |
|
|
202
|
+
| `chunk_size` | `12` | Words per rolling window |
|
|
203
|
+
| `chunk_overlap` | `0.5` | Fraction overlap between windows |
|
|
204
|
+
| `embedder` | `None` | Custom embedder callable (default: MiniLM) |
|
|
205
|
+
| `embedding_model` | `"all-MiniLM-L6-v2"` | sentence-transformers model name |
|
|
206
|
+
| `decay_sigma` | `20.0` | Default proximity decay scale (words) |
|
|
207
|
+
| `quantize` | `False` | Use an SQ8 FAISS index instead of the flat exact index |
|
|
208
|
+
| `quantize_rerank_k` | `200` | If `quantize=True`, rerank the top-K candidates with exact scores |
|
|
209
|
+
|
|
210
|
+
Alternate constructors: `Doclighter.from_text(...)`, `Doclighter.from_pdf(path, ...)`, `Doclighter.from_url(url, ...)`.
|
|
211
|
+
|
|
212
|
+
### `doc.search(query, **kwargs) -> SearchResult`
|
|
213
|
+
|
|
214
|
+
| Parameter | Default | Description |
|
|
215
|
+
|---|---|---|
|
|
216
|
+
| `query` | required | A string, or list of strings for multi-query |
|
|
217
|
+
| `decay_sigma` | `None` | Override the doc's default sigma |
|
|
218
|
+
| `multi_query_aggregate` | `"max"` | `"max"`, `"mean"`, or `"sum"` for multi-query |
|
|
219
|
+
|
|
220
|
+
### `SearchResult`
|
|
221
|
+
|
|
222
|
+
| Attribute / method | Returns |
|
|
223
|
+
|---|---|
|
|
224
|
+
| `.word_scores` | `np.ndarray` of shape `(n_words,)`, in `[0, 1]` |
|
|
225
|
+
| `.chunk_scores` | `np.ndarray` of raw per-chunk cosine scores |
|
|
226
|
+
| `.top_chunks(k=10)` | `list[(text, score, (start, end))]` |
|
|
227
|
+
| `.to_html(**kwargs)` | HTML string of the heatmap-colored document |
|
|
228
|
+
| `.elapsed_ms` | Search latency |
|
|
229
|
+
|
|
230
|
+
## Development
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
git clone https://github.com/pratyush272/doclighter
|
|
234
|
+
cd doclighter
|
|
235
|
+
pip install -e ".[dev]"
|
|
236
|
+
pytest
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## License
|
|
240
|
+
|
|
241
|
+
MIT. See [LICENSE](LICENSE).
|
|
242
|
+
|
|
243
|
+
## Citation / acknowledgement
|
|
244
|
+
|
|
245
|
+
If you use `doclighter` in research, a link back is appreciated. The proximity-decay scoring idea borrows from passage-retrieval literature; max-aggregation over fine-grained matches is in spirit closest to ColBERT.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
doclighter/__init__.py,sha256=kzREXwu7PcSRSSEOKxefkl7wqq3Tf92iRZ8GyrJ8aws,522
|
|
2
|
+
doclighter/chunking.py,sha256=Zaicqk9eNpkPlulCJPB_TkIOGCSag7p6bLoEG9jN42o,1823
|
|
3
|
+
doclighter/core.py,sha256=rz2ZOIB9FY1ibtZnn5qvWZCasy-KpD2wOpfOGA0chso,10271
|
|
4
|
+
doclighter/embedding.py,sha256=hd2e5K75HvBC0BM83z6gwjMqXwuoS6tgDzRffknNpE0,1816
|
|
5
|
+
doclighter/extract.py,sha256=NZoHDD8VaeqUeuw6Mszb91Qp375k-5z9gNsf54lPKDo,1201
|
|
6
|
+
doclighter/index.py,sha256=AHCS4w3FR-zq8JEfzXMF81CToswkuZJkk7ffqSElnnk,2830
|
|
7
|
+
doclighter/render.py,sha256=fzkqYC7tfyW6SgtUeG9VPnc4Q-5yamWJUpTeOyh_M80,4473
|
|
8
|
+
doclighter/scoring.py,sha256=MZGGIFuWK3nW3hGpX38aBUjiDa9uq1nBe1WXMLzZXwI,4048
|
|
9
|
+
doclighter-0.1.0.dist-info/licenses/LICENSE,sha256=k412KnI3imf_ScR1LlhRFjnHFq62gZSbHO88cwa_Ljk,1065
|
|
10
|
+
doclighter-0.1.0.dist-info/METADATA,sha256=V3Pjm6w2MtLzEQsHi3RuqF4Eb5yu1jDxeZNNPr2heYc,9401
|
|
11
|
+
doclighter-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
doclighter-0.1.0.dist-info/top_level.txt,sha256=Sd9T52y2LYw0Vz5aIRk_C3rD6JAktygliw5KakMp4TA,11
|
|
13
|
+
doclighter-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pratyush
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
doclighter
|