doclighter 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doclighter/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """Doclighter — a semantic Ctrl+F that paints your document with relevance.
2
+
3
+ See https://github.com/pratyush272/doclighter for docs.
4
+ """
5
+ from .core import Doclighter, SearchResult
6
+ from .chunking import Chunk, make_chunks
7
+ from .scoring import word_heatmap, aggregate_multi_query
8
+ from .render import render_html, score_to_hex
9
+
10
+ __version__ = "0.1.0"
11
+
12
+ __all__ = [
13
+ "Doclighter",
14
+ "SearchResult",
15
+ "Chunk",
16
+ "make_chunks",
17
+ "word_heatmap",
18
+ "aggregate_multi_query",
19
+ "render_html",
20
+ "score_to_hex",
21
+ ]
doclighter/chunking.py ADDED
@@ -0,0 +1,58 @@
1
+ """Rolling window chunking.
2
+
3
+ Small word-window chunks (default 12 words, 50% overlap) are the unit of
4
+ semantic match. This is deliberately finer than typical RAG chunking
5
+ (256-1024 tokens) because Doclighter is a visualization tool, not a
6
+ context-window filler — fine chunks give fine spatial resolution.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from typing import List
12
+
13
+
14
@dataclass(frozen=True)
class Chunk:
    """A single rolling window over the document word list."""

    text: str  # the window's words joined with single spaces
    start: int  # inclusive word index
    end: int  # exclusive word index


def make_chunks(
    words: List[str],
    chunk_size: int = 12,
    overlap: float = 0.5,
) -> List[Chunk]:
    """Split a word list into rolling windows.

    Parameters
    ----------
    words : list of str
        Tokenized document (typically ``text.split()``).
    chunk_size : int
        Maximum number of words per window. Default 12.
    overlap : float
        Fraction of overlap between consecutive windows, in [0, 1).
        Default 0.5 means 50% overlap (step = chunk_size / 2). The step is
        floored to a whole number of words and is never smaller than 1.

    Returns
    -------
    list of Chunk
        Windows in document order. The final window may be shorter than
        ``chunk_size``. An empty ``words`` list yields an empty list.

    Raises
    ------
    ValueError
        If ``overlap`` is outside [0, 1) or ``chunk_size`` is below 1.
    """
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must be in [0, 1), got {overlap}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # The tiny epsilon guards against binary float error before flooring:
    # 20 * (1 - 0.9) evaluates to 1.9999999999999996, which would otherwise
    # truncate to a step of 1 instead of the intended 2.
    step = max(1, int(chunk_size * (1 - overlap) + 1e-9))

    n_words = len(words)
    chunks: List[Chunk] = []
    for i in range(0, n_words, step):
        window = words[i : i + chunk_size]
        chunks.append(Chunk(text=" ".join(window), start=i, end=i + len(window)))
        if i + chunk_size >= n_words:
            break  # this window already reaches the end of the document
    return chunks
doclighter/core.py ADDED
@@ -0,0 +1,297 @@
1
+ """The main Doclighter API.
2
+
3
+ Typical usage::
4
+
5
+ from doclighter import Doclighter
6
+
7
+ doc = Doclighter.from_pdf("contract.pdf")
8
+ result = doc.search("termination clauses")
9
+
10
+ # In Jupyter:
11
+ from IPython.display import HTML, display
12
+ display(HTML(result.to_html()))
13
+
14
+ # Anywhere else:
15
+ print(result.top_chunks(k=5))
16
+ scores = result.word_scores # numpy array, shape (n_words,)
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import pickle
21
+ import time
22
+ from dataclasses import dataclass, field
23
+ from pathlib import Path
24
+ from typing import Callable, List, Optional, Sequence, Tuple, Union
25
+
26
+ import numpy as np
27
+
28
+ from . import extract
29
+ from .chunking import Chunk, make_chunks
30
+ from .embedding import Embedder, default_embedder, wrap_callable
31
+ from .index import FlatIndex, QuantizedIndex
32
+ from .render import render_html
33
+ from .scoring import aggregate_multi_query, word_heatmap
34
+
35
+ PathLike = Union[str, Path]
36
+
37
+
38
@dataclass
class SearchResult:
    """The output of a single Doclighter search.

    Attributes
    ----------
    query : str or list of str
        The query that produced this result. List form for multi-query searches.
    word_scores : np.ndarray, shape (n_words,)
        Per-word relevance in [0, 1]. The thing you'd visualize.
    chunk_scores : np.ndarray, shape (n_chunks,)
        Raw similarity per chunk (before proximity smoothing).
    elapsed_ms : float
        Search latency, including all postprocessing.
    decay_sigma : float
        The sigma used for this search (recorded for reproducibility).
    """

    query: Union[str, List[str]]
    word_scores: np.ndarray
    chunk_scores: np.ndarray
    elapsed_ms: float
    decay_sigma: float
    _words: List[str] = field(repr=False)
    _chunks: "List[Chunk]" = field(repr=False)

    def top_chunks(self, k: int = 10) -> List[Tuple[str, float, Tuple[int, int]]]:
        """Return the top-k chunks by raw semantic score, best first.

        Parameters
        ----------
        k : int
            Maximum number of chunks to return. Values <= 0 yield an empty
            list (a negative ``k`` used as a slice bound would otherwise
            silently drop entries from the tail instead).

        Returns
        -------
        list of ``(chunk_text, score, (start_word, end_word))``
            Chunks with equal scores keep their document order.
        """
        if k <= 0:
            return []
        # A stable sort makes the ordering of tied scores deterministic.
        order = np.argsort(-self.chunk_scores, kind="stable")[:k]
        results = []
        for i in order:
            chunk = self._chunks[int(i)]
            results.append(
                (chunk.text, float(self.chunk_scores[i]), (chunk.start, chunk.end))
            )
        return results

    def to_html(self, **render_kwargs) -> str:
        """Render the heatmap as an HTML fragment (string).

        Extra kwargs are forwarded to ``doclighter.render.render_html``
        (e.g. ``max_height_px``, ``quantize_levels``).
        """
        # Multi-query results show their sub-queries joined with " | ".
        query_str = self.query if isinstance(self.query, str) else " | ".join(self.query)
        return render_html(
            words=self._words,
            word_scores=self.word_scores,
            query=query_str,
            decay_sigma=self.decay_sigma,
            elapsed_ms=self.elapsed_ms,
            **render_kwargs,
        )

    def _repr_html_(self) -> str:
        """Rich-display hook: returns the same markup as ``to_html()``."""
        return self.to_html()
95
+
96
+
97
class Doclighter:
    """A searchable, visualizable view of a single document.

    Build once (embedding every chunk is the slow step), then search many
    times against the stored chunk embeddings.

    Parameters are mostly set at build time; ``decay_sigma`` can be overridden
    per-search, which lets you toggle between fine word-level highlights
    (small sigma) and broad thematic regions (large sigma) without re-indexing.
    """

    def __init__(
        self,
        text: str,
        *,
        chunk_size: int = 12,
        chunk_overlap: float = 0.5,
        embedder: "Optional[Union[Embedder, Callable]]" = None,
        embedding_model: str = "all-MiniLM-L6-v2",
        decay_sigma: float = 20.0,
        quantize: bool = False,
        quantize_rerank_k: int = 200,
    ):
        """Chunk, embed and index ``text``.

        Parameters
        ----------
        text : str
            The document. Split on whitespace into words.
        chunk_size, chunk_overlap
            Forwarded to ``make_chunks``.
        embedder : callable, optional
            Custom embedding callable; wrapped with ``wrap_callable``. When
            omitted, ``default_embedder(embedding_model)`` is used.
        embedding_model : str
            Model name for the default embedder; also recorded on the instance.
        decay_sigma : float
            Default proximity decay used by ``search``.
        quantize : bool
            Build a ``QuantizedIndex`` instead of a ``FlatIndex``.
        quantize_rerank_k : int
            ``rerank_k`` passed to ``QuantizedIndex`` when ``quantize`` is set.

        Raises
        ------
        ValueError
            If ``text`` is empty/whitespace or no chunks are produced.
        TypeError
            If ``embedder`` is given but is not callable.
        """
        if not text or not text.strip():
            raise ValueError("text is empty — nothing to index")

        self.text = text
        self.words = text.split()
        self.chunks = make_chunks(self.words, chunk_size=chunk_size, overlap=chunk_overlap)
        if not self.chunks:
            raise ValueError("No chunks produced — document too short?")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.decay_sigma = decay_sigma  # default; per-search override allowed
        self.embedding_model_name = embedding_model

        # Embedder: user-supplied callable or the default model
        if embedder is None:
            self._embedder = default_embedder(embedding_model)
        elif callable(embedder):
            self._embedder = wrap_callable(embedder)
        else:
            raise TypeError(f"embedder must be callable, got {type(embedder)}")

        # Embed all chunks (the slow step)
        chunk_texts = [c.text for c in self.chunks]
        self.chunk_embeddings = self._embedder(chunk_texts)

        # Build index
        if quantize:
            self.index = QuantizedIndex(self.chunk_embeddings, rerank_k=quantize_rerank_k)
        else:
            self.index = FlatIndex(self.chunk_embeddings)

    # ---------- alternate constructors ----------

    @classmethod
    def from_text(cls, text: str, **kwargs) -> "Doclighter":
        """Build from a raw string. Same as the default constructor."""
        return cls(text, **kwargs)

    @classmethod
    def from_pdf(cls, path: "PathLike", **kwargs) -> "Doclighter":
        """Build from a PDF file on disk."""
        return cls(extract.from_pdf_path(path), **kwargs)

    @classmethod
    def from_url(cls, url: str, timeout: int = 30, **kwargs) -> "Doclighter":
        """Build from a PDF served over HTTP(S)."""
        return cls(extract.from_url(url, timeout=timeout), **kwargs)

    # ---------- search ----------

    def search(
        self,
        query: Union[str, Sequence[str]],
        *,
        decay_sigma: Optional[float] = None,
        multi_query_aggregate: str = "max",
    ) -> "SearchResult":
        """Search the document and return a SearchResult.

        Parameters
        ----------
        query : str or sequence of str
            A single query, or multiple sub-queries (e.g.
            ``["termination", "indemnification"]``). Multi-query mode runs
            each sub-query and combines the heatmaps.
        decay_sigma : float, optional
            Override the proximity decay scale for this search. Smaller =
            sharper word-level highlights; larger = broader regions.
            Defaults to the value set at construction.
        multi_query_aggregate : {"max", "mean", "sum"}
            How to combine sub-query heatmaps. See ``scoring.aggregate_multi_query``.

        Returns
        -------
        SearchResult
            For multi-query searches ``result.query`` is a list of the
            sub-queries and ``result.chunk_scores`` is the per-chunk maximum
            across sub-queries.

        Raises
        ------
        ValueError
            If ``query`` is an empty collection of sub-queries.
        """
        is_multi = not isinstance(query, str)
        # Materialize once: ``query`` may be a one-shot iterable, and the
        # result below must be able to show the sub-queries again.
        queries = list(query) if is_multi else [query]
        if not queries:
            raise ValueError("query must contain at least one sub-query")

        t0 = time.perf_counter()
        sigma = decay_sigma if decay_sigma is not None else self.decay_sigma
        q_embs = self._embedder(queries)  # one row per sub-query

        heatmaps = []
        all_chunk_scores = []
        for q_emb in q_embs:
            chunk_scores = self.index.all_scores(q_emb)
            all_chunk_scores.append(chunk_scores)
            heatmaps.append(
                word_heatmap(
                    chunk_scores=chunk_scores,
                    chunks=self.chunks,
                    n_words=len(self.words),
                    decay_sigma=sigma,
                )
            )

        if is_multi:
            final_heatmap = aggregate_multi_query(heatmaps, mode=multi_query_aggregate)
            # For chunk_scores, use the max across queries (most informative single number)
            final_chunk_scores = np.max(np.stack(all_chunk_scores), axis=0)
        else:
            final_heatmap = heatmaps[0]
            final_chunk_scores = all_chunk_scores[0]

        elapsed_ms = (time.perf_counter() - t0) * 1000
        return SearchResult(
            query=queries if is_multi else queries[0],
            word_scores=final_heatmap,
            chunk_scores=final_chunk_scores,
            elapsed_ms=elapsed_ms,
            decay_sigma=sigma,
            _words=self.words,
            _chunks=self.chunks,
        )

    # ---------- persistence ----------

    def save(self, path: "PathLike") -> None:
        """Save the indexed document to disk (skip re-embedding next time).

        Note: this pickles the text, words, chunks, chunk embeddings and
        build settings, not the embedder or the index — when you load,
        you'll need to be able to embed query strings, so the same model
        must be available.
        """
        state = {
            "text": self.text,
            "words": self.words,
            "chunks": self.chunks,
            "chunk_embeddings": self.chunk_embeddings,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "decay_sigma": self.decay_sigma,
            "embedding_model_name": self.embedding_model_name,
        }
        with open(path, "wb") as f:
            pickle.dump(state, f)

    @classmethod
    def load(
        cls,
        path: "PathLike",
        *,
        embedder: Optional[Callable] = None,
        quantize: bool = False,
        quantize_rerank_k: int = 200,
    ) -> "Doclighter":
        """Load a saved index and rebuild the embedder and vector index.

        Parameters
        ----------
        path : str or Path
            File previously written by ``save``.
        embedder : callable, optional
            Custom embedding callable for queries; defaults to
            ``default_embedder`` with the saved model name.
        quantize : bool
            Rebuild a ``QuantizedIndex`` instead of a ``FlatIndex``.
        quantize_rerank_k : int
            ``rerank_k`` for the quantized index (the value used at build
            time is not stored in the file).

        Warning
        -------
        The file is read with ``pickle.load``, which can execute arbitrary
        code. Only load files you created yourself or otherwise trust.
        """
        with open(path, "rb") as f:
            # SECURITY: unpickling untrusted data is unsafe — see docstring.
            state = pickle.load(f)

        # Bypass __init__ so the document is not re-chunked or re-embedded.
        obj = cls.__new__(cls)
        obj.text = state["text"]
        obj.words = state["words"]
        obj.chunks = state["chunks"]
        obj.chunk_embeddings = state["chunk_embeddings"]
        obj.chunk_size = state["chunk_size"]
        obj.chunk_overlap = state["chunk_overlap"]
        obj.decay_sigma = state["decay_sigma"]
        obj.embedding_model_name = state["embedding_model_name"]

        if embedder is None:
            obj._embedder = default_embedder(obj.embedding_model_name)
        else:
            obj._embedder = wrap_callable(embedder)

        if quantize:
            obj.index = QuantizedIndex(obj.chunk_embeddings, rerank_k=quantize_rerank_k)
        else:
            obj.index = FlatIndex(obj.chunk_embeddings)
        return obj

    # ---------- diagnostics ----------

    def __repr__(self) -> str:
        return (
            f"Doclighter(words={len(self.words)}, chunks={len(self.chunks)}, "
            f"chunk_size={self.chunk_size}, model={self.embedding_model_name!r})"
        )
@@ -0,0 +1,57 @@
1
+ """Embedding backend.
2
+
3
+ Default: sentence-transformers ``all-MiniLM-L6-v2`` (384-dim, fast, ~80MB).
4
+ Users can pass any callable matching ``Embedder`` (e.g. their own model,
5
+ an API call, or a different sentence-transformers checkpoint).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Callable, List, Protocol
10
+
11
+ import numpy as np
12
+
13
+
14
class Embedder(Protocol):
    """Structural type for anything that embeds a batch of texts.

    An implementation is called with a list of strings and returns a
    (N, dim) float32 numpy array, one row per input text. Rows should be
    L2-normalized: Doclighter scores with a plain inner product and treats
    it as cosine similarity, which only holds for unit-norm vectors.
    """

    def __call__(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return one row vector per text."""
        ...
22
+
23
+
24
def default_embedder(model_name: str = "all-MiniLM-L6-v2") -> "Embedder":
    """Return an embedder backed by a sentence-transformers model.

    The model is instantiated once, when this factory is called, and the
    returned closure reuses it for every batch. Encoding uses a fixed batch
    size of 64 and asks the model for normalized embeddings; wrap your own
    callable instead if you need different settings.
    """
    # Imported lazily so the dependency is only needed when this path is used.
    from sentence_transformers import SentenceTransformer

    st_model = SentenceTransformer(model_name)

    def encode(texts: List[str]) -> np.ndarray:
        # Only show a progress bar for larger batches (more than 100 texts).
        want_progress = len(texts) > 100
        vectors = st_model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=64,
            convert_to_numpy=True,
            show_progress_bar=want_progress,
        )
        return vectors.astype("float32")

    return encode
44
+
45
+
46
def wrap_callable(fn: Callable[[List[str]], np.ndarray]) -> "Embedder":
    """Adapt a user-supplied callable into a float32, L2-normalizing embedder.

    The returned function converts ``fn``'s output to a float32 array,
    rejects anything that is not 2-D with a ``ValueError``, and divides each
    row by its Euclidean norm. All-zero rows are left as zeros.
    """

    def encode(texts: List[str]) -> np.ndarray:
        vectors = np.asarray(fn(texts), dtype="float32")
        if vectors.ndim != 2:
            raise ValueError(f"Embedder must return 2D array, got shape {vectors.shape}")
        row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Swap zero norms for 1 so zero rows divide cleanly instead of producing NaN.
        zero_rows = row_norms == 0
        row_norms[zero_rows] = 1.0
        return vectors / row_norms

    return encode
doclighter/extract.py ADDED
@@ -0,0 +1,38 @@
1
+ """Text loaders for Doclighter.
2
+
3
+ Supports PDFs (local path or URL) and raw text. Kept deliberately small —
4
+ users with exotic formats (docx, html) should convert upstream and pass text.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from io import BytesIO
9
+ from pathlib import Path
10
+ from typing import Union
11
+
12
+
13
def from_pdf_bytes(data: bytes) -> str:
    """Extract text from PDF bytes using pypdf.

    Pages that yield no text are skipped; the remaining page texts are
    joined with newlines.

    Raises
    ------
    RuntimeError
        If no page yields any text (e.g. an image-only scan).
    """
    from pypdf import PdfReader  # local import keeps base install lean

    reader = PdfReader(BytesIO(data))
    # Extract each page exactly once; calling extract_text() both in the
    # filter and in the result would repeat the extraction for every page.
    page_texts = (page.extract_text() for page in reader.pages)
    pages = [page_text for page_text in page_texts if page_text]
    if not pages:
        raise RuntimeError(
            "No extractable text in PDF. It may be image-only — "
            "run OCR (e.g. pytesseract) and pass the result to Doclighter.from_text()."
        )
    return "\n".join(pages)
25
+
26
+
27
def from_pdf_path(path: Union[str, Path]) -> str:
    """Load the PDF at ``path`` and return its extracted text.

    The file is read fully into memory and handed to ``from_pdf_bytes``.
    """
    pdf_bytes = Path(path).read_bytes()
    return from_pdf_bytes(pdf_bytes)
30
+
31
+
32
def from_url(url: str, timeout: int = 30) -> str:
    """Fetch a PDF over HTTP(S) and return its extracted text.

    ``timeout`` is passed straight to ``requests.get``. A non-success HTTP
    status raises via ``raise_for_status()`` before any parsing is attempted.
    """
    import requests  # imported lazily; only needed for URL loading

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    pdf_bytes = response.content
    return from_pdf_bytes(pdf_bytes)
doclighter/index.py ADDED
@@ -0,0 +1,72 @@
1
+ """Vector index.
2
+
3
+ Default: flat exact inner-product (fast enough for documents up to ~50K
4
+ chunks, which covers virtually all single-document use cases — a 500-page
5
+ book at 12-word chunks is ~30K chunks).
6
+
7
+ Opt-in: SQ8 scalar-quantized index for very large docs. The 8-bit codes are
8
+ ~4x smaller than float32, but the float32 vectors are also kept for exact rerank.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from typing import Tuple
13
+
14
+ import numpy as np
15
+
16
+
17
class FlatIndex:
    """Exact inner-product search over a dense matrix, using only numpy."""

    def __init__(self, embeddings: np.ndarray):
        """Store a float32 copy of ``embeddings`` (one row per vector)."""
        self.embeddings = embeddings.astype("float32")
        self.ntotal = embeddings.shape[0]  # number of indexed vectors
        self.dim = embeddings.shape[1]  # vector dimensionality

    def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(scores, indices)`` of the best matches, sorted descending.

        At most ``min(k, ntotal)`` results are returned. A non-positive ``k``
        or an empty index yields two empty arrays rather than a wrong slice
        or an out-of-bounds partition error.
        """
        scores = self.all_scores(query)
        k = min(k, self.ntotal)
        if k <= 0:
            return scores[:0], np.empty(0, dtype=np.intp)
        # argpartition is O(n); full sort only on the top-k slice
        top_idx = np.argpartition(-scores, k - 1)[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]
        return scores[top_idx], top_idx

    def all_scores(self, query: np.ndarray) -> np.ndarray:
        """Exact scores for every vector — used for full word-heatmap rendering."""
        return self.embeddings @ query.reshape(-1)
37
+
38
+
39
class QuantizedIndex:
    """SQ8 candidate search followed by an exact float32 rerank.

    An 8-bit scalar-quantized faiss index proposes candidates, which are
    then rescored against the original float32 vectors. Note that the
    float32 matrix is kept on the instance for that rerank (and for
    ``all_scores``), so this class holds the quantized codes in addition
    to the full-precision vectors, not instead of them.
    """

    def __init__(self, embeddings: np.ndarray, rerank_k: int = 200):
        """Train and populate the SQ8 index from ``embeddings``.

        ``rerank_k`` is the minimum number of quantized candidates fetched
        per search before exact rescoring.
        """
        import faiss  # local import — only required for this path

        self.embeddings = embeddings.astype("float32")  # kept for exact rerank
        self.ntotal = embeddings.shape[0]
        self.dim = embeddings.shape[1]
        self.rerank_k = rerank_k

        self._sq = faiss.IndexScalarQuantizer(
            self.dim, faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_INNER_PRODUCT
        )
        self._sq.train(self.embeddings)
        self._sq.add(self.embeddings)

    def search(self, query: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(scores, indices)`` of the best matches, sorted descending.

        Scores are exact inner products computed on the float32 vectors of
        the quantized candidates. A non-positive ``k`` or an empty index
        yields two empty arrays.
        """
        q = query.reshape(1, -1).astype("float32")
        if min(k, self.ntotal) <= 0:
            return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int64)
        # Over-fetch candidates so quantization error rarely drops a true top-k hit.
        cand_k = min(max(k, self.rerank_k), self.ntotal)
        _, cand_idx = self._sq.search(q, cand_k)
        cand_idx = cand_idx[0]
        # Exact rerank
        exact_scores = self.embeddings[cand_idx] @ q[0]
        order = np.argsort(-exact_scores)[:k]
        return exact_scores[order], cand_idx[order]

    def all_scores(self, query: np.ndarray) -> np.ndarray:
        """Exact scores everywhere — computed from the retained float32 embeddings."""
        return self.embeddings @ query.reshape(-1)
doclighter/render.py ADDED
@@ -0,0 +1,124 @@
1
+ """HTML rendering of word-level heatmaps.
2
+
3
+ The renderer is deliberately decoupled from Jupyter — ``to_html()`` returns
4
+ a string. Jupyter users wrap with ``display(HTML(...))``; Streamlit users
5
+ pass to ``st.markdown(..., unsafe_allow_html=True)``; FastAPI users return
6
+ it from a route.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import html
11
+ from typing import List, Tuple
12
+
13
+ import numpy as np
14
+
15
# Gradient anchors, hottest first: red -> orange -> yellow -> cyan -> indigo -> grey.
# Each entry is (score_threshold, (r, g, b)); colors between two anchors are
# linearly interpolated by score_to_hex.
_COLOR_STOPS: List[Tuple[float, Tuple[int, int, int]]] = [
    (1.00, (220, 38, 38)),  # red-600 — core match
    (0.80, (249, 115, 22)),  # orange-500
    (0.60, (234, 179, 8)),  # yellow-500
    (0.40, (6, 182, 212)),  # cyan-500
    (0.20, (79, 70, 229)),  # indigo-600
    (0.00, (156, 163, 175)),  # gray-400 — unrelated
]


def score_to_hex(score: float) -> str:
    """Map a relevance score to a ``#rrggbb`` color on the gradient.

    The score is clamped to [0, 1], located between two adjacent color
    stops, and each RGB channel is linearly interpolated (truncated to an
    integer) between those stops.
    """
    s = max(0.0, min(1.0, score))
    # Walk adjacent (upper, lower) stop pairs from hottest to coldest and
    # use the first pair whose lower threshold the score reaches.
    for (hi_s, hi_c), (lo_s, lo_c) in zip(_COLOR_STOPS, _COLOR_STOPS[1:]):
        if s >= lo_s:
            t = (s - lo_s) / (hi_s - lo_s) if hi_s > lo_s else 0.0
            r, g, b = (int(lo + t * (hi - lo)) for lo, hi in zip(lo_c, hi_c))
            return f"#{r:02x}{g:02x}{b:02x}"
    return "#9ca3af"
41
+
42
+
43
def _legend_html(n_words: int, decay_sigma: float, elapsed_ms: float) -> str:
    """Build the legend line shown above the document.

    The legend is a single monospace ``<div>`` containing six color
    swatches (scores 1.0 down to 0.0 in steps of 0.2) followed by the
    search latency, word count and sigma.
    """
    swatch_scores = (1.0, 0.8, 0.6, 0.4, 0.2, 0.0)
    swatch_tags = [
        f'<span style="color:{score_to_hex(s)}">■</span>' for s in swatch_scores
    ]
    swatches = " → ".join(swatch_tags)

    pieces = [
        '<div style="font-family:monospace;margin-bottom:8px;font-size:0.9em">',
        f'<b>Relevance:</b>&nbsp;{swatches}&nbsp;',
        '<span style="color:#666">(hot → cold)</span>',
        '&nbsp;&nbsp;<span style="color:#888;font-size:0.85em">',
        f'({elapsed_ms:.1f} ms · {n_words} words · σ={decay_sigma})</span>',
        '</div>',
    ]
    return "".join(pieces)
56
+
57
+
58
def render_html(
    words: List[str],
    word_scores: np.ndarray,
    query: str,
    decay_sigma: float,
    elapsed_ms: float,
    quantize_levels: int = 64,
    max_height_px: int = 540,
) -> str:
    """Render a heatmap-colored document as an HTML string.

    Parameters
    ----------
    words : list of str
        The full document, tokenized.
    word_scores : np.ndarray, shape (n_words,), values in [0, 1]
        Relevance per word; indexed in parallel with ``words``.
    query : str
        Shown (HTML-escaped) in the result header.
    decay_sigma : float
        Shown in the legend (informational).
    elapsed_ms : float
        Search latency in milliseconds, shown in the legend.
    quantize_levels : int
        Scores are bucketed into this many levels and consecutive words in
        the same bucket share a single <span>, which keeps the HTML compact.
        Higher = finer color gradient, larger HTML. Must be >= 1.
    max_height_px : int
        Scroll the document container if it exceeds this height.

    Returns
    -------
    str
        An HTML fragment: header, legend, then the colored document.
        Words and the query are passed through ``html.escape``.

    Raises
    ------
    ValueError
        If ``quantize_levels`` is below 1 (it is used as a divisor).
    """
    from itertools import groupby

    if quantize_levels < 1:
        raise ValueError(f"quantize_levels must be >= 1, got {quantize_levels}")

    # Bucket every word's score; equal neighbouring buckets will share a span.
    levels = [int(float(word_scores[i]) * quantize_levels) for i in range(len(words))]

    spans: List[str] = []
    for level, run in groupby(zip(levels, words), key=lambda pair: pair[0]):
        run_text = " ".join(word for _, word in run)
        col = score_to_hex(level / quantize_levels)
        spans.append(f'<span style="color:{col}">{html.escape(run_text)}</span>')

    header = f'<h4 style="margin-bottom:4px">Results for: <em>{html.escape(query)}</em></h4>'
    legend = _legend_html(len(words), decay_sigma, elapsed_ms)
    doc = (
        f'<div style="font-family:Georgia,serif;font-size:0.95em;line-height:1.8;'
        f'white-space:pre-wrap;border:1px solid #ddd;border-radius:6px;'
        f'padding:16px;max-height:{max_height_px}px;overflow-y:auto">'
        + " ".join(spans)
        + '</div>'
    )
    return header + legend + doc
doclighter/scoring.py ADDED
@@ -0,0 +1,118 @@
1
+ """Proximity-decayed word-level scoring.
2
+
3
+ This is the algorithmic heart of Doclighter. Given per-chunk semantic scores,
4
+ we compute a per-word relevance score by letting each chunk's score "radiate"
5
+ outward with exponential decay:
6
+
7
+ word_score[w] = max over chunks c of raw[c] * exp(-distance(w, c) / sigma)
8
+
9
+ Distance is measured in words to the nearest edge of the chunk (0 if inside).
10
+ Max-aggregation (rather than sum) means a word's color reflects its single
11
+ strongest semantic neighbor, which matches the visual intuition: a word is
12
+ "hot" if any nearby region is hot, not because many lukewarm regions sum up.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from typing import List
17
+
18
+ import numpy as np
19
+
20
+ from .chunking import Chunk
21
+
22
+
23
def word_heatmap(
    chunk_scores: np.ndarray,
    chunks: "List[Chunk]",
    n_words: int,
    decay_sigma: float = 20.0,
    threshold_frac: float = 0.05,
) -> np.ndarray:
    """Spread chunk scores into per-word scores via exponential decay.

    Each retained chunk contributes ``score * exp(-distance / decay_sigma)``
    to every word, where distance is 0 inside the chunk and grows by one per
    word outside it. A word's score is the maximum contribution it receives.

    Parameters
    ----------
    chunk_scores : np.ndarray, shape (n_chunks,)
        Semantic similarity score per chunk. Negatives are clipped to 0
        (irrelevant regions shouldn't radiate warmth).
    chunks : list of Chunk
        Word-span metadata for each chunk (``start`` inclusive, ``end``
        exclusive), in the same order as ``chunk_scores``.
    n_words : int
        Total number of words in the document.
    decay_sigma : float
        Decay length scale in words, must be > 0. exp(-d/sigma): at d=sigma,
        weight = 0.37; at d=2*sigma, weight = 0.14. Small sigma = sharp
        word-level highlights. Large sigma = broad thematic regions.
    threshold_frac : float
        Skip chunks with score < threshold_frac * max_score. This is a speed
        approximation: a skipped chunk could only have contributed values
        below that cutoff, so only already-cold words can be affected.

    Returns
    -------
    np.ndarray, shape (n_words,), float32, values in [0, 1]
        Per-word relevance score, normalized so max = 1. All zeros when
        there are no scores or no score is positive.

    Raises
    ------
    ValueError
        If ``decay_sigma`` is not positive.
    """
    if decay_sigma <= 0:
        raise ValueError(f"decay_sigma must be > 0, got {decay_sigma}")

    raw = np.maximum(np.asarray(chunk_scores, dtype=np.float32), 0.0)
    # No scores at all, or nothing positive: the heatmap is uniformly cold.
    if raw.size == 0 or raw.max() == 0:
        return np.zeros(n_words, dtype=np.float32)

    threshold = float(raw.max()) * threshold_frac
    word_idx = np.arange(n_words, dtype=np.float32)
    word_scores = np.zeros(n_words, dtype=np.float32)

    for ci, chunk in enumerate(chunks):
        score = raw[ci]
        if score < threshold:
            continue
        # Distance from each word to nearest edge of this chunk
        # (0 inside the chunk, positive outside). ``end`` is exclusive, so
        # the last word inside the chunk sits at index ``end - 1``.
        dist = np.where(
            word_idx < chunk.start,
            chunk.start - word_idx,
            np.where(word_idx >= chunk.end, word_idx - (chunk.end - 1), 0.0),
        )
        influence = score * np.exp(-dist / decay_sigma)
        np.maximum(word_scores, influence, out=word_scores)

    mx = word_scores.max()
    if mx > 0:
        word_scores /= mx
    return word_scores
80
+
81
+
82
def aggregate_multi_query(
    per_query_heatmaps: List[np.ndarray],
    mode: str = "max",
) -> np.ndarray:
    """Combine word-level heatmaps from multiple sub-queries.

    Parameters
    ----------
    per_query_heatmaps : list of np.ndarray
        Each shape (n_words,), values in [0, 1].
    mode : {"max", "mean", "sum"}
        "max" — strongest single match wins per word (default), rescaled
        so the peak is 1.
        "mean" — average influence across queries, returned as-is. It is
        deliberately not rescaled: dividing a mean by its own maximum gives
        exactly the same array as the rescaled sum, which would make the
        two modes indistinguishable.
        "sum" — additive, then renormalized so the peak is 1. Useful when
        you want words that match *multiple* sub-queries to outshine words
        matching just one.

    Returns
    -------
    np.ndarray, shape (n_words,), float32
        Values in [0, 1] provided the inputs are.

    Raises
    ------
    ValueError
        If ``per_query_heatmaps`` is empty or ``mode`` is not recognized.
    """
    if not per_query_heatmaps:
        raise ValueError("per_query_heatmaps must be non-empty")
    stack = np.stack(per_query_heatmaps, axis=0)

    if mode == "max":
        combined = stack.max(axis=0)
        renormalize = True
    elif mode == "mean":
        combined = stack.mean(axis=0)
        renormalize = False  # keep the averaged magnitudes (see docstring)
    elif mode == "sum":
        combined = stack.sum(axis=0)
        renormalize = True
    else:
        raise ValueError(f"mode must be 'max', 'mean', or 'sum'; got {mode!r}")

    if renormalize:
        mx = combined.max()
        if mx > 0:
            combined = combined / mx
    return combined.astype(np.float32)
@@ -0,0 +1,245 @@
1
+ Metadata-Version: 2.4
2
+ Name: doclighter
3
+ Version: 0.1.0
4
+ Summary: A semantic Ctrl+F that paints your document with a relevance gradient.
5
+ Author-email: Pratyush <pratyush272@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/pratyush272/doclighter
8
+ Project-URL: Repository, https://github.com/pratyush272/doclighter
9
+ Project-URL: Issues, https://github.com/pratyush272/doclighter/issues
10
+ Keywords: semantic-search,embeddings,visualization,rag,nlp,pdf
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Classifier: Topic :: Text Processing :: Indexing
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: numpy>=1.21
27
+ Requires-Dist: sentence-transformers>=2.2
28
+ Requires-Dist: pypdf>=4.0
29
+ Requires-Dist: requests>=2.25
30
+ Provides-Extra: quantize
31
+ Requires-Dist: faiss-cpu>=1.7; extra == "quantize"
32
+ Provides-Extra: streamlit
33
+ Requires-Dist: streamlit>=1.28; extra == "streamlit"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=7; extra == "dev"
36
+ Requires-Dist: pytest-cov>=4; extra == "dev"
37
+ Requires-Dist: ruff>=0.1; extra == "dev"
38
+ Provides-Extra: all
39
+ Requires-Dist: doclighter[dev,quantize,streamlit]; extra == "all"
40
+ Dynamic: license-file
41
+
42
+ # doclighter
43
+
44
+ [![tests](https://github.com/pratyush272/doclighter/actions/workflows/test.yml/badge.svg)](https://github.com/pratyush272/doclighter/actions/workflows/test.yml)
45
+ [![PyPI](https://img.shields.io/pypi/v/doclighter.svg)](https://pypi.org/project/doclighter/)
46
+ [![Python](https://img.shields.io/pypi/pyversions/doclighter.svg)](https://pypi.org/project/doclighter/)
47
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
48
+
49
+ **A semantic Ctrl+F that paints your document with a relevance gradient.**
50
+
51
+ `doclighter` is what you reach for when you need to *see* where a topic lives in a document — not be told an answer. It embeds your document at fine granularity, then projects query relevance back onto every word as a heatmap. No LLM, no hallucination, no top-K cliff.
52
+
53
+ ```python
54
+ from doclighter import Doclighter
55
+
56
+ doc = Doclighter.from_pdf("contract.pdf")
57
+ result = doc.search("termination clauses")
58
+ result # In Jupyter: renders the whole document, color-coded by relevance
59
+ ```
60
+
61
+ ## Why this exists
62
+
63
+ Traditional RAG hands an LLM the top 3–10 chunks and asks it to generate an answer. That's great when you trust the answer and don't need to read the source. But sometimes you *need to read the source* — legal review, paper skimming, contract diffing, due diligence — and you want a tool that helps you *navigate* a long document, not summarize it away.
64
+
65
+ `doclighter` is for that. It treats the whole document as the output, and re-colors it by semantic relevance to your query. Hot regions deserve your attention; cold regions you can skim past.
66
+
67
+ It's deterministic, fast (sub-100ms per query after indexing), and shows you the long tail — including the case where your query *doesn't* match anything (everything stays cold blue, which is itself useful information that RAG hides).
68
+
69
+ ## Install
70
+
71
+ ```bash
72
+ pip install doclighter
73
+ ```
74
+
75
+ Optional extras:
76
+ ```bash
77
+ pip install "doclighter[quantize]" # FAISS SQ8 index for very large docs
78
+ pip install "doclighter[streamlit]" # for the interactive demo app
79
+ pip install "doclighter[dev]" # for contributors
80
+ ```
81
+
82
+ ## Quickstart
83
+
84
+ ### Load a document
85
+
86
+ ```python
87
+ from doclighter import Doclighter
88
+
89
+ # From a PDF on disk
90
+ doc = Doclighter.from_pdf("contract.pdf")
91
+
92
+ # From a PDF URL
93
+ doc = Doclighter.from_url("https://example.com/contract.pdf")
94
+
95
+ # From raw text
96
+ doc = Doclighter.from_text(open("paper.txt").read())
97
+ ```
98
+
99
+ The first call downloads the default embedding model (~80 MB MiniLM) and embeds your document. For a ~10K word doc this takes ~25 seconds. Subsequent searches reuse the index.
100
+
101
+ ### Search
102
+
103
+ ```python
104
+ result = doc.search("termination clauses")
105
+
106
+ result.word_scores # numpy array, shape (n_words,), values in [0, 1]
107
+ result.top_chunks(k=10) # list of (chunk_text, score, (start, end))
108
+ result.elapsed_ms # ~10-50ms for typical docs
109
+ result.to_html() # HTML string for display anywhere
110
+ ```
111
+
112
+ In Jupyter, just put `result` on the last line of a cell — it renders the heatmap inline.
113
+
114
+ ### Zoom: the `decay_sigma` knob
115
+
116
+ This is the differentiating feature: `decay_sigma` controls how far (in words) semantic warmth spreads from a matched region:
117
+
118
+ ```python
119
+ narrow = doc.search("termination", decay_sigma=5.0) # sharp word-level highlights
120
+ broad = doc.search("termination", decay_sigma=80.0) # broad thematic regions
121
+ ```
122
+
123
+ Same index, no re-embedding. Drag the σ slider in the Streamlit demo to feel what this does.
124
+
125
+ ### Multi-query
126
+
127
+ ```python
128
+ result = doc.search(
129
+ ["termination", "indemnification", "labour wages"],
130
+ multi_query_aggregate="max", # or "mean"; use "sum" to favor regions matching multiple sub-queries
131
+ )
132
+ ```
133
+
134
+ ### Save / load the index
135
+
136
+ Embedding is the slow step. Save once, reuse:
137
+
138
+ ```python
139
+ doc.save("contract.idx")
140
+ doc = Doclighter.load("contract.idx")
141
+ ```
142
+
143
+ ### Bring your own embedder
144
+
145
+ Any callable mapping `list[str] -> np.ndarray` of shape `(N, dim)` works:
146
+
147
+ ```python
148
+ from sentence_transformers import SentenceTransformer
149
+
150
+ bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
151
+ doc = Doclighter.from_text(text, embedder=bge.encode)
152
+ ```
153
+
154
+ ## Streamlit demo
155
+
156
+ ```bash
157
+ pip install "doclighter[streamlit]"
158
+ streamlit run examples/streamlit_app.py
159
+ ```
160
+
161
+ A working UI with PDF upload, query box, σ slider, and live re-rendering.
162
+
163
+ ## How it works
164
+
165
+ 1. **Chunk** the document into small rolling windows (default: 12 words, 50% overlap).
166
+ 2. **Embed** each chunk with sentence-transformers (default: `all-MiniLM-L6-v2`).
167
+ 3. **Score** each chunk against your query via cosine similarity.
168
+ 4. **Project** chunk scores back onto every word via exponential proximity decay:
169
+
170
+ ```
171
+ word_score[w] = max over chunks c of raw[c] × exp(-distance(w, c) / sigma)
172
+ ```
173
+
174
+ 5. **Render** the document as colored HTML — words inherit warmth from their nearest semantically matched chunk.
175
+
176
+ Step 4 is the interesting one. Max-aggregation (rather than sum) means a word's color reflects its single strongest semantic neighbor — visually intuitive and resistant to "many lukewarm chunks add up to red" noise.
177
+
178
+ ## How this compares to RAG
179
+
180
+ `doclighter` and RAG solve different problems:
181
+
182
+ | | RAG | doclighter |
183
+ |---|---|---|
184
+ | Output | LLM-generated answer | Document, recolored |
185
+ | Best for | "What's the answer?" | "Where in the doc?" |
186
+ | When query has no answer | LLM hedges / hallucinates | Document stays cold (honest) |
187
+ | Hides chunk boundaries | No — top-K cliff | Yes — gradient smooths over |
188
+ | Cost per query | LLM tokens | Free (one matmul) |
189
+ | Determinism | Sampling-dependent | Fully deterministic |
190
+
191
+ They're complementary. Use RAG when you trust the LLM with the question; use `doclighter` when you need to read the source yourself.
192
+
193
+ The algorithmic kernel is conceptually related to [ColBERT](https://github.com/stanford-futuredata/ColBERT)'s MaxSim late-interaction, but applied to *visualization* rather than ranking, and at the word level rather than the token level.
194
+
195
+ ## API reference
196
+
197
+ ### `Doclighter(text, **kwargs)`
198
+
199
+ | Parameter | Default | Description |
200
+ |---|---|---|
201
+ | `text` | required | The document as a string |
202
+ | `chunk_size` | `12` | Words per rolling window |
203
+ | `chunk_overlap` | `0.5` | Fraction overlap between windows |
204
+ | `embedder` | `None` | Custom embedder callable (default: MiniLM) |
205
+ | `embedding_model` | `"all-MiniLM-L6-v2"` | sentence-transformers model name |
206
+ | `decay_sigma` | `20.0` | Default proximity decay scale (words) |
207
+ | `quantize` | `False` | Use a FAISS SQ8 index instead of the flat exact index |
208
+ | `quantize_rerank_k` | `200` | If `quantize=True`, rerank the top-K candidates with exact scores |
209
+
210
+ Alternate constructors: `Doclighter.from_text(...)`, `Doclighter.from_pdf(path, ...)`, `Doclighter.from_url(url, ...)`.
211
+
212
+ ### `doc.search(query, **kwargs) -> SearchResult`
213
+
214
+ | Parameter | Default | Description |
215
+ |---|---|---|
216
+ | `query` | required | A string, or list of strings for multi-query |
217
+ | `decay_sigma` | `None` | Override the doc's default sigma |
218
+ | `multi_query_aggregate` | `"max"` | `"max"`, `"mean"`, or `"sum"` for multi-query |
219
+
220
+ ### `SearchResult`
221
+
222
+ | Attribute / method | Returns |
223
+ |---|---|
224
+ | `.word_scores` | `np.ndarray` of shape `(n_words,)`, in `[0, 1]` |
225
+ | `.chunk_scores` | `np.ndarray` of raw per-chunk cosine scores |
226
+ | `.top_chunks(k=10)` | `list[(text, score, (start, end))]` |
227
+ | `.to_html(**kwargs)` | HTML string of the heatmap-colored document |
228
+ | `.elapsed_ms` | Search latency |
229
+
230
+ ## Development
231
+
232
+ ```bash
233
+ git clone https://github.com/pratyush272/doclighter
234
+ cd doclighter
235
+ pip install -e ".[dev]"
236
+ pytest
237
+ ```
238
+
239
+ ## License
240
+
241
+ MIT. See [LICENSE](LICENSE).
242
+
243
+ ## Citation / acknowledgement
244
+
245
+ If you use `doclighter` in research, a link back is appreciated. The proximity-decay scoring idea borrows from passage-retrieval literature; max-aggregation over fine-grained matches is in spirit closest to ColBERT.
@@ -0,0 +1,13 @@
1
+ doclighter/__init__.py,sha256=kzREXwu7PcSRSSEOKxefkl7wqq3Tf92iRZ8GyrJ8aws,522
2
+ doclighter/chunking.py,sha256=Zaicqk9eNpkPlulCJPB_TkIOGCSag7p6bLoEG9jN42o,1823
3
+ doclighter/core.py,sha256=rz2ZOIB9FY1ibtZnn5qvWZCasy-KpD2wOpfOGA0chso,10271
4
+ doclighter/embedding.py,sha256=hd2e5K75HvBC0BM83z6gwjMqXwuoS6tgDzRffknNpE0,1816
5
+ doclighter/extract.py,sha256=NZoHDD8VaeqUeuw6Mszb91Qp375k-5z9gNsf54lPKDo,1201
6
+ doclighter/index.py,sha256=AHCS4w3FR-zq8JEfzXMF81CToswkuZJkk7ffqSElnnk,2830
7
+ doclighter/render.py,sha256=fzkqYC7tfyW6SgtUeG9VPnc4Q-5yamWJUpTeOyh_M80,4473
8
+ doclighter/scoring.py,sha256=MZGGIFuWK3nW3hGpX38aBUjiDa9uq1nBe1WXMLzZXwI,4048
9
+ doclighter-0.1.0.dist-info/licenses/LICENSE,sha256=k412KnI3imf_ScR1LlhRFjnHFq62gZSbHO88cwa_Ljk,1065
10
+ doclighter-0.1.0.dist-info/METADATA,sha256=V3Pjm6w2MtLzEQsHi3RuqF4Eb5yu1jDxeZNNPr2heYc,9401
11
+ doclighter-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ doclighter-0.1.0.dist-info/top_level.txt,sha256=Sd9T52y2LYw0Vz5aIRk_C3rD6JAktygliw5KakMp4TA,11
13
+ doclighter-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pratyush
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ doclighter