superlinear 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apps/__init__.py +4 -0
- apps/cli/__init__.py +8 -0
- apps/cli/bm25_rag.py +471 -0
- apps/cli/chat_repl.py +1497 -0
- apps/cli/client.py +195 -0
- apps/cli/docs_repl.py +2275 -0
- apps/cli/light_rag.py +729 -0
- apps/cli/local_snapshots.py +139 -0
- apps/cli/locks.py +214 -0
- apps/cli/main.py +457 -0
- apps/cli/output.py +32 -0
- apps/cli/server_cmds.py +516 -0
- apps/cli/session_cmds.py +491 -0
- apps/cli/snapshot_cmds.py +303 -0
- apps/cli/state.py +265 -0
- apps/server/__init__.py +4 -0
- apps/server/app.py +1363 -0
- apps/server/main.py +313 -0
- superlinear/__init__.py +114 -0
- superlinear/_version.py +3 -0
- superlinear/engine/__init__.py +10 -0
- superlinear/engine/adapters/__init__.py +12 -0
- superlinear/engine/adapters/base.py +91 -0
- superlinear/engine/adapters/superlinear.py +1233 -0
- superlinear/engine/chat_engine.py +1173 -0
- superlinear/engine/chat_types.py +130 -0
- superlinear/engine/registry.py +51 -0
- superlinear/engine/repetition.py +203 -0
- superlinear/engine/session_snapshots.py +451 -0
- superlinear/engine/tool_parser.py +83 -0
- superlinear/engine/types.py +42 -0
- superlinear/kernels/__init__.py +2 -0
- superlinear/kernels/common/__init__.py +21 -0
- superlinear/kernels/common/adjustment.py +106 -0
- superlinear/kernels/common/power.py +154 -0
- superlinear/kernels/superlinear/__init__.py +10 -0
- superlinear/kernels/superlinear/attention/__init__.py +78 -0
- superlinear/kernels/superlinear/attention/_prefill.py +940 -0
- superlinear/kernels/superlinear/attention/_sliding_window.py +1167 -0
- superlinear/kernels/superlinear/attention/api.py +433 -0
- superlinear/kernels/superlinear/search/__init__.py +33 -0
- superlinear/kernels/superlinear/search/_reference.py +204 -0
- superlinear/kernels/superlinear/search/_triton.py +488 -0
- superlinear/kernels/superlinear/search/_triton_gqa.py +534 -0
- superlinear/kernels/superlinear/search/api.py +200 -0
- superlinear/kernels/superlinear/span/__init__.py +41 -0
- superlinear/kernels/superlinear/span/_triton_bucketed_gqa.py +1461 -0
- superlinear/kernels/superlinear/span/_triton_forward.py +22 -0
- superlinear/kernels/superlinear/span/_triton_gqa.py +1226 -0
- superlinear/kernels/superlinear/span/_triton_impl.py +928 -0
- superlinear/kernels/superlinear/span/_triton_precomputed_sw.py +460 -0
- superlinear/kernels/superlinear/span/_triton_precomputed_sw_gqa.py +598 -0
- superlinear/kernels/superlinear/span/api.py +296 -0
- superlinear/kernels/superlinear/span/masks.py +187 -0
- superlinear/py.typed +0 -0
- superlinear/runtime.py +71 -0
- superlinear-0.1.0.dist-info/METADATA +469 -0
- superlinear-0.1.0.dist-info/RECORD +62 -0
- superlinear-0.1.0.dist-info/WHEEL +5 -0
- superlinear-0.1.0.dist-info/entry_points.txt +2 -0
- superlinear-0.1.0.dist-info/licenses/LICENSE +202 -0
- superlinear-0.1.0.dist-info/top_level.txt +2 -0
apps/cli/light_rag.py
ADDED
@@ -0,0 +1,729 @@
from __future__ import annotations

import json
import re
from collections import OrderedDict
from dataclasses import dataclass, replace
from pathlib import Path
from typing import Any


# Used for normalization/matching where we intentionally work in lowercase.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")
# Used only for extracting original-cased query tokens (e.g. "Go", "AI").
_NON_ALNUM_RE_ORIG = re.compile(r"[^A-Za-z0-9]+")
_MULTISPACE_RE = re.compile(r"\s+")
_PARA_SPLIT_RE = re.compile(r"\n\s*\n+")


_SHORT_QUERY_TERMS_2 = frozenset({"ai", "go", "ml", "rl"})


_STOPWORDS = frozenset(
    {
        # Classic English stopwords.
        "a", "about", "above", "after", "again", "against", "all", "also", "am",
        "an", "and", "any", "are", "as", "at", "be", "because", "been", "before",
        "being", "below", "between", "both", "but", "by", "can", "could", "did",
        "do", "does", "doing", "down", "during", "each", "few", "for", "from",
        "further", "had", "has", "have", "having", "he", "her", "here", "hers",
        "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is",
        "it", "its", "itself", "just", "me", "more", "most", "my", "myself", "no",
        "nor", "not", "now", "of", "off", "on", "once", "only", "or", "other",
        "our", "ours", "ourselves", "out", "over", "own", "same", "she", "should",
        "so", "some", "such", "than", "that", "the", "their", "theirs", "them",
        "themselves", "then", "there", "these", "they", "this", "those",
        "through", "to", "too", "under", "until", "up", "very", "was", "we",
        "were", "what", "when", "where", "which", "while", "who", "whom", "why",
        "with", "would", "you", "your", "yours", "yourself", "yourselves",

        # Query-instruction / meta words (common in docs REPL prompts).
        "article", "articles", "mention", "mentions", "quoted", "quote", "quotes",
        "exact", "exactly", "sentence", "sentences", "substring", "end",
        "sources", "path", "containing", "contains", "yes",

        # More prompt-instruction words that otherwise pollute retrieval.
        "name", "short", "shorter", "shortest", "line", "lines", "fragment",
        "fragments", "sense", "verb", "verbatim",

        # More meta words common in evaluation prompts.
        "ingested", "difference", "answer", "include", "available", "term",
        "terms", "one",

        # Prompt verbs/nouns that shouldn't drive lexical retrieval.
        "find", "passage", "define", "defines", "definition", "imply", "implies",
        "say", "says", "explicitly", "explain", "extra", "detail", "details",
        "full", "ellipsis", "ellipses", "paraphrase", "using", "word", "words",
        "present",

        # Constraint/policy words frequently used in prompts.
        "must", "found",

        # Generic question framing words that rarely help lexical retrieval.
        "intuition", "problem", "problems", "solve", "solves", "solving",

        # Common prompt directives that shouldn't influence retrieval.
        "use", "used", "provide", "provided", "providing", "retrieve",
        "retrieved", "excerpt", "excerpts", "copy", "copied", "given",

        # Conversation glue words (common in multi-turn follow-ups).
        "tell", "mentioned",
    }
)


_QUOTEY_RE = re.compile(
    r"\b(quote|verbatim|exact|substring|sentence|sentences|fragment|fragments|no\s+ellipses)\b",
    re.IGNORECASE,
)


def _looks_like_quote_task(question: str) -> bool:
    return bool(_QUOTEY_RE.search(question or ""))
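
# Examples:
#   _looks_like_quote_task("Give the exact sentence, verbatim")  -> True
#   _looks_like_quote_task("Summarize the main argument")        -> False
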


def _split_into_sentences(text: str) -> list[str]:
    """Best-effort sentence splitter for excerpt selection.

    We intentionally keep it simple (no NLP deps). Newlines also act as boundaries.
    """

    t = (text or "").replace("\r", "")
    if not t:
        return []

    out: list[str] = []
    start = 0
    i = 0
    n = len(t)
    while i < n:
        ch = t[i]
        if ch == "\n":
            seg = t[start:i].strip()
            if seg:
                out.append(seg)
            start = i + 1
        elif ch in {".", "!", "?"}:
            # End sentence at punctuation; include it.
            end = i + 1
            seg = t[start:end].strip()
            if seg:
                out.append(seg)
            # Skip trailing whitespace.
            j = end
            while j < n and t[j].isspace() and t[j] != "\n":
                j += 1
            start = j
            i = j - 1
        i += 1

    tail = t[start:].strip()
    if tail:
        out.append(tail)
    return out
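
# Example:
#   _split_into_sentences("One. Two!\nThree")
#   -> ["One.", "Two!", "Three"]   (punctuation is kept; newlines also split)
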


def _select_sentence_snippet(text: str, *, terms: list[str], max_chars: int) -> str:
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text

    sentences = _split_into_sentences(text)
    if not sentences:
        return _truncate_text(text, max_chars)

    # Prefer the shortest sentence that still matches the most terms.
    best: tuple[int, int, str] | None = None  # (score, -len, sentence)
    for s in sentences:
        s_l = s.lower()
        score = sum(1 for t in terms if t and t in s_l)
        if score <= 0:
            continue
        cand = (score, -len(s), s)
        if best is None or cand > best:
            best = cand

    chosen = (best[2] if best is not None else sentences[0]).strip()
    if len(chosen) <= max_chars:
        return chosen

    # Last resort: clip without adding an ellipsis (to avoid models copying it
    # into "verbatim" quotes).
    return chosen[:max_chars].rstrip()
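
# Example:
#   _select_sentence_snippet("Alpha beta. Gamma alpha delta epsilon.",
#                            terms=["alpha"], max_chars=15)
#   -> "Alpha beta."   (both sentences match "alpha"; the shorter one wins)
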


def tokenize_query_terms(text: str, *, max_terms: int = 32) -> list[str]:
    if not text or not isinstance(text, str):
        return []

    # Preserve original casing for acronym detection.
    raw_terms_original = _NON_ALNUM_RE_ORIG.sub(" ", text).split()
    cleaned = _NON_ALNUM_RE.sub(" ", text.lower())
    raw_terms = cleaned.split()

    out: list[str] = []
    seen: set[str] = set()
    for i, term in enumerate(raw_terms):
        # Keep short terms (2 chars) only if they look like acronyms/proper
        # nouns (original had uppercase), e.g. "Go", "AI", "ML".
        original = raw_terms_original[i] if i < len(raw_terms_original) else term
        is_likely_name = len(original) >= 2 and original[0].isupper()
        keep_short = term in _SHORT_QUERY_TERMS_2

        if len(term) < 2:
            continue
        if len(term) == 2 and not (is_likely_name or keep_short):
            continue
        if term in _STOPWORDS:
            continue
        if term in seen:
            continue
        seen.add(term)
        out.append(term)
        if len(out) >= int(max_terms):
            break
    return out
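
# Example:
#   tokenize_query_terms("What does the article say about AlphaGo and Go?")
#   -> ["alphago", "go"]
#   ("what"/"does"/"the"/"article"/"say"/"about"/"and" are stopwords; "go"
#    survives the two-char filter because the original token "Go" is
#    capitalized and it is also listed in _SHORT_QUERY_TERMS_2.)
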


def tokenize_rag_text(text: str, *, max_tokens: int | None = None) -> list[str]:
    """Tokenize arbitrary text for lexical/BM25 retrieval.

    The tokenization policy matches `tokenize_query_terms`, except:
    - tokens are **not** deduplicated (BM25 needs term frequencies)
    - the output can be optionally limited by `max_tokens`
    """

    if not text or not isinstance(text, str):
        return []

    raw_terms_original = _NON_ALNUM_RE_ORIG.sub(" ", text).split()
    cleaned = _NON_ALNUM_RE.sub(" ", text.lower())
    raw_terms = cleaned.split()

    out: list[str] = []
    for i, term in enumerate(raw_terms):
        original = raw_terms_original[i] if i < len(raw_terms_original) else term
        is_likely_name = len(original) >= 2 and original[0].isupper()
        keep_short = term in _SHORT_QUERY_TERMS_2

        if len(term) < 2:
            continue
        if len(term) == 2 and not (is_likely_name or keep_short):
            continue
        if term in _STOPWORDS:
            continue

        out.append(term)
        if max_tokens is not None and len(out) >= int(max_tokens):
            break

    return out
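
# Example (duplicates are preserved so BM25 can count term frequencies):
#   tokenize_rag_text("the cat sat on the cat mat")
#   -> ["cat", "sat", "cat", "mat"]
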


def split_paragraphs(text: str) -> list[str]:
    if not text or not isinstance(text, str):
        return []
    normalized = text.replace("\r", "").strip()
    if not normalized:
        return []
    parts = _PARA_SPLIT_RE.split(normalized)
    return [p.strip() for p in parts if p and p.strip()]
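
# Example:
#   split_paragraphs("a\n\nb\n \nc")  -> ["a", "b", "c"]
#   (blank lines containing only whitespace also act as paragraph breaks)
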


def _normalize_for_matching(text: str) -> str:
    cleaned = _NON_ALNUM_RE.sub(" ", (text or "").lower())
    cleaned = _MULTISPACE_RE.sub(" ", cleaned).strip()
    return f" {cleaned} " if cleaned else " "
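
# Example:
#   _normalize_for_matching("Hello, World!")  -> " hello world "
# The surrounding spaces let callers test whole-word hits with padded
# patterns like " world " while rejecting partial matches such as " orl ".
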


def _truncate_text(text: str, max_chars: int) -> str:
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text
    if max_chars == 1:
        return text[:1]
    return text[: max_chars - 1].rstrip() + "…"
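
# Example:
#   _truncate_text("abcdef", 4)  -> "abc…"   (the ellipsis counts toward the budget)
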


def _coerce_int(v: Any, *, default: int, min_v: int, max_v: int) -> int:
    try:
        n = int(v)
    except Exception:
        return default
    if n < min_v:
        return min_v
    if n > max_v:
        return max_v
    return n


@dataclass(frozen=True)
class LightRagConfig:
    enabled: bool = True
    k: int = 5
    total_chars: int = 12000
    per_source_chars: int = 2600
    debug: bool = False

    candidate_sources: int = 20
    max_terms: int = 32
    max_paragraphs_per_source: int = 8
    max_paragraph_chars: int = 1200
    min_term_matches: int = 2  # Only include excerpts matching at least this many query terms

    def sanitized(self) -> "LightRagConfig":
        return replace(
            self,
            k=_coerce_int(self.k, default=5, min_v=1, max_v=50),
            total_chars=_coerce_int(self.total_chars, default=12000, min_v=200, max_v=200000),
            per_source_chars=_coerce_int(self.per_source_chars, default=2600, min_v=50, max_v=50000),
            candidate_sources=_coerce_int(self.candidate_sources, default=20, min_v=1, max_v=200),
            max_terms=_coerce_int(self.max_terms, default=32, min_v=1, max_v=128),
            max_paragraphs_per_source=_coerce_int(self.max_paragraphs_per_source, default=8, min_v=1, max_v=64),
            max_paragraph_chars=_coerce_int(self.max_paragraph_chars, default=1200, min_v=50, max_v=20000),
        )
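
# Example:
#   LightRagConfig(k=0, total_chars=10**9).sanitized()
#   -> k is clamped up to 1 and total_chars down to 200000; a non-numeric
#      value falls back to the field's default via _coerce_int.
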


@dataclass
class _DocCacheEntry:
    mtime_ns: int
    paragraphs: list[str]
    normalized_paragraphs: list[str]


class LightRagRetriever:
    def __init__(self, *, cache_docs: int = 64) -> None:
        self._cache_docs = int(cache_docs)
        self._cache: OrderedDict[str, _DocCacheEntry] = OrderedDict()

    def clear_cache(self) -> None:
        self._cache.clear()

    def _read_doc(self, path_str: str) -> _DocCacheEntry:
        path = Path(path_str)
        st = path.stat()
        mtime_ns = int(st.st_mtime_ns)

        cached = self._cache.get(path_str)
        if cached is not None and cached.mtime_ns == mtime_ns:
            self._cache.move_to_end(path_str)
            return cached

        data = path.read_bytes()
        if b"\x00" in data:
            raise ValueError("refusing to read binary file (NUL byte found)")
        text = data.decode("utf-8", errors="replace")

        paragraphs = split_paragraphs(text)
        normalized_paragraphs = [_normalize_for_matching(p) for p in paragraphs]
        entry = _DocCacheEntry(
            mtime_ns=mtime_ns,
            paragraphs=paragraphs,
            normalized_paragraphs=normalized_paragraphs,
        )

        self._cache[path_str] = entry
        self._cache.move_to_end(path_str)
        while len(self._cache) > self._cache_docs:
            self._cache.popitem(last=False)

        return entry
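
    # Cache notes: entries are keyed by the raw path string and invalidated by
    # mtime_ns, so a modified file is re-read on the next call. The OrderedDict
    # doubles as an LRU: hits are moved to the end, and the oldest entry is
    # evicted once the cache exceeds cache_docs.
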

    def build_retrieved_excerpts_message(
        self,
        *,
        question: str,
        sources: list[dict[str, Any]],
        config: LightRagConfig,
    ) -> tuple[str | None, list[str]]:
        cfg = config.sanitized()
        if not cfg.enabled:
            return None, []

        quote_task = _looks_like_quote_task(question)

        terms = tokenize_query_terms(question, max_terms=cfg.max_terms)
        if not terms:
            return None, []

        term_pats = [f" {t} " for t in terms]
        # Adaptive threshold: for short queries, require fewer matches.
        # Note: some queries include generic terms (e.g. "articles", "mention")
        # that won't appear in the docs; we apply a fallback later if the
        # threshold filters everything out.
        effective_min_matches = min(cfg.min_term_matches, len(terms))

        meta_scored: list[tuple[int, int, dict[str, Any], str]] = []
        for idx, s in enumerate(sources):
            path = s.get("path")
            if not isinstance(path, str) or not path:
                continue

            title = s.get("title") if isinstance(s.get("title"), str) else ""
            src = s.get("source") if isinstance(s.get("source"), str) else ""
            meta_text = f"{title}\n{Path(path).name}\n{src}"
            meta_norm = _normalize_for_matching(meta_text)
            meta_score = sum(1 for pat in term_pats if pat in meta_norm)
            meta_scored.append((int(meta_score), idx, s, meta_norm))

        if not meta_scored:
            return None, []

        meta_scored.sort(key=lambda x: (-x[0], x[1]))

        # If the query terms do not appear in titles/filenames/source labels,
        # meta scoring provides no useful signal (many/most scores will be 0).
        # In that case, avoid over-pruning to the first N sources, which can
        # miss the relevant doc purely due to ingestion order.
        if meta_scored[0][0] <= 0:
            max_candidates = max(int(cfg.candidate_sources), 200)
            candidates = meta_scored[: min(len(meta_scored), max_candidates)]
        else:
            candidates = meta_scored[: cfg.candidate_sources]

        def _scan(threshold: int) -> tuple[
            list[tuple[int, int, int, int, dict[str, Any], list[tuple[int, int]], _DocCacheEntry]],
            list[str],
        ]:
            matches_local: list[
                tuple[int, int, int, int, dict[str, Any], list[tuple[int, int]], _DocCacheEntry]
            ] = []
            skipped_local: list[str] = []

            for meta_score, idx, s, meta_norm in candidates:
                path = s.get("path")
                if not isinstance(path, str) or not path:
                    continue
                try:
                    doc = self._read_doc(path)
                except Exception as exc:
                    skipped_local.append(f"{path}: {exc}")
                    continue

                para_hits: list[tuple[int, int]] = []

                # If a query term already matches the source metadata
                # (title/filename/source label), don't require it to appear in
                # every paragraph. This prevents broad doc-selection hints
                # (e.g. "Transformer") from excluding the exact paragraph we want.
                term_pats_doc = term_pats
                if meta_score > 0 and meta_norm:
                    filtered = [pat for pat in term_pats if pat not in meta_norm]
                    if filtered:
                        term_pats_doc = filtered

                threshold_doc = min(int(threshold), len(term_pats_doc))
                if threshold_doc < 1:
                    threshold_doc = 1
                for p_idx, norm in enumerate(doc.normalized_paragraphs):
                    score = 0
                    for pat in term_pats_doc:
                        if pat in norm:
                            score += 1
                    if score >= threshold_doc:
                        para_hits.append((score, p_idx))

                if not para_hits:
                    continue

                para_hits.sort(key=lambda x: (-x[0], x[1]))
                top = para_hits[: cfg.max_paragraphs_per_source]

                best = int(top[0][0])
                total = int(sum(score for score, _ in top))
                matches_local.append((best, total, meta_score, idx, s, top, doc))

            return matches_local, skipped_local

        matches, skipped = _scan(effective_min_matches)
        used_threshold = int(effective_min_matches)
        if not matches and effective_min_matches > 1:
            # Fallback: if the threshold filters out everything, relax to 1 so
            # we can still retrieve entity hits like "AlphaGo" even when other
            # query terms are generic.
            matches, skipped = _scan(1)
            used_threshold = 1

        matches.sort(key=lambda x: (-x[0], -x[1], -x[2], x[3]))
        selected = matches[: cfg.k]

        if not selected:
            debug = []
            if cfg.debug:
                debug = [
                    f"lightRAG: terms={terms!r}",
                    f"lightRAG: candidates={len(candidates)} scanned=0 selected=0 skipped={len(skipped)}",
                ]
                if skipped:
                    debug.append("lightRAG: skipped (read errors):")
                    debug.extend([f" - {s}" for s in skipped[:20]])
            return None, debug

        total_remaining = int(cfg.total_chars)
        blocks: list[str] = [
            "Retrieved excerpts (hints for where to look - verify against your full memory of the documents):",
            "",
        ]

        debug_lines: list[str] = []
        if cfg.debug:
            debug_lines.append(f"lightRAG: terms={terms!r}")
            debug_lines.append(f"lightRAG: min_matches_used={used_threshold}")
            debug_lines.append(
                f"lightRAG: candidates={len(candidates)} scanned={len(matches)} selected={min(len(selected), cfg.k)} skipped={len(skipped)}"
            )

        included = 0
        for best, total, meta_score, idx, s, top, doc in selected:
            if total_remaining <= 0:
                break

            per_remaining = min(int(cfg.per_source_chars), total_remaining)
            if per_remaining <= 0:
                break

            para_indices = sorted({p_idx for _, p_idx in top})
            parts: list[str] = []
            used = 0

            for p_idx in para_indices:
                para = doc.paragraphs[p_idx].strip()
                if not para:
                    continue

                sep = "\n\n" if parts else ""
                avail = per_remaining - used - len(sep)
                if avail <= 0:
                    break

                clip_limit = min(int(cfg.max_paragraph_chars), avail)
                if quote_task:
                    clipped = _select_sentence_snippet(para, terms=terms, max_chars=clip_limit)
                else:
                    clipped = _truncate_text(para, clip_limit)
                if not clipped:
                    break

                parts.append(sep + clipped)
                used += len(sep) + len(clipped)
                if used >= per_remaining:
                    break

            excerpt = "".join(parts).strip()
            if not excerpt:
                continue

            included += 1
            total_remaining -= used

            attrs: list[str] = []
            path = s.get("path")
            if isinstance(path, str) and path:
                attrs.append(f"path={json.dumps(path, ensure_ascii=False)}")
            title = s.get("title")
            if isinstance(title, str) and title.strip():
                attrs.append(f"title={json.dumps(title.strip(), ensure_ascii=False)}")
            src = s.get("source")
            if isinstance(src, str) and src.strip():
                attrs.append(f"source={json.dumps(src.strip(), ensure_ascii=False)}")
            url = s.get("url")
            if isinstance(url, str) and url.strip():
                attrs.append(f"url={json.dumps(url.strip(), ensure_ascii=False)}")

            blocks.append(f"[SOURCE {' '.join(attrs)}]")
            blocks.append(excerpt)
            blocks.append("[/SOURCE]")
            blocks.append("")

            if cfg.debug:
                debug_lines.append(
                    f"lightRAG: + {path} best={best} total={total} meta={meta_score} paras={len(para_indices)} chars={used}"
                )

        if included == 0:
            return None, debug_lines

        if cfg.debug and skipped:
            debug_lines.append("lightRAG: skipped (read errors):")
            debug_lines.extend([f" - {s}" for s in skipped[:20]])

        msg = "\n".join(blocks).rstrip() + "\n"
        return msg, debug_lines
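
For orientation, a minimal usage sketch of the retriever above (an editor's illustration, not part of the wheel; the sources records, file paths, and question are hypothetical, and only the path/title/source/url keys are read by the module):

    from apps.cli.light_rag import LightRagConfig, LightRagRetriever

    retriever = LightRagRetriever(cache_docs=64)
    sources = [
        # Hypothetical ingested documents; "path" must point at readable text files.
        {"path": "/docs/alphago.txt", "title": "AlphaGo", "source": "wiki"},
        {"path": "/docs/transformers.txt", "title": "Attention Is All You Need"},
    ]
    msg, debug = retriever.build_retrieved_excerpts_message(
        question="Quote the exact sentence where AlphaGo is first mentioned.",
        sources=sources,
        config=LightRagConfig(k=3, debug=True),
    )
    if msg is not None:
        print(msg)       # "[SOURCE ...]" blocks, ready to prepend to a prompt
    for line in debug:
        print(line)      # "lightRAG: ..." diagnostics (populated when debug=True)

Because the question contains "quote"/"exact"/"sentence", _looks_like_quote_task returns True and each excerpt is clipped to whole sentences via _select_sentence_snippet instead of being truncated with an ellipsis.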