superlinear 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. apps/__init__.py +4 -0
  2. apps/cli/__init__.py +8 -0
  3. apps/cli/bm25_rag.py +471 -0
  4. apps/cli/chat_repl.py +1497 -0
  5. apps/cli/client.py +195 -0
  6. apps/cli/docs_repl.py +2275 -0
  7. apps/cli/light_rag.py +729 -0
  8. apps/cli/local_snapshots.py +139 -0
  9. apps/cli/locks.py +214 -0
  10. apps/cli/main.py +457 -0
  11. apps/cli/output.py +32 -0
  12. apps/cli/server_cmds.py +516 -0
  13. apps/cli/session_cmds.py +491 -0
  14. apps/cli/snapshot_cmds.py +303 -0
  15. apps/cli/state.py +265 -0
  16. apps/server/__init__.py +4 -0
  17. apps/server/app.py +1363 -0
  18. apps/server/main.py +313 -0
  19. superlinear/__init__.py +114 -0
  20. superlinear/_version.py +3 -0
  21. superlinear/engine/__init__.py +10 -0
  22. superlinear/engine/adapters/__init__.py +12 -0
  23. superlinear/engine/adapters/base.py +91 -0
  24. superlinear/engine/adapters/superlinear.py +1233 -0
  25. superlinear/engine/chat_engine.py +1173 -0
  26. superlinear/engine/chat_types.py +130 -0
  27. superlinear/engine/registry.py +51 -0
  28. superlinear/engine/repetition.py +203 -0
  29. superlinear/engine/session_snapshots.py +451 -0
  30. superlinear/engine/tool_parser.py +83 -0
  31. superlinear/engine/types.py +42 -0
  32. superlinear/kernels/__init__.py +2 -0
  33. superlinear/kernels/common/__init__.py +21 -0
  34. superlinear/kernels/common/adjustment.py +106 -0
  35. superlinear/kernels/common/power.py +154 -0
  36. superlinear/kernels/superlinear/__init__.py +10 -0
  37. superlinear/kernels/superlinear/attention/__init__.py +78 -0
  38. superlinear/kernels/superlinear/attention/_prefill.py +940 -0
  39. superlinear/kernels/superlinear/attention/_sliding_window.py +1167 -0
  40. superlinear/kernels/superlinear/attention/api.py +433 -0
  41. superlinear/kernels/superlinear/search/__init__.py +33 -0
  42. superlinear/kernels/superlinear/search/_reference.py +204 -0
  43. superlinear/kernels/superlinear/search/_triton.py +488 -0
  44. superlinear/kernels/superlinear/search/_triton_gqa.py +534 -0
  45. superlinear/kernels/superlinear/search/api.py +200 -0
  46. superlinear/kernels/superlinear/span/__init__.py +41 -0
  47. superlinear/kernels/superlinear/span/_triton_bucketed_gqa.py +1461 -0
  48. superlinear/kernels/superlinear/span/_triton_forward.py +22 -0
  49. superlinear/kernels/superlinear/span/_triton_gqa.py +1226 -0
  50. superlinear/kernels/superlinear/span/_triton_impl.py +928 -0
  51. superlinear/kernels/superlinear/span/_triton_precomputed_sw.py +460 -0
  52. superlinear/kernels/superlinear/span/_triton_precomputed_sw_gqa.py +598 -0
  53. superlinear/kernels/superlinear/span/api.py +296 -0
  54. superlinear/kernels/superlinear/span/masks.py +187 -0
  55. superlinear/py.typed +0 -0
  56. superlinear/runtime.py +71 -0
  57. superlinear-0.1.0.dist-info/METADATA +469 -0
  58. superlinear-0.1.0.dist-info/RECORD +62 -0
  59. superlinear-0.1.0.dist-info/WHEEL +5 -0
  60. superlinear-0.1.0.dist-info/entry_points.txt +2 -0
  61. superlinear-0.1.0.dist-info/licenses/LICENSE +202 -0
  62. superlinear-0.1.0.dist-info/top_level.txt +2 -0
apps/cli/light_rag.py ADDED
@@ -0,0 +1,729 @@
from __future__ import annotations

import json
import re
from collections import OrderedDict
from dataclasses import dataclass, replace
from pathlib import Path
from typing import Any


# Used for normalization/matching where we intentionally work in lowercase.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")
# Used only for extracting original-cased query tokens (e.g. "Go", "AI").
_NON_ALNUM_RE_ORIG = re.compile(r"[^A-Za-z0-9]+")
_MULTISPACE_RE = re.compile(r"\s+")
_PARA_SPLIT_RE = re.compile(r"\n\s*\n+")


_SHORT_QUERY_TERMS_2 = frozenset(
    {
        "ai",
        "go",
        "ml",
        "rl",
    }
)


_STOPWORDS = frozenset(
    {
        "a",
        "about",
        "above",
        "after",
        "again",
        "against",
        "all",
        "also",
        "am",
        "an",
        "and",
        "any",
        "are",
        "as",
        "at",
        "be",
        "because",
        "been",
        "before",
        "being",
        "below",
        "between",
        "both",
        "but",
        "by",
        "can",
        "could",
        "did",
        "do",
        "does",
        "doing",
        "down",
        "during",
        "each",
        "few",
        "for",
        "from",
        "further",
        "had",
        "has",
        "have",
        "having",
        "he",
        "her",
        "here",
        "hers",
        "herself",
        "him",
        "himself",
        "his",
        "how",
        "i",
        "if",
        "in",
        "into",
        "is",
        "it",
        "its",
        "itself",
        "just",
        "me",
        "more",
        "most",
        "my",
        "myself",
        "no",
        "nor",
        "not",
        "now",
        "of",
        "off",
        "on",
        "once",
        "only",
        "or",
        "other",
        "our",
        "ours",
        "ourselves",
        "out",
        "over",
        "own",
        "same",
        "she",
        "should",
        "so",
        "some",
        "such",
        "than",
        "that",
        "the",
        "their",
        "theirs",
        "them",
        "themselves",
        "then",
        "there",
        "these",
        "they",
        "this",
        "those",
        "through",
        "to",
        "too",
        "under",
        "until",
        "up",
        "very",
        "was",
        "we",
        "were",
        "what",
        "when",
        "where",
        "which",
        "while",
        "who",
        "whom",
        "why",
        "with",
        "would",
        "you",
        "your",
        "yours",
        "yourself",
        "yourselves",

        # Query-instruction / meta words (common in docs REPL prompts)
        "article",
        "articles",
        "mention",
        "mentions",
        "quoted",
        "quote",
        "quotes",
        "exact",
        "exactly",
        "sentence",
        "sentences",
        "substring",
        "end",
        "sources",
        "path",
        "containing",
        "contains",
        "yes",

        # More prompt-instruction words that otherwise pollute retrieval.
        "name",
        "short",
        "shorter",
        "shortest",
        "line",
        "lines",
        "fragment",
        "fragments",
        "sense",
        "verb",
        "verbatim",

        # More meta words common in evaluation prompts
        "ingested",
        "difference",
        "answer",
        "include",
        "available",
        "term",
        "terms",
        "one",

        # Prompt verbs/nouns that shouldn't drive lexical retrieval
        "find",
        "passage",
        "define",
        "defines",
        "definition",
        "imply",
        "implies",
        "say",
        "says",

        "explicitly",
        "explain",
        "extra",
        "detail",
        "details",

        "full",
        "ellipsis",
        "ellipses",
        "paraphrase",
        "using",
        "word",
        "words",
        "present",

        # Constraint/policy words frequently used in prompts
        "must",
        "found",

        # Generic question framing words that rarely help lexical retrieval
        "intuition",
        "problem",
        "problems",
        "solve",
        "solves",
        "solving",

        # Common prompt directives that shouldn't influence retrieval
        "use",
        "used",
        "provide",
        "provided",
        "providing",
        "retrieve",
        "retrieved",
        "excerpt",
        "excerpts",
        "copy",
        "copied",
        "given",

        # Conversation glue words (common in multi-turn follow-ups)
        "tell",
        "mentioned",
    }
)


_QUOTEY_RE = re.compile(
    r"\b(quote|verbatim|exact|substring|sentence|sentences|fragment|fragments|no\s+ellipses)\b",
    re.IGNORECASE,
)


def _looks_like_quote_task(question: str) -> bool:
    return bool(_QUOTEY_RE.search(question or ""))
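

# Example (editor's illustration, not part of the original module): the
# quote-task heuristic fires on any quoting-related keyword in the prompt.
#
# >>> _looks_like_quote_task("Quote the exact sentence, verbatim.")
# True
# >>> _looks_like_quote_task("Summarize the key ideas.")
# False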


def _split_into_sentences(text: str) -> list[str]:
    """Best-effort sentence splitter for excerpt selection.

    We intentionally keep it simple (no NLP deps). Newlines also act as boundaries.
    """

    t = (text or "").replace("\r", "")
    if not t:
        return []

    out: list[str] = []
    start = 0
    i = 0
    n = len(t)
    while i < n:
        ch = t[i]
        if ch == "\n":
            seg = t[start:i].strip()
            if seg:
                out.append(seg)
            start = i + 1
        elif ch in {".", "!", "?"}:
            # End sentence at punctuation; include it.
            end = i + 1
            seg = t[start:end].strip()
            if seg:
                out.append(seg)
            # Skip trailing whitespace.
            j = end
            while j < n and t[j].isspace() and t[j] != "\n":
                j += 1
            start = j
            i = j - 1
        i += 1

    tail = t[start:].strip()
    if tail:
        out.append(tail)
    return out
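

# Example (editor's illustration, not part of the original module): newlines
# and terminal punctuation both end a sentence, and trailing text without
# punctuation is kept as a final segment.
#
# >>> _split_into_sentences("First one. Second one!\nThird")
# ['First one.', 'Second one!', 'Third']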


def _select_sentence_snippet(text: str, *, terms: list[str], max_chars: int) -> str:
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text

    sentences = _split_into_sentences(text)
    if not sentences:
        return _truncate_text(text, max_chars)

    # Prefer the shortest sentence that still matches the most terms.
    best: tuple[int, int, str] | None = None  # (score, -len, sentence)
    for s in sentences:
        s_l = s.lower()
        score = sum(1 for t in terms if t and t in s_l)
        if score <= 0:
            continue
        cand = (score, -len(s), s)
        if best is None or cand > best:
            best = cand

    chosen = (best[2] if best is not None else sentences[0]).strip()
    if len(chosen) <= max_chars:
        return chosen

    # Last resort: clip without adding an ellipsis (to avoid models copying it
    # into "verbatim" quotes).
    return chosen[:max_chars].rstrip()
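

# Example (editor's illustration, not part of the original module): given
# terms=["attention"], the shortest sentence matching the most terms wins, so
# an over-budget paragraph collapses to the one relevant sentence.
#
# >>> _select_sentence_snippet(
# ...     "Filler text here. Attention is all you need.",
# ...     terms=["attention"],
# ...     max_chars=40,
# ... )
# 'Attention is all you need.'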


def tokenize_query_terms(text: str, *, max_terms: int = 32) -> list[str]:
    if not text or not isinstance(text, str):
        return []

    # Preserve original casing for acronym detection.
    raw_terms_original = _NON_ALNUM_RE_ORIG.sub(" ", text).split()
    cleaned = _NON_ALNUM_RE.sub(" ", text.lower())
    raw_terms = cleaned.split()

    out: list[str] = []
    seen: set[str] = set()
    for i, term in enumerate(raw_terms):
        # Keep short terms (2 chars) only if they look like acronyms/proper nouns
        # (original had uppercase), e.g. "Go", "AI", "ML".
        original = raw_terms_original[i] if i < len(raw_terms_original) else term
        is_likely_name = len(original) >= 2 and original[0].isupper()
        keep_short = term in _SHORT_QUERY_TERMS_2

        if len(term) < 2:
            continue
        if len(term) == 2 and not (is_likely_name or keep_short):
            continue
        if term in _STOPWORDS:
            continue
        if term in seen:
            continue
        seen.add(term)
        out.append(term)
        if len(out) >= int(max_terms):
            break
    return out
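

# Example (editor's illustration, not part of the original module): stopwords
# and prompt-meta words drop out, while capitalized two-letter tokens such as
# "Go" and "AI" survive the short-term filter.
#
# >>> tokenize_query_terms("What does the article say about Go and AI?")
# ['go', 'ai']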


def tokenize_rag_text(text: str, *, max_tokens: int | None = None) -> list[str]:
    """Tokenize arbitrary text for lexical/BM25 retrieval.

    The tokenization policy matches `tokenize_query_terms`, except:
    - tokens are **not** deduplicated (BM25 needs term frequencies)
    - the output can optionally be limited by `max_tokens`
    """

    if not text or not isinstance(text, str):
        return []

    raw_terms_original = _NON_ALNUM_RE_ORIG.sub(" ", text).split()
    cleaned = _NON_ALNUM_RE.sub(" ", text.lower())
    raw_terms = cleaned.split()

    out: list[str] = []
    for i, term in enumerate(raw_terms):
        original = raw_terms_original[i] if i < len(raw_terms_original) else term
        is_likely_name = len(original) >= 2 and original[0].isupper()
        keep_short = term in _SHORT_QUERY_TERMS_2

        if len(term) < 2:
            continue
        if len(term) == 2 and not (is_likely_name or keep_short):
            continue
        if term in _STOPWORDS:
            continue

        out.append(term)
        if max_tokens is not None and len(out) >= int(max_tokens):
            break

    return out
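

# Example (editor's illustration, not part of the original module): unlike
# tokenize_query_terms, repeated tokens are kept, since BM25 scoring needs
# per-document term frequencies.
#
# >>> tokenize_rag_text("Transformers, transformers, transformers everywhere")
# ['transformers', 'transformers', 'transformers', 'everywhere']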


def split_paragraphs(text: str) -> list[str]:
    if not text or not isinstance(text, str):
        return []
    normalized = text.replace("\r", "").strip()
    if not normalized:
        return []
    parts = _PARA_SPLIT_RE.split(normalized)
    return [p.strip() for p in parts if p and p.strip()]


def _normalize_for_matching(text: str) -> str:
    cleaned = _NON_ALNUM_RE.sub(" ", (text or "").lower())
    cleaned = _MULTISPACE_RE.sub(" ", cleaned).strip()
    return f" {cleaned} " if cleaned else " "
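

# Example (editor's illustration, not part of the original module): the
# padded, punctuation-free form allows cheap whole-word matching via plain
# substring tests.
#
# >>> _normalize_for_matching("Going to a Go meetup!")
# ' going to a go meetup '
# >>> " go " in _normalize_for_matching("Going strong")
# False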


def _truncate_text(text: str, max_chars: int) -> str:
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text
    if max_chars == 1:
        return text[:1]
    return text[: max_chars - 1].rstrip() + "…"


def _coerce_int(v: Any, *, default: int, min_v: int, max_v: int) -> int:
    try:
        n = int(v)
    except Exception:
        return default
    if n < min_v:
        return min_v
    if n > max_v:
        return max_v
    return n


@dataclass(frozen=True)
class LightRagConfig:
    enabled: bool = True
    k: int = 5
    total_chars: int = 12000
    per_source_chars: int = 2600
    debug: bool = False

    candidate_sources: int = 20
    max_terms: int = 32
    max_paragraphs_per_source: int = 8
    max_paragraph_chars: int = 1200
    min_term_matches: int = 2  # Only include excerpts matching at least this many query terms

    def sanitized(self) -> "LightRagConfig":
        return replace(
            self,
            k=_coerce_int(self.k, default=5, min_v=1, max_v=50),
            total_chars=_coerce_int(self.total_chars, default=12000, min_v=200, max_v=200000),
            per_source_chars=_coerce_int(self.per_source_chars, default=2600, min_v=50, max_v=50000),
            candidate_sources=_coerce_int(self.candidate_sources, default=20, min_v=1, max_v=200),
            max_terms=_coerce_int(self.max_terms, default=32, min_v=1, max_v=128),
            max_paragraphs_per_source=_coerce_int(self.max_paragraphs_per_source, default=8, min_v=1, max_v=64),
            max_paragraph_chars=_coerce_int(self.max_paragraph_chars, default=1200, min_v=50, max_v=20000),
        )
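

# Example (editor's illustration, not part of the original module):
# sanitized() clamps out-of-range values instead of raising, so a
# caller-supplied config is always safe to use.
#
# >>> cfg = LightRagConfig(k=999, total_chars=0).sanitized()
# >>> (cfg.k, cfg.total_chars)
# (50, 200)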


@dataclass
class _DocCacheEntry:
    mtime_ns: int
    paragraphs: list[str]
    normalized_paragraphs: list[str]


class LightRagRetriever:
    def __init__(self, *, cache_docs: int = 64) -> None:
        self._cache_docs = int(cache_docs)
        self._cache: OrderedDict[str, _DocCacheEntry] = OrderedDict()

    def clear_cache(self) -> None:
        self._cache.clear()

    def _read_doc(self, path_str: str) -> _DocCacheEntry:
        path = Path(path_str)
        st = path.stat()
        mtime_ns = int(st.st_mtime_ns)

        cached = self._cache.get(path_str)
        if cached is not None and cached.mtime_ns == mtime_ns:
            self._cache.move_to_end(path_str)
            return cached

        data = path.read_bytes()
        if b"\x00" in data:
            raise ValueError("refusing to read binary file (NUL byte found)")
        text = data.decode("utf-8", errors="replace")

        paragraphs = split_paragraphs(text)
        normalized_paragraphs = [_normalize_for_matching(p) for p in paragraphs]
        entry = _DocCacheEntry(mtime_ns=mtime_ns, paragraphs=paragraphs, normalized_paragraphs=normalized_paragraphs)

        self._cache[path_str] = entry
        self._cache.move_to_end(path_str)
        while len(self._cache) > self._cache_docs:
            self._cache.popitem(last=False)

        return entry
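
    # Editor's sketch (not part of the original module; the path is
    # hypothetical): the cache is keyed by path and invalidated via
    # st_mtime_ns, so repeated retrievals over an unchanged corpus parse each
    # document only once, with LRU eviction beyond `cache_docs` entries.
    #
    #     retriever = LightRagRetriever(cache_docs=128)
    #     entry = retriever._read_doc("/tmp/notes.txt")  # reads and parses the file
    #     entry = retriever._read_doc("/tmp/notes.txt")  # served from the LRU cache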

    def build_retrieved_excerpts_message(
        self,
        *,
        question: str,
        sources: list[dict[str, Any]],
        config: LightRagConfig,
    ) -> tuple[str | None, list[str]]:
        cfg = config.sanitized()
        if not cfg.enabled:
            return None, []

        quote_task = _looks_like_quote_task(question)

        terms = tokenize_query_terms(question, max_terms=cfg.max_terms)
        if not terms:
            return None, []

        term_pats = [f" {t} " for t in terms]
        # Adaptive threshold: for short queries, require fewer matches.
        # Note: some queries include generic terms (e.g. "articles", "mention") that won't
        # appear in the docs; we apply a fallback later if the threshold filters everything out.
        effective_min_matches = min(cfg.min_term_matches, len(terms))

        meta_scored: list[tuple[int, int, dict[str, Any], str]] = []
        for idx, s in enumerate(sources):
            path = s.get("path")
            if not isinstance(path, str) or not path:
                continue

            title = s.get("title") if isinstance(s.get("title"), str) else ""
            src = s.get("source") if isinstance(s.get("source"), str) else ""
            meta_text = f"{title}\n{Path(path).name}\n{src}"
            meta_norm = _normalize_for_matching(meta_text)
            meta_score = sum(1 for pat in term_pats if pat in meta_norm)
            meta_scored.append((int(meta_score), idx, s, meta_norm))

        if not meta_scored:
            return None, []

        meta_scored.sort(key=lambda x: (-x[0], x[1]))

        # If the query terms do not appear in titles/filenames/source labels, meta scoring
        # provides no useful signal (many/most scores will be 0). In that case, avoid
        # over-pruning to the first N sources, which can miss the relevant doc purely due
        # to ingestion order.
        if meta_scored[0][0] <= 0:
            max_candidates = max(int(cfg.candidate_sources), 200)
            candidates = meta_scored[: min(len(meta_scored), max_candidates)]
        else:
            candidates = meta_scored[: cfg.candidate_sources]

        def _scan(threshold: int) -> tuple[
            list[tuple[int, int, int, int, dict[str, Any], list[tuple[int, int]], _DocCacheEntry]],
            list[str],
        ]:
            matches_local: list[
                tuple[int, int, int, int, dict[str, Any], list[tuple[int, int]], _DocCacheEntry]
            ] = []
            skipped_local: list[str] = []

            for meta_score, idx, s, meta_norm in candidates:
                path = s.get("path")
                if not isinstance(path, str) or not path:
                    continue
                try:
                    doc = self._read_doc(path)
                except Exception as exc:
                    skipped_local.append(f"{path}: {exc}")
                    continue

                para_hits: list[tuple[int, int]] = []

                # If a query term already matches the source metadata (title/filename/source label),
                # don't require it to appear in every paragraph. This prevents broad doc-selection
                # hints (e.g. "Transformer") from excluding the exact paragraph we want.
                term_pats_doc = term_pats
                if meta_score > 0 and meta_norm:
                    filtered = [pat for pat in term_pats if pat not in meta_norm]
                    if filtered:
                        term_pats_doc = filtered

                threshold_doc = min(int(threshold), len(term_pats_doc))
                if threshold_doc < 1:
                    threshold_doc = 1
                for p_idx, norm in enumerate(doc.normalized_paragraphs):
                    score = 0
                    for pat in term_pats_doc:
                        if pat in norm:
                            score += 1
                    if score >= threshold_doc:
                        para_hits.append((score, p_idx))

                if not para_hits:
                    continue

                para_hits.sort(key=lambda x: (-x[0], x[1]))
                top = para_hits[: cfg.max_paragraphs_per_source]

                best = int(top[0][0])
                total = int(sum(score for score, _ in top))
                matches_local.append((best, total, meta_score, idx, s, top, doc))

            return matches_local, skipped_local

        matches, skipped = _scan(effective_min_matches)
        used_threshold = int(effective_min_matches)
        if not matches and effective_min_matches > 1:
            # Fallback: if the threshold filters out everything, relax to 1 so we can still
            # retrieve entity hits like "AlphaGo" even when other query terms are generic.
            matches, skipped = _scan(1)
            used_threshold = 1

        matches.sort(key=lambda x: (-x[0], -x[1], -x[2], x[3]))
        selected = matches[: cfg.k]

        if not selected:
            debug = []
            if cfg.debug:
                debug = [
                    f"lightRAG: terms={terms!r}",
                    f"lightRAG: candidates={len(candidates)} scanned=0 selected=0 skipped={len(skipped)}",
                ]
                if skipped:
                    debug.append("lightRAG: skipped (read errors):")
                    debug.extend([f" - {s}" for s in skipped[:20]])
            return None, debug

        total_remaining = int(cfg.total_chars)
        blocks: list[str] = [
            "Retrieved excerpts (hints for where to look - verify against your full memory of the documents):",
            "",
        ]

        debug_lines: list[str] = []
        if cfg.debug:
            debug_lines.append(f"lightRAG: terms={terms!r}")
            debug_lines.append(f"lightRAG: min_matches_used={used_threshold}")
            debug_lines.append(
                f"lightRAG: candidates={len(candidates)} scanned={len(matches)} selected={min(len(selected), cfg.k)} skipped={len(skipped)}"
            )

        included = 0
        for best, total, meta_score, idx, s, top, doc in selected:
            if total_remaining <= 0:
                break

            per_remaining = min(int(cfg.per_source_chars), total_remaining)
            if per_remaining <= 0:
                break

            para_indices = sorted({p_idx for _, p_idx in top})
            parts: list[str] = []
            used = 0

            for p_idx in para_indices:
                para = doc.paragraphs[p_idx].strip()
                if not para:
                    continue

                sep = "\n\n" if parts else ""
                avail = per_remaining - used - len(sep)
                if avail <= 0:
                    break

                clip_limit = min(int(cfg.max_paragraph_chars), avail)
                if quote_task:
                    clipped = _select_sentence_snippet(para, terms=terms, max_chars=clip_limit)
                else:
                    clipped = _truncate_text(para, clip_limit)
                if not clipped:
                    break

                parts.append(sep + clipped)
                used += len(sep) + len(clipped)
                if used >= per_remaining:
                    break

            excerpt = "".join(parts).strip()
            if not excerpt:
                continue

            included += 1
            total_remaining -= used

            attrs: list[str] = []
            path = s.get("path")
            if isinstance(path, str) and path:
                attrs.append(f"path={json.dumps(path, ensure_ascii=False)}")
            title = s.get("title")
            if isinstance(title, str) and title.strip():
                attrs.append(f"title={json.dumps(title.strip(), ensure_ascii=False)}")
            src = s.get("source")
            if isinstance(src, str) and src.strip():
                attrs.append(f"source={json.dumps(src.strip(), ensure_ascii=False)}")
            url = s.get("url")
            if isinstance(url, str) and url.strip():
                attrs.append(f"url={json.dumps(url.strip(), ensure_ascii=False)}")

            blocks.append(f"[SOURCE {' '.join(attrs)}]")
            blocks.append(excerpt)
            blocks.append("[/SOURCE]")
            blocks.append("")

            if cfg.debug:
                debug_lines.append(
                    f"lightRAG: + {path} best={best} total={total} meta={meta_score} paras={len(para_indices)} chars={used}"
                )

        if included == 0:
            return None, debug_lines

        if cfg.debug and skipped:
            debug_lines.append("lightRAG: skipped (read errors):")
            debug_lines.extend([f" - {s}" for s in skipped[:20]])

        msg = "\n".join(blocks).rstrip() + "\n"
        return msg, debug_lines
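

# End-to-end sketch (editor's illustration, not part of the original module;
# paths are hypothetical). Each source is a plain dict: "path" is required,
# while "title", "source", and "url" only feed metadata scoring and the
# [SOURCE ...] attribution line.
#
#     retriever = LightRagRetriever()
#     msg, debug = retriever.build_retrieved_excerpts_message(
#         question="Which article mentions AlphaGo? Quote the exact sentence.",
#         sources=[
#             {"path": "/docs/alphago.md", "title": "AlphaGo"},
#             {"path": "/docs/attention.md", "title": "Attention Is All You Need"},
#         ],
#         config=LightRagConfig(k=3, debug=True),
#     )
#     if msg is not None:
#         ...  # prepend msg to the model prompt as retrieval hints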