superlinear-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apps/__init__.py +4 -0
- apps/cli/__init__.py +8 -0
- apps/cli/bm25_rag.py +471 -0
- apps/cli/chat_repl.py +1497 -0
- apps/cli/client.py +195 -0
- apps/cli/docs_repl.py +2275 -0
- apps/cli/light_rag.py +729 -0
- apps/cli/local_snapshots.py +139 -0
- apps/cli/locks.py +214 -0
- apps/cli/main.py +457 -0
- apps/cli/output.py +32 -0
- apps/cli/server_cmds.py +516 -0
- apps/cli/session_cmds.py +491 -0
- apps/cli/snapshot_cmds.py +303 -0
- apps/cli/state.py +265 -0
- apps/server/__init__.py +4 -0
- apps/server/app.py +1363 -0
- apps/server/main.py +313 -0
- superlinear/__init__.py +114 -0
- superlinear/_version.py +3 -0
- superlinear/engine/__init__.py +10 -0
- superlinear/engine/adapters/__init__.py +12 -0
- superlinear/engine/adapters/base.py +91 -0
- superlinear/engine/adapters/superlinear.py +1233 -0
- superlinear/engine/chat_engine.py +1173 -0
- superlinear/engine/chat_types.py +130 -0
- superlinear/engine/registry.py +51 -0
- superlinear/engine/repetition.py +203 -0
- superlinear/engine/session_snapshots.py +451 -0
- superlinear/engine/tool_parser.py +83 -0
- superlinear/engine/types.py +42 -0
- superlinear/kernels/__init__.py +2 -0
- superlinear/kernels/common/__init__.py +21 -0
- superlinear/kernels/common/adjustment.py +106 -0
- superlinear/kernels/common/power.py +154 -0
- superlinear/kernels/superlinear/__init__.py +10 -0
- superlinear/kernels/superlinear/attention/__init__.py +78 -0
- superlinear/kernels/superlinear/attention/_prefill.py +940 -0
- superlinear/kernels/superlinear/attention/_sliding_window.py +1167 -0
- superlinear/kernels/superlinear/attention/api.py +433 -0
- superlinear/kernels/superlinear/search/__init__.py +33 -0
- superlinear/kernels/superlinear/search/_reference.py +204 -0
- superlinear/kernels/superlinear/search/_triton.py +488 -0
- superlinear/kernels/superlinear/search/_triton_gqa.py +534 -0
- superlinear/kernels/superlinear/search/api.py +200 -0
- superlinear/kernels/superlinear/span/__init__.py +41 -0
- superlinear/kernels/superlinear/span/_triton_bucketed_gqa.py +1461 -0
- superlinear/kernels/superlinear/span/_triton_forward.py +22 -0
- superlinear/kernels/superlinear/span/_triton_gqa.py +1226 -0
- superlinear/kernels/superlinear/span/_triton_impl.py +928 -0
- superlinear/kernels/superlinear/span/_triton_precomputed_sw.py +460 -0
- superlinear/kernels/superlinear/span/_triton_precomputed_sw_gqa.py +598 -0
- superlinear/kernels/superlinear/span/api.py +296 -0
- superlinear/kernels/superlinear/span/masks.py +187 -0
- superlinear/py.typed +0 -0
- superlinear/runtime.py +71 -0
- superlinear-0.1.0.dist-info/METADATA +469 -0
- superlinear-0.1.0.dist-info/RECORD +62 -0
- superlinear-0.1.0.dist-info/WHEEL +5 -0
- superlinear-0.1.0.dist-info/entry_points.txt +2 -0
- superlinear-0.1.0.dist-info/licenses/LICENSE +202 -0
- superlinear-0.1.0.dist-info/top_level.txt +2 -0
apps/__init__.py
ADDED
apps/cli/__init__.py
ADDED
apps/cli/bm25_rag.py
ADDED
@@ -0,0 +1,471 @@
+from __future__ import annotations
+
+import importlib
+import json
+import re
+import time
+from dataclasses import dataclass, replace
+from pathlib import Path
+from typing import Any
+
+from apps.cli.light_rag import split_paragraphs, tokenize_query_terms, tokenize_rag_text
+
+
+_QUOTEY_RE = re.compile(
+    r"\b(quote|verbatim|exact|substring|sentence|sentences|fragment|fragments|no\s+ellipses)\b",
+    re.IGNORECASE,
+)
+
+
+def _looks_like_quote_task(question: str) -> bool:
+    return bool(_QUOTEY_RE.search(question or ""))
+
+
+def _split_into_sentences(text: str) -> list[str]:
+    t = (text or "").replace("\r", "")
+    if not t:
+        return []
+
+    out: list[str] = []
+    start = 0
+    i = 0
+    n = len(t)
+    while i < n:
+        ch = t[i]
+        if ch == "\n":
+            seg = t[start:i].strip()
+            if seg:
+                out.append(seg)
+            start = i + 1
+        elif ch in {".", "!", "?"}:
+            end = i + 1
+            seg = t[start:end].strip()
+            if seg:
+                out.append(seg)
+            j = end
+            while j < n and t[j].isspace() and t[j] != "\n":
+                j += 1
+            start = j
+            i = j - 1
+        i += 1
+
+    tail = t[start:].strip()
+    if tail:
+        out.append(tail)
+    return out
+
+
+def _truncate_text(text: str, max_chars: int) -> str:
+    if max_chars <= 0:
+        return ""
+    if len(text) <= max_chars:
+        return text
+    if max_chars == 1:
+        return text[:1]
+    return text[: max_chars - 1].rstrip() + "…"
+
+
+def _select_sentence_snippet(text: str, *, terms: list[str], max_chars: int) -> str:
+    if max_chars <= 0:
+        return ""
+    if len(text) <= max_chars:
+        return text
+
+    sentences = _split_into_sentences(text)
+    if not sentences:
+        return _truncate_text(text, max_chars)
+
+    best: tuple[int, int, str] | None = None  # (score, -len, sentence)
+    for s in sentences:
+        s_l = s.lower()
+        score = sum(1 for t in terms if t and t in s_l)
+        if score <= 0:
+            continue
+        cand = (score, -len(s), s)
+        if best is None or cand > best:
+            best = cand
+
+    chosen = (best[2] if best is not None else sentences[0]).strip()
+    if len(chosen) <= max_chars:
+        return chosen
+
+    # Clip without adding an ellipsis (to avoid models copying it into "verbatim" quotes).
+    return chosen[:max_chars].rstrip()
+
+
+def _coerce_int(v: Any, *, default: int, min_v: int, max_v: int) -> int:
+    try:
+        n = int(v)
+    except Exception:
+        return default
+    if n < min_v:
+        return min_v
+    if n > max_v:
+        return max_v
+    return n
+
+
+@dataclass(frozen=True)
+class Bm25RagConfig:
+    enabled: bool = True
+    k_sources: int = 5
+    total_chars: int = 12000
+    per_source_chars: int = 2600
+    debug: bool = False
+
+    k_paragraphs: int = 40
+    max_terms: int = 32
+    max_paragraphs_per_source: int = 8
+    max_paragraph_chars: int = 1200
+
+    def sanitized(self) -> "Bm25RagConfig":
+        return replace(
+            self,
+            k_sources=_coerce_int(self.k_sources, default=5, min_v=1, max_v=50),
+            total_chars=_coerce_int(self.total_chars, default=12000, min_v=200, max_v=200000),
+            per_source_chars=_coerce_int(self.per_source_chars, default=2600, min_v=50, max_v=50000),
+            k_paragraphs=_coerce_int(self.k_paragraphs, default=40, min_v=1, max_v=1000),
+            max_terms=_coerce_int(self.max_terms, default=32, min_v=1, max_v=256),
+            max_paragraphs_per_source=_coerce_int(
+                self.max_paragraphs_per_source, default=8, min_v=1, max_v=64
+            ),
+            max_paragraph_chars=_coerce_int(self.max_paragraph_chars, default=1200, min_v=50, max_v=20000),
+        )
+
+
+@dataclass(frozen=True)
+class _Paragraph:
+    path: str
+    paragraph_index: int
+    text: str
+
+
+class Bm25RagRetriever:
+    def __init__(self) -> None:
+        self._bm25_cls: type | None = None
+        self._bm25_import_error: str | None = None
+        self._index_key: str | None = None
+
+        self._paragraphs: list[_Paragraph] = []
+        self._paragraph_tokens: list[list[str]] = []
+        self._source_meta: dict[str, dict[str, Any]] = {}
+        self._bm25: Any | None = None
+        self._last_build_ms: int | None = None
+
+    def is_available(self) -> bool:
+        return self._get_bm25_cls() is not None
+
+    def last_build_stats(self) -> dict[str, Any]:
+        return {
+            "sources": len(self._source_meta),
+            "paragraphs": len(self._paragraphs),
+            "build_ms": self._last_build_ms,
+        }
+
+    def clear_index(self) -> None:
+        self._index_key = None
+        self._paragraphs = []
+        self._paragraph_tokens = []
+        self._source_meta = {}
+        self._bm25 = None
+        self._last_build_ms = None
+
+    def _get_bm25_cls(self) -> type | None:
+        if self._bm25_cls is not None:
+            return self._bm25_cls
+        if self._bm25_import_error is not None:
+            return None
+
+        try:
+            mod = importlib.import_module("rank_bm25")
+            cls = getattr(mod, "BM25Okapi", None)
+            if cls is None:
+                self._bm25_import_error = "rank_bm25.BM25Okapi not found"
+                return None
+            self._bm25_cls = cls
+            return cls
+        except Exception as exc:
+            self._bm25_import_error = str(exc)
+            return None
+
+    def _sources_key(self, sources: list[dict[str, Any]]) -> str:
+        # Use (path, sha256) when available so we can detect content changes across /add.
+        # Sort for stability.
+        items: list[tuple[str, str]] = []
+        for s in sources:
+            if not isinstance(s, dict):
+                continue
+            path = s.get("path")
+            if not isinstance(path, str) or not path:
+                continue
+            sha = s.get("sha256")
+            items.append((path, sha if isinstance(sha, str) else ""))
+        items.sort()
+        return json.dumps({"v": 1, "sources": items}, ensure_ascii=False, sort_keys=True)
+
+    def ensure_index(self, *, sources: list[dict[str, Any]], debug: bool = False) -> list[str]:
+        dbg: list[str] = []
+        bm25_cls = self._get_bm25_cls()
+        if bm25_cls is None:
+            if debug:
+                hint = (
+                    "bm25: unavailable (install `rank-bm25` to enable BM25 retrieval)"
+                    if self._bm25_import_error is None
+                    else f"bm25: unavailable ({self._bm25_import_error})"
+                )
+                dbg.append(hint)
+            self.clear_index()
+            return dbg
+
+        key = self._sources_key(sources)
+        if self._index_key == key and self._bm25 is not None:
+            return dbg
+
+        t0 = time.perf_counter()
+
+        paragraphs: list[_Paragraph] = []
+        paragraph_tokens: list[list[str]] = []
+        source_meta: dict[str, dict[str, Any]] = {}
+
+        skipped: list[str] = []
+
+        # Deduplicate by path; keep the last metadata entry for a path.
+        seen_paths: set[str] = set()
+        unique_sources: list[dict[str, Any]] = []
+        for s in reversed(sources):
+            if not isinstance(s, dict):
+                continue
+            path = s.get("path")
+            if not isinstance(path, str) or not path:
+                continue
+            if path in seen_paths:
+                continue
+            seen_paths.add(path)
+            unique_sources.append(s)
+        unique_sources.reverse()
+
+        for s in unique_sources:
+            path = s.get("path")
+            if not isinstance(path, str) or not path:
+                continue
+
+            title = s.get("title")
+            src = s.get("source")
+            url = s.get("url")
+            meta: dict[str, Any] = {"path": path}
+            if isinstance(title, str) and title.strip():
+                meta["title"] = title.strip()
+            if isinstance(src, str) and src.strip():
+                meta["source"] = src.strip()
+            if isinstance(url, str) and url.strip():
+                meta["url"] = url.strip()
+            source_meta[path] = meta
+
+            try:
+                data = Path(path).read_bytes()
+                if b"\x00" in data:
+                    raise ValueError("refusing to read binary file (NUL byte found)")
+                text = data.decode("utf-8", errors="replace")
+            except Exception as exc:
+                skipped.append(f"{path}: {exc}")
+                continue
+
+            for p_idx, para in enumerate(split_paragraphs(text)):
+                tokens = tokenize_rag_text(para)
+                if not tokens:
+                    continue
+                paragraphs.append(_Paragraph(path=path, paragraph_index=p_idx, text=para))
+                paragraph_tokens.append(tokens)
+
+        if not paragraphs:
+            self._index_key = key
+            self._paragraphs = []
+            self._paragraph_tokens = []
+            self._source_meta = source_meta
+            self._bm25 = None
+            self._last_build_ms = int((time.perf_counter() - t0) * 1000)
+            if debug:
+                dbg.append(
+                    f"bm25: index empty (sources={len(source_meta)} paragraphs=0 build_ms={self._last_build_ms})"
+                )
+                if skipped:
+                    dbg.append("bm25: skipped (read errors):")
+                    dbg.extend([f" - {s}" for s in skipped[:20]])
+            return dbg
+
+        bm25 = bm25_cls(paragraph_tokens)
+
+        self._index_key = key
+        self._paragraphs = paragraphs
+        self._paragraph_tokens = paragraph_tokens
+        self._source_meta = source_meta
+        self._bm25 = bm25
+        self._last_build_ms = int((time.perf_counter() - t0) * 1000)
+
+        if debug:
+            dbg.append(
+                f"bm25: index built (sources={len(source_meta)} paragraphs={len(paragraphs)} build_ms={self._last_build_ms})"
+            )
+            if skipped:
+                dbg.append("bm25: skipped (read errors):")
+                dbg.extend([f" - {s}" for s in skipped[:20]])
+
+        return dbg
+
+    def build_retrieved_excerpts_message(
+        self,
+        *,
+        question: str,
+        sources: list[dict[str, Any]],
+        config: Bm25RagConfig,
+    ) -> tuple[str | None, list[str]]:
+        cfg = config.sanitized()
+        if not cfg.enabled:
+            return None, []
+
+        terms = tokenize_query_terms(question, max_terms=cfg.max_terms)
+        if not terms:
+            return None, []
+
+        debug_lines: list[str] = []
+        debug_lines.extend(self.ensure_index(sources=sources, debug=cfg.debug))
+        if self._bm25 is None or not self._paragraphs:
+            return None, debug_lines
+
+        quote_task = _looks_like_quote_task(question)
+
+        try:
+            scores_raw = self._bm25.get_scores(terms)
+        except Exception as exc:
+            if cfg.debug:
+                debug_lines.append(f"bm25: scoring failed ({exc}); falling back")
+            return None, debug_lines
+
+        try:
+            scores = list(scores_raw)
+        except Exception:
+            scores = [scores_raw[i] for i in range(len(self._paragraphs))]
+
+        scored: list[tuple[float, int]] = []
+        for i, s in enumerate(scores[: len(self._paragraphs)]):
+            try:
+                f = float(s)
+            except Exception:
+                continue
+            if f <= 0:
+                continue
+            scored.append((f, i))
+
+        if not scored:
+            if cfg.debug:
+                debug_lines.append(f"bm25: terms={terms!r}")
+                debug_lines.append("bm25: no positive-scoring paragraphs")
+            return None, debug_lines
+
+        scored.sort(key=lambda x: (-x[0], x[1]))
+        top_para = scored[: cfg.k_paragraphs]
+
+        by_path: dict[str, list[tuple[float, int]]] = {}
+        for score, pid in top_para:
+            path = self._paragraphs[pid].path
+            by_path.setdefault(path, []).append((score, pid))
+
+        source_scored: list[tuple[float, str]] = []
+        for path, items in by_path.items():
+            agg = float(sum(score for score, _ in items))
+            source_scored.append((agg, path))
+        source_scored.sort(key=lambda x: (-x[0], x[1]))
+
+        selected_sources = source_scored[: cfg.k_sources]
+        if not selected_sources:
+            return None, debug_lines
+
+        if cfg.debug:
+            debug_lines.append(f"bm25: terms={terms!r}")
+            debug_lines.append(
+                f"bm25: selected_sources={len(selected_sources)} from_paths={len(by_path)} top_paragraphs={len(top_para)}"
+            )
+
+        total_remaining = int(cfg.total_chars)
+        blocks: list[str] = [
+            "Retrieved excerpts (hints for where to look - verify against your full memory of the documents):",
+            "",
+        ]
+
+        included = 0
+        for agg, path in selected_sources:
+            if total_remaining <= 0:
+                break
+
+            per_remaining = min(int(cfg.per_source_chars), total_remaining)
+            if per_remaining <= 0:
+                break
+
+            items = by_path.get(path, [])
+            items.sort(key=lambda x: (-x[0], x[1]))
+            items = items[: cfg.max_paragraphs_per_source]
+            items.sort(key=lambda x: self._paragraphs[x[1]].paragraph_index)
+
+            parts: list[str] = []
+            used = 0
+            for score, pid in items:
+                para = self._paragraphs[pid].text.strip()
+                if not para:
+                    continue
+
+                sep = "\n\n" if parts else ""
+                avail = per_remaining - used - len(sep)
+                if avail <= 0:
+                    break
+
+                clip_limit = min(int(cfg.max_paragraph_chars), avail)
+                if quote_task:
+                    clipped = _select_sentence_snippet(para, terms=terms, max_chars=clip_limit)
+                else:
+                    clipped = _truncate_text(para, clip_limit)
+                if not clipped:
+                    break
+
+                parts.append(sep + clipped)
+                used += len(sep) + len(clipped)
+                if used >= per_remaining:
+                    break
+
+            excerpt = "".join(parts).strip()
+            if not excerpt:
+                continue
+
+            included += 1
+            total_remaining -= used
+
+            meta = self._source_meta.get(path, {"path": path})
+            attrs: list[str] = []
+            attrs.append(f"path={json.dumps(path, ensure_ascii=False)}")
+
+            title = meta.get("title")
+            if isinstance(title, str) and title.strip():
+                attrs.append(f"title={json.dumps(title.strip(), ensure_ascii=False)}")
+            src = meta.get("source")
+            if isinstance(src, str) and src.strip():
+                attrs.append(f"source={json.dumps(src.strip(), ensure_ascii=False)}")
+            url = meta.get("url")
+            if isinstance(url, str) and url.strip():
+                attrs.append(f"url={json.dumps(url.strip(), ensure_ascii=False)}")
+
+            blocks.append(f"[SOURCE {' '.join(attrs)}]")
+            blocks.append(excerpt)
+            blocks.append("[/SOURCE]")
+            blocks.append("")
+
+            if cfg.debug:
+                top_scores = [f"{score:.3f}" for score, _ in sorted(items, reverse=True)[:3]]
+                debug_lines.append(
+                    f"bm25: + {path} agg={agg:.3f} paras={len(items)} chars={used} top_scores={top_scores}"
+                )
+
+        if included == 0:
+            return None, debug_lines
+
+        msg = "\n".join(blocks).rstrip() + "\n"
+        return msg, debug_lines
+
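The added module builds (or reuses) a paragraph-level BM25 index over the listed sources, scores paragraphs against the query terms, aggregates scores per source, and packs the top excerpts into a `[SOURCE ...]`-delimited message under per-source and total character budgets. A minimal usage sketch of how the public pieces fit together, assuming `rank-bm25` is installed; the `docs/readme.md` source entry is hypothetical, and the dict keys (`path`, optional `title`/`source`/`url`/`sha256`) follow the shapes the code above checks for:

from apps.cli.bm25_rag import Bm25RagConfig, Bm25RagRetriever

retriever = Bm25RagRetriever()
if retriever.is_available():  # True only if rank_bm25.BM25Okapi imports cleanly
    sources = [{"path": "docs/readme.md", "title": "Readme"}]  # hypothetical path
    msg, debug = retriever.build_retrieved_excerpts_message(
        question="Quote the exact sentence about the retry policy.",
        sources=sources,
        config=Bm25RagConfig(k_sources=3, debug=True),
    )
    if msg:
        print(msg)  # "Retrieved excerpts ..." header plus [SOURCE ...] blocks
    for line in debug:
        print(line)  # "bm25: ..." diagnostics, emitted because debug=True
    print(retriever.last_build_stats())  # {"sources": ..., "paragraphs": ..., "build_ms": ...}

Per the code above, the call degrades gracefully: if `rank-bm25` is missing, no paragraph scores above zero, or scoring raises, the first element of the returned tuple is None and only the debug lines (when debug=True) report why.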