peeky-search 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/dist/chunk-F3PNR32Z.js +227 -0
- package/dist/chunk-S3WZDJCP.js +2716 -0
- package/dist/cli.js +357 -0
- package/dist/docker-IWGZDSIP.js +20 -0
- package/dist/index.js +179 -0
- package/dist/mcp/server.js +67 -0
- package/package.json +39 -0
|
@@ -0,0 +1,2716 @@
|
|
|
1
|
+
// src/preprocessing/tokenize.ts
|
|
2
|
+
import { stemmer } from "stemmer";
|
|
3
|
+
/**
 * English function words that `tokenize` discards when stop-word removal is
 * enabled. Lookups happen after text has been lower-cased, so every entry is
 * lower-case.
 */
var STOP_WORDS = /* @__PURE__ */ new Set([
  // Articles, conjunctions, prepositions and common auxiliaries
  "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
  "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
  "to", "was", "were", "will", "with",
  // Demonstratives and question words
  "this", "but", "they", "have", "had", "what", "when", "where", "who",
  "which", "why", "how",
  // Quantifiers
  "all", "each", "every", "both", "few", "more", "most", "other", "some",
  "such",
  // Negation and degree
  "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
  // Modals and connectives
  "can", "just", "should", "now", "or", "if", "then", "else",
  // Forms of "be" / "do" and further modals
  "been", "being", "do", "does", "did", "doing", "would", "could", "might",
  "must", "shall", "may",
  // Positional words
  "about", "above", "after", "again", "against", "below", "between", "into",
  "through", "during", "before", "under", "over", "out", "up", "down", "off",
  "once", "here", "there", "any",
  // Pronouns
  "your", "you", "we", "our", "us", "i", "me", "my", "myself", "him", "her",
  "them", "their", "his", "she", "itself"
]);
|
|
123
|
+
/**
 * Inserts a space at camelCase / PascalCase boundaries so identifiers break
 * into separate words. Only ASCII letters are considered.
 *
 * Two boundaries are recognised:
 *  - a lower-case letter followed by an upper-case one ("fooBar" -> "foo Bar")
 *  - a run of capitals followed by a capitalised word ("HTMLParser" -> "HTML Parser")
 *
 * @param {string} text - Text whose original casing is still intact.
 * @returns {string} The text with spaces inserted at detected boundaries.
 */
function splitCamelCase(text) {
  const lowerThenUpper = /([a-z])([A-Z])/g;
  const acronymThenWord = /([A-Z]+)([A-Z][a-z])/g;
  const afterFirstPass = text.replace(lowerThenUpper, "$1 $2");
  return afterFirstPass.replace(acronymThenWord, "$1 $2");
}
|
|
126
|
+
/**
 * Canonicalises text for tokenisation: camelCase boundaries are split,
 * everything is lower-cased, every character that is not a Unicode letter,
 * Unicode number or whitespace becomes a space, and whitespace runs collapse
 * to a single space with the ends trimmed.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Lower-case words separated by single spaces.
 */
function normalizeText(text) {
  const lowered = splitCamelCase(text).toLowerCase();
  const wordCharsOnly = lowered.replace(/[^\p{L}\p{N}\s]/gu, " ");
  const singleSpaced = wordCharsOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
129
|
+
/**
 * Splits text into normalised tokens.
 *
 * The text is first passed through `normalizeText`, then split on
 * whitespace. Tokens shorter than `minLength` are dropped, stop words are
 * optionally removed, and the remaining tokens are optionally stemmed.
 *
 * @param {string} text - Raw input text.
 * @param {object} [options]
 * @param {boolean} [options.removeStopWords=true] - Drop tokens found in `STOP_WORDS`.
 * @param {boolean} [options.applyStemming=true] - Run each token through `stemmer`.
 * @param {number} [options.minLength=2] - Minimum token length to keep.
 * @returns {string[]} Tokens in document order; never contains empty strings.
 */
function tokenize(text, options = {}) {
  // `?? {}` keeps an explicit `null` options argument from crashing the destructuring.
  const {
    removeStopWords = true,
    applyStemming = true,
    minLength = 2
  } = options ?? {};
  const normalized = normalizeText(text);
  // Splitting "" yields [""]; reject empty tokens explicitly so a
  // `minLength` of 0 cannot let an empty-string token through.
  let tokens = normalized.split(/\s+/).filter((t) => t.length > 0 && t.length >= minLength);
  if (removeStopWords) {
    tokens = tokens.filter((t) => !STOP_WORDS.has(t));
  }
  if (applyStemming) {
    // Wrap the call so `map`'s index/array arguments are not forwarded.
    tokens = tokens.map((t) => stemmer(t));
  }
  return tokens;
}
|
|
145
|
+
/**
 * Counts how many times each token occurs.
 *
 * The result is a null-prototype dictionary. With a plain `{}` a token such
 * as "constructor" or "toString" would read the member inherited from
 * `Object.prototype` instead of `undefined`, and its count would become a
 * garbage string rather than a number.
 *
 * @param {Iterable<string>} tokens - Tokens to count.
 * @returns {Record<string, number>} Map from token to occurrence count.
 */
function buildTermFrequencyMap(tokens) {
  const tf = Object.create(null);
  for (const token of tokens) {
    tf[token] = (tf[token] ?? 0) + 1;
  }
  return tf;
}
|
|
152
|
+
/**
 * Jaccard similarity of two token lists: the number of distinct tokens they
 * share divided by the number of distinct tokens in either.
 *
 * @param {Iterable<string>} tokensA
 * @param {Iterable<string>} tokensB
 * @returns {number} A value in [0, 1]; 0 when both inputs are empty.
 */
function jaccardSimilarity(tokensA, tokensB) {
  const distinctA = new Set(tokensA);
  const distinctB = new Set(tokensB);
  const shared = [...distinctA].filter((token) => distinctB.has(token)).length;
  const unionSize = distinctA.size + distinctB.size - shared;
  return unionSize === 0 ? 0 : shared / unionSize;
}
|
|
165
|
+
/**
 * Fraction of the entries in `tokensA` (duplicates counted) that also occur
 * in `tokensB`.
 *
 * @param {string[]} tokensA - Tokens whose coverage is measured.
 * @param {Iterable<string>} tokensB - Tokens to look up against.
 * @returns {number} A value in [0, 1]; 0 when `tokensA` is empty.
 */
function termOverlapRatio(tokensA, tokensB) {
  const total = tokensA.length;
  if (total === 0) return 0;
  const lookup = new Set(tokensB);
  let hits = 0;
  for (let i = 0; i < total; i += 1) {
    if (lookup.has(tokensA[i])) hits += 1;
  }
  return hits / total;
}
|
|
176
|
+
|
|
177
|
+
// src/output/excerpts.ts
|
|
178
|
+
/** Fallback settings used by `assembleExcerpts` for any option the caller omits. */
var DEFAULT_CONFIG = {
  // Maximum number of excerpts to return.
  maxExcerpts: 3,
  // Upper bound on the combined `charCount` of the returned excerpts.
  charBudget: 2000,
  // Chunks with a smaller `charCount` than this are never selected.
  minExcerptChars: 50
};
|
|
183
|
+
/**
 * Projects a chunk onto the excerpt shape, keeping only `text`,
 * `headingPath`, `score` and `charCount`. `headingPath` is passed through by
 * reference, not copied.
 *
 * @param {object} chunk - Chunk to convert.
 * @returns {{text: *, headingPath: *, score: *, charCount: *}}
 */
function chunkToExcerpt(chunk) {
  const { text, headingPath, score, charCount } = chunk;
  return { text, headingPath, score, charCount };
}
|
|
191
|
+
/**
 * Picks the best chunks that fit the configured limits.
 *
 * Chunks are ranked by descending `score`, ties broken by ascending
 * `anchorIndex`; the input array is not reordered. Walking the ranking, a
 * chunk is skipped when it is shorter than `minExcerptChars` or when adding
 * it would exceed `charBudget` (later, smaller chunks may still fit).
 * Selection stops once `maxExcerpts` excerpts have been collected.
 *
 * @param {object[]} chunks - Scored chunks (`score`, `anchorIndex`, `charCount`, ...).
 * @param {*} query - Echoed back unchanged in the result.
 * @param {object} [config] - Overrides for the `DEFAULT_CONFIG` values.
 * @returns {{excerpts: object[], totalChars: number, query: *}}
 */
function assembleExcerpts(chunks, query, config = {}) {
  const {
    maxExcerpts = DEFAULT_CONFIG.maxExcerpts,
    charBudget = DEFAULT_CONFIG.charBudget,
    minExcerptChars = DEFAULT_CONFIG.minExcerptChars
  } = config;
  const ranked = Array.from(chunks).sort((left, right) => {
    const byScore = right.score - left.score;
    return byScore !== 0 ? byScore : left.anchorIndex - right.anchorIndex;
  });
  const excerpts = [];
  let totalChars = 0;
  for (const candidate of ranked) {
    const size = candidate.charCount;
    const tooShort = size < minExcerptChars;
    const overBudget = totalChars + size > charBudget;
    if (tooShort || overBudget) continue;
    if (excerpts.length >= maxExcerpts) break;
    excerpts.push(chunkToExcerpt(candidate));
    totalChars += size;
  }
  return { excerpts, totalChars, query };
}
|
|
223
|
+
/**
 * Renders an excerpt result as plain text.
 *
 * Output starts with the query and total character count, followed by one
 * section per excerpt: a banner with its 1-based position, score (three
 * decimals) and size, an optional "[a > b]" heading trail, the excerpt text
 * and a blank line. Undefined entries are skipped but still consume a
 * position number.
 *
 * @param {{query: *, totalChars: *, excerpts: object[]}} result
 * @returns {string} Newline-joined report.
 */
function formatExcerpts(result) {
  const out = [
    `Query: "${result.query}"`,
    `Total characters: ${result.totalChars}`,
    ""
  ];
  result.excerpts.forEach((excerpt, position) => {
    if (excerpt === undefined) return;
    const { score, charCount, headingPath, text } = excerpt;
    out.push(`--- Excerpt ${position + 1} (score: ${score.toFixed(3)}, ${charCount} chars) ---`);
    if (headingPath.length > 0) {
      out.push(`[${headingPath.join(" > ")}]`);
    }
    out.push(text, "");
  });
  return out.join("\n");
}
|
|
240
|
+
|
|
241
|
+
// src/utils/logger.ts
|
|
242
|
+
/**
 * Shared logger with optional timing collection.
 *
 * `log` writes to stdout via `console.log`; `error`, `debug` and the timing
 * summary write to stderr via `console.error`. Timings are only collected
 * while timing is enabled.
 */
var Logger = class _Logger {
  /** Lazily created instance handed out by `getInstance()`. */
  static instance;
  /** Recorded `{ label, durationMs }` entries, in recording order. */
  timings = [];
  /** When false, `time`, `timeAsync` and `recordTiming` record nothing. */
  timingEnabled = false;
  constructor() {
  }
  /**
   * Return the shared instance, creating it on first use
   */
  static getInstance() {
    if (!_Logger.instance) {
      _Logger.instance = new _Logger();
    }
    return _Logger.instance;
  }
  /**
   * Write an INFO line, prefixed with an ISO timestamp, to stdout
   */
  log(message) {
    console.log(`[${(/* @__PURE__ */ new Date()).toISOString()}] [INFO] ${message}`);
  }
  /**
   * Write an ERROR line, prefixed with an ISO timestamp, to stderr
   */
  error(message) {
    console.error(`[${(/* @__PURE__ */ new Date()).toISOString()}] [ERROR] ${message}`);
  }
  /**
   * Write a DEBUG line to stderr unless `enabled` is falsy
   */
  debug(message, enabled = true) {
    if (enabled) {
      console.error(`[${(/* @__PURE__ */ new Date()).toISOString()}] [DEBUG] ${message}`);
    }
  }
  /**
   * Enable or disable timing collection. Enabling also discards any
   * previously recorded timings; disabling keeps them.
   */
  setTimingEnabled(enabled) {
    this.timingEnabled = enabled;
    if (enabled) {
      this.timings = [];
    }
  }
  /**
   * Time a synchronous function and record the result. When timing is
   * disabled the function is simply called. If `fn` throws, nothing is
   * recorded and the error propagates.
   */
  time(label, fn) {
    if (!this.timingEnabled) {
      return fn();
    }
    const start = performance.now();
    const result = fn();
    const durationMs = performance.now() - start;
    this.timings.push({ label, durationMs });
    return result;
  }
  /**
   * Time an async function and record the result. When timing is disabled
   * the function is simply called. If `fn` rejects, nothing is recorded and
   * the rejection propagates.
   */
  async timeAsync(label, fn) {
    if (!this.timingEnabled) {
      return fn();
    }
    const start = performance.now();
    const result = await fn();
    const durationMs = performance.now() - start;
    this.timings.push({ label, durationMs });
    return result;
  }
  /**
   * Get a shallow copy of all recorded timings
   */
  getTimings() {
    return [...this.timings];
  }
  /**
   * Clear recorded timings
   */
  clearTimings() {
    this.timings = [];
  }
  /**
   * Record a timing directly (useful for manual timing measurements).
   * Ignored while timing is disabled.
   */
  recordTiming(label, durationMs) {
    if (this.timingEnabled) {
      this.timings.push({ label, durationMs });
    }
  }
  /**
   * Print timing summary to stderr: one line per timing with its share of
   * the total, followed by the total.
   */
  printTimings() {
    if (this.timings.length === 0) {
      console.error("[TIMING] No timings recorded");
      return;
    }
    console.error("\n[TIMING] === Performance Summary ===");
    const total = this.timings.reduce((sum, t) => sum + t.durationMs, 0);
    for (const timing of this.timings) {
      // A zero total (every duration is 0) would otherwise print "NaN%".
      const share = total > 0 ? timing.durationMs / total * 100 : 0;
      const pct = share.toFixed(1);
      console.error(`[TIMING] ${timing.label.padEnd(30)} ${timing.durationMs.toFixed(2).padStart(8)}ms (${pct.padStart(5)}%)`);
    }
    console.error(`[TIMING] ${"TOTAL".padEnd(30)} ${total.toFixed(2).padStart(8)}ms`);
    console.error("[TIMING] ================================\n");
  }
};
var logger_default = Logger;
|
|
339
|
+
|
|
340
|
+
// src/preprocessing/strip.ts
|
|
341
|
+
import * as cheerio from "cheerio";
|
|
342
|
+
/**
 * Tells whether a parsed DOM node is an element, i.e. its `type` is "tag".
 *
 * @param {{type: string}} node - Node from the parsed document.
 * @returns {boolean}
 */
function isElement(node) {
  const { type } = node;
  return type === "tag";
}
|
|
345
|
+
/** Tag selectors that `stripHtml` removes from the document unconditionally. */
var REMOVE_ELEMENTS = [
  // Scripts, styles, media and embedded objects
  "script", "style", "link", "img", "iframe", "video", "audio", "object",
  "embed", "param", "applet", "noscript", "svg", "canvas", "map", "area",
  // Interactive elements that are never content
  "button", "input", "select", "textarea", "form", "label", "fieldset",
  "legend", "datalist", "output", "progress", "meter"
];
/** Structural tags treated as page chrome rather than content. */
var BOILERPLATE_ELEMENTS = ["nav", "footer", "aside", "header"];
|
|
382
|
+
/**
 * Patterns tested against an element's "id class" string by
 * `isBoilerplateElement`; a single match marks the element as boilerplate.
 *
 * The advertising entries are deliberately delimited. The previous
 * `/ads?[-_]?/i` reduced to "contains the letters ad", which also matched
 * unrelated names such as "loading", "readme", "thread", "shadow" or
 * "download".
 */
var BOILERPLATE_PATTERNS = [
  /nav(igation)?/i,
  /footer/i,
  /header/i,
  /sidebar/i,
  /menu/i,
  /breadcrumb/i,
  /cookie/i,
  /consent/i,
  /banner/i,
  /advert(isement)?/i,
  // "ad" / "ads" (and a few common compounds) as a whole delimited token:
  // "ad", "sidebar-ad", "ad_slot", "top-ads", "adsense", "adunit".
  /(?:^|[^a-z])ad(?:s|sense|slot|unit|box)?(?:[^a-z]|$)/i,
  // "Ad" / "Ads" as a camelCase segment: "AdUnit", "googleAds".
  // Case-sensitive on purpose so "Adapter" or "Address" do not match.
  /(?:^|[^A-Za-z]|[a-z])Ads?(?=[A-Z]|[^A-Za-z]|$)/,
  /social/i,
  /share/i,
  /comment/i,
  /related/i,
  /recommend/i,
  /popup/i,
  /modal/i,
  /newsletter/i,
  /subscribe/i,
  /signup/i,
  /login/i,
  /signin/i,
  /search/i,
  /widget/i,
  /toolbar/i,
  /promo/i
];
|
|
411
|
+
/**
 * Patterns tested against an element's "id class" string by `isUIElement`
 * to spot in-content UI controls.
 */
var UI_ELEMENT_PATTERNS = [
  // Copy / share / generic action controls
  /copy[-_]?(button|link|code|markdown)?/i,
  /share[-_]?(button|link|menu)?/i,
  /action[-_]?(bar|menu|buttons?)/i,
  /btn[-_]?(copy|share|action)/i,
  /clipboard/i,
  // "Open in ..." and assistant widgets
  /open[-_]?in/i,
  /chat[-_]?(gpt|button)/i,
  /ai[-_]?(chat|assistant)/i,
  // Page-level actions
  /feedback/i,
  /edit[-_]?(page|this|on[-_]?github)/i,
  /page[-_]?actions?/i,
  // Tables of contents and sticky / floating chrome
  /toc|table[-_]?of[-_]?contents/i,
  /on[-_]?this[-_]?page/i,
  /sticky[-_]?(nav|sidebar|toc)/i,
  /floating[-_]?(menu|button|action)/i
];
/**
 * Patterns tested by `isUIText` against whitespace-collapsed, trimmed element
 * text to recognise button or link labels.
 */
var UI_TEXT_PATTERNS = [
  /^copy\s*(as\s*)?(markdown|code|text)?$/i,
  // "Open in X" for any service
  /^open\s+in\s+\w+/i,
  /^(share|copy)\s*(this|link|page)?$/i,
  /^edit\s*(this\s*)?(page|on\s*github)?$/i,
  /^(give\s*)?feedback$/i,
  /^(scroll\s*to\s*)?top$/i,
  /^on\s+this\s+page$/i,
  /^table\s+of\s+contents$/i
];
|
|
439
|
+
/**
 * Checks whether an element's `id` or `class` attribute matches any entry of
 * `BOILERPLATE_PATTERNS`. Non-element nodes are never boilerplate.
 *
 * @param {{type: string, attribs?: {id?: string, class?: string}}} el
 * @returns {boolean}
 */
function isBoilerplateElement(el) {
  if (el.type !== "tag") return false;
  const attrs = el.attribs;
  // id and class are matched as one space-separated string.
  const haystack = `${attrs?.id ?? ""} ${attrs?.class ?? ""}`;
  for (const pattern of BOILERPLATE_PATTERNS) {
    if (pattern.test(haystack)) return true;
  }
  return false;
}
|
|
446
|
+
/**
 * Checks whether an element's `id` or `class` attribute matches any entry of
 * `UI_ELEMENT_PATTERNS`. Non-element nodes never match.
 *
 * @param {{type: string, attribs?: {id?: string, class?: string}}} el
 * @returns {boolean}
 */
function isUIElement(el) {
  if (el.type !== "tag") return false;
  const attrs = el.attribs;
  // id and class are matched as one space-separated string.
  const haystack = `${attrs?.id ?? ""} ${attrs?.class ?? ""}`;
  for (const pattern of UI_ELEMENT_PATTERNS) {
    if (pattern.test(haystack)) return true;
  }
  return false;
}
|
|
453
|
+
/**
 * Checks whether a piece of text reads like a UI label, i.e. matches any
 * entry of `UI_TEXT_PATTERNS` once trimmed and whitespace-collapsed.
 *
 * @param {string} text - Raw element text.
 * @returns {boolean}
 */
function isUIText(text) {
  const label = text.trim().replace(/\s+/g, " ");
  for (const pattern of UI_TEXT_PATTERNS) {
    if (pattern.test(label)) return true;
  }
  return false;
}
|
|
457
|
+
/**
 * Parses an HTML string and deletes every element matched by a
 * `REMOVE_ELEMENTS` selector.
 *
 * @param {string} html - Raw HTML document or fragment.
 * @returns {*} The loaded cheerio instance for the stripped document.
 */
function stripHtml(html) {
  const $ = cheerio.load(html);
  REMOVE_ELEMENTS.forEach((tagSelector) => {
    $(tagSelector).remove();
  });
  return $;
}
|
|
464
|
+
/**
 * Removes page chrome from the document in place while shielding the main
 * content.
 *
 * 1. `main`, `article` and `[role='main']` elements, plus all their
 *    descendants, are tagged with a temporary `data-peeky-protect` attribute.
 * 2. Every `BOILERPLATE_ELEMENTS` tag that is neither tagged nor an ancestor
 *    of a tagged element is removed.
 * 3. Every remaining element in the same situation whose id/class looks like
 *    boilerplate (`isBoilerplateElement`) is removed.
 * 4. The temporary attribute is stripped again.
 *
 * @param {*} $ - Loaded cheerio instance; mutated in place.
 * @returns {void}
 */
function removeBoilerplate($) {
  const PROTECT_ATTR = "data-peeky-protect";
  const PROTECT_SELECTOR = "[data-peeky-protect]";
  // True when the element is tagged itself or wraps something that is.
  const isShielded = ($el) => Boolean($el.attr(PROTECT_ATTR)) || $el.find(PROTECT_SELECTOR).length > 0;
  const $contentRoots = $("main, article, [role='main']");
  $contentRoots.attr(PROTECT_ATTR, "true");
  $contentRoots.find("*").attr(PROTECT_ATTR, "true");
  BOILERPLATE_ELEMENTS.forEach((tagSelector) => {
    $(tagSelector).each((_, node) => {
      const $node = $(node);
      if (!isShielded($node)) {
        $node.remove();
      }
    });
  });
  $("*").each((_, node) => {
    if (!isElement(node)) return;
    const $node = $(node);
    if (isShielded($node)) return;
    if (isBoilerplateElement(node)) {
      $node.remove();
    }
  });
  $(PROTECT_SELECTOR).removeAttr(PROTECT_ATTR);
}
|
|
487
|
+
/**
 * Strips interactive and navigational clutter from inside a content
 * container, in four passes that run in this order:
 *
 * 1. descendants whose id/class matches `isUIElement`;
 * 2. `button`, `a`, `span` and `div` elements whose text is shorter than 50
 *    characters and reads like a UI label (`isUIText`);
 * 3. descendants matching a fixed list of attribute/class selectors, applied
 *    one selector at a time (so `:empty` sees the effect of earlier removals);
 * 4. `p`, `div` and `span` elements with at least three links whose link
 *    text makes up more than 80% of a text shorter than 200 characters.
 *
 * @param {*} $ - Loaded cheerio instance used to wrap nodes.
 * @param {*} container - Cheerio selection to clean; mutated in place.
 * @returns {void}
 */
function removeUIElements($, container) {
  // Pass 1: id/class based detection.
  container.find("*").each((_, node) => {
    if (isElement(node) && isUIElement(node)) {
      $(node).remove();
    }
  });
  // Pass 2: short label-like text.
  container.find("button, a, span, div").each((_, node) => {
    const $node = $(node);
    const label = $node.text();
    if (label.length < 50 && isUIText(label)) {
      $node.remove();
    }
  });
  // Pass 3: known selectors.
  const uiSelectors = [
    "[data-copy]",
    "[data-clipboard]",
    "[aria-label*='copy']",
    "[aria-label*='share']",
    "[title*='Copy']",
    "[title*='Share']",
    ".copy-button",
    ".share-button",
    "[class*='ActionMenu']",
    "[class*='PageActions']",
    // Tab navigation and anchor links within content
    "[role='tablist']",
    "[role='tab']",
    ".tabs",
    ".tab-list",
    "[class*='TabList']",
    "[class*='Tabs']",
    // Anchor/jump links
    "[class*='anchor-link']",
    "[class*='heading-link']",
    // Empty anchor links
    "a[href^='#']:empty",
    // Sticky sidebars (often contain TOC and share links)
    "[class*='sticky']"
  ];
  uiSelectors.forEach((selector) => {
    container.find(selector).remove();
  });
  // Pass 4: short, link-dominated blocks.
  container.find("p, div, span").each((_, node) => {
    const $node = $(node);
    const $links = $node.find("a");
    if ($links.length < 3) return;
    const visibleText = $node.text().replace(/\s+/g, " ").trim();
    let linkChars = 0;
    $links.each((_2, link) => {
      linkChars += $(link).text().length;
    });
    if (linkChars > visibleText.length * 0.8 && visibleText.length < 200) {
      $node.remove();
    }
  });
}
|
|
545
|
+
/**
 * Locates the element most likely to hold the page's main content.
 *
 * Resolution order:
 *  1. any `<main>` element (score 100);
 *  2. any `<article>` element (score 90);
 *  3. otherwise the best-scoring candidate among the direct children of
 *     `<body>` (skipping boilerplate tags and boilerplate-looking id/class)
 *     and a fixed list of common content selectors.
 *
 * A candidate's score is its whitespace-collapsed text length minus twice
 * the text length of the links it contains, so link-heavy blocks rank low.
 * Ties are broken by comparing selector strings.
 *
 * @param {*} $ - Loaded cheerio instance.
 * @returns {{element: *, selector: string, score: number} | null}
 *   The chosen candidate, or null when there is none.
 */
function findMainContent($) {
  // Length of a selection's text once whitespace is collapsed and trimmed.
  const visibleLength = ($sel) => $sel.text().replace(/\s+/g, " ").trim().length;
  // Text length penalised by twice the amount of link text.
  const scoreContent = ($el) => {
    let linkTextLen = 0;
    $el.find("a").each((_, link) => {
      linkTextLen += visibleLength($(link));
    });
    return visibleLength($el) - 2 * linkTextLen;
  };
  // Builds a display selector such as "div#page.wide.dark" for a body child.
  const describe = (elem) => {
    let selector = elem.tagName;
    const id = elem.attribs?.id;
    if (id) {
      selector += `#${id}`;
    }
    // Split on any whitespace run so repeated spaces do not produce "..".
    const classNames = (elem.attribs?.class ?? "").split(/\s+/).filter(Boolean);
    if (classNames.length > 0) {
      selector += `.${classNames.join(".")}`;
    }
    return selector;
  };

  const mainEl = $("main");
  if (mainEl.length > 0) {
    return { element: mainEl, selector: "main", score: 100 };
  }
  const articleEl = $("article");
  if (articleEl.length > 0) {
    return { element: articleEl, selector: "article", score: 90 };
  }

  const candidates = [];
  $("body").children().each((_, elem) => {
    if (elem.type !== "tag") return;
    if (BOILERPLATE_ELEMENTS.includes(elem.tagName.toLowerCase())) return;
    if (isBoilerplateElement(elem)) return;
    const $el = $(elem);
    candidates.push({ element: $el, selector: describe(elem), score: scoreContent($el) });
  });
  const contentSelectors = [
    "[role='main']",
    "#content",
    "#main-content",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content"
  ];
  for (const sel of contentSelectors) {
    const $el = $(sel);
    if ($el.length > 0) {
      candidates.push({ element: $el, selector: sel, score: scoreContent($el) });
    }
  }
  if (candidates.length === 0) {
    return null;
  }
  candidates.sort((a, b) => {
    const d = b.score - a.score;
    if (d !== 0) return d;
    return a.selector.localeCompare(b.selector);
  });
  return candidates[0];
}
|
|
616
|
+
/**
 * Runs the HTML clean-up pipeline: strip unwanted tags, remove boilerplate,
 * locate the main content and, if found, remove UI clutter from it.
 *
 * @param {string} html - Raw HTML.
 * @returns {{$: *, mainContent: *, selector: (string|null)}}
 *   The loaded document plus the main-content selection and its selector;
 *   both are null when no main content was found.
 */
function preprocessHtml(html) {
  const $ = stripHtml(html);
  removeBoilerplate($);
  const best = findMainContent($);
  if (best === null) {
    return { $, mainContent: null, selector: null };
  }
  const { element, selector } = best;
  removeUIElements($, element);
  return { $, mainContent: element, selector };
}
|
|
630
|
+
|
|
631
|
+
// src/preprocessing/segment.ts
|
|
632
|
+
/**
 * Tells whether a lower-case tag name is one of the block types that become
 * text blocks: headings h1-h6, `p`, `li` and `pre`.
 *
 * @param {string} tagName - Lower-case tag name.
 * @returns {boolean}
 */
function isBlockTag(tagName) {
  switch (tagName) {
    case "p":
    case "li":
    case "pre":
      return true;
    default:
      return /^h[1-6]$/.test(tagName);
  }
}
|
|
635
|
+
/**
 * Extracts the text of a `<pre>` block, trying to keep its line structure.
 *
 * Strategy, first match wins:
 *  1. If the block contains per-line wrapper elements (`.line`,
 *     `.code-line` or any class containing "line"), the trimmed, non-empty
 *     text of each wrapper is joined with newlines.
 *  2. If the markup contains `<br>` tags, they are turned into newlines and
 *     the result is re-parsed to obtain its text.
 *  3. Otherwise the block's plain text is used.
 * The result is trimmed in every case.
 *
 * @param {*} $ - Loaded cheerio instance (used to wrap nodes and to re-parse).
 * @param {*} $pre - Cheerio selection for the `<pre>` element.
 * @returns {string}
 */
function extractCodeText($, $pre) {
  const $lineWrappers = $pre.find(".line, .code-line, [class*='line']");
  if ($lineWrappers.length > 0) {
    const collected = [];
    $lineWrappers.each((_, lineEl) => {
      const content = $(lineEl).text().trim();
      if (content.length > 0) {
        collected.push(content);
      }
    });
    return collected.join("\n").trim();
  }
  const markup = $pre.html() ?? "";
  if (!markup.includes("<br")) {
    return ($pre.text() ?? "").trim();
  }
  const withNewlines = markup.replace(/<br\s*\/?>/gi, "\n");
  const $fragment = $.load(`<div>${withNewlines}</div>`);
  return $fragment("div").text().trim();
}
|
|
656
|
+
/**
 * Depth-first traversal that turns block-level elements into text blocks.
 *
 * Block elements (see `isBlockTag`) are emitted as
 * `{ type, text, index, headingPath }` and are not descended into; any other
 * element is treated as a container and its element children are visited in
 * order. Blocks with no text are dropped without consuming an index.
 *
 * `path` is the running stack of heading texts and is shared across the
 * whole traversal. A heading of level N first truncates the stack to at most
 * N-1 entries, records that ancestor trail as its own `headingPath`, and
 * only then pushes its text. Truncating before recording keeps a heading
 * from listing a previous sibling section as its ancestor.
 *
 * @param {*} $ - Loaded cheerio instance.
 * @param {*} $node - Cheerio selection to visit.
 * @param {string[]} path - Current heading trail; mutated in place.
 * @param {object[]} blocks - Output array; blocks are appended in order.
 * @param {{current: number}} indexRef - Counter supplying block indices.
 * @param {boolean} skipNav - When true, `<nav>` subtrees are ignored.
 * @returns {void}
 */
function walk($, $node, path, blocks, indexRef, skipNav) {
  if (skipNav && $node.is("nav")) return;
  const rawTag = $node.prop("tagName");
  const tagName = typeof rawTag === "string" ? rawTag.toLowerCase() : "";
  if (tagName === "" || !isBlockTag(tagName)) {
    // Container: keep descending through element children.
    $node.children().each((_, child) => {
      if (child.type === "tag") {
        walk($, $(child), path, blocks, indexRef, skipNav);
      }
    });
    return;
  }
  const text = tagName === "pre"
    ? extractCodeText($, $node)
    : ($node.text() ?? "").replace(/\s+/g, " ").trim();
  if (text.length === 0) return;
  const headingMatch = /^h([1-6])$/.exec(tagName);
  if (headingMatch !== null) {
    // Close any section at the same or a deeper level before recording.
    const ancestorDepth = Number.parseInt(headingMatch[1], 10) - 1;
    if (ancestorDepth < path.length) {
      path.length = ancestorDepth;
    }
  }
  blocks.push({
    type: tagName,
    text,
    index: indexRef.current++,
    headingPath: [...path]
  });
  if (headingMatch !== null) {
    path.push(text);
  }
}
|
|
692
|
+
/**
 * Collects the text blocks found under a container by running `walk` with a
 * fresh heading trail and a block counter starting at 0.
 *
 * @param {*} $ - Loaded cheerio instance.
 * @param {*} container - Cheerio selection to traverse.
 * @param {{skipNav?: boolean}} [options] - `skipNav` defaults to true.
 * @returns {object[]} Blocks in document order.
 */
function extractBlocks($, container, options = {}) {
  const { skipNav = true } = options;
  const collected = [];
  walk($, container, [], collected, { current: 0 }, skipNav);
  return collected;
}
|
|
700
|
+
/**
 * Abbreviations consulted by `splitIntoSentences`: a period that follows one
 * of these is not treated as a sentence end. Entries are lower-case and
 * omit the trailing period.
 */
var ABBREVIATIONS = /* @__PURE__ */ new Set([
  // Titles
  "mr", "mrs", "ms", "dr", "prof", "sr", "jr",
  // General and corporate
  "vs", "etc", "inc", "ltd",
  // Street names
  "st", "ave", "blvd", "rd",
  // Scholarly
  "e.g", "i.e", "cf", "al", "fig", "vol", "no"
]);
/**
 * Trailing UI residue that `cleanCodeBlock` strips from code text. The
 * patterns are applied once each, in this order.
 */
var CODE_BLOCK_CLEANUP_PATTERNS = [
  // TypeScript playground "Try" suffix
  /Try$/,
  // "Run" button text
  /Run$/,
  // Copy button text
  /Copy$/,
  // Same labels with surrounding whitespace
  /\s*Try\s*$/,
  /\s*Run\s*$/,
  /\s*Copy\s*$/,
  // Playground links
  /Open in Playground$/i,
  /Open in CodeSandbox$/i,
  /Open in StackBlitz$/i,
  /Edit on GitHub$/i,
  /View on GitHub$/i
];
|
|
742
|
+
/**
 * Removes trailing button/link labels from code text by applying every
 * `CODE_BLOCK_CLEANUP_PATTERNS` entry once, in order, then trimming.
 *
 * @param {string} text - Text extracted from a code block.
 * @returns {string} The cleaned, trimmed text.
 */
function cleanCodeBlock(text) {
  const stripped = CODE_BLOCK_CLEANUP_PATTERNS.reduce(
    (remaining, pattern) => remaining.replace(pattern, ""),
    text
  );
  return stripped.trim();
}
|
|
749
|
+
/**
 * Splits text into sentences.
 *
 * Whitespace is collapsed first. A sentence ends at ".", "!" or "?" when it
 * is the last character, or when it is followed by a single space and an
 * upper-case ASCII letter, unless the word before a "." is a known
 * abbreviation (see `ABBREVIATIONS`). Any trailing text without terminal
 * punctuation becomes the final sentence.
 *
 * The abbreviation lookup considers both the word directly before the period
 * and the whole dotted token ending there. Without the second form,
 * multi-part entries such as "e.g" and "i.e" could never match, because the
 * word before the final period of "e.g." is just "g".
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed, non-empty sentences in order.
 */
function splitIntoSentences(text) {
  if (text.length === 0) return [];
  const normalized = text.replace(/\s+/g, " ").trim();
  const sentences = [];
  let current = "";
  // Pushes the pending sentence (if any) and starts a new one.
  const flush = () => {
    const trimmed = current.trim();
    if (trimmed.length > 0) {
      sentences.push(trimmed);
    }
    current = "";
  };
  // True when `current` ends with "<abbreviation>.".
  const endsWithAbbreviation = () => {
    const lastWord = current.match(/(\w+)\.$/)?.[1]?.toLowerCase() ?? "";
    const dottedToken = current.match(/([\w.]+)\.$/)?.[1]?.toLowerCase() ?? "";
    return ABBREVIATIONS.has(lastWord) || ABBREVIATIONS.has(dottedToken);
  };
  let i = 0;
  while (i < normalized.length) {
    const char = normalized[i];
    current += char;
    if (char === "." || char === "!" || char === "?") {
      const nextChar = normalized[i + 1];
      const afterNext = normalized[i + 2];
      if (nextChar === void 0) {
        flush();
      } else if (nextChar === " " && afterNext !== void 0 && /[A-Z]/.test(afterNext)) {
        if (!endsWithAbbreviation()) {
          flush();
          // Skip the separating space so the next sentence starts on a letter.
          i++;
        }
      }
    }
    i++;
  }
  flush();
  return sentences;
}
|
|
788
|
+
/**
 * Turns text blocks into tokenised sentence records.
 *
 * Headings are kept as a single sentence, code blocks (`pre`) are cleaned
 * with `cleanCodeBlock` and kept as a single sentence, and every other block
 * is split with `splitIntoSentences`. Empty sentences are skipped.
 *
 * Each record carries the sentence text and tokens, the owning block's index
 * and type, the sentence's index within its block, a running `globalIndex`
 * across all emitted sentences, the block's `headingPath`, and `position`:
 * the block index divided by (block count - 1), or 0 for a single block.
 *
 * @param {object[]} blocks - Blocks with `type`, `text`, `index`, `headingPath`.
 * @param {{tokenizeOptions?: object}} [options] - Options forwarded to `tokenize`.
 * @returns {object[]} Sentence records in document order.
 */
function segmentBlocks(blocks, options = {}) {
  const { tokenizeOptions } = options;
  const HEADING_TAG = /^h[1-6]$/;
  const lastBlockOffset = blocks.length - 1;
  const sentences = [];
  for (const block of blocks) {
    const { type, index, headingPath } = block;
    let pieces;
    if (type === "pre") {
      pieces = [cleanCodeBlock(block.text)];
    } else if (HEADING_TAG.test(type)) {
      pieces = [block.text];
    } else {
      pieces = splitIntoSentences(block.text);
    }
    const position = lastBlockOffset > 0 ? index / lastBlockOffset : 0;
    pieces.forEach((text, sentenceIndex) => {
      if (text === undefined || text.length === 0) return;
      sentences.push({
        text,
        tokens: tokenize(text, tokenizeOptions),
        blockIndex: index,
        sentenceIndex,
        // Equals the number of sentences emitted so far.
        globalIndex: sentences.length,
        headingPath,
        position,
        blockType: type
      });
    });
  }
  return sentences;
}
|
|
817
|
+
/**
 * Extracts text blocks from a container and segments them into sentences.
 *
 * @param {*} $ - Loaded cheerio instance.
 * @param {*} container - Cheerio selection to process.
 * @param {{skipNav?: boolean, tokenizeOptions?: object}} [options]
 *   `skipNav` (default true) is forwarded to `extractBlocks`,
 *   `tokenizeOptions` to `segmentBlocks`.
 * @returns {{blocks: object[], sentences: object[]}}
 */
function segmentHtml($, container, options = {}) {
  const { skipNav = true, tokenizeOptions } = options;
  const blocks = extractBlocks($, container, { skipNav });
  return {
    blocks,
    sentences: segmentBlocks(blocks, { tokenizeOptions })
  };
}
|
|
823
|
+
|
|
824
|
+
// src/scoring/bm25.ts
|
|
825
|
+
// Defaults for the BM25 parameters read by `createBM25Scorer` when the
// caller's config omits them.
// k1 scales how much repeated occurrences of a term add to the score.
var DEFAULT_K1 = 1.2;
// b scales how strongly the document-length ratio affects the score.
var DEFAULT_B = 0.5;
|
|
827
|
+
/**
 * Computes corpus statistics, treating each sentence as one document.
 *
 * `docFrequency` is a null-prototype dictionary. With a plain `{}` a term
 * such as "constructor" would read the member inherited from
 * `Object.prototype` instead of `undefined`, and its count would become a
 * garbage string.
 *
 * @param {{tokens: string[]}[]} sentences - Tokenised sentences.
 * @returns {{totalDocs: number, avgDocLength: number, docFrequency: Record<string, number>}}
 *   `totalDocs` is the sentence count, `avgDocLength` the mean token count
 *   (0 for an empty corpus) and `docFrequency` the number of sentences
 *   containing each term.
 */
function computeDocumentStats(sentences) {
  const docFrequency = Object.create(null);
  let totalLength = 0;
  for (const sentence of sentences) {
    // A term counts once per sentence regardless of repetitions.
    const uniqueTerms = new Set(sentence.tokens);
    for (const term of uniqueTerms) {
      docFrequency[term] = (docFrequency[term] ?? 0) + 1;
    }
    totalLength += sentence.tokens.length;
  }
  return {
    totalDocs: sentences.length,
    avgDocLength: sentences.length > 0 ? totalLength / sentences.length : 0,
    docFrequency
  };
}
|
|
843
|
+
/**
 * Inverse document frequency of a term:
 * `ln((N - df + 0.5) / (df + 0.5) + 1)`, where N is `stats.totalDocs` and df
 * is the number of documents containing the term. The "+ 1" inside the
 * logarithm keeps the result positive.
 *
 * Only own entries of `stats.docFrequency` are read, so a term such as
 * "constructor" that is absent from the corpus counts as df = 0 instead of
 * picking up an inherited `Object.prototype` member and producing NaN.
 * Unseen terms need no special case because the formula already covers
 * df = 0.
 *
 * @param {string} term - Term to look up.
 * @param {{totalDocs: number, docFrequency: Record<string, number>}} stats
 * @returns {number} The IDF weight.
 */
function calculateIdf(term, stats) {
  const counts = stats.docFrequency;
  const df = (Object.hasOwn(counts, term) ? counts[term] : undefined) ?? 0;
  const N = stats.totalDocs;
  return Math.log((N - df + 0.5) / (df + 0.5) + 1);
}
|
|
851
|
+
/**
 * Builds a BM25 scorer over a set of tokenised sentences, each sentence
 * being one document.
 *
 * Per query term with term frequency tf > 0 in the document, the score adds
 * `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * docLength / avgDocLength))`.
 * Repeated query tokens contribute once per repetition.
 *
 * @param {{tokens: string[]}[]} sentences - Corpus used for the statistics.
 * @param {{k1?: number, b?: number}} [config] - BM25 parameters; default to
 *   `DEFAULT_K1` and `DEFAULT_B`.
 * @returns {{score: Function, scoreWithTf: Function, getIdf: Function, stats: object}}
 *   - `score(queryTokens, docTokens)` scores a token list;
 *   - `scoreWithTf(queryTokens, docTf, docLength)` scores a precomputed
 *     term-frequency map;
 *   - `getIdf(term)` returns the term's IDF for this corpus;
 *   - `stats` is the result of `computeDocumentStats`.
 */
function createBM25Scorer(sentences, config = {}) {
  const { k1 = DEFAULT_K1, b = DEFAULT_B } = config;
  const stats = computeDocumentStats(sentences);
  function getIdf(term) {
    return calculateIdf(term, stats);
  }
  function scoreWithTf(queryTokens, docTf, docLength) {
    let total = 0;
    for (const term of queryTokens) {
      // Read own entries only: on a plain-object map a query term such as
      // "constructor" would otherwise resolve to an inherited function and
      // turn the whole score into NaN.
      const tf = (Object.hasOwn(docTf, term) ? docTf[term] : undefined) ?? 0;
      if (tf === 0) continue;
      const idf = getIdf(term);
      const numerator = tf * (k1 + 1);
      const denominator = tf + k1 * (1 - b + b * (docLength / stats.avgDocLength));
      total += idf * (numerator / denominator);
    }
    return total;
  }
  function score(queryTokens, docTokens) {
    const docTf = buildTermFrequencyMap(docTokens);
    return scoreWithTf(queryTokens, docTf, docTokens.length);
  }
  return {
    score,
    scoreWithTf,
    getIdf,
    stats
  };
}
|
|
881
|
+
|
|
882
|
+
// src/scoring/heuristics.ts
|
|
883
|
+
/**
 * Default weight for each scoring heuristic, keyed by heuristic name. The
 * nine values add up to 1.0.
 * NOTE(review): the code that consumes these weights is outside this part of
 * the file; confirm how they are combined before retuning.
 */
var DEFAULT_WEIGHTS = {
  position: 0.05,
  headingProximity: 0.11,
  density: 0.09,
  structure: 0.11,
  proximity: 0.14,
  headingPath: 0.17,
  coverage: 0.16,
  outlier: 0.09,
  metaSection: 0.08
};
|
|
894
|
+
/**
 * Case-insensitive patterns describing section titles that are about the
 * page itself (introductions, summaries, link lists, legal notes, ...)
 * rather than its subject matter. Every pattern is anchored at the start of
 * the string; most are anchored at the end as well.
 * NOTE(review): presumably matched against heading text by the meta-section
 * heuristic; the consumer is outside this part of the file.
 */
var META_SECTION_PATTERNS = [
  // --- Introduction / overview ---
  /^introduction$/i,
  /^overview$/i,
  /^about(\s+this)?(\s+(article|guide|tutorial|post|page))?$/i,
  /^what('s|\s+is)\s+this/i,
  /^what\s+you('ll|.will)\s+(learn|cover|build|create)/i,
  /^what\s+we('ll|.will)\s+(learn|cover|build|create)/i,
  /^in\s+this\s+(article|guide|tutorial|post)/i,
  /^getting\s+started$/i,
  /^before\s+(you\s+)?begin/i,
  /^prerequisites?$/i,
  /^requirements?$/i,
  /^background$/i,
  /^context$/i,
  // --- Summary / conclusion ---
  /^summary$/i,
  /^conclusion$/i,
  /^tl;?\s*dr$/i,
  /^takeaways?$/i,
  /^key\s+takeaways?$/i,
  /^key\s+points?$/i,
  /^wrapping\s+up$/i,
  /^final\s+thoughts?$/i,
  /^closing\s+thoughts?$/i,
  /^in\s+summary$/i,
  /^to\s+summarize$/i,
  /^recap$/i,
  // --- Next steps / related content ---
  /^next\s+steps?$/i,
  /^what('s|.is)\s+next/i,
  /^further\s+reading$/i,
  /^additional\s+resources?$/i,
  /^related\s+(articles?|posts?|content|links?|resources?)/i,
  /^see\s+also$/i,
  /^learn\s+more$/i,
  /^more\s+resources?$/i,
  /^references?$/i,
  /^sources?$/i,
  /^bibliography$/i,
  /^credits?$/i,
  /^acknowledgements?$/i,
  // --- Author / meta content ---
  /^about\s+(the\s+)?author/i,
  /^author(\s+bio)?$/i,
  /^bio(graphy)?$/i,
  /^written\s+by$/i,
  /^posted\s+by$/i,
  /^published\s+by$/i,
  // --- Engagement / social ---
  /^comments?$/i,
  /^feedback$/i,
  /^discussion$/i,
  /^share(\s+this)?$/i,
  /^subscribe$/i,
  /^newsletter$/i,
  /^follow(\s+us)?$/i,
  /^connect(\s+with\s+us)?$/i,
  /^join(\s+us)?$/i,
  /^support(\s+us)?$/i,
  /^donate$/i,
  /^buy\s+me\s+a\s+coffee/i,
  /^sponsor/i,
  // --- Navigation / structural ---
  /^table\s+of\s+contents?$/i,
  /^contents?$/i,
  /^toc$/i,
  /^navigation$/i,
  /^menu$/i,
  /^sidebar$/i,
  /^footer$/i,
  /^header$/i,
  /^breadcrumb/i,
  /^skip\s+to/i,
  // --- Cross-reference sections (often just link lists) ---
  /^see\s+also$/i,
  /^more\s+info(rmation)?$/i,
  /^external\s+links?$/i,
  /^useful\s+links?$/i,
  /^quick\s+links?$/i,
  /^related\s+topics?$/i,
  /^related\s+guides?$/i,
  /^related\s+tutorials?$/i,
  /^related\s+documentation$/i,
  /^other\s+resources?$/i,
  /^specifications?$/i,
  /^browser\s+compatibility$/i,
  /^browser\s+support$/i,
  // --- Disclaimers / legal ---
  /^disclaimer$/i,
  /^disclosure$/i,
  /^affiliate\s+disclosure/i,
  /^privacy(\s+policy)?$/i,
  /^terms(\s+(of\s+)?(use|service))?$/i,
  /^copyright$/i,
  /^legal$/i,
  // --- Promotional ---
  /^featured$/i,
  /^trending$/i,
  /^popular(\s+posts?)?$/i,
  /^recommended$/i,
  /^you\s+might\s+(also\s+)?like$/i,
  /^related\s+posts?$/i,
  /^top\s+stories$/i,
  /^latest(\s+posts?)?$/i,
  /^recent(\s+posts?)?$/i,
  /^archives?$/i,
  /^categories$/i,
  /^tags?$/i
];
|
|
1004
|
+
/**
 * Scores a sentence from its `position` value using a piecewise-linear decay.
 *
 * For positions in [0, 1] the score runs from 1 at position 0 down to 0.7 at
 * 0.3, then to 0.5 at 0.7, and finally to 0.3 at 1.
 * NOTE(review): `position` is presumably the sentence's relative offset in
 * the document (0 = start, 1 = end) — confirm against the producer.
 *
 * @param {{ position: number }} sentence
 * @returns {number}
 */
function calculatePositionScore(sentence) {
  const { position } = sentence;
  const inOpening = position <= 0.3;
  const inMiddle = !inOpening && position <= 0.7;
  return inOpening
    ? 1 - position / 0.3 * 0.3
    : inMiddle
      ? 0.7 - (position - 0.3) / 0.4 * 0.2
      : 0.5 - (position - 0.7) / 0.3 * 0.2;
}
|
|
1014
|
+
/**
 * Scores a sentence by how close it is to the nearest preceding heading and
 * by how much that heading shares with the query.
 *
 * Walks backwards from `sentence.globalIndex - 1` through `allSentences`
 * (indexed by position) until an `h1`..`h6` entry is found.
 *
 * @param {object} sentence - Needs `globalIndex`.
 * @param {string[]} queryTokens
 * @param {object[]} allSentences - Entries need `blockType` and `tokens`.
 * @returns {number} 0.3 when no heading precedes the sentence; otherwise a
 *   blend of heading/query overlap and distance, capped at 1.
 */
function calculateHeadingProximityScore(sentence, queryTokens, allSentences) {
  const headingTag = /^h[1-6]$/;
  let headingIndex = -1;
  for (let idx = sentence.globalIndex - 1; idx >= 0; idx -= 1) {
    const candidate = allSentences[idx];
    if (candidate !== void 0 && headingTag.test(candidate.blockType)) {
      headingIndex = idx;
      break;
    }
  }
  if (headingIndex === -1) {
    return 0.3;
  }
  const heading = allSentences[headingIndex];
  const gap = sentence.globalIndex - headingIndex;
  const overlap = termOverlapRatio(queryTokens, heading.tokens);
  // Decays with distance: 1 sentence away -> ~0.83, 5 away -> 0.5.
  const closeness = 1 / (1 + gap / 5);
  return overlap > 0 ? Math.min(1, overlap * 0.6 + closeness * 0.4) : closeness * 0.5;
}
|
|
1036
|
+
/**
 * Scores how saturated a sentence is with query terms.
 *
 * Combines the share of sentence tokens that are query terms (weight 0.4)
 * with the share of query tokens that occur in the sentence (weight 0.6).
 *
 * @param {{ tokens: string[] }} sentence
 * @param {string[]} queryTokens
 * @returns {number} 0 when either token list is empty.
 */
function calculateDensityScore(sentence, queryTokens) {
  const { tokens } = sentence;
  if (tokens.length === 0 || queryTokens.length === 0) {
    return 0;
  }
  const wanted = new Set(queryTokens);
  const distinctHits = new Set();
  let hitCount = 0;
  for (const tok of tokens) {
    if (!wanted.has(tok)) continue;
    hitCount += 1;
    distinctHits.add(tok);
  }
  const rawDensity = hitCount / tokens.length;
  const coverage = distinctHits.size / queryTokens.length;
  return rawDensity * 0.4 + coverage * 0.6;
}
|
|
1057
|
+
/**
 * Scores a sentence from its structural context.
 *
 * Starts from a base value per block type (p 0.8, li 0.7, pre 0.65,
 * anything else 0.4) and adds:
 *  - 0.15 if another sentence of the same block overlaps the query by > 0.3,
 *  - 0.1 if any heading in `headingPath` contains a query token as substring,
 *  - 0.1 if a `pre` sentence lies within 2 global indices of a non-`pre` one.
 * The sum is capped at 1.
 *
 * @param {object} sentence - Needs `blockType`, `blockIndex`, `globalIndex`, `headingPath`.
 * @param {string[]} queryTokens
 * @param {object[]} allSentences
 * @returns {number}
 */
function calculateStructureScore(sentence, queryTokens, allSentences) {
  const baseByBlockType = new Map([
    ["p", 0.8],
    ["li", 0.7],
    ["pre", 0.65]
  ]);
  const baseScore = baseByBlockType.get(sentence.blockType) ?? 0.4;

  const isNearCode = sentence.blockType !== "pre" && allSentences.some(
    (other) => other.blockType === "pre" && other.globalIndex !== sentence.globalIndex && Math.abs(other.globalIndex - sentence.globalIndex) <= 2
  );
  const codeAdjacentBonus = isNearCode ? 0.1 : 0;

  let sameBlockBonus = 0;
  for (const other of allSentences) {
    const isSibling = other.blockIndex === sentence.blockIndex && other.globalIndex !== sentence.globalIndex;
    if (isSibling && termOverlapRatio(queryTokens, other.tokens) > 0.3) {
      sameBlockBonus = 0.15;
      break;
    }
  }

  const headingMentionsQuery = sentence.headingPath.some((heading) => {
    const lowered = heading.toLowerCase();
    return queryTokens.some((token) => lowered.includes(token));
  });
  const headingPathBonus = headingMentionsQuery ? 0.1 : 0;

  return Math.min(1, baseScore + sameBlockBonus + headingPathBonus + codeAdjacentBonus);
}
|
|
1097
|
+
/**
 * Scores how tightly the matched query terms cluster inside a sentence.
 *
 * Finds the shortest token window that contains every distinct query term
 * present in the sentence, then blends query coverage (0.4), window
 * tightness relative to sentence length (0.35) and term density inside the
 * window (0.25).
 *
 * @param {{ tokens: string[] }} sentence
 * @param {string[]} queryTokens
 * @returns {number} 0 when nothing matches or an input is empty; half the
 *   coverage ratio when exactly one distinct term matches.
 */
function calculateProximityScore(sentence, queryTokens) {
  const { tokens } = sentence;
  if (tokens.length === 0 || queryTokens.length === 0) {
    return 0;
  }
  const wanted = new Set(queryTokens);

  // Occurrences of query terms, collected in sentence order.
  const hits = [];
  const distinctTerms = new Set();
  tokens.forEach((tok, idx) => {
    if (tok !== void 0 && wanted.has(tok)) {
      hits.push({ pos: idx, term: tok });
      distinctTerms.add(tok);
    }
  });

  const matchedTermCount = distinctTerms.size;
  if (matchedTermCount === 0) {
    return 0;
  }
  const coverageRatio = matchedTermCount / queryTokens.length;
  if (matchedTermCount === 1) {
    return coverageRatio * 0.5;
  }

  // Sliding window over the hits: grow on the right, shrink from the left
  // while the window still holds every matched term.
  const countInWindow = new Map();
  let distinctInWindow = 0;
  let tail = 0;
  let minSpan = Infinity;
  for (const head of hits) {
    const before = countInWindow.get(head.term) ?? 0;
    countInWindow.set(head.term, before + 1);
    if (before === 0) {
      distinctInWindow += 1;
    }
    while (distinctInWindow === matchedTermCount) {
      const trailing = hits[tail];
      if (trailing === void 0) break;
      minSpan = Math.min(minSpan, head.pos - trailing.pos + 1);
      const remaining = (countInWindow.get(trailing.term) ?? 0) - 1;
      countInWindow.set(trailing.term, remaining);
      if (remaining === 0) {
        distinctInWindow -= 1;
      }
      tail += 1;
    }
  }

  const spanTightness = 1 - Math.min(1, minSpan / tokens.length);
  const densityInSpan = matchedTermCount / minSpan;
  return coverageRatio * 0.4 + spanTightness * 0.35 + Math.min(1, densityInSpan) * 0.25;
}
|
|
1162
|
+
/**
 * Scores the overlap between the query and the sentence's heading path,
 * weighting each query term by its IDF.
 *
 * @param {{ headingPath: string[] }} sentence
 * @param {string[]} queryTokens
 * @param {(term: string) => number} getIdf
 * @returns {number} 0.3 (baseline) when there is no heading path, no tokens
 *   on either side, or the query's total IDF is 0; otherwise
 *   0.3 + 0.7 * (matched IDF / total IDF).
 */
function calculateHeadingPathScore(sentence, queryTokens, getIdf) {
  const baseline = 0.3;
  if (sentence.headingPath.length === 0) {
    return baseline;
  }
  const headingTerms = new Set(tokenize(sentence.headingPath.join(" ")));
  if (headingTerms.size === 0 || queryTokens.length === 0) {
    return baseline;
  }
  let matchedIdf = 0;
  let totalIdf = 0;
  for (const term of queryTokens) {
    const weight = getIdf(term);
    totalIdf += weight;
    if (headingTerms.has(term)) {
      matchedIdf += weight;
    }
  }
  if (totalIdf === 0) {
    return baseline;
  }
  return 0.3 + matchedIdf / totalIdf * 0.7;
}
|
|
1187
|
+
/**
 * Scores how much of the query a sentence covers.
 *
 * Blends IDF-weighted coverage (0.7) with plain matched/total query token
 * coverage (0.3).
 *
 * @param {{ tokens: string[] }} sentence
 * @param {string[]} queryTokens
 * @param {(term: string) => number} getIdf
 * @returns {number} 0 when either token list is empty or the query's total
 *   IDF is 0.
 */
function calculateCoverageScore(sentence, queryTokens, getIdf) {
  if (sentence.tokens.length === 0 || queryTokens.length === 0) {
    return 0;
  }
  const present = new Set(sentence.tokens);
  const totals = { matchedIdf: 0, totalIdf: 0, matched: 0 };
  for (const term of queryTokens) {
    const weight = getIdf(term);
    totals.totalIdf += weight;
    if (present.has(term)) {
      totals.matchedIdf += weight;
      totals.matched += 1;
    }
  }
  if (totals.totalIdf === 0) {
    return 0;
  }
  const idfWeightedCoverage = totals.matchedIdf / totals.totalIdf;
  const simpleCoverage = totals.matched / queryTokens.length;
  return idfWeightedCoverage * 0.7 + simpleCoverage * 0.3;
}
|
|
1210
|
+
/**
 * Computes the median and the median absolute deviation (MAD) of the
 * per-sentence query-term density (query-term tokens / all tokens; empty
 * sentences count as density 0).
 *
 * @param {{ tokens: string[] }[]} sentences
 * @param {string[]} queryTokens
 * @returns {{ median: number, mad: number }} `mad` is floored at 0.001;
 *   `{ median: 0, mad: 0.001 }` when either input is empty.
 */
function computeDensityStats(sentences, queryTokens) {
  if (sentences.length === 0 || queryTokens.length === 0) {
    return { median: 0, mad: 1e-3 };
  }
  const wanted = new Set(queryTokens);

  // Median of an already ascending-sorted list (mean of the two middle
  // values for even lengths).
  const medianOfSorted = (values) => {
    const mid = Math.floor(values.length / 2);
    if (values.length % 2 === 0) {
      return ((values[mid - 1] ?? 0) + (values[mid] ?? 0)) / 2;
    }
    return values[mid] ?? 0;
  };

  const densities = sentences.map(({ tokens }) => {
    if (tokens.length === 0) {
      return 0;
    }
    let hits = 0;
    for (const tok of tokens) {
      if (wanted.has(tok)) hits += 1;
    }
    return hits / tokens.length;
  });
  densities.sort((a, b) => a - b);
  const median = medianOfSorted(densities);

  const deviations = densities.map((d) => Math.abs(d - median));
  deviations.sort((a, b) => a - b);
  const rawMad = medianOfSorted(deviations);

  return { median, mad: rawMad < 1e-3 ? 1e-3 : rawMad };
}
|
|
1241
|
+
/**
 * Tells whether a heading, after trimming surrounding whitespace, matches
 * any entry of `META_SECTION_PATTERNS`.
 *
 * @param {string} heading
 * @returns {boolean}
 */
function isMetaHeading(heading) {
  const candidate = heading.trim();
  return META_SECTION_PATTERNS.some((pattern) => pattern.test(candidate));
}
|
|
1250
|
+
/**
 * Scores a sentence by whether it looks like "meta" content rather than the
 * substance of the document.
 *
 * @param {{ headingPath: string[], text: string }} sentence
 * @returns {number} 0.2 when any heading in the path is a meta heading
 *   (per `isMetaHeading`), 0.3 when the text contains a known meta phrase,
 *   1 otherwise.
 */
function calculateMetaSectionScore(sentence) {
  for (const heading of sentence.headingPath) {
    if (isMetaHeading(heading)) {
      return 0.2;
    }
  }
  // Fold typographic single quotes (U+2018/U+2019) to the ASCII apostrophe
  // so contractions such as "we’ll cover" match the phrases listed below.
  const text = sentence.text.toLowerCase().replace(/[\u2018\u2019]/g, "'");
  const metaPhrases = [
    "in this article",
    "in this guide",
    "in this tutorial",
    "in this post",
    "we will cover",
    "we'll cover",
    "you will learn",
    "you'll learn",
    "we will explore",
    "we'll explore",
    "let's explore",
    "let's dive",
    "let's get started",
    "i will show you",
    "i'll show you",
    "we will discuss",
    "we'll discuss",
    "this article covers",
    "this guide covers",
    "this tutorial covers",
    "by the end of this",
    "after reading this",
    "if you enjoyed this",
    "if you found this helpful",
    "don't forget to",
    "make sure to subscribe",
    "leave a comment",
    "share this article",
    "follow me on",
    "check out my",
    "support my work",
    "buy me a coffee"
  ];
  return metaPhrases.some((phrase) => text.includes(phrase)) ? 0.3 : 1;
}
|
|
1298
|
+
/**
 * Rewards sentences whose query-term density stands out above the
 * document's typical density.
 *
 * The density is turned into a robust z-score, (density - median) / mad, and
 * positive z-scores go through a logistic curve centred at z = 2.
 *
 * @param {{ tokens: string[] }} sentence
 * @param {string[]} queryTokens
 * @param {{ median: number, mad: number }} densityStats
 * @returns {number} 0.3 for empty inputs or a z-score <= 0; otherwise a
 *   value between 0.3 and 1.
 */
function calculateOutlierScore(sentence, queryTokens, densityStats) {
  const floor = 0.3;
  const { tokens } = sentence;
  if (tokens.length === 0 || queryTokens.length === 0) {
    return floor;
  }
  const wanted = new Set(queryTokens);
  const hits = tokens.reduce((count, tok) => (wanted.has(tok) ? count + 1 : count), 0);
  const density = hits / tokens.length;
  const zScore = (density - densityStats.median) / densityStats.mad;
  if (zScore <= 0) {
    return floor;
  }
  const sigmoid = 1 / (1 + Math.exp(-(zScore - 2)));
  return 0.3 + 0.7 * sigmoid;
}
|
|
1317
|
+
/**
 * Runs every heuristic for one sentence and combines them into a weighted
 * sum.
 *
 * @param {object} sentence
 * @param {string[]} queryTokens
 * @param {object[]} allSentences
 * @param {(term: string) => number} getIdf
 * @param {{ median: number, mad: number }} densityStats
 * @param {object} [weights] - Per-heuristic overrides merged over `DEFAULT_WEIGHTS`.
 * @returns {object} Each individual score plus `combined`.
 */
function calculateHeuristicScores(sentence, queryTokens, allSentences, getIdf, densityStats, weights = {}) {
  const w = { ...DEFAULT_WEIGHTS, ...weights };
  const parts = {
    positionScore: calculatePositionScore(sentence),
    headingProximityScore: calculateHeadingProximityScore(sentence, queryTokens, allSentences),
    densityScore: calculateDensityScore(sentence, queryTokens),
    structureScore: calculateStructureScore(sentence, queryTokens, allSentences),
    proximityScore: calculateProximityScore(sentence, queryTokens),
    headingPathScore: calculateHeadingPathScore(sentence, queryTokens, getIdf),
    coverageScore: calculateCoverageScore(sentence, queryTokens, getIdf),
    outlierScore: calculateOutlierScore(sentence, queryTokens, densityStats),
    metaSectionScore: calculateMetaSectionScore(sentence)
  };
  // Summation order is kept fixed so floating-point results stay stable.
  const combined = w.position * parts.positionScore
    + w.headingProximity * parts.headingProximityScore
    + w.density * parts.densityScore
    + w.structure * parts.structureScore
    + w.proximity * parts.proximityScore
    + w.headingPath * parts.headingPathScore
    + w.coverage * parts.coverageScore
    + w.outlier * parts.outlierScore
    + w.metaSection * parts.metaSectionScore;
  return { ...parts, combined };
}
|
|
1342
|
+
/**
 * Computes heuristic scores for every sentence.
 *
 * Density statistics are computed once over all sentences and shared by the
 * per-sentence scoring calls.
 *
 * @param {object[]} sentences
 * @param {string[]} queryTokens
 * @param {(term: string) => number} getIdf
 * @param {object} [weights] - Forwarded to `calculateHeuristicScores`.
 * @returns {Map<number, object>} Scores keyed by `sentence.globalIndex`.
 */
function scoreAllSentencesHeuristics(sentences, queryTokens, getIdf, weights = {}) {
  const densityStats = computeDensityStats(sentences, queryTokens);
  return new Map(
    sentences.map((sentence) => [
      sentence.globalIndex,
      calculateHeuristicScores(sentence, queryTokens, sentences, getIdf, densityStats, weights)
    ])
  );
}
|
|
1358
|
+
|
|
1359
|
+
// src/scoring/ranker.ts
|
|
1360
|
+
// Ranker defaults: how BM25 and heuristic scores are blended, and which
// relevance thresholds apply when the caller does not choose a mode.
var DEFAULT_CONFIG2 = {
  bm25Weight: 0.6,
  heuristicWeight: 0.4,
  relevanceMode: "strict"
};

// Relevance thresholds used when relevanceMode is "search".
var SEARCH_RELEVANCE_THRESHOLDS = {
  strongSentenceBm25: 0.8, // Was 1.2
  strongSentenceCoverage: 0.25,
  cooccurrenceMinTerms: 2,
  cooccurrenceBm25: 0.5, // Was 0.8
  centralTermBm25: 0.4, // Was 0.6
  goodCoverage: 0.5,
  goodCoverageBm25: 0.3 // Was 0.4
};

// Relevance thresholds used for every other relevanceMode (default "strict").
var STRICT_RELEVANCE_THRESHOLDS = {
  cooccurrenceMinTerms: 2,
  cooccurrenceBm25: 1,
  centralTermBm25: 0.8,
  highCoverage: 0.8,
  highCoverageBm25: 0.5
};
|
|
1385
|
+
/**
 * Min-max normalizes a map of scores into the range [0, 1].
 *
 * @param {Map<*, number>} scores
 * @returns {Map<*, number>} A new map with the same keys. Every value is 0.5
 *   when all scores are equal; the map is empty when the input is empty.
 */
function normalizeScores(scores) {
  const normalized = new Map();
  if (scores.size === 0) {
    return normalized;
  }
  let lowest = Infinity;
  let highest = -Infinity;
  for (const value of scores.values()) {
    lowest = value < lowest ? value : lowest;
    highest = value > highest ? value : highest;
  }
  const spread = highest - lowest;
  scores.forEach((value, key) => {
    normalized.set(key, spread === 0 ? 0.5 : (value - lowest) / spread);
  });
  return normalized;
}
|
|
1400
|
+
/**
 * Ranks sentences against a query and decides whether the document holds
 * relevant results at all.
 *
 * Each sentence gets a raw BM25 score (min-max normalized for ranking) and a
 * combined heuristic score; the two are blended with `bm25Weight` and
 * `heuristicWeight`. Relevance is judged from the raw BM25 maximum, query
 * term coverage, term co-occurrence within one sentence, and whether any
 * query term is "central" (appears in at least max(3, 10% of sentences)).
 *
 * @param {object[]} sentences - Entries need `tokens` and `globalIndex`.
 * @param {string[]} queryTokens
 * @param {object} [config] - `bm25Weight`, `heuristicWeight`, `bm25Config`,
 *   `heuristicWeights`, `relevanceMode` ("search" selects the search
 *   thresholds; anything else uses the strict ones).
 * @returns {{ sentences: object[], hasRelevantResults: boolean,
 *   maxRawBm25: number, queryTermCoverage: number, maxCooccurrence: number }}
 *   `sentences` is sorted by `combinedScore` descending, ties broken by
 *   ascending `globalIndex`.
 */
function rankSentencesWithRelevance(sentences, queryTokens, config = {}) {
  const {
    bm25Weight = DEFAULT_CONFIG2.bm25Weight,
    heuristicWeight = DEFAULT_CONFIG2.heuristicWeight,
    bm25Config,
    heuristicWeights,
    relevanceMode = DEFAULT_CONFIG2.relevanceMode
  } = config;

  // Nothing to rank: return the input with zeroed scores.
  if (sentences.length === 0 || queryTokens.length === 0) {
    const unscored = sentences.map((s) => ({
      ...s,
      bm25Score: 0,
      heuristicScore: 0,
      combinedScore: 0
    }));
    return {
      sentences: unscored,
      hasRelevantResults: false,
      maxRawBm25: 0,
      queryTermCoverage: 0,
      maxCooccurrence: 0
    };
  }

  // Raw BM25 per sentence, tracking the document-wide maximum.
  const bm25Scorer = createBM25Scorer(sentences, bm25Config);
  const rawBm25Scores = new Map();
  let maxRawBm25 = 0;
  for (const sentence of sentences) {
    const termFrequencies = buildTermFrequencyMap(sentence.tokens);
    const raw = bm25Scorer.scoreWithTf(queryTokens, termFrequencies, sentence.tokens.length);
    rawBm25Scores.set(sentence.globalIndex, raw);
    maxRawBm25 = raw > maxRawBm25 ? raw : maxRawBm25;
  }

  // One pass gathers: which query terms occur anywhere, the largest number
  // of distinct query terms inside a single sentence, and in how many
  // sentences each term occurs.
  const queryTermSet = new Set(queryTokens);
  const foundTerms = new Set();
  const termSentenceCount = new Map();
  let maxCooccurrence = 0;
  for (const sentence of sentences) {
    const termsHere = new Set();
    for (const token of sentence.tokens) {
      if (queryTermSet.has(token)) {
        termsHere.add(token);
      }
    }
    if (termsHere.size > maxCooccurrence) {
      maxCooccurrence = termsHere.size;
    }
    for (const term of termsHere) {
      foundTerms.add(term);
      termSentenceCount.set(term, (termSentenceCount.get(term) ?? 0) + 1);
    }
  }
  // queryTokens is non-empty here (guarded above).
  const queryTermCoverage = foundTerms.size / queryTokens.length;

  const centralTermThreshold = Math.max(3, sentences.length * 0.1);
  const hasCentralTerm = [...termSentenceCount.values()].some(
    (count) => count >= centralTermThreshold
  );

  let hasRelevantResults;
  if (relevanceMode === "search") {
    const t = SEARCH_RELEVANCE_THRESHOLDS;
    // Single strong sentence with decent coverage (search mode)
    const strongSentence = maxRawBm25 > t.strongSentenceBm25 && queryTermCoverage >= t.strongSentenceCoverage;
    // Multiple query terms in same sentence
    const cooccurring = maxCooccurrence >= t.cooccurrenceMinTerms && maxRawBm25 > t.cooccurrenceBm25;
    // A matching term is central to the document
    const centralTopic = hasCentralTerm && maxRawBm25 > t.centralTermBm25;
    // Good coverage
    const goodCoverage = queryTermCoverage >= t.goodCoverage && maxRawBm25 > t.goodCoverageBm25;
    hasRelevantResults = strongSentence || cooccurring || centralTopic || goodCoverage;
  } else {
    const t = STRICT_RELEVANCE_THRESHOLDS;
    // Multiple query terms in same sentence with good BM25 (strict mode)
    const cooccurring = maxCooccurrence >= t.cooccurrenceMinTerms && maxRawBm25 > t.cooccurrenceBm25;
    // A matching term is central to the document (topic match)
    const centralTopic = hasCentralTerm && maxRawBm25 > t.centralTermBm25;
    // Very high coverage
    const highCoverage = queryTermCoverage >= t.highCoverage && maxRawBm25 > t.highCoverageBm25;
    hasRelevantResults = cooccurring || centralTopic || highCoverage;
  }

  const heuristicScores = scoreAllSentencesHeuristics(
    sentences,
    queryTokens,
    bm25Scorer.getIdf,
    heuristicWeights
  );
  const normalizedBm25 = normalizeScores(rawBm25Scores);

  const scoredSentences = sentences.map((sentence) => {
    const bm25Score = normalizedBm25.get(sentence.globalIndex) ?? 0;
    const heuristicScore = heuristicScores.get(sentence.globalIndex)?.combined ?? 0;
    return {
      ...sentence,
      bm25Score,
      heuristicScore,
      combinedScore: bm25Weight * bm25Score + heuristicWeight * heuristicScore
    };
  });
  scoredSentences.sort((a, b) => {
    const byScore = b.combinedScore - a.combinedScore;
    return byScore !== 0 ? byScore : a.globalIndex - b.globalIndex;
  });

  return {
    sentences: scoredSentences,
    hasRelevantResults,
    maxRawBm25,
    queryTermCoverage,
    maxCooccurrence
  };
}
|
|
1511
|
+
|
|
1512
|
+
// src/extraction/anchors.ts
|
|
1513
|
+
// Defaults for anchor selection.
var DEFAULT_CONFIG3 = {
  maxAnchors: 5,
  minScore: 0.1,
  // Max Jaccard similarity to already-selected anchors
  diversityThreshold: 0.5
};
|
|
1519
|
+
/**
 * Greedily picks anchor sentences from a ranked list while keeping them
 * diverse in both content and position.
 *
 * A candidate is skipped when its `combinedScore` is below `minScore`, when
 * its token Jaccard similarity to an already chosen anchor exceeds
 * `diversityThreshold`, or when it lies closer than `minPositionGap` global
 * indices to one.
 *
 * @param {object[]} rankedSentences - Candidates in preference order; entries
 *   need `combinedScore`, `tokens` and `globalIndex`.
 * @param {object} [config] - `maxAnchors`, `minScore`, `diversityThreshold`
 *   (defaults from `DEFAULT_CONFIG3`) and `minPositionGap` (default 3).
 * @returns {object[]} At most `maxAnchors` sentences; empty when
 *   `maxAnchors` is 0 or negative.
 */
function selectAnchorsWithPositionDiversity(rankedSentences, config = {}) {
  const {
    maxAnchors = DEFAULT_CONFIG3.maxAnchors,
    minScore = DEFAULT_CONFIG3.minScore,
    diversityThreshold = DEFAULT_CONFIG3.diversityThreshold,
    // Minimum global index gap between anchors
    minPositionGap = 3
  } = config;
  const anchors = [];
  // The cap is otherwise only checked after a push, which would let one
  // anchor through even when the caller asked for none.
  if (maxAnchors <= 0) {
    return anchors;
  }
  for (const sentence of rankedSentences) {
    if (sentence.combinedScore < minScore) {
      continue;
    }
    const conflictsWithChosen = anchors.some(
      (anchor) => jaccardSimilarity(sentence.tokens, anchor.tokens) > diversityThreshold
        || Math.abs(sentence.globalIndex - anchor.globalIndex) < minPositionGap
    );
    if (conflictsWithChosen) {
      continue;
    }
    anchors.push(sentence);
    if (anchors.length >= maxAnchors) {
      break;
    }
  }
  return anchors;
}
|
|
1554
|
+
|
|
1555
|
+
// src/extraction/expand.ts
|
|
1556
|
+
/**
 * Renders a run of sentences as Markdown-like text.
 *
 * - Headings (`h1`..`h6`) start a new paragraph unless they open the chunk.
 * - `pre` sentences are wrapped in a fenced code block.
 * - `li` sentences become "- " bullets, with a blank line before the first
 *   bullet of a list.
 * - Other text starts a new paragraph after a heading, a code block or a
 *   list item (so it is not appended to the last bullet's line), and is
 *   otherwise joined to the previous sentence with a single space.
 *
 * @param {{ blockType: string, text: string }[]} sentences
 * @returns {string} The assembled text, trimmed.
 */
function buildChunkText(sentences) {
  const isHeading = (blockType) => /^h[1-6]$/.test(blockType);
  const parts = [];
  let lastBlockType = "";
  for (const s of sentences) {
    if (isHeading(s.blockType) && parts.length > 0) {
      parts.push("\n\n" + s.text);
    } else if (s.blockType === "pre") {
      parts.push("\n\n```\n" + s.text + "\n```");
    } else if (s.blockType === "li") {
      if (lastBlockType !== "li") {
        parts.push("\n");
      }
      parts.push("\n- " + s.text);
    } else if (lastBlockType === "pre" || lastBlockType === "li" || isHeading(lastBlockType)) {
      parts.push("\n\n" + s.text);
    } else if (parts.length > 0) {
      parts.push(" " + s.text);
    } else {
      parts.push(s.text);
    }
    lastBlockType = s.blockType;
  }
  return parts.join("").trim();
}
|
|
1582
|
+
// Defaults for expanding an anchor sentence into a chunk.
var DEFAULT_CONFIG4 = {
  contextBefore: 5,
  contextAfter: 8,
  // Changed: allow crossing blocks by default
  respectBlockBoundaries: false,
  maxChunkChars: 2000,
  includeCodeBlocks: true,
  expandToSection: true
};
|
|
1591
|
+
/**
 * Finds the index range of the section that contains an anchor sentence,
 * shrunk around the anchor when the whole section exceeds the size budget.
 *
 * The section starts at the nearest heading at or before the anchor (or at
 * the anchor itself when there is none) and ends just before the next
 * heading of the same or a higher level (lower or equal `h` number), or at
 * the last sentence.
 *
 * @param {number} anchorIndex - Index into `allSentences`.
 * @param {{ blockType: string, text: string }[]} allSentences
 * @param {number} maxChars - Budget; each sentence costs `text.length + 1`
 *   (the anchor itself is counted without the +1 when shrinking).
 * @returns {{ start: number, end: number }} Inclusive range;
 *   `{ start: anchorIndex, end: anchorIndex }` when the anchor is missing.
 */
function findSectionBoundaries(anchorIndex, allSentences, maxChars) {
  if (allSentences[anchorIndex] === void 0) {
    return { start: anchorIndex, end: anchorIndex };
  }
  // Heading level (1-6) of a sentence, or null for non-headings.
  const headingLevelOf = (s) => (/^h[1-6]$/.test(s.blockType) ? parseInt(s.blockType[1] ?? "6", 10) : null);

  // Backwards: nearest heading at or before the anchor opens the section.
  let sectionLevel = 6;
  let sectionStart = anchorIndex;
  for (let idx = anchorIndex; idx >= 0; idx -= 1) {
    const s = allSentences[idx];
    if (s === void 0) continue;
    const level = headingLevelOf(s);
    if (level !== null) {
      sectionLevel = level;
      sectionStart = idx;
      break;
    }
  }

  // Forwards: the next heading that is not nested deeper closes it.
  let sectionEnd = allSentences.length - 1;
  for (let idx = anchorIndex + 1; idx < allSentences.length; idx += 1) {
    const s = allSentences[idx];
    if (s === void 0) continue;
    const level = headingLevelOf(s);
    if (level !== null && level <= sectionLevel) {
      sectionEnd = idx - 1;
      break;
    }
  }

  let sectionChars = 0;
  for (let idx = sectionStart; idx <= sectionEnd; idx += 1) {
    const s = allSentences[idx];
    if (s !== void 0) {
      sectionChars += s.text.length + 1;
    }
  }
  if (sectionChars <= maxChars) {
    return { start: sectionStart, end: sectionEnd };
  }

  // Section is too large: grow outwards from the anchor, alternating one
  // sentence before and one after, until neither side fits any more.
  let used = allSentences[anchorIndex]?.text.length ?? 0;
  let start = anchorIndex;
  let end = anchorIndex;
  let growBefore = start > sectionStart;
  let growAfter = end < sectionEnd;
  while ((growBefore || growAfter) && used < maxChars) {
    if (growBefore) {
      const prev = allSentences[start - 1];
      if (prev && used + prev.text.length < maxChars) {
        start -= 1;
        used += prev.text.length + 1;
        growBefore = start > sectionStart;
      } else {
        growBefore = false;
      }
    }
    if (growAfter && used < maxChars) {
      const next = allSentences[end + 1];
      if (next && used + next.text.length < maxChars) {
        end += 1;
        used += next.text.length + 1;
        growAfter = end < sectionEnd;
      } else {
        growAfter = false;
      }
    }
  }
  return { start, end };
}
|
|
1661
|
+
function expandToIncludeCode(endIndex, allSentences, maxChars, currentChars) {
|
|
1662
|
+
let newEnd = endIndex;
|
|
1663
|
+
let chars = currentChars;
|
|
1664
|
+
for (let i = endIndex + 1; i < allSentences.length; i++) {
|
|
1665
|
+
const s = allSentences[i];
|
|
1666
|
+
if (s === void 0) break;
|
|
1667
|
+
if (/^h[1-6]$/.test(s.blockType)) {
|
|
1668
|
+
break;
|
|
1669
|
+
}
|
|
1670
|
+
if (s.blockType === "pre") {
|
|
1671
|
+
if (chars + s.text.length <= maxChars * 1.5) {
|
|
1672
|
+
newEnd = i;
|
|
1673
|
+
chars += s.text.length;
|
|
1674
|
+
}
|
|
1675
|
+
break;
|
|
1676
|
+
}
|
|
1677
|
+
if (chars + s.text.length <= maxChars) {
|
|
1678
|
+
newEnd = i;
|
|
1679
|
+
chars += s.text.length;
|
|
1680
|
+
} else {
|
|
1681
|
+
break;
|
|
1682
|
+
}
|
|
1683
|
+
}
|
|
1684
|
+
return newEnd;
|
|
1685
|
+
}
|
|
1686
|
+
/**
 * Expands one anchor sentence into a chunk of surrounding sentences.
 *
 * With `expandToSection` the range comes from `findSectionBoundaries`;
 * otherwise a fixed window (`contextBefore` / `contextAfter`) is taken and
 * trimmed from the front, then from the back, until it fits `maxChunkChars`
 * (never trimming past the anchor). With `includeCodeBlocks` the end is then
 * pushed forward via `expandToIncludeCode`.
 *
 * @param {object} anchor - Needs `globalIndex`, `combinedScore`, `headingPath`.
 * @param {object[]} allSentences
 * @param {object} [config] - Overrides for `DEFAULT_CONFIG4` fields.
 * @returns {{ sentences: object[], anchorIndex: number, score: number,
 *   text: string, charCount: number, headingPath: * }}
 */
function expandAnchor(anchor, allSentences, config = {}) {
  const {
    contextBefore = DEFAULT_CONFIG4.contextBefore,
    contextAfter = DEFAULT_CONFIG4.contextAfter,
    maxChunkChars = DEFAULT_CONFIG4.maxChunkChars,
    includeCodeBlocks = DEFAULT_CONFIG4.includeCodeBlocks,
    expandToSection = DEFAULT_CONFIG4.expandToSection
  } = config;
  const anchorIndex = anchor.globalIndex;

  // Raw text length of the sentence at `idx`, 0 for missing entries.
  const lengthAt = (idx) => {
    const s = allSentences[idx];
    return s !== void 0 ? s.text.length : 0;
  };
  // Total raw text length of the inclusive range [from, to].
  const charsBetween = (from, to) => {
    let total = 0;
    for (let idx = from; idx <= to; idx += 1) {
      total += lengthAt(idx);
    }
    return total;
  };

  let startIndex;
  let endIndex;
  if (expandToSection) {
    ({ start: startIndex, end: endIndex } = findSectionBoundaries(anchorIndex, allSentences, maxChunkChars));
  } else {
    startIndex = Math.max(0, anchorIndex - contextBefore);
    endIndex = Math.min(allSentences.length - 1, anchorIndex + contextAfter);
    let windowChars = charsBetween(startIndex, endIndex);
    while (windowChars > maxChunkChars && startIndex < anchorIndex) {
      windowChars -= lengthAt(startIndex);
      startIndex += 1;
    }
    while (windowChars > maxChunkChars && endIndex > anchorIndex) {
      windowChars -= lengthAt(endIndex);
      endIndex -= 1;
    }
  }

  if (includeCodeBlocks) {
    endIndex = expandToIncludeCode(endIndex, allSentences, maxChunkChars, charsBetween(startIndex, endIndex));
  }

  const chunkSentences = [];
  let rawCharCount = 0;
  for (let idx = startIndex; idx <= endIndex; idx += 1) {
    const s = allSentences[idx];
    if (s !== void 0) {
      chunkSentences.push(s);
      rawCharCount += s.text.length;
    }
  }

  return {
    sentences: chunkSentences,
    anchorIndex: anchor.globalIndex,
    score: anchor.combinedScore,
    text: buildChunkText(chunkSentences),
    // Use raw content for budget, not formatted text
    charCount: rawCharCount,
    headingPath: anchor.headingPath
  };
}
|
|
1756
|
+
/**
 * Expand every anchor into a chunk using the same sentence list and config.
 *
 * @param {Array<object>} anchors - Anchor sentences to expand.
 * @param {Array<object>} allSentences - Every sentence of the document.
 * @param {object} [config] - Expansion settings forwarded to expandAnchor.
 * @returns {Array<object>} One chunk per anchor, in anchor order.
 */
function expandAnchors(anchors, allSentences, config = {}) {
  const toChunk = (anchor) => expandAnchor(anchor, allSentences, config);
  return anchors.map(toChunk);
}
|
|
1759
|
+
|
|
1760
|
+
// src/extraction/dedupe.ts
|
|
1761
|
+
// Default thresholds used by dedupeChunks when the caller supplies none.
var DEFAULT_CONFIG5 = {
  // Sentence index overlap ratio to trigger merge
  overlapThreshold: 0.5,
  // Token Jaccard similarity to consider duplicate
  tokenSimilarityThreshold: 0.72
};
|
|
1767
|
+
/**
 * Ratio of shared sentence indices between two chunks, relative to the
 * chunk that has fewer distinct sentences.
 *
 * @param {object} chunkA - Chunk with a `sentences` array of `{ globalIndex }`.
 * @param {object} chunkB - Chunk with a `sentences` array of `{ globalIndex }`.
 * @returns {number} Value in [0, 1]; 0 when either chunk has no sentences.
 */
function calculateSentenceOverlap(chunkA, chunkB) {
  const idsA = new Set(chunkA.sentences.map((sentence) => sentence.globalIndex));
  const idsB = new Set(chunkB.sentences.map((sentence) => sentence.globalIndex));
  const smaller = Math.min(idsA.size, idsB.size);
  if (smaller === 0) return 0;
  const shared = [...idsA].filter((id) => idsB.has(id)).length;
  return shared / smaller;
}
|
|
1780
|
+
/**
 * Merge two chunks into one that covers the union of their sentences.
 *
 * Sentences are de-duplicated by `globalIndex` (chunkB's copy wins on a
 * clash) and returned in document order. Anchor index and heading path come
 * from the higher-scoring chunk (chunkA on a tie); the score is the maximum.
 *
 * @param {object} chunkA - First chunk.
 * @param {object} chunkB - Second chunk.
 * @returns {object} Chunk with `sentences`, `anchorIndex`, `score`, `text`, `charCount`, `headingPath`.
 */
function mergeChunks(chunkA, chunkB) {
  const byIndex = /* @__PURE__ */ new Map();
  for (const sentence of chunkA.sentences) {
    byIndex.set(sentence.globalIndex, sentence);
  }
  for (const sentence of chunkB.sentences) {
    byIndex.set(sentence.globalIndex, sentence);
  }
  const mergedSentences = Array.from(byIndex.values()).sort((a, b) => a.globalIndex - b.globalIndex);
  const keepChunk = chunkA.score >= chunkB.score ? chunkA : chunkB;
  return {
    sentences: mergedSentences,
    anchorIndex: keepChunk.anchorIndex,
    score: Math.max(chunkA.score, chunkB.score),
    text: buildChunkText(mergedSentences),
    // Measure raw sentence content, not the formatted text, so merged chunks
    // are budgeted the same way as chunks produced by expandAnchor.
    charCount: mergedSentences.reduce((sum, sentence) => sum + sentence.text.length, 0),
    headingPath: keepChunk.headingPath
  };
}
|
|
1800
|
+
/**
 * Collect the tokens of every sentence in a chunk into one flat list.
 *
 * @param {object} chunk - Chunk whose `sentences` each carry a `tokens` list.
 * @returns {Array} All tokens in sentence order, duplicates included.
 */
function getChunkTokens(chunk) {
  return chunk.sentences.reduce((collected, sentence) => {
    collected.push(...sentence.tokens);
    return collected;
  }, []);
}
|
|
1807
|
+
/**
 * Collapse overlapping or near-duplicate chunks.
 *
 * Chunks are visited from best to worst score. Each surviving chunk absorbs
 * (via mergeChunks) any later chunk whose sentence overlap reaches
 * `overlapThreshold`, and drops any later chunk whose token Jaccard
 * similarity reaches `tokenSimilarityThreshold`.
 *
 * @param {Array<object>} chunks - Candidate chunks.
 * @param {object} [config] - Optional `overlapThreshold` / `tokenSimilarityThreshold`.
 * @returns {Array<object>} Surviving chunks sorted by score (desc), then anchor index (asc).
 *   The input array itself is returned when it has at most one chunk.
 */
function dedupeChunks(chunks, config = {}) {
  const {
    overlapThreshold = DEFAULT_CONFIG5.overlapThreshold,
    tokenSimilarityThreshold = DEFAULT_CONFIG5.tokenSimilarityThreshold
  } = config;
  if (chunks.length <= 1) {
    return chunks;
  }

  const byScoreThenPosition = (a, b) => {
    const scoreDelta = b.score - a.score;
    return scoreDelta !== 0 ? scoreDelta : a.anchorIndex - b.anchorIndex;
  };

  const ranked = [...chunks].sort(byScoreThenPosition);
  const absorbed = /* @__PURE__ */ new Set();
  const survivors = [];

  ranked.forEach((seed, i) => {
    if (absorbed.has(i) || seed === undefined) return;
    let current = seed;
    for (let j = i + 1; j < ranked.length; j++) {
      if (absorbed.has(j)) continue;
      const candidate = ranked[j];
      if (candidate === undefined) continue;
      if (calculateSentenceOverlap(current, candidate) >= overlapThreshold) {
        // Enough shared sentences: fold the candidate into the current chunk.
        current = mergeChunks(current, candidate);
        absorbed.add(j);
        continue;
      }
      const similarity = jaccardSimilarity(getChunkTokens(current), getChunkTokens(candidate));
      if (similarity >= tokenSimilarityThreshold) {
        // Different sentences but near-identical wording: drop the candidate.
        absorbed.add(j);
      }
    }
    survivors.push(current);
  });

  return survivors.sort(byScoreThenPosition);
}
|
|
1852
|
+
/**
 * Drop every chunk whose sentences are all contained in another kept chunk.
 *
 * Chunks are examined from most to fewest sentences, so the larger chunk is
 * the one kept. The index set of each kept chunk is built once and reused,
 * rather than being rebuilt for every comparison.
 *
 * @param {Array<object>} chunks - Chunks with `sentences`, `score`, `anchorIndex`.
 * @returns {Array<object>} Kept chunks sorted by score (desc), then anchor index (asc).
 *   The input array itself is returned when it has at most one chunk.
 */
function removeSubsetChunks(chunks) {
  if (chunks.length <= 1) {
    return chunks;
  }
  const largestFirst = [...chunks].sort((a, b) => {
    const sizeDelta = b.sentences.length - a.sentences.length;
    return sizeDelta !== 0 ? sizeDelta : a.anchorIndex - b.anchorIndex;
  });

  const kept = [];
  const keptIndexSets = [];
  for (const chunk of largestFirst) {
    const indices = chunk.sentences.map((sentence) => sentence.globalIndex);
    const isSubset = keptIndexSets.some((known) => indices.every((index) => known.has(index)));
    if (!isSubset) {
      kept.push(chunk);
      keptIndexSets.push(new Set(indices));
    }
  }

  kept.sort((a, b) => {
    const scoreDelta = b.score - a.score;
    return scoreDelta !== 0 ? scoreDelta : a.anchorIndex - b.anchorIndex;
  });
  return kept;
}
|
|
1884
|
+
/**
 * Run both de-duplication passes: merge/drop similar chunks, then remove
 * chunks fully contained in another.
 *
 * @param {Array<object>} chunks - Candidate chunks.
 * @param {object} [config] - Thresholds forwarded to dedupeChunks.
 * @returns {Array<object>} De-duplicated chunks.
 */
function fullDedupe(chunks, config = {}) {
  return removeSubsetChunks(dedupeChunks(chunks, config));
}
|
|
1888
|
+
|
|
1889
|
+
// src/pipeline.ts
|
|
1890
|
+
/**
 * Build an extraction result that contains no excerpts.
 *
 * @param {string} query - The query the result belongs to.
 * @param {boolean} debug - When truthy, attach a `debug` section.
 * @param {object} [debugInfo] - Values layered over the zeroed debug defaults.
 * @param {object} [relevanceMetrics] - Attached as `relevanceMetrics` when truthy.
 * @returns {object} `{ excerpts: [], totalChars: 0, query }` plus the optional sections.
 */
function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
  const metricsPart = relevanceMetrics ? { relevanceMetrics } : {};
  const debugPart = debug ? {
    debug: {
      sentenceCount: 0,
      anchorCount: 0,
      chunkCount: 0,
      dedupedChunkCount: 0,
      topSentences: [],
      ...debugInfo
    }
  } : {};
  return {
    excerpts: [],
    totalChars: 0,
    query,
    ...metricsPart,
    ...debugPart
  };
}
|
|
1911
|
+
// Default settings for every stage of extractExcerpts.
var DEFAULT_CONFIG6 = {
  ranker: {
    bm25Weight: 0.6,
    heuristicWeight: 0.4
  },
  anchors: {
    maxAnchors: 5,
    // Higher threshold to filter irrelevant results
    minScore: 0.25,
    diversityThreshold: 0.4,
    minPositionGap: 5
  },
  expand: {
    contextBefore: 5,
    contextAfter: 8,
    respectBlockBoundaries: false,
    maxChunkChars: 2e3,
    includeCodeBlocks: true,
    expandToSection: true
  },
  dedupe: {
    overlapThreshold: 0.3,
    tokenSimilarityThreshold: 0.6
  },
  excerpts: {
    maxExcerpts: 3,
    charBudget: 6e3,
    minExcerptChars: 100
  },
  debug: false
};
|
|
1942
|
+
/**
 * Extract the excerpts of an HTML document most relevant to a query.
 *
 * Stages, each timed through the logger: preprocess HTML, segment into
 * sentences, tokenize the query, rank sentences, select anchors, expand
 * anchors into chunks, de-duplicate chunks, assemble excerpts.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} query - Query used for ranking.
 * @param {object} [config] - Partial overrides merged over DEFAULT_CONFIG6.
 * @returns {object} Assembled result plus `relevanceMetrics` (and `debug` when enabled).
 *   An empty result is returned when there is no main content, no sentences,
 *   or ranking reports nothing relevant.
 */
function extractExcerpts(html, query, config = {}) {
  const cfg = mergeConfig(DEFAULT_CONFIG6, config);
  const timer = logger_default.getInstance();

  const { $, mainContent } = timer.time("1. Preprocess HTML", () => preprocessHtml(html));
  if (mainContent === null) {
    return createEmptyResult(query, cfg.debug);
  }

  const { sentences } = timer.time("2. Segment into sentences", () => segmentHtml($, mainContent));
  if (sentences.length === 0) {
    return createEmptyResult(query, cfg.debug);
  }

  const queryTokens = timer.time("3. Tokenize query", () => tokenize(query));
  if (queryTokens.length === 0) {
    // Nothing to rank against: fall back to the opening sentences.
    return extractWithoutQuery(sentences, query, cfg);
  }

  const ranking = timer.time("4. Rank sentences", () => rankSentencesWithRelevance(sentences, queryTokens, cfg.ranker));
  const rankedSentences = ranking.sentences;

  // Truncated preview of the best-ranked sentences for debug output.
  const previewTop = (limit) => rankedSentences.slice(0, limit).map((s) => ({
    text: s.text.slice(0, 100) + (s.text.length > 100 ? "..." : ""),
    score: s.combinedScore,
    headingPath: s.headingPath
  }));

  if (!ranking.hasRelevantResults) {
    const emptyDebugInfo = {
      sentenceCount: sentences.length,
      queryTermCoverage: ranking.queryTermCoverage,
      maxRawBm25: ranking.maxRawBm25,
      hasRelevantResults: false,
      topSentences: previewTop(5)
    };
    const emptyMetrics = {
      hasRelevantResults: false,
      sentenceCount: sentences.length,
      queryTermCoverage: ranking.queryTermCoverage,
      maxBm25: ranking.maxRawBm25,
      maxCooccurrence: ranking.maxCooccurrence
    };
    return createEmptyResult(query, cfg.debug, emptyDebugInfo, emptyMetrics);
  }

  const anchors = timer.time("5. Select anchors", () => selectAnchorsWithPositionDiversity(rankedSentences, cfg.anchors));
  const chunks = timer.time("6. Expand anchors", () => expandAnchors(anchors, sentences, cfg.expand));
  const dedupedChunks = timer.time("7. Deduplicate chunks", () => fullDedupe(chunks, cfg.dedupe));
  const result = timer.time("8. Assemble excerpts", () => assembleExcerpts(dedupedChunks, query, cfg.excerpts));

  const relevanceMetrics = {
    hasRelevantResults: ranking.hasRelevantResults,
    sentenceCount: sentences.length,
    queryTermCoverage: ranking.queryTermCoverage,
    maxBm25: ranking.maxRawBm25,
    maxCooccurrence: ranking.maxCooccurrence
  };

  if (!cfg.debug) {
    return { ...result, relevanceMetrics };
  }

  const debugInfo = {
    sentenceCount: sentences.length,
    anchorCount: anchors.length,
    chunkCount: chunks.length,
    dedupedChunkCount: dedupedChunks.length,
    queryTermCoverage: ranking.queryTermCoverage,
    maxRawBm25: ranking.maxRawBm25,
    hasRelevantResults: ranking.hasRelevantResults,
    topSentences: previewTop(10)
  };
  return { ...result, debug: debugInfo, relevanceMetrics };
}
|
|
2013
|
+
/**
 * Fallback extraction for when the query produced no tokens: return the
 * earliest sentences of the document as excerpts.
 *
 * Only sentences with `position < 0.4` are considered, capped at
 * `cfg.anchors.maxAnchors` (default 5). Selection stops at the first excerpt
 * that would exceed `cfg.excerpts.charBudget` (default 2000), or once
 * `cfg.excerpts.maxExcerpts` (default 3) excerpts have been taken.
 *
 * @param {Array<object>} sentences - Sentences with `text`, `position`, `headingPath`.
 * @param {string} query - Original query, echoed in the result.
 * @param {object} cfg - Pipeline configuration.
 * @returns {object} `{ excerpts, totalChars, query }`.
 */
function extractWithoutQuery(sentences, query, cfg) {
  const anchorLimit = cfg.anchors?.maxAnchors ?? 5;
  const candidates = sentences
    .filter((sentence) => sentence.position < 0.4)
    .slice(0, anchorLimit)
    .map((sentence) => ({
      text: sentence.text,
      headingPath: sentence.headingPath,
      // Higher score for earlier content
      score: 1 - sentence.position,
      charCount: sentence.text.length
    }));

  const budget = cfg.excerpts?.charBudget ?? 2e3;
  const maxExcerpts = cfg.excerpts?.maxExcerpts ?? 3;
  const selected = [];
  let totalChars = 0;
  for (const candidate of candidates) {
    const overBudget = totalChars + candidate.charCount > budget;
    if (overBudget || selected.length >= maxExcerpts) break;
    selected.push(candidate);
    totalChars += candidate.charCount;
  }

  return { excerpts: selected, totalChars, query };
}
|
|
2045
|
+
/**
 * Merge user overrides over the defaults, one level deep per section.
 *
 * @param {object} defaults - Complete default configuration.
 * @param {object} overrides - Partial configuration; missing sections are allowed.
 * @returns {object} New object with `ranker`, `anchors`, `expand`, `dedupe`,
 *   `excerpts` (each a shallow merge) and `debug`.
 */
function mergeConfig(defaults, overrides) {
  const merged = {};
  for (const section of ["ranker", "anchors", "expand", "dedupe", "excerpts"]) {
    merged[section] = { ...defaults[section], ...overrides[section] };
  }
  merged.debug = overrides.debug ?? defaults.debug ?? false;
  return merged;
}
|
|
2055
|
+
|
|
2056
|
+
// src/mcp/types.ts
|
|
2057
|
+
// Default settings for the search orchestrator.
var DEFAULT_CONFIG7 = {
  searxngUrl: "http://localhost:8888",
  maxResults: 5,
  timeout: 5e3,
  // Increased for docs pages
  perPageCharBudget: 3e3,
  // Increased to accommodate larger excerpts
  totalCharBudget: 12e3
};
|
|
2066
|
+
|
|
2067
|
+
// src/mcp/searxng.ts
|
|
2068
|
+
/**
 * Query a SearXNG instance and return its top results in normalized form.
 *
 * @param {string} query - Search query, sent as the `q` parameter.
 * @param {object} options
 * @param {string} options.baseUrl - Base URL of the SearXNG instance.
 * @param {number} options.maxResults - Maximum number of results to return.
 * @param {number} [options.timeout=10000] - Abort the request after this many milliseconds.
 * @returns {Promise<Array<{url: string, title: string, content: string, score: number, engine: string}>>}
 * @throws {Error} On a non-2xx response, on timeout, when the payload has no
 *   `results` array, or on any network/parsing failure.
 */
async function searchSearxng(query, options) {
  const { baseUrl, maxResults, timeout = 1e4 } = options;
  const searchUrl = new URL("/search", baseUrl);
  searchUrl.searchParams.set("q", query);
  searchUrl.searchParams.set("format", "json");

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(searchUrl.toString(), {
      signal: controller.signal,
      headers: {
        "Accept": "application/json"
      }
    });
    if (!response.ok) {
      throw new Error(`SearXNG returned ${response.status}: ${response.statusText}`);
    }
    const data = await response.json();
    // Report a malformed payload explicitly rather than failing on `.slice`.
    if (!Array.isArray(data?.results)) {
      throw new Error("SearXNG response did not include a results array");
    }
    return data.results.slice(0, maxResults).map((r) => ({
      url: r.url,
      title: r.title ?? "",
      content: r.content ?? "",
      score: r.score ?? 1,
      engine: r.engine ?? "unknown"
    }));
  } catch (error) {
    if (error instanceof Error && error.name === "AbortError") {
      throw new Error(`SearXNG request timed out after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Always release the timer so it cannot keep the process alive.
    clearTimeout(timeoutId);
  }
}
|
|
2102
|
+
|
|
2103
|
+
// src/mcp/scraper.ts
|
|
2104
|
+
// User-Agent header sent by scrapeUrl when the caller does not supply one.
var DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; PeekyBot/1.0; +https://github.com/peeky-search)";
|
|
2105
|
+
// Concurrency limit used by scrapeUrls when options.maxConcurrent is not set.
var DEFAULT_MAX_CONCURRENT = 3;
|
|
2106
|
+
/**
 * Run `fn` over every item with at most `maxConcurrent` calls in flight.
 *
 * The worker count is fixed before any worker starts. A worker removes its
 * first item synchronously as soon as it is created, so deriving the loop
 * bound from the live queue length would start fewer workers than requested.
 *
 * @param {Iterable} items - Items to process; `undefined` entries are skipped.
 * @param {(item: any) => Promise<any>} fn - Async handler invoked once per item.
 * @param {number} maxConcurrent - Upper bound on simultaneous handler calls.
 * @returns {Promise<void>} Resolves when every item is handled; rejects if `fn` rejects.
 */
async function runWithConcurrency(items, fn, maxConcurrent) {
  const queue = [...items];
  const workerCount = Math.min(maxConcurrent, queue.length);

  // Each worker keeps pulling from the shared queue until it is empty.
  const drainQueue = async () => {
    while (queue.length > 0) {
      const item = queue.shift();
      if (item !== undefined) {
        await fn(item);
      }
    }
  };

  const workers = [];
  for (let i = 0; i < workerCount; i++) {
    workers.push(drainQueue());
  }
  await Promise.all(workers);
}
|
|
2121
|
+
/**
 * Fetch one URL and return its HTML. Never throws: every failure is
 * reported through the `error` field instead.
 *
 * @param {string} url - Page to fetch.
 * @param {object} options
 * @param {number} [options.timeout=5000] - Abort the request after this many milliseconds.
 * @param {string} [options.userAgent] - User-Agent header; defaults to DEFAULT_USER_AGENT.
 * @returns {Promise<{url: string, html: string|null, error?: string}>}
 *   `html` is null on HTTP errors, non-HTML content types, timeouts and
 *   network failures.
 */
async function scrapeUrl(url, options) {
  const { timeout = 5e3, userAgent = DEFAULT_USER_AGENT } = options;
  const failure = (error) => ({ url, html: null, error });

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": userAgent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5"
      },
      redirect: "follow"
    });
    if (!response.ok) {
      return failure(`HTTP ${response.status}: ${response.statusText}`);
    }
    const contentType = response.headers.get("content-type") ?? "";
    const looksLikeHtml = contentType.includes("text/html") || contentType.includes("application/xhtml");
    if (!looksLikeHtml) {
      return failure(`Non-HTML content type: ${contentType}`);
    }
    const html = await response.text();
    return { url, html };
  } catch (error) {
    if (!(error instanceof Error)) {
      return failure("Unknown error");
    }
    // An abort here comes from the timeout timer above.
    return failure(error.name === "AbortError" ? `Timeout after ${timeout}ms` : error.message);
  } finally {
    clearTimeout(timeoutId);
  }
}
|
|
2176
|
+
/**
 * Scrape several URLs with bounded concurrency.
 *
 * Results are returned in the same order as `urls`, regardless of which
 * request finishes first, so callers get a deterministic result list.
 *
 * @param {Iterable<string>} urls - Pages to fetch; `undefined` entries are ignored.
 * @param {object} [options] - Forwarded to scrapeUrl; `maxConcurrent` caps parallel requests.
 * @returns {Promise<Array<{url: string, html: string|null, error?: string}>>}
 */
async function scrapeUrls(urls, options = {}) {
  const { maxConcurrent = DEFAULT_MAX_CONCURRENT } = options;
  // Pair each URL with its output slot so completion order cannot reorder results.
  const jobs = Array.from(urls)
    .filter((url) => url !== undefined)
    .map((url, index) => ({ url, index }));
  const results = new Array(jobs.length);
  await runWithConcurrency(
    jobs,
    async ({ url, index }) => {
      results[index] = await scrapeUrl(url, options);
    },
    maxConcurrent
  );
  return results;
}
|
|
2189
|
+
|
|
2190
|
+
// src/mcp/query-parser.ts
|
|
2191
|
+
/**
 * Split a raw query into the string sent to the search engine and the
 * string used for excerpt ranking.
 *
 * The search query is passed through untouched. The extraction query has
 * search-engine syntax removed: `site:` and `filetype:` operators (also when
 * negated with a leading `-`), whole excluded terms (`-term`, `-multi-part`,
 * `-"quoted phrase"`, including at the very start of the query), and the
 * quotes around phrases.
 *
 * @param {string} query - Raw user query.
 * @returns {{searchQuery: string, extractionQuery: string}}
 */
function parseSearchOperators(query) {
  const extractionQuery = query
    // Operators go first so that "-site:x" does not leave ":x" or "-" behind.
    .replace(/-?\bsite:[\w.-]+/gi, "")
    .replace(/-?\bfiletype:\w+/gi, "")
    // Exclusions: drop the whole excluded term, not only its first word run.
    .replace(/(^|\s)-(?:"[^"]*"|\w\S*)/g, "$1")
    .replace(/"([^"]+)"/g, "$1")
    .replace(/\s+/g, " ")
    .trim();
  return {
    searchQuery: query,
    extractionQuery
  };
}
|
|
2203
|
+
|
|
2204
|
+
// src/mcp/orchestrator.ts
|
|
2205
|
+
// Logger instance obtained from logger_default; used below for timing and debug output.
var logger = logger_default.getInstance();
|
|
2206
|
+
// Domains treated as JS-rendered and therefore skipped rather than scraped.
var JS_RENDERED_DOMAINS = /* @__PURE__ */ new Set([
  "medium.com",
  "npmjs.com"
]);

/**
 * Tell whether a URL belongs to a blocked domain or one of its subdomains.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} True for a blocked host; false otherwise, including
 *   when `url` cannot be parsed.
 */
function isBlockedDomain(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname.replace(/^www\./, "");
  } catch {
    return false;
  }
  return [...JS_RENDERED_DOMAINS].some(
    (blocked) => hostname === blocked || hostname.endsWith(`.${blocked}`)
  );
}
|
|
2223
|
+
// Path segments that usually distinguish versions or locales of the same page.
var VERSION_PATH_PATTERNS = [
  // /v1/, /v2/, /v1.2.3/
  /\/v\d+(\.\d+)*\//gi,
  // /4.3/, /3.0.0/
  /\/\d+\.\d+(\.\d+)*\//gi,
  /\/(stable|latest|current|master|main|dev|nightly)\//gi,
  // Language prefixes (often redundant)
  /\/(en|en-us|en-gb)\//gi
];

// Leading host labels that usually distinguish versions or locales of the same site.
var VERSION_SUBDOMAIN_PATTERNS = [
  // v1., v2., etc.
  /^v\d+\./i,
  // 4.3., 3.0., etc.
  /^\d+\.\d+\./,
  /^(stable|latest|current|docs)\./i,
  // Two-letter language codes: en., he., fr., etc.
  /^[a-z]{2}\./i,
  // Regional codes: en-us., pt-br., etc.
  /^[a-z]{2}-[a-z]{2}\./i
];

/**
 * Reduce a URL to a key that is equal for version/locale variants of the
 * same page. The key is `hostname + path`; scheme, port, query string and
 * fragment are not part of it.
 *
 * A leading host label is stripped only when at least two labels remain,
 * so a short registrable domain such as `go.dev` is never reduced to its
 * TLD, and IPv4 hosts are left untouched.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized key, or `url` unchanged when it cannot be parsed.
 */
function normalizeUrlForDedup(url) {
  try {
    const urlObj = new URL(url);
    let hostname = urlObj.hostname;
    let path = urlObj.pathname;

    const isIpv4 = /^\d{1,3}(\.\d{1,3}){3}$/.test(hostname);
    if (!isIpv4) {
      for (const pattern of VERSION_SUBDOMAIN_PATTERNS) {
        const stripped = hostname.replace(pattern, "");
        // Refuse a strip that would leave a bare TLD (e.g. "go.dev" -> "dev").
        if (stripped.includes(".")) {
          hostname = stripped;
        }
      }
    }

    for (const pattern of VERSION_PATH_PATTERNS) {
      path = path.replace(pattern, "/");
    }
    path = path.replace(/\/+/g, "/");
    return `${hostname}${path}`;
  } catch {
    return url;
  }
}
|
|
2260
|
+
/**
 * Keep only the first search result for each normalized URL.
 *
 * @param {Array<{url: string}>} results - Search results in ranked order.
 * @returns {Array<object>} New array holding the first result per
 *   normalizeUrlForDedup key, in original order.
 */
function deduplicateUrls(results) {
  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const result of results) {
    const key = normalizeUrlForDedup(result.url);
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    unique.push(result);
  }
  return unique;
}
|
|
2270
|
+
// Weights of the four signals combined by computePageRelevance.
var PAGE_RELEVANCE_WEIGHTS = {
  titleMatch: 0.35,
  urlMatch: 0.15,
  excerptScore: 0.35,
  searxngScore: 0.15
};
|
|
2276
|
+
// extractExcerpts overrides applied to every page processed by processPage.
var SEARCH_PIPELINE_CONFIG = {
  ranker: {
    // Looser relevance for multi-page search
    relevanceMode: "search"
  },
  anchors: {
    // More anchors for comprehensive docs coverage
    maxAnchors: 5
  },
  expand: {
    // Larger chunks for better context
    maxChunkChars: 2e3,
    // More trailing context for code explanations
    contextAfter: 12,
    // More leading context
    contextBefore: 8
  }
};
|
|
2294
|
+
/**
 * Pull the text of the first `<title>` element out of raw HTML.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} Trimmed title text, or "" when no non-empty title is found.
 */
function extractTitle(html) {
  const captured = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1];
  return captured ? captured.trim() : "";
}
|
|
2301
|
+
/**
 * Fraction of the tokens in `tokensA` that also occur in `tokensB`.
 * Repeated tokens in `tokensA` are counted once per occurrence.
 *
 * @param {Array<string>} tokensA - Tokens to look up (the denominator).
 * @param {Array<string>} tokensB - Tokens to match against.
 * @returns {number} Value in [0, 1]; 0 when either list is empty.
 */
function tokenOverlap(tokensA, tokensB) {
  if (tokensA.length === 0 || tokensB.length === 0) return 0;
  const lookup = new Set(tokensB);
  const hits = tokensA.reduce((count, token) => (lookup.has(token) ? count + 1 : count), 0);
  return hits / tokensA.length;
}
|
|
2310
|
+
/**
 * Tokenize the path of a URL, treating hyphens inside a segment as word
 * separators.
 *
 * @param {string} url - Absolute URL.
 * @returns {Array<string>} Tokens from all path segments in order, or []
 *   when the URL cannot be parsed or tokenizing throws.
 */
function getUrlPathTokens(url) {
  try {
    return new URL(url).pathname
      .split("/")
      .filter((segment) => segment.length > 0)
      .flatMap((segment) => tokenize(segment.replace(/-/g, " ")));
  } catch {
    return [];
  }
}
|
|
2323
|
+
/**
 * Score a page as the weighted sum of four signals: query/title overlap,
 * query/URL-path overlap, the page's best excerpt score, and the SearXNG
 * score (divided by 10 and capped at 1).
 *
 * @param {object} page - Page with `title`, `url` and `excerpts` (each with a `score`).
 * @param {Array<string>} queryTokens - Tokenized query.
 * @param {number} searxngScore - Score reported by SearXNG for this result.
 * @returns {number} Combined relevance score, weighted by PAGE_RELEVANCE_WEIGHTS.
 */
function computePageRelevance(page, queryTokens, searxngScore) {
  const titleMatch = tokenOverlap(queryTokens, tokenize(page.title));
  const urlMatch = tokenOverlap(queryTokens, getUrlPathTokens(page.url));

  let bestExcerptScore = 0;
  if (page.excerpts.length > 0) {
    bestExcerptScore = Math.max(...page.excerpts.map((excerpt) => excerpt.score));
  }
  const normalizedSearxng = Math.min(searxngScore / 10, 1);

  const weights = PAGE_RELEVANCE_WEIGHTS;
  return weights.titleMatch * titleMatch + weights.urlMatch * urlMatch + weights.excerptScore * bestExcerptScore + weights.searxngScore * normalizedSearxng;
}
|
|
2332
|
+
/**
 * Run excerpt extraction on one scraped page and package the outcome.
 *
 * The title falls back from the search result's title, to the HTML
 * `<title>`, to the URL. Empty strings count as missing, because both the
 * search-result mapping and extractTitle report "no title" as "".
 *
 * @param {string} url - Page URL.
 * @param {string} html - Raw HTML of the page.
 * @param {string} query - Query used for excerpt ranking.
 * @param {number} charBudget - Character budget for this page's excerpts.
 * @param {object} [searchResult] - Matching search result, if any (its `title` is read).
 * @returns {{extraction: object, relevanceMetrics: (object|undefined)}}
 *   On an extraction failure, `extraction.error` holds the message and
 *   `relevanceMetrics` is undefined.
 */
function processPage(url, html, query, charBudget, searchResult) {
  try {
    const result = extractExcerpts(html, query, {
      ...SEARCH_PIPELINE_CONFIG,
      excerpts: {
        charBudget,
        // More excerpts for comprehensive docs
        maxExcerpts: 5
      }
    });
    // `||` rather than `??`: an empty title must fall through to the next source.
    const title = searchResult?.title || extractTitle(html) || url;
    return {
      extraction: {
        url,
        title,
        excerpts: result.excerpts.map((e) => ({
          text: e.text,
          headingPath: e.headingPath,
          score: e.score
        })),
        totalChars: result.totalChars
      },
      relevanceMetrics: result.relevanceMetrics
    };
  } catch (error) {
    return {
      extraction: {
        url,
        title: searchResult?.title || url,
        excerpts: [],
        totalChars: 0,
        error: error instanceof Error ? error.message : "Unknown extraction error"
      },
      relevanceMetrics: void 0
    };
  }
}
|
|
2369
|
+
/**
 * Compact display form of a URL: host without a leading "www." followed by
 * the path, with paths longer than 40 characters cut to 37 plus "...".
 *
 * @param {string} url - URL to shorten.
 * @returns {string} Shortened form, or `url` unchanged when it cannot be parsed.
 */
function getShortUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  const domain = parsed.hostname.replace(/^www\./, "");
  const { pathname } = parsed;
  const path = pathname.length > 40 ? pathname.slice(0, 37) + "..." : pathname;
  return domain + path;
}
|
|
2382
|
+
/**
 * Render one page diagnostic as a single status line.
 *
 * @param {object} diag - Diagnostic with `url`, `status`, and, depending on
 *   the status, `excerptCount`, `charCount`, `error` or `metrics`.
 * @returns {string} Line of the form "<marker> [<short url>] <details>".
 */
function formatPageStatus(diag) {
  const shortUrl = getShortUrl(diag.url);
  const renderers = new Map([
    ["success", () => `+ [${shortUrl}] ${diag.excerptCount} excerpt(s), ${diag.charCount} chars`],
    ["scrape_failed", () => `- [${shortUrl}] FAILED: ${diag.error ?? "Could not fetch page"}`],
    ["no_content", () => `- [${shortUrl}] SKIPPED: No extractable content`],
    ["not_relevant", () => {
      const coverage = diag.metrics?.queryTermCoverage ?? 0;
      const bm25 = diag.metrics?.maxBm25 ?? 0;
      const coveragePct = (coverage * 100).toFixed(0);
      return `- [${shortUrl}] SKIPPED: Not relevant (${coveragePct}% terms, score: ${bm25.toFixed(2)})`;
    }],
    ["budget_exceeded", () => `~ [${shortUrl}] TRUNCATED: Output limit reached`],
    ["blocked_js", () => `x [${shortUrl}] SKIPPED: JS-rendered site`]
  ]);
  const render = renderers.get(diag.status);
  return render ? render() : `? [${shortUrl}] Unknown status`;
}
|
|
2405
|
+
/**
 * Turn per-page diagnostics into human-readable hints for improving a search.
 *
 * When no page succeeded, only the "no results" guidance is returned (plus a
 * note about skipped JS-rendered sites, if any). Otherwise hints are added
 * for a low success ratio, irrelevant pages, scrape failures, empty pages,
 * budget truncation and skipped JS-rendered sites.
 *
 * @param {Array<object>} diagnostics - Page diagnostics (`status`, optional `metrics`).
 * @param {Array<string>} queryTokens - Tokenized query, echoed in the guidance.
 * @returns {Array<string>} Suggestion lines, possibly empty.
 */
function generateSuggestions(diagnostics, queryTokens) {
  const withStatus = (status) => diagnostics.filter((d) => d.status === status);
  const successCount = withStatus("success").length;
  const notRelevantPages = withStatus("not_relevant");
  const noContentCount = withStatus("no_content").length;
  const scrapeFailedCount = withStatus("scrape_failed").length;
  const budgetExceededCount = withStatus("budget_exceeded").length;
  const blockedJsCount = withStatus("blocked_js").length;

  // The wording is deliberately generic: the blocked-domain list is
  // maintained elsewhere, so naming specific sites here goes stale.
  const blockedJsNote = `${blockedJsCount} result(s) were skipped because they come from JavaScript-rendered sites, which are not supported.`;

  const suggestions = [];
  if (successCount === 0) {
    suggestions.push("NO RESULTS EXTRACTED. Your query may be too vague or not match the content.");
    suggestions.push("Try a more specific query with exact library names, function names, or error messages.");
    suggestions.push(`Current query tokens: [${queryTokens.join(", ")}] - ensure these terms appear in documentation you're looking for.`);
    if (blockedJsCount > 0) {
      suggestions.push(blockedJsNote);
    }
    return suggestions;
  }

  if (successCount < diagnostics.length / 2) {
    suggestions.push(`Only ${successCount}/${diagnostics.length} pages had relevant content.`);
  }
  if (notRelevantPages.length >= 2) {
    const avgCoverage = notRelevantPages.reduce((sum, p) => sum + (p.metrics?.queryTermCoverage ?? 0), 0) / notRelevantPages.length;
    if (avgCoverage < 0.3) {
      suggestions.push("Query terms not found on many pages. Try adding the exact library/framework name (e.g., 'React', 'Next.js', 'Express').");
    } else {
      suggestions.push("Pages contained some query terms but lacked focused content. Try adding method names or error codes.");
    }
  }
  if (scrapeFailedCount >= 2) {
    suggestions.push("Multiple pages failed to load. This is normal for paywalled or bot-protected sites.");
  }
  if (noContentCount >= 2) {
    suggestions.push("Multiple pages had no extractable content. These may require login or have unusual page structures.");
  }
  if (budgetExceededCount > 0) {
    suggestions.push(`${budgetExceededCount} additional page(s) had content but were omitted due to output size limits.`);
  }
  if (blockedJsCount > 0) {
    suggestions.push(blockedJsNote);
  }
  return suggestions;
}
|
|
2447
|
+
/**
 * Render a search result as a Markdown report.
 *
 * The report lists every page that produced excerpts (title, source URL,
 * heading path and text per excerpt). A diagnostics section covering query
 * analysis, per-page status and suggestions is appended when
 * `includeDiagnostics` is set, and always when no page produced an excerpt.
 *
 * @param {object} result - Search result with `query`, `successfulPages`,
 *   `totalPages`, `pages`, `queryTokens` and `diagnostics`.
 * @param {boolean} includeDiagnostics - Force the diagnostics section.
 * @returns {string} Report lines joined with newlines.
 */
function formatResults(result, includeDiagnostics) {
  const lines = [
    `# Search Results for: "${result.query}"\n`,
    `Found ${result.successfulPages} of ${result.totalPages} pages with relevant content.\n`
  ];

  const pagesWithExcerpts = result.pages.filter((page) => page.excerpts.length !== 0);
  for (const page of pagesWithExcerpts) {
    lines.push(`\n## ${page.title}`, `Source: ${page.url}\n`);
    for (const excerpt of page.excerpts) {
      if (excerpt.headingPath.length > 0) {
        lines.push(`### ${excerpt.headingPath.join(" > ")}\n`);
      }
      lines.push(excerpt.text, "");
    }
  }

  const noResults = pagesWithExcerpts.length === 0;
  if (noResults) {
    lines.push("\nNo relevant content was extracted from any search result.\n");
  }

  if (includeDiagnostics || noResults) {
    lines.push(
      "\n---\n",
      "## Search Diagnostics\n",
      "**Query Analysis:**",
      `- Original query: "${result.query}"`,
      `- Tokenized to: [${result.queryTokens.join(", ")}]`,
      `- ${result.queryTokens.length} search term(s) used for matching\n`,
      "**Page-by-Page Results:**"
    );
    for (const diag of result.diagnostics) {
      lines.push(`- ${formatPageStatus(diag)}`);
    }
    const suggestions = generateSuggestions(result.diagnostics, result.queryTokens);
    if (suggestions.length > 0) {
      lines.push("\n**Suggestions:**", ...suggestions);
    }
  }

  return lines.join("\n");
}
|
|
2494
|
+
/**
 * Search via SearXNG, scrape the top results, extract query-relevant
 * excerpts, and return them as a formatted report.
 *
 * Steps, in order:
 *  1. `parseSearchOperators` splits the query into `searchQuery` (sent to
 *     SearXNG) and `extractionQuery` (used for extraction and tokenizing).
 *  2. Results are de-duplicated, sorted by score (descending, URL as the
 *     tie-breaker), and split into blocked and scrapable URLs.
 *  3. Up to `maxResults` scrapable URLs are scraped and run through
 *     `processPage`; a diagnostic entry is recorded for every page.
 *  4. Pages with excerpts are ranked by `computePageRelevance` and added
 *     until `totalCharBudget` is reached. The first page that does not fit
 *     may be partially included; every page after it is omitted and its
 *     diagnostic entry is rewritten as `"budget_exceeded"`.
 *
 * @param {string} query - Raw user query, possibly containing operators.
 * @param {object} [config] - Optional overrides. Nullish fields fall back to
 *   `DEFAULT_CONFIG7` (or `false` for the two flags).
 * @param {boolean} [config.debug] - Passed to every `logger.debug` call.
 * @param {boolean} [config.diagnostics] - Always append the diagnostics
 *   section to the formatted output.
 * @param {string} [config.searxngUrl]
 * @param {number} [config.maxResults] - Maximum number of pages to scrape.
 * @param {number} [config.timeout] - Used for both the SearXNG request and
 *   the page scrapes.
 * @param {number} [config.perPageCharBudget]
 * @param {number} [config.totalCharBudget]
 * @returns {Promise<string>} The output of `formatResults`, or a one-line
 *   message when the SearXNG call throws or returns no results.
 */
async function search(query, config = {}) {
  const debug = config.debug ?? false;
  const includeDiagnostics = config.diagnostics ?? false;
  const { searchQuery, extractionQuery } = parseSearchOperators(query);
  const cfg = {
    searxngUrl: config.searxngUrl ?? DEFAULT_CONFIG7.searxngUrl,
    maxResults: config.maxResults ?? DEFAULT_CONFIG7.maxResults,
    timeout: config.timeout ?? DEFAULT_CONFIG7.timeout,
    perPageCharBudget: config.perPageCharBudget ?? DEFAULT_CONFIG7.perPageCharBudget,
    totalCharBudget: config.totalCharBudget ?? DEFAULT_CONFIG7.totalCharBudget
  };

  // ---- 1. Query SearXNG -------------------------------------------------
  // Twice as many results are requested as will be scraped; presumably this
  // leaves enough candidates after de-duplication and blocked-domain removal.
  const requestMultiplier = 2;
  let searchResults;
  try {
    searchResults = await logger.timeAsync("MCP: SearXNG search", async () => searchSearxng(searchQuery, {
      baseUrl: cfg.searxngUrl,
      maxResults: cfg.maxResults * requestMultiplier,
      timeout: cfg.timeout
    }));
  } catch (error) {
    // NOTE(review): this early return and the one below skip
    // logger.printTimings(); confirm the logger does not carry the recorded
    // timing over into the next call.
    return `Error searching SearXNG: ${error instanceof Error ? error.message : "Unknown error"}`;
  }
  if (searchResults.length === 0) {
    return `No search results found for: "${query}"`;
  }

  // ---- 2. De-duplicate, order, and drop blocked domains -----------------
  const deduplicatedResults = deduplicateUrls(searchResults);
  logger.debug(`Deduplicated ${searchResults.length} URLs to ${deduplicatedResults.length}`, debug);
  deduplicatedResults.sort((a, b) => {
    const scoreDiff = b.score - a.score;
    if (scoreDiff !== 0) return scoreDiff;
    return a.url.localeCompare(b.url);
  });
  const blockedResults = [];
  const scrapableResults = [];
  for (const candidate of deduplicatedResults) {
    if (isBlockedDomain(candidate.url)) {
      blockedResults.push(candidate);
    } else {
      scrapableResults.push(candidate);
    }
  }
  const resultsToScrape = scrapableResults.slice(0, cfg.maxResults);
  logger.debug(`SearXNG returned ${searchResults.length} URLs, ${blockedResults.length} blocked, ${resultsToScrape.length} to scrape:`, debug);
  resultsToScrape.forEach((r, i) => {
    logger.debug(` ${i + 1}. [score=${r.score.toFixed(2)}] ${r.url}`, debug);
  });
  if (blockedResults.length > 0) {
    logger.debug(`Blocked domains:`, debug);
    for (const r of blockedResults) {
      logger.debug(` - ${r.url}`, debug);
    }
  }

  // ---- 3. Scrape and extract --------------------------------------------
  const urls = resultsToScrape.map((r) => r.url);
  const scrapeResults = await logger.timeAsync("MCP: Scrape pages", async () => scrapeUrls(urls, { timeout: cfg.timeout }));
  const searchResultMap = /* @__PURE__ */ new Map();
  for (const sr of resultsToScrape) {
    searchResultMap.set(sr.url, sr);
  }

  // Only the first few blocked URLs are listed in the diagnostics.
  const maxBlockedDiagnostics = 3;
  const diagnostics = blockedResults.slice(0, maxBlockedDiagnostics).map((blocked) => ({
    url: blocked.url,
    title: blocked.title,
    status: "blocked_js"
  }));

  const pageExtractions = [];
  const extractionStart = performance.now();
  for (const scrape of scrapeResults) {
    const searchResultInfo = searchResultMap.get(scrape.url);
    const title = searchResultInfo?.title ?? scrape.url;

    if (scrape.html === null) {
      // Failed scrape: record an empty extraction so the page still counts,
      // carrying the error over only when the scraper supplied one.
      const errorField = scrape.error !== void 0 ? { error: scrape.error } : {};
      diagnostics.push({ url: scrape.url, title, status: "scrape_failed", ...errorField });
      pageExtractions.push({ url: scrape.url, title, excerpts: [], totalChars: 0, ...errorField });
      continue;
    }

    const { extraction, relevanceMetrics: metrics } = processPage(
      scrape.url,
      scrape.html,
      extractionQuery,
      cfg.perPageCharBudget,
      searchResultInfo
    );
    pageExtractions.push(extraction);

    if (extraction.excerpts.length > 0) {
      diagnostics.push({
        url: scrape.url,
        title: extraction.title,
        status: "success",
        excerptCount: extraction.excerpts.length,
        charCount: extraction.totalChars
      });
      continue;
    }

    // No excerpts: "not_relevant" only when the metrics explicitly say so;
    // every other case (including missing metrics) is "no_content".
    const diagEntry = {
      url: scrape.url,
      title: extraction.title,
      status: metrics?.hasRelevantResults === false ? "not_relevant" : "no_content"
    };
    if (metrics) {
      diagEntry.metrics = {
        sentenceCount: metrics.sentenceCount,
        queryTermCoverage: metrics.queryTermCoverage,
        maxBm25: metrics.maxBm25,
        maxCooccurrence: metrics.maxCooccurrence
      };
    }
    diagnostics.push(diagEntry);
  }
  logger.recordTiming(`MCP: Extract excerpts (${scrapeResults.length} pages)`, performance.now() - extractionStart);

  // ---- 4. Rank pages and enforce the total character budget -------------
  const rankingStart = performance.now();
  const queryTokens = tokenize(extractionQuery);
  const pagesWithRelevance = pageExtractions
    .filter((p) => p.excerpts.length > 0)
    .map((page) => {
      const searxngScore = searchResultMap.get(page.url)?.score ?? 0;
      return { page, relevance: computePageRelevance(page, queryTokens, searxngScore) };
    });
  pagesWithRelevance.sort((a, b) => {
    const d = b.relevance - a.relevance;
    if (d !== 0) return d;
    return a.page.url.localeCompare(b.page.url);
  });

  // A page that does not fit is partially included only when more than this
  // many characters of budget remain.
  const minPartialBudget = 200;
  let totalChars = 0;
  const budgetedPages = [];
  const includedUrls = /* @__PURE__ */ new Set();
  for (const { page } of pagesWithRelevance) {
    if (totalChars + page.totalChars > cfg.totalCharBudget) {
      const remainingBudget = cfg.totalCharBudget - totalChars;
      if (remainingBudget > minPartialBudget) {
        // Keep, in order, every excerpt that still fits; larger ones are
        // skipped, but later smaller ones may still be taken.
        let pageChars = 0;
        const trimmedExcerpts = [];
        for (const excerpt of page.excerpts) {
          const excerptChars = excerpt.text.length;
          if (pageChars + excerptChars <= remainingBudget) {
            trimmedExcerpts.push(excerpt);
            pageChars += excerptChars;
          }
        }
        if (trimmedExcerpts.length > 0) {
          budgetedPages.push({
            ...page,
            excerpts: trimmedExcerpts,
            totalChars: pageChars
          });
          includedUrls.add(page.url);
          totalChars += pageChars;
        }
      }
      // The first page that overflows ends the selection, even if a later,
      // smaller page would still have fit.
      break;
    }
    budgetedPages.push(page);
    includedUrls.add(page.url);
    totalChars += page.totalChars;
  }

  // Rewrite the diagnostics of omitted pages. The URL -> index map is built
  // once (first entry wins, matching a left-to-right search) so the array is
  // not rescanned for every page.
  const diagIndexByUrl = /* @__PURE__ */ new Map();
  diagnostics.forEach((d, index) => {
    if (!diagIndexByUrl.has(d.url)) {
      diagIndexByUrl.set(d.url, index);
    }
  });
  for (const { page } of pagesWithRelevance) {
    if (includedUrls.has(page.url)) continue;
    const diagIndex = diagIndexByUrl.get(page.url);
    if (diagIndex !== void 0) {
      diagnostics[diagIndex] = {
        url: page.url,
        title: page.title,
        status: "budget_exceeded",
        excerptCount: page.excerpts.length,
        charCount: page.totalChars
      };
    }
  }
  logger.recordTiming("MCP: Rank and budget pages", performance.now() - rankingStart);

  // ---- 5. Format ---------------------------------------------------------
  const result = {
    query,
    pages: budgetedPages,
    // Pages we actually attempted to scrape
    totalPages: resultsToScrape.length,
    successfulPages: budgetedPages.length,
    totalChars,
    diagnostics,
    queryTokens
  };
  const formatted = logger.time("MCP: Format results", () => formatResults(result, includeDiagnostics));
  logger.printTimings();
  return formatted;
}
|
|
2709
|
+
|
|
2710
|
+
export {
|
|
2711
|
+
tokenize,
|
|
2712
|
+
formatExcerpts,
|
|
2713
|
+
logger_default,
|
|
2714
|
+
extractExcerpts,
|
|
2715
|
+
search
|
|
2716
|
+
};
|