peeky-search 1.0.10 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/dist/{chunk-S3WZDJCP.js → chunk-5CZPI5V7.js} +236 -28
- package/dist/index.js +1 -1
- package/dist/mcp/server.js +16 -4
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -125,6 +125,7 @@ Once configured, your MCP client will have access to `peeky_web_search`:
|
|
|
125
125
|
| `query` | string | Search query. Supports `site:`, `"quotes"`, `-exclude` |
|
|
126
126
|
| `maxResults` | number | Pages to fetch (default 5, max 10) |
|
|
127
127
|
| `diagnostics` | boolean | Include filtering details (default false) |
|
|
128
|
+
| `sessionKey` | string | Key for cross-call URL deduplication (optional) |
|
|
128
129
|
|
|
129
130
|
**Output:**
|
|
130
131
|
```json
|
|
@@ -171,10 +172,11 @@ HTML → Strip boilerplate → Extract blocks → Segment sentences
|
|
|
171
172
|
|
|
172
173
|
1. **Preprocess**: Strip scripts, styles, nav, ads, and boilerplate
|
|
173
174
|
2. **Segment**: Extract blocks (headings, paragraphs, lists, code) into sentences
|
|
174
|
-
3. **
|
|
175
|
-
4. **
|
|
176
|
-
5. **
|
|
177
|
-
6. **
|
|
175
|
+
3. **Quality gate**: Reject low-quality pages (too few sentences, mostly fragments)
|
|
176
|
+
4. **Score**: BM25 for term relevance + 9 structural heuristics
|
|
177
|
+
5. **Select**: Pick top sentences with position/content diversity
|
|
178
|
+
6. **Expand**: Build context around anchors, respecting section boundaries
|
|
179
|
+
7. **Assemble**: Fit excerpts within character budget
|
|
178
180
|
|
|
179
181
|
## Performance
|
|
180
182
|
|
|
@@ -822,8 +822,8 @@ function segmentHtml($, container, options = {}) {
|
|
|
822
822
|
}
|
|
823
823
|
|
|
824
824
|
// src/scoring/bm25.ts
|
|
825
|
-
var DEFAULT_K1 = 1.
|
|
826
|
-
var DEFAULT_B = 0.
|
|
825
|
+
var DEFAULT_K1 = 1.5;
|
|
826
|
+
var DEFAULT_B = 0.75;
|
|
827
827
|
function computeDocumentStats(sentences) {
|
|
828
828
|
const docFrequency = {};
|
|
829
829
|
let totalLength = 0;
|
|
@@ -1228,15 +1228,15 @@ function computeDensityStats(sentences, queryTokens) {
|
|
|
1228
1228
|
}
|
|
1229
1229
|
densities.sort((a, b) => a - b);
|
|
1230
1230
|
const mid = Math.floor(densities.length / 2);
|
|
1231
|
-
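const median = densities.length % 2 === 0 ? ((densities[mid - 1] ?? 0) + (densities[mid] ?? 0)) / 2 : densities[mid] ?? 0;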
|
|
1232
|
-
const absoluteDeviations = densities.map((d) => Math.abs(d - median));
|
|
1231
|
+
const median2 = densities.length % 2 === 0 ? ((densities[mid - 1] ?? 0) + (densities[mid] ?? 0)) / 2 : densities[mid] ?? 0;
|
|
1232
|
+
const absoluteDeviations = densities.map((d) => Math.abs(d - median2));
|
|
1233
1233
|
absoluteDeviations.sort((a, b) => a - b);
|
|
1234
1234
|
const madMid = Math.floor(absoluteDeviations.length / 2);
|
|
1235
1235
|
let mad = absoluteDeviations.length % 2 === 0 ? ((absoluteDeviations[madMid - 1] ?? 0) + (absoluteDeviations[madMid] ?? 0)) / 2 : absoluteDeviations[madMid] ?? 0;
|
|
1236
1236
|
if (mad < 1e-3) {
|
|
1237
1237
|
mad = 1e-3;
|
|
1238
1238
|
}
|
|
1239
|
-
return { median, mad };
|
|
1239
|
+
return { median: median2, mad };
|
|
1240
1240
|
}
|
|
1241
1241
|
function isMetaHeading(heading) {
|
|
1242
1242
|
const trimmed = heading.trim();
|
|
@@ -1886,6 +1886,75 @@ function fullDedupe(chunks, config = {}) {
|
|
|
1886
1886
|
return removeSubsetChunks(merged);
|
|
1887
1887
|
}
|
|
1888
1888
|
|
|
1889
|
+
// src/scoring/quality.ts
|
|
1890
|
+
var DEFAULT_CONFIG6 = {
|
|
1891
|
+
minLongSentences: 3,
|
|
1892
|
+
maxFragmentRatio: 0.65,
|
|
1893
|
+
minMedianLength: 25,
|
|
1894
|
+
minTotalSentences: 5
|
|
1895
|
+
};
|
|
1896
|
+
function median(values) {
|
|
1897
|
+
if (values.length === 0) return 0;
|
|
1898
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
1899
|
+
const mid = Math.floor(sorted.length / 2);
|
|
1900
|
+
if (sorted.length % 2 === 0) {
|
|
1901
|
+
const left = sorted[mid - 1];
|
|
1902
|
+
const right = sorted[mid];
|
|
1903
|
+
if (left !== void 0 && right !== void 0) {
|
|
1904
|
+
return (left + right) / 2;
|
|
1905
|
+
}
|
|
1906
|
+
return 0;
|
|
1907
|
+
}
|
|
1908
|
+
return sorted[mid] ?? 0;
|
|
1909
|
+
}
|
|
1910
|
+
function assessDocumentQuality(sentences, config = {}) {
|
|
1911
|
+
const {
|
|
1912
|
+
minLongSentences,
|
|
1913
|
+
maxFragmentRatio,
|
|
1914
|
+
minMedianLength,
|
|
1915
|
+
minTotalSentences
|
|
1916
|
+
} = { ...DEFAULT_CONFIG6, ...config };
|
|
1917
|
+
const totalSentences = sentences.length;
|
|
1918
|
+
if (totalSentences === 0) {
|
|
1919
|
+
return {
|
|
1920
|
+
totalSentences: 0,
|
|
1921
|
+
longSentenceCount: 0,
|
|
1922
|
+
medianSentenceLength: 0,
|
|
1923
|
+
fragmentRatio: 1,
|
|
1924
|
+
passesThreshold: false,
|
|
1925
|
+
rejectReason: "No sentences found"
|
|
1926
|
+
};
|
|
1927
|
+
}
|
|
1928
|
+
const lengths = sentences.map((s) => s.text.length);
|
|
1929
|
+
const longSentenceCount = lengths.filter((len) => len > 50).length;
|
|
1930
|
+
const fragmentCount = lengths.filter((len) => len < 30).length;
|
|
1931
|
+
const fragmentRatio = fragmentCount / totalSentences;
|
|
1932
|
+
const medianSentenceLength = median(lengths);
|
|
1933
|
+
let passesThreshold = true;
|
|
1934
|
+
let rejectReason;
|
|
1935
|
+
if (totalSentences < minTotalSentences) {
|
|
1936
|
+
passesThreshold = false;
|
|
1937
|
+
rejectReason = `Too few sentences (${totalSentences} < ${minTotalSentences})`;
|
|
1938
|
+
} else if (longSentenceCount < minLongSentences) {
|
|
1939
|
+
passesThreshold = false;
|
|
1940
|
+
rejectReason = `Too few long sentences (${longSentenceCount} < ${minLongSentences})`;
|
|
1941
|
+
} else if (fragmentRatio > maxFragmentRatio) {
|
|
1942
|
+
passesThreshold = false;
|
|
1943
|
+
rejectReason = `Too many fragments (${(fragmentRatio * 100).toFixed(0)}% > ${maxFragmentRatio * 100}%)`;
|
|
1944
|
+
} else if (medianSentenceLength < minMedianLength) {
|
|
1945
|
+
passesThreshold = false;
|
|
1946
|
+
rejectReason = `Median sentence too short (${medianSentenceLength.toFixed(0)} < ${minMedianLength})`;
|
|
1947
|
+
}
|
|
1948
|
+
return {
|
|
1949
|
+
totalSentences,
|
|
1950
|
+
longSentenceCount,
|
|
1951
|
+
medianSentenceLength,
|
|
1952
|
+
fragmentRatio,
|
|
1953
|
+
passesThreshold,
|
|
1954
|
+
...rejectReason && { rejectReason }
|
|
1955
|
+
};
|
|
1956
|
+
}
|
|
1957
|
+
|
|
1889
1958
|
// src/pipeline.ts
|
|
1890
1959
|
function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
|
|
1891
1960
|
const result = {
|
|
@@ -1908,7 +1977,7 @@ function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
|
|
|
1908
1977
|
}
|
|
1909
1978
|
return result;
|
|
1910
1979
|
}
|
|
1911
|
-
var DEFAULT_CONFIG6 = {
|
|
1980
|
+
var DEFAULT_CONFIG7 = {
|
|
1912
1981
|
ranker: {
|
|
1913
1982
|
bm25Weight: 0.6,
|
|
1914
1983
|
heuristicWeight: 0.4
|
|
@@ -1937,10 +2006,17 @@ var DEFAULT_CONFIG6 = {
|
|
|
1937
2006
|
charBudget: 6e3,
|
|
1938
2007
|
minExcerptChars: 100
|
|
1939
2008
|
},
|
|
2009
|
+
quality: {
|
|
2010
|
+
minLongSentences: 3,
|
|
2011
|
+
maxFragmentRatio: 0.65,
|
|
2012
|
+
minMedianLength: 25,
|
|
2013
|
+
minTotalSentences: 5
|
|
2014
|
+
},
|
|
2015
|
+
skipQualityCheck: false,
|
|
1940
2016
|
debug: false
|
|
1941
2017
|
};
|
|
1942
2018
|
function extractExcerpts(html, query, config = {}) {
|
|
1943
|
-
const cfg = mergeConfig(DEFAULT_CONFIG6, config);
|
|
2019
|
+
const cfg = mergeConfig(DEFAULT_CONFIG7, config);
|
|
1944
2020
|
const logger2 = logger_default.getInstance();
|
|
1945
2021
|
const { $, mainContent } = logger2.time("1. Preprocess HTML", () => preprocessHtml(html));
|
|
1946
2022
|
if (mainContent === null) {
|
|
@@ -1950,6 +2026,28 @@ function extractExcerpts(html, query, config = {}) {
|
|
|
1950
2026
|
if (sentences.length === 0) {
|
|
1951
2027
|
return createEmptyResult(query, cfg.debug);
|
|
1952
2028
|
}
|
|
2029
|
+
if (!cfg.skipQualityCheck) {
|
|
2030
|
+
const quality = logger2.time("2b. Quality check", () => assessDocumentQuality(sentences, cfg.quality));
|
|
2031
|
+
if (!quality.passesThreshold) {
|
|
2032
|
+
return createEmptyResult(
|
|
2033
|
+
query,
|
|
2034
|
+
cfg.debug,
|
|
2035
|
+
{
|
|
2036
|
+
sentenceCount: sentences.length,
|
|
2037
|
+
hasRelevantResults: false,
|
|
2038
|
+
topSentences: []
|
|
2039
|
+
},
|
|
2040
|
+
{
|
|
2041
|
+
hasRelevantResults: false,
|
|
2042
|
+
sentenceCount: sentences.length,
|
|
2043
|
+
queryTermCoverage: 0,
|
|
2044
|
+
maxBm25: 0,
|
|
2045
|
+
maxCooccurrence: 0,
|
|
2046
|
+
...quality.rejectReason && { qualityRejectReason: quality.rejectReason }
|
|
2047
|
+
}
|
|
2048
|
+
);
|
|
2049
|
+
}
|
|
2050
|
+
}
|
|
1953
2051
|
const queryTokens = logger2.time("3. Tokenize query", () => tokenize(query));
|
|
1954
2052
|
if (queryTokens.length === 0) {
|
|
1955
2053
|
return extractWithoutQuery(sentences, query, cfg);
|
|
@@ -2049,12 +2147,14 @@ function mergeConfig(defaults, overrides) {
|
|
|
2049
2147
|
expand: { ...defaults.expand, ...overrides.expand },
|
|
2050
2148
|
dedupe: { ...defaults.dedupe, ...overrides.dedupe },
|
|
2051
2149
|
excerpts: { ...defaults.excerpts, ...overrides.excerpts },
|
|
2150
|
+
quality: { ...defaults.quality, ...overrides.quality },
|
|
2151
|
+
skipQualityCheck: overrides.skipQualityCheck ?? defaults.skipQualityCheck ?? false,
|
|
2052
2152
|
debug: overrides.debug ?? defaults.debug ?? false
|
|
2053
2153
|
};
|
|
2054
2154
|
}
|
|
2055
2155
|
|
|
2056
2156
|
// src/mcp/types.ts
|
|
2057
|
-
var
|
|
2157
|
+
var DEFAULT_CONFIG8 = {
|
|
2058
2158
|
searxngUrl: "http://localhost:8888",
|
|
2059
2159
|
maxResults: 5,
|
|
2060
2160
|
timeout: 5e3,
|
|
@@ -2203,18 +2303,78 @@ function parseSearchOperators(query) {
|
|
|
2203
2303
|
|
|
2204
2304
|
// src/mcp/orchestrator.ts
|
|
2205
2305
|
var logger = logger_default.getInstance();
|
|
2206
|
-
var
|
|
2306
|
+
var sessionCache = /* @__PURE__ */ new Map();
|
|
2307
|
+
var SESSION_TTL_MS = 10 * 1e3;
|
|
2308
|
+
function cleanupExpiredSessions() {
|
|
2309
|
+
const now = Date.now();
|
|
2310
|
+
for (const [key, session] of sessionCache) {
|
|
2311
|
+
if (now - session.lastUsed > SESSION_TTL_MS) {
|
|
2312
|
+
sessionCache.delete(key);
|
|
2313
|
+
}
|
|
2314
|
+
}
|
|
2315
|
+
}
|
|
2316
|
+
function getSession(sessionKey) {
|
|
2317
|
+
cleanupExpiredSessions();
|
|
2318
|
+
let session = sessionCache.get(sessionKey);
|
|
2319
|
+
if (!session) {
|
|
2320
|
+
session = { urls: /* @__PURE__ */ new Set(), lastUsed: Date.now() };
|
|
2321
|
+
sessionCache.set(sessionKey, session);
|
|
2322
|
+
} else {
|
|
2323
|
+
session.lastUsed = Date.now();
|
|
2324
|
+
}
|
|
2325
|
+
return session;
|
|
2326
|
+
}
|
|
2327
|
+
function addUrlsToSession(sessionKey, urls) {
|
|
2328
|
+
const session = getSession(sessionKey);
|
|
2329
|
+
for (const url of urls) {
|
|
2330
|
+
session.urls.add(url);
|
|
2331
|
+
}
|
|
2332
|
+
}
|
|
2333
|
+
function filterSessionUrls(sessionKey, urls) {
|
|
2334
|
+
if (!sessionKey) {
|
|
2335
|
+
return { newUrls: urls, skippedUrls: [] };
|
|
2336
|
+
}
|
|
2337
|
+
const session = getSession(sessionKey);
|
|
2338
|
+
const newUrls = [];
|
|
2339
|
+
const skippedUrls = [];
|
|
2340
|
+
for (const url of urls) {
|
|
2341
|
+
if (session.urls.has(url)) {
|
|
2342
|
+
skippedUrls.push(url);
|
|
2343
|
+
} else {
|
|
2344
|
+
newUrls.push(url);
|
|
2345
|
+
}
|
|
2346
|
+
}
|
|
2347
|
+
return { newUrls, skippedUrls };
|
|
2348
|
+
}
|
|
2349
|
+
var BLOCKED_DOMAINS = /* @__PURE__ */ new Set([
|
|
2207
2350
|
"medium.com",
|
|
2208
|
-
"npmjs.com"
|
|
2351
|
+
"npmjs.com",
|
|
2352
|
+
"researchgate.net",
|
|
2353
|
+
"grokipedia.org"
|
|
2209
2354
|
]);
|
|
2355
|
+
function isGitHubRepoMainPage(url) {
|
|
2356
|
+
try {
|
|
2357
|
+
const urlObj = new URL(url);
|
|
2358
|
+
const hostname = urlObj.hostname.replace(/^www\./, "");
|
|
2359
|
+
if (hostname !== "github.com") return false;
|
|
2360
|
+
const path = urlObj.pathname.replace(/\/$/, "");
|
|
2361
|
+
const segments = path.split("/").filter((s) => s.length > 0);
|
|
2362
|
+
return segments.length === 2;
|
|
2363
|
+
} catch {
|
|
2364
|
+
return false;
|
|
2365
|
+
}
|
|
2366
|
+
}
|
|
2210
2367
|
function isBlockedDomain(url) {
|
|
2211
2368
|
try {
|
|
2212
2369
|
const hostname = new URL(url).hostname.replace(/^www\./, "");
|
|
2213
|
-
for (const blocked of
|
|
2370
|
+
for (const blocked of BLOCKED_DOMAINS) {
|
|
2214
2371
|
if (hostname === blocked || hostname.endsWith(`.${blocked}`)) {
|
|
2215
2372
|
return true;
|
|
2216
2373
|
}
|
|
2217
2374
|
}
|
|
2375
|
+
if (isGitHubRepoMainPage(url)) {
|
|
2376
|
+
return true;
|
|
2377
|
+
}
|
|
2218
2378
|
return false;
|
|
2219
2379
|
} catch {
|
|
2220
2380
|
return false;
|
|
@@ -2398,6 +2558,10 @@ function formatPageStatus(diag) {
|
|
|
2398
2558
|
return `~ [${shortUrl}] TRUNCATED: Output limit reached`;
|
|
2399
2559
|
case "blocked_js":
|
|
2400
2560
|
return `x [${shortUrl}] SKIPPED: JS-rendered site`;
|
|
2561
|
+
case "low_quality":
|
|
2562
|
+
return `x [${shortUrl}] SKIPPED: ${diag.error ?? "Low quality content"}`;
|
|
2563
|
+
case "session_cached":
|
|
2564
|
+
return `= [${shortUrl}] SKIPPED: Already fetched in this session`;
|
|
2401
2565
|
default:
|
|
2402
2566
|
return `? [${shortUrl}] Unknown status`;
|
|
2403
2567
|
}
|
|
@@ -2410,7 +2574,10 @@ function generateSuggestions(diagnostics, queryTokens) {
|
|
|
2410
2574
|
const scrapeFailedCount = diagnostics.filter((d) => d.status === "scrape_failed").length;
|
|
2411
2575
|
const budgetExceededCount = diagnostics.filter((d) => d.status === "budget_exceeded").length;
|
|
2412
2576
|
const blockedJsCount = diagnostics.filter((d) => d.status === "blocked_js").length;
|
|
2413
|
-
|
|
2577
|
+
const lowQualityCount = diagnostics.filter((d) => d.status === "low_quality").length;
|
|
2578
|
+
const sessionCachedCount = diagnostics.filter((d) => d.status === "session_cached").length;
|
|
2579
|
+
const relevantDiagCount = diagnostics.length - sessionCachedCount;
|
|
2580
|
+
if (successCount === 0 && relevantDiagCount > 0) {
|
|
2414
2581
|
suggestions.push("NO RESULTS EXTRACTED. Your query may be too vague or not match the content.");
|
|
2415
2582
|
suggestions.push("Try a more specific query with exact library names, function names, or error messages.");
|
|
2416
2583
|
suggestions.push(`Current query tokens: [${queryTokens.join(", ")}] - ensure these terms appear in documentation you're looking for.`);
|
|
@@ -2419,8 +2586,8 @@ function generateSuggestions(diagnostics, queryTokens) {
|
|
|
2419
2586
|
}
|
|
2420
2587
|
return suggestions;
|
|
2421
2588
|
}
|
|
2422
|
-
if (successCount <
|
|
2423
|
-
suggestions.push(`Only ${successCount}/${
|
|
2589
|
+
if (relevantDiagCount > 0 && successCount < relevantDiagCount / 2) {
|
|
2590
|
+
suggestions.push(`Only ${successCount}/${relevantDiagCount} pages had relevant content.`);
|
|
2424
2591
|
}
|
|
2425
2592
|
if (notRelevantPages.length >= 2) {
|
|
2426
2593
|
const avgCoverage = notRelevantPages.reduce((sum, p) => sum + (p.metrics?.queryTermCoverage ?? 0), 0) / notRelevantPages.length;
|
|
@@ -2442,14 +2609,23 @@ function generateSuggestions(diagnostics, queryTokens) {
|
|
|
2442
2609
|
if (blockedJsCount > 0) {
|
|
2443
2610
|
suggestions.push(`${blockedJsCount} result(s) from Stack Overflow/GitHub were skipped (JavaScript-rendered).`);
|
|
2444
2611
|
}
|
|
2612
|
+
if (lowQualityCount > 0) {
|
|
2613
|
+
suggestions.push(`${lowQualityCount} page(s) had low quality content (mostly metadata or fragments).`);
|
|
2614
|
+
}
|
|
2615
|
+
if (sessionCachedCount > 0) {
|
|
2616
|
+
suggestions.push(`${sessionCachedCount} page(s) were skipped (already fetched earlier in this session).`);
|
|
2617
|
+
}
|
|
2445
2618
|
return suggestions;
|
|
2446
2619
|
}
|
|
2447
2620
|
function formatResults(result, includeDiagnostics) {
|
|
2448
2621
|
const lines = [];
|
|
2449
2622
|
lines.push(`# Search Results for: "${result.query}"
|
|
2450
2623
|
`);
|
|
2451
|
-
|
|
2452
|
-
|
|
2624
|
+
let statusLine = `Found ${result.successfulPages} of ${result.totalPages} pages with relevant content.`;
|
|
2625
|
+
if (result.sessionSkippedCount && result.sessionSkippedCount > 0) {
|
|
2626
|
+
statusLine += ` (${result.sessionSkippedCount} already fetched in this session)`;
|
|
2627
|
+
}
|
|
2628
|
+
lines.push(statusLine + "\n");
|
|
2453
2629
|
for (const page of result.pages) {
|
|
2454
2630
|
if (page.excerpts.length === 0) continue;
|
|
2455
2631
|
lines.push(`
|
|
@@ -2465,11 +2641,13 @@ function formatResults(result, includeDiagnostics) {
|
|
|
2465
2641
|
lines.push("");
|
|
2466
2642
|
}
|
|
2467
2643
|
}
|
|
2468
|
-
const
|
|
2469
|
-
|
|
2644
|
+
const hasExcerpts = result.pages.some((p) => p.excerpts.length > 0);
|
|
2645
|
+
const hasSessionCached = (result.sessionSkippedCount ?? 0) > 0;
|
|
2646
|
+
const showFailureInfo = !hasExcerpts && !hasSessionCached;
|
|
2647
|
+
if (showFailureInfo) {
|
|
2470
2648
|
lines.push("\nNo relevant content was extracted from any search result.\n");
|
|
2471
2649
|
}
|
|
2472
|
-
if (includeDiagnostics
|
|
2650
|
+
if (includeDiagnostics) {
|
|
2473
2651
|
lines.push("\n---\n");
|
|
2474
2652
|
lines.push("## Search Diagnostics\n");
|
|
2475
2653
|
lines.push("**Query Analysis:**");
|
|
@@ -2496,11 +2674,12 @@ async function search(query, config = {}) {
|
|
|
2496
2674
|
const includeDiagnostics = config.diagnostics ?? false;
|
|
2497
2675
|
const { searchQuery, extractionQuery } = parseSearchOperators(query);
|
|
2498
2676
|
const cfg = {
|
|
2499
|
-
searxngUrl: config.searxngUrl ??
|
|
2500
|
-
maxResults: config.maxResults ??
|
|
2501
|
-
timeout: config.timeout ??
|
|
2502
|
-
perPageCharBudget: config.perPageCharBudget ??
|
|
2503
|
-
totalCharBudget: config.totalCharBudget ??
|
|
2677
|
+
searxngUrl: config.searxngUrl ?? DEFAULT_CONFIG8.searxngUrl,
|
|
2678
|
+
maxResults: config.maxResults ?? DEFAULT_CONFIG8.maxResults,
|
|
2679
|
+
timeout: config.timeout ?? DEFAULT_CONFIG8.timeout,
|
|
2680
|
+
perPageCharBudget: config.perPageCharBudget ?? DEFAULT_CONFIG8.perPageCharBudget,
|
|
2681
|
+
totalCharBudget: config.totalCharBudget ?? DEFAULT_CONFIG8.totalCharBudget,
|
|
2682
|
+
sessionKey: config.sessionKey
|
|
2504
2683
|
};
|
|
2505
2684
|
const requestMultiplier = 2;
|
|
2506
2685
|
let searchResults;
|
|
@@ -2532,8 +2711,19 @@ async function search(query, config = {}) {
|
|
|
2532
2711
|
scrapableResults.push(result2);
|
|
2533
2712
|
}
|
|
2534
2713
|
}
|
|
2535
|
-
|
|
2536
|
-
|
|
2714
|
+
let resultsToScrape = scrapableResults.slice(0, cfg.maxResults);
|
|
2715
|
+
let sessionSkippedResults = [];
|
|
2716
|
+
if (cfg.sessionKey) {
|
|
2717
|
+
const urlsToCheck = resultsToScrape.map((r) => r.url);
|
|
2718
|
+
const { newUrls, skippedUrls } = filterSessionUrls(cfg.sessionKey, urlsToCheck);
|
|
2719
|
+
if (skippedUrls.length > 0) {
|
|
2720
|
+
const skippedSet = new Set(skippedUrls);
|
|
2721
|
+
sessionSkippedResults = resultsToScrape.filter((r) => skippedSet.has(r.url));
|
|
2722
|
+
resultsToScrape = resultsToScrape.filter((r) => !skippedSet.has(r.url));
|
|
2723
|
+
logger.debug(`Session '${cfg.sessionKey}': skipped ${skippedUrls.length} already-fetched URLs`, debug);
|
|
2724
|
+
}
|
|
2725
|
+
}
|
|
2726
|
+
logger.debug(`SearXNG returned ${searchResults.length} URLs, ${blockedResults.length} blocked, ${sessionSkippedResults.length} session-cached, ${resultsToScrape.length} to scrape:`, debug);
|
|
2537
2727
|
for (let i = 0; i < resultsToScrape.length; i++) {
|
|
2538
2728
|
const r = resultsToScrape[i];
|
|
2539
2729
|
if (r) {
|
|
@@ -2561,6 +2751,13 @@ async function search(query, config = {}) {
|
|
|
2561
2751
|
status: "blocked_js"
|
|
2562
2752
|
});
|
|
2563
2753
|
}
|
|
2754
|
+
for (const skipped of sessionSkippedResults) {
|
|
2755
|
+
diagnostics.push({
|
|
2756
|
+
url: skipped.url,
|
|
2757
|
+
title: skipped.title,
|
|
2758
|
+
status: "session_cached"
|
|
2759
|
+
});
|
|
2760
|
+
}
|
|
2564
2761
|
const pageExtractions = [];
|
|
2565
2762
|
const extractionStart = performance.now();
|
|
2566
2763
|
for (const scrape of scrapeResults) {
|
|
@@ -2599,7 +2796,9 @@ async function search(query, config = {}) {
|
|
|
2599
2796
|
if (result2.extraction.excerpts.length === 0) {
|
|
2600
2797
|
const metrics = result2.relevanceMetrics;
|
|
2601
2798
|
let status;
|
|
2602
|
-
if (metrics
|
|
2799
|
+
if (metrics?.qualityRejectReason) {
|
|
2800
|
+
status = "low_quality";
|
|
2801
|
+
} else if (metrics && metrics.hasRelevantResults === false) {
|
|
2603
2802
|
status = "not_relevant";
|
|
2604
2803
|
} else if (metrics && metrics.sentenceCount === 0) {
|
|
2605
2804
|
status = "no_content";
|
|
@@ -2611,6 +2810,9 @@ async function search(query, config = {}) {
|
|
|
2611
2810
|
title: result2.extraction.title,
|
|
2612
2811
|
status
|
|
2613
2812
|
};
|
|
2813
|
+
if (metrics?.qualityRejectReason) {
|
|
2814
|
+
diagEntry.error = metrics.qualityRejectReason;
|
|
2815
|
+
}
|
|
2614
2816
|
if (metrics) {
|
|
2615
2817
|
diagEntry.metrics = {
|
|
2616
2818
|
sentenceCount: metrics.sentenceCount,
|
|
@@ -2692,6 +2894,11 @@ async function search(query, config = {}) {
|
|
|
2692
2894
|
}
|
|
2693
2895
|
}
|
|
2694
2896
|
logger.recordTiming("MCP: Rank and budget pages", performance.now() - rankingStart);
|
|
2897
|
+
if (cfg.sessionKey) {
|
|
2898
|
+
const scrapedUrls = resultsToScrape.map((r) => r.url);
|
|
2899
|
+
addUrlsToSession(cfg.sessionKey, scrapedUrls);
|
|
2900
|
+
logger.debug(`Session '${cfg.sessionKey}': cached ${scrapedUrls.length} URLs for future deduplication`, debug);
|
|
2901
|
+
}
|
|
2695
2902
|
const result = {
|
|
2696
2903
|
query,
|
|
2697
2904
|
pages: budgetedPages,
|
|
@@ -2700,7 +2907,8 @@ async function search(query, config = {}) {
|
|
|
2700
2907
|
successfulPages: budgetedPages.length,
|
|
2701
2908
|
totalChars,
|
|
2702
2909
|
diagnostics,
|
|
2703
|
-
queryTokens
|
|
2910
|
+
queryTokens,
|
|
2911
|
+
...sessionSkippedResults.length > 0 && { sessionSkippedCount: sessionSkippedResults.length }
|
|
2704
2912
|
};
|
|
2705
2913
|
const formatted = logger.time("MCP: Format results", () => formatResults(result, includeDiagnostics));
|
|
2706
2914
|
logger.printTimings();
|
package/dist/index.js
CHANGED
package/dist/mcp/server.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
logger_default,
|
|
3
3
|
search
|
|
4
|
-
} from "../chunk-S3WZDJCP.js";
|
|
4
|
+
} from "../chunk-5CZPI5V7.js";
|
|
5
5
|
|
|
6
6
|
// src/mcp/server.ts
|
|
7
7
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
@@ -34,18 +34,30 @@ SEARCH OPERATORS:
|
|
|
34
34
|
- "exact phrase" - exact phrase match
|
|
35
35
|
- -term - exclude term
|
|
36
36
|
|
|
37
|
+
SESSION DEDUPLICATION:
|
|
38
|
+
When making multiple related searches (e.g., researching a topic), provide a consistent sessionKey (e.g., "react-hooks-research"). This prevents re-fetching pages you've already seen, saving time and avoiding duplicate content in your results.
|
|
39
|
+
|
|
40
|
+
IMPORTANT - Diversify your searches:
|
|
41
|
+
Similar queries return similar results from search engines. If you search "react useState hook" and then "react useState tutorial", you'll likely get overlapping pages. To maximize information:
|
|
42
|
+
- Vary your query terms significantly between searches (don't just rephrase)
|
|
43
|
+
- Search different aspects of the topic (e.g., "useState" vs "useReducer" vs "state management patterns")
|
|
44
|
+
- Use specific terms for specific questions rather than broad terms repeatedly
|
|
45
|
+
- The sessionKey deduplication helps, but can't prevent overlap if the search engine returns the same URLs for similar queries
|
|
46
|
+
|
|
37
47
|
RETURNS: Extracted text excerpts with source URLs (not raw HTML).`,
|
|
38
48
|
{
|
|
39
49
|
query: z.string().describe(
|
|
40
50
|
'Search query with technical terms. Supports operators: site:, "quotes", -exclude.'
|
|
41
51
|
),
|
|
42
52
|
maxResults: z.number().optional().describe("Maximum pages to scrape (default: 5, max: 10)"),
|
|
43
|
-
diagnostics: z.boolean().optional().describe("Include detailed diagnostics about why pages were filtered or failed (default: false).
|
|
53
|
+
diagnostics: z.boolean().optional().describe("Include detailed diagnostics about why pages were filtered or failed (default: false)."),
|
|
54
|
+
sessionKey: z.string().optional().describe("Session key for cross-call URL deduplication. When provided, URLs already fetched in previous calls with the same key will be skipped. Use a consistent key (e.g., 'react-research') across related searches to avoid re-fetching the same pages.")
|
|
44
55
|
},
|
|
45
|
-
async ({ query, maxResults, diagnostics }) => {
|
|
56
|
+
async ({ query, maxResults, diagnostics, sessionKey }) => {
|
|
46
57
|
const result = await search(query, {
|
|
47
58
|
...maxResults !== void 0 && { maxResults },
|
|
48
|
-
...diagnostics !== void 0 && { diagnostics }
|
|
59
|
+
...diagnostics !== void 0 && { diagnostics },
|
|
60
|
+
...sessionKey !== void 0 && { sessionKey }
|
|
49
61
|
});
|
|
50
62
|
return {
|
|
51
63
|
content: [
|