peeky-search 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -125,6 +125,7 @@ Once configured, your MCP client will have access to `peeky_web_search`:
125
125
  | `query` | string | Search query. Supports `site:`, `"quotes"`, `-exclude` |
126
126
  | `maxResults` | number | Pages to fetch (default 5, max 10) |
127
127
  | `diagnostics` | boolean | Include filtering details (default false) |
128
+ | `sessionKey` | string | Key for cross-call URL deduplication (optional) |
128
129
 
129
130
  **Output:**
130
131
  ```json
@@ -171,10 +172,11 @@ HTML → Strip boilerplate → Extract blocks → Segment sentences
171
172
 
172
173
  1. **Preprocess**: Strip scripts, styles, nav, ads, and boilerplate
173
174
  2. **Segment**: Extract blocks (headings, paragraphs, lists, code) into sentences
174
- 3. **Score**: BM25 for term relevance + 8 structural heuristics
175
- 4. **Select**: Pick top sentences with position/content diversity
176
- 5. **Expand**: Build context around anchors, respecting section boundaries
177
- 6. **Assemble**: Fit excerpts within character budget
175
+ 3. **Quality gate**: Reject low-quality pages (too few sentences, mostly fragments)
176
+ 4. **Score**: BM25 for term relevance + 9 structural heuristics
177
+ 5. **Select**: Pick top sentences with position/content diversity
178
+ 6. **Expand**: Build context around anchors, respecting section boundaries
179
+ 7. **Assemble**: Fit excerpts within character budget
178
180
 
179
181
  ## Performance
180
182
 
@@ -822,8 +822,8 @@ function segmentHtml($, container, options = {}) {
822
822
  }
823
823
 
824
824
  // src/scoring/bm25.ts
825
- var DEFAULT_K1 = 1.2;
826
- var DEFAULT_B = 0.5;
825
+ var DEFAULT_K1 = 1.5;
826
+ var DEFAULT_B = 0.75;
827
827
  function computeDocumentStats(sentences) {
828
828
  const docFrequency = {};
829
829
  let totalLength = 0;
@@ -1228,15 +1228,15 @@ function computeDensityStats(sentences, queryTokens) {
1228
1228
  }
1229
1229
  densities.sort((a, b) => a - b);
1230
1230
  const mid = Math.floor(densities.length / 2);
1231
- const median = densities.length % 2 === 0 ? ((densities[mid - 1] ?? 0) + (densities[mid] ?? 0)) / 2 : densities[mid] ?? 0;
1232
- const absoluteDeviations = densities.map((d) => Math.abs(d - median));
1231
+ const median2 = densities.length % 2 === 0 ? ((densities[mid - 1] ?? 0) + (densities[mid] ?? 0)) / 2 : densities[mid] ?? 0;
1232
+ const absoluteDeviations = densities.map((d) => Math.abs(d - median2));
1233
1233
  absoluteDeviations.sort((a, b) => a - b);
1234
1234
  const madMid = Math.floor(absoluteDeviations.length / 2);
1235
1235
  let mad = absoluteDeviations.length % 2 === 0 ? ((absoluteDeviations[madMid - 1] ?? 0) + (absoluteDeviations[madMid] ?? 0)) / 2 : absoluteDeviations[madMid] ?? 0;
1236
1236
  if (mad < 1e-3) {
1237
1237
  mad = 1e-3;
1238
1238
  }
1239
- return { median, mad };
1239
+ return { median: median2, mad };
1240
1240
  }
1241
1241
  function isMetaHeading(heading) {
1242
1242
  const trimmed = heading.trim();
@@ -1886,6 +1886,75 @@ function fullDedupe(chunks, config = {}) {
1886
1886
  return removeSubsetChunks(merged);
1887
1887
  }
1888
1888
 
1889
// src/scoring/quality.ts
/**
 * Default thresholds for the document quality gate.
 *
 * - minLongSentences: minimum number of sentences longer than 50 characters.
 * - maxFragmentRatio: maximum share (0..1) of sentences shorter than 30 characters.
 * - minMedianLength: minimum median sentence length, in characters.
 * - minTotalSentences: minimum number of sentences overall.
 */
var DEFAULT_CONFIG6 = {
  minLongSentences: 3,
  maxFragmentRatio: 0.65,
  minMedianLength: 25,
  minTotalSentences: 5
};

/**
 * Median of a list of numbers.
 *
 * @param {number[]} values - Input values; the array is not mutated.
 * @returns {number} The median, or 0 for an empty list.
 */
function median(values) {
  if (values.length === 0) return 0;
  // Sort a copy so the caller's array keeps its order.
  const sorted = [...values].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 0) {
    const left = sorted[mid - 1];
    const right = sorted[mid];
    if (left !== void 0 && right !== void 0) {
      return (left + right) / 2;
    }
    return 0;
  }
  return sorted[mid] ?? 0;
}

/**
 * Decide whether a segmented document has enough real prose to be worth scoring.
 *
 * Checks run in a fixed order and the first failure supplies `rejectReason`:
 * total sentence count, long-sentence count, fragment ratio, median length.
 *
 * @param {{ text: string }[]} sentences - Segmented sentences; only `text.length` is read.
 * @param {object} [config] - Partial overrides for DEFAULT_CONFIG6. Keys that are
 *   missing, `undefined`, or `null` fall back to the defaults.
 * @returns {{
 *   totalSentences: number,
 *   longSentenceCount: number,
 *   medianSentenceLength: number,
 *   fragmentRatio: number,
 *   passesThreshold: boolean,
 *   rejectReason?: string
 * }} Quality metrics; `rejectReason` is present only when the document is rejected.
 */
function assessDocumentQuality(sentences, config = {}) {
  // Resolve each threshold with `??` rather than object spread: a spread lets an
  // explicit `undefined` override the default, and every `<` / `>` comparison
  // against `undefined` is false, which would silently disable that check.
  const overrides = config ?? {};
  const minLongSentences = overrides.minLongSentences ?? DEFAULT_CONFIG6.minLongSentences;
  const maxFragmentRatio = overrides.maxFragmentRatio ?? DEFAULT_CONFIG6.maxFragmentRatio;
  const minMedianLength = overrides.minMedianLength ?? DEFAULT_CONFIG6.minMedianLength;
  const minTotalSentences = overrides.minTotalSentences ?? DEFAULT_CONFIG6.minTotalSentences;

  // Character cut-offs for classifying a sentence as "long" or as a "fragment".
  const longSentenceChars = 50;
  const fragmentChars = 30;

  const totalSentences = sentences.length;
  if (totalSentences === 0) {
    return {
      totalSentences: 0,
      longSentenceCount: 0,
      medianSentenceLength: 0,
      fragmentRatio: 1,
      passesThreshold: false,
      rejectReason: "No sentences found"
    };
  }

  const lengths = sentences.map((s) => s.text.length);
  const longSentenceCount = lengths.filter((len) => len > longSentenceChars).length;
  const fragmentCount = lengths.filter((len) => len < fragmentChars).length;
  const fragmentRatio = fragmentCount / totalSentences;
  const medianSentenceLength = median(lengths);

  let rejectReason;
  if (totalSentences < minTotalSentences) {
    rejectReason = `Too few sentences (${totalSentences} < ${minTotalSentences})`;
  } else if (longSentenceCount < minLongSentences) {
    rejectReason = `Too few long sentences (${longSentenceCount} < ${minLongSentences})`;
  } else if (fragmentRatio > maxFragmentRatio) {
    // Round both percentages; `0.57 * 100` is 56.99999999999999 in floating point.
    rejectReason = `Too many fragments (${(fragmentRatio * 100).toFixed(0)}% > ${(maxFragmentRatio * 100).toFixed(0)}%)`;
  } else if (medianSentenceLength < minMedianLength) {
    rejectReason = `Median sentence too short (${medianSentenceLength.toFixed(0)} < ${minMedianLength})`;
  }

  return {
    totalSentences,
    longSentenceCount,
    medianSentenceLength,
    fragmentRatio,
    passesThreshold: rejectReason === void 0,
    ...(rejectReason !== void 0 && { rejectReason })
  };
}
1957
+
1889
1958
  // src/pipeline.ts
1890
1959
  function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
1891
1960
  const result = {
@@ -1908,7 +1977,7 @@ function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
1908
1977
  }
1909
1978
  return result;
1910
1979
  }
1911
- var DEFAULT_CONFIG6 = {
1980
+ var DEFAULT_CONFIG7 = {
1912
1981
  ranker: {
1913
1982
  bm25Weight: 0.6,
1914
1983
  heuristicWeight: 0.4
@@ -1937,10 +2006,17 @@ var DEFAULT_CONFIG6 = {
1937
2006
  charBudget: 6e3,
1938
2007
  minExcerptChars: 100
1939
2008
  },
2009
+ quality: {
2010
+ minLongSentences: 3,
2011
+ maxFragmentRatio: 0.65,
2012
+ minMedianLength: 25,
2013
+ minTotalSentences: 5
2014
+ },
2015
+ skipQualityCheck: false,
1940
2016
  debug: false
1941
2017
  };
1942
2018
  function extractExcerpts(html, query, config = {}) {
1943
- const cfg = mergeConfig(DEFAULT_CONFIG6, config);
2019
+ const cfg = mergeConfig(DEFAULT_CONFIG7, config);
1944
2020
  const logger2 = logger_default.getInstance();
1945
2021
  const { $, mainContent } = logger2.time("1. Preprocess HTML", () => preprocessHtml(html));
1946
2022
  if (mainContent === null) {
@@ -1950,6 +2026,28 @@ function extractExcerpts(html, query, config = {}) {
1950
2026
  if (sentences.length === 0) {
1951
2027
  return createEmptyResult(query, cfg.debug);
1952
2028
  }
2029
+ if (!cfg.skipQualityCheck) {
2030
+ const quality = logger2.time("2b. Quality check", () => assessDocumentQuality(sentences, cfg.quality));
2031
+ if (!quality.passesThreshold) {
2032
+ return createEmptyResult(
2033
+ query,
2034
+ cfg.debug,
2035
+ {
2036
+ sentenceCount: sentences.length,
2037
+ hasRelevantResults: false,
2038
+ topSentences: []
2039
+ },
2040
+ {
2041
+ hasRelevantResults: false,
2042
+ sentenceCount: sentences.length,
2043
+ queryTermCoverage: 0,
2044
+ maxBm25: 0,
2045
+ maxCooccurrence: 0,
2046
+ ...quality.rejectReason && { qualityRejectReason: quality.rejectReason }
2047
+ }
2048
+ );
2049
+ }
2050
+ }
1953
2051
  const queryTokens = logger2.time("3. Tokenize query", () => tokenize(query));
1954
2052
  if (queryTokens.length === 0) {
1955
2053
  return extractWithoutQuery(sentences, query, cfg);
@@ -2049,12 +2147,14 @@ function mergeConfig(defaults, overrides) {
2049
2147
  expand: { ...defaults.expand, ...overrides.expand },
2050
2148
  dedupe: { ...defaults.dedupe, ...overrides.dedupe },
2051
2149
  excerpts: { ...defaults.excerpts, ...overrides.excerpts },
2150
+ quality: { ...defaults.quality, ...overrides.quality },
2151
+ skipQualityCheck: overrides.skipQualityCheck ?? defaults.skipQualityCheck ?? false,
2052
2152
  debug: overrides.debug ?? defaults.debug ?? false
2053
2153
  };
2054
2154
  }
2055
2155
 
2056
2156
  // src/mcp/types.ts
2057
- var DEFAULT_CONFIG7 = {
2157
+ var DEFAULT_CONFIG8 = {
2058
2158
  searxngUrl: "http://localhost:8888",
2059
2159
  maxResults: 5,
2060
2160
  timeout: 5e3,
@@ -2203,18 +2303,78 @@ function parseSearchOperators(query) {
2203
2303
 
2204
2304
  // src/mcp/orchestrator.ts
2205
2305
  var logger = logger_default.getInstance();
2206
- var JS_RENDERED_DOMAINS = /* @__PURE__ */ new Set([
2306
/**
 * In-memory store for cross-call URL deduplication.
 * Maps a caller-supplied session key to `{ urls: Set<string>, lastUsed: number }`.
 */
var sessionCache = /* @__PURE__ */ new Map();

// Idle time after which a session is dropped.
// NOTE(review): 10 seconds is short for a multi-step research session, where the
// gap between tool calls can easily exceed it — confirm this value is intended.
var SESSION_TTL_MS = 10 * 1e3;

/**
 * Canonical form of a URL for deduplication: the fragment is removed, because
 * `#anchor` variants resolve to the same fetched page.
 *
 * @param {string} url - URL as returned by the search engine.
 * @returns {string} The URL without its fragment, or the input unchanged if it
 *   cannot be parsed.
 */
function normalizeSessionUrl(url) {
  try {
    const parsed = new URL(url);
    parsed.hash = "";
    return parsed.href;
  } catch {
    // Unparseable input is deliberately compared verbatim.
    return url;
  }
}

/** Remove every session whose last use is older than SESSION_TTL_MS. */
function cleanupExpiredSessions() {
  const now = Date.now();
  for (const [key, session] of sessionCache) {
    if (now - session.lastUsed > SESSION_TTL_MS) {
      sessionCache.delete(key);
    }
  }
}

/**
 * Fetch the session for `sessionKey`, creating it if it does not exist, and
 * refresh its last-used timestamp. Expired sessions are purged first.
 *
 * @param {string} sessionKey - Caller-chosen session identifier.
 * @returns {{ urls: Set<string>, lastUsed: number }} The live session record.
 */
function getSession(sessionKey) {
  cleanupExpiredSessions();
  let session = sessionCache.get(sessionKey);
  if (!session) {
    session = { urls: /* @__PURE__ */ new Set(), lastUsed: Date.now() };
    sessionCache.set(sessionKey, session);
  } else {
    session.lastUsed = Date.now();
  }
  return session;
}

/**
 * Record URLs as already fetched in the given session.
 *
 * @param {string} sessionKey - Session identifier; an empty or missing key is a no-op,
 *   matching filterSessionUrls.
 * @param {Iterable<string>} urls - URLs that were scraped in this call.
 */
function addUrlsToSession(sessionKey, urls) {
  if (!sessionKey) {
    return;
  }
  const session = getSession(sessionKey);
  for (const url of urls) {
    session.urls.add(normalizeSessionUrl(url));
  }
}

/**
 * Split candidate URLs into those not yet seen in the session and those already fetched.
 *
 * @param {string | undefined} sessionKey - Session identifier; when empty or missing,
 *   nothing is skipped.
 * @param {string[]} urls - Candidate URLs.
 * @returns {{ newUrls: string[], skippedUrls: string[] }} The original URL strings,
 *   partitioned. They are returned unnormalized so callers can match them against
 *   their own result objects.
 */
function filterSessionUrls(sessionKey, urls) {
  if (!sessionKey) {
    return { newUrls: urls, skippedUrls: [] };
  }
  const session = getSession(sessionKey);
  const newUrls = [];
  const skippedUrls = [];
  for (const url of urls) {
    if (session.urls.has(normalizeSessionUrl(url))) {
      skippedUrls.push(url);
    } else {
      newUrls.push(url);
    }
  }
  return { newUrls, skippedUrls };
}
2349
+ var BLOCKED_DOMAINS = /* @__PURE__ */ new Set([
2207
2350
  "medium.com",
2208
- "npmjs.com"
2351
+ "npmjs.com",
2352
+ "researchgate.net",
2353
+ "grokipedia.org"
2209
2354
  ]);
2355
/**
 * Report whether a URL is on github.com (optionally with a `www.` prefix) and
 * has exactly two non-empty path segments, i.e. the shape `/<owner>/<repo>`.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} True for a two-segment github.com path; false otherwise,
 *   including when the URL cannot be parsed.
 */
function isGitHubRepoMainPage(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  const host = parsed.hostname.replace(/^www\./, "");
  if (host !== "github.com") {
    return false;
  }
  // Count the non-empty path segments; query string and fragment are not part of pathname.
  let segmentCount = 0;
  for (const part of parsed.pathname.replace(/\/$/, "").split("/")) {
    if (part.length > 0) {
      segmentCount += 1;
    }
  }
  return segmentCount === 2;
}
2210
2367
  function isBlockedDomain(url) {
2211
2368
  try {
2212
2369
  const hostname = new URL(url).hostname.replace(/^www\./, "");
2213
- for (const blocked of JS_RENDERED_DOMAINS) {
2370
+ for (const blocked of BLOCKED_DOMAINS) {
2214
2371
  if (hostname === blocked || hostname.endsWith(`.${blocked}`)) {
2215
2372
  return true;
2216
2373
  }
2217
2374
  }
2375
+ if (isGitHubRepoMainPage(url)) {
2376
+ return true;
2377
+ }
2218
2378
  return false;
2219
2379
  } catch {
2220
2380
  return false;
@@ -2398,6 +2558,10 @@ function formatPageStatus(diag) {
2398
2558
  return `~ [${shortUrl}] TRUNCATED: Output limit reached`;
2399
2559
  case "blocked_js":
2400
2560
  return `x [${shortUrl}] SKIPPED: JS-rendered site`;
2561
+ case "low_quality":
2562
+ return `x [${shortUrl}] SKIPPED: ${diag.error ?? "Low quality content"}`;
2563
+ case "session_cached":
2564
+ return `= [${shortUrl}] SKIPPED: Already fetched in this session`;
2401
2565
  default:
2402
2566
  return `? [${shortUrl}] Unknown status`;
2403
2567
  }
@@ -2410,7 +2574,10 @@ function generateSuggestions(diagnostics, queryTokens) {
2410
2574
  const scrapeFailedCount = diagnostics.filter((d) => d.status === "scrape_failed").length;
2411
2575
  const budgetExceededCount = diagnostics.filter((d) => d.status === "budget_exceeded").length;
2412
2576
  const blockedJsCount = diagnostics.filter((d) => d.status === "blocked_js").length;
2413
- if (successCount === 0) {
2577
+ const lowQualityCount = diagnostics.filter((d) => d.status === "low_quality").length;
2578
+ const sessionCachedCount = diagnostics.filter((d) => d.status === "session_cached").length;
2579
+ const relevantDiagCount = diagnostics.length - sessionCachedCount;
2580
+ if (successCount === 0 && relevantDiagCount > 0) {
2414
2581
  suggestions.push("NO RESULTS EXTRACTED. Your query may be too vague or not match the content.");
2415
2582
  suggestions.push("Try a more specific query with exact library names, function names, or error messages.");
2416
2583
  suggestions.push(`Current query tokens: [${queryTokens.join(", ")}] - ensure these terms appear in documentation you're looking for.`);
@@ -2419,8 +2586,8 @@ function generateSuggestions(diagnostics, queryTokens) {
2419
2586
  }
2420
2587
  return suggestions;
2421
2588
  }
2422
- if (successCount < diagnostics.length / 2) {
2423
- suggestions.push(`Only ${successCount}/${diagnostics.length} pages had relevant content.`);
2589
+ if (relevantDiagCount > 0 && successCount < relevantDiagCount / 2) {
2590
+ suggestions.push(`Only ${successCount}/${relevantDiagCount} pages had relevant content.`);
2424
2591
  }
2425
2592
  if (notRelevantPages.length >= 2) {
2426
2593
  const avgCoverage = notRelevantPages.reduce((sum, p) => sum + (p.metrics?.queryTermCoverage ?? 0), 0) / notRelevantPages.length;
@@ -2442,14 +2609,23 @@ function generateSuggestions(diagnostics, queryTokens) {
2442
2609
  if (blockedJsCount > 0) {
2443
2610
  suggestions.push(`${blockedJsCount} result(s) from Stack Overflow/GitHub were skipped (JavaScript-rendered).`);
2444
2611
  }
2612
+ if (lowQualityCount > 0) {
2613
+ suggestions.push(`${lowQualityCount} page(s) had low quality content (mostly metadata or fragments).`);
2614
+ }
2615
+ if (sessionCachedCount > 0) {
2616
+ suggestions.push(`${sessionCachedCount} page(s) were skipped (already fetched earlier in this session).`);
2617
+ }
2445
2618
  return suggestions;
2446
2619
  }
2447
2620
  function formatResults(result, includeDiagnostics) {
2448
2621
  const lines = [];
2449
2622
  lines.push(`# Search Results for: "${result.query}"
2450
2623
  `);
2451
- lines.push(`Found ${result.successfulPages} of ${result.totalPages} pages with relevant content.
2452
- `);
2624
+ let statusLine = `Found ${result.successfulPages} of ${result.totalPages} pages with relevant content.`;
2625
+ if (result.sessionSkippedCount && result.sessionSkippedCount > 0) {
2626
+ statusLine += ` (${result.sessionSkippedCount} already fetched in this session)`;
2627
+ }
2628
+ lines.push(statusLine + "\n");
2453
2629
  for (const page of result.pages) {
2454
2630
  if (page.excerpts.length === 0) continue;
2455
2631
  lines.push(`
@@ -2465,11 +2641,13 @@ function formatResults(result, includeDiagnostics) {
2465
2641
  lines.push("");
2466
2642
  }
2467
2643
  }
2468
- const noResults = result.pages.every((p) => p.excerpts.length === 0);
2469
- if (noResults) {
2644
+ const hasExcerpts = result.pages.some((p) => p.excerpts.length > 0);
2645
+ const hasSessionCached = (result.sessionSkippedCount ?? 0) > 0;
2646
+ const showFailureInfo = !hasExcerpts && !hasSessionCached;
2647
+ if (showFailureInfo) {
2470
2648
  lines.push("\nNo relevant content was extracted from any search result.\n");
2471
2649
  }
2472
- if (includeDiagnostics || noResults) {
2650
+ if (includeDiagnostics) {
2473
2651
  lines.push("\n---\n");
2474
2652
  lines.push("## Search Diagnostics\n");
2475
2653
  lines.push("**Query Analysis:**");
@@ -2496,11 +2674,12 @@ async function search(query, config = {}) {
2496
2674
  const includeDiagnostics = config.diagnostics ?? false;
2497
2675
  const { searchQuery, extractionQuery } = parseSearchOperators(query);
2498
2676
  const cfg = {
2499
- searxngUrl: config.searxngUrl ?? DEFAULT_CONFIG7.searxngUrl,
2500
- maxResults: config.maxResults ?? DEFAULT_CONFIG7.maxResults,
2501
- timeout: config.timeout ?? DEFAULT_CONFIG7.timeout,
2502
- perPageCharBudget: config.perPageCharBudget ?? DEFAULT_CONFIG7.perPageCharBudget,
2503
- totalCharBudget: config.totalCharBudget ?? DEFAULT_CONFIG7.totalCharBudget
2677
+ searxngUrl: config.searxngUrl ?? DEFAULT_CONFIG8.searxngUrl,
2678
+ maxResults: config.maxResults ?? DEFAULT_CONFIG8.maxResults,
2679
+ timeout: config.timeout ?? DEFAULT_CONFIG8.timeout,
2680
+ perPageCharBudget: config.perPageCharBudget ?? DEFAULT_CONFIG8.perPageCharBudget,
2681
+ totalCharBudget: config.totalCharBudget ?? DEFAULT_CONFIG8.totalCharBudget,
2682
+ sessionKey: config.sessionKey
2504
2683
  };
2505
2684
  const requestMultiplier = 2;
2506
2685
  let searchResults;
@@ -2532,8 +2711,19 @@ async function search(query, config = {}) {
2532
2711
  scrapableResults.push(result2);
2533
2712
  }
2534
2713
  }
2535
- const resultsToScrape = scrapableResults.slice(0, cfg.maxResults);
2536
- logger.debug(`SearXNG returned ${searchResults.length} URLs, ${blockedResults.length} blocked, ${resultsToScrape.length} to scrape:`, debug);
2714
+ let resultsToScrape = scrapableResults.slice(0, cfg.maxResults);
2715
+ let sessionSkippedResults = [];
2716
+ if (cfg.sessionKey) {
2717
+ const urlsToCheck = resultsToScrape.map((r) => r.url);
2718
+ const { newUrls, skippedUrls } = filterSessionUrls(cfg.sessionKey, urlsToCheck);
2719
+ if (skippedUrls.length > 0) {
2720
+ const skippedSet = new Set(skippedUrls);
2721
+ sessionSkippedResults = resultsToScrape.filter((r) => skippedSet.has(r.url));
2722
+ resultsToScrape = resultsToScrape.filter((r) => !skippedSet.has(r.url));
2723
+ logger.debug(`Session '${cfg.sessionKey}': skipped ${skippedUrls.length} already-fetched URLs`, debug);
2724
+ }
2725
+ }
2726
+ logger.debug(`SearXNG returned ${searchResults.length} URLs, ${blockedResults.length} blocked, ${sessionSkippedResults.length} session-cached, ${resultsToScrape.length} to scrape:`, debug);
2537
2727
  for (let i = 0; i < resultsToScrape.length; i++) {
2538
2728
  const r = resultsToScrape[i];
2539
2729
  if (r) {
@@ -2561,6 +2751,13 @@ async function search(query, config = {}) {
2561
2751
  status: "blocked_js"
2562
2752
  });
2563
2753
  }
2754
+ for (const skipped of sessionSkippedResults) {
2755
+ diagnostics.push({
2756
+ url: skipped.url,
2757
+ title: skipped.title,
2758
+ status: "session_cached"
2759
+ });
2760
+ }
2564
2761
  const pageExtractions = [];
2565
2762
  const extractionStart = performance.now();
2566
2763
  for (const scrape of scrapeResults) {
@@ -2599,7 +2796,9 @@ async function search(query, config = {}) {
2599
2796
  if (result2.extraction.excerpts.length === 0) {
2600
2797
  const metrics = result2.relevanceMetrics;
2601
2798
  let status;
2602
- if (metrics && metrics.hasRelevantResults === false) {
2799
+ if (metrics?.qualityRejectReason) {
2800
+ status = "low_quality";
2801
+ } else if (metrics && metrics.hasRelevantResults === false) {
2603
2802
  status = "not_relevant";
2604
2803
  } else if (metrics && metrics.sentenceCount === 0) {
2605
2804
  status = "no_content";
@@ -2611,6 +2810,9 @@ async function search(query, config = {}) {
2611
2810
  title: result2.extraction.title,
2612
2811
  status
2613
2812
  };
2813
+ if (metrics?.qualityRejectReason) {
2814
+ diagEntry.error = metrics.qualityRejectReason;
2815
+ }
2614
2816
  if (metrics) {
2615
2817
  diagEntry.metrics = {
2616
2818
  sentenceCount: metrics.sentenceCount,
@@ -2692,6 +2894,11 @@ async function search(query, config = {}) {
2692
2894
  }
2693
2895
  }
2694
2896
  logger.recordTiming("MCP: Rank and budget pages", performance.now() - rankingStart);
2897
+ if (cfg.sessionKey) {
2898
+ const scrapedUrls = resultsToScrape.map((r) => r.url);
2899
+ addUrlsToSession(cfg.sessionKey, scrapedUrls);
2900
+ logger.debug(`Session '${cfg.sessionKey}': cached ${scrapedUrls.length} URLs for future deduplication`, debug);
2901
+ }
2695
2902
  const result = {
2696
2903
  query,
2697
2904
  pages: budgetedPages,
@@ -2700,7 +2907,8 @@ async function search(query, config = {}) {
2700
2907
  successfulPages: budgetedPages.length,
2701
2908
  totalChars,
2702
2909
  diagnostics,
2703
- queryTokens
2910
+ queryTokens,
2911
+ ...sessionSkippedResults.length > 0 && { sessionSkippedCount: sessionSkippedResults.length }
2704
2912
  };
2705
2913
  const formatted = logger.time("MCP: Format results", () => formatResults(result, includeDiagnostics));
2706
2914
  logger.printTimings();
package/dist/index.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  logger_default,
5
5
  search,
6
6
  tokenize
7
- } from "./chunk-S3WZDJCP.js";
7
+ } from "./chunk-5CZPI5V7.js";
8
8
 
9
9
  // src/index.ts
10
10
  import * as fs from "fs";
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  logger_default,
3
3
  search
4
- } from "../chunk-S3WZDJCP.js";
4
+ } from "../chunk-5CZPI5V7.js";
5
5
 
6
6
  // src/mcp/server.ts
7
7
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -34,18 +34,30 @@ SEARCH OPERATORS:
34
34
  - "exact phrase" - exact phrase match
35
35
  - -term - exclude term
36
36
 
37
+ SESSION DEDUPLICATION:
38
+ When making multiple related searches (e.g., researching a topic), provide a consistent sessionKey (e.g., "react-hooks-research"). This prevents re-fetching pages you've already seen, saving time and avoiding duplicate content in your results.
39
+
40
+ IMPORTANT - Diversify your searches:
41
+ Similar queries return similar results from search engines. If you search "react useState hook" and then "react useState tutorial", you'll likely get overlapping pages. To maximize information:
42
+ - Vary your query terms significantly between searches (don't just rephrase)
43
+ - Search different aspects of the topic (e.g., "useState" vs "useReducer" vs "state management patterns")
44
+ - Use specific terms for specific questions rather than broad terms repeatedly
45
+ - The sessionKey deduplication helps, but can't prevent overlap if the search engine returns the same URLs for similar queries
46
+
37
47
  RETURNS: Extracted text excerpts with source URLs (not raw HTML).`,
38
48
  {
39
49
  query: z.string().describe(
40
50
  'Search query with technical terms. Supports operators: site:, "quotes", -exclude.'
41
51
  ),
42
52
  maxResults: z.number().optional().describe("Maximum pages to scrape (default: 5, max: 10)"),
43
- diagnostics: z.boolean().optional().describe("Include detailed diagnostics about why pages were filtered or failed (default: false). Diagnostics are always shown when no results are found.")
53
+ diagnostics: z.boolean().optional().describe("Include detailed diagnostics about why pages were filtered or failed (default: false)."),
54
+ sessionKey: z.string().optional().describe("Session key for cross-call URL deduplication. When provided, URLs already fetched in previous calls with the same key will be skipped. Use a consistent key (e.g., 'react-research') across related searches to avoid re-fetching the same pages.")
44
55
  },
45
- async ({ query, maxResults, diagnostics }) => {
56
+ async ({ query, maxResults, diagnostics, sessionKey }) => {
46
57
  const result = await search(query, {
47
58
  ...maxResults !== void 0 && { maxResults },
48
- ...diagnostics !== void 0 && { diagnostics }
59
+ ...diagnostics !== void 0 && { diagnostics },
60
+ ...sessionKey !== void 0 && { sessionKey }
49
61
  });
50
62
  return {
51
63
  content: [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "peeky-search",
3
- "version": "1.0.10",
3
+ "version": "1.0.11",
4
4
  "description": "IR-based HTML content extraction with MCP server for web search",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",