npm - peeky-search - Versions diffs - 1.0.10 → 1.0.11 - Mend

peeky-search 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +6 -4
package/dist/{chunk-S3WZDJCP.js → chunk-5CZPI5V7.js} +236 -28
package/dist/index.js +1 -1
package/dist/mcp/server.js +16 -4
package/package.json +1 -1

package/README.md CHANGED

@@ -125,6 +125,7 @@ Once configured, your MCP client will have access to `peeky_web_search`:
 | `query` | string | Search query. Supports `site:`, `"quotes"`, `-exclude` |
 | `maxResults` | number | Pages to fetch (default 5, max 10) |
 | `diagnostics` | boolean | Include filtering details (default false) |
+| `sessionKey` | string | Key for cross-call URL deduplication (optional) |
 **Output:**
 ```json
@@ -171,10 +172,11 @@ HTML → Strip boilerplate → Extract blocks → Segment sentences
 1. **Preprocess**: Strip scripts, styles, nav, ads, and boilerplate
 2. **Segment**: Extract blocks (headings, paragraphs, lists, code) into sentences
-3. **Score**: BM25 for term relevance + 8 structural heuristics
-4. **Select**: Pick top sentences with position/content diversity
-5. **Expand**: Build context around anchors, respecting section boundaries
-6. **Assemble**: Fit excerpts within character budget
+3. **Quality gate**: Reject low-quality pages (too few sentences, mostly fragments)
+4. **Score**: BM25 for term relevance + 9 structural heuristics
+5. **Select**: Pick top sentences with position/content diversity
+6. **Expand**: Build context around anchors, respecting section boundaries
+7. **Assemble**: Fit excerpts within character budget
 ## Performance

package/dist/{chunk-S3WZDJCP.js → chunk-5CZPI5V7.js} RENAMED

@@ -822,8 +822,8 @@ function segmentHtml($, container, options = {}) {
 }
 // src/scoring/bm25.ts
-var DEFAULT_K1 = 1.2;
-var DEFAULT_B = 0.5;
+var DEFAULT_K1 = 1.5;
+var DEFAULT_B = 0.75;
 function computeDocumentStats(sentences) {
   const docFrequency = {};
   let totalLength = 0;
@@ -1228,15 +1228,15 @@ function computeDensityStats(sentences, queryTokens) {
   }
   densities.sort((a, b) => a - b);
   const mid = Math.floor(densities.length / 2);
-  const median = densities.length % 2 === 0 ? ((densities[mid - 1] ?? 0) + (densities[mid] ?? 0)) / 2 : densities[mid] ?? 0;
-  const absoluteDeviations = densities.map((d) => Math.abs(d - median));
+  const median2 = densities.length % 2 === 0 ? ((densities[mid - 1] ?? 0) + (densities[mid] ?? 0)) / 2 : densities[mid] ?? 0;
+  const absoluteDeviations = densities.map((d) => Math.abs(d - median2));
   absoluteDeviations.sort((a, b) => a - b);
   const madMid = Math.floor(absoluteDeviations.length / 2);
   let mad = absoluteDeviations.length % 2 === 0 ? ((absoluteDeviations[madMid - 1] ?? 0) + (absoluteDeviations[madMid] ?? 0)) / 2 : absoluteDeviations[madMid] ?? 0;
   if (mad < 1e-3) {
     mad = 1e-3;
   }
-  return { median, mad };
+  return { median: median2, mad };
 }
 function isMetaHeading(heading) {
   const trimmed = heading.trim();
@@ -1886,6 +1886,75 @@ function fullDedupe(chunks, config = {}) {
   return removeSubsetChunks(merged);
 }
+// src/scoring/quality.ts
+var DEFAULT_CONFIG6 = {
+  minLongSentences: 3,
+  maxFragmentRatio: 0.65,
+  minMedianLength: 25,
+  minTotalSentences: 5
+};
+function median(values) {
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b);
+  const mid = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 0) {
+    const left = sorted[mid - 1];
+    const right = sorted[mid];
+    if (left !== void 0 && right !== void 0) {
+      return (left + right) / 2;
+    }
+    return 0;
+  }
+  return sorted[mid] ?? 0;
+}
+function assessDocumentQuality(sentences, config = {}) {
+  const {
+    minLongSentences,
+    maxFragmentRatio,
+    minMedianLength,
+    minTotalSentences
+  } = { ...DEFAULT_CONFIG6, ...config };
+  const totalSentences = sentences.length;
+  if (totalSentences === 0) {
+    return {
+      totalSentences: 0,
+      longSentenceCount: 0,
+      medianSentenceLength: 0,
+      fragmentRatio: 1,
+      passesThreshold: false,
+      rejectReason: "No sentences found"
+    };
+  }
+  const lengths = sentences.map((s) => s.text.length);
+  const longSentenceCount = lengths.filter((len) => len > 50).length;
+  const fragmentCount = lengths.filter((len) => len < 30).length;
+  const fragmentRatio = fragmentCount / totalSentences;
+  const medianSentenceLength = median(lengths);
+  let passesThreshold = true;
+  let rejectReason;
+  if (totalSentences < minTotalSentences) {
+    passesThreshold = false;
+    rejectReason = `Too few sentences (${totalSentences} < ${minTotalSentences})`;
+  } else if (longSentenceCount < minLongSentences) {
+    passesThreshold = false;
+    rejectReason = `Too few long sentences (${longSentenceCount} < ${minLongSentences})`;
+  } else if (fragmentRatio > maxFragmentRatio) {
+    passesThreshold = false;
+    rejectReason = `Too many fragments (${(fragmentRatio * 100).toFixed(0)}% > ${maxFragmentRatio * 100}%)`;
+  } else if (medianSentenceLength < minMedianLength) {
+    passesThreshold = false;
+    rejectReason = `Median sentence too short (${medianSentenceLength.toFixed(0)} < ${minMedianLength})`;
+  }
+  return {
+    totalSentences,
+    longSentenceCount,
+    medianSentenceLength,
+    fragmentRatio,
+    passesThreshold,
+    ...rejectReason && { rejectReason }
+  };
+}
 // src/pipeline.ts
 function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
   const result = {
@@ -1908,7 +1977,7 @@ function createEmptyResult(query, debug, debugInfo, relevanceMetrics) {
   }
   return result;
 }
-var DEFAULT_CONFIG6 = {
+var DEFAULT_CONFIG7 = {
   ranker: {
     bm25Weight: 0.6,
     heuristicWeight: 0.4
@@ -1937,10 +2006,17 @@ var DEFAULT_CONFIG6 = {
     charBudget: 6e3,
     minExcerptChars: 100
   },
+  quality: {
+    minLongSentences: 3,
+    maxFragmentRatio: 0.65,
+    minMedianLength: 25,
+    minTotalSentences: 5
+  },
+  skipQualityCheck: false,
   debug: false
 };
 function extractExcerpts(html, query, config = {}) {
-  const cfg = mergeConfig(DEFAULT_CONFIG6, config);
+  const cfg = mergeConfig(DEFAULT_CONFIG7, config);
   const logger2 = logger_default.getInstance();
   const { $, mainContent } = logger2.time("1. Preprocess HTML", () => preprocessHtml(html));
   if (mainContent === null) {
@@ -1950,6 +2026,28 @@ function extractExcerpts(html, query, config = {}) {
   if (sentences.length === 0) {
     return createEmptyResult(query, cfg.debug);
   }
+  if (!cfg.skipQualityCheck) {
+    const quality = logger2.time("2b. Quality check", () => assessDocumentQuality(sentences, cfg.quality));
+    if (!quality.passesThreshold) {
+      return createEmptyResult(
+        query,
+        cfg.debug,
+        {
+          sentenceCount: sentences.length,
+          hasRelevantResults: false,
+          topSentences: []
+        },
+        {
+          hasRelevantResults: false,
+          sentenceCount: sentences.length,
+          queryTermCoverage: 0,
+          maxBm25: 0,
+          maxCooccurrence: 0,
+          ...quality.rejectReason && { qualityRejectReason: quality.rejectReason }
+        }
+      );
+    }
+  }
   const queryTokens = logger2.time("3. Tokenize query", () => tokenize(query));
   if (queryTokens.length === 0) {
     return extractWithoutQuery(sentences, query, cfg);
@@ -2049,12 +2147,14 @@ function mergeConfig(defaults, overrides) {
     expand: { ...defaults.expand, ...overrides.expand },
     dedupe: { ...defaults.dedupe, ...overrides.dedupe },
     excerpts: { ...defaults.excerpts, ...overrides.excerpts },
+    quality: { ...defaults.quality, ...overrides.quality },
+    skipQualityCheck: overrides.skipQualityCheck ?? defaults.skipQualityCheck ?? false,
     debug: overrides.debug ?? defaults.debug ?? false
   };
 }
 // src/mcp/types.ts
-var DEFAULT_CONFIG7 = {
+var DEFAULT_CONFIG8 = {
   searxngUrl: "http://localhost:8888",
   maxResults: 5,
   timeout: 5e3,
@@ -2203,18 +2303,78 @@ function parseSearchOperators(query) {
 // src/mcp/orchestrator.ts
 var logger = logger_default.getInstance();
-var JS_RENDERED_DOMAINS = /* @__PURE__ */ new Set([
+var sessionCache = /* @__PURE__ */ new Map();
+var SESSION_TTL_MS = 10 * 1e3;
+function cleanupExpiredSessions() {
+  const now = Date.now();
+  for (const [key, session] of sessionCache) {
+    if (now - session.lastUsed > SESSION_TTL_MS) {
+      sessionCache.delete(key);
+    }
+  }
+}
+function getSession(sessionKey) {
+  cleanupExpiredSessions();
+  let session = sessionCache.get(sessionKey);
+  if (!session) {
+    session = { urls: /* @__PURE__ */ new Set(), lastUsed: Date.now() };
+    sessionCache.set(sessionKey, session);
+  } else {
+    session.lastUsed = Date.now();
+  }
+  return session;
+}
+function addUrlsToSession(sessionKey, urls) {
+  const session = getSession(sessionKey);
+  for (const url of urls) {
+    session.urls.add(url);
+  }
+}
+function filterSessionUrls(sessionKey, urls) {
+  if (!sessionKey) {
+    return { newUrls: urls, skippedUrls: [] };
+  }
+  const session = getSession(sessionKey);
+  const newUrls = [];
+  const skippedUrls = [];
+  for (const url of urls) {
+    if (session.urls.has(url)) {
+      skippedUrls.push(url);
+    } else {
+      newUrls.push(url);
+    }
+  }
+  return { newUrls, skippedUrls };
+}
+var BLOCKED_DOMAINS = /* @__PURE__ */ new Set([
   "medium.com",
-  "npmjs.com"
+  "npmjs.com",
+  "researchgate.net",
+  "grokipedia.org"
 ]);
+function isGitHubRepoMainPage(url) {
+  try {
+    const urlObj = new URL(url);
+    const hostname = urlObj.hostname.replace(/^www\./, "");
+    if (hostname !== "github.com") return false;
+    const path = urlObj.pathname.replace(/\/$/, "");
+    const segments = path.split("/").filter((s) => s.length > 0);
+    return segments.length === 2;
+  } catch {
+    return false;
+  }
+}
 function isBlockedDomain(url) {
   try {
     const hostname = new URL(url).hostname.replace(/^www\./, "");
-    for (const blocked of JS_RENDERED_DOMAINS) {
+    for (const blocked of BLOCKED_DOMAINS) {
       if (hostname === blocked || hostname.endsWith(`.${blocked}`)) {
         return true;
       }
     }
+    if (isGitHubRepoMainPage(url)) {
+      return true;
+    }
     return false;
   } catch {
     return false;
@@ -2398,6 +2558,10 @@ function formatPageStatus(diag) {
       return `~ [${shortUrl}] TRUNCATED: Output limit reached`;
     case "blocked_js":
       return `x [${shortUrl}] SKIPPED: JS-rendered site`;
+    case "low_quality":
+      return `x [${shortUrl}] SKIPPED: ${diag.error ?? "Low quality content"}`;
+    case "session_cached":
+      return `= [${shortUrl}] SKIPPED: Already fetched in this session`;
     default:
       return `? [${shortUrl}] Unknown status`;
   }
@@ -2410,7 +2574,10 @@ function generateSuggestions(diagnostics, queryTokens) {
   const scrapeFailedCount = diagnostics.filter((d) => d.status === "scrape_failed").length;
   const budgetExceededCount = diagnostics.filter((d) => d.status === "budget_exceeded").length;
   const blockedJsCount = diagnostics.filter((d) => d.status === "blocked_js").length;
-  if (successCount === 0) {
+  const lowQualityCount = diagnostics.filter((d) => d.status === "low_quality").length;
+  const sessionCachedCount = diagnostics.filter((d) => d.status === "session_cached").length;
+  const relevantDiagCount = diagnostics.length - sessionCachedCount;
+  if (successCount === 0 && relevantDiagCount > 0) {
     suggestions.push("NO RESULTS EXTRACTED. Your query may be too vague or not match the content.");
     suggestions.push("Try a more specific query with exact library names, function names, or error messages.");
     suggestions.push(`Current query tokens: [${queryTokens.join(", ")}] - ensure these terms appear in documentation you're looking for.`);
@@ -2419,8 +2586,8 @@ function generateSuggestions(diagnostics, queryTokens) {
     }
     return suggestions;
   }
-  if (successCount < diagnostics.length / 2) {
-    suggestions.push(`Only ${successCount}/${diagnostics.length} pages had relevant content.`);
+  if (relevantDiagCount > 0 && successCount < relevantDiagCount / 2) {
+    suggestions.push(`Only ${successCount}/${relevantDiagCount} pages had relevant content.`);
   }
   if (notRelevantPages.length >= 2) {
     const avgCoverage = notRelevantPages.reduce((sum, p) => sum + (p.metrics?.queryTermCoverage ?? 0), 0) / notRelevantPages.length;
@@ -2442,14 +2609,23 @@ function generateSuggestions(diagnostics, queryTokens) {
   if (blockedJsCount > 0) {
     suggestions.push(`${blockedJsCount} result(s) from Stack Overflow/GitHub were skipped (JavaScript-rendered).`);
   }
+  if (lowQualityCount > 0) {
+    suggestions.push(`${lowQualityCount} page(s) had low quality content (mostly metadata or fragments).`);
+  }
+  if (sessionCachedCount > 0) {
+    suggestions.push(`${sessionCachedCount} page(s) were skipped (already fetched earlier in this session).`);
+  }
   return suggestions;
 }
 function formatResults(result, includeDiagnostics) {
   const lines = [];
   lines.push(`# Search Results for: "${result.query}"
 `);
-  lines.push(`Found ${result.successfulPages} of ${result.totalPages} pages with relevant content.
-`);
+  let statusLine = `Found ${result.successfulPages} of ${result.totalPages} pages with relevant content.`;
+  if (result.sessionSkippedCount && result.sessionSkippedCount > 0) {
+    statusLine += ` (${result.sessionSkippedCount} already fetched in this session)`;
+  }
+  lines.push(statusLine + "\n");
   for (const page of result.pages) {
     if (page.excerpts.length === 0) continue;
     lines.push(`
@@ -2465,11 +2641,13 @@ function formatResults(result, includeDiagnostics) {
       lines.push("");
     }
   }
-  const noResults = result.pages.every((p) => p.excerpts.length === 0);
-  if (noResults) {
+  const hasExcerpts = result.pages.some((p) => p.excerpts.length > 0);
+  const hasSessionCached = (result.sessionSkippedCount ?? 0) > 0;
+  const showFailureInfo = !hasExcerpts && !hasSessionCached;
+  if (showFailureInfo) {
     lines.push("\nNo relevant content was extracted from any search result.\n");
   }
-  if (includeDiagnostics || noResults) {
+  if (includeDiagnostics) {
     lines.push("\n---\n");
     lines.push("## Search Diagnostics\n");
     lines.push("**Query Analysis:**");
@@ -2496,11 +2674,12 @@ async function search(query, config = {}) {
   const includeDiagnostics = config.diagnostics ?? false;
   const { searchQuery, extractionQuery } = parseSearchOperators(query);
   const cfg = {
-    searxngUrl: config.searxngUrl ?? DEFAULT_CONFIG7.searxngUrl,
-    maxResults: config.maxResults ?? DEFAULT_CONFIG7.maxResults,
-    timeout: config.timeout ?? DEFAULT_CONFIG7.timeout,
-    perPageCharBudget: config.perPageCharBudget ?? DEFAULT_CONFIG7.perPageCharBudget,
-    totalCharBudget: config.totalCharBudget ?? DEFAULT_CONFIG7.totalCharBudget
+    searxngUrl: config.searxngUrl ?? DEFAULT_CONFIG8.searxngUrl,
+    maxResults: config.maxResults ?? DEFAULT_CONFIG8.maxResults,
+    timeout: config.timeout ?? DEFAULT_CONFIG8.timeout,
+    perPageCharBudget: config.perPageCharBudget ?? DEFAULT_CONFIG8.perPageCharBudget,
+    totalCharBudget: config.totalCharBudget ?? DEFAULT_CONFIG8.totalCharBudget,
+    sessionKey: config.sessionKey
   };
   const requestMultiplier = 2;
   let searchResults;
@@ -2532,8 +2711,19 @@ async function search(query, config = {}) {
       scrapableResults.push(result2);
     }
   }
-  const resultsToScrape = scrapableResults.slice(0, cfg.maxResults);
-  logger.debug(`SearXNG returned ${searchResults.length} URLs, ${blockedResults.length} blocked, ${resultsToScrape.length} to scrape:`, debug);
+  let resultsToScrape = scrapableResults.slice(0, cfg.maxResults);
+  let sessionSkippedResults = [];
+  if (cfg.sessionKey) {
+    const urlsToCheck = resultsToScrape.map((r) => r.url);
+    const { newUrls, skippedUrls } = filterSessionUrls(cfg.sessionKey, urlsToCheck);
+    if (skippedUrls.length > 0) {
+      const skippedSet = new Set(skippedUrls);
+      sessionSkippedResults = resultsToScrape.filter((r) => skippedSet.has(r.url));
+      resultsToScrape = resultsToScrape.filter((r) => !skippedSet.has(r.url));
+      logger.debug(`Session '${cfg.sessionKey}': skipped ${skippedUrls.length} already-fetched URLs`, debug);
+    }
+  }
+  logger.debug(`SearXNG returned ${searchResults.length} URLs, ${blockedResults.length} blocked, ${sessionSkippedResults.length} session-cached, ${resultsToScrape.length} to scrape:`, debug);
   for (let i = 0; i < resultsToScrape.length; i++) {
     const r = resultsToScrape[i];
     if (r) {
@@ -2561,6 +2751,13 @@ async function search(query, config = {}) {
       status: "blocked_js"
     });
   }
+  for (const skipped of sessionSkippedResults) {
+    diagnostics.push({
+      url: skipped.url,
+      title: skipped.title,
+      status: "session_cached"
+    });
+  }
   const pageExtractions = [];
   const extractionStart = performance.now();
   for (const scrape of scrapeResults) {
@@ -2599,7 +2796,9 @@ async function search(query, config = {}) {
     if (result2.extraction.excerpts.length === 0) {
       const metrics = result2.relevanceMetrics;
       let status;
-      if (metrics && metrics.hasRelevantResults === false) {
+      if (metrics?.qualityRejectReason) {
+        status = "low_quality";
+      } else if (metrics && metrics.hasRelevantResults === false) {
         status = "not_relevant";
       } else if (metrics && metrics.sentenceCount === 0) {
         status = "no_content";
@@ -2611,6 +2810,9 @@ async function search(query, config = {}) {
         title: result2.extraction.title,
         status
       };
+      if (metrics?.qualityRejectReason) {
+        diagEntry.error = metrics.qualityRejectReason;
+      }
       if (metrics) {
         diagEntry.metrics = {
           sentenceCount: metrics.sentenceCount,
@@ -2692,6 +2894,11 @@ async function search(query, config = {}) {
     }
   }
   logger.recordTiming("MCP: Rank and budget pages", performance.now() - rankingStart);
+  if (cfg.sessionKey) {
+    const scrapedUrls = resultsToScrape.map((r) => r.url);
+    addUrlsToSession(cfg.sessionKey, scrapedUrls);
+    logger.debug(`Session '${cfg.sessionKey}': cached ${scrapedUrls.length} URLs for future deduplication`, debug);
+  }
   const result = {
     query,
     pages: budgetedPages,
@@ -2700,7 +2907,8 @@ async function search(query, config = {}) {
     successfulPages: budgetedPages.length,
     totalChars,
     diagnostics,
-    queryTokens
+    queryTokens,
+    ...sessionSkippedResults.length > 0 && { sessionSkippedCount: sessionSkippedResults.length }
   };
   const formatted = logger.time("MCP: Format results", () => formatResults(result, includeDiagnostics));
   logger.printTimings();

package/dist/index.js CHANGED

@@ -4,7 +4,7 @@ import {
   logger_default,
   search,
   tokenize
-} from "./chunk-S3WZDJCP.js";
+} from "./chunk-5CZPI5V7.js";
 // src/index.ts
 import * as fs from "fs";

package/dist/mcp/server.js CHANGED

@@ -1,7 +1,7 @@
 import {
   logger_default,
   search
-} from "../chunk-S3WZDJCP.js";
+} from "../chunk-5CZPI5V7.js";
 // src/mcp/server.ts
 import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -34,18 +34,30 @@ SEARCH OPERATORS:
 - "exact phrase" - exact phrase match
 - -term - exclude term
+SESSION DEDUPLICATION:
+When making multiple related searches (e.g., researching a topic), provide a consistent sessionKey (e.g., "react-hooks-research"). This prevents re-fetching pages you've already seen, saving time and avoiding duplicate content in your results.
+IMPORTANT - Diversify your searches:
+Similar queries return similar results from search engines. If you search "react useState hook" and then "react useState tutorial", you'll likely get overlapping pages. To maximize information:
+- Vary your query terms significantly between searches (don't just rephrase)
+- Search different aspects of the topic (e.g., "useState" vs "useReducer" vs "state management patterns")
+- Use specific terms for specific questions rather than broad terms repeatedly
+- The sessionKey deduplication helps, but can't prevent overlap if the search engine returns the same URLs for similar queries
 RETURNS: Extracted text excerpts with source URLs (not raw HTML).`,
   {
     query: z.string().describe(
       'Search query with technical terms. Supports operators: site:, "quotes", -exclude.'
     ),
     maxResults: z.number().optional().describe("Maximum pages to scrape (default: 5, max: 10)"),
-    diagnostics: z.boolean().optional().describe("Include detailed diagnostics about why pages were filtered or failed (default: false). Diagnostics are always shown when no results are found.")
+    diagnostics: z.boolean().optional().describe("Include detailed diagnostics about why pages were filtered or failed (default: false)."),
+    sessionKey: z.string().optional().describe("Session key for cross-call URL deduplication. When provided, URLs already fetched in previous calls with the same key will be skipped. Use a consistent key (e.g., 'react-research') across related searches to avoid re-fetching the same pages.")
   },
-  async ({ query, maxResults, diagnostics }) => {
+  async ({ query, maxResults, diagnostics, sessionKey }) => {
     const result = await search(query, {
       ...maxResults !== void 0 && { maxResults },
-      ...diagnostics !== void 0 && { diagnostics }
+      ...diagnostics !== void 0 && { diagnostics },
+      ...sessionKey !== void 0 && { sessionKey }
     });
     return {
       content: [

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "peeky-search",
-  "version": "1.0.10",
+  "version": "1.0.11",
   "description": "IR-based HTML content extraction with MCP server for web search",
   "type": "module",
   "main": "dist/index.js",