searchsocket 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5025,32 +5025,32 @@ var require_URL = __commonJS({
5025
5025
  else
5026
5026
  return basepath.substring(0, lastslash + 1) + refpath;
5027
5027
  }
5028
- function remove_dot_segments(path14) {
5029
- if (!path14) return path14;
5028
+ function remove_dot_segments(path15) {
5029
+ if (!path15) return path15;
5030
5030
  var output = "";
5031
- while (path14.length > 0) {
5032
- if (path14 === "." || path14 === "..") {
5033
- path14 = "";
5031
+ while (path15.length > 0) {
5032
+ if (path15 === "." || path15 === "..") {
5033
+ path15 = "";
5034
5034
  break;
5035
5035
  }
5036
- var twochars = path14.substring(0, 2);
5037
- var threechars = path14.substring(0, 3);
5038
- var fourchars = path14.substring(0, 4);
5036
+ var twochars = path15.substring(0, 2);
5037
+ var threechars = path15.substring(0, 3);
5038
+ var fourchars = path15.substring(0, 4);
5039
5039
  if (threechars === "../") {
5040
- path14 = path14.substring(3);
5040
+ path15 = path15.substring(3);
5041
5041
  } else if (twochars === "./") {
5042
- path14 = path14.substring(2);
5042
+ path15 = path15.substring(2);
5043
5043
  } else if (threechars === "/./") {
5044
- path14 = "/" + path14.substring(3);
5045
- } else if (twochars === "/." && path14.length === 2) {
5046
- path14 = "/";
5047
- } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5048
- path14 = "/" + path14.substring(4);
5044
+ path15 = "/" + path15.substring(3);
5045
+ } else if (twochars === "/." && path15.length === 2) {
5046
+ path15 = "/";
5047
+ } else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
5048
+ path15 = "/" + path15.substring(4);
5049
5049
  output = output.replace(/\/?[^\/]*$/, "");
5050
5050
  } else {
5051
- var segment = path14.match(/(\/?([^\/]*))/)[0];
5051
+ var segment = path15.match(/(\/?([^\/]*))/)[0];
5052
5052
  output += segment;
5053
- path14 = path14.substring(segment.length);
5053
+ path15 = path15.substring(segment.length);
5054
5054
  }
5055
5055
  }
5056
5056
  return output;
@@ -16614,6 +16614,8 @@ var searchSocketConfigSchema = zod.z.object({
16614
16614
  envVar: zod.z.string().min(1).optional(),
16615
16615
  sanitize: zod.z.boolean().optional()
16616
16616
  }).optional(),
16617
+ exclude: zod.z.array(zod.z.string()).optional(),
16618
+ respectRobotsTxt: zod.z.boolean().optional(),
16617
16619
  source: zod.z.object({
16618
16620
  mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
16619
16621
  staticOutputDir: zod.z.string().min(1).optional(),
@@ -16744,6 +16746,8 @@ function createDefaultConfig(projectId) {
16744
16746
  envVar: "SEARCHSOCKET_SCOPE",
16745
16747
  sanitize: true
16746
16748
  },
16749
+ exclude: [],
16750
+ respectRobotsTxt: true,
16747
16751
  source: {
16748
16752
  mode: "static-output",
16749
16753
  staticOutputDir: "build",
@@ -16774,7 +16778,7 @@ function createDefaultConfig(projectId) {
16774
16778
  },
16775
16779
  embeddings: {
16776
16780
  provider: "jina",
16777
- model: "jina-embeddings-v3",
16781
+ model: "jina-embeddings-v5-text-small",
16778
16782
  apiKeyEnv: "JINA_API_KEY",
16779
16783
  batchSize: 64,
16780
16784
  concurrency: 4
@@ -16787,9 +16791,9 @@ function createDefaultConfig(projectId) {
16787
16791
  }
16788
16792
  },
16789
16793
  rerank: {
16790
- enabled: false,
16794
+ enabled: true,
16791
16795
  topN: 20,
16792
- model: "jina-reranker-v2-base-multilingual"
16796
+ model: "jina-reranker-v3"
16793
16797
  },
16794
16798
  ranking: {
16795
16799
  enableIncomingLinkBoost: true,
@@ -16908,6 +16912,8 @@ ${issues}`
16908
16912
  ...defaults.scope,
16909
16913
  ...parsed.scope
16910
16914
  },
16915
+ exclude: parsed.exclude ?? defaults.exclude,
16916
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
16911
16917
  source: {
16912
16918
  ...defaults.source,
16913
16919
  ...parsed.source,
@@ -19049,6 +19055,17 @@ function extractFromHtml(url, html, config) {
19049
19055
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
19050
19056
  return null;
19051
19057
  }
19058
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
19059
+ let weight;
19060
+ if (weightRaw !== void 0) {
19061
+ const parsed = Number(weightRaw);
19062
+ if (Number.isFinite(parsed) && parsed >= 0) {
19063
+ weight = parsed;
19064
+ }
19065
+ }
19066
+ if (weight === 0) {
19067
+ return null;
19068
+ }
19052
19069
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19053
19070
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19054
19071
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19104,7 +19121,8 @@ function extractFromHtml(url, html, config) {
19104
19121
  noindex: false,
19105
19122
  tags,
19106
19123
  description,
19107
- keywords
19124
+ keywords,
19125
+ weight
19108
19126
  };
19109
19127
  }
19110
19128
  function extractFromMarkdown(url, markdown, title) {
@@ -19117,6 +19135,14 @@ function extractFromMarkdown(url, markdown, title) {
19117
19135
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
19118
19136
  return null;
19119
19137
  }
19138
+ let mdWeight;
19139
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
19140
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
19141
+ mdWeight = rawWeight;
19142
+ }
19143
+ if (mdWeight === 0) {
19144
+ return null;
19145
+ }
19120
19146
  const content = parsed.content;
19121
19147
  const normalized = normalizeMarkdown(content);
19122
19148
  if (!normalizeText(normalized)) {
@@ -19139,7 +19165,8 @@ function extractFromMarkdown(url, markdown, title) {
19139
19165
  noindex: false,
19140
19166
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19141
19167
  description: fmDescription,
19142
- keywords: fmKeywords
19168
+ keywords: fmKeywords,
19169
+ weight: mdWeight
19143
19170
  };
19144
19171
  }
19145
19172
  function yamlString(value) {
@@ -19335,6 +19362,38 @@ var Logger = class {
19335
19362
  `);
19336
19363
  }
19337
19364
  };
19365
+
19366
+ // src/utils/pattern.ts
19367
/**
 * Test a URL path against a single pattern.
 *
 * One trailing slash is ignored on both the URL and the pattern (except for
 * the bare root "/"). Three pattern forms are recognised:
 *   - "<prefix>/**"  matches <prefix> itself and anything below it;
 *                    "/**" matches every URL.
 *   - "<prefix>/*"   matches exactly one extra path segment below <prefix>.
 *   - anything else  must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Exact path or glob pattern.
 * @returns {boolean} Whether the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Recursive glob: the prefix itself or any descendant.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(`${base}/`);
  }

  // No wildcard suffix: plain equality.
  if (!glob.endsWith("/*")) {
    return target === glob;
  }

  // Single-level glob: exactly one non-empty segment below the prefix.
  const base = glob.slice(0, -2);
  if (base === "") {
    return target !== "/" && !target.slice(1).includes("/");
  }
  if (!target.startsWith(`${base}/`)) {
    return false;
  }
  const remainder = target.slice(base.length + 1);
  return remainder.length > 0 && !remainder.includes("/");
}
19389
/**
 * Report whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url - URL path to test.
 * @param {string[]} patterns - Patterns understood by matchUrlPattern.
 * @returns {boolean} True as soon as any pattern matches; false for an empty list.
 */
function matchUrlPatterns(url, patterns) {
  return patterns.some((candidate) => matchUrlPattern(url, candidate));
}
19395
+
19396
+ // src/indexing/sources/build/manifest-parser.ts
19338
19397
  function routeIdToFile(routeId) {
19339
19398
  if (routeId === "/") {
19340
19399
  return "src/routes/+page.svelte";
@@ -19408,15 +19467,7 @@ function expandDynamicUrl(url, value) {
19408
19467
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
19409
19468
  }
19410
19469
  function isExcluded(url, patterns) {
19411
- for (const pattern of patterns) {
19412
- if (pattern.endsWith("/*")) {
19413
- const prefix = pattern.slice(0, -1);
19414
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
19415
- } else if (url === pattern) {
19416
- return true;
19417
- }
19418
- }
19419
- return false;
19470
+ return matchUrlPatterns(url, patterns);
19420
19471
  }
19421
19472
  function findFreePort() {
19422
19473
  return new Promise((resolve, reject) => {
@@ -19832,6 +19883,158 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
19832
19883
  }
19833
19884
  return pages;
19834
19885
  }
19886
/**
 * Parse robots.txt text and return the Allow/Disallow rules that apply to
 * one crawler.
 *
 * Grouping follows the Robots Exclusion Protocol:
 *   - consecutive `User-agent` lines share the rules that follow them;
 *   - a `User-agent` line that appears after a rule line starts a new group,
 *     so later groups never leak rules onto earlier agents;
 *   - directives other than `User-agent`, `Allow` and `Disallow`
 *     (`Crawl-delay`, `Sitemap`, ...) are ignored and do not end a group;
 *   - rules from several groups naming the same agent are merged.
 *
 * An empty `Allow:`/`Disallow:` value contributes no path but still counts
 * as a rule line for grouping purposes.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name, matched
 *   case-insensitively against whole `User-agent` values.
 * @returns {{disallow: string[], allow: string[]}} The group for `userAgent`
 *   when the file names that agent (even if the group has no paths, which
 *   means "no restrictions"), otherwise the `*` group, otherwise empty rules.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // Set once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      if (!currentAgents.includes(agentName)) {
        currentAgents.push(agentName);
      }
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      groupHasRules = true;
      if (value) {
        for (const agent of currentAgents) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is not part of the rule grammar: skip it without
    // touching the current group.
  }

  // A group that names this agent always wins over "*", even when it lists
  // no paths (e.g. a bare "Disallow:" meaning everything is allowed).
  return agentGroups.get(userAgent.toLowerCase()) ?? agentGroups.get("*") ?? { disallow: [], allow: [] };
}
19921
/**
 * Match a URL path against one robots.txt rule path.
 *
 * The rule is a prefix match in which `*` stands for any run of characters
 * and a trailing `$` anchors the rule to the end of the path. Matching is
 * done with plain string scans (no RegExp), so rule text cannot trigger
 * pathological backtracking.
 *
 * @param {string} urlPath - URL path being checked.
 * @param {string} pattern - Rule path from an Allow/Disallow line.
 * @returns {boolean} Whether the rule applies to the path.
 */
function matchesRobotsPattern(urlPath, pattern) {
  const anchored = pattern.endsWith("$");
  const body = anchored ? pattern.slice(0, -1) : pattern;
  const parts = body.split("*");

  const head = parts[0];
  if (!urlPath.startsWith(head)) return false;
  let pos = head.length;

  // No wildcard: plain prefix match, or equality when anchored.
  if (parts.length === 1) {
    return !anchored || urlPath.length === pos;
  }

  // Middle literals must appear in order; leftmost placement is sufficient.
  const lastIdx = parts.length - 1;
  for (let i = 1; i < lastIdx; i++) {
    const found = urlPath.indexOf(parts[i], pos);
    if (found === -1) return false;
    pos = found + parts[i].length;
  }

  const tail = parts[lastIdx];
  if (anchored) {
    // The final literal must end the path without overlapping earlier parts.
    return urlPath.length - tail.length >= pos && urlPath.endsWith(tail);
  }
  return urlPath.indexOf(tail, pos) !== -1;
}

/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * The longest matching Disallow rule is compared with the longest matching
 * Allow rule; the path is blocked only when the Disallow rule is strictly
 * longer, so Allow wins ties. Rule paths may use `*` and a trailing `$`.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules3 - Parsed rules.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  const longestMatch = (patterns) => {
    let best = "";
    for (const pattern of patterns) {
      if (pattern.length > best.length && matchesRobotsPattern(urlPath, pattern)) {
        best = pattern;
      }
    }
    return best;
  };

  const longestDisallow = longestMatch(rules3.disallow);
  if (!longestDisallow) return false;
  return longestMatch(rules3.allow).length < longestDisallow.length;
}
19937
/**
 * Read `robots.txt` from a directory and parse it with parseRobotsTxt.
 *
 * This is best-effort: any failure while building the path, reading the
 * file or parsing it results in `null` rather than an error.
 *
 * @param {string} dir - Directory expected to contain `robots.txt`.
 * @returns {Promise<object|null>} Parsed rules, or null when unavailable.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsFile = path__default.default.join(dir, "robots.txt");
    const text = await fs4__default.default.readFile(robotsFile, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    // Missing or unreadable file: fall through with no rules.
  }
  return rules;
}
19945
/**
 * Fetch `/robots.txt` from the origin of `baseUrl` and parse it with
 * parseRobotsTxt.
 *
 * This is best-effort: an invalid base URL, a network failure or a non-OK
 * HTTP response all yield `null` rather than an error.
 *
 * @param {string} baseUrl - Base URL of the site being crawled.
 * @returns {Promise<object|null>} Parsed rules, or null when unavailable.
 */
async function fetchRobotsTxt(baseUrl) {
  let rules = null;
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const res = await fetch(robotsUrl.href);
    if (res.ok) {
      const body = await res.text();
      rules = parseRobotsTxt(body);
    }
  } catch {
    // Unreachable host or bad URL: fall through with no rules.
  }
  return rules;
}
19956
+
19957
+ // src/search/ranking.ts
19958
/**
 * Clamp a value to a finite, non-negative number.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when it is a finite number greater than zero,
 *   otherwise 0 (covers negatives, NaN, infinities and non-numbers).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) && value > 0 ? value : 0;
}
19964
/**
 * Score vector-store hits and return them sorted best-first.
 *
 * Each hit starts from its own `score` (negative infinity when that is not
 * a finite number). Two optional boosts, switched by `config.ranking`, are
 * added on top:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth)        * weights.depth
 *
 * @param {Array<object>} hits - Hits carrying `score` and `metadata`.
 * @param {object} config - Configuration providing `ranking`.
 * @returns {Array<{hit: object, finalScore: number}>} Scored hits in
 *   descending `finalScore` order; non-finite totals become negative infinity.
 */
function rankHits(hits, config) {
  // Comparisons that produce NaN (e.g. -Infinity vs -Infinity) count as ties.
  const byFinalScoreDesc = (left, right) => {
    const delta = right.finalScore - left.finalScore;
    return Number.isNaN(delta) ? 0 : delta;
  };

  const scored = [];
  for (const hit of hits) {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (config.ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * config.ranking.weights.incomingLinks;
    }
    if (config.ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * config.ranking.weights.depth;
    }
    const finalScore = Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY;
    scored.push({ hit, finalScore });
  }
  return scored.sort(byFinalScoreDesc);
}
19984
/**
 * Look up the ranking weight configured for a URL.
 *
 * Precedence:
 *   1. A pattern without a wildcard suffix that matches the URL (an exact
 *      match) wins outright. Without this rule a glob such as "/docs/**"
 *      would outrank the exact entries "/docs" or "/docs/a" simply because
 *      its text is longer.
 *   2. Otherwise, among matching glob patterns ("/*" or "/**" suffix), the
 *      one with the longest pattern text wins; the first one wins a tie.
 *   3. With no match the weight is 1.
 *
 * @param {string} url - URL path to look up.
 * @param {Object<string, number>} pageWeights - Pattern-to-weight map.
 * @returns {number} The selected weight, or 1 when nothing matches.
 */
function findPageWeight(url, pageWeights) {
  // Same trailing-slash handling the matcher applies, so "/docs/**/" is
  // still recognised as a glob.
  const stripTrailingSlash = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;

  let bestPattern = "";
  let bestWeight = 1;
  for (const [pattern, weight] of Object.entries(pageWeights ?? {})) {
    if (!matchUrlPattern(url, pattern)) continue;

    const normalizedPattern = stripTrailingSlash(pattern);
    const isGlob = normalizedPattern.endsWith("/**") || normalizedPattern.endsWith("/*");
    if (!isGlob) {
      // Exact match: nothing can be more specific.
      return weight;
    }
    if (pattern.length > bestPattern.length) {
      bestPattern = pattern;
      bestWeight = weight;
    }
  }
  return bestWeight;
}
19995
/**
 * Collapse ranked chunk hits into one scored entry per page URL.
 *
 * For each URL the chunks are sorted best-first. The page score is the best
 * chunk's score plus a bonus: the following chunks (limited to the first
 * `aggregationCap` chunks overall) each contribute their score multiplied by
 * `aggregationDecay ** position`, and the sum is scaled by
 * `weights.aggregation`. The result is then multiplied by the weight that
 * findPageWeight returns for the URL; pages whose weight is 0 are dropped.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Scored hits.
 * @param {object} config - Configuration providing `ranking`.
 * @returns {Array<object>} Pages sorted by descending `pageScore`, each with
 *   `url`, `title`, `routeFile`, `pageScore`, `bestChunk`, `matchingChunks`.
 */
function aggregateByPage(ranked, config) {
  // Bucket hits by URL, keeping first-seen URL order.
  const byUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const key = entry.hit.metadata.url;
    if (!byUrl.has(key)) {
      byUrl.set(key, []);
    }
    byUrl.get(key).push(entry);
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  // Comparisons that produce NaN count as ties.
  const descendingBy = (field) => (left, right) => {
    const delta = right[field] - left[field];
    return Number.isNaN(delta) ? 0 : delta;
  };

  const pages = [];
  byUrl.forEach((chunks, url) => {
    chunks.sort(descendingBy("finalScore"));
    const [best] = chunks;
    const topScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // Position 0 is the best chunk itself and adds no bonus.
    const bonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, position) => {
      if (position === 0) {
        return sum;
      }
      const contribution = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + contribution * Math.pow(aggregationDecay, position);
    }, 0);

    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      return;
    }
    // Multiplying by a weight of exactly 1 leaves the score untouched.
    const weighted = (topScore + bonus * config.ranking.weights.aggregation) * pageWeight;

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(weighted) ? weighted : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  });

  return pages.sort(descendingBy("pageScore"));
}
19835
20038
 
19836
20039
  // src/utils/time.ts
19837
20040
  function nowIso() {
@@ -19843,9 +20046,10 @@ function hrTimeMs(start) {
19843
20046
 
19844
20047
  // src/indexing/pipeline.ts
19845
20048
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
19846
- "jina-embeddings-v3": 2e-5
20049
+ "jina-embeddings-v3": 2e-5,
20050
+ "jina-embeddings-v5-text-small": 5e-5
19847
20051
  };
19848
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20052
+ var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
19849
20053
  var IndexPipeline = class _IndexPipeline {
19850
20054
  cwd;
19851
20055
  config;
@@ -19923,6 +20127,53 @@ var IndexPipeline = class _IndexPipeline {
19923
20127
  }
19924
20128
  stageEnd("source", sourceStart);
19925
20129
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20130
+ const filterStart = stageStart();
20131
+ let filteredSourcePages = sourcePages;
20132
+ if (this.config.exclude.length > 0) {
20133
+ const beforeExclude = filteredSourcePages.length;
20134
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20135
+ const url = normalizeUrlPath(p.url);
20136
+ if (matchUrlPatterns(url, this.config.exclude)) {
20137
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
20138
+ return false;
20139
+ }
20140
+ return true;
20141
+ });
20142
+ const excludedCount = beforeExclude - filteredSourcePages.length;
20143
+ if (excludedCount > 0) {
20144
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
20145
+ }
20146
+ }
20147
+ if (this.config.respectRobotsTxt) {
20148
+ let robotsRules = null;
20149
+ if (sourceMode === "static-output") {
20150
+ robotsRules = await loadRobotsTxtFromDir(
20151
+ path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
20152
+ );
20153
+ } else if (sourceMode === "build" && this.config.source.build) {
20154
+ robotsRules = await loadRobotsTxtFromDir(
20155
+ path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
20156
+ );
20157
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
20158
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
20159
+ }
20160
+ if (robotsRules) {
20161
+ const beforeRobots = filteredSourcePages.length;
20162
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20163
+ const url = normalizeUrlPath(p.url);
20164
+ if (isBlockedByRobots(url, robotsRules)) {
20165
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
20166
+ return false;
20167
+ }
20168
+ return true;
20169
+ });
20170
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
20171
+ if (robotsExcluded > 0) {
20172
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
20173
+ }
20174
+ }
20175
+ }
20176
+ stageEnd("filter", filterStart);
19926
20177
  const routeStart = stageStart();
19927
20178
  const routePatterns = await buildRoutePatterns(this.cwd);
19928
20179
  stageEnd("route_map", routeStart);
@@ -19930,7 +20181,7 @@ var IndexPipeline = class _IndexPipeline {
19930
20181
  const extractStart = stageStart();
19931
20182
  this.logger.info("Extracting content...");
19932
20183
  const extractedPages = [];
19933
- for (const sourcePage of sourcePages) {
20184
+ for (const sourcePage of filteredSourcePages) {
19934
20185
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
19935
20186
  if (!extracted) {
19936
20187
  this.logger.warn(
@@ -19956,16 +20207,29 @@ var IndexPipeline = class _IndexPipeline {
19956
20207
  seenUrls.add(page.url);
19957
20208
  uniquePages.push(page);
19958
20209
  }
20210
+ const indexablePages = [];
20211
+ for (const page of uniquePages) {
20212
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
20213
+ if (effectiveWeight === 0) {
20214
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
20215
+ continue;
20216
+ }
20217
+ indexablePages.push(page);
20218
+ }
20219
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
20220
+ if (zeroWeightCount > 0) {
20221
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
20222
+ }
19959
20223
  stageEnd("extract", extractStart);
19960
- const skippedPages = sourcePages.length - uniquePages.length;
19961
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20224
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
20225
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
19962
20226
  const linkStart = stageStart();
19963
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20227
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
19964
20228
  const incomingLinkCount = /* @__PURE__ */ new Map();
19965
- for (const page of uniquePages) {
20229
+ for (const page of indexablePages) {
19966
20230
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
19967
20231
  }
19968
- for (const page of uniquePages) {
20232
+ for (const page of indexablePages) {
19969
20233
  for (const outgoing of page.outgoingLinks) {
19970
20234
  if (!pageSet.has(outgoing)) {
19971
20235
  continue;
@@ -19989,7 +20253,7 @@ var IndexPipeline = class _IndexPipeline {
19989
20253
  });
19990
20254
  }
19991
20255
  }
19992
- for (const page of uniquePages) {
20256
+ for (const page of indexablePages) {
19993
20257
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
19994
20258
  if (routeMatch.routeResolution === "best-effort") {
19995
20259
  if (this.config.source.strictRouteMapping) {
@@ -20206,100 +20470,6 @@ var IndexPipeline = class _IndexPipeline {
20206
20470
  };
20207
20471
  }
20208
20472
  };
20209
-
20210
- // src/search/ranking.ts
20211
- function nonNegativeOrZero(value) {
20212
- if (!Number.isFinite(value)) {
20213
- return 0;
20214
- }
20215
- return Math.max(0, value);
20216
- }
20217
- function rankHits(hits, config) {
20218
- return hits.map((hit) => {
20219
- let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
20220
- if (config.ranking.enableIncomingLinkBoost) {
20221
- const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
20222
- score += incomingBoost * config.ranking.weights.incomingLinks;
20223
- }
20224
- if (config.ranking.enableDepthBoost) {
20225
- const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
20226
- score += depthBoost * config.ranking.weights.depth;
20227
- }
20228
- return {
20229
- hit,
20230
- finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
20231
- };
20232
- }).sort((a, b) => {
20233
- const delta = b.finalScore - a.finalScore;
20234
- return Number.isNaN(delta) ? 0 : delta;
20235
- });
20236
- }
20237
- function findPageWeight(url, pageWeights) {
20238
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
20239
- const normalizedUrl = norm(url);
20240
- for (const [pattern, weight] of Object.entries(pageWeights)) {
20241
- if (norm(pattern) === normalizedUrl) {
20242
- return weight;
20243
- }
20244
- }
20245
- let bestPrefix = "";
20246
- let bestWeight = 1;
20247
- for (const [pattern, weight] of Object.entries(pageWeights)) {
20248
- const normalizedPattern = norm(pattern);
20249
- if (normalizedPattern === "/") continue;
20250
- const prefix = `${normalizedPattern}/`;
20251
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
20252
- bestPrefix = prefix;
20253
- bestWeight = weight;
20254
- }
20255
- }
20256
- return bestWeight;
20257
- }
20258
- function aggregateByPage(ranked, config) {
20259
- const groups = /* @__PURE__ */ new Map();
20260
- for (const hit of ranked) {
20261
- const url = hit.hit.metadata.url;
20262
- const group = groups.get(url);
20263
- if (group) group.push(hit);
20264
- else groups.set(url, [hit]);
20265
- }
20266
- const { aggregationCap, aggregationDecay } = config.ranking;
20267
- const pages = [];
20268
- for (const [url, chunks] of groups) {
20269
- chunks.sort((a, b) => {
20270
- const delta = b.finalScore - a.finalScore;
20271
- return Number.isNaN(delta) ? 0 : delta;
20272
- });
20273
- const best = chunks[0];
20274
- const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
20275
- const topChunks = chunks.slice(0, aggregationCap);
20276
- let aggregationBonus = 0;
20277
- for (let i = 1; i < topChunks.length; i++) {
20278
- const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
20279
- aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
20280
- }
20281
- let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
20282
- const pageWeight = findPageWeight(url, config.ranking.pageWeights);
20283
- if (pageWeight === 0) continue;
20284
- if (pageWeight !== 1) {
20285
- pageScore *= pageWeight;
20286
- }
20287
- pages.push({
20288
- url,
20289
- title: best.hit.metadata.title,
20290
- routeFile: best.hit.metadata.routeFile,
20291
- pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
20292
- bestChunk: best,
20293
- matchingChunks: chunks
20294
- });
20295
- }
20296
- return pages.sort((a, b) => {
20297
- const delta = b.pageScore - a.pageScore;
20298
- return Number.isNaN(delta) ? 0 : delta;
20299
- });
20300
- }
20301
-
20302
- // src/search/engine.ts
20303
20473
  var requestSchema = zod.z.object({
20304
20474
  q: zod.z.string().trim().min(1),
20305
20475
  topK: zod.z.number().int().positive().max(100).optional(),
@@ -20307,7 +20477,8 @@ var requestSchema = zod.z.object({
20307
20477
  pathPrefix: zod.z.string().optional(),
20308
20478
  tags: zod.z.array(zod.z.string()).optional(),
20309
20479
  rerank: zod.z.boolean().optional(),
20310
- groupBy: zod.z.enum(["page", "chunk"]).optional()
20480
+ groupBy: zod.z.enum(["page", "chunk"]).optional(),
20481
+ stream: zod.z.boolean().optional()
20311
20482
  });
20312
20483
  var SearchEngine = class _SearchEngine {
20313
20484
  cwd;
@@ -20380,7 +20551,103 @@ var SearchEngine = class _SearchEngine {
20380
20551
  rerankMs = hrTimeMs(rerankStart);
20381
20552
  usedRerank = true;
20382
20553
  }
20383
- let results;
20554
+ const results = this.buildResults(ordered, topK, groupByPage);
20555
+ return {
20556
+ q: input.q,
20557
+ scope: resolvedScope.scopeName,
20558
+ results,
20559
+ meta: {
20560
+ timingsMs: {
20561
+ embed: Math.round(embedMs),
20562
+ vector: Math.round(vectorMs),
20563
+ rerank: Math.round(rerankMs),
20564
+ total: Math.round(hrTimeMs(totalStart))
20565
+ },
20566
+ usedRerank,
20567
+ modelId: this.config.embeddings.model
20568
+ }
20569
+ };
20570
+ }
20571
+ async *searchStreaming(request) {
20572
+ const parsed = requestSchema.safeParse(request);
20573
+ if (!parsed.success) {
20574
+ throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
20575
+ }
20576
+ const input = parsed.data;
20577
+ const wantsRerank = Boolean(input.rerank);
20578
+ if (!wantsRerank) {
20579
+ const response = await this.search(request);
20580
+ yield { phase: "initial", data: response };
20581
+ return;
20582
+ }
20583
+ const totalStart = process.hrtime.bigint();
20584
+ const resolvedScope = resolveScope(this.config, input.scope);
20585
+ await this.assertModelCompatibility(resolvedScope);
20586
+ const topK = input.topK ?? 10;
20587
+ const groupByPage = (input.groupBy ?? "page") === "page";
20588
+ const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
20589
+ const embedStart = process.hrtime.bigint();
20590
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
20591
+ const queryVector = queryEmbeddings[0];
20592
+ if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
20593
+ throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
20594
+ }
20595
+ const embedMs = hrTimeMs(embedStart);
20596
+ const vectorStart = process.hrtime.bigint();
20597
+ const hits = await this.vectorStore.query(
20598
+ queryVector,
20599
+ {
20600
+ topK: candidateK,
20601
+ pathPrefix: input.pathPrefix,
20602
+ tags: input.tags
20603
+ },
20604
+ resolvedScope
20605
+ );
20606
+ const vectorMs = hrTimeMs(vectorStart);
20607
+ const ranked = rankHits(hits, this.config);
20608
+ const initialResults = this.buildResults(ranked, topK, groupByPage);
20609
+ yield {
20610
+ phase: "initial",
20611
+ data: {
20612
+ q: input.q,
20613
+ scope: resolvedScope.scopeName,
20614
+ results: initialResults,
20615
+ meta: {
20616
+ timingsMs: {
20617
+ embed: Math.round(embedMs),
20618
+ vector: Math.round(vectorMs),
20619
+ rerank: 0,
20620
+ total: Math.round(hrTimeMs(totalStart))
20621
+ },
20622
+ usedRerank: false,
20623
+ modelId: this.config.embeddings.model
20624
+ }
20625
+ }
20626
+ };
20627
+ const rerankStart = process.hrtime.bigint();
20628
+ const reranked = await this.rerankHits(input.q, ranked, topK);
20629
+ const rerankMs = hrTimeMs(rerankStart);
20630
+ const rerankedResults = this.buildResults(reranked, topK, groupByPage);
20631
+ yield {
20632
+ phase: "reranked",
20633
+ data: {
20634
+ q: input.q,
20635
+ scope: resolvedScope.scopeName,
20636
+ results: rerankedResults,
20637
+ meta: {
20638
+ timingsMs: {
20639
+ embed: Math.round(embedMs),
20640
+ vector: Math.round(vectorMs),
20641
+ rerank: Math.round(rerankMs),
20642
+ total: Math.round(hrTimeMs(totalStart))
20643
+ },
20644
+ usedRerank: true,
20645
+ modelId: this.config.embeddings.model
20646
+ }
20647
+ }
20648
+ };
20649
+ }
20650
+ buildResults(ordered, topK, groupByPage) {
20384
20651
  const minScore = this.config.ranking.minScore;
20385
20652
  if (groupByPage) {
20386
20653
  let pages = aggregateByPage(ordered, this.config);
@@ -20388,10 +20655,10 @@ var SearchEngine = class _SearchEngine {
20388
20655
  pages = pages.filter((p) => p.pageScore >= minScore);
20389
20656
  }
20390
20657
  const minRatio = this.config.ranking.minChunkScoreRatio;
20391
- results = pages.slice(0, topK).map((page) => {
20658
+ return pages.slice(0, topK).map((page) => {
20392
20659
  const bestScore = page.bestChunk.finalScore;
20393
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20394
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
20660
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20661
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
20395
20662
  return {
20396
20663
  url: page.url,
20397
20664
  title: page.title,
@@ -20408,10 +20675,11 @@ var SearchEngine = class _SearchEngine {
20408
20675
  };
20409
20676
  });
20410
20677
  } else {
20678
+ let filtered = ordered;
20411
20679
  if (minScore > 0) {
20412
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
20680
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
20413
20681
  }
20414
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
20682
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
20415
20683
  url: hit.metadata.url,
20416
20684
  title: hit.metadata.title,
20417
20685
  sectionTitle: hit.metadata.sectionTitle || void 0,
@@ -20420,21 +20688,6 @@ var SearchEngine = class _SearchEngine {
20420
20688
  routeFile: hit.metadata.routeFile
20421
20689
  }));
20422
20690
  }
20423
- return {
20424
- q: input.q,
20425
- scope: resolvedScope.scopeName,
20426
- results,
20427
- meta: {
20428
- timingsMs: {
20429
- embed: Math.round(embedMs),
20430
- vector: Math.round(vectorMs),
20431
- rerank: Math.round(rerankMs),
20432
- total: Math.round(hrTimeMs(totalStart))
20433
- },
20434
- usedRerank,
20435
- modelId: this.config.embeddings.model
20436
- }
20437
- };
20438
20691
  }
20439
20692
  async getPage(pathOrUrl, scope) {
20440
20693
  const resolvedScope = resolveScope(this.config, scope);
@@ -20557,7 +20810,7 @@ var SearchEngine = class _SearchEngine {
20557
20810
  });
20558
20811
  }
20559
20812
  };
20560
- function createServer(engine) {
20813
+ function createServer(engine, config) {
20561
20814
  const server = new mcp_js.McpServer({
20562
20815
  name: "searchsocket-mcp",
20563
20816
  version: "0.1.0"
@@ -20565,14 +20818,15 @@ function createServer(engine) {
20565
20818
  server.registerTool(
20566
20819
  "search",
20567
20820
  {
20568
- description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and topK.",
20821
+ description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
20569
20822
  inputSchema: {
20570
20823
  query: zod.z.string().min(1),
20571
20824
  scope: zod.z.string().optional(),
20572
20825
  topK: zod.z.number().int().positive().max(100).optional(),
20573
20826
  pathPrefix: zod.z.string().optional(),
20574
20827
  tags: zod.z.array(zod.z.string()).optional(),
20575
- groupBy: zod.z.enum(["page", "chunk"]).optional()
20828
+ groupBy: zod.z.enum(["page", "chunk"]).optional(),
20829
+ rerank: zod.z.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
20576
20830
  }
20577
20831
  },
20578
20832
  async (input) => {
@@ -20582,7 +20836,8 @@ function createServer(engine) {
20582
20836
  scope: input.scope,
20583
20837
  pathPrefix: input.pathPrefix,
20584
20838
  tags: input.tags,
20585
- groupBy: input.groupBy
20839
+ groupBy: input.groupBy,
20840
+ rerank: input.rerank ?? config.rerank.enabled
20586
20841
  });
20587
20842
  return {
20588
20843
  content: [
@@ -20708,10 +20963,10 @@ async function runMcpServer(options = {}) {
20708
20963
  config
20709
20964
  });
20710
20965
  if (resolvedTransport === "http") {
20711
- await startHttpServer(() => createServer(engine), config, options);
20966
+ await startHttpServer(() => createServer(engine, config), config, options);
20712
20967
  return;
20713
20968
  }
20714
- const server = createServer(engine);
20969
+ const server = createServer(engine, config);
20715
20970
  const stdioTransport = new stdio_js.StdioServerTransport();
20716
20971
  await server.connect(stdioTransport);
20717
20972
  }
@@ -20867,7 +21122,44 @@ function searchsocketHandle(options = {}) {
20867
21122
  throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
20868
21123
  }
20869
21124
  const engine = await getEngine();
20870
- const result = await engine.search(body);
21125
+ const searchRequest = body;
21126
+ if (searchRequest.stream && searchRequest.rerank) {
21127
+ const encoder = new TextEncoder();
21128
+ const stream = new ReadableStream({
21129
+ async start(controller) {
21130
+ try {
21131
+ for await (const event2 of engine.searchStreaming(searchRequest)) {
21132
+ const line = JSON.stringify(event2) + "\n";
21133
+ controller.enqueue(encoder.encode(line));
21134
+ }
21135
+ } catch (streamError) {
21136
+ const errorEvent = {
21137
+ phase: "error",
21138
+ data: {
21139
+ error: {
21140
+ code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
21141
+ message: streamError instanceof Error ? streamError.message : "Unknown error"
21142
+ }
21143
+ }
21144
+ };
21145
+ controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
21146
+ } finally {
21147
+ controller.close();
21148
+ }
21149
+ }
21150
+ });
21151
+ return withCors(
21152
+ new Response(stream, {
21153
+ status: 200,
21154
+ headers: {
21155
+ "content-type": "application/x-ndjson"
21156
+ }
21157
+ }),
21158
+ event.request,
21159
+ config
21160
+ );
21161
+ }
21162
+ const result = await engine.search(searchRequest);
20871
21163
  return withCors(
20872
21164
  new Response(JSON.stringify(result), {
20873
21165
  status: 200,
@@ -20980,7 +21272,7 @@ function searchsocketVitePlugin(options = {}) {
20980
21272
  });
20981
21273
  const stats = await pipeline.run({
20982
21274
  changedOnly: options.changedOnly ?? true,
20983
- force: options.force ?? false,
21275
+ force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
20984
21276
  dryRun: options.dryRun ?? false,
20985
21277
  scopeOverride: options.scope,
20986
21278
  verbose: options.verbose
@@ -20997,6 +21289,60 @@ function searchsocketVitePlugin(options = {}) {
20997
21289
  };
20998
21290
  }
20999
21291
 
21292
+ // src/merge.ts
21293
+ function mergeSearchResults(initial, reranked, options) {
21294
+ const maxDisplacement = options?.maxDisplacement ?? 3;
21295
+ const initialUrls = initial.results.map((r) => r.url);
21296
+ const rerankedUrls = reranked.results.map((r) => r.url);
21297
+ const initialPos = /* @__PURE__ */ new Map();
21298
+ for (let i = 0; i < initialUrls.length; i++) {
21299
+ initialPos.set(initialUrls[i], i);
21300
+ }
21301
+ const rerankedPos = /* @__PURE__ */ new Map();
21302
+ for (let i = 0; i < rerankedUrls.length; i++) {
21303
+ rerankedPos.set(rerankedUrls[i], i);
21304
+ }
21305
+ const displacements = [];
21306
+ for (const url of initialUrls) {
21307
+ const iPos = initialPos.get(url);
21308
+ const rPos = rerankedPos.get(url);
21309
+ const displacement = rPos !== void 0 ? Math.abs(iPos - rPos) : 0;
21310
+ displacements.push({ url, displacement });
21311
+ }
21312
+ const totalResults = displacements.length;
21313
+ if (totalResults === 0) {
21314
+ return {
21315
+ response: reranked,
21316
+ usedRerankedOrder: true,
21317
+ displacements
21318
+ };
21319
+ }
21320
+ const hasLargeDisplacement = displacements.some((d) => d.displacement > maxDisplacement);
21321
+ if (hasLargeDisplacement) {
21322
+ return {
21323
+ response: reranked,
21324
+ usedRerankedOrder: true,
21325
+ displacements
21326
+ };
21327
+ }
21328
+ const rerankedScoreMap = /* @__PURE__ */ new Map();
21329
+ for (const result of reranked.results) {
21330
+ rerankedScoreMap.set(result.url, result.score);
21331
+ }
21332
+ const mergedResults = initial.results.map((result) => ({
21333
+ ...result,
21334
+ score: rerankedScoreMap.get(result.url) ?? result.score
21335
+ }));
21336
+ return {
21337
+ response: {
21338
+ ...reranked,
21339
+ results: mergedResults
21340
+ },
21341
+ usedRerankedOrder: false,
21342
+ displacements
21343
+ };
21344
+ }
21345
+
21000
21346
  // src/client.ts
21001
21347
  function createSearchClient(options = {}) {
21002
21348
  const endpoint = options.endpoint ?? "/api/search";
@@ -21024,6 +21370,72 @@ function createSearchClient(options = {}) {
21024
21370
  throw new Error(message);
21025
21371
  }
21026
21372
  return payload;
21373
+ },
21374
+ async streamSearch(request, onPhase) {
21375
+ const response = await fetchImpl(endpoint, {
21376
+ method: "POST",
21377
+ headers: {
21378
+ "content-type": "application/json"
21379
+ },
21380
+ body: JSON.stringify(request)
21381
+ });
21382
+ if (!response.ok) {
21383
+ let payload;
21384
+ try {
21385
+ payload = await response.json();
21386
+ } catch {
21387
+ throw new Error("Search failed");
21388
+ }
21389
+ const message = payload.error?.message ?? "Search failed";
21390
+ throw new Error(message);
21391
+ }
21392
+ const contentType = response.headers.get("content-type") ?? "";
21393
+ if (contentType.includes("application/json")) {
21394
+ const data = await response.json();
21395
+ onPhase({ phase: "initial", data });
21396
+ return data;
21397
+ }
21398
+ if (!response.body) {
21399
+ throw new Error("Response body is not readable");
21400
+ }
21401
+ const reader = response.body.getReader();
21402
+ const decoder = new TextDecoder();
21403
+ let buffer = "";
21404
+ let lastResponse = null;
21405
+ for (; ; ) {
21406
+ const { done, value } = await reader.read();
21407
+ if (done) break;
21408
+ buffer += decoder.decode(value, { stream: true });
21409
+ let newlineIdx;
21410
+ while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
21411
+ const line = buffer.slice(0, newlineIdx).trim();
21412
+ buffer = buffer.slice(newlineIdx + 1);
21413
+ if (line.length === 0) continue;
21414
+ const event = JSON.parse(line);
21415
+ if (event.phase === "error") {
21416
+ const errData = event.data;
21417
+ throw new Error(errData.error.message ?? "Streaming search error");
21418
+ }
21419
+ const searchEvent = event;
21420
+ onPhase(searchEvent);
21421
+ lastResponse = searchEvent.data;
21422
+ }
21423
+ }
21424
+ const remaining = buffer.trim();
21425
+ if (remaining.length > 0) {
21426
+ const event = JSON.parse(remaining);
21427
+ if (event.phase === "error") {
21428
+ const errData = event.data;
21429
+ throw new Error(errData.error.message ?? "Streaming search error");
21430
+ }
21431
+ const searchEvent = event;
21432
+ onPhase(searchEvent);
21433
+ lastResponse = searchEvent.data;
21434
+ }
21435
+ if (!lastResponse) {
21436
+ throw new Error("No search results received");
21437
+ }
21438
+ return lastResponse;
21027
21439
  }
21028
21440
  };
21029
21441
  }
@@ -21050,6 +21462,7 @@ exports.isServerless = isServerless;
21050
21462
  exports.loadConfig = loadConfig;
21051
21463
  exports.mergeConfig = mergeConfig;
21052
21464
  exports.mergeConfigServerless = mergeConfigServerless;
21465
+ exports.mergeSearchResults = mergeSearchResults;
21053
21466
  exports.resolveScope = resolveScope;
21054
21467
  exports.runMcpServer = runMcpServer;
21055
21468
  exports.searchsocketHandle = searchsocketHandle;