searchsocket 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5013,32 +5013,32 @@ var require_URL = __commonJS({
5013
5013
  else
5014
5014
  return basepath.substring(0, lastslash + 1) + refpath;
5015
5015
  }
5016
- function remove_dot_segments(path14) {
5017
- if (!path14) return path14;
5016
+ function remove_dot_segments(path15) {
5017
+ if (!path15) return path15;
5018
5018
  var output = "";
5019
- while (path14.length > 0) {
5020
- if (path14 === "." || path14 === "..") {
5021
- path14 = "";
5019
+ while (path15.length > 0) {
5020
+ if (path15 === "." || path15 === "..") {
5021
+ path15 = "";
5022
5022
  break;
5023
5023
  }
5024
- var twochars = path14.substring(0, 2);
5025
- var threechars = path14.substring(0, 3);
5026
- var fourchars = path14.substring(0, 4);
5024
+ var twochars = path15.substring(0, 2);
5025
+ var threechars = path15.substring(0, 3);
5026
+ var fourchars = path15.substring(0, 4);
5027
5027
  if (threechars === "../") {
5028
- path14 = path14.substring(3);
5028
+ path15 = path15.substring(3);
5029
5029
  } else if (twochars === "./") {
5030
- path14 = path14.substring(2);
5030
+ path15 = path15.substring(2);
5031
5031
  } else if (threechars === "/./") {
5032
- path14 = "/" + path14.substring(3);
5033
- } else if (twochars === "/." && path14.length === 2) {
5034
- path14 = "/";
5035
- } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5036
- path14 = "/" + path14.substring(4);
5032
+ path15 = "/" + path15.substring(3);
5033
+ } else if (twochars === "/." && path15.length === 2) {
5034
+ path15 = "/";
5035
+ } else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
5036
+ path15 = "/" + path15.substring(4);
5037
5037
  output = output.replace(/\/?[^\/]*$/, "");
5038
5038
  } else {
5039
- var segment = path14.match(/(\/?([^\/]*))/)[0];
5039
+ var segment = path15.match(/(\/?([^\/]*))/)[0];
5040
5040
  output += segment;
5041
- path14 = path14.substring(segment.length);
5041
+ path15 = path15.substring(segment.length);
5042
5042
  }
5043
5043
  }
5044
5044
  return output;
@@ -16602,6 +16602,8 @@ var searchSocketConfigSchema = z.object({
16602
16602
  envVar: z.string().min(1).optional(),
16603
16603
  sanitize: z.boolean().optional()
16604
16604
  }).optional(),
16605
+ exclude: z.array(z.string()).optional(),
16606
+ respectRobotsTxt: z.boolean().optional(),
16605
16607
  source: z.object({
16606
16608
  mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
16607
16609
  staticOutputDir: z.string().min(1).optional(),
@@ -16732,6 +16734,8 @@ function createDefaultConfig(projectId) {
16732
16734
  envVar: "SEARCHSOCKET_SCOPE",
16733
16735
  sanitize: true
16734
16736
  },
16737
+ exclude: [],
16738
+ respectRobotsTxt: true,
16735
16739
  source: {
16736
16740
  mode: "static-output",
16737
16741
  staticOutputDir: "build",
@@ -16762,7 +16766,7 @@ function createDefaultConfig(projectId) {
16762
16766
  },
16763
16767
  embeddings: {
16764
16768
  provider: "jina",
16765
- model: "jina-embeddings-v3",
16769
+ model: "jina-embeddings-v5-text-small",
16766
16770
  apiKeyEnv: "JINA_API_KEY",
16767
16771
  batchSize: 64,
16768
16772
  concurrency: 4
@@ -16775,9 +16779,9 @@ function createDefaultConfig(projectId) {
16775
16779
  }
16776
16780
  },
16777
16781
  rerank: {
16778
- enabled: false,
16782
+ enabled: true,
16779
16783
  topN: 20,
16780
- model: "jina-reranker-v2-base-multilingual"
16784
+ model: "jina-reranker-v3"
16781
16785
  },
16782
16786
  ranking: {
16783
16787
  enableIncomingLinkBoost: true,
@@ -16896,6 +16900,8 @@ ${issues}`
16896
16900
  ...defaults.scope,
16897
16901
  ...parsed.scope
16898
16902
  },
16903
+ exclude: parsed.exclude ?? defaults.exclude,
16904
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
16899
16905
  source: {
16900
16906
  ...defaults.source,
16901
16907
  ...parsed.source,
@@ -19037,6 +19043,17 @@ function extractFromHtml(url, html, config) {
19037
19043
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
19038
19044
  return null;
19039
19045
  }
19046
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
19047
+ let weight;
19048
+ if (weightRaw !== void 0) {
19049
+ const parsed = Number(weightRaw);
19050
+ if (Number.isFinite(parsed) && parsed >= 0) {
19051
+ weight = parsed;
19052
+ }
19053
+ }
19054
+ if (weight === 0) {
19055
+ return null;
19056
+ }
19040
19057
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19041
19058
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19042
19059
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19092,7 +19109,8 @@ function extractFromHtml(url, html, config) {
19092
19109
  noindex: false,
19093
19110
  tags,
19094
19111
  description,
19095
- keywords
19112
+ keywords,
19113
+ weight
19096
19114
  };
19097
19115
  }
19098
19116
  function extractFromMarkdown(url, markdown, title) {
@@ -19105,6 +19123,14 @@ function extractFromMarkdown(url, markdown, title) {
19105
19123
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
19106
19124
  return null;
19107
19125
  }
19126
+ let mdWeight;
19127
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
19128
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
19129
+ mdWeight = rawWeight;
19130
+ }
19131
+ if (mdWeight === 0) {
19132
+ return null;
19133
+ }
19108
19134
  const content = parsed.content;
19109
19135
  const normalized = normalizeMarkdown(content);
19110
19136
  if (!normalizeText(normalized)) {
@@ -19127,7 +19153,8 @@ function extractFromMarkdown(url, markdown, title) {
19127
19153
  noindex: false,
19128
19154
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19129
19155
  description: fmDescription,
19130
- keywords: fmKeywords
19156
+ keywords: fmKeywords,
19157
+ weight: mdWeight
19131
19158
  };
19132
19159
  }
19133
19160
  function yamlString(value) {
@@ -19323,6 +19350,38 @@ var Logger = class {
19323
19350
  `);
19324
19351
  }
19325
19352
  };
19353
+
19354
+ // src/utils/pattern.ts
19355
/**
 * Tests a URL path against a single pattern.
 *
 * A single trailing slash is ignored on both the URL and the pattern
 * (the root "/" is left untouched). Three pattern shapes are recognised:
 *   - "<prefix>/**" matches the prefix itself and anything below it;
 *     a bare "/**" matches every URL.
 *   - "<prefix>/*"  matches exactly one non-empty segment below the prefix.
 *   - anything else must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} Whether the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Deep wildcard: the prefix itself or any descendant.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(base + "/");
  }

  // Single-level wildcard: exactly one extra path segment.
  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      return target !== "/" && !target.slice(1).includes("/");
    }
    const head = base + "/";
    if (!target.startsWith(head)) {
      return false;
    }
    const tail = target.slice(head.length);
    return tail.length > 0 && !tail.includes("/");
  }

  return target === glob;
}
19377
/**
 * Reports whether a URL path matches at least one of the given patterns.
 * Stops at the first matching pattern.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns understood by matchUrlPattern.
 * @returns {boolean} True when any pattern matches, false otherwise.
 */
function matchUrlPatterns(url, patterns) {
  let matched = false;
  for (const candidate of patterns) {
    if (matchUrlPattern(url, candidate)) {
      matched = true;
      break;
    }
  }
  return matched;
}
19383
+
19384
+ // src/indexing/sources/build/manifest-parser.ts
19326
19385
  function routeIdToFile(routeId) {
19327
19386
  if (routeId === "/") {
19328
19387
  return "src/routes/+page.svelte";
@@ -19396,15 +19455,7 @@ function expandDynamicUrl(url, value) {
19396
19455
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
19397
19456
  }
19398
19457
  function isExcluded(url, patterns) {
19399
- for (const pattern of patterns) {
19400
- if (pattern.endsWith("/*")) {
19401
- const prefix = pattern.slice(0, -1);
19402
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
19403
- } else if (url === pattern) {
19404
- return true;
19405
- }
19406
- }
19407
- return false;
19458
+ return matchUrlPatterns(url, patterns);
19408
19459
  }
19409
19460
  function findFreePort() {
19410
19461
  return new Promise((resolve, reject) => {
@@ -19820,6 +19871,158 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
19820
19871
  }
19821
19872
  return pages;
19822
19873
  }
19874
/**
 * Parses robots.txt text and returns the rule set that applies to `userAgent`.
 *
 * Consecutive `User-agent` lines form one group that shares the rules that
 * follow. A `User-agent` line that appears after at least one `Allow` or
 * `Disallow` line starts a NEW group, so rules from later groups never leak
 * into earlier agents. Directives other than `User-agent`, `Allow` and
 * `Disallow` (for example `Sitemap` or `Crawl-delay`) are ignored and do not
 * interrupt the current group.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name to look up (case-insensitive).
 * @returns {{disallow: string[], allow: string[]}} Rules of the agent-specific
 *   group when it has at least one rule, otherwise the rules of the `*` group,
 *   otherwise empty rule lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = new Map();
  let currentAgents = [];
  // True once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      if (!currentAgents.includes(agentName)) {
        currentAgents.push(agentName);
      }
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule ("Disallow:") closes the run of User-agent lines.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is ignored on purpose: it must not break the group.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
19909
/**
 * Decides whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * The longest matching `Disallow` pattern is compared with the longest
 * matching `Allow` pattern; the path is blocked only when the disallow
 * pattern is strictly longer (a tie resolves to "allowed").
 *
 * Patterns are prefix matches and support the two robots.txt special
 * characters: `*` matches any run of characters and a trailing `$` anchors
 * the pattern to the end of the path.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules3 - Parsed rules.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Linear wildcard matcher (no regex, so hostile patterns cannot backtrack).
  const matches = (pattern) => {
    const anchored = pattern.endsWith("$");
    const parts = (anchored ? pattern.slice(0, -1) : pattern).split("*");
    const first = parts[0];
    if (!urlPath.startsWith(first)) return false;
    let pos = first.length;
    if (parts.length === 1) {
      return !anchored || urlPath.length === pos;
    }
    const lastIdx = parts.length - 1;
    // Middle literals: take the leftmost occurrence after the previous one.
    for (let i = 1; i < lastIdx; i++) {
      const found = urlPath.indexOf(parts[i], pos);
      if (found === -1) return false;
      pos = found + parts[i].length;
    }
    const last = parts[lastIdx];
    if (!anchored) {
      return urlPath.indexOf(last, pos) !== -1;
    }
    // Anchored: the final literal must be the suffix and must not overlap
    // the text already consumed by earlier literals.
    return urlPath.length - last.length >= pos && urlPath.endsWith(last);
  };

  // Length of the longest matching pattern, or 0 when none matches.
  const longestMatch = (patterns) => {
    let best = 0;
    for (const pattern of patterns) {
      if (pattern.length > best && matches(pattern)) {
        best = pattern.length;
      }
    }
    return best;
  };

  const disallowLength = longestMatch(rules3.disallow);
  if (disallowLength === 0) return false;
  return longestMatch(rules3.allow) < disallowLength;
}
19925
/**
 * Reads `robots.txt` from a directory and parses it with parseRobotsTxt.
 * Best-effort: any failure while building the path, reading the file or
 * parsing it yields `null`.
 *
 * @param {string} dir - Directory expected to contain `robots.txt`.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed
 *   rules, or `null` when they could not be loaded.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsFile = path.join(dir, "robots.txt");
    const text = await fs4.readFile(robotsFile, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    // Deliberate best-effort: every failure is reported as "no rules loaded".
    rules = null;
  }
  return rules;
}
19933
/**
 * Fetches `/robots.txt` relative to `baseUrl` and parses it with parseRobotsTxt.
 * Best-effort: a non-OK response, an invalid base URL, a network error or a
 * parse error all yield `null`.
 *
 * @param {string} baseUrl - Base URL the `/robots.txt` path is resolved against.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed
 *   rules, or `null` when they could not be fetched.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const res = await fetch(robotsUrl.href);
    return res.ok ? parseRobotsTxt(await res.text()) : null;
  } catch {
    // Deliberate best-effort: every failure is reported as "no rules loaded".
    return null;
  }
}
19944
+
19945
+ // src/search/ranking.ts
19946
/**
 * Clamps a value to a non-negative finite number.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when it is a finite number >= 0; otherwise 0
 *   (negative numbers, NaN, infinities and non-numbers all map to 0).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
19952
/**
 * Scores vector-store hits and returns them ordered by descending final score.
 *
 * The base score is `hit.score` (non-finite scores become -Infinity). When
 * enabled in `config.ranking`, two additive boosts are applied:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth)        * weights.depth
 *
 * @param {Array<object>} hits - Hits exposing `score` and `metadata`.
 * @param {object} config - Configuration exposing `ranking`.
 * @returns {Array<{hit: object, finalScore: number}>} Scored hits, best first.
 */
function rankHits(hits, config) {
  const scoreHit = (hit) => {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    const { ranking } = config;
    if (ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * ranking.weights.depth;
    }
    const finalScore = Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY;
    return { hit, finalScore };
  };

  // -Infinity minus -Infinity is NaN; treat such pairs as equal.
  const byFinalScoreDesc = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  };

  return hits.map(scoreHit).sort(byFinalScoreDesc);
}
19972
/**
 * Looks up the ranking weight configured for a URL.
 *
 * Every key of `pageWeights` is treated as a matchUrlPattern pattern. Among
 * the matching patterns the one with the most characters wins; on equal
 * length the earlier entry is kept. Without any match the weight is 1.
 *
 * @param {string} url - URL path of the page.
 * @param {Object<string, number>} pageWeights - Pattern-to-weight mapping.
 * @returns {number} Weight of the longest matching pattern, or 1.
 */
function findPageWeight(url, pageWeights) {
  const [, weight] = Object.entries(pageWeights).reduce(
    (best, entry) =>
      matchUrlPattern(url, entry[0]) && entry[0].length > best[0].length ? entry : best,
    ["", 1]
  );
  return weight;
}
19983
/**
 * Collapses ranked chunk hits into one entry per page URL.
 *
 * For each URL the chunks are sorted best-first. The page score is the best
 * chunk score plus an aggregation bonus: each further chunk within
 * `ranking.aggregationCap` contributes `score * aggregationDecay^rank`, and
 * the sum is scaled by `ranking.weights.aggregation`. The score is then
 * multiplied by the page weight from findPageWeight; pages whose weight is
 * exactly 0 are dropped.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Scored chunk hits.
 * @param {object} config - Configuration exposing `ranking`.
 * @returns {Array<object>} Page entries (`url`, `title`, `routeFile`,
 *   `pageScore`, `bestChunk`, `matchingChunks`) ordered by descending score.
 */
function aggregateByPage(ranked, config) {
  // NaN differences (e.g. -Infinity minus -Infinity) are treated as ties.
  const descendingBy = (key) => (left, right) => {
    const diff = right[key] - left[key];
    return Number.isNaN(diff) ? 0 : diff;
  };

  const chunksByUrl = new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    const bucket = chunksByUrl.get(pageUrl);
    if (bucket) {
      bucket.push(entry);
    } else {
      chunksByUrl.set(pageUrl, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  chunksByUrl.forEach((chunks, url) => {
    chunks.sort(descendingBy("finalScore"));
    const [best] = chunks;
    const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // Rank 0 is the best chunk itself and is already counted in maxScore.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, rank) => {
      if (rank === 0) {
        return sum;
      }
      const chunkScore = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + chunkScore * Math.pow(aggregationDecay, rank);
    }, 0);

    let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      return; // zero weight removes the page from the results
    }
    if (pageWeight !== 1) {
      pageScore *= pageWeight;
    }

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  });

  return pages.sort(descendingBy("pageScore"));
}
19823
20026
 
19824
20027
  // src/utils/time.ts
19825
20028
  function nowIso() {
@@ -19831,9 +20034,10 @@ function hrTimeMs(start) {
19831
20034
 
19832
20035
  // src/indexing/pipeline.ts
19833
20036
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
19834
- "jina-embeddings-v3": 2e-5
20037
+ "jina-embeddings-v3": 2e-5,
20038
+ "jina-embeddings-v5-text-small": 5e-5
19835
20039
  };
19836
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20040
+ var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
19837
20041
  var IndexPipeline = class _IndexPipeline {
19838
20042
  cwd;
19839
20043
  config;
@@ -19911,6 +20115,53 @@ var IndexPipeline = class _IndexPipeline {
19911
20115
  }
19912
20116
  stageEnd("source", sourceStart);
19913
20117
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20118
+ const filterStart = stageStart();
20119
+ let filteredSourcePages = sourcePages;
20120
+ if (this.config.exclude.length > 0) {
20121
+ const beforeExclude = filteredSourcePages.length;
20122
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20123
+ const url = normalizeUrlPath(p.url);
20124
+ if (matchUrlPatterns(url, this.config.exclude)) {
20125
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
20126
+ return false;
20127
+ }
20128
+ return true;
20129
+ });
20130
+ const excludedCount = beforeExclude - filteredSourcePages.length;
20131
+ if (excludedCount > 0) {
20132
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
20133
+ }
20134
+ }
20135
+ if (this.config.respectRobotsTxt) {
20136
+ let robotsRules = null;
20137
+ if (sourceMode === "static-output") {
20138
+ robotsRules = await loadRobotsTxtFromDir(
20139
+ path.resolve(this.cwd, this.config.source.staticOutputDir)
20140
+ );
20141
+ } else if (sourceMode === "build" && this.config.source.build) {
20142
+ robotsRules = await loadRobotsTxtFromDir(
20143
+ path.resolve(this.cwd, this.config.source.build.outputDir)
20144
+ );
20145
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
20146
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
20147
+ }
20148
+ if (robotsRules) {
20149
+ const beforeRobots = filteredSourcePages.length;
20150
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20151
+ const url = normalizeUrlPath(p.url);
20152
+ if (isBlockedByRobots(url, robotsRules)) {
20153
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
20154
+ return false;
20155
+ }
20156
+ return true;
20157
+ });
20158
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
20159
+ if (robotsExcluded > 0) {
20160
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
20161
+ }
20162
+ }
20163
+ }
20164
+ stageEnd("filter", filterStart);
19914
20165
  const routeStart = stageStart();
19915
20166
  const routePatterns = await buildRoutePatterns(this.cwd);
19916
20167
  stageEnd("route_map", routeStart);
@@ -19918,7 +20169,7 @@ var IndexPipeline = class _IndexPipeline {
19918
20169
  const extractStart = stageStart();
19919
20170
  this.logger.info("Extracting content...");
19920
20171
  const extractedPages = [];
19921
- for (const sourcePage of sourcePages) {
20172
+ for (const sourcePage of filteredSourcePages) {
19922
20173
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
19923
20174
  if (!extracted) {
19924
20175
  this.logger.warn(
@@ -19944,16 +20195,29 @@ var IndexPipeline = class _IndexPipeline {
19944
20195
  seenUrls.add(page.url);
19945
20196
  uniquePages.push(page);
19946
20197
  }
20198
+ const indexablePages = [];
20199
+ for (const page of uniquePages) {
20200
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
20201
+ if (effectiveWeight === 0) {
20202
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
20203
+ continue;
20204
+ }
20205
+ indexablePages.push(page);
20206
+ }
20207
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
20208
+ if (zeroWeightCount > 0) {
20209
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
20210
+ }
19947
20211
  stageEnd("extract", extractStart);
19948
- const skippedPages = sourcePages.length - uniquePages.length;
19949
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20212
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
20213
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
19950
20214
  const linkStart = stageStart();
19951
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20215
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
19952
20216
  const incomingLinkCount = /* @__PURE__ */ new Map();
19953
- for (const page of uniquePages) {
20217
+ for (const page of indexablePages) {
19954
20218
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
19955
20219
  }
19956
- for (const page of uniquePages) {
20220
+ for (const page of indexablePages) {
19957
20221
  for (const outgoing of page.outgoingLinks) {
19958
20222
  if (!pageSet.has(outgoing)) {
19959
20223
  continue;
@@ -19977,7 +20241,7 @@ var IndexPipeline = class _IndexPipeline {
19977
20241
  });
19978
20242
  }
19979
20243
  }
19980
- for (const page of uniquePages) {
20244
+ for (const page of indexablePages) {
19981
20245
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
19982
20246
  if (routeMatch.routeResolution === "best-effort") {
19983
20247
  if (this.config.source.strictRouteMapping) {
@@ -20194,100 +20458,6 @@ var IndexPipeline = class _IndexPipeline {
20194
20458
  };
20195
20459
  }
20196
20460
  };
20197
-
20198
- // src/search/ranking.ts
20199
- function nonNegativeOrZero(value) {
20200
- if (!Number.isFinite(value)) {
20201
- return 0;
20202
- }
20203
- return Math.max(0, value);
20204
- }
20205
- function rankHits(hits, config) {
20206
- return hits.map((hit) => {
20207
- let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
20208
- if (config.ranking.enableIncomingLinkBoost) {
20209
- const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
20210
- score += incomingBoost * config.ranking.weights.incomingLinks;
20211
- }
20212
- if (config.ranking.enableDepthBoost) {
20213
- const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
20214
- score += depthBoost * config.ranking.weights.depth;
20215
- }
20216
- return {
20217
- hit,
20218
- finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
20219
- };
20220
- }).sort((a, b) => {
20221
- const delta = b.finalScore - a.finalScore;
20222
- return Number.isNaN(delta) ? 0 : delta;
20223
- });
20224
- }
20225
- function findPageWeight(url, pageWeights) {
20226
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
20227
- const normalizedUrl = norm(url);
20228
- for (const [pattern, weight] of Object.entries(pageWeights)) {
20229
- if (norm(pattern) === normalizedUrl) {
20230
- return weight;
20231
- }
20232
- }
20233
- let bestPrefix = "";
20234
- let bestWeight = 1;
20235
- for (const [pattern, weight] of Object.entries(pageWeights)) {
20236
- const normalizedPattern = norm(pattern);
20237
- if (normalizedPattern === "/") continue;
20238
- const prefix = `${normalizedPattern}/`;
20239
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
20240
- bestPrefix = prefix;
20241
- bestWeight = weight;
20242
- }
20243
- }
20244
- return bestWeight;
20245
- }
20246
- function aggregateByPage(ranked, config) {
20247
- const groups = /* @__PURE__ */ new Map();
20248
- for (const hit of ranked) {
20249
- const url = hit.hit.metadata.url;
20250
- const group = groups.get(url);
20251
- if (group) group.push(hit);
20252
- else groups.set(url, [hit]);
20253
- }
20254
- const { aggregationCap, aggregationDecay } = config.ranking;
20255
- const pages = [];
20256
- for (const [url, chunks] of groups) {
20257
- chunks.sort((a, b) => {
20258
- const delta = b.finalScore - a.finalScore;
20259
- return Number.isNaN(delta) ? 0 : delta;
20260
- });
20261
- const best = chunks[0];
20262
- const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
20263
- const topChunks = chunks.slice(0, aggregationCap);
20264
- let aggregationBonus = 0;
20265
- for (let i = 1; i < topChunks.length; i++) {
20266
- const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
20267
- aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
20268
- }
20269
- let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
20270
- const pageWeight = findPageWeight(url, config.ranking.pageWeights);
20271
- if (pageWeight === 0) continue;
20272
- if (pageWeight !== 1) {
20273
- pageScore *= pageWeight;
20274
- }
20275
- pages.push({
20276
- url,
20277
- title: best.hit.metadata.title,
20278
- routeFile: best.hit.metadata.routeFile,
20279
- pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
20280
- bestChunk: best,
20281
- matchingChunks: chunks
20282
- });
20283
- }
20284
- return pages.sort((a, b) => {
20285
- const delta = b.pageScore - a.pageScore;
20286
- return Number.isNaN(delta) ? 0 : delta;
20287
- });
20288
- }
20289
-
20290
- // src/search/engine.ts
20291
20461
  var requestSchema = z.object({
20292
20462
  q: z.string().trim().min(1),
20293
20463
  topK: z.number().int().positive().max(100).optional(),
@@ -20295,7 +20465,8 @@ var requestSchema = z.object({
20295
20465
  pathPrefix: z.string().optional(),
20296
20466
  tags: z.array(z.string()).optional(),
20297
20467
  rerank: z.boolean().optional(),
20298
- groupBy: z.enum(["page", "chunk"]).optional()
20468
+ groupBy: z.enum(["page", "chunk"]).optional(),
20469
+ stream: z.boolean().optional()
20299
20470
  });
20300
20471
  var SearchEngine = class _SearchEngine {
20301
20472
  cwd;
@@ -20368,7 +20539,103 @@ var SearchEngine = class _SearchEngine {
20368
20539
  rerankMs = hrTimeMs(rerankStart);
20369
20540
  usedRerank = true;
20370
20541
  }
20371
- let results;
20542
+ const results = this.buildResults(ordered, topK, groupByPage);
20543
+ return {
20544
+ q: input.q,
20545
+ scope: resolvedScope.scopeName,
20546
+ results,
20547
+ meta: {
20548
+ timingsMs: {
20549
+ embed: Math.round(embedMs),
20550
+ vector: Math.round(vectorMs),
20551
+ rerank: Math.round(rerankMs),
20552
+ total: Math.round(hrTimeMs(totalStart))
20553
+ },
20554
+ usedRerank,
20555
+ modelId: this.config.embeddings.model
20556
+ }
20557
+ };
20558
+ }
20559
+ async *searchStreaming(request) {
20560
+ const parsed = requestSchema.safeParse(request);
20561
+ if (!parsed.success) {
20562
+ throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
20563
+ }
20564
+ const input = parsed.data;
20565
+ const wantsRerank = Boolean(input.rerank);
20566
+ if (!wantsRerank) {
20567
+ const response = await this.search(request);
20568
+ yield { phase: "initial", data: response };
20569
+ return;
20570
+ }
20571
+ const totalStart = process.hrtime.bigint();
20572
+ const resolvedScope = resolveScope(this.config, input.scope);
20573
+ await this.assertModelCompatibility(resolvedScope);
20574
+ const topK = input.topK ?? 10;
20575
+ const groupByPage = (input.groupBy ?? "page") === "page";
20576
+ const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
20577
+ const embedStart = process.hrtime.bigint();
20578
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
20579
+ const queryVector = queryEmbeddings[0];
20580
+ if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
20581
+ throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
20582
+ }
20583
+ const embedMs = hrTimeMs(embedStart);
20584
+ const vectorStart = process.hrtime.bigint();
20585
+ const hits = await this.vectorStore.query(
20586
+ queryVector,
20587
+ {
20588
+ topK: candidateK,
20589
+ pathPrefix: input.pathPrefix,
20590
+ tags: input.tags
20591
+ },
20592
+ resolvedScope
20593
+ );
20594
+ const vectorMs = hrTimeMs(vectorStart);
20595
+ const ranked = rankHits(hits, this.config);
20596
+ const initialResults = this.buildResults(ranked, topK, groupByPage);
20597
+ yield {
20598
+ phase: "initial",
20599
+ data: {
20600
+ q: input.q,
20601
+ scope: resolvedScope.scopeName,
20602
+ results: initialResults,
20603
+ meta: {
20604
+ timingsMs: {
20605
+ embed: Math.round(embedMs),
20606
+ vector: Math.round(vectorMs),
20607
+ rerank: 0,
20608
+ total: Math.round(hrTimeMs(totalStart))
20609
+ },
20610
+ usedRerank: false,
20611
+ modelId: this.config.embeddings.model
20612
+ }
20613
+ }
20614
+ };
20615
+ const rerankStart = process.hrtime.bigint();
20616
+ const reranked = await this.rerankHits(input.q, ranked, topK);
20617
+ const rerankMs = hrTimeMs(rerankStart);
20618
+ const rerankedResults = this.buildResults(reranked, topK, groupByPage);
20619
+ yield {
20620
+ phase: "reranked",
20621
+ data: {
20622
+ q: input.q,
20623
+ scope: resolvedScope.scopeName,
20624
+ results: rerankedResults,
20625
+ meta: {
20626
+ timingsMs: {
20627
+ embed: Math.round(embedMs),
20628
+ vector: Math.round(vectorMs),
20629
+ rerank: Math.round(rerankMs),
20630
+ total: Math.round(hrTimeMs(totalStart))
20631
+ },
20632
+ usedRerank: true,
20633
+ modelId: this.config.embeddings.model
20634
+ }
20635
+ }
20636
+ };
20637
+ }
20638
+ buildResults(ordered, topK, groupByPage) {
20372
20639
  const minScore = this.config.ranking.minScore;
20373
20640
  if (groupByPage) {
20374
20641
  let pages = aggregateByPage(ordered, this.config);
@@ -20376,10 +20643,10 @@ var SearchEngine = class _SearchEngine {
20376
20643
  pages = pages.filter((p) => p.pageScore >= minScore);
20377
20644
  }
20378
20645
  const minRatio = this.config.ranking.minChunkScoreRatio;
20379
- results = pages.slice(0, topK).map((page) => {
20646
+ return pages.slice(0, topK).map((page) => {
20380
20647
  const bestScore = page.bestChunk.finalScore;
20381
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20382
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
20648
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20649
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
20383
20650
  return {
20384
20651
  url: page.url,
20385
20652
  title: page.title,
@@ -20396,10 +20663,11 @@ var SearchEngine = class _SearchEngine {
20396
20663
  };
20397
20664
  });
20398
20665
  } else {
20666
+ let filtered = ordered;
20399
20667
  if (minScore > 0) {
20400
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
20668
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
20401
20669
  }
20402
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
20670
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
20403
20671
  url: hit.metadata.url,
20404
20672
  title: hit.metadata.title,
20405
20673
  sectionTitle: hit.metadata.sectionTitle || void 0,
@@ -20408,21 +20676,6 @@ var SearchEngine = class _SearchEngine {
20408
20676
  routeFile: hit.metadata.routeFile
20409
20677
  }));
20410
20678
  }
20411
- return {
20412
- q: input.q,
20413
- scope: resolvedScope.scopeName,
20414
- results,
20415
- meta: {
20416
- timingsMs: {
20417
- embed: Math.round(embedMs),
20418
- vector: Math.round(vectorMs),
20419
- rerank: Math.round(rerankMs),
20420
- total: Math.round(hrTimeMs(totalStart))
20421
- },
20422
- usedRerank,
20423
- modelId: this.config.embeddings.model
20424
- }
20425
- };
20426
20679
  }
20427
20680
  async getPage(pathOrUrl, scope) {
20428
20681
  const resolvedScope = resolveScope(this.config, scope);
@@ -20545,7 +20798,7 @@ var SearchEngine = class _SearchEngine {
20545
20798
  });
20546
20799
  }
20547
20800
  };
20548
- function createServer(engine) {
20801
+ function createServer(engine, config) {
20549
20802
  const server = new McpServer({
20550
20803
  name: "searchsocket-mcp",
20551
20804
  version: "0.1.0"
@@ -20553,14 +20806,15 @@ function createServer(engine) {
20553
20806
  server.registerTool(
20554
20807
  "search",
20555
20808
  {
20556
- description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and topK.",
20809
+ description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
20557
20810
  inputSchema: {
20558
20811
  query: z.string().min(1),
20559
20812
  scope: z.string().optional(),
20560
20813
  topK: z.number().int().positive().max(100).optional(),
20561
20814
  pathPrefix: z.string().optional(),
20562
20815
  tags: z.array(z.string()).optional(),
20563
- groupBy: z.enum(["page", "chunk"]).optional()
20816
+ groupBy: z.enum(["page", "chunk"]).optional(),
20817
+ rerank: z.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
20564
20818
  }
20565
20819
  },
20566
20820
  async (input) => {
@@ -20570,7 +20824,8 @@ function createServer(engine) {
20570
20824
  scope: input.scope,
20571
20825
  pathPrefix: input.pathPrefix,
20572
20826
  tags: input.tags,
20573
- groupBy: input.groupBy
20827
+ groupBy: input.groupBy,
20828
+ rerank: input.rerank ?? config.rerank.enabled
20574
20829
  });
20575
20830
  return {
20576
20831
  content: [
@@ -20696,10 +20951,10 @@ async function runMcpServer(options = {}) {
20696
20951
  config
20697
20952
  });
20698
20953
  if (resolvedTransport === "http") {
20699
- await startHttpServer(() => createServer(engine), config, options);
20954
+ await startHttpServer(() => createServer(engine, config), config, options);
20700
20955
  return;
20701
20956
  }
20702
- const server = createServer(engine);
20957
+ const server = createServer(engine, config);
20703
20958
  const stdioTransport = new StdioServerTransport();
20704
20959
  await server.connect(stdioTransport);
20705
20960
  }
@@ -20855,7 +21110,44 @@ function searchsocketHandle(options = {}) {
20855
21110
  throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
20856
21111
  }
20857
21112
  const engine = await getEngine();
20858
- const result = await engine.search(body);
21113
+ const searchRequest = body;
21114
+ if (searchRequest.stream && searchRequest.rerank) {
21115
+ const encoder = new TextEncoder();
21116
+ const stream = new ReadableStream({
21117
+ async start(controller) {
21118
+ try {
21119
+ for await (const event2 of engine.searchStreaming(searchRequest)) {
21120
+ const line = JSON.stringify(event2) + "\n";
21121
+ controller.enqueue(encoder.encode(line));
21122
+ }
21123
+ } catch (streamError) {
21124
+ const errorEvent = {
21125
+ phase: "error",
21126
+ data: {
21127
+ error: {
21128
+ code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
21129
+ message: streamError instanceof Error ? streamError.message : "Unknown error"
21130
+ }
21131
+ }
21132
+ };
21133
+ controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
21134
+ } finally {
21135
+ controller.close();
21136
+ }
21137
+ }
21138
+ });
21139
+ return withCors(
21140
+ new Response(stream, {
21141
+ status: 200,
21142
+ headers: {
21143
+ "content-type": "application/x-ndjson"
21144
+ }
21145
+ }),
21146
+ event.request,
21147
+ config
21148
+ );
21149
+ }
21150
+ const result = await engine.search(searchRequest);
20859
21151
  return withCors(
20860
21152
  new Response(JSON.stringify(result), {
20861
21153
  status: 200,
@@ -20968,7 +21260,7 @@ function searchsocketVitePlugin(options = {}) {
20968
21260
  });
20969
21261
  const stats = await pipeline.run({
20970
21262
  changedOnly: options.changedOnly ?? true,
20971
- force: options.force ?? false,
21263
+ force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
20972
21264
  dryRun: options.dryRun ?? false,
20973
21265
  scopeOverride: options.scope,
20974
21266
  verbose: options.verbose
@@ -20985,6 +21277,60 @@ function searchsocketVitePlugin(options = {}) {
20985
21277
  };
20986
21278
  }
20987
21279
 
21280
+ // src/merge.ts
21281
/**
 * Reconcile an initial search response with its reranked counterpart.
 *
 * For every result of `initial`, the displacement is the absolute distance
 * between its position in `initial` and the position of the matching result
 * in `reranked`. Results are matched by URL; when a URL occurs several times
 * in a list, the k-th occurrence in `initial` is paired with the k-th
 * occurrence in `reranked`, so duplicates no longer collapse onto the last
 * entry. A result with no counterpart in `reranked` gets displacement 0.
 *
 * - If `initial` has no results, or any displacement exceeds
 *   `maxDisplacement`, `reranked` is returned unchanged.
 * - Otherwise the initial ordering is kept and each result takes the score
 *   of its reranked counterpart (falling back to its own score when there is
 *   no counterpart or the counterpart's score is nullish). All other response
 *   fields are copied from `reranked`.
 *
 * @param {{ results: Array<{ url: string, score: number }> }} initial
 *   Response holding the pre-rerank ordering.
 * @param {{ results: Array<{ url: string, score: number }> }} reranked
 *   Response holding the reranked ordering.
 * @param {{ maxDisplacement?: number }} [options]
 *   `maxDisplacement` defaults to 3.
 * @returns {{
 *   response: object,
 *   usedRerankedOrder: boolean,
 *   displacements: Array<{ url: string, displacement: number }>
 * }}
 */
function mergeSearchResults(initial, reranked, options) {
  const maxDisplacement = options?.maxDisplacement ?? 3;

  // URL -> every reranked occurrence (position + score), in reranked order.
  const rerankedByUrl = new Map();
  reranked.results.forEach((result, position) => {
    const entry = { position, score: result.score };
    const bucket = rerankedByUrl.get(result.url);
    if (bucket) {
      bucket.push(entry);
    } else {
      rerankedByUrl.set(result.url, [entry]);
    }
  });

  // Pair each initial result with the reranked occurrence of the same rank
  // for that URL; `undefined` means the reranked list has no counterpart.
  const occurrencesSeen = new Map();
  const matches = initial.results.map((result) => {
    const occurrence = occurrencesSeen.get(result.url) ?? 0;
    occurrencesSeen.set(result.url, occurrence + 1);
    return rerankedByUrl.get(result.url)?.[occurrence];
  });

  const displacements = initial.results.map((result, position) => {
    const match = matches[position];
    return {
      url: result.url,
      displacement: match === undefined ? 0 : Math.abs(position - match.position)
    };
  });

  const mustUseRerankedOrder =
    displacements.length === 0 ||
    displacements.some((entry) => entry.displacement > maxDisplacement);
  if (mustUseRerankedOrder) {
    return {
      response: reranked,
      usedRerankedOrder: true,
      displacements
    };
  }

  // Small reshuffle only: keep the initial order, adopt the reranked scores.
  const mergedResults = initial.results.map((result, position) => ({
    ...result,
    score: matches[position]?.score ?? result.score
  }));
  return {
    response: {
      ...reranked,
      results: mergedResults
    },
    usedRerankedOrder: false,
    displacements
  };
}
21333
+
20988
21334
  // src/client.ts
20989
21335
  function createSearchClient(options = {}) {
20990
21336
  const endpoint = options.endpoint ?? "/api/search";
@@ -21012,6 +21358,72 @@ function createSearchClient(options = {}) {
21012
21358
  throw new Error(message);
21013
21359
  }
21014
21360
  return payload;
21361
+ },
21362
+ async streamSearch(request, onPhase) {
21363
+ const response = await fetchImpl(endpoint, {
21364
+ method: "POST",
21365
+ headers: {
21366
+ "content-type": "application/json"
21367
+ },
21368
+ body: JSON.stringify(request)
21369
+ });
21370
+ if (!response.ok) {
21371
+ let payload;
21372
+ try {
21373
+ payload = await response.json();
21374
+ } catch {
21375
+ throw new Error("Search failed");
21376
+ }
21377
+ const message = payload.error?.message ?? "Search failed";
21378
+ throw new Error(message);
21379
+ }
21380
+ const contentType = response.headers.get("content-type") ?? "";
21381
+ if (contentType.includes("application/json")) {
21382
+ const data = await response.json();
21383
+ onPhase({ phase: "initial", data });
21384
+ return data;
21385
+ }
21386
+ if (!response.body) {
21387
+ throw new Error("Response body is not readable");
21388
+ }
21389
+ const reader = response.body.getReader();
21390
+ const decoder = new TextDecoder();
21391
+ let buffer = "";
21392
+ let lastResponse = null;
21393
+ for (; ; ) {
21394
+ const { done, value } = await reader.read();
21395
+ if (done) break;
21396
+ buffer += decoder.decode(value, { stream: true });
21397
+ let newlineIdx;
21398
+ while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
21399
+ const line = buffer.slice(0, newlineIdx).trim();
21400
+ buffer = buffer.slice(newlineIdx + 1);
21401
+ if (line.length === 0) continue;
21402
+ const event = JSON.parse(line);
21403
+ if (event.phase === "error") {
21404
+ const errData = event.data;
21405
+ throw new Error(errData.error.message ?? "Streaming search error");
21406
+ }
21407
+ const searchEvent = event;
21408
+ onPhase(searchEvent);
21409
+ lastResponse = searchEvent.data;
21410
+ }
21411
+ }
21412
+ const remaining = buffer.trim();
21413
+ if (remaining.length > 0) {
21414
+ const event = JSON.parse(remaining);
21415
+ if (event.phase === "error") {
21416
+ const errData = event.data;
21417
+ throw new Error(errData.error.message ?? "Streaming search error");
21418
+ }
21419
+ const searchEvent = event;
21420
+ onPhase(searchEvent);
21421
+ lastResponse = searchEvent.data;
21422
+ }
21423
+ if (!lastResponse) {
21424
+ throw new Error("No search results received");
21425
+ }
21426
+ return lastResponse;
21015
21427
  }
21016
21428
  };
21017
21429
  }
@@ -21027,6 +21439,6 @@ function createSearchClient(options = {}) {
21027
21439
  *)
21028
21440
  */
21029
21441
 
21030
- export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
21442
+ export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, mergeSearchResults, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
21031
21443
  //# sourceMappingURL=index.js.map
21032
21444
  //# sourceMappingURL=index.js.map