searchsocket 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,33 @@
1
1
  'use strict';
2
2
 
3
- var fs = require('fs');
3
+ var crypto = require('crypto');
4
+ var fs9 = require('fs/promises');
4
5
  var path = require('path');
6
+ var webStandardStreamableHttp_js = require('@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js');
7
+ var fs = require('fs');
5
8
  var jiti = require('jiti');
6
9
  var zod = require('zod');
10
+ var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
11
+ require('@modelcontextprotocol/sdk/server/stdio.js');
12
+ require('@modelcontextprotocol/sdk/server/streamableHttp.js');
13
+ require('@modelcontextprotocol/sdk/server/express.js');
7
14
  var child_process = require('child_process');
8
- var crypto = require('crypto');
15
+ var vector = require('@upstash/vector');
9
16
  var cheerio = require('cheerio');
10
17
  var matter = require('gray-matter');
11
18
  var fg = require('fast-glob');
12
19
  var pLimit = require('p-limit');
13
- var fs3 = require('fs/promises');
14
20
  var net = require('net');
15
21
  var zlib = require('zlib');
16
22
 
17
23
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
18
24
 
19
- var fs__default = /*#__PURE__*/_interopDefault(fs);
25
+ var fs9__default = /*#__PURE__*/_interopDefault(fs9);
20
26
  var path__default = /*#__PURE__*/_interopDefault(path);
27
+ var fs__default = /*#__PURE__*/_interopDefault(fs);
21
28
  var matter__default = /*#__PURE__*/_interopDefault(matter);
22
29
  var fg__default = /*#__PURE__*/_interopDefault(fg);
23
30
  var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
24
- var fs3__default = /*#__PURE__*/_interopDefault(fs3);
25
31
  var net__default = /*#__PURE__*/_interopDefault(net);
26
32
 
27
33
  var __getOwnPropNames = Object.getOwnPropertyNames;
@@ -5021,32 +5027,32 @@ var require_URL = __commonJS({
5021
5027
  else
5022
5028
  return basepath.substring(0, lastslash + 1) + refpath;
5023
5029
  }
5024
- function remove_dot_segments(path13) {
5025
- if (!path13) return path13;
5030
+ function remove_dot_segments(path14) {
5031
+ if (!path14) return path14;
5026
5032
  var output = "";
5027
- while (path13.length > 0) {
5028
- if (path13 === "." || path13 === "..") {
5029
- path13 = "";
5033
+ while (path14.length > 0) {
5034
+ if (path14 === "." || path14 === "..") {
5035
+ path14 = "";
5030
5036
  break;
5031
5037
  }
5032
- var twochars = path13.substring(0, 2);
5033
- var threechars = path13.substring(0, 3);
5034
- var fourchars = path13.substring(0, 4);
5038
+ var twochars = path14.substring(0, 2);
5039
+ var threechars = path14.substring(0, 3);
5040
+ var fourchars = path14.substring(0, 4);
5035
5041
  if (threechars === "../") {
5036
- path13 = path13.substring(3);
5042
+ path14 = path14.substring(3);
5037
5043
  } else if (twochars === "./") {
5038
- path13 = path13.substring(2);
5044
+ path14 = path14.substring(2);
5039
5045
  } else if (threechars === "/./") {
5040
- path13 = "/" + path13.substring(3);
5041
- } else if (twochars === "/." && path13.length === 2) {
5042
- path13 = "/";
5043
- } else if (fourchars === "/../" || threechars === "/.." && path13.length === 3) {
5044
- path13 = "/" + path13.substring(4);
5046
+ path14 = "/" + path14.substring(3);
5047
+ } else if (twochars === "/." && path14.length === 2) {
5048
+ path14 = "/";
5049
+ } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5050
+ path14 = "/" + path14.substring(4);
5045
5051
  output = output.replace(/\/?[^\/]*$/, "");
5046
5052
  } else {
5047
- var segment = path13.match(/(\/?([^\/]*))/)[0];
5053
+ var segment = path14.match(/(\/?([^\/]*))/)[0];
5048
5054
  output += segment;
5049
- path13 = path13.substring(segment.length);
5055
+ path14 = path14.substring(segment.length);
5050
5056
  }
5051
5057
  }
5052
5058
  return output;
@@ -16642,6 +16648,7 @@ var searchSocketConfigSchema = zod.z.object({
16642
16648
  dropSelectors: zod.z.array(zod.z.string()).optional(),
16643
16649
  ignoreAttr: zod.z.string().optional(),
16644
16650
  noindexAttr: zod.z.string().optional(),
16651
+ imageDescAttr: zod.z.string().optional(),
16645
16652
  respectRobotsNoindex: zod.z.boolean().optional()
16646
16653
  }).optional(),
16647
16654
  transform: zod.z.object({
@@ -16657,35 +16664,48 @@ var searchSocketConfigSchema = zod.z.object({
16657
16664
  headingPathDepth: zod.z.number().int().positive().optional(),
16658
16665
  dontSplitInside: zod.z.array(zod.z.enum(["code", "table", "blockquote"])).optional(),
16659
16666
  prependTitle: zod.z.boolean().optional(),
16660
- pageSummaryChunk: zod.z.boolean().optional()
16667
+ pageSummaryChunk: zod.z.boolean().optional(),
16668
+ weightHeadings: zod.z.boolean().optional()
16661
16669
  }).optional(),
16662
16670
  upstash: zod.z.object({
16663
16671
  url: zod.z.string().url().optional(),
16664
16672
  token: zod.z.string().min(1).optional(),
16665
16673
  urlEnv: zod.z.string().min(1).optional(),
16666
- tokenEnv: zod.z.string().min(1).optional()
16674
+ tokenEnv: zod.z.string().min(1).optional(),
16675
+ namespaces: zod.z.object({
16676
+ pages: zod.z.string().min(1).optional(),
16677
+ chunks: zod.z.string().min(1).optional()
16678
+ }).optional()
16679
+ }).optional(),
16680
+ embedding: zod.z.object({
16681
+ model: zod.z.string().optional(),
16682
+ dimensions: zod.z.number().int().positive().optional(),
16683
+ taskType: zod.z.string().optional(),
16684
+ batchSize: zod.z.number().int().positive().optional()
16667
16685
  }).optional(),
16668
16686
  search: zod.z.object({
16669
- semanticWeight: zod.z.number().min(0).max(1).optional(),
16670
- inputEnrichment: zod.z.boolean().optional(),
16671
- reranking: zod.z.boolean().optional(),
16672
16687
  dualSearch: zod.z.boolean().optional(),
16673
16688
  pageSearchWeight: zod.z.number().min(0).max(1).optional()
16674
16689
  }).optional(),
16675
16690
  ranking: zod.z.object({
16676
16691
  enableIncomingLinkBoost: zod.z.boolean().optional(),
16677
16692
  enableDepthBoost: zod.z.boolean().optional(),
16693
+ enableFreshnessBoost: zod.z.boolean().optional(),
16694
+ freshnessDecayRate: zod.z.number().positive().optional(),
16695
+ enableAnchorTextBoost: zod.z.boolean().optional(),
16678
16696
  pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
16679
16697
  aggregationCap: zod.z.number().int().positive().optional(),
16680
16698
  aggregationDecay: zod.z.number().min(0).max(1).optional(),
16681
16699
  minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
16682
- minScore: zod.z.number().min(0).max(1).optional(),
16700
+ minScoreRatio: zod.z.number().min(0).max(1).optional(),
16683
16701
  scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
16684
16702
  weights: zod.z.object({
16685
16703
  incomingLinks: zod.z.number().optional(),
16686
16704
  depth: zod.z.number().optional(),
16687
16705
  aggregation: zod.z.number().optional(),
16688
- titleMatch: zod.z.number().optional()
16706
+ titleMatch: zod.z.number().optional(),
16707
+ freshness: zod.z.number().optional(),
16708
+ anchorText: zod.z.number().optional()
16689
16709
  }).optional()
16690
16710
  }).optional(),
16691
16711
  api: zod.z.object({
@@ -16700,12 +16720,28 @@ var searchSocketConfigSchema = zod.z.object({
16700
16720
  }).optional(),
16701
16721
  mcp: zod.z.object({
16702
16722
  enable: zod.z.boolean().optional(),
16723
+ access: zod.z.enum(["public", "private"]).optional(),
16703
16724
  transport: zod.z.enum(["stdio", "http"]).optional(),
16704
16725
  http: zod.z.object({
16705
16726
  port: zod.z.number().int().positive().optional(),
16706
- path: zod.z.string().optional()
16727
+ path: zod.z.string().optional(),
16728
+ apiKey: zod.z.string().min(1).optional(),
16729
+ apiKeyEnv: zod.z.string().min(1).optional()
16730
+ }).optional(),
16731
+ handle: zod.z.object({
16732
+ path: zod.z.string().optional(),
16733
+ apiKey: zod.z.string().min(1).optional(),
16734
+ enableJsonResponse: zod.z.boolean().optional()
16707
16735
  }).optional()
16708
16736
  }).optional(),
16737
+ llmsTxt: zod.z.object({
16738
+ enable: zod.z.boolean().optional(),
16739
+ outputPath: zod.z.string().optional(),
16740
+ title: zod.z.string().optional(),
16741
+ description: zod.z.string().optional(),
16742
+ generateFull: zod.z.boolean().optional(),
16743
+ serveMarkdownVariants: zod.z.boolean().optional()
16744
+ }).optional(),
16709
16745
  state: zod.z.object({
16710
16746
  dir: zod.z.string().optional()
16711
16747
  }).optional()
@@ -16744,6 +16780,7 @@ function createDefaultConfig(projectId) {
16744
16780
  dropSelectors: DEFAULT_DROP_SELECTORS,
16745
16781
  ignoreAttr: "data-search-ignore",
16746
16782
  noindexAttr: "data-search-noindex",
16783
+ imageDescAttr: "data-search-description",
16747
16784
  respectRobotsNoindex: true
16748
16785
  },
16749
16786
  transform: {
@@ -16753,39 +16790,52 @@ function createDefaultConfig(projectId) {
16753
16790
  },
16754
16791
  chunking: {
16755
16792
  strategy: "hybrid",
16756
- maxChars: 2200,
16793
+ maxChars: 1500,
16757
16794
  overlapChars: 200,
16758
16795
  minChars: 250,
16759
16796
  headingPathDepth: 3,
16760
16797
  dontSplitInside: ["code", "table", "blockquote"],
16761
16798
  prependTitle: true,
16762
- pageSummaryChunk: true
16799
+ pageSummaryChunk: true,
16800
+ weightHeadings: true
16763
16801
  },
16764
16802
  upstash: {
16765
- urlEnv: "UPSTASH_SEARCH_REST_URL",
16766
- tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
16803
+ urlEnv: "UPSTASH_VECTOR_REST_URL",
16804
+ tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
16805
+ namespaces: {
16806
+ pages: "pages",
16807
+ chunks: "chunks"
16808
+ }
16809
+ },
16810
+ embedding: {
16811
+ model: "bge-large-en-v1.5",
16812
+ dimensions: 1024,
16813
+ taskType: "RETRIEVAL_DOCUMENT",
16814
+ batchSize: 100
16767
16815
  },
16768
16816
  search: {
16769
- semanticWeight: 0.75,
16770
- inputEnrichment: true,
16771
- reranking: true,
16772
16817
  dualSearch: true,
16773
16818
  pageSearchWeight: 0.3
16774
16819
  },
16775
16820
  ranking: {
16776
16821
  enableIncomingLinkBoost: true,
16777
16822
  enableDepthBoost: true,
16823
+ enableFreshnessBoost: false,
16824
+ freshnessDecayRate: 1e-3,
16825
+ enableAnchorTextBoost: false,
16778
16826
  pageWeights: {},
16779
16827
  aggregationCap: 5,
16780
16828
  aggregationDecay: 0.5,
16781
16829
  minChunkScoreRatio: 0.5,
16782
- minScore: 0.3,
16830
+ minScoreRatio: 0.7,
16783
16831
  scoreGapThreshold: 0.4,
16784
16832
  weights: {
16785
16833
  incomingLinks: 0.05,
16786
16834
  depth: 0.03,
16787
16835
  aggregation: 0.1,
16788
- titleMatch: 0.15
16836
+ titleMatch: 0.15,
16837
+ freshness: 0.1,
16838
+ anchorText: 0.1
16789
16839
  }
16790
16840
  },
16791
16841
  api: {
@@ -16796,12 +16846,23 @@ function createDefaultConfig(projectId) {
16796
16846
  },
16797
16847
  mcp: {
16798
16848
  enable: process.env.NODE_ENV !== "production",
16849
+ access: "private",
16799
16850
  transport: "stdio",
16800
16851
  http: {
16801
16852
  port: 3338,
16802
16853
  path: "/mcp"
16854
+ },
16855
+ handle: {
16856
+ path: "/api/mcp",
16857
+ enableJsonResponse: true
16803
16858
  }
16804
16859
  },
16860
+ llmsTxt: {
16861
+ enable: false,
16862
+ outputPath: "static/llms.txt",
16863
+ generateFull: true,
16864
+ serveMarkdownVariants: false
16865
+ },
16805
16866
  state: {
16806
16867
  dir: ".searchsocket"
16807
16868
  }
@@ -16929,7 +16990,15 @@ ${issues}`
16929
16990
  },
16930
16991
  upstash: {
16931
16992
  ...defaults.upstash,
16932
- ...parsed.upstash
16993
+ ...parsed.upstash,
16994
+ namespaces: {
16995
+ ...defaults.upstash.namespaces,
16996
+ ...parsed.upstash?.namespaces
16997
+ }
16998
+ },
16999
+ embedding: {
17000
+ ...defaults.embedding,
17001
+ ...parsed.embedding
16933
17002
  },
16934
17003
  search: {
16935
17004
  ...defaults.search,
@@ -16966,8 +17035,16 @@ ${issues}`
16966
17035
  http: {
16967
17036
  ...defaults.mcp.http,
16968
17037
  ...parsed.mcp?.http
17038
+ },
17039
+ handle: {
17040
+ ...defaults.mcp.handle,
17041
+ ...parsed.mcp?.handle
16969
17042
  }
16970
17043
  },
17044
+ llmsTxt: {
17045
+ ...defaults.llmsTxt,
17046
+ ...parsed.llmsTxt
17047
+ },
16971
17048
  state: {
16972
17049
  ...defaults.state,
16973
17050
  ...parsed.state
@@ -16987,6 +17064,15 @@ ${issues}`
16987
17064
  maxDepth: 10
16988
17065
  };
16989
17066
  }
17067
+ if (merged.mcp.access === "public") {
17068
+ const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
17069
+ if (!resolvedKey) {
17070
+ throw new SearchSocketError(
17071
+ "CONFIG_MISSING",
17072
+ '`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
17073
+ );
17074
+ }
17075
+ }
16990
17076
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
16991
17077
  throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
16992
17078
  }
@@ -17035,13 +17121,84 @@ function normalizeMarkdown(input) {
17035
17121
  function sanitizeScopeName(scopeName) {
17036
17122
  return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
17037
17123
  }
17124
+ function markdownToPlain(markdown) {
17125
+ return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
17126
+ }
17038
17127
  function toSnippet(markdown, maxLen = 220) {
17039
- const plain = markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
17128
+ const plain = markdownToPlain(markdown);
17040
17129
  if (plain.length <= maxLen) {
17041
17130
  return plain;
17042
17131
  }
17043
17132
  return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
17044
17133
  }
17134
+ function queryAwareExcerpt(markdown, query, maxLen = 220) {
17135
+ const plain = markdownToPlain(markdown);
17136
+ if (plain.length <= maxLen) return plain;
17137
+ const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
17138
+ if (tokens.length === 0) return toSnippet(markdown, maxLen);
17139
+ const positions = [];
17140
+ for (let ti = 0; ti < tokens.length; ti++) {
17141
+ const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
17142
+ const re = new RegExp(escaped, "gi");
17143
+ let m;
17144
+ while ((m = re.exec(plain)) !== null) {
17145
+ positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
17146
+ }
17147
+ }
17148
+ if (positions.length === 0) return toSnippet(markdown, maxLen);
17149
+ positions.sort((a, b) => a.start - b.start);
17150
+ let bestUniqueCount = 0;
17151
+ let bestTotalCount = 0;
17152
+ let bestLeft = 0;
17153
+ let bestRight = 0;
17154
+ let left = 0;
17155
+ const tokenCounts = /* @__PURE__ */ new Map();
17156
+ for (let right = 0; right < positions.length; right++) {
17157
+ tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
17158
+ while (positions[right].end - positions[left].start > maxLen && left < right) {
17159
+ const leftToken = positions[left].tokenIdx;
17160
+ const cnt = tokenCounts.get(leftToken) - 1;
17161
+ if (cnt === 0) tokenCounts.delete(leftToken);
17162
+ else tokenCounts.set(leftToken, cnt);
17163
+ left++;
17164
+ }
17165
+ const uniqueCount = tokenCounts.size;
17166
+ const totalCount = right - left + 1;
17167
+ if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
17168
+ bestUniqueCount = uniqueCount;
17169
+ bestTotalCount = totalCount;
17170
+ bestLeft = left;
17171
+ bestRight = right;
17172
+ }
17173
+ }
17174
+ const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
17175
+ let start = Math.max(0, mid - Math.floor(maxLen / 2));
17176
+ let end = Math.min(plain.length, start + maxLen);
17177
+ start = Math.max(0, end - maxLen);
17178
+ if (start > 0) {
17179
+ const spaceIdx = plain.lastIndexOf(" ", start);
17180
+ if (spaceIdx > start - 30) {
17181
+ start = spaceIdx + 1;
17182
+ }
17183
+ }
17184
+ if (end < plain.length) {
17185
+ const spaceIdx = plain.indexOf(" ", end);
17186
+ if (spaceIdx !== -1 && spaceIdx < end + 30) {
17187
+ end = spaceIdx;
17188
+ }
17189
+ }
17190
+ let excerpt = plain.slice(start, end);
17191
+ if (excerpt.length > Math.ceil(maxLen * 1.2)) {
17192
+ excerpt = excerpt.slice(0, maxLen);
17193
+ const lastSpace = excerpt.lastIndexOf(" ");
17194
+ if (lastSpace > maxLen * 0.5) {
17195
+ excerpt = excerpt.slice(0, lastSpace);
17196
+ }
17197
+ }
17198
+ const prefix = start > 0 ? "\u2026" : "";
17199
+ const suffix = end < plain.length ? "\u2026" : "";
17200
+ return `${prefix}${excerpt}${suffix}`;
17201
+ }
17045
17202
  function extractFirstParagraph(markdown) {
17046
17203
  const lines = markdown.split("\n");
17047
17204
  let inFence = false;
@@ -17148,162 +17305,342 @@ function joinUrl(baseUrl, route) {
17148
17305
  const routePart = ensureLeadingSlash(route);
17149
17306
  return `${base}${routePart}`;
17150
17307
  }
17151
-
17152
- // src/vector/upstash.ts
17153
- function chunkIndexName(scope) {
17154
- return `${scope.projectId}--${scope.scopeName}`;
17155
- }
17156
- function pageIndexName(scope) {
17157
- return `${scope.projectId}--${scope.scopeName}--pages`;
17158
- }
17159
17308
  var UpstashSearchStore = class {
17160
- client;
17309
+ index;
17310
+ pagesNs;
17311
+ chunksNs;
17161
17312
  constructor(opts) {
17162
- this.client = opts.client;
17163
- }
17164
- chunkIndex(scope) {
17165
- return this.client.index(chunkIndexName(scope));
17166
- }
17167
- pageIndex(scope) {
17168
- return this.client.index(pageIndexName(scope));
17313
+ this.index = opts.index;
17314
+ this.pagesNs = opts.index.namespace(opts.pagesNamespace);
17315
+ this.chunksNs = opts.index.namespace(opts.chunksNamespace);
17169
17316
  }
17170
17317
  async upsertChunks(chunks, scope) {
17171
17318
  if (chunks.length === 0) return;
17172
- const index = this.chunkIndex(scope);
17173
- const BATCH_SIZE = 100;
17319
+ const BATCH_SIZE = 90;
17174
17320
  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
17175
17321
  const batch = chunks.slice(i, i + BATCH_SIZE);
17176
- await index.upsert(batch);
17177
- }
17178
- }
17179
- async search(query, opts, scope) {
17180
- const index = this.chunkIndex(scope);
17181
- const results = await index.search({
17182
- query,
17183
- limit: opts.limit,
17184
- semanticWeight: opts.semanticWeight,
17185
- inputEnrichment: opts.inputEnrichment,
17186
- reranking: opts.reranking,
17187
- filter: opts.filter
17322
+ await this.chunksNs.upsert(
17323
+ batch.map((c) => ({
17324
+ id: c.id,
17325
+ data: c.data,
17326
+ metadata: {
17327
+ ...c.metadata,
17328
+ projectId: scope.projectId,
17329
+ scopeName: scope.scopeName,
17330
+ type: c.metadata.type || "chunk"
17331
+ }
17332
+ }))
17333
+ );
17334
+ }
17335
+ }
17336
+ async search(data, opts, scope) {
17337
+ const filterParts = [
17338
+ `projectId = '${scope.projectId}'`,
17339
+ `scopeName = '${scope.scopeName}'`
17340
+ ];
17341
+ if (opts.filter) {
17342
+ filterParts.push(opts.filter);
17343
+ }
17344
+ const results = await this.chunksNs.query({
17345
+ data,
17346
+ topK: opts.limit,
17347
+ includeMetadata: true,
17348
+ filter: filterParts.join(" AND "),
17349
+ queryMode: vector.QueryMode.HYBRID,
17350
+ fusionAlgorithm: vector.FusionAlgorithm.DBSF
17351
+ });
17352
+ return results.map((doc) => ({
17353
+ id: String(doc.id),
17354
+ score: doc.score,
17355
+ metadata: {
17356
+ projectId: doc.metadata?.projectId ?? "",
17357
+ scopeName: doc.metadata?.scopeName ?? "",
17358
+ url: doc.metadata?.url ?? "",
17359
+ path: doc.metadata?.path ?? "",
17360
+ title: doc.metadata?.title ?? "",
17361
+ sectionTitle: doc.metadata?.sectionTitle ?? "",
17362
+ headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
17363
+ snippet: doc.metadata?.snippet ?? "",
17364
+ chunkText: doc.metadata?.chunkText ?? "",
17365
+ ordinal: doc.metadata?.ordinal ?? 0,
17366
+ contentHash: doc.metadata?.contentHash ?? "",
17367
+ depth: doc.metadata?.depth ?? 0,
17368
+ incomingLinks: doc.metadata?.incomingLinks ?? 0,
17369
+ routeFile: doc.metadata?.routeFile ?? "",
17370
+ tags: doc.metadata?.tags ?? [],
17371
+ description: doc.metadata?.description || void 0,
17372
+ keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
17373
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
17374
+ incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
17375
+ }
17376
+ }));
17377
+ }
17378
+ async searchChunksByUrl(data, url, opts, scope) {
17379
+ const filterParts = [
17380
+ `projectId = '${scope.projectId}'`,
17381
+ `scopeName = '${scope.scopeName}'`,
17382
+ `url = '${url}'`
17383
+ ];
17384
+ if (opts.filter) {
17385
+ filterParts.push(opts.filter);
17386
+ }
17387
+ const results = await this.chunksNs.query({
17388
+ data,
17389
+ topK: opts.limit,
17390
+ includeMetadata: true,
17391
+ filter: filterParts.join(" AND "),
17392
+ queryMode: vector.QueryMode.HYBRID,
17393
+ fusionAlgorithm: vector.FusionAlgorithm.DBSF
17188
17394
  });
17189
17395
  return results.map((doc) => ({
17190
- id: doc.id,
17396
+ id: String(doc.id),
17191
17397
  score: doc.score,
17192
17398
  metadata: {
17193
17399
  projectId: doc.metadata?.projectId ?? "",
17194
17400
  scopeName: doc.metadata?.scopeName ?? "",
17195
- url: doc.content.url,
17401
+ url: doc.metadata?.url ?? "",
17196
17402
  path: doc.metadata?.path ?? "",
17197
- title: doc.content.title,
17198
- sectionTitle: doc.content.sectionTitle,
17199
- headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
17403
+ title: doc.metadata?.title ?? "",
17404
+ sectionTitle: doc.metadata?.sectionTitle ?? "",
17405
+ headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
17200
17406
  snippet: doc.metadata?.snippet ?? "",
17201
- chunkText: doc.content.text,
17407
+ chunkText: doc.metadata?.chunkText ?? "",
17202
17408
  ordinal: doc.metadata?.ordinal ?? 0,
17203
17409
  contentHash: doc.metadata?.contentHash ?? "",
17204
17410
  depth: doc.metadata?.depth ?? 0,
17205
17411
  incomingLinks: doc.metadata?.incomingLinks ?? 0,
17206
17412
  routeFile: doc.metadata?.routeFile ?? "",
17207
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17413
+ tags: doc.metadata?.tags ?? [],
17208
17414
  description: doc.metadata?.description || void 0,
17209
- keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
17415
+ keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
17416
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
17417
+ incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
17210
17418
  }
17211
17419
  }));
17212
17420
  }
17213
- async searchPages(query, opts, scope) {
17214
- const index = this.pageIndex(scope);
17421
+ async searchPagesByText(data, opts, scope) {
17422
+ return this.queryPages({ data }, opts, scope);
17423
+ }
17424
+ async searchPagesByVector(vector, opts, scope) {
17425
+ return this.queryPages({ vector }, opts, scope);
17426
+ }
17427
+ async queryPages(input, opts, scope) {
17428
+ const filterParts = [
17429
+ `projectId = '${scope.projectId}'`,
17430
+ `scopeName = '${scope.scopeName}'`
17431
+ ];
17432
+ if (opts.filter) {
17433
+ filterParts.push(opts.filter);
17434
+ }
17215
17435
  let results;
17216
17436
  try {
17217
- results = await index.search({
17218
- query,
17219
- limit: opts.limit,
17220
- semanticWeight: opts.semanticWeight,
17221
- inputEnrichment: opts.inputEnrichment,
17222
- reranking: true,
17223
- filter: opts.filter
17437
+ results = await this.pagesNs.query({
17438
+ ...input,
17439
+ topK: opts.limit,
17440
+ includeMetadata: true,
17441
+ filter: filterParts.join(" AND "),
17442
+ queryMode: vector.QueryMode.HYBRID,
17443
+ fusionAlgorithm: vector.FusionAlgorithm.DBSF
17224
17444
  });
17225
17445
  } catch {
17226
17446
  return [];
17227
17447
  }
17228
17448
  return results.map((doc) => ({
17229
- id: doc.id,
17449
+ id: String(doc.id),
17230
17450
  score: doc.score,
17231
- title: doc.content.title,
17232
- url: doc.content.url,
17233
- description: doc.content.description ?? "",
17234
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17451
+ title: doc.metadata?.title ?? "",
17452
+ url: doc.metadata?.url ?? "",
17453
+ description: doc.metadata?.description ?? "",
17454
+ tags: doc.metadata?.tags ?? [],
17235
17455
  depth: doc.metadata?.depth ?? 0,
17236
17456
  incomingLinks: doc.metadata?.incomingLinks ?? 0,
17237
- routeFile: doc.metadata?.routeFile ?? ""
17457
+ routeFile: doc.metadata?.routeFile ?? "",
17458
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
17238
17459
  }));
17239
17460
  }
17240
- async deleteByIds(ids, scope) {
17461
+ async deleteByIds(ids, _scope) {
17241
17462
  if (ids.length === 0) return;
17242
- const index = this.chunkIndex(scope);
17243
- const BATCH_SIZE = 500;
17463
+ const BATCH_SIZE = 90;
17244
17464
  for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17245
17465
  const batch = ids.slice(i, i + BATCH_SIZE);
17246
- await index.delete(batch);
17466
+ await this.chunksNs.delete(batch);
17247
17467
  }
17248
17468
  }
17249
17469
  async deleteScope(scope) {
17250
- try {
17251
- const chunkIdx = this.chunkIndex(scope);
17252
- await chunkIdx.deleteIndex();
17253
- } catch {
17254
- }
17255
- try {
17256
- const pageIdx = this.pageIndex(scope);
17257
- await pageIdx.deleteIndex();
17258
- } catch {
17470
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17471
+ const ids = [];
17472
+ let cursor = "0";
17473
+ try {
17474
+ for (; ; ) {
17475
+ const result = await ns.range({
17476
+ cursor,
17477
+ limit: 100,
17478
+ includeMetadata: true
17479
+ });
17480
+ for (const doc of result.vectors) {
17481
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17482
+ ids.push(String(doc.id));
17483
+ }
17484
+ }
17485
+ if (!result.nextCursor || result.nextCursor === "0") break;
17486
+ cursor = result.nextCursor;
17487
+ }
17488
+ } catch {
17489
+ }
17490
+ if (ids.length > 0) {
17491
+ const BATCH_SIZE = 90;
17492
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17493
+ const batch = ids.slice(i, i + BATCH_SIZE);
17494
+ await ns.delete(batch);
17495
+ }
17496
+ }
17259
17497
  }
17260
17498
  }
17261
17499
  async listScopes(projectId) {
17262
- const allIndexes = await this.client.listIndexes();
17263
- const prefix = `${projectId}--`;
17264
- const scopeNames = /* @__PURE__ */ new Set();
17265
- for (const name of allIndexes) {
17266
- if (name.startsWith(prefix) && !name.endsWith("--pages")) {
17267
- const scopeName = name.slice(prefix.length);
17268
- scopeNames.add(scopeName);
17269
- }
17270
- }
17271
- const scopes = [];
17272
- for (const scopeName of scopeNames) {
17273
- const scope = {
17274
- projectId,
17275
- scopeName,
17276
- scopeId: `${projectId}:${scopeName}`
17277
- };
17500
+ const scopeMap = /* @__PURE__ */ new Map();
17501
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17502
+ let cursor = "0";
17503
+ try {
17504
+ for (; ; ) {
17505
+ const result = await ns.range({
17506
+ cursor,
17507
+ limit: 100,
17508
+ includeMetadata: true
17509
+ });
17510
+ for (const doc of result.vectors) {
17511
+ if (doc.metadata?.projectId === projectId) {
17512
+ const scopeName = doc.metadata.scopeName ?? "";
17513
+ scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
17514
+ }
17515
+ }
17516
+ if (!result.nextCursor || result.nextCursor === "0") break;
17517
+ cursor = result.nextCursor;
17518
+ }
17519
+ } catch {
17520
+ }
17521
+ }
17522
+ return [...scopeMap.entries()].map(([scopeName, count]) => ({
17523
+ projectId,
17524
+ scopeName,
17525
+ lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
17526
+ documentCount: count
17527
+ }));
17528
+ }
17529
+ async getContentHashes(scope) {
17530
+ return this.scanHashes(this.chunksNs, scope);
17531
+ }
17532
+ /**
17533
+ * Fetch content hashes for a specific set of chunk keys using direct fetch()
17534
+ * instead of range(). This avoids potential issues with range() returning
17535
+ * vectors from the wrong namespace on hybrid indexes.
17536
+ */
17537
+ async fetchContentHashesForKeys(keys, scope) {
17538
+ const map = /* @__PURE__ */ new Map();
17539
+ if (keys.length === 0) return map;
17540
+ const BATCH_SIZE = 90;
17541
+ for (let i = 0; i < keys.length; i += BATCH_SIZE) {
17542
+ const batch = keys.slice(i, i + BATCH_SIZE);
17278
17543
  try {
17279
- const info = await this.chunkIndex(scope).info();
17280
- scopes.push({
17281
- projectId,
17282
- scopeName,
17283
- lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
17284
- documentCount: info.documentCount
17544
+ const results = await this.chunksNs.fetch(batch, {
17545
+ includeMetadata: true
17285
17546
  });
17547
+ for (const doc of results) {
17548
+ if (doc && doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17549
+ map.set(String(doc.id), doc.metadata.contentHash);
17550
+ }
17551
+ }
17286
17552
  } catch {
17287
- scopes.push({
17288
- projectId,
17289
- scopeName,
17290
- lastIndexedAt: "unknown",
17291
- documentCount: 0
17553
+ }
17554
+ }
17555
+ return map;
17556
+ }
17557
+ /**
17558
+ * Scan all IDs in the chunks namespace for this scope.
17559
+ * Used for deletion detection (finding stale chunk keys).
17560
+ */
17561
+ async scanChunkIds(scope) {
17562
+ const ids = /* @__PURE__ */ new Set();
17563
+ let cursor = "0";
17564
+ try {
17565
+ for (; ; ) {
17566
+ const result = await this.chunksNs.range({
17567
+ cursor,
17568
+ limit: 100,
17569
+ includeMetadata: true
17292
17570
  });
17571
+ for (const doc of result.vectors) {
17572
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17573
+ ids.add(String(doc.id));
17574
+ }
17575
+ }
17576
+ if (!result.nextCursor || result.nextCursor === "0") break;
17577
+ cursor = result.nextCursor;
17293
17578
  }
17579
+ } catch {
17294
17580
  }
17295
- return scopes;
17581
+ return ids;
17296
17582
  }
17297
- async getContentHashes(scope) {
17583
+ async scanHashes(ns, scope) {
17584
+ const map = /* @__PURE__ */ new Map();
17585
+ let cursor = "0";
17586
+ try {
17587
+ for (; ; ) {
17588
+ const result = await ns.range({
17589
+ cursor,
17590
+ limit: 100,
17591
+ includeMetadata: true
17592
+ });
17593
+ for (const doc of result.vectors) {
17594
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17595
+ map.set(String(doc.id), doc.metadata.contentHash);
17596
+ }
17597
+ }
17598
+ if (!result.nextCursor || result.nextCursor === "0") break;
17599
+ cursor = result.nextCursor;
17600
+ }
17601
+ } catch {
17602
+ }
17603
+ return map;
17604
+ }
17605
+ async listPages(scope, opts) {
17606
+ const cursor = opts?.cursor ?? "0";
17607
+ const limit = opts?.limit ?? 50;
17608
+ try {
17609
+ const result = await this.pagesNs.range({
17610
+ cursor,
17611
+ limit,
17612
+ includeMetadata: true
17613
+ });
17614
+ const pages = result.vectors.filter(
17615
+ (doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
17616
+ ).map((doc) => ({
17617
+ url: doc.metadata?.url ?? "",
17618
+ title: doc.metadata?.title ?? "",
17619
+ description: doc.metadata?.description ?? "",
17620
+ routeFile: doc.metadata?.routeFile ?? ""
17621
+ }));
17622
+ const response = { pages };
17623
+ if (result.nextCursor && result.nextCursor !== "0") {
17624
+ response.nextCursor = result.nextCursor;
17625
+ }
17626
+ return response;
17627
+ } catch {
17628
+ return { pages: [] };
17629
+ }
17630
+ }
17631
+ async getPageHashes(scope) {
17298
17632
  const map = /* @__PURE__ */ new Map();
17299
- const index = this.chunkIndex(scope);
17300
17633
  let cursor = "0";
17301
17634
  try {
17302
17635
  for (; ; ) {
17303
- const result = await index.range({ cursor, limit: 100 });
17304
- for (const doc of result.documents) {
17305
- if (doc.metadata?.contentHash) {
17306
- map.set(doc.id, doc.metadata.contentHash);
17636
+ const result = await this.pagesNs.range({
17637
+ cursor,
17638
+ limit: 100,
17639
+ includeMetadata: true
17640
+ });
17641
+ for (const doc of result.vectors) {
17642
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17643
+ map.set(String(doc.id), doc.metadata.contentHash);
17307
17644
  }
17308
17645
  }
17309
17646
  if (!result.nextCursor || result.nextCursor === "0") break;
@@ -17313,47 +17650,43 @@ var UpstashSearchStore = class {
17313
17650
  }
17314
17651
  return map;
17315
17652
  }
17653
+ async deletePagesByIds(ids, _scope) {
17654
+ if (ids.length === 0) return;
17655
+ const BATCH_SIZE = 90;
17656
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17657
+ const batch = ids.slice(i, i + BATCH_SIZE);
17658
+ await this.pagesNs.delete(batch);
17659
+ }
17660
+ }
17316
17661
  async upsertPages(pages, scope) {
17317
17662
  if (pages.length === 0) return;
17318
- const index = this.pageIndex(scope);
17319
- const BATCH_SIZE = 50;
17663
+ const BATCH_SIZE = 90;
17320
17664
  for (let i = 0; i < pages.length; i += BATCH_SIZE) {
17321
17665
  const batch = pages.slice(i, i + BATCH_SIZE);
17322
- const docs = batch.map((p) => ({
17323
- id: p.url,
17324
- content: {
17325
- title: p.title,
17326
- url: p.url,
17327
- type: "page",
17328
- description: p.description ?? "",
17329
- keywords: (p.keywords ?? []).join(","),
17330
- summary: p.summary ?? "",
17331
- tags: p.tags.join(",")
17332
- },
17333
- metadata: {
17334
- markdown: p.markdown,
17335
- projectId: p.projectId,
17336
- scopeName: p.scopeName,
17337
- routeFile: p.routeFile,
17338
- routeResolution: p.routeResolution,
17339
- incomingLinks: p.incomingLinks,
17340
- outgoingLinks: p.outgoingLinks,
17341
- depth: p.depth,
17342
- indexedAt: p.indexedAt
17343
- }
17344
- }));
17345
- await index.upsert(docs);
17666
+ await this.pagesNs.upsert(
17667
+ batch.map((p) => ({
17668
+ id: p.id,
17669
+ data: p.data,
17670
+ metadata: {
17671
+ ...p.metadata,
17672
+ projectId: scope.projectId,
17673
+ scopeName: scope.scopeName,
17674
+ type: "page"
17675
+ }
17676
+ }))
17677
+ );
17346
17678
  }
17347
17679
  }
17348
17680
  async getPage(url, scope) {
17349
- const index = this.pageIndex(scope);
17350
17681
  try {
17351
- const results = await index.fetch([url]);
17682
+ const results = await this.pagesNs.fetch([url], {
17683
+ includeMetadata: true
17684
+ });
17352
17685
  const doc = results[0];
17353
- if (!doc) return null;
17686
+ if (!doc || !doc.metadata) return null;
17354
17687
  return {
17355
- url: doc.content.url,
17356
- title: doc.content.title,
17688
+ url: doc.metadata.url,
17689
+ title: doc.metadata.title,
17357
17690
  markdown: doc.metadata.markdown,
17358
17691
  projectId: doc.metadata.projectId,
17359
17692
  scopeName: doc.metadata.scopeName,
@@ -17361,27 +17694,86 @@ var UpstashSearchStore = class {
17361
17694
  routeResolution: doc.metadata.routeResolution,
17362
17695
  incomingLinks: doc.metadata.incomingLinks,
17363
17696
  outgoingLinks: doc.metadata.outgoingLinks,
17697
+ outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
17364
17698
  depth: doc.metadata.depth,
17365
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17699
+ tags: doc.metadata.tags ?? [],
17366
17700
  indexedAt: doc.metadata.indexedAt,
17367
- summary: doc.content.summary || void 0,
17368
- description: doc.content.description || void 0,
17369
- keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
17701
+ summary: doc.metadata.summary || void 0,
17702
+ description: doc.metadata.description || void 0,
17703
+ keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
17704
+ publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
17370
17705
  };
17371
17706
  } catch {
17372
17707
  return null;
17373
17708
  }
17374
17709
  }
17710
+ async fetchPageWithVector(url, scope) {
17711
+ try {
17712
+ const results = await this.pagesNs.fetch([url], {
17713
+ includeMetadata: true,
17714
+ includeVectors: true
17715
+ });
17716
+ const doc = results[0];
17717
+ if (!doc || !doc.metadata || !doc.vector) return null;
17718
+ if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
17719
+ return null;
17720
+ }
17721
+ return { metadata: doc.metadata, vector: doc.vector };
17722
+ } catch {
17723
+ return null;
17724
+ }
17725
+ }
17726
+ async fetchPagesBatch(urls, scope) {
17727
+ if (urls.length === 0) return [];
17728
+ try {
17729
+ const results = await this.pagesNs.fetch(urls, {
17730
+ includeMetadata: true
17731
+ });
17732
+ const out = [];
17733
+ for (const doc of results) {
17734
+ if (!doc || !doc.metadata) continue;
17735
+ if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
17736
+ continue;
17737
+ }
17738
+ out.push({
17739
+ url: doc.metadata.url,
17740
+ title: doc.metadata.title,
17741
+ routeFile: doc.metadata.routeFile,
17742
+ outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
17743
+ });
17744
+ }
17745
+ return out;
17746
+ } catch {
17747
+ return [];
17748
+ }
17749
+ }
17375
17750
  async deletePages(scope) {
17751
+ const ids = [];
17752
+ let cursor = "0";
17376
17753
  try {
17377
- const index = this.pageIndex(scope);
17378
- await index.reset();
17754
+ for (; ; ) {
17755
+ const result = await this.pagesNs.range({
17756
+ cursor,
17757
+ limit: 100,
17758
+ includeMetadata: true
17759
+ });
17760
+ for (const doc of result.vectors) {
17761
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17762
+ ids.push(String(doc.id));
17763
+ }
17764
+ }
17765
+ if (!result.nextCursor || result.nextCursor === "0") break;
17766
+ cursor = result.nextCursor;
17767
+ }
17379
17768
  } catch {
17380
17769
  }
17770
+ if (ids.length > 0) {
17771
+ await this.deletePagesByIds(ids, scope);
17772
+ }
17381
17773
  }
17382
17774
  async health() {
17383
17775
  try {
17384
- await this.client.info();
17776
+ await this.index.info();
17385
17777
  return { ok: true };
17386
17778
  } catch (error) {
17387
17779
  return {
@@ -17391,14 +17783,31 @@ var UpstashSearchStore = class {
17391
17783
  }
17392
17784
  }
17393
17785
  async dropAllIndexes(projectId) {
17394
- const allIndexes = await this.client.listIndexes();
17395
- const prefix = `${projectId}--`;
17396
- for (const name of allIndexes) {
17397
- if (name.startsWith(prefix)) {
17398
- try {
17399
- const index = this.client.index(name);
17400
- await index.deleteIndex();
17401
- } catch {
17786
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17787
+ const ids = [];
17788
+ let cursor = "0";
17789
+ try {
17790
+ for (; ; ) {
17791
+ const result = await ns.range({
17792
+ cursor,
17793
+ limit: 100,
17794
+ includeMetadata: true
17795
+ });
17796
+ for (const doc of result.vectors) {
17797
+ if (doc.metadata?.projectId === projectId) {
17798
+ ids.push(String(doc.id));
17799
+ }
17800
+ }
17801
+ if (!result.nextCursor || result.nextCursor === "0") break;
17802
+ cursor = result.nextCursor;
17803
+ }
17804
+ } catch {
17805
+ }
17806
+ if (ids.length > 0) {
17807
+ const BATCH_SIZE = 90;
17808
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17809
+ const batch = ids.slice(i, i + BATCH_SIZE);
17810
+ await ns.delete(batch);
17402
17811
  }
17403
17812
  }
17404
17813
  }
@@ -17412,12 +17821,16 @@ async function createUpstashStore(config) {
17412
17821
  if (!url || !token) {
17413
17822
  throw new SearchSocketError(
17414
17823
  "VECTOR_BACKEND_UNAVAILABLE",
17415
- `Missing Upstash Search credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
17824
+ `Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
17416
17825
  );
17417
17826
  }
17418
- const { Search } = await import('@upstash/search');
17419
- const client = new Search({ url, token });
17420
- return new UpstashSearchStore({ client });
17827
+ const { Index } = await import('@upstash/vector');
17828
+ const index = new Index({ url, token });
17829
+ return new UpstashSearchStore({
17830
+ index,
17831
+ pagesNamespace: config.upstash.namespaces.pages,
17832
+ chunksNamespace: config.upstash.namespaces.chunks
17833
+ });
17421
17834
  }
17422
17835
 
17423
17836
  // src/utils/pattern.ts
@@ -17460,29 +17873,65 @@ function nonNegativeOrZero(value) {
17460
17873
  function normalizeForTitleMatch(text) {
17461
17874
  return text.toLowerCase().replace(/[^a-z0-9\s]/g, "").replace(/\s+/g, " ").trim();
17462
17875
  }
17463
- function rankHits(hits, config, query) {
17876
+ function rankHits(hits, config, query, debug) {
17464
17877
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
17465
17878
  const titleMatchWeight = config.ranking.weights.titleMatch;
17466
17879
  return hits.map((hit) => {
17467
- let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17880
+ const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17881
+ let score = baseScore;
17882
+ let incomingLinkBoostValue = 0;
17468
17883
  if (config.ranking.enableIncomingLinkBoost) {
17469
17884
  const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
17470
- score += incomingBoost * config.ranking.weights.incomingLinks;
17885
+ incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
17886
+ score += incomingLinkBoostValue;
17471
17887
  }
17888
+ let depthBoostValue = 0;
17472
17889
  if (config.ranking.enableDepthBoost) {
17473
17890
  const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
17474
- score += depthBoost * config.ranking.weights.depth;
17891
+ depthBoostValue = depthBoost * config.ranking.weights.depth;
17892
+ score += depthBoostValue;
17475
17893
  }
17894
+ let titleMatchBoostValue = 0;
17476
17895
  if (normalizedQuery && titleMatchWeight > 0) {
17477
17896
  const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
17478
17897
  if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
17479
- score += titleMatchWeight;
17898
+ titleMatchBoostValue = titleMatchWeight;
17899
+ score += titleMatchBoostValue;
17480
17900
  }
17481
17901
  }
17482
- return {
17902
+ let freshnessBoostValue = 0;
17903
+ if (config.ranking.enableFreshnessBoost) {
17904
+ const publishedAt = hit.metadata.publishedAt;
17905
+ if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
17906
+ const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
17907
+ const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
17908
+ freshnessBoostValue = decay * config.ranking.weights.freshness;
17909
+ score += freshnessBoostValue;
17910
+ }
17911
+ }
17912
+ let anchorTextMatchBoostValue = 0;
17913
+ if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
17914
+ const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
17915
+ if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
17916
+ anchorTextMatchBoostValue = config.ranking.weights.anchorText;
17917
+ score += anchorTextMatchBoostValue;
17918
+ }
17919
+ }
17920
+ const result = {
17483
17921
  hit,
17484
17922
  finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
17485
17923
  };
17924
+ if (debug) {
17925
+ result.breakdown = {
17926
+ baseScore,
17927
+ incomingLinkBoost: incomingLinkBoostValue,
17928
+ depthBoost: depthBoostValue,
17929
+ titleMatchBoost: titleMatchBoostValue,
17930
+ freshnessBoost: freshnessBoostValue,
17931
+ anchorTextMatchBoost: anchorTextMatchBoostValue
17932
+ };
17933
+ }
17934
+ return result;
17486
17935
  }).sort((a, b) => {
17487
17936
  const delta = b.finalScore - a.finalScore;
17488
17937
  return Number.isNaN(delta) ? 0 : delta;
@@ -17491,12 +17940,13 @@ function rankHits(hits, config, query) {
17491
17940
  function trimByScoreGap(results, config) {
17492
17941
  if (results.length === 0) return results;
17493
17942
  const threshold = config.ranking.scoreGapThreshold;
17494
- const minScore = config.ranking.minScore;
17495
- if (minScore > 0 && results.length > 0) {
17496
- const sortedScores = results.map((r) => r.pageScore).sort((a, b) => a - b);
17497
- const mid = Math.floor(sortedScores.length / 2);
17498
- const median = sortedScores.length % 2 === 0 ? (sortedScores[mid - 1] + sortedScores[mid]) / 2 : sortedScores[mid];
17499
- if (median < minScore) return [];
17943
+ const minScoreRatio = config.ranking.minScoreRatio;
17944
+ if (minScoreRatio > 0 && results.length > 0) {
17945
+ const topScore = results[0].pageScore;
17946
+ if (Number.isFinite(topScore) && topScore > 0) {
17947
+ const minThreshold = topScore * minScoreRatio;
17948
+ results = results.filter((r) => r.pageScore >= minThreshold);
17949
+ }
17500
17950
  }
17501
17951
  if (threshold > 0 && results.length > 1) {
17502
17952
  for (let i = 1; i < results.length; i++) {
@@ -17566,92 +18016,293 @@ function aggregateByPage(ranked, config) {
17566
18016
  return Number.isNaN(delta) ? 0 : delta;
17567
18017
  });
17568
18018
  }
17569
- function mergePageAndChunkResults(pageHits, rankedChunks, config) {
17570
- if (pageHits.length === 0) return rankedChunks;
17571
- const w = config.search.pageSearchWeight;
17572
- const pageScoreMap = /* @__PURE__ */ new Map();
17573
- for (const ph of pageHits) {
17574
- pageScoreMap.set(ph.url, ph);
17575
- }
17576
- const pagesWithChunks = /* @__PURE__ */ new Set();
17577
- const merged = rankedChunks.map((ranked) => {
17578
- const url = ranked.hit.metadata.url;
17579
- const pageHit = pageScoreMap.get(url);
17580
- if (pageHit) {
17581
- pagesWithChunks.add(url);
17582
- const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
17583
- return {
17584
- hit: ranked.hit,
17585
- finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
17586
- };
18019
+ function rankPageHits(pageHits, config, query, debug) {
18020
+ const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
18021
+ const titleMatchWeight = config.ranking.weights.titleMatch;
18022
+ return pageHits.map((hit) => {
18023
+ const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
18024
+ let score = baseScore;
18025
+ let incomingLinkBoostValue = 0;
18026
+ if (config.ranking.enableIncomingLinkBoost) {
18027
+ const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
18028
+ incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
18029
+ score += incomingLinkBoostValue;
17587
18030
  }
17588
- return ranked;
17589
- });
17590
- for (const [url, pageHit] of pageScoreMap) {
17591
- if (pagesWithChunks.has(url)) continue;
17592
- const syntheticScore = pageHit.score * w;
17593
- const syntheticHit = {
17594
- id: `page:${url}`,
17595
- score: pageHit.score,
17596
- metadata: {
17597
- projectId: "",
17598
- scopeName: "",
17599
- url: pageHit.url,
17600
- path: pageHit.url,
17601
- title: pageHit.title,
17602
- sectionTitle: "",
17603
- headingPath: [],
17604
- snippet: pageHit.description || pageHit.title,
17605
- chunkText: pageHit.description || pageHit.title,
17606
- ordinal: 0,
17607
- contentHash: "",
17608
- depth: pageHit.depth,
17609
- incomingLinks: pageHit.incomingLinks,
17610
- routeFile: pageHit.routeFile,
17611
- tags: pageHit.tags
18031
+ let depthBoostValue = 0;
18032
+ if (config.ranking.enableDepthBoost) {
18033
+ const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
18034
+ depthBoostValue = depthBoost * config.ranking.weights.depth;
18035
+ score += depthBoostValue;
18036
+ }
18037
+ let titleMatchBoostValue = 0;
18038
+ if (normalizedQuery && titleMatchWeight > 0) {
18039
+ const normalizedTitle = normalizeForTitleMatch(hit.title);
18040
+ if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
18041
+ titleMatchBoostValue = titleMatchWeight;
18042
+ score += titleMatchBoostValue;
18043
+ }
18044
+ }
18045
+ let freshnessBoostValue = 0;
18046
+ if (config.ranking.enableFreshnessBoost) {
18047
+ const publishedAt = hit.publishedAt;
18048
+ if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
18049
+ const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
18050
+ const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
18051
+ freshnessBoostValue = decay * config.ranking.weights.freshness;
18052
+ score += freshnessBoostValue;
17612
18053
  }
18054
+ }
18055
+ const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
18056
+ if (pageWeight !== 1) {
18057
+ score *= pageWeight;
18058
+ }
18059
+ const result = {
18060
+ url: hit.url,
18061
+ title: hit.title,
18062
+ description: hit.description,
18063
+ routeFile: hit.routeFile,
18064
+ depth: hit.depth,
18065
+ incomingLinks: hit.incomingLinks,
18066
+ tags: hit.tags,
18067
+ baseScore,
18068
+ finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
18069
+ publishedAt: hit.publishedAt
17613
18070
  };
17614
- merged.push({
17615
- hit: syntheticHit,
17616
- finalScore: Number.isFinite(syntheticScore) ? syntheticScore : 0
17617
- });
17618
- }
17619
- return merged.sort((a, b) => {
18071
+ if (debug) {
18072
+ result.breakdown = {
18073
+ baseScore,
18074
+ pageWeight,
18075
+ incomingLinkBoost: incomingLinkBoostValue,
18076
+ depthBoost: depthBoostValue,
18077
+ titleMatchBoost: titleMatchBoostValue,
18078
+ freshnessBoost: freshnessBoostValue
18079
+ };
18080
+ }
18081
+ return result;
18082
+ }).filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0).sort((a, b) => {
17620
18083
  const delta = b.finalScore - a.finalScore;
17621
18084
  return Number.isNaN(delta) ? 0 : delta;
17622
18085
  });
17623
18086
  }
17624
-
17625
- // src/search/engine.ts
17626
- var requestSchema = zod.z.object({
17627
- q: zod.z.string().trim().min(1),
17628
- topK: zod.z.number().int().positive().max(100).optional(),
17629
- scope: zod.z.string().optional(),
17630
- pathPrefix: zod.z.string().optional(),
17631
- tags: zod.z.array(zod.z.string()).optional(),
17632
- groupBy: zod.z.enum(["page", "chunk"]).optional()
17633
- });
17634
- var SearchEngine = class _SearchEngine {
17635
- cwd;
17636
- config;
17637
- store;
17638
- constructor(options) {
17639
- this.cwd = options.cwd;
17640
- this.config = options.config;
17641
- this.store = options.store;
17642
- }
17643
- static async create(options = {}) {
17644
- const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
17645
- const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
17646
- const store = options.store ?? await createUpstashStore(config);
17647
- return new _SearchEngine({
17648
- cwd,
17649
- config,
17650
- store
17651
- });
17652
- }
17653
- getConfig() {
17654
- return this.config;
18087
+ function trimPagesByScoreGap(results, config) {
18088
+ if (results.length === 0) return results;
18089
+ const threshold = config.ranking.scoreGapThreshold;
18090
+ const minScoreRatio = config.ranking.minScoreRatio;
18091
+ if (minScoreRatio > 0 && results.length > 0) {
18092
+ const topScore = results[0].finalScore;
18093
+ if (Number.isFinite(topScore) && topScore > 0) {
18094
+ const minThreshold = topScore * minScoreRatio;
18095
+ results = results.filter((r) => r.finalScore >= minThreshold);
18096
+ }
18097
+ }
18098
+ if (threshold > 0 && results.length > 1) {
18099
+ for (let i = 1; i < results.length; i++) {
18100
+ const prev = results[i - 1].finalScore;
18101
+ const current = results[i].finalScore;
18102
+ if (prev > 0) {
18103
+ const gap = (prev - current) / prev;
18104
+ if (gap >= threshold) {
18105
+ return results.slice(0, i);
18106
+ }
18107
+ }
18108
+ }
18109
+ }
18110
+ return results;
18111
+ }
18112
+
18113
+ // src/search/related-pages.ts
18114
+ function diceScore(urlA, urlB) {
18115
+ const segmentsA = urlA.split("/").filter(Boolean);
18116
+ const segmentsB = urlB.split("/").filter(Boolean);
18117
+ if (segmentsA.length === 0 && segmentsB.length === 0) return 1;
18118
+ if (segmentsA.length === 0 || segmentsB.length === 0) return 0;
18119
+ let shared = 0;
18120
+ const minLen = Math.min(segmentsA.length, segmentsB.length);
18121
+ for (let i = 0; i < minLen; i++) {
18122
+ if (segmentsA[i] === segmentsB[i]) {
18123
+ shared++;
18124
+ } else {
18125
+ break;
18126
+ }
18127
+ }
18128
+ return 2 * shared / (segmentsA.length + segmentsB.length);
18129
+ }
18130
+ function compositeScore(isLinked, dice, semantic) {
18131
+ return (isLinked ? 0.5 : 0) + 0.3 * dice + 0.2 * semantic;
18132
+ }
18133
+ function dominantRelationshipType(isOutgoing, isIncoming, dice) {
18134
+ if (isOutgoing) return "outgoing_link";
18135
+ if (isIncoming) return "incoming_link";
18136
+ if (dice > 0.4) return "sibling";
18137
+ return "semantic";
18138
+ }
18139
+
18140
+ // src/utils/structured-meta.ts
18141
+ var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
18142
+ function validateMetaKey(key) {
18143
+ return VALID_KEY_RE.test(key);
18144
+ }
18145
+ function parseMetaValue(content, dataType) {
18146
+ switch (dataType) {
18147
+ case "number": {
18148
+ const n = Number(content);
18149
+ return Number.isFinite(n) ? n : content;
18150
+ }
18151
+ case "boolean":
18152
+ return content === "true";
18153
+ case "string[]":
18154
+ return content ? content.split(",").map((s) => s.trim()) : [];
18155
+ case "date": {
18156
+ const ms = Number(content);
18157
+ return Number.isFinite(ms) ? ms : content;
18158
+ }
18159
+ default:
18160
+ return content;
18161
+ }
18162
+ }
18163
+ function escapeFilterValue(s) {
18164
+ return s.replace(/'/g, "''");
18165
+ }
18166
+ function buildMetaFilterString(filters) {
18167
+ const clauses = [];
18168
+ for (const [key, value] of Object.entries(filters)) {
18169
+ if (!validateMetaKey(key)) continue;
18170
+ const field = `meta.${key}`;
18171
+ if (typeof value === "string") {
18172
+ clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
18173
+ } else if (typeof value === "boolean") {
18174
+ clauses.push(`${field} = ${value}`);
18175
+ } else {
18176
+ clauses.push(`${field} = ${value}`);
18177
+ }
18178
+ }
18179
+ return clauses.join(" AND ");
18180
+ }
18181
+
18182
+ // src/search/engine.ts
18183
+ var rankingOverridesSchema = zod.z.object({
18184
+ ranking: zod.z.object({
18185
+ enableIncomingLinkBoost: zod.z.boolean().optional(),
18186
+ enableDepthBoost: zod.z.boolean().optional(),
18187
+ aggregationCap: zod.z.number().int().positive().optional(),
18188
+ aggregationDecay: zod.z.number().min(0).max(1).optional(),
18189
+ minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
18190
+ minScoreRatio: zod.z.number().min(0).max(1).optional(),
18191
+ scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
18192
+ weights: zod.z.object({
18193
+ incomingLinks: zod.z.number().optional(),
18194
+ depth: zod.z.number().optional(),
18195
+ aggregation: zod.z.number().optional(),
18196
+ titleMatch: zod.z.number().optional()
18197
+ }).optional()
18198
+ }).optional(),
18199
+ search: zod.z.object({
18200
+ pageSearchWeight: zod.z.number().min(0).max(1).optional()
18201
+ }).optional()
18202
+ }).optional();
18203
+ var requestSchema = zod.z.object({
18204
+ q: zod.z.string().trim().min(1),
18205
+ topK: zod.z.number().int().positive().max(100).optional(),
18206
+ scope: zod.z.string().optional(),
18207
+ pathPrefix: zod.z.string().optional(),
18208
+ tags: zod.z.array(zod.z.string()).optional(),
18209
+ filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
18210
+ groupBy: zod.z.enum(["page", "chunk"]).optional(),
18211
+ maxSubResults: zod.z.number().int().positive().max(20).optional(),
18212
+ debug: zod.z.boolean().optional(),
18213
+ rankingOverrides: rankingOverridesSchema
18214
+ });
18215
+ var MAX_SITE_STRUCTURE_PAGES = 2e3;
18216
+ function makeNode(url, depth) {
18217
+ return { url, title: "", depth, routeFile: "", isIndexed: false, childCount: 0, children: [] };
18218
+ }
18219
+ function buildTree(pages, pathPrefix) {
18220
+ const nodeMap = /* @__PURE__ */ new Map();
18221
+ const root2 = makeNode("/", 0);
18222
+ nodeMap.set("/", root2);
18223
+ for (const page of pages) {
18224
+ const normalized = normalizeUrlPath(page.url);
18225
+ const segments = normalized.split("/").filter(Boolean);
18226
+ if (segments.length === 0) {
18227
+ root2.title = page.title;
18228
+ root2.routeFile = page.routeFile;
18229
+ root2.isIndexed = true;
18230
+ continue;
18231
+ }
18232
+ for (let i = 1; i <= segments.length; i++) {
18233
+ const partialUrl = "/" + segments.slice(0, i).join("/");
18234
+ if (!nodeMap.has(partialUrl)) {
18235
+ nodeMap.set(partialUrl, makeNode(partialUrl, i));
18236
+ }
18237
+ }
18238
+ const node = nodeMap.get(normalized);
18239
+ node.title = page.title;
18240
+ node.routeFile = page.routeFile;
18241
+ node.isIndexed = true;
18242
+ }
18243
+ for (const [url, node] of nodeMap) {
18244
+ if (url === "/") continue;
18245
+ const segments = url.split("/").filter(Boolean);
18246
+ const parentUrl = segments.length === 1 ? "/" : "/" + segments.slice(0, -1).join("/");
18247
+ const parent = nodeMap.get(parentUrl) ?? root2;
18248
+ parent.children.push(node);
18249
+ }
18250
+ const sortAndCount = (node) => {
18251
+ node.children.sort((a, b) => a.url.localeCompare(b.url));
18252
+ node.childCount = node.children.length;
18253
+ for (const child of node.children) {
18254
+ sortAndCount(child);
18255
+ }
18256
+ };
18257
+ sortAndCount(root2);
18258
+ if (pathPrefix) {
18259
+ const normalizedPrefix = normalizeUrlPath(pathPrefix);
18260
+ const subtreeRoot = nodeMap.get(normalizedPrefix);
18261
+ if (subtreeRoot) {
18262
+ return subtreeRoot;
18263
+ }
18264
+ return makeNode(normalizedPrefix, normalizedPrefix.split("/").filter(Boolean).length);
18265
+ }
18266
+ return root2;
18267
+ }
18268
+ function mergeRankingOverrides(base, overrides) {
18269
+ return {
18270
+ ...base,
18271
+ search: {
18272
+ ...base.search,
18273
+ ...overrides.search
18274
+ },
18275
+ ranking: {
18276
+ ...base.ranking,
18277
+ ...overrides.ranking,
18278
+ weights: {
18279
+ ...base.ranking.weights,
18280
+ ...overrides.ranking?.weights
18281
+ }
18282
+ }
18283
+ };
18284
+ }
18285
+ var SearchEngine = class _SearchEngine {
18286
+ cwd;
18287
+ config;
18288
+ store;
18289
+ constructor(options) {
18290
+ this.cwd = options.cwd;
18291
+ this.config = options.config;
18292
+ this.store = options.store;
18293
+ }
18294
+ static async create(options = {}) {
18295
+ const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
18296
+ const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
18297
+ const store = options.store ?? await createUpstashStore(config);
18298
+ return new _SearchEngine({
18299
+ cwd,
18300
+ config,
18301
+ store
18302
+ });
18303
+ }
18304
+ getConfig() {
18305
+ return this.config;
17655
18306
  }
17656
18307
  async search(request) {
17657
18308
  const parsed = requestSchema.safeParse(request);
@@ -17660,125 +18311,203 @@ var SearchEngine = class _SearchEngine {
17660
18311
  }
17661
18312
  const input = parsed.data;
17662
18313
  const totalStart = process.hrtime.bigint();
18314
+ const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
17663
18315
  const resolvedScope = resolveScope(this.config, input.scope);
17664
18316
  const topK = input.topK ?? 10;
18317
+ const maxSubResults = input.maxSubResults ?? 5;
17665
18318
  const groupByPage = (input.groupBy ?? "page") === "page";
17666
- const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
17667
- const filterParts = [];
17668
- if (input.pathPrefix) {
17669
- const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
17670
- filterParts.push(`url GLOB '${prefix}*'`);
17671
- }
17672
- if (input.tags && input.tags.length > 0) {
17673
- for (const tag of input.tags) {
17674
- filterParts.push(`tags GLOB '*${tag}*'`);
18319
+ const queryText = input.q;
18320
+ const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
18321
+ const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
18322
+ const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
18323
+ const metaFilter = metaFilterStr || void 0;
18324
+ const applyPagePostFilters = (hits) => {
18325
+ let filtered = hits;
18326
+ if (pathPrefix) {
18327
+ filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
18328
+ }
18329
+ if (filterTags) {
18330
+ filtered = filtered.filter(
18331
+ (h) => filterTags.every((tag) => h.tags.includes(tag))
18332
+ );
17675
18333
  }
17676
- }
17677
- const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
17678
- const useDualSearch = this.config.search.dualSearch && groupByPage;
18334
+ return filtered;
18335
+ };
18336
+ const applyChunkPostFilters = (hits) => {
18337
+ let filtered = hits;
18338
+ if (filterTags) {
18339
+ filtered = filtered.filter(
18340
+ (h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
18341
+ );
18342
+ }
18343
+ return filtered;
18344
+ };
17679
18345
  const searchStart = process.hrtime.bigint();
17680
- let ranked;
17681
- if (useDualSearch) {
17682
- const chunkLimit = Math.max(topK * 10, 100);
17683
- const pageLimit = 20;
17684
- const [pageHits, chunkHits] = await Promise.all([
17685
- this.store.searchPages(
17686
- input.q,
17687
- {
17688
- limit: pageLimit,
17689
- semanticWeight: this.config.search.semanticWeight,
17690
- inputEnrichment: this.config.search.inputEnrichment,
17691
- filter
17692
- },
17693
- resolvedScope
17694
- ),
17695
- this.store.search(
17696
- input.q,
17697
- {
17698
- limit: chunkLimit,
17699
- semanticWeight: this.config.search.semanticWeight,
17700
- inputEnrichment: this.config.search.inputEnrichment,
17701
- reranking: false,
17702
- filter
17703
- },
18346
+ if (groupByPage) {
18347
+ const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
18348
+ const pageLimit = Math.max(topK * 2, 20);
18349
+ const pageHits = await this.store.searchPagesByText(
18350
+ queryText,
18351
+ { limit: pageLimit * fetchMultiplier, filter: metaFilter },
18352
+ resolvedScope
18353
+ );
18354
+ const filteredPages = applyPagePostFilters(pageHits);
18355
+ let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
18356
+ rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
18357
+ const topPages = rankedPages.slice(0, topK);
18358
+ const chunkPromises = topPages.map(
18359
+ (page) => this.store.searchChunksByUrl(
18360
+ queryText,
18361
+ page.url,
18362
+ { limit: maxSubResults, filter: metaFilter },
17704
18363
  resolvedScope
17705
- )
17706
- ]);
17707
- const rankedChunks = rankHits(chunkHits, this.config, input.q);
17708
- ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
18364
+ ).then((chunks) => applyChunkPostFilters(chunks))
18365
+ );
18366
+ const allChunks = await Promise.all(chunkPromises);
18367
+ const searchMs = hrTimeMs(searchStart);
18368
+ const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
18369
+ return {
18370
+ q: input.q,
18371
+ scope: resolvedScope.scopeName,
18372
+ results,
18373
+ meta: {
18374
+ timingsMs: {
18375
+ search: Math.round(searchMs),
18376
+ total: Math.round(hrTimeMs(totalStart))
18377
+ }
18378
+ }
18379
+ };
17709
18380
  } else {
18381
+ const candidateK = Math.max(50, topK);
18382
+ const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
17710
18383
  const hits = await this.store.search(
17711
- input.q,
17712
- {
17713
- limit: candidateK,
17714
- semanticWeight: this.config.search.semanticWeight,
17715
- inputEnrichment: this.config.search.inputEnrichment,
17716
- reranking: this.config.search.reranking,
17717
- filter
17718
- },
18384
+ queryText,
18385
+ { limit: candidateK * fetchMultiplier, filter: metaFilter },
17719
18386
  resolvedScope
17720
18387
  );
17721
- ranked = rankHits(hits, this.config, input.q);
17722
- }
17723
- const searchMs = hrTimeMs(searchStart);
17724
- const results = this.buildResults(ranked, topK, groupByPage, input.q);
17725
- return {
17726
- q: input.q,
17727
- scope: resolvedScope.scopeName,
17728
- results,
17729
- meta: {
17730
- timingsMs: {
17731
- search: Math.round(searchMs),
17732
- total: Math.round(hrTimeMs(totalStart))
18388
+ let filtered = hits;
18389
+ if (pathPrefix) {
18390
+ filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
18391
+ }
18392
+ if (filterTags) {
18393
+ filtered = filtered.filter(
18394
+ (h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
18395
+ );
18396
+ }
18397
+ const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
18398
+ const searchMs = hrTimeMs(searchStart);
18399
+ const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
18400
+ return {
18401
+ q: input.q,
18402
+ scope: resolvedScope.scopeName,
18403
+ results,
18404
+ meta: {
18405
+ timingsMs: {
18406
+ search: Math.round(searchMs),
18407
+ total: Math.round(hrTimeMs(totalStart))
18408
+ }
17733
18409
  }
18410
+ };
18411
+ }
18412
+ }
18413
+ buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
18414
+ return rankedPages.map((page, i) => {
18415
+ const chunks = allChunks[i] ?? [];
18416
+ const bestChunk = chunks[0];
18417
+ const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
18418
+ const result = {
18419
+ url: page.url,
18420
+ title: page.title,
18421
+ sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
18422
+ snippet,
18423
+ chunkText: bestChunk?.metadata.chunkText || void 0,
18424
+ score: Number(page.finalScore.toFixed(6)),
18425
+ routeFile: page.routeFile,
18426
+ chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
18427
+ sectionTitle: c.metadata.sectionTitle || void 0,
18428
+ snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
18429
+ chunkText: c.metadata.chunkText || void 0,
18430
+ headingPath: c.metadata.headingPath,
18431
+ score: Number(c.score.toFixed(6))
18432
+ })) : void 0
18433
+ };
18434
+ if (debug && page.breakdown) {
18435
+ result.breakdown = {
18436
+ baseScore: page.breakdown.baseScore,
18437
+ incomingLinkBoost: page.breakdown.incomingLinkBoost,
18438
+ depthBoost: page.breakdown.depthBoost,
18439
+ titleMatchBoost: page.breakdown.titleMatchBoost,
18440
+ freshnessBoost: page.breakdown.freshnessBoost,
18441
+ anchorTextMatchBoost: 0
18442
+ };
17734
18443
  }
17735
- };
18444
+ return result;
18445
+ });
17736
18446
  }
17737
- ensureSnippet(hit) {
18447
+ ensureSnippet(hit, query) {
18448
+ const chunkText = hit.hit.metadata.chunkText;
18449
+ if (query && chunkText) return queryAwareExcerpt(chunkText, query);
17738
18450
  const snippet = hit.hit.metadata.snippet;
17739
18451
  if (snippet && snippet.length >= 30) return snippet;
17740
- const chunkText = hit.hit.metadata.chunkText;
17741
18452
  if (chunkText) return toSnippet(chunkText);
17742
18453
  return snippet || "";
17743
18454
  }
17744
- buildResults(ordered, topK, groupByPage, _query) {
18455
+ buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
18456
+ const cfg = config ?? this.config;
17745
18457
  if (groupByPage) {
17746
- let pages = aggregateByPage(ordered, this.config);
17747
- pages = trimByScoreGap(pages, this.config);
17748
- const minRatio = this.config.ranking.minChunkScoreRatio;
18458
+ let pages = aggregateByPage(ordered, cfg);
18459
+ pages = trimByScoreGap(pages, cfg);
18460
+ const minRatio = cfg.ranking.minChunkScoreRatio;
17749
18461
  return pages.slice(0, topK).map((page) => {
17750
18462
  const bestScore = page.bestChunk.finalScore;
17751
18463
  const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
17752
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
17753
- return {
18464
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
18465
+ const result = {
17754
18466
  url: page.url,
17755
18467
  title: page.title,
17756
18468
  sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
17757
- snippet: this.ensureSnippet(page.bestChunk),
18469
+ snippet: this.ensureSnippet(page.bestChunk, query),
18470
+ chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
17758
18471
  score: Number(page.pageScore.toFixed(6)),
17759
18472
  routeFile: page.routeFile,
17760
- chunks: meaningful.length > 1 ? meaningful.map((c) => ({
18473
+ chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
17761
18474
  sectionTitle: c.hit.metadata.sectionTitle || void 0,
17762
- snippet: this.ensureSnippet(c),
18475
+ snippet: this.ensureSnippet(c, query),
18476
+ chunkText: c.hit.metadata.chunkText || void 0,
17763
18477
  headingPath: c.hit.metadata.headingPath,
17764
18478
  score: Number(c.finalScore.toFixed(6))
17765
18479
  })) : void 0
17766
18480
  };
18481
+ if (debug && page.bestChunk.breakdown) {
18482
+ result.breakdown = page.bestChunk.breakdown;
18483
+ }
18484
+ return result;
17767
18485
  });
17768
18486
  } else {
17769
18487
  let filtered = ordered;
17770
- const minScore = this.config.ranking.minScore;
17771
- if (minScore > 0) {
17772
- filtered = ordered.filter((entry) => entry.finalScore >= minScore);
17773
- }
17774
- return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
17775
- url: hit.metadata.url,
17776
- title: hit.metadata.title,
17777
- sectionTitle: hit.metadata.sectionTitle || void 0,
17778
- snippet: this.ensureSnippet({ hit, finalScore }),
17779
- score: Number(finalScore.toFixed(6)),
17780
- routeFile: hit.metadata.routeFile
17781
- }));
18488
+ const minScoreRatio = cfg.ranking.minScoreRatio;
18489
+ if (minScoreRatio > 0 && ordered.length > 0) {
18490
+ const topScore = ordered[0].finalScore;
18491
+ if (Number.isFinite(topScore) && topScore > 0) {
18492
+ const threshold = topScore * minScoreRatio;
18493
+ filtered = ordered.filter((entry) => entry.finalScore >= threshold);
18494
+ }
18495
+ }
18496
+ return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
18497
+ const result = {
18498
+ url: hit.metadata.url,
18499
+ title: hit.metadata.title,
18500
+ sectionTitle: hit.metadata.sectionTitle || void 0,
18501
+ snippet: this.ensureSnippet({ hit, finalScore }, query),
18502
+ chunkText: hit.metadata.chunkText || void 0,
18503
+ score: Number(finalScore.toFixed(6)),
18504
+ routeFile: hit.metadata.routeFile
18505
+ };
18506
+ if (debug && breakdown) {
18507
+ result.breakdown = breakdown;
18508
+ }
18509
+ return result;
18510
+ });
17782
18511
  }
17783
18512
  }
17784
18513
  async getPage(pathOrUrl, scope) {
@@ -17804,6 +18533,116 @@ var SearchEngine = class _SearchEngine {
17804
18533
  markdown: page.markdown
17805
18534
  };
17806
18535
  }
18536
+ async listPages(opts) {
18537
+ const resolvedScope = resolveScope(this.config, opts?.scope);
18538
+ const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
18539
+ return this.store.listPages(resolvedScope, {
18540
+ cursor: opts?.cursor,
18541
+ limit: opts?.limit,
18542
+ pathPrefix
18543
+ });
18544
+ }
18545
+ async getSiteStructure(opts) {
18546
+ const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
18547
+ const allPages = [];
18548
+ let cursor;
18549
+ let truncated = false;
18550
+ do {
18551
+ const result = await this.listPages({
18552
+ pathPrefix: opts?.pathPrefix,
18553
+ scope: opts?.scope,
18554
+ cursor,
18555
+ limit: 200
18556
+ });
18557
+ allPages.push(...result.pages);
18558
+ cursor = result.nextCursor;
18559
+ if (allPages.length >= maxPages) {
18560
+ truncated = allPages.length > maxPages || !!cursor;
18561
+ allPages.length = maxPages;
18562
+ break;
18563
+ }
18564
+ } while (cursor);
18565
+ const root2 = buildTree(allPages, opts?.pathPrefix);
18566
+ return {
18567
+ root: root2,
18568
+ totalPages: allPages.length,
18569
+ truncated
18570
+ };
18571
+ }
18572
+ async getRelatedPages(pathOrUrl, opts) {
18573
+ const resolvedScope = resolveScope(this.config, opts?.scope);
18574
+ const urlPath = this.resolveInputPath(pathOrUrl);
18575
+ const topK = Math.min(opts?.topK ?? 10, 25);
18576
+ const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
18577
+ if (!source) {
18578
+ throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
18579
+ }
18580
+ const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
18581
+ const semanticHits = await this.store.searchPagesByVector(
18582
+ source.vector,
18583
+ { limit: 50 },
18584
+ resolvedScope
18585
+ );
18586
+ const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
18587
+ const semanticScoreMap = /* @__PURE__ */ new Map();
18588
+ for (const hit of filteredHits) {
18589
+ semanticScoreMap.set(hit.url, hit.score);
18590
+ }
18591
+ const candidateUrls = /* @__PURE__ */ new Set();
18592
+ for (const hit of filteredHits) {
18593
+ candidateUrls.add(hit.url);
18594
+ }
18595
+ for (const url of sourceOutgoing) {
18596
+ if (url !== urlPath) candidateUrls.add(url);
18597
+ }
18598
+ const missingUrls = [...sourceOutgoing].filter(
18599
+ (u) => u !== urlPath && !semanticScoreMap.has(u)
18600
+ );
18601
+ const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
18602
+ const metaMap = /* @__PURE__ */ new Map();
18603
+ for (const hit of filteredHits) {
18604
+ metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
18605
+ }
18606
+ for (const p of fetchedPages) {
18607
+ metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
18608
+ }
18609
+ const semanticUrls = filteredHits.map((h) => h.url);
18610
+ if (semanticUrls.length > 0) {
18611
+ const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
18612
+ for (const p of semanticPageData) {
18613
+ const existing = metaMap.get(p.url);
18614
+ if (existing) {
18615
+ existing.outgoingLinkUrls = p.outgoingLinkUrls;
18616
+ }
18617
+ }
18618
+ }
18619
+ const candidates = [];
18620
+ for (const url of candidateUrls) {
18621
+ const meta = metaMap.get(url);
18622
+ if (!meta) continue;
18623
+ const isOutgoing = sourceOutgoing.has(url);
18624
+ const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
18625
+ const isLinked = isOutgoing || isIncoming;
18626
+ const dice = diceScore(urlPath, url);
18627
+ const semantic = semanticScoreMap.get(url) ?? 0;
18628
+ const score = compositeScore(isLinked, dice, semantic);
18629
+ const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
18630
+ candidates.push({
18631
+ url,
18632
+ title: meta.title,
18633
+ score: Number(score.toFixed(6)),
18634
+ relationshipType,
18635
+ routeFile: meta.routeFile
18636
+ });
18637
+ }
18638
+ candidates.sort((a, b) => b.score - a.score);
18639
+ const results = candidates.slice(0, topK);
18640
+ return {
18641
+ sourceUrl: urlPath,
18642
+ scope: resolvedScope.scopeName,
18643
+ relatedPages: results
18644
+ };
18645
+ }
17807
18646
  async health() {
17808
18647
  return this.store.health();
17809
18648
  }
@@ -17819,6 +18658,215 @@ var SearchEngine = class _SearchEngine {
17819
18658
  }
17820
18659
  };
17821
18660
 
18661
+ // src/mcp/server.ts
18662
/**
 * Create an MCP server exposing the search engine as six tools: `search`,
 * `get_page`, `list_pages`, `get_site_structure`, `find_source_file` and
 * `get_related_pages`.
 *
 * Every tool answers with one text content item holding the JSON-serialised
 * engine result; `search` additionally returns the raw object as
 * `structuredContent`. The `search` output schema declares `chunkText` on
 * results and on sub-chunks, because the engine emits that field and the tool
 * description promises it.
 *
 * @param {object} engine - Must provide `search`, `getPage`, `listPages`,
 *   `getSiteStructure` and `getRelatedPages`.
 * @returns {object} The configured `McpServer` instance (not yet connected to a transport).
 */
function createServer(engine) {
  const z = zod.z;
  const server = new mcp_js.McpServer({
    name: "searchsocket-mcp",
    version: "0.1.0"
  });

  // Wrap a payload in the single-text-item content array every tool returns.
  // `indent` is handed to JSON.stringify; omit it for compact output.
  const asTextContent = (payload, indent) => [
    {
      type: "text",
      text: JSON.stringify(payload, null, indent)
    }
  ];

  server.registerTool(
    "search",
    {
      description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(100).optional(),
        pathPrefix: z.string().optional(),
        tags: z.array(z.string()).optional(),
        filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
        groupBy: z.enum(["page", "chunk"]).optional(),
        maxSubResults: z.number().int().positive().max(20).optional()
      },
      outputSchema: {
        q: z.string(),
        scope: z.string(),
        results: z.array(z.object({
          url: z.string(),
          title: z.string(),
          sectionTitle: z.string().optional(),
          snippet: z.string(),
          chunkText: z.string().optional(),
          score: z.number(),
          routeFile: z.string(),
          chunks: z.array(z.object({
            sectionTitle: z.string().optional(),
            snippet: z.string(),
            chunkText: z.string().optional(),
            headingPath: z.array(z.string()),
            score: z.number()
          })).optional()
        })),
        meta: z.object({
          timingsMs: z.object({
            search: z.number(),
            total: z.number()
          })
        })
      }
    },
    async (input) => {
      const result = await engine.search({
        q: input.query,
        topK: input.topK,
        scope: input.scope,
        pathPrefix: input.pathPrefix,
        tags: input.tags,
        filters: input.filters,
        groupBy: input.groupBy,
        maxSubResults: input.maxSubResults
      });
      return {
        content: asTextContent(result, 2),
        structuredContent: result
      };
    }
  );

  server.registerTool(
    "get_page",
    {
      description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const page = await engine.getPage(input.pathOrUrl, input.scope);
      return { content: asTextContent(page, 2) };
    }
  );

  server.registerTool(
    "list_pages",
    {
      description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        cursor: z.string().optional(),
        limit: z.number().int().positive().max(200).optional(),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const result = await engine.listPages({
        pathPrefix: input.pathPrefix,
        cursor: input.cursor,
        limit: input.limit,
        scope: input.scope
      });
      return { content: asTextContent(result, 2) };
    }
  );

  server.registerTool(
    "get_site_structure",
    {
      description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        scope: z.string().optional(),
        maxPages: z.number().int().positive().max(2e3).optional()
      }
    },
    async (input) => {
      const result = await engine.getSiteStructure({
        pathPrefix: input.pathPrefix,
        scope: input.scope,
        maxPages: input.maxPages
      });
      return { content: asTextContent(result, 2) };
    }
  );

  server.registerTool(
    "find_source_file",
    {
      description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      // Only the single best match is needed to locate a source file.
      const result = await engine.search({
        q: input.query,
        topK: 1,
        scope: input.scope
      });
      if (result.results.length === 0) {
        return {
          content: asTextContent({
            error: "No matching content found for the given query."
          })
        };
      }
      const { url, routeFile, sectionTitle, snippet } = result.results[0];
      return { content: asTextContent({ url, routeFile, sectionTitle, snippet }) };
    }
  );

  server.registerTool(
    "get_related_pages",
    {
      description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(25).optional()
      }
    },
    async (input) => {
      const result = await engine.getRelatedPages(input.pathOrUrl, {
        topK: input.topK,
        scope: input.scope
      });
      return { content: asTextContent(result, 2) };
    }
  );

  return server;
}
18869
+
17822
18870
  // src/sveltekit/handle.ts
17823
18871
  var InMemoryRateLimiter = class {
17824
18872
  constructor(windowMs, max) {
@@ -17847,7 +18895,13 @@ function searchsocketHandle(options = {}) {
17847
18895
  let enginePromise = null;
17848
18896
  let configPromise = null;
17849
18897
  let apiPath = options.path;
18898
+ let llmsServePath = null;
18899
+ let serveMarkdownVariants = false;
18900
+ let mcpPath;
18901
+ let mcpApiKey;
18902
+ let mcpEnableJsonResponse = true;
17850
18903
  let rateLimiter = null;
18904
+ let notConfigured = false;
17851
18905
  const getConfig = async () => {
17852
18906
  if (!configPromise) {
17853
18907
  let configP;
@@ -17864,6 +18918,13 @@ function searchsocketHandle(options = {}) {
17864
18918
  }
17865
18919
  configPromise = configP.then((config) => {
17866
18920
  apiPath = apiPath ?? config.api.path;
18921
+ mcpPath = config.mcp.handle.path;
18922
+ mcpApiKey = config.mcp.handle.apiKey;
18923
+ mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
18924
+ if (config.llmsTxt.enable) {
18925
+ llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
18926
+ serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
18927
+ }
17867
18928
  if (config.api.rateLimit && !isServerless()) {
17868
18929
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
17869
18930
  }
@@ -17873,59 +18934,109 @@ function searchsocketHandle(options = {}) {
17873
18934
  return configPromise;
17874
18935
  };
17875
18936
  const getEngine = async () => {
18937
+ if (notConfigured) {
18938
+ throw new SearchSocketError(
18939
+ "SEARCH_NOT_CONFIGURED",
18940
+ "Search is not configured. Set the required Upstash environment variables to enable search.",
18941
+ 503
18942
+ );
18943
+ }
17876
18944
  if (!enginePromise) {
17877
18945
  const config = await getConfig();
17878
18946
  enginePromise = SearchEngine.create({
17879
18947
  cwd: options.cwd,
17880
18948
  config
18949
+ }).catch((error) => {
18950
+ enginePromise = null;
18951
+ if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
18952
+ notConfigured = true;
18953
+ throw new SearchSocketError(
18954
+ "SEARCH_NOT_CONFIGURED",
18955
+ "Search is not configured. Set the required Upstash environment variables to enable search.",
18956
+ 503
18957
+ );
18958
+ }
18959
+ throw error;
17881
18960
  });
17882
18961
  }
17883
18962
  return enginePromise;
17884
18963
  };
17885
18964
  const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
17886
18965
  return async ({ event, resolve }) => {
17887
- if (apiPath && event.url.pathname !== apiPath) {
17888
- return resolve(event);
18966
+ if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
18967
+ const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
18968
+ if (mcpPath && event.url.pathname === mcpPath) {
18969
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18970
+ }
18971
+ if (mcpPath) {
18972
+ if (serveMarkdownVariants && isMarkdownVariant) ; else {
18973
+ return resolve(event);
18974
+ }
18975
+ } else {
18976
+ if (configPromise || options.config || options.rawConfig) {
18977
+ await getConfig();
18978
+ if (mcpPath && event.url.pathname === mcpPath) {
18979
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18980
+ }
18981
+ if (!(serveMarkdownVariants && isMarkdownVariant)) {
18982
+ return resolve(event);
18983
+ }
18984
+ } else {
18985
+ return resolve(event);
18986
+ }
18987
+ }
17889
18988
  }
17890
18989
  const config = await getConfig();
18990
+ if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
18991
+ const cwd = options.cwd ?? process.cwd();
18992
+ const filePath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
18993
+ try {
18994
+ const content = await fs9__default.default.readFile(filePath, "utf8");
18995
+ return new Response(content, {
18996
+ status: 200,
18997
+ headers: { "content-type": "text/plain; charset=utf-8" }
18998
+ });
18999
+ } catch {
19000
+ return resolve(event);
19001
+ }
19002
+ }
19003
+ if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
19004
+ let rawPath;
19005
+ try {
19006
+ rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
19007
+ } catch {
19008
+ return resolve(event);
19009
+ }
19010
+ const scope = event.url.searchParams?.get("scope") ?? void 0;
19011
+ try {
19012
+ const engine = await getEngine();
19013
+ const page = await engine.getPage(rawPath, scope);
19014
+ return new Response(page.markdown, {
19015
+ status: 200,
19016
+ headers: { "content-type": "text/markdown; charset=utf-8" }
19017
+ });
19018
+ } catch (error) {
19019
+ if (error instanceof SearchSocketError && error.status === 404) {
19020
+ return resolve(event);
19021
+ }
19022
+ throw error;
19023
+ }
19024
+ }
19025
+ if (mcpPath && event.url.pathname === mcpPath) {
19026
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
19027
+ }
17891
19028
  const targetPath = apiPath ?? config.api.path;
17892
- if (event.url.pathname !== targetPath) {
19029
+ if (!isApiPath(event.url.pathname, targetPath)) {
17893
19030
  return resolve(event);
17894
19031
  }
17895
- if (event.request.method === "OPTIONS") {
19032
+ const subPath = event.url.pathname.slice(targetPath.length);
19033
+ const method = event.request.method;
19034
+ if (method === "OPTIONS") {
17896
19035
  return new Response(null, {
17897
19036
  status: 204,
17898
19037
  headers: buildCorsHeaders(event.request, config)
17899
19038
  });
17900
19039
  }
17901
- if (event.request.method !== "POST") {
17902
- return withCors(
17903
- new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
17904
- status: 405,
17905
- headers: {
17906
- "content-type": "application/json"
17907
- }
17908
- }),
17909
- event.request,
17910
- config
17911
- );
17912
- }
17913
- const contentLength = Number(event.request.headers.get("content-length") ?? 0);
17914
- if (contentLength > bodyLimit) {
17915
- return withCors(
17916
- new Response(
17917
- JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
17918
- {
17919
- status: 413,
17920
- headers: {
17921
- "content-type": "application/json"
17922
- }
17923
- }
17924
- ),
17925
- event.request,
17926
- config
17927
- );
17928
- }
17929
19040
  if (rateLimiter) {
17930
19041
  const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
17931
19042
  if (!rateLimiter.check(ip)) {
@@ -17945,39 +19056,32 @@ function searchsocketHandle(options = {}) {
17945
19056
  }
17946
19057
  }
17947
19058
  try {
17948
- let rawBody;
17949
- if (typeof event.request.text === "function") {
17950
- rawBody = await event.request.text();
17951
- } else {
17952
- let parsedFallback;
17953
- try {
17954
- parsedFallback = await event.request.json();
17955
- } catch (error) {
17956
- if (error instanceof SyntaxError) {
17957
- throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
17958
- }
17959
- throw error;
19059
+ if (method === "GET") {
19060
+ if (subPath === "" || subPath === "/") {
19061
+ return await handleGetSearch(event, config, getEngine);
17960
19062
  }
17961
- rawBody = JSON.stringify(parsedFallback);
17962
- }
17963
- if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
17964
- throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
19063
+ if (subPath === "/health") {
19064
+ return await handleGetHealth(event, config, getEngine);
19065
+ }
19066
+ if (subPath.startsWith("/pages/")) {
19067
+ return await handleGetPage(event, config, getEngine, subPath);
19068
+ }
19069
+ return withCors(
19070
+ new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
19071
+ status: 404,
19072
+ headers: { "content-type": "application/json" }
19073
+ }),
19074
+ event.request,
19075
+ config
19076
+ );
17965
19077
  }
17966
- let body;
17967
- try {
17968
- body = JSON.parse(rawBody);
17969
- } catch {
17970
- throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
19078
+ if (method === "POST" && (subPath === "" || subPath === "/")) {
19079
+ return await handlePostSearch(event, config, getEngine, bodyLimit);
17971
19080
  }
17972
- const engine = await getEngine();
17973
- const searchRequest = body;
17974
- const result = await engine.search(searchRequest);
17975
19081
  return withCors(
17976
- new Response(JSON.stringify(result), {
17977
- status: 200,
17978
- headers: {
17979
- "content-type": "application/json"
17980
- }
19082
+ new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
19083
+ status: 405,
19084
+ headers: { "content-type": "application/json" }
17981
19085
  }),
17982
19086
  event.request,
17983
19087
  config
@@ -17998,6 +19102,183 @@ function searchsocketHandle(options = {}) {
17998
19102
  }
17999
19103
  };
18000
19104
  }
19105
/**
 * Tells whether a request pathname addresses the API mount point itself or
 * something nested below it.
 *
 * @param {string} pathname - Pathname of the incoming request.
 * @param {string} apiPath - Configured mount path of the API.
 * @returns {boolean} True for an exact match or for any path under `apiPath + "/"`.
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  // Require the separator so that "/api/searchx" is not treated as "/api/search".
  const nestedPrefix = apiPath + "/";
  return pathname.startsWith(nestedPrefix);
}
19108
/**
 * Handles `GET <apiPath>?q=...` by translating query-string parameters into a
 * search request and returning the engine result as JSON.
 *
 * Recognised parameters: `q` (required), `topK`, `scope`, `pathPrefix`,
 * `groupBy` ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`.
 *
 * @param {{ url: URL, request: Request }} event - Request event; only `url.searchParams` and `request` are read.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<{ search: Function }>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when a parameter is missing or malformed.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;
  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  // Number.parseInt silently accepts trailing garbage ("5abc" -> 5, "1.5" -> 1),
  // so the raw text is required to be digits only before it is converted.
  const readPositiveInt = (name, message, max = Number.MAX_SAFE_INTEGER) => {
    const raw = params.get(name);
    if (raw === null) return void 0;
    const text = raw.trim();
    const parsed = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed < 1 || parsed > max) {
      throw new SearchSocketError("INVALID_REQUEST", message, 400);
    }
    return parsed;
  };

  const topK = readPositiveInt("topK", "topK must be a positive integer");
  if (topK !== void 0) searchRequest.topK = topK;

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  // An absent or empty groupBy is ignored rather than rejected.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = readPositiveInt(
    "maxSubResults",
    "maxSubResults must be a positive integer between 1 and 20",
    20
  );
  if (maxSubResults !== void 0) searchRequest.maxSubResults = maxSubResults;

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
19155
/**
 * Handles `GET <apiPath>/health`: asks the engine for its health report and
 * returns it as a JSON response.
 *
 * @param {{ request: Request }} event - Request event; `request` is forwarded to `withCors`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<{ health: Function }>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 */
async function handleGetHealth(event, config, getEngine) {
  const engine = await getEngine();
  const report = await engine.health();
  const jsonHeaders = { "content-type": "application/json" };
  const response = new Response(JSON.stringify(report), { status: 200, headers: jsonHeaders });
  return withCors(response, event.request, config);
}
19167
/**
 * Handles `GET <apiPath>/pages/<path>`: decodes the page path that follows the
 * `/pages` prefix and returns the engine's page payload as JSON.
 *
 * @param {{ url: URL, request: Request }} event - Request event; the optional `scope` query parameter is read from `url`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<{ getPage: Function }>} getEngine - Lazily resolves the search engine.
 * @param {string} subPath - Request path relative to the API mount point, starting with "/pages".
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  const encodedPath = subPath.slice("/pages".length);
  const decodePagePath = () => {
    try {
      return decodeURIComponent(encodedPath);
    } catch {
      throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
    }
  };
  const pagePath = decodePagePath();

  // A missing scope parameter (null) is passed to the engine as undefined.
  const scopeParam = event.url.searchParams?.get("scope");
  const scope = scopeParam == null ? void 0 : scopeParam;

  const engine = await getEngine();
  const page = await engine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19187
/**
 * Handles `POST <apiPath>`: reads the JSON request body, enforces the size
 * limit, and returns the engine's search result as JSON.
 *
 * The declared `content-length` is checked first so oversized requests can be
 * rejected before reading; the byte length of the body actually read is
 * checked again afterwards.
 *
 * @param {{ request: Request }} event - Request event carrying the body.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<{ search: Function }>} getEngine - Lazily resolves the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 * @throws {SearchSocketError} INVALID_REQUEST with status 413 (too large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const { request } = event;
  const tooLarge = () => new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  const malformed = () => new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);

  const declaredLength = Number(request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw tooLarge();
  }

  // Request-like objects without text() are read through json() and
  // re-serialised so the same size check and parse path applies to both.
  const readRawBody = async () => {
    if (typeof request.text === "function") {
      return request.text();
    }
    try {
      return JSON.stringify(await request.json());
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw malformed();
      }
      throw error;
    }
  };
  const rawBody = await readRawBody();

  if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
    throw tooLarge();
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(rawBody);
  } catch {
    throw malformed();
  }

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, request, config);
}
19228
/**
 * Serves one MCP request over the web-standard streamable HTTP transport.
 *
 * When `apiKey` is set, the request must carry `Authorization: Bearer <apiKey>`;
 * otherwise a JSON-RPC 401 error response is returned. A fresh transport and
 * server are created for every request (no session id generator is supplied).
 *
 * @param {{ request: Request }} event - Request event.
 * @param {string | undefined} apiKey - Expected bearer token; falsy disables the check.
 * @param {boolean} enableJsonResponse - Forwarded to the transport; when true the transport and server are closed once the response is available.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} The transport's response, or a JSON-RPC error response (401 / 500).
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Compare fixed-size digests: timingSafeEqual needs equal-length inputs, and
    // an explicit length pre-check would reveal the key length through timing.
    const digest = (value) => crypto.createHash("sha256").update(value, "utf8").digest();
    if (!crypto.timingSafeEqual(digest(token), digest(apiKey))) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }
  const transport = new webStandardStreamableHttp_js.WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  // Best-effort teardown: a failure while closing must not replace a response
  // that was already produced, nor mask the original error.
  const closeQuietly = async () => {
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
  };
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      // Without JSON responses the transport is left open here
      // (presumably because the response is still streaming - TODO confirm).
      await closeQuietly();
    }
    return response;
  } catch (error) {
    await closeQuietly();
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status: 500, headers: { "content-type": "application/json" } }
    );
  }
}
18001
19282
  function buildCorsHeaders(request, config) {
18002
19283
  const allowOrigins = config.api.cors.allowOrigins;
18003
19284
  if (!allowOrigins || allowOrigins.length === 0) {
@@ -18010,7 +19291,7 @@ function buildCorsHeaders(request, config) {
18010
19291
  }
18011
19292
  return {
18012
19293
  "access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
18013
- "access-control-allow-methods": "POST, OPTIONS",
19294
+ "access-control-allow-methods": "GET, POST, OPTIONS",
18014
19295
  "access-control-allow-headers": "content-type"
18015
19296
  };
18016
19297
  }
@@ -18057,6 +19338,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
18057
19338
  if (normalizeText(current.text)) {
18058
19339
  sections.push({
18059
19340
  sectionTitle: current.sectionTitle,
19341
+ headingLevel: current.headingLevel,
18060
19342
  headingPath: current.headingPath,
18061
19343
  text: current.text.trim()
18062
19344
  });
@@ -18075,6 +19357,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
18075
19357
  headingStack.length = level;
18076
19358
  current = {
18077
19359
  sectionTitle: title,
19360
+ headingLevel: level,
18078
19361
  headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
18079
19362
  text: `${line}
18080
19363
  `
@@ -18210,6 +19493,7 @@ function splitSection(section, config) {
18210
19493
  return [
18211
19494
  {
18212
19495
  sectionTitle: section.sectionTitle,
19496
+ headingLevel: section.headingLevel,
18213
19497
  headingPath: section.headingPath,
18214
19498
  chunkText: text
18215
19499
  }
@@ -18260,6 +19544,7 @@ ${chunk}`;
18260
19544
  }
18261
19545
  return merged.map((chunkText) => ({
18262
19546
  sectionTitle: section.sectionTitle,
19547
+ headingLevel: section.headingLevel,
18263
19548
  headingPath: section.headingPath,
18264
19549
  chunkText
18265
19550
  }));
@@ -18275,6 +19560,18 @@ function buildSummaryChunkText(page) {
18275
19560
  }
18276
19561
  return parts.join("\n\n");
18277
19562
  }
19563
/**
 * Builds the heading-aware title string for a chunk, in the form
 * "<page title> — <heading trail>".
 *
 * @param {{ title: string, sectionTitle?: string, headingLevel?: number, headingPath: string[] }} chunk
 * @returns {string | undefined} Undefined when the chunk has no section title
 *   or no heading level; otherwise the composed title.
 */
function buildEmbeddingTitle(chunk) {
  const lacksHeading = !chunk.sectionTitle || chunk.headingLevel === void 0;
  if (lacksHeading) return void 0;

  const trail = chunk.headingPath;
  let headingPart;
  if (trail.length <= 1) {
    headingPart = chunk.sectionTitle;
  } else {
    const joinedTrail = trail.join(" > ");
    // Append the section title only when the trail does not already end with it.
    const endsWithSection = trail[trail.length - 1] === chunk.sectionTitle;
    headingPart = endsWithSection ? joinedTrail : `${joinedTrail} > ${chunk.sectionTitle}`;
  }
  return `${chunk.title} \u2014 ${headingPart}`;
}
18278
19575
  function buildEmbeddingText(chunk, prependTitle) {
18279
19576
  if (!prependTitle) return chunk.chunkText;
18280
19577
  const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
@@ -18305,10 +19602,14 @@ function chunkPage(page, config, scope) {
18305
19602
  tags: page.tags,
18306
19603
  contentHash: "",
18307
19604
  description: page.description,
18308
- keywords: page.keywords
19605
+ keywords: page.keywords,
19606
+ publishedAt: page.publishedAt,
19607
+ incomingAnchorText: page.incomingAnchorText,
19608
+ meta: page.meta
18309
19609
  };
18310
19610
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
18311
- summaryChunk.contentHash = sha256(normalizeText(embeddingText));
19611
+ const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
19612
+ summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
18312
19613
  chunks.push(summaryChunk);
18313
19614
  }
18314
19615
  const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
@@ -18325,6 +19626,7 @@ function chunkPage(page, config, scope) {
18325
19626
  path: page.url,
18326
19627
  title: page.title,
18327
19628
  sectionTitle: entry.sectionTitle,
19629
+ headingLevel: entry.headingLevel,
18328
19630
  headingPath: entry.headingPath,
18329
19631
  chunkText: entry.chunkText,
18330
19632
  snippet: toSnippet(entry.chunkText),
@@ -18334,10 +19636,16 @@ function chunkPage(page, config, scope) {
18334
19636
  tags: page.tags,
18335
19637
  contentHash: "",
18336
19638
  description: page.description,
18337
- keywords: page.keywords
19639
+ keywords: page.keywords,
19640
+ publishedAt: page.publishedAt,
19641
+ incomingAnchorText: page.incomingAnchorText,
19642
+ meta: page.meta
18338
19643
  };
18339
19644
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
18340
- chunk.contentHash = sha256(normalizeText(embeddingText));
19645
+ const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
19646
+ const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
19647
+ const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
19648
+ chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
18341
19649
  chunks.push(chunk);
18342
19650
  }
18343
19651
  return chunks;
@@ -19170,6 +20478,69 @@ function gfm(turndownService) {
19170
20478
  }
19171
20479
 
19172
20480
  // src/indexing/extractor.ts
20481
/**
 * Converts a date-like value to a millisecond timestamp.
 *
 * Accepts `Date` instances, strings (parsed with the `Date` constructor) and
 * numbers (returned as-is). Anything else, and any non-finite result, yields
 * undefined.
 *
 * @param {unknown} value
 * @returns {number | undefined}
 */
function normalizeDateToMs(value) {
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else if (typeof value === "string") {
    ms = new Date(value).getTime();
  } else if (typeof value === "number") {
    ms = value;
  } else {
    // null, undefined and every other type carry no usable date.
    return void 0;
  }
  return Number.isFinite(ms) ? ms : void 0;
}
20496
// Frontmatter keys consulted for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];

/**
 * Returns the first usable timestamp found among the known frontmatter date
 * fields, checked in the order listed in FRONTMATTER_DATE_FIELDS.
 *
 * @param {Record<string, unknown>} data - Parsed frontmatter.
 * @returns {number | undefined} Millisecond timestamp, or undefined if no field normalises to one.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let index = 0; index < FRONTMATTER_DATE_FIELDS.length; index += 1) {
    const timestamp = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[index]]);
    if (timestamp !== void 0) {
      return timestamp;
    }
  }
  return void 0;
}
20504
/**
 * Extracts a publication timestamp from a parsed HTML document.
 *
 * Sources are tried in this order and the first one that normalises to a
 * timestamp wins:
 *   1. `datePublished` in JSON-LD scripts (top-level objects, array entries,
 *      and entries of their `@graph` arrays),
 *   2. `<meta property="article:published_time">`,
 *   3. `<meta itemprop="datePublished">` / `<time itemprop="datePublished">`,
 *   4. the first `<time datetime>` element.
 *
 * @param {Function} $ - Cheerio-style query function for the document.
 * @returns {number | undefined} Millisecond timestamp, or undefined when nothing usable is found.
 */
function extractPublishedAtFromHtml($) {
  // Flattens one parsed JSON-LD payload into the objects worth inspecting.
  // Non-object entries (null, strings, numbers) are dropped so that a single
  // odd entry cannot abort the scan of the remaining candidates.
  const collectJsonLdNodes = (parsed) => {
    const roots = Array.isArray(parsed) ? parsed : [parsed];
    const nodes = [];
    for (const root of roots) {
      if (!root || typeof root !== "object") continue;
      nodes.push(root);
      if (Array.isArray(root["@graph"])) {
        nodes.push(...root["@graph"]);
      }
    }
    return nodes;
  };

  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      for (const candidate of collectJsonLdNodes(JSON.parse(raw))) {
        const val = normalizeDateToMs(candidate?.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Invalid JSON-LD is ignored on purpose; later scripts and the
      // meta/time fallbacks may still provide a date.
    }
  }

  const fallbackReaders = [
    () => $('meta[property="article:published_time"]').attr("content")?.trim(),
    () => $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim(),
    () => $("time[datetime]").first().attr("datetime")?.trim()
  ];
  for (const read of fallbackReaders) {
    const text = read();
    if (!text) continue;
    const val = normalizeDateToMs(text);
    if (val !== void 0) return val;
  }
  return void 0;
}
19173
20544
  function hasTopLevelNoindexComment(markdown) {
19174
20545
  const lines = markdown.split(/\r?\n/);
19175
20546
  let inFence = false;
@@ -19185,6 +20556,97 @@ function hasTopLevelNoindexComment(markdown) {
19185
20556
  }
19186
20557
  return false;
19187
20558
  }
20559
// Generic single-word alt texts that say nothing about the image content.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);
// Matches text ending in an image file extension, optionally followed by a query string.
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;

/**
 * Decides whether an image's alt text is descriptive enough to keep.
 *
 * Rejected: text shorter than five characters after trimming, text that looks
 * like an image file name, and the generic words in GARBAGE_ALT_WORDS
 * (compared case-insensitively).
 *
 * @param {string} alt - Raw alt attribute value.
 * @returns {boolean}
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  // The length check also covers the empty string.
  const tooShort = text.length < 5;
  const looksLikeFileName = !tooShort && IMAGE_EXT_RE.test(text);
  const isGenericWord = !tooShort && !looksLikeFileName && GARBAGE_ALT_WORDS.has(text.toLowerCase());
  return !(tooShort || looksLikeFileName || isGenericWord);
}
20585
/**
 * Picks the text that should stand in for an image.
 *
 * Precedence: the description attribute on the image, then the same attribute
 * on the enclosing `<figure>`, then meaningful alt text (joined with the
 * figure caption when both exist), then the caption alone.
 *
 * @param {object} img - Cheerio-style wrapper around the `<img>` element.
 * @param {Function} $ - Cheerio-style query function (not used by this function).
 * @param {string} imageDescAttr - Name of the explicit description attribute.
 * @returns {string | null} Replacement text, or null when nothing usable exists.
 */
function resolveImageText(img, $, imageDescAttr) {
  const explicitOnImage = img.attr(imageDescAttr)?.trim();
  if (explicitOnImage) return explicitOnImage;

  const figure = img.closest("figure");
  const insideFigure = Boolean(figure.length);
  if (insideFigure) {
    const explicitOnFigure = figure.attr(imageDescAttr)?.trim();
    if (explicitOnFigure) return explicitOnFigure;
  }

  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  const altIsUseful = isMeaningfulAlt(alt);

  if (altIsUseful) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption ? caption : null;
}
20606
// Link labels that carry no information about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);

/**
 * Normalises link text: collapses whitespace, trims, and lower-cases it.
 *
 * @param {string} raw - Text content of an anchor.
 * @returns {string} The normalised text capped at 100 characters, or "" when
 *   it is shorter than three characters or is one of STOP_ANCHORS.
 */
function normalizeAnchorText(raw) {
  const collapsed = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const uninformative = collapsed.length < 3 || STOP_ANCHORS.has(collapsed);
  if (uninformative) {
    return "";
  }
  return collapsed.length > 100 ? collapsed.slice(0, 100) : collapsed;
}
20622
/**
 * Escapes the characters `&`, `<` and `>` so text can be placed inside HTML
 * element content. Quotes are left untouched.
 *
 * @param {string} text
 * @returns {string}
 */
function escapeHtml(text) {
  const entityFor = { "&": "&amp;", "<": "&lt;", ">": "&gt;" };
  return text.replace(/[&<>]/g, (ch) => entityFor[ch]);
}
20625
/**
 * Rewrites images under `root2` before further extraction: every `<picture>`
 * and then every remaining `<img>` is replaced by a `<span>` holding its
 * resolved text, or removed when no text can be resolved.
 *
 * When a replacement happens inside a `<figure>`, that figure's `<figcaption>`
 * elements are removed as well.
 *
 * @param {object} root2 - Cheerio-style selection to search within.
 * @param {Function} $ - Cheerio-style query function.
 * @param {string} imageDescAttr - Name of the explicit description attribute.
 * @returns {void}
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Shared tail of both passes: swap `node` for its text or drop it.
  const substitute = (node, describe) => {
    const enclosingFigure = node.closest("figure");
    const text = describe();
    if (!text) {
      node.remove();
      return;
    }
    if (enclosingFigure.length) enclosingFigure.find("figcaption").remove();
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };

  // Pictures first, so the images they wrap are gone before the <img> pass.
  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    const innerImg = picture.find("img").first();
    substitute(picture, () => (innerImg.length ? resolveImageText(innerImg, $, imageDescAttr) : null));
  });

  root2.find("img").each((_i, el) => {
    const img = $(el);
    substitute(img, () => resolveImageText(img, $, imageDescAttr));
  });
}
19188
20650
  function extractFromHtml(url, html, config) {
19189
20651
  const $ = cheerio.load(html);
19190
20652
  const normalizedUrl = normalizeUrlPath(url);
@@ -19210,6 +20672,20 @@ function extractFromHtml(url, html, config) {
19210
20672
  if (weight === 0) {
19211
20673
  return null;
19212
20674
  }
20675
+ if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
20676
+ return null;
20677
+ }
20678
+ const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
20679
+ const meta = {};
20680
+ $('meta[name^="searchsocket:"]').each((_i, el) => {
20681
+ const name = $(el).attr("name") ?? "";
20682
+ const key = name.slice("searchsocket:".length);
20683
+ if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
20684
+ const content = $(el).attr("content") ?? "";
20685
+ const dataType = $(el).attr("data-type") ?? "string";
20686
+ meta[key] = parseMetaValue(content, dataType);
20687
+ });
20688
+ const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
19213
20689
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19214
20690
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19215
20691
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19221,7 +20697,9 @@ function extractFromHtml(url, html, config) {
19221
20697
  root2.find(selector).remove();
19222
20698
  }
19223
20699
  root2.find(`[${config.extract.ignoreAttr}]`).remove();
20700
+ preprocessImages(root2, $, config.extract.imageDescAttr);
19224
20701
  const outgoingLinks = [];
20702
+ const seenLinkKeys = /* @__PURE__ */ new Set();
19225
20703
  root2.find("a[href]").each((_index, node) => {
19226
20704
  const href = $(node).attr("href");
19227
20705
  if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
@@ -19232,7 +20710,19 @@ function extractFromHtml(url, html, config) {
19232
20710
  if (!["http:", "https:"].includes(parsed.protocol)) {
19233
20711
  return;
19234
20712
  }
19235
- outgoingLinks.push(normalizeUrlPath(parsed.pathname));
20713
+ const url2 = normalizeUrlPath(parsed.pathname);
20714
+ let anchorText = normalizeAnchorText($(node).text());
20715
+ if (!anchorText) {
20716
+ const imgAlt = $(node).find("img").first().attr("alt") ?? "";
20717
+ if (isMeaningfulAlt(imgAlt)) {
20718
+ anchorText = normalizeAnchorText(imgAlt);
20719
+ }
20720
+ }
20721
+ const key = `${url2}|${anchorText}`;
20722
+ if (!seenLinkKeys.has(key)) {
20723
+ seenLinkKeys.add(key);
20724
+ outgoingLinks.push({ url: url2, anchorText });
20725
+ }
19236
20726
  } catch {
19237
20727
  }
19238
20728
  });
@@ -19257,16 +20747,25 @@ function extractFromHtml(url, html, config) {
19257
20747
  return null;
19258
20748
  }
19259
20749
  const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
20750
+ const publishedAt = extractPublishedAtFromHtml($);
20751
+ if (componentTags) {
20752
+ const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
20753
+ for (const t of extraTags) {
20754
+ if (!tags.includes(t)) tags.push(t);
20755
+ }
20756
+ }
19260
20757
  return {
19261
20758
  url: normalizeUrlPath(url),
19262
20759
  title,
19263
20760
  markdown,
19264
- outgoingLinks: [...new Set(outgoingLinks)],
20761
+ outgoingLinks,
19265
20762
  noindex: false,
19266
20763
  tags,
19267
20764
  description,
19268
20765
  keywords,
19269
- weight
20766
+ weight,
20767
+ publishedAt,
20768
+ meta: Object.keys(meta).length > 0 ? meta : void 0
19270
20769
  };
19271
20770
  }
19272
20771
  function extractFromMarkdown(url, markdown, title) {
@@ -19287,6 +20786,24 @@ function extractFromMarkdown(url, markdown, title) {
19287
20786
  if (mdWeight === 0) {
19288
20787
  return null;
19289
20788
  }
20789
+ let mdMeta;
20790
+ const rawMeta = searchsocketMeta?.meta;
20791
+ if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
20792
+ const metaObj = {};
20793
+ for (const [key, val] of Object.entries(rawMeta)) {
20794
+ if (!validateMetaKey(key)) continue;
20795
+ if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
20796
+ metaObj[key] = val;
20797
+ } else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
20798
+ metaObj[key] = val;
20799
+ } else if (val instanceof Date) {
20800
+ metaObj[key] = val.getTime();
20801
+ }
20802
+ }
20803
+ if (Object.keys(metaObj).length > 0) {
20804
+ mdMeta = metaObj;
20805
+ }
20806
+ }
19290
20807
  const content = parsed.content;
19291
20808
  const normalized = normalizeMarkdown(content);
19292
20809
  if (!normalizeText(normalized)) {
@@ -19301,6 +20818,7 @@ function extractFromMarkdown(url, markdown, title) {
19301
20818
  fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
19302
20819
  }
19303
20820
  if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
20821
+ const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
19304
20822
  return {
19305
20823
  url: normalizeUrlPath(url),
19306
20824
  title: resolvedTitle,
@@ -19310,7 +20828,9 @@ function extractFromMarkdown(url, markdown, title) {
19310
20828
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19311
20829
  description: fmDescription,
19312
20830
  keywords: fmKeywords,
19313
- weight: mdWeight
20831
+ weight: mdWeight,
20832
+ publishedAt,
20833
+ meta: mdMeta
19314
20834
  };
19315
20835
  }
19316
20836
  function segmentToRegex(segment) {
@@ -19473,7 +20993,7 @@ async function parseManifest(cwd, outputDir) {
19473
20993
  const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
19474
20994
  let content;
19475
20995
  try {
19476
- content = await fs3__default.default.readFile(manifestPath, "utf8");
20996
+ content = await fs9__default.default.readFile(manifestPath, "utf8");
19477
20997
  } catch {
19478
20998
  throw new SearchSocketError(
19479
20999
  "BUILD_MANIFEST_NOT_FOUND",
@@ -19784,6 +21304,125 @@ function filePathToUrl(filePath, baseDir) {
19784
21304
  const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
19785
21305
  return normalizeUrlPath(noExt || "/");
19786
21306
  }
21307
// Matches route files: +page / +layout / +error, with an optional "@..." segment, ending in ".svelte".
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;

/**
 * Tells whether a path points at a `.svelte` file that is not a route file
 * (i.e. not matched by ROUTE_FILE_RE).
 *
 * @param {string} filePath
 * @returns {boolean}
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
21312
/**
 * Locates a `let { ... }: { ... } = $props()` declaration whose type is an
 * inline object type, using brace matching so the destructure block and the
 * type annotation are separated correctly.
 *
 * Only destructure blocks without nested braces are recognised here; other
 * shapes are left to the general regex in extractSvelteComponentMeta.
 *
 * @param {string} source - Svelte component source.
 * @returns {{ destructureBlock: string, typeAnnotation: string } | undefined}
 */
function matchInlineTypedProps(source) {
  const headRe = /let\s+\{([^{}]*)\}\s*:\s*\{/g;
  let head = headRe.exec(source);
  while (head !== null) {
    const typeStart = head.index + head[0].length - 1;
    let depth = 0;
    let end = -1;
    for (let i = typeStart; i < source.length; i++) {
      if (source[i] === "{") {
        depth++;
      } else if (source[i] === "}") {
        depth--;
        if (depth === 0) {
          end = i;
          break;
        }
      }
    }
    if (end !== -1 && /^\s*=\s*\$props\(\)/.test(source.slice(end + 1))) {
      return {
        destructureBlock: head[1],
        typeAnnotation: source.slice(typeStart, end + 1)
      };
    }
    head = headRe.exec(source);
  }
  return void 0;
}

/**
 * Extracts documentation metadata from a Svelte component's source: the text
 * of its `<!-- @component ... -->` comment and the props destructured from
 * `$props()`, with their types and default values where they can be resolved.
 *
 * Prop types come from a named `interface`/`type` declared in the same source
 * or from an inline object type annotation. Rest elements (`...rest`) are
 * skipped.
 *
 * @param {string} source - Svelte component source.
 * @returns {{ description: string | undefined, props: Array<{ name: string, type?: string, default?: string }> }}
 */
function extractSvelteComponentMeta(source) {
  const componentMatch = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
  const description = componentMatch?.[1]?.trim() || void 0;

  let destructureBlock;
  let typeAnnotation;
  // The general regex below cannot capture an annotation that starts with "{";
  // for inline object types it would swallow the type into the destructure
  // block, so that shape is matched separately first.
  const inlineTyped = matchInlineTypedProps(source);
  if (inlineTyped) {
    destructureBlock = inlineTyped.destructureBlock;
    typeAnnotation = inlineTyped.typeAnnotation;
  } else {
    const propsMatch = source.match(
      /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
    );
    if (propsMatch) {
      destructureBlock = propsMatch[1];
      typeAnnotation = propsMatch[2]?.trim();
    }
  }

  const props = [];
  if (destructureBlock !== void 0) {
    let resolvedTypeMap;
    if (typeAnnotation && /^[A-Z]\w*$/.test(typeAnnotation)) {
      resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
    } else if (typeAnnotation && typeAnnotation.startsWith("{")) {
      resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
    }
    const propEntries = splitDestructureBlock(destructureBlock);
    for (const entry of propEntries) {
      const trimmed = entry.trim();
      if (!trimmed || trimmed.startsWith("...")) continue;
      let propName;
      let defaultValue;
      // "name: alias" or "name: alias = default" - the public prop name is the left side.
      const renameMatch = trimmed.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/);
      if (renameMatch) {
        propName = renameMatch[1];
        defaultValue = renameMatch[2]?.trim();
      } else {
        const defaultMatch = trimmed.match(/^(\w+)\s*=\s*([\s\S]+)$/);
        if (defaultMatch) {
          propName = defaultMatch[1];
          defaultValue = defaultMatch[2]?.trim();
        } else {
          propName = trimmed.match(/^(\w+)/)?.[1] ?? trimmed;
        }
      }
      const propType = resolvedTypeMap?.get(propName);
      props.push({
        name: propName,
        ...propType ? { type: propType } : {},
        ...defaultValue ? { default: defaultValue } : {}
      });
    }
  }
  return { description, props };
}
21357
/**
 * Splits the inside of a destructuring pattern into its top-level entries.
 *
 * Commas nested inside `{}`, `[]` or `()` do not split, and neither do commas
 * or brackets inside string literals (single, double or backtick quoted), so
 * defaults such as `label = "Hello, world"` stay in one entry.
 *
 * If the block contains an unterminated quote (for example an apostrophe in a
 * comment), quote handling is abandoned and the block is split on brackets
 * and commas only.
 *
 * @param {string} block - Text between the outer braces of the pattern.
 * @returns {string[]} Entries in source order, untrimmed; a trailing
 *   whitespace-only entry is dropped.
 */
function splitDestructureBlock(block) {
  const scan = (honorQuotes) => {
    const entries = [];
    let depth = 0;
    let current = "";
    let quote = null;
    let escaped = false;
    for (const ch of block) {
      if (quote !== null) {
        // Inside a string literal: copy verbatim until the matching, unescaped quote.
        current += ch;
        if (escaped) {
          escaped = false;
        } else if (ch === "\\") {
          escaped = true;
        } else if (ch === quote) {
          quote = null;
        }
        continue;
      }
      if (honorQuotes && (ch === '"' || ch === "'" || ch === "`")) {
        quote = ch;
        current += ch;
        continue;
      }
      if (ch === "," && depth === 0) {
        entries.push(current);
        current = "";
        continue;
      }
      if (ch === "{" || ch === "[" || ch === "(") {
        depth++;
      } else if (ch === "}" || ch === "]" || ch === ")") {
        depth--;
      }
      current += ch;
    }
    if (current.trim()) entries.push(current);
    return { entries, unterminated: quote !== null };
  };

  const quoteAware = scan(true);
  return quoteAware.unterminated ? scan(false).entries : quoteAware.entries;
}
21378
/**
 * Finds the declaration of an object type named `typeName` in `source` and
 * returns its members as parsed by `parseTypeMembers`.
 *
 * Recognised declaration heads: `interface Name {`, `interface Name<...> {`,
 * `interface Name extends ... {`, `type Name = {` and `type Name<...> = {`.
 * The body is delimited by brace matching from the opening brace.
 *
 * @param {string} source - Source text to search.
 * @param {string} typeName - Name of the interface or type alias.
 * @returns {Map<string, string> | undefined} Member map, or undefined when no
 *   declaration is found or its braces are unbalanced.
 */
function resolveTypeReference(source, typeName) {
  // Escape the name so it is matched literally even if it contains regex syntax.
  const name = typeName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const generics = "(?:\\s*<[^{}]*>)?";
  const startRe = new RegExp(
    `(?:\\binterface\\s+${name}${generics}(?:\\s+extends\\s+[^{]+?)?\\s*|\\btype\\s+${name}${generics}\\s*=\\s*)\\{`
  );
  const startMatch = source.match(startRe);
  if (!startMatch || startMatch.index === void 0) return void 0;
  const bodyStart = startMatch.index + startMatch[0].length;
  let depth = 1;
  let i = bodyStart;
  while (i < source.length && depth > 0) {
    if (source[i] === "{") depth++;
    else if (source[i] === "}") depth--;
    i++;
  }
  if (depth !== 0) return void 0;
  // `i` is one past the closing brace, so the body ends at i - 1.
  const body = source.slice(bodyStart, i - 1);
  return parseTypeMembers(body);
}
21394
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`
 * by stripping one leading `{` and one trailing `}` and handing the rest to
 * `parseTypeMembers`.
 *
 * @param {string} annotation - Type annotation text.
 * @returns {Map<string, string>} Whatever `parseTypeMembers` returns for the inner text.
 */
function parseInlineTypeAnnotation(annotation) {
  let inner = annotation;
  if (inner.startsWith("{")) {
    inner = inner.slice(1);
  }
  if (inner.endsWith("}")) {
    inner = inner.slice(0, -1);
  }
  return parseTypeMembers(inner);
}
21398
/**
 * Parses the body of an object type into a map from member name to type text.
 *
 * Members are separated by `;` or newlines. Each must look like `name: type`
 * or `name?: type`; anything else is ignored. A trailing comma on the type is
 * removed.
 *
 * @param {string} body - Text between the braces of the type.
 * @returns {Map<string, string>}
 */
function parseTypeMembers(body) {
  const typesByName = new Map();
  for (const piece of body.split(/[;\n]/)) {
    const member = piece.trim();
    if (!member) continue;
    const parts = /^(\w+)\??\s*:\s*(.+)$/.exec(member);
    if (parts === null) continue;
    const [, name, rawType] = parts;
    typesByName.set(name, rawType.replace(/,\s*$/, "").trim());
  }
  return typesByName;
}
21409
/**
 * Renders component metadata as a short text summary:
 * "<Name> component. <description> Props: a (type) default: x, b."
 *
 * @param {string} componentName - Display name of the component.
 * @param {{ description?: string, props: Array<{ name: string, type?: string, default?: string }> }} meta
 * @returns {string} The summary, or "" when there is neither a description nor any prop.
 */
function buildComponentMarkdown(componentName, meta) {
  const hasProps = meta.props.length > 0;
  if (!meta.description && !hasProps) {
    return "";
  }

  const sentences = [`${componentName} component.`];
  if (meta.description) {
    sentences.push(meta.description);
  }
  if (hasProps) {
    const described = [];
    for (const prop of meta.props) {
      const typePart = prop.type ? ` (${prop.type})` : "";
      const defaultPart = prop.default ? ` default: ${prop.default}` : "";
      described.push(`${prop.name}${typePart}${defaultPart}`);
    }
    sentences.push(`Props: ${described.join(", ")}.`);
  }
  return sentences.join(" ");
}
19787
21426
  function normalizeSvelteToMarkdown(source) {
19788
21427
  return source.replace(/<script[\s\S]*?<\/script>/g, "").replace(/<style[\s\S]*?<\/style>/g, "").replace(/<[^>]+>/g, " ").replace(/\{[^}]+\}/g, " ").replace(/\s+/g, " ").trim();
19789
21428
  }
@@ -19802,13 +21441,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
19802
21441
  const selected = typeof limit === "number" ? files.slice(0, limit) : files;
19803
21442
  const pages = [];
19804
21443
  for (const filePath of selected) {
19805
- const raw = await fs3__default.default.readFile(filePath, "utf8");
19806
- const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
21444
+ const raw = await fs9__default.default.readFile(filePath, "utf8");
21445
+ let markdown;
21446
+ let tags;
21447
+ if (filePath.endsWith(".md")) {
21448
+ markdown = raw;
21449
+ } else if (isSvelteComponentFile(filePath)) {
21450
+ const componentName = path__default.default.basename(filePath, ".svelte");
21451
+ const meta = extractSvelteComponentMeta(raw);
21452
+ const componentMarkdown = buildComponentMarkdown(componentName, meta);
21453
+ const templateContent = normalizeSvelteToMarkdown(raw);
21454
+ markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
21455
+ tags = ["component"];
21456
+ } else {
21457
+ markdown = normalizeSvelteToMarkdown(raw);
21458
+ }
19807
21459
  pages.push({
19808
21460
  url: filePathToUrl(filePath, baseDir),
19809
21461
  markdown,
19810
21462
  sourcePath: path__default.default.relative(cwd, filePath).replace(/\\/g, "/"),
19811
- outgoingLinks: []
21463
+ outgoingLinks: [],
21464
+ ...tags ? { tags } : {}
19812
21465
  });
19813
21466
  }
19814
21467
  return pages;
@@ -19938,7 +21591,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
19938
21591
  const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
19939
21592
  const pages = [];
19940
21593
  for (const filePath of selected) {
19941
- const html = await fs3__default.default.readFile(filePath, "utf8");
21594
+ const html = await fs9__default.default.readFile(filePath, "utf8");
19942
21595
  pages.push({
19943
21596
  url: staticHtmlFileToUrl(filePath, outputDir),
19944
21597
  html,
@@ -20001,7 +21654,7 @@ function isBlockedByRobots(urlPath, rules3) {
20001
21654
  }
20002
21655
  async function loadRobotsTxtFromDir(dir) {
20003
21656
  try {
20004
- const content = await fs3__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
21657
+ const content = await fs9__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
20005
21658
  return parseRobotsTxt(content);
20006
21659
  } catch {
20007
21660
  return null;
@@ -20018,6 +21671,81 @@ async function fetchRobotsTxt(baseUrl) {
20018
21671
  return null;
20019
21672
  }
20020
21673
  }
21674
/**
 * Resolves a page URL against an optional base URL.
 *
 * @param {string} pageUrl - URL or path of the page.
 * @param {string} [baseUrl] - Base URL to resolve against; when falsy the page URL is returned as-is.
 * @returns {string} The absolute href, or `pageUrl` unchanged when no base is given
 *   or the combination cannot be parsed as a URL.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  let resolved = pageUrl;
  if (baseUrl) {
    try {
      resolved = new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable combination: keep the original page URL as the fallback.
    }
  }
  return resolved;
}
21682
/**
 * Builds the llms.txt index: a heading, an optional blockquote description and a
 * "Pages" list with one markdown link per page.
 *
 * Pages whose url is "/llms.txt" or "/llms-full.txt" are left out. The remaining pages
 * are ordered by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Pages providing `url`, `title`, `depth`, `incomingLinks`
 *   and optionally `description`.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`, `project.id`
 *   and `project.baseUrl`.
 * @returns {string} The file content, terminated by a newline.
 */
function generateLlmsTxt(pages, config) {
  const { title: configuredTitle, description } = config.llmsTxt;
  const { id: projectId, baseUrl } = config.project;
  const output = [`# ${configuredTitle ?? projectId}`];
  if (description) {
    output.push("", `> ${description}`);
  }
  // filter() already returns a fresh array, so sorting it in place leaves `pages` untouched.
  const ranked = pages
    .filter((p) => !(p.url === "/llms.txt" || p.url === "/llms-full.txt"))
    .sort((a, b) => a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks);
  if (ranked.length > 0) {
    output.push("", "## Pages", "");
    for (const page of ranked) {
      const link = `- [${page.title}](${resolvePageUrl(page.url, baseUrl)})`;
      output.push(page.description ? `${link}: ${page.description}` : link);
    }
  }
  output.push("");
  return output.join("\n");
}
21711
/**
 * Builds the llms-full.txt document: a heading, an optional blockquote description and,
 * for every page, a "---" separator, a linked second-level heading and the page's
 * trimmed markdown.
 *
 * Pages whose url is "/llms.txt" or "/llms-full.txt" are left out. The remaining pages
 * are ordered by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Pages providing `url`, `title`, `depth`, `incomingLinks`
 *   and `markdown`.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`, `project.id`
 *   and `project.baseUrl`.
 * @returns {string} The file content, terminated by a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const { title: configuredTitle, description } = config.llmsTxt;
  const { id: projectId, baseUrl } = config.project;
  const output = [`# ${configuredTitle ?? projectId}`];
  if (description) {
    output.push("", `> ${description}`);
  }
  // filter() already returns a fresh array, so sorting it in place leaves `pages` untouched.
  const ranked = pages
    .filter((p) => !(p.url === "/llms.txt" || p.url === "/llms-full.txt"))
    .sort((a, b) => a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks);
  for (const page of ranked) {
    const heading = `## [${page.title}](${resolvePageUrl(page.url, baseUrl)})`;
    output.push("", "---", "", heading, "", page.markdown.trim());
  }
  output.push("");
  return output.join("\n");
}
21734
/**
 * Writes the llms.txt index and, when `config.llmsTxt.generateFull` is set, the
 * companion "-full" file next to it.
 *
 * The companion file name is derived from the index file name by inserting "-full"
 * before the extension ("llms.txt" -> "llms-full.txt", "llms.md" -> "llms-full.md",
 * "llms" -> "llms-full").
 *
 * @param {Array<object>} pages - Indexed pages passed through to the generators.
 * @param {object} config - Reads `llmsTxt.outputPath` and `llmsTxt.generateFull`; also
 *   forwarded to the generators.
 * @param {string} cwd - Directory that `llmsTxt.outputPath` is resolved against.
 * @param {{ info(message: string): void }} logger3 - Receives one message per file written.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path__default.default.dirname(outputPath);
  await fs9__default.default.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs9__default.default.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (!config.llmsTxt.generateFull) {
    return;
  }
  let fullPath;
  if (outputPath.endsWith(".txt")) {
    fullPath = `${outputPath.slice(0, -".txt".length)}-full.txt`;
  } else {
    // A plain /\.txt$/ replacement is a no-op for any other extension, which would make
    // the full dump overwrite the index that was just written. Insert "-full" before
    // whatever extension (possibly none) the configured path has instead.
    const { dir, name, ext } = path__default.default.parse(outputPath);
    fullPath = path__default.default.join(dir, `${name}-full${ext}`);
  }
  const fullContent = generateLlmsFullTxt(pages, config);
  await fs9__default.default.writeFile(fullPath, fullContent, "utf8");
  const relativeFull = path__default.default.relative(cwd, fullPath);
  logger3.info(`Generated llms-full.txt at ${relativeFull}`);
}
20021
21749
 
20022
21750
  // src/indexing/pipeline.ts
20023
21751
  function buildPageSummary(page, maxChars = 3500) {
@@ -20036,16 +21764,33 @@ function buildPageSummary(page, maxChars = 3500) {
20036
21764
  if (joined.length <= maxChars) return joined;
20037
21765
  return joined.slice(0, maxChars).trim();
20038
21766
  }
21767
/**
 * Computes the hash used to decide whether a page changed since the previous index run.
 *
 * The hash input is the "|"-joined sequence of: title, description, sorted keywords,
 * sorted tags, markdown, outgoing link count, publishedAt, incoming anchor text, sorted
 * outgoing link URLs and a key-order-independent JSON serialization of `meta`.
 *
 * @param {object} page - Indexed page; `tags` must be an array, while `description`,
 *   `keywords`, `publishedAt`, `incomingAnchorText`, `outgoingLinkUrls` and `meta` are optional.
 * @returns {string} Result of `sha256` over the joined parts.
 */
function buildPageContentHash(page) {
  // Sort keys through a replacer *function* so ordering is canonical at every nesting
  // level. A replacer *array* (e.g. the sorted top-level keys) also acts as an allow-list
  // for nested objects, which silently drops nested properties from the hash input and
  // hides changes to them. For flat meta objects both approaches serialize identically.
  const sortObjectKeys = (_key, value) => {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      return value;
    }
    return Object.fromEntries(Object.keys(value).sort().map((key) => [key, value[key]]));
  };
  const parts = [
    page.title,
    page.description ?? "",
    (page.keywords ?? []).slice().sort().join(","),
    page.tags.slice().sort().join(","),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    (page.outgoingLinkUrls ?? []).slice().sort().join(","),
    page.meta ? JSON.stringify(page.meta, sortObjectKeys) : ""
  ];
  return sha256(parts.join("|"));
}
20039
21782
  var IndexPipeline = class _IndexPipeline {
20040
21783
  cwd;
20041
21784
  config;
20042
21785
  store;
20043
21786
  logger;
21787
+ hooks;
20044
21788
  constructor(options) {
20045
21789
  this.cwd = options.cwd;
20046
21790
  this.config = options.config;
20047
21791
  this.store = options.store;
20048
21792
  this.logger = options.logger;
21793
+ this.hooks = options.hooks;
20049
21794
  }
20050
21795
  static async create(options = {}) {
20051
21796
  const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
@@ -20055,7 +21800,8 @@ var IndexPipeline = class _IndexPipeline {
20055
21800
  cwd,
20056
21801
  config,
20057
21802
  store,
20058
- logger: options.logger ?? new Logger()
21803
+ logger: options.logger ?? new Logger(),
21804
+ hooks: options.hooks ?? {}
20059
21805
  });
20060
21806
  }
20061
21807
  getConfig() {
@@ -20076,7 +21822,7 @@ var IndexPipeline = class _IndexPipeline {
20076
21822
  const scope = resolveScope(this.config, options.scopeOverride);
20077
21823
  ensureStateDirs(this.cwd, this.config.state.dir);
20078
21824
  const sourceMode = options.sourceOverride ?? this.config.source.mode;
20079
- this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
21825
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
20080
21826
  if (options.force) {
20081
21827
  this.logger.info("Force mode enabled \u2014 full rebuild");
20082
21828
  }
@@ -20084,9 +21830,9 @@ var IndexPipeline = class _IndexPipeline {
20084
21830
  this.logger.info("Dry run \u2014 no writes will be performed");
20085
21831
  }
20086
21832
  const manifestStart = stageStart();
20087
- const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
21833
+ const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
20088
21834
  stageEnd("manifest", manifestStart);
20089
- this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
21835
+ this.logger.debug(`Manifest: ${existingPageHashes.size} existing page hashes loaded`);
20090
21836
  const sourceStart = stageStart();
20091
21837
  this.logger.info(`Loading pages (source: ${sourceMode})...`);
20092
21838
  let sourcePages;
@@ -20163,11 +21909,61 @@ var IndexPipeline = class _IndexPipeline {
20163
21909
  );
20164
21910
  continue;
20165
21911
  }
20166
- extractedPages.push(extracted);
21912
+ if (sourcePage.tags && sourcePage.tags.length > 0) {
21913
+ extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
21914
+ }
21915
+ let accepted;
21916
+ if (this.hooks.transformPage) {
21917
+ const transformed = await this.hooks.transformPage(extracted);
21918
+ if (transformed === null) {
21919
+ this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
21920
+ continue;
21921
+ }
21922
+ accepted = transformed;
21923
+ } else {
21924
+ accepted = extracted;
21925
+ }
21926
+ extractedPages.push(accepted);
20167
21927
  this.logger.event("page_extracted", {
20168
- url: extracted.url
21928
+ url: accepted.url
20169
21929
  });
20170
21930
  }
21931
+ const customRecords = options.customRecords ?? [];
21932
+ if (customRecords.length > 0) {
21933
+ this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
21934
+ for (const record of customRecords) {
21935
+ const normalizedUrl = normalizeUrlPath(record.url);
21936
+ const normalized = normalizeMarkdown(record.content);
21937
+ if (!normalized.trim()) {
21938
+ this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
21939
+ continue;
21940
+ }
21941
+ const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
21942
+ const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
21943
+ const extracted = {
21944
+ url: normalizedUrl,
21945
+ title: record.title,
21946
+ markdown: normalized,
21947
+ outgoingLinks: [],
21948
+ noindex: false,
21949
+ tags,
21950
+ weight: record.weight
21951
+ };
21952
+ let accepted;
21953
+ if (this.hooks.transformPage) {
21954
+ const transformed = await this.hooks.transformPage(extracted);
21955
+ if (transformed === null) {
21956
+ this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
21957
+ continue;
21958
+ }
21959
+ accepted = transformed;
21960
+ } else {
21961
+ accepted = extracted;
21962
+ }
21963
+ extractedPages.push(accepted);
21964
+ this.logger.event("page_extracted", { url: accepted.url, custom: true });
21965
+ }
21966
+ }
20171
21967
  extractedPages.sort((a, b) => a.url.localeCompare(b.url));
20172
21968
  const uniquePages = [];
20173
21969
  const seenUrls = /* @__PURE__ */ new Set();
@@ -20200,15 +21996,28 @@ var IndexPipeline = class _IndexPipeline {
20200
21996
  const linkStart = stageStart();
20201
21997
  const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20202
21998
  const incomingLinkCount = /* @__PURE__ */ new Map();
21999
+ const incomingAnchorTexts = /* @__PURE__ */ new Map();
20203
22000
  for (const page of indexablePages) {
20204
22001
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20205
22002
  }
20206
22003
  for (const page of indexablePages) {
20207
- for (const outgoing of page.outgoingLinks) {
22004
+ const seenForCount = /* @__PURE__ */ new Set();
22005
+ const seenForAnchor = /* @__PURE__ */ new Set();
22006
+ for (const { url: outgoing, anchorText } of page.outgoingLinks) {
20208
22007
  if (!pageSet.has(outgoing)) {
20209
22008
  continue;
20210
22009
  }
20211
- incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
22010
+ if (!seenForCount.has(outgoing)) {
22011
+ seenForCount.add(outgoing);
22012
+ incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
22013
+ }
22014
+ if (anchorText && !seenForAnchor.has(outgoing)) {
22015
+ seenForAnchor.add(outgoing);
22016
+ if (!incomingAnchorTexts.has(outgoing)) {
22017
+ incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
22018
+ }
22019
+ incomingAnchorTexts.get(outgoing).add(anchorText);
22020
+ }
20212
22021
  }
20213
22022
  }
20214
22023
  stageEnd("links", linkStart);
@@ -20227,6 +22036,15 @@ var IndexPipeline = class _IndexPipeline {
20227
22036
  });
20228
22037
  }
20229
22038
  }
22039
+ for (const record of customRecords) {
22040
+ const normalizedUrl = normalizeUrlPath(record.url);
22041
+ if (!precomputedRoutes.has(normalizedUrl)) {
22042
+ precomputedRoutes.set(normalizedUrl, {
22043
+ routeFile: "",
22044
+ routeResolution: "exact"
22045
+ });
22046
+ }
22047
+ }
20230
22048
  for (const page of indexablePages) {
20231
22049
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20232
22050
  if (routeMatch.routeResolution === "best-effort") {
@@ -20244,6 +22062,17 @@ var IndexPipeline = class _IndexPipeline {
20244
22062
  } else {
20245
22063
  routeExact += 1;
20246
22064
  }
22065
+ const anchorSet = incomingAnchorTexts.get(page.url);
22066
+ let incomingAnchorText;
22067
+ if (anchorSet && anchorSet.size > 0) {
22068
+ let joined = "";
22069
+ for (const phrase of anchorSet) {
22070
+ const next2 = joined ? `${joined} ${phrase}` : phrase;
22071
+ if (next2.length > 500) break;
22072
+ joined = next2;
22073
+ }
22074
+ incomingAnchorText = joined || void 0;
22075
+ }
20247
22076
  const indexedPage = {
20248
22077
  url: page.url,
20249
22078
  title: page.title,
@@ -20253,40 +22082,113 @@ var IndexPipeline = class _IndexPipeline {
20253
22082
  generatedAt: nowIso(),
20254
22083
  incomingLinks: incomingLinkCount.get(page.url) ?? 0,
20255
22084
  outgoingLinks: page.outgoingLinks.length,
22085
+ outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
20256
22086
  depth: getUrlDepth(page.url),
20257
22087
  tags: page.tags,
20258
22088
  markdown: page.markdown,
20259
22089
  description: page.description,
20260
- keywords: page.keywords
22090
+ keywords: page.keywords,
22091
+ publishedAt: page.publishedAt,
22092
+ incomingAnchorText,
22093
+ meta: page.meta
20261
22094
  };
20262
22095
  pages.push(indexedPage);
20263
22096
  this.logger.event("page_indexed", { url: page.url });
20264
22097
  }
22098
+ const pageRecords = pages.map((p) => {
22099
+ const summary = buildPageSummary(p);
22100
+ return {
22101
+ url: p.url,
22102
+ title: p.title,
22103
+ markdown: p.markdown,
22104
+ projectId: scope.projectId,
22105
+ scopeName: scope.scopeName,
22106
+ routeFile: p.routeFile,
22107
+ routeResolution: p.routeResolution,
22108
+ incomingLinks: p.incomingLinks,
22109
+ outgoingLinks: p.outgoingLinks,
22110
+ outgoingLinkUrls: p.outgoingLinkUrls,
22111
+ depth: p.depth,
22112
+ tags: p.tags,
22113
+ indexedAt: p.generatedAt,
22114
+ summary,
22115
+ description: p.description,
22116
+ keywords: p.keywords,
22117
+ contentHash: buildPageContentHash(p),
22118
+ publishedAt: p.publishedAt,
22119
+ meta: p.meta
22120
+ };
22121
+ });
22122
+ const currentPageUrls = new Set(pageRecords.map((r) => r.url));
22123
+ const changedPages = pageRecords.filter(
22124
+ (r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
22125
+ );
22126
+ const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
20265
22127
  if (!options.dryRun) {
20266
- const pageRecords = pages.map((p) => {
20267
- const summary = buildPageSummary(p);
20268
- return {
20269
- url: p.url,
20270
- title: p.title,
20271
- markdown: p.markdown,
20272
- projectId: scope.projectId,
20273
- scopeName: scope.scopeName,
20274
- routeFile: p.routeFile,
20275
- routeResolution: p.routeResolution,
20276
- incomingLinks: p.incomingLinks,
20277
- outgoingLinks: p.outgoingLinks,
20278
- depth: p.depth,
20279
- tags: p.tags,
20280
- indexedAt: p.generatedAt,
20281
- summary,
20282
- description: p.description,
20283
- keywords: p.keywords
20284
- };
20285
- });
20286
- await this.store.deletePages(scope);
20287
- await this.store.upsertPages(pageRecords, scope);
22128
+ if (options.force) {
22129
+ await this.store.deletePages(scope);
22130
+ this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
22131
+ const pageDocs = pageRecords.map((r) => ({
22132
+ id: r.url,
22133
+ data: r.summary ?? r.title,
22134
+ metadata: {
22135
+ title: r.title,
22136
+ url: r.url,
22137
+ description: r.description ?? "",
22138
+ keywords: r.keywords ?? [],
22139
+ summary: r.summary ?? "",
22140
+ tags: r.tags,
22141
+ markdown: r.markdown,
22142
+ routeFile: r.routeFile,
22143
+ routeResolution: r.routeResolution,
22144
+ incomingLinks: r.incomingLinks,
22145
+ outgoingLinks: r.outgoingLinks,
22146
+ outgoingLinkUrls: r.outgoingLinkUrls ?? [],
22147
+ depth: r.depth,
22148
+ indexedAt: r.indexedAt,
22149
+ contentHash: r.contentHash ?? "",
22150
+ publishedAt: r.publishedAt ?? null,
22151
+ ...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
22152
+ }
22153
+ }));
22154
+ await this.store.upsertPages(pageDocs, scope);
22155
+ } else {
22156
+ if (changedPages.length > 0) {
22157
+ this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
22158
+ const pageDocs = changedPages.map((r) => ({
22159
+ id: r.url,
22160
+ data: r.summary ?? r.title,
22161
+ metadata: {
22162
+ title: r.title,
22163
+ url: r.url,
22164
+ description: r.description ?? "",
22165
+ keywords: r.keywords ?? [],
22166
+ summary: r.summary ?? "",
22167
+ tags: r.tags,
22168
+ markdown: r.markdown,
22169
+ routeFile: r.routeFile,
22170
+ routeResolution: r.routeResolution,
22171
+ incomingLinks: r.incomingLinks,
22172
+ outgoingLinks: r.outgoingLinks,
22173
+ outgoingLinkUrls: r.outgoingLinkUrls ?? [],
22174
+ depth: r.depth,
22175
+ indexedAt: r.indexedAt,
22176
+ contentHash: r.contentHash ?? "",
22177
+ publishedAt: r.publishedAt ?? null,
22178
+ ...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
22179
+ }
22180
+ }));
22181
+ await this.store.upsertPages(pageDocs, scope);
22182
+ }
22183
+ if (deletedPageUrls.length > 0) {
22184
+ await this.store.deletePagesByIds(deletedPageUrls, scope);
22185
+ }
22186
+ }
20288
22187
  }
22188
+ const pagesChanged = options.force ? pageRecords.length : changedPages.length;
22189
+ const pagesDeleted = deletedPageUrls.length;
20289
22190
  stageEnd("pages", pagesStart);
22191
+ this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
20290
22192
  this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
20291
22193
  const chunkStart = stageStart();
20292
22194
  this.logger.info("Chunking pages...");
@@ -20295,6 +22197,18 @@ var IndexPipeline = class _IndexPipeline {
20295
22197
  if (typeof maxChunks === "number") {
20296
22198
  chunks = chunks.slice(0, maxChunks);
20297
22199
  }
22200
+ if (this.hooks.transformChunk) {
22201
+ const transformed = [];
22202
+ for (const chunk of chunks) {
22203
+ const result = await this.hooks.transformChunk(chunk);
22204
+ if (result === null) {
22205
+ this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
22206
+ continue;
22207
+ }
22208
+ transformed.push(result);
22209
+ }
22210
+ chunks = transformed;
22211
+ }
20298
22212
  for (const chunk of chunks) {
20299
22213
  this.logger.event("chunked", {
20300
22214
  url: chunk.url,
@@ -20307,7 +22221,12 @@ var IndexPipeline = class _IndexPipeline {
20307
22221
  for (const chunk of chunks) {
20308
22222
  currentChunkMap.set(chunk.chunkKey, chunk);
20309
22223
  }
20310
- const changedChunks = chunks.filter((chunk) => {
22224
+ const chunkHashStart = stageStart();
22225
+ const currentChunkKeys = chunks.map((c) => c.chunkKey);
22226
+ const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.fetchContentHashesForKeys(currentChunkKeys, scope);
22227
+ stageEnd("chunk_hashes", chunkHashStart);
22228
+ this.logger.debug(`Fetched ${existingHashes.size} existing chunk hashes for ${currentChunkKeys.length} current keys`);
22229
+ let changedChunks = chunks.filter((chunk) => {
20311
22230
  if (options.force) {
20312
22231
  return true;
20313
22232
  }
@@ -20320,37 +22239,45 @@ var IndexPipeline = class _IndexPipeline {
20320
22239
  }
20321
22240
  return existingHash !== chunk.contentHash;
20322
22241
  });
20323
- const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
22242
+ const existingChunkIds = options.force ? /* @__PURE__ */ new Set() : await this.store.scanChunkIds(scope);
22243
+ const deletes = [...existingChunkIds].filter((chunkKey) => !currentChunkMap.has(chunkKey));
22244
+ if (this.hooks.beforeIndex) {
22245
+ changedChunks = await this.hooks.beforeIndex(changedChunks);
22246
+ }
20324
22247
  this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
20325
22248
  const upsertStart = stageStart();
20326
22249
  let documentsUpserted = 0;
20327
22250
  if (!options.dryRun && changedChunks.length > 0) {
20328
- this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
20329
- const UPSTASH_CONTENT_LIMIT = 4096;
22251
+ this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
20330
22252
  const docs = changedChunks.map((chunk) => {
20331
- const title = chunk.title;
20332
- const sectionTitle = chunk.sectionTitle ?? "";
20333
- const url = chunk.url;
20334
- const tags = chunk.tags.join(",");
20335
- const headingPath = chunk.headingPath.join(" > ");
20336
- const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
20337
- const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
20338
- const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
22253
+ const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
22254
+ if (embeddingText.length > 2e3) {
22255
+ this.logger.warn(
22256
+ `Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
22257
+ );
22258
+ }
20339
22259
  return {
20340
22260
  id: chunk.chunkKey,
20341
- content: { title, sectionTitle, text, url, tags, headingPath },
22261
+ data: embeddingText,
20342
22262
  metadata: {
20343
- projectId: scope.projectId,
20344
- scopeName: scope.scopeName,
22263
+ url: chunk.url,
20345
22264
  path: chunk.path,
22265
+ title: chunk.title,
22266
+ sectionTitle: chunk.sectionTitle ?? "",
22267
+ headingPath: chunk.headingPath.join(" > "),
20346
22268
  snippet: chunk.snippet,
22269
+ chunkText: embeddingText,
22270
+ tags: chunk.tags,
20347
22271
  ordinal: chunk.ordinal,
20348
22272
  contentHash: chunk.contentHash,
20349
22273
  depth: chunk.depth,
20350
22274
  incomingLinks: chunk.incomingLinks,
20351
22275
  routeFile: chunk.routeFile,
20352
22276
  description: chunk.description ?? "",
20353
- keywords: (chunk.keywords ?? []).join(",")
22277
+ keywords: chunk.keywords ?? [],
22278
+ publishedAt: chunk.publishedAt ?? null,
22279
+ incomingAnchorText: chunk.incomingAnchorText ?? "",
22280
+ ...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
20354
22281
  }
20355
22282
  };
20356
22283
  });
@@ -20368,9 +22295,16 @@ var IndexPipeline = class _IndexPipeline {
20368
22295
  } else {
20369
22296
  this.logger.info("No chunks to upsert \u2014 all up to date");
20370
22297
  }
22298
+ if (this.config.llmsTxt.enable && !options.dryRun) {
22299
+ const llmsStart = stageStart();
22300
+ await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
22301
+ stageEnd("llms_txt", llmsStart);
22302
+ }
20371
22303
  this.logger.info("Done.");
20372
- return {
22304
+ const stats = {
20373
22305
  pagesProcessed: pages.length,
22306
+ pagesChanged,
22307
+ pagesDeleted,
20374
22308
  chunksTotal: chunks.length,
20375
22309
  chunksChanged: changedChunks.length,
20376
22310
  documentsUpserted,
@@ -20379,6 +22313,10 @@ var IndexPipeline = class _IndexPipeline {
20379
22313
  routeBestEffort,
20380
22314
  stageTimingsMs
20381
22315
  };
22316
+ if (this.hooks.afterIndex) {
22317
+ await this.hooks.afterIndex(stats);
22318
+ }
22319
+ return stats;
20382
22320
  }
20383
22321
  };
20384
22322
 
@@ -20400,9 +22338,6 @@ function shouldRunAutoIndex(options) {
20400
22338
  if (explicit && /^(1|true|yes)$/i.test(explicit)) {
20401
22339
  return true;
20402
22340
  }
20403
- if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
20404
- return true;
20405
- }
20406
22341
  return false;
20407
22342
  }
20408
22343
  function searchsocketVitePlugin(options = {}) {
@@ -20427,7 +22362,8 @@ function searchsocketVitePlugin(options = {}) {
20427
22362
  const pipeline = await IndexPipeline.create({
20428
22363
  cwd,
20429
22364
  configPath: options.configPath,
20430
- logger: logger3
22365
+ logger: logger3,
22366
+ hooks: options.hooks
20431
22367
  });
20432
22368
  const stats = await pipeline.run({
20433
22369
  changedOnly: options.changedOnly ?? true,