searchsocket 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,33 @@
1
1
  'use strict';
2
2
 
3
- var fs = require('fs');
3
+ var crypto = require('crypto');
4
+ var fs9 = require('fs/promises');
4
5
  var path = require('path');
6
+ var webStandardStreamableHttp_js = require('@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js');
7
+ var fs = require('fs');
5
8
  var jiti = require('jiti');
6
9
  var zod = require('zod');
10
+ var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
11
+ require('@modelcontextprotocol/sdk/server/stdio.js');
12
+ require('@modelcontextprotocol/sdk/server/streamableHttp.js');
13
+ require('@modelcontextprotocol/sdk/server/express.js');
7
14
  var child_process = require('child_process');
8
- var crypto = require('crypto');
15
+ var vector = require('@upstash/vector');
9
16
  var cheerio = require('cheerio');
10
17
  var matter = require('gray-matter');
11
18
  var fg = require('fast-glob');
12
19
  var pLimit = require('p-limit');
13
- var fs3 = require('fs/promises');
14
20
  var net = require('net');
15
21
  var zlib = require('zlib');
16
22
 
17
23
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
18
24
 
19
- var fs__default = /*#__PURE__*/_interopDefault(fs);
25
+ var fs9__default = /*#__PURE__*/_interopDefault(fs9);
20
26
  var path__default = /*#__PURE__*/_interopDefault(path);
27
+ var fs__default = /*#__PURE__*/_interopDefault(fs);
21
28
  var matter__default = /*#__PURE__*/_interopDefault(matter);
22
29
  var fg__default = /*#__PURE__*/_interopDefault(fg);
23
30
  var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
24
- var fs3__default = /*#__PURE__*/_interopDefault(fs3);
25
31
  var net__default = /*#__PURE__*/_interopDefault(net);
26
32
 
27
33
  var __getOwnPropNames = Object.getOwnPropertyNames;
@@ -5021,32 +5027,32 @@ var require_URL = __commonJS({
5021
5027
  else
5022
5028
  return basepath.substring(0, lastslash + 1) + refpath;
5023
5029
  }
5024
- function remove_dot_segments(path13) {
5025
- if (!path13) return path13;
5030
+ function remove_dot_segments(path14) {
5031
+ if (!path14) return path14;
5026
5032
  var output = "";
5027
- while (path13.length > 0) {
5028
- if (path13 === "." || path13 === "..") {
5029
- path13 = "";
5033
+ while (path14.length > 0) {
5034
+ if (path14 === "." || path14 === "..") {
5035
+ path14 = "";
5030
5036
  break;
5031
5037
  }
5032
- var twochars = path13.substring(0, 2);
5033
- var threechars = path13.substring(0, 3);
5034
- var fourchars = path13.substring(0, 4);
5038
+ var twochars = path14.substring(0, 2);
5039
+ var threechars = path14.substring(0, 3);
5040
+ var fourchars = path14.substring(0, 4);
5035
5041
  if (threechars === "../") {
5036
- path13 = path13.substring(3);
5042
+ path14 = path14.substring(3);
5037
5043
  } else if (twochars === "./") {
5038
- path13 = path13.substring(2);
5044
+ path14 = path14.substring(2);
5039
5045
  } else if (threechars === "/./") {
5040
- path13 = "/" + path13.substring(3);
5041
- } else if (twochars === "/." && path13.length === 2) {
5042
- path13 = "/";
5043
- } else if (fourchars === "/../" || threechars === "/.." && path13.length === 3) {
5044
- path13 = "/" + path13.substring(4);
5046
+ path14 = "/" + path14.substring(3);
5047
+ } else if (twochars === "/." && path14.length === 2) {
5048
+ path14 = "/";
5049
+ } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5050
+ path14 = "/" + path14.substring(4);
5045
5051
  output = output.replace(/\/?[^\/]*$/, "");
5046
5052
  } else {
5047
- var segment = path13.match(/(\/?([^\/]*))/)[0];
5053
+ var segment = path14.match(/(\/?([^\/]*))/)[0];
5048
5054
  output += segment;
5049
- path13 = path13.substring(segment.length);
5055
+ path14 = path14.substring(segment.length);
5050
5056
  }
5051
5057
  }
5052
5058
  return output;
@@ -16642,6 +16648,7 @@ var searchSocketConfigSchema = zod.z.object({
16642
16648
  dropSelectors: zod.z.array(zod.z.string()).optional(),
16643
16649
  ignoreAttr: zod.z.string().optional(),
16644
16650
  noindexAttr: zod.z.string().optional(),
16651
+ imageDescAttr: zod.z.string().optional(),
16645
16652
  respectRobotsNoindex: zod.z.boolean().optional()
16646
16653
  }).optional(),
16647
16654
  transform: zod.z.object({
@@ -16657,35 +16664,48 @@ var searchSocketConfigSchema = zod.z.object({
16657
16664
  headingPathDepth: zod.z.number().int().positive().optional(),
16658
16665
  dontSplitInside: zod.z.array(zod.z.enum(["code", "table", "blockquote"])).optional(),
16659
16666
  prependTitle: zod.z.boolean().optional(),
16660
- pageSummaryChunk: zod.z.boolean().optional()
16667
+ pageSummaryChunk: zod.z.boolean().optional(),
16668
+ weightHeadings: zod.z.boolean().optional()
16661
16669
  }).optional(),
16662
16670
  upstash: zod.z.object({
16663
16671
  url: zod.z.string().url().optional(),
16664
16672
  token: zod.z.string().min(1).optional(),
16665
16673
  urlEnv: zod.z.string().min(1).optional(),
16666
- tokenEnv: zod.z.string().min(1).optional()
16674
+ tokenEnv: zod.z.string().min(1).optional(),
16675
+ namespaces: zod.z.object({
16676
+ pages: zod.z.string().min(1).optional(),
16677
+ chunks: zod.z.string().min(1).optional()
16678
+ }).optional()
16679
+ }).optional(),
16680
+ embedding: zod.z.object({
16681
+ model: zod.z.string().optional(),
16682
+ dimensions: zod.z.number().int().positive().optional(),
16683
+ taskType: zod.z.string().optional(),
16684
+ batchSize: zod.z.number().int().positive().optional()
16667
16685
  }).optional(),
16668
16686
  search: zod.z.object({
16669
- semanticWeight: zod.z.number().min(0).max(1).optional(),
16670
- inputEnrichment: zod.z.boolean().optional(),
16671
- reranking: zod.z.boolean().optional(),
16672
16687
  dualSearch: zod.z.boolean().optional(),
16673
16688
  pageSearchWeight: zod.z.number().min(0).max(1).optional()
16674
16689
  }).optional(),
16675
16690
  ranking: zod.z.object({
16676
16691
  enableIncomingLinkBoost: zod.z.boolean().optional(),
16677
16692
  enableDepthBoost: zod.z.boolean().optional(),
16693
+ enableFreshnessBoost: zod.z.boolean().optional(),
16694
+ freshnessDecayRate: zod.z.number().positive().optional(),
16695
+ enableAnchorTextBoost: zod.z.boolean().optional(),
16678
16696
  pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
16679
16697
  aggregationCap: zod.z.number().int().positive().optional(),
16680
16698
  aggregationDecay: zod.z.number().min(0).max(1).optional(),
16681
16699
  minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
16682
- minScore: zod.z.number().min(0).max(1).optional(),
16700
+ minScoreRatio: zod.z.number().min(0).max(1).optional(),
16683
16701
  scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
16684
16702
  weights: zod.z.object({
16685
16703
  incomingLinks: zod.z.number().optional(),
16686
16704
  depth: zod.z.number().optional(),
16687
16705
  aggregation: zod.z.number().optional(),
16688
- titleMatch: zod.z.number().optional()
16706
+ titleMatch: zod.z.number().optional(),
16707
+ freshness: zod.z.number().optional(),
16708
+ anchorText: zod.z.number().optional()
16689
16709
  }).optional()
16690
16710
  }).optional(),
16691
16711
  api: zod.z.object({
@@ -16700,12 +16720,28 @@ var searchSocketConfigSchema = zod.z.object({
16700
16720
  }).optional(),
16701
16721
  mcp: zod.z.object({
16702
16722
  enable: zod.z.boolean().optional(),
16723
+ access: zod.z.enum(["public", "private"]).optional(),
16703
16724
  transport: zod.z.enum(["stdio", "http"]).optional(),
16704
16725
  http: zod.z.object({
16705
16726
  port: zod.z.number().int().positive().optional(),
16706
- path: zod.z.string().optional()
16727
+ path: zod.z.string().optional(),
16728
+ apiKey: zod.z.string().min(1).optional(),
16729
+ apiKeyEnv: zod.z.string().min(1).optional()
16730
+ }).optional(),
16731
+ handle: zod.z.object({
16732
+ path: zod.z.string().optional(),
16733
+ apiKey: zod.z.string().min(1).optional(),
16734
+ enableJsonResponse: zod.z.boolean().optional()
16707
16735
  }).optional()
16708
16736
  }).optional(),
16737
+ llmsTxt: zod.z.object({
16738
+ enable: zod.z.boolean().optional(),
16739
+ outputPath: zod.z.string().optional(),
16740
+ title: zod.z.string().optional(),
16741
+ description: zod.z.string().optional(),
16742
+ generateFull: zod.z.boolean().optional(),
16743
+ serveMarkdownVariants: zod.z.boolean().optional()
16744
+ }).optional(),
16709
16745
  state: zod.z.object({
16710
16746
  dir: zod.z.string().optional()
16711
16747
  }).optional()
@@ -16744,6 +16780,7 @@ function createDefaultConfig(projectId) {
16744
16780
  dropSelectors: DEFAULT_DROP_SELECTORS,
16745
16781
  ignoreAttr: "data-search-ignore",
16746
16782
  noindexAttr: "data-search-noindex",
16783
+ imageDescAttr: "data-search-description",
16747
16784
  respectRobotsNoindex: true
16748
16785
  },
16749
16786
  transform: {
@@ -16753,39 +16790,52 @@ function createDefaultConfig(projectId) {
16753
16790
  },
16754
16791
  chunking: {
16755
16792
  strategy: "hybrid",
16756
- maxChars: 2200,
16793
+ maxChars: 1500,
16757
16794
  overlapChars: 200,
16758
16795
  minChars: 250,
16759
16796
  headingPathDepth: 3,
16760
16797
  dontSplitInside: ["code", "table", "blockquote"],
16761
16798
  prependTitle: true,
16762
- pageSummaryChunk: true
16799
+ pageSummaryChunk: true,
16800
+ weightHeadings: true
16763
16801
  },
16764
16802
  upstash: {
16765
- urlEnv: "UPSTASH_SEARCH_REST_URL",
16766
- tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
16803
+ urlEnv: "UPSTASH_VECTOR_REST_URL",
16804
+ tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
16805
+ namespaces: {
16806
+ pages: "pages",
16807
+ chunks: "chunks"
16808
+ }
16809
+ },
16810
+ embedding: {
16811
+ model: "bge-large-en-v1.5",
16812
+ dimensions: 1024,
16813
+ taskType: "RETRIEVAL_DOCUMENT",
16814
+ batchSize: 100
16767
16815
  },
16768
16816
  search: {
16769
- semanticWeight: 0.75,
16770
- inputEnrichment: true,
16771
- reranking: true,
16772
16817
  dualSearch: true,
16773
16818
  pageSearchWeight: 0.3
16774
16819
  },
16775
16820
  ranking: {
16776
16821
  enableIncomingLinkBoost: true,
16777
16822
  enableDepthBoost: true,
16823
+ enableFreshnessBoost: false,
16824
+ freshnessDecayRate: 1e-3,
16825
+ enableAnchorTextBoost: false,
16778
16826
  pageWeights: {},
16779
16827
  aggregationCap: 5,
16780
16828
  aggregationDecay: 0.5,
16781
16829
  minChunkScoreRatio: 0.5,
16782
- minScore: 0.3,
16830
+ minScoreRatio: 0.7,
16783
16831
  scoreGapThreshold: 0.4,
16784
16832
  weights: {
16785
16833
  incomingLinks: 0.05,
16786
16834
  depth: 0.03,
16787
16835
  aggregation: 0.1,
16788
- titleMatch: 0.15
16836
+ titleMatch: 0.15,
16837
+ freshness: 0.1,
16838
+ anchorText: 0.1
16789
16839
  }
16790
16840
  },
16791
16841
  api: {
@@ -16796,12 +16846,23 @@ function createDefaultConfig(projectId) {
16796
16846
  },
16797
16847
  mcp: {
16798
16848
  enable: process.env.NODE_ENV !== "production",
16849
+ access: "private",
16799
16850
  transport: "stdio",
16800
16851
  http: {
16801
16852
  port: 3338,
16802
16853
  path: "/mcp"
16854
+ },
16855
+ handle: {
16856
+ path: "/api/mcp",
16857
+ enableJsonResponse: true
16803
16858
  }
16804
16859
  },
16860
+ llmsTxt: {
16861
+ enable: false,
16862
+ outputPath: "static/llms.txt",
16863
+ generateFull: true,
16864
+ serveMarkdownVariants: false
16865
+ },
16805
16866
  state: {
16806
16867
  dir: ".searchsocket"
16807
16868
  }
@@ -16929,7 +16990,15 @@ ${issues}`
16929
16990
  },
16930
16991
  upstash: {
16931
16992
  ...defaults.upstash,
16932
- ...parsed.upstash
16993
+ ...parsed.upstash,
16994
+ namespaces: {
16995
+ ...defaults.upstash.namespaces,
16996
+ ...parsed.upstash?.namespaces
16997
+ }
16998
+ },
16999
+ embedding: {
17000
+ ...defaults.embedding,
17001
+ ...parsed.embedding
16933
17002
  },
16934
17003
  search: {
16935
17004
  ...defaults.search,
@@ -16966,8 +17035,16 @@ ${issues}`
16966
17035
  http: {
16967
17036
  ...defaults.mcp.http,
16968
17037
  ...parsed.mcp?.http
17038
+ },
17039
+ handle: {
17040
+ ...defaults.mcp.handle,
17041
+ ...parsed.mcp?.handle
16969
17042
  }
16970
17043
  },
17044
+ llmsTxt: {
17045
+ ...defaults.llmsTxt,
17046
+ ...parsed.llmsTxt
17047
+ },
16971
17048
  state: {
16972
17049
  ...defaults.state,
16973
17050
  ...parsed.state
@@ -16987,6 +17064,15 @@ ${issues}`
16987
17064
  maxDepth: 10
16988
17065
  };
16989
17066
  }
17067
+ if (merged.mcp.access === "public") {
17068
+ const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
17069
+ if (!resolvedKey) {
17070
+ throw new SearchSocketError(
17071
+ "CONFIG_MISSING",
17072
+ '`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
17073
+ );
17074
+ }
17075
+ }
16990
17076
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
16991
17077
  throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
16992
17078
  }
@@ -17035,13 +17121,84 @@ function normalizeMarkdown(input) {
17035
17121
  function sanitizeScopeName(scopeName) {
17036
17122
  return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
17037
17123
  }
17124
+ function markdownToPlain(markdown) {
17125
+ return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
17126
+ }
17038
17127
  function toSnippet(markdown, maxLen = 220) {
17039
- const plain = markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
17128
+ const plain = markdownToPlain(markdown);
17040
17129
  if (plain.length <= maxLen) {
17041
17130
  return plain;
17042
17131
  }
17043
17132
  return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
17044
17133
  }
17134
+ function queryAwareExcerpt(markdown, query, maxLen = 220) {
17135
+ const plain = markdownToPlain(markdown);
17136
+ if (plain.length <= maxLen) return plain;
17137
+ const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
17138
+ if (tokens.length === 0) return toSnippet(markdown, maxLen);
17139
+ const positions = [];
17140
+ for (let ti = 0; ti < tokens.length; ti++) {
17141
+ const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
17142
+ const re = new RegExp(escaped, "gi");
17143
+ let m;
17144
+ while ((m = re.exec(plain)) !== null) {
17145
+ positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
17146
+ }
17147
+ }
17148
+ if (positions.length === 0) return toSnippet(markdown, maxLen);
17149
+ positions.sort((a, b) => a.start - b.start);
17150
+ let bestUniqueCount = 0;
17151
+ let bestTotalCount = 0;
17152
+ let bestLeft = 0;
17153
+ let bestRight = 0;
17154
+ let left = 0;
17155
+ const tokenCounts = /* @__PURE__ */ new Map();
17156
+ for (let right = 0; right < positions.length; right++) {
17157
+ tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
17158
+ while (positions[right].end - positions[left].start > maxLen && left < right) {
17159
+ const leftToken = positions[left].tokenIdx;
17160
+ const cnt = tokenCounts.get(leftToken) - 1;
17161
+ if (cnt === 0) tokenCounts.delete(leftToken);
17162
+ else tokenCounts.set(leftToken, cnt);
17163
+ left++;
17164
+ }
17165
+ const uniqueCount = tokenCounts.size;
17166
+ const totalCount = right - left + 1;
17167
+ if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
17168
+ bestUniqueCount = uniqueCount;
17169
+ bestTotalCount = totalCount;
17170
+ bestLeft = left;
17171
+ bestRight = right;
17172
+ }
17173
+ }
17174
+ const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
17175
+ let start = Math.max(0, mid - Math.floor(maxLen / 2));
17176
+ let end = Math.min(plain.length, start + maxLen);
17177
+ start = Math.max(0, end - maxLen);
17178
+ if (start > 0) {
17179
+ const spaceIdx = plain.lastIndexOf(" ", start);
17180
+ if (spaceIdx > start - 30) {
17181
+ start = spaceIdx + 1;
17182
+ }
17183
+ }
17184
+ if (end < plain.length) {
17185
+ const spaceIdx = plain.indexOf(" ", end);
17186
+ if (spaceIdx !== -1 && spaceIdx < end + 30) {
17187
+ end = spaceIdx;
17188
+ }
17189
+ }
17190
+ let excerpt = plain.slice(start, end);
17191
+ if (excerpt.length > Math.ceil(maxLen * 1.2)) {
17192
+ excerpt = excerpt.slice(0, maxLen);
17193
+ const lastSpace = excerpt.lastIndexOf(" ");
17194
+ if (lastSpace > maxLen * 0.5) {
17195
+ excerpt = excerpt.slice(0, lastSpace);
17196
+ }
17197
+ }
17198
+ const prefix = start > 0 ? "\u2026" : "";
17199
+ const suffix = end < plain.length ? "\u2026" : "";
17200
+ return `${prefix}${excerpt}${suffix}`;
17201
+ }
17045
17202
  function extractFirstParagraph(markdown) {
17046
17203
  const lines = markdown.split("\n");
17047
17204
  let inFence = false;
@@ -17148,162 +17305,288 @@ function joinUrl(baseUrl, route) {
17148
17305
  const routePart = ensureLeadingSlash(route);
17149
17306
  return `${base}${routePart}`;
17150
17307
  }
17151
-
17152
- // src/vector/upstash.ts
17153
- function chunkIndexName(scope) {
17154
- return `${scope.projectId}--${scope.scopeName}`;
17155
- }
17156
- function pageIndexName(scope) {
17157
- return `${scope.projectId}--${scope.scopeName}--pages`;
17158
- }
17159
17308
  var UpstashSearchStore = class {
17160
- client;
17309
+ index;
17310
+ pagesNs;
17311
+ chunksNs;
17161
17312
  constructor(opts) {
17162
- this.client = opts.client;
17163
- }
17164
- chunkIndex(scope) {
17165
- return this.client.index(chunkIndexName(scope));
17166
- }
17167
- pageIndex(scope) {
17168
- return this.client.index(pageIndexName(scope));
17313
+ this.index = opts.index;
17314
+ this.pagesNs = opts.index.namespace(opts.pagesNamespace);
17315
+ this.chunksNs = opts.index.namespace(opts.chunksNamespace);
17169
17316
  }
17170
17317
  async upsertChunks(chunks, scope) {
17171
17318
  if (chunks.length === 0) return;
17172
- const index = this.chunkIndex(scope);
17173
17319
  const BATCH_SIZE = 100;
17174
17320
  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
17175
17321
  const batch = chunks.slice(i, i + BATCH_SIZE);
17176
- await index.upsert(batch);
17177
- }
17178
- }
17179
- async search(query, opts, scope) {
17180
- const index = this.chunkIndex(scope);
17181
- const results = await index.search({
17182
- query,
17183
- limit: opts.limit,
17184
- semanticWeight: opts.semanticWeight,
17185
- inputEnrichment: opts.inputEnrichment,
17186
- reranking: opts.reranking,
17187
- filter: opts.filter
17322
+ await this.chunksNs.upsert(
17323
+ batch.map((c) => ({
17324
+ id: c.id,
17325
+ data: c.data,
17326
+ metadata: {
17327
+ ...c.metadata,
17328
+ projectId: scope.projectId,
17329
+ scopeName: scope.scopeName,
17330
+ type: c.metadata.type || "chunk"
17331
+ }
17332
+ }))
17333
+ );
17334
+ }
17335
+ }
17336
+ async search(data, opts, scope) {
17337
+ const filterParts = [
17338
+ `projectId = '${scope.projectId}'`,
17339
+ `scopeName = '${scope.scopeName}'`
17340
+ ];
17341
+ if (opts.filter) {
17342
+ filterParts.push(opts.filter);
17343
+ }
17344
+ const results = await this.chunksNs.query({
17345
+ data,
17346
+ topK: opts.limit,
17347
+ includeMetadata: true,
17348
+ filter: filterParts.join(" AND "),
17349
+ queryMode: vector.QueryMode.HYBRID,
17350
+ fusionAlgorithm: vector.FusionAlgorithm.DBSF
17351
+ });
17352
+ return results.map((doc) => ({
17353
+ id: String(doc.id),
17354
+ score: doc.score,
17355
+ metadata: {
17356
+ projectId: doc.metadata?.projectId ?? "",
17357
+ scopeName: doc.metadata?.scopeName ?? "",
17358
+ url: doc.metadata?.url ?? "",
17359
+ path: doc.metadata?.path ?? "",
17360
+ title: doc.metadata?.title ?? "",
17361
+ sectionTitle: doc.metadata?.sectionTitle ?? "",
17362
+ headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
17363
+ snippet: doc.metadata?.snippet ?? "",
17364
+ chunkText: doc.metadata?.chunkText ?? "",
17365
+ ordinal: doc.metadata?.ordinal ?? 0,
17366
+ contentHash: doc.metadata?.contentHash ?? "",
17367
+ depth: doc.metadata?.depth ?? 0,
17368
+ incomingLinks: doc.metadata?.incomingLinks ?? 0,
17369
+ routeFile: doc.metadata?.routeFile ?? "",
17370
+ tags: doc.metadata?.tags ?? [],
17371
+ description: doc.metadata?.description || void 0,
17372
+ keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
17373
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
17374
+ incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
17375
+ }
17376
+ }));
17377
+ }
17378
+ async searchChunksByUrl(data, url, opts, scope) {
17379
+ const filterParts = [
17380
+ `projectId = '${scope.projectId}'`,
17381
+ `scopeName = '${scope.scopeName}'`,
17382
+ `url = '${url}'`
17383
+ ];
17384
+ if (opts.filter) {
17385
+ filterParts.push(opts.filter);
17386
+ }
17387
+ const results = await this.chunksNs.query({
17388
+ data,
17389
+ topK: opts.limit,
17390
+ includeMetadata: true,
17391
+ filter: filterParts.join(" AND "),
17392
+ queryMode: vector.QueryMode.HYBRID,
17393
+ fusionAlgorithm: vector.FusionAlgorithm.DBSF
17188
17394
  });
17189
17395
  return results.map((doc) => ({
17190
- id: doc.id,
17396
+ id: String(doc.id),
17191
17397
  score: doc.score,
17192
17398
  metadata: {
17193
17399
  projectId: doc.metadata?.projectId ?? "",
17194
17400
  scopeName: doc.metadata?.scopeName ?? "",
17195
- url: doc.content.url,
17401
+ url: doc.metadata?.url ?? "",
17196
17402
  path: doc.metadata?.path ?? "",
17197
- title: doc.content.title,
17198
- sectionTitle: doc.content.sectionTitle,
17199
- headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
17403
+ title: doc.metadata?.title ?? "",
17404
+ sectionTitle: doc.metadata?.sectionTitle ?? "",
17405
+ headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
17200
17406
  snippet: doc.metadata?.snippet ?? "",
17201
- chunkText: doc.content.text,
17407
+ chunkText: doc.metadata?.chunkText ?? "",
17202
17408
  ordinal: doc.metadata?.ordinal ?? 0,
17203
17409
  contentHash: doc.metadata?.contentHash ?? "",
17204
17410
  depth: doc.metadata?.depth ?? 0,
17205
17411
  incomingLinks: doc.metadata?.incomingLinks ?? 0,
17206
17412
  routeFile: doc.metadata?.routeFile ?? "",
17207
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17413
+ tags: doc.metadata?.tags ?? [],
17208
17414
  description: doc.metadata?.description || void 0,
17209
- keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
17415
+ keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
17416
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
17417
+ incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
17210
17418
  }
17211
17419
  }));
17212
17420
  }
17213
- async searchPages(query, opts, scope) {
17214
- const index = this.pageIndex(scope);
17421
+ async searchPagesByText(data, opts, scope) {
17422
+ return this.queryPages({ data }, opts, scope);
17423
+ }
17424
+ async searchPagesByVector(vector, opts, scope) {
17425
+ return this.queryPages({ vector }, opts, scope);
17426
+ }
17427
+ async queryPages(input, opts, scope) {
17428
+ const filterParts = [
17429
+ `projectId = '${scope.projectId}'`,
17430
+ `scopeName = '${scope.scopeName}'`
17431
+ ];
17432
+ if (opts.filter) {
17433
+ filterParts.push(opts.filter);
17434
+ }
17215
17435
  let results;
17216
17436
  try {
17217
- results = await index.search({
17218
- query,
17219
- limit: opts.limit,
17220
- semanticWeight: opts.semanticWeight,
17221
- inputEnrichment: opts.inputEnrichment,
17222
- reranking: true,
17223
- filter: opts.filter
17437
+ results = await this.pagesNs.query({
17438
+ ...input,
17439
+ topK: opts.limit,
17440
+ includeMetadata: true,
17441
+ filter: filterParts.join(" AND "),
17442
+ queryMode: vector.QueryMode.HYBRID,
17443
+ fusionAlgorithm: vector.FusionAlgorithm.DBSF
17224
17444
  });
17225
17445
  } catch {
17226
17446
  return [];
17227
17447
  }
17228
17448
  return results.map((doc) => ({
17229
- id: doc.id,
17449
+ id: String(doc.id),
17230
17450
  score: doc.score,
17231
- title: doc.content.title,
17232
- url: doc.content.url,
17233
- description: doc.content.description ?? "",
17234
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17451
+ title: doc.metadata?.title ?? "",
17452
+ url: doc.metadata?.url ?? "",
17453
+ description: doc.metadata?.description ?? "",
17454
+ tags: doc.metadata?.tags ?? [],
17235
17455
  depth: doc.metadata?.depth ?? 0,
17236
17456
  incomingLinks: doc.metadata?.incomingLinks ?? 0,
17237
- routeFile: doc.metadata?.routeFile ?? ""
17457
+ routeFile: doc.metadata?.routeFile ?? "",
17458
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
17238
17459
  }));
17239
17460
  }
17240
- async deleteByIds(ids, scope) {
17461
+ async deleteByIds(ids, _scope) {
17241
17462
  if (ids.length === 0) return;
17242
- const index = this.chunkIndex(scope);
17243
- const BATCH_SIZE = 500;
17463
+ const BATCH_SIZE = 100;
17244
17464
  for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17245
17465
  const batch = ids.slice(i, i + BATCH_SIZE);
17246
- await index.delete(batch);
17466
+ await this.chunksNs.delete(batch);
17247
17467
  }
17248
17468
  }
17249
17469
  async deleteScope(scope) {
17250
- try {
17251
- const chunkIdx = this.chunkIndex(scope);
17252
- await chunkIdx.deleteIndex();
17253
- } catch {
17254
- }
17255
- try {
17256
- const pageIdx = this.pageIndex(scope);
17257
- await pageIdx.deleteIndex();
17258
- } catch {
17470
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17471
+ const ids = [];
17472
+ let cursor = "0";
17473
+ try {
17474
+ for (; ; ) {
17475
+ const result = await ns.range({
17476
+ cursor,
17477
+ limit: 100,
17478
+ includeMetadata: true
17479
+ });
17480
+ for (const doc of result.vectors) {
17481
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17482
+ ids.push(String(doc.id));
17483
+ }
17484
+ }
17485
+ if (!result.nextCursor || result.nextCursor === "0") break;
17486
+ cursor = result.nextCursor;
17487
+ }
17488
+ } catch {
17489
+ }
17490
+ if (ids.length > 0) {
17491
+ const BATCH_SIZE = 100;
17492
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17493
+ const batch = ids.slice(i, i + BATCH_SIZE);
17494
+ await ns.delete(batch);
17495
+ }
17496
+ }
17259
17497
  }
17260
17498
  }
17261
17499
  async listScopes(projectId) {
17262
- const allIndexes = await this.client.listIndexes();
17263
- const prefix = `${projectId}--`;
17264
- const scopeNames = /* @__PURE__ */ new Set();
17265
- for (const name of allIndexes) {
17266
- if (name.startsWith(prefix) && !name.endsWith("--pages")) {
17267
- const scopeName = name.slice(prefix.length);
17268
- scopeNames.add(scopeName);
17269
- }
17270
- }
17271
- const scopes = [];
17272
- for (const scopeName of scopeNames) {
17273
- const scope = {
17274
- projectId,
17275
- scopeName,
17276
- scopeId: `${projectId}:${scopeName}`
17277
- };
17500
+ const scopeMap = /* @__PURE__ */ new Map();
17501
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17502
+ let cursor = "0";
17278
17503
  try {
17279
- const info = await this.chunkIndex(scope).info();
17280
- scopes.push({
17281
- projectId,
17282
- scopeName,
17283
- lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
17284
- documentCount: info.documentCount
17285
- });
17504
+ for (; ; ) {
17505
+ const result = await ns.range({
17506
+ cursor,
17507
+ limit: 100,
17508
+ includeMetadata: true
17509
+ });
17510
+ for (const doc of result.vectors) {
17511
+ if (doc.metadata?.projectId === projectId) {
17512
+ const scopeName = doc.metadata.scopeName ?? "";
17513
+ scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
17514
+ }
17515
+ }
17516
+ if (!result.nextCursor || result.nextCursor === "0") break;
17517
+ cursor = result.nextCursor;
17518
+ }
17286
17519
  } catch {
17287
- scopes.push({
17288
- projectId,
17289
- scopeName,
17290
- lastIndexedAt: "unknown",
17291
- documentCount: 0
17292
- });
17293
17520
  }
17294
17521
  }
17295
- return scopes;
17522
+ return [...scopeMap.entries()].map(([scopeName, count]) => ({
17523
+ projectId,
17524
+ scopeName,
17525
+ lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
17526
+ documentCount: count
17527
+ }));
17296
17528
  }
17297
17529
  async getContentHashes(scope) {
17298
17530
  const map = /* @__PURE__ */ new Map();
17299
- const index = this.chunkIndex(scope);
17300
17531
  let cursor = "0";
17301
17532
  try {
17302
17533
  for (; ; ) {
17303
- const result = await index.range({ cursor, limit: 100 });
17304
- for (const doc of result.documents) {
17305
- if (doc.metadata?.contentHash) {
17306
- map.set(doc.id, doc.metadata.contentHash);
17534
+ const result = await this.chunksNs.range({
17535
+ cursor,
17536
+ limit: 100,
17537
+ includeMetadata: true
17538
+ });
17539
+ for (const doc of result.vectors) {
17540
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17541
+ map.set(String(doc.id), doc.metadata.contentHash);
17542
+ }
17543
+ }
17544
+ if (!result.nextCursor || result.nextCursor === "0") break;
17545
+ cursor = result.nextCursor;
17546
+ }
17547
+ } catch {
17548
+ }
17549
+ return map;
17550
+ }
17551
+ async listPages(scope, opts) {
17552
+ const cursor = opts?.cursor ?? "0";
17553
+ const limit = opts?.limit ?? 50;
17554
+ try {
17555
+ const result = await this.pagesNs.range({
17556
+ cursor,
17557
+ limit,
17558
+ includeMetadata: true
17559
+ });
17560
+ const pages = result.vectors.filter(
17561
+ (doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
17562
+ ).map((doc) => ({
17563
+ url: doc.metadata?.url ?? "",
17564
+ title: doc.metadata?.title ?? "",
17565
+ description: doc.metadata?.description ?? "",
17566
+ routeFile: doc.metadata?.routeFile ?? ""
17567
+ }));
17568
+ const response = { pages };
17569
+ if (result.nextCursor && result.nextCursor !== "0") {
17570
+ response.nextCursor = result.nextCursor;
17571
+ }
17572
+ return response;
17573
+ } catch {
17574
+ return { pages: [] };
17575
+ }
17576
+ }
17577
+ async getPageHashes(scope) {
17578
+ const map = /* @__PURE__ */ new Map();
17579
+ let cursor = "0";
17580
+ try {
17581
+ for (; ; ) {
17582
+ const result = await this.pagesNs.range({
17583
+ cursor,
17584
+ limit: 100,
17585
+ includeMetadata: true
17586
+ });
17587
+ for (const doc of result.vectors) {
17588
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17589
+ map.set(String(doc.id), doc.metadata.contentHash);
17307
17590
  }
17308
17591
  }
17309
17592
  if (!result.nextCursor || result.nextCursor === "0") break;
@@ -17313,47 +17596,43 @@ var UpstashSearchStore = class {
17313
17596
  }
17314
17597
  return map;
17315
17598
  }
17599
+ async deletePagesByIds(ids, _scope) {
17600
+ if (ids.length === 0) return;
17601
+ const BATCH_SIZE = 50;
17602
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17603
+ const batch = ids.slice(i, i + BATCH_SIZE);
17604
+ await this.pagesNs.delete(batch);
17605
+ }
17606
+ }
17316
17607
  async upsertPages(pages, scope) {
17317
17608
  if (pages.length === 0) return;
17318
- const index = this.pageIndex(scope);
17319
17609
  const BATCH_SIZE = 50;
17320
17610
  for (let i = 0; i < pages.length; i += BATCH_SIZE) {
17321
17611
  const batch = pages.slice(i, i + BATCH_SIZE);
17322
- const docs = batch.map((p) => ({
17323
- id: p.url,
17324
- content: {
17325
- title: p.title,
17326
- url: p.url,
17327
- type: "page",
17328
- description: p.description ?? "",
17329
- keywords: (p.keywords ?? []).join(","),
17330
- summary: p.summary ?? "",
17331
- tags: p.tags.join(",")
17332
- },
17333
- metadata: {
17334
- markdown: p.markdown,
17335
- projectId: p.projectId,
17336
- scopeName: p.scopeName,
17337
- routeFile: p.routeFile,
17338
- routeResolution: p.routeResolution,
17339
- incomingLinks: p.incomingLinks,
17340
- outgoingLinks: p.outgoingLinks,
17341
- depth: p.depth,
17342
- indexedAt: p.indexedAt
17343
- }
17344
- }));
17345
- await index.upsert(docs);
17612
+ await this.pagesNs.upsert(
17613
+ batch.map((p) => ({
17614
+ id: p.id,
17615
+ data: p.data,
17616
+ metadata: {
17617
+ ...p.metadata,
17618
+ projectId: scope.projectId,
17619
+ scopeName: scope.scopeName,
17620
+ type: "page"
17621
+ }
17622
+ }))
17623
+ );
17346
17624
  }
17347
17625
  }
17348
17626
  async getPage(url, scope) {
17349
- const index = this.pageIndex(scope);
17350
17627
  try {
17351
- const results = await index.fetch([url]);
17628
+ const results = await this.pagesNs.fetch([url], {
17629
+ includeMetadata: true
17630
+ });
17352
17631
  const doc = results[0];
17353
- if (!doc) return null;
17632
+ if (!doc || !doc.metadata) return null;
17354
17633
  return {
17355
- url: doc.content.url,
17356
- title: doc.content.title,
17634
+ url: doc.metadata.url,
17635
+ title: doc.metadata.title,
17357
17636
  markdown: doc.metadata.markdown,
17358
17637
  projectId: doc.metadata.projectId,
17359
17638
  scopeName: doc.metadata.scopeName,
@@ -17361,27 +17640,86 @@ var UpstashSearchStore = class {
17361
17640
  routeResolution: doc.metadata.routeResolution,
17362
17641
  incomingLinks: doc.metadata.incomingLinks,
17363
17642
  outgoingLinks: doc.metadata.outgoingLinks,
17643
+ outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
17364
17644
  depth: doc.metadata.depth,
17365
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17645
+ tags: doc.metadata.tags ?? [],
17366
17646
  indexedAt: doc.metadata.indexedAt,
17367
- summary: doc.content.summary || void 0,
17368
- description: doc.content.description || void 0,
17369
- keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
17647
+ summary: doc.metadata.summary || void 0,
17648
+ description: doc.metadata.description || void 0,
17649
+ keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
17650
+ publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
17370
17651
  };
17371
17652
  } catch {
17372
17653
  return null;
17373
17654
  }
17374
17655
  }
17656
+ async fetchPageWithVector(url, scope) {
17657
+ try {
17658
+ const results = await this.pagesNs.fetch([url], {
17659
+ includeMetadata: true,
17660
+ includeVectors: true
17661
+ });
17662
+ const doc = results[0];
17663
+ if (!doc || !doc.metadata || !doc.vector) return null;
17664
+ if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
17665
+ return null;
17666
+ }
17667
+ return { metadata: doc.metadata, vector: doc.vector };
17668
+ } catch {
17669
+ return null;
17670
+ }
17671
+ }
17672
+ async fetchPagesBatch(urls, scope) {
17673
+ if (urls.length === 0) return [];
17674
+ try {
17675
+ const results = await this.pagesNs.fetch(urls, {
17676
+ includeMetadata: true
17677
+ });
17678
+ const out = [];
17679
+ for (const doc of results) {
17680
+ if (!doc || !doc.metadata) continue;
17681
+ if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
17682
+ continue;
17683
+ }
17684
+ out.push({
17685
+ url: doc.metadata.url,
17686
+ title: doc.metadata.title,
17687
+ routeFile: doc.metadata.routeFile,
17688
+ outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
17689
+ });
17690
+ }
17691
+ return out;
17692
+ } catch {
17693
+ return [];
17694
+ }
17695
+ }
17375
17696
  async deletePages(scope) {
17697
+ const ids = [];
17698
+ let cursor = "0";
17376
17699
  try {
17377
- const index = this.pageIndex(scope);
17378
- await index.reset();
17700
+ for (; ; ) {
17701
+ const result = await this.pagesNs.range({
17702
+ cursor,
17703
+ limit: 100,
17704
+ includeMetadata: true
17705
+ });
17706
+ for (const doc of result.vectors) {
17707
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17708
+ ids.push(String(doc.id));
17709
+ }
17710
+ }
17711
+ if (!result.nextCursor || result.nextCursor === "0") break;
17712
+ cursor = result.nextCursor;
17713
+ }
17379
17714
  } catch {
17380
17715
  }
17716
+ if (ids.length > 0) {
17717
+ await this.deletePagesByIds(ids, scope);
17718
+ }
17381
17719
  }
17382
17720
  async health() {
17383
17721
  try {
17384
- await this.client.info();
17722
+ await this.index.info();
17385
17723
  return { ok: true };
17386
17724
  } catch (error) {
17387
17725
  return {
@@ -17391,14 +17729,31 @@ var UpstashSearchStore = class {
17391
17729
  }
17392
17730
  }
17393
17731
  async dropAllIndexes(projectId) {
17394
- const allIndexes = await this.client.listIndexes();
17395
- const prefix = `${projectId}--`;
17396
- for (const name of allIndexes) {
17397
- if (name.startsWith(prefix)) {
17398
- try {
17399
- const index = this.client.index(name);
17400
- await index.deleteIndex();
17401
- } catch {
17732
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17733
+ const ids = [];
17734
+ let cursor = "0";
17735
+ try {
17736
+ for (; ; ) {
17737
+ const result = await ns.range({
17738
+ cursor,
17739
+ limit: 100,
17740
+ includeMetadata: true
17741
+ });
17742
+ for (const doc of result.vectors) {
17743
+ if (doc.metadata?.projectId === projectId) {
17744
+ ids.push(String(doc.id));
17745
+ }
17746
+ }
17747
+ if (!result.nextCursor || result.nextCursor === "0") break;
17748
+ cursor = result.nextCursor;
17749
+ }
17750
+ } catch {
17751
+ }
17752
+ if (ids.length > 0) {
17753
+ const BATCH_SIZE = 100;
17754
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17755
+ const batch = ids.slice(i, i + BATCH_SIZE);
17756
+ await ns.delete(batch);
17402
17757
  }
17403
17758
  }
17404
17759
  }
@@ -17412,12 +17767,16 @@ async function createUpstashStore(config) {
17412
17767
  if (!url || !token) {
17413
17768
  throw new SearchSocketError(
17414
17769
  "VECTOR_BACKEND_UNAVAILABLE",
17415
- `Missing Upstash Search credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
17770
+ `Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
17416
17771
  );
17417
17772
  }
17418
- const { Search } = await import('@upstash/search');
17419
- const client = new Search({ url, token });
17420
- return new UpstashSearchStore({ client });
17773
+ const { Index } = await import('@upstash/vector');
17774
+ const index = new Index({ url, token });
17775
+ return new UpstashSearchStore({
17776
+ index,
17777
+ pagesNamespace: config.upstash.namespaces.pages,
17778
+ chunksNamespace: config.upstash.namespaces.chunks
17779
+ });
17421
17780
  }
17422
17781
 
17423
17782
  // src/utils/pattern.ts
@@ -17460,29 +17819,65 @@ function nonNegativeOrZero(value) {
17460
17819
  function normalizeForTitleMatch(text) {
17461
17820
  return text.toLowerCase().replace(/[^a-z0-9\s]/g, "").replace(/\s+/g, " ").trim();
17462
17821
  }
17463
- function rankHits(hits, config, query) {
17822
+ function rankHits(hits, config, query, debug) {
17464
17823
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
17465
17824
  const titleMatchWeight = config.ranking.weights.titleMatch;
17466
17825
  return hits.map((hit) => {
17467
- let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17826
+ const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17827
+ let score = baseScore;
17828
+ let incomingLinkBoostValue = 0;
17468
17829
  if (config.ranking.enableIncomingLinkBoost) {
17469
17830
  const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
17470
- score += incomingBoost * config.ranking.weights.incomingLinks;
17831
+ incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
17832
+ score += incomingLinkBoostValue;
17471
17833
  }
17834
+ let depthBoostValue = 0;
17472
17835
  if (config.ranking.enableDepthBoost) {
17473
17836
  const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
17474
- score += depthBoost * config.ranking.weights.depth;
17837
+ depthBoostValue = depthBoost * config.ranking.weights.depth;
17838
+ score += depthBoostValue;
17475
17839
  }
17840
+ let titleMatchBoostValue = 0;
17476
17841
  if (normalizedQuery && titleMatchWeight > 0) {
17477
17842
  const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
17478
17843
  if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
17479
- score += titleMatchWeight;
17844
+ titleMatchBoostValue = titleMatchWeight;
17845
+ score += titleMatchBoostValue;
17480
17846
  }
17481
17847
  }
17482
- return {
17848
+ let freshnessBoostValue = 0;
17849
+ if (config.ranking.enableFreshnessBoost) {
17850
+ const publishedAt = hit.metadata.publishedAt;
17851
+ if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
17852
+ const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
17853
+ const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
17854
+ freshnessBoostValue = decay * config.ranking.weights.freshness;
17855
+ score += freshnessBoostValue;
17856
+ }
17857
+ }
17858
+ let anchorTextMatchBoostValue = 0;
17859
+ if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
17860
+ const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
17861
+ if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
17862
+ anchorTextMatchBoostValue = config.ranking.weights.anchorText;
17863
+ score += anchorTextMatchBoostValue;
17864
+ }
17865
+ }
17866
+ const result = {
17483
17867
  hit,
17484
17868
  finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
17485
17869
  };
17870
+ if (debug) {
17871
+ result.breakdown = {
17872
+ baseScore,
17873
+ incomingLinkBoost: incomingLinkBoostValue,
17874
+ depthBoost: depthBoostValue,
17875
+ titleMatchBoost: titleMatchBoostValue,
17876
+ freshnessBoost: freshnessBoostValue,
17877
+ anchorTextMatchBoost: anchorTextMatchBoostValue
17878
+ };
17879
+ }
17880
+ return result;
17486
17881
  }).sort((a, b) => {
17487
17882
  const delta = b.finalScore - a.finalScore;
17488
17883
  return Number.isNaN(delta) ? 0 : delta;
@@ -17491,12 +17886,13 @@ function rankHits(hits, config, query) {
17491
17886
  function trimByScoreGap(results, config) {
17492
17887
  if (results.length === 0) return results;
17493
17888
  const threshold = config.ranking.scoreGapThreshold;
17494
- const minScore = config.ranking.minScore;
17495
- if (minScore > 0 && results.length > 0) {
17496
- const sortedScores = results.map((r) => r.pageScore).sort((a, b) => a - b);
17497
- const mid = Math.floor(sortedScores.length / 2);
17498
- const median = sortedScores.length % 2 === 0 ? (sortedScores[mid - 1] + sortedScores[mid]) / 2 : sortedScores[mid];
17499
- if (median < minScore) return [];
17889
+ const minScoreRatio = config.ranking.minScoreRatio;
17890
+ if (minScoreRatio > 0 && results.length > 0) {
17891
+ const topScore = results[0].pageScore;
17892
+ if (Number.isFinite(topScore) && topScore > 0) {
17893
+ const minThreshold = topScore * minScoreRatio;
17894
+ results = results.filter((r) => r.pageScore >= minThreshold);
17895
+ }
17500
17896
  }
17501
17897
  if (threshold > 0 && results.length > 1) {
17502
17898
  for (let i = 1; i < results.length; i++) {
@@ -17566,75 +17962,276 @@ function aggregateByPage(ranked, config) {
17566
17962
  return Number.isNaN(delta) ? 0 : delta;
17567
17963
  });
17568
17964
  }
17569
- function mergePageAndChunkResults(pageHits, rankedChunks, config) {
17570
- if (pageHits.length === 0) return rankedChunks;
17571
- const w = config.search.pageSearchWeight;
17572
- const pageScoreMap = /* @__PURE__ */ new Map();
17573
- for (const ph of pageHits) {
17574
- pageScoreMap.set(ph.url, ph);
17575
- }
17576
- const pagesWithChunks = /* @__PURE__ */ new Set();
17577
- const merged = rankedChunks.map((ranked) => {
17578
- const url = ranked.hit.metadata.url;
17579
- const pageHit = pageScoreMap.get(url);
17580
- if (pageHit) {
17581
- pagesWithChunks.add(url);
17582
- const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
17583
- return {
17584
- hit: ranked.hit,
17585
- finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
17586
- };
17965
+ function rankPageHits(pageHits, config, query, debug) {
17966
+ const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
17967
+ const titleMatchWeight = config.ranking.weights.titleMatch;
17968
+ return pageHits.map((hit) => {
17969
+ const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17970
+ let score = baseScore;
17971
+ let incomingLinkBoostValue = 0;
17972
+ if (config.ranking.enableIncomingLinkBoost) {
17973
+ const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
17974
+ incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
17975
+ score += incomingLinkBoostValue;
17587
17976
  }
17588
- return ranked;
17589
- });
17590
- for (const [url, pageHit] of pageScoreMap) {
17591
- if (pagesWithChunks.has(url)) continue;
17592
- const syntheticScore = pageHit.score * w;
17593
- const syntheticHit = {
17594
- id: `page:${url}`,
17595
- score: pageHit.score,
17596
- metadata: {
17597
- projectId: "",
17598
- scopeName: "",
17599
- url: pageHit.url,
17600
- path: pageHit.url,
17601
- title: pageHit.title,
17602
- sectionTitle: "",
17603
- headingPath: [],
17604
- snippet: pageHit.description || pageHit.title,
17605
- chunkText: pageHit.description || pageHit.title,
17606
- ordinal: 0,
17607
- contentHash: "",
17608
- depth: pageHit.depth,
17609
- incomingLinks: pageHit.incomingLinks,
17610
- routeFile: pageHit.routeFile,
17611
- tags: pageHit.tags
17977
+ let depthBoostValue = 0;
17978
+ if (config.ranking.enableDepthBoost) {
17979
+ const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
17980
+ depthBoostValue = depthBoost * config.ranking.weights.depth;
17981
+ score += depthBoostValue;
17982
+ }
17983
+ let titleMatchBoostValue = 0;
17984
+ if (normalizedQuery && titleMatchWeight > 0) {
17985
+ const normalizedTitle = normalizeForTitleMatch(hit.title);
17986
+ if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
17987
+ titleMatchBoostValue = titleMatchWeight;
17988
+ score += titleMatchBoostValue;
17989
+ }
17990
+ }
17991
+ let freshnessBoostValue = 0;
17992
+ if (config.ranking.enableFreshnessBoost) {
17993
+ const publishedAt = hit.publishedAt;
17994
+ if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
17995
+ const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
17996
+ const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
17997
+ freshnessBoostValue = decay * config.ranking.weights.freshness;
17998
+ score += freshnessBoostValue;
17612
17999
  }
18000
+ }
18001
+ const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
18002
+ if (pageWeight !== 1) {
18003
+ score *= pageWeight;
18004
+ }
18005
+ const result = {
18006
+ url: hit.url,
18007
+ title: hit.title,
18008
+ description: hit.description,
18009
+ routeFile: hit.routeFile,
18010
+ depth: hit.depth,
18011
+ incomingLinks: hit.incomingLinks,
18012
+ tags: hit.tags,
18013
+ baseScore,
18014
+ finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
18015
+ publishedAt: hit.publishedAt
17613
18016
  };
17614
- merged.push({
17615
- hit: syntheticHit,
17616
- finalScore: Number.isFinite(syntheticScore) ? syntheticScore : 0
17617
- });
17618
- }
17619
- return merged.sort((a, b) => {
18017
+ if (debug) {
18018
+ result.breakdown = {
18019
+ baseScore,
18020
+ pageWeight,
18021
+ incomingLinkBoost: incomingLinkBoostValue,
18022
+ depthBoost: depthBoostValue,
18023
+ titleMatchBoost: titleMatchBoostValue,
18024
+ freshnessBoost: freshnessBoostValue
18025
+ };
18026
+ }
18027
+ return result;
18028
+ }).filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0).sort((a, b) => {
17620
18029
  const delta = b.finalScore - a.finalScore;
17621
18030
  return Number.isNaN(delta) ? 0 : delta;
17622
18031
  });
17623
18032
  }
18033
/**
 * Trims a ranked page list in two steps.
 *
 * 1. When `minScoreRatio > 0` and the first entry has a finite positive
 *    score, entries scoring below `first * minScoreRatio` are dropped.
 * 2. When `scoreGapThreshold > 0`, the list is cut before the first entry
 *    whose relative drop from its predecessor reaches the threshold.
 *
 * @param results - Entries with a numeric `finalScore`; the first entry is used as the top score.
 * @param config - Reads `config.ranking.scoreGapThreshold` and `config.ranking.minScoreRatio`.
 * @returns The input array itself when nothing is removed, otherwise a new array.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const { scoreGapThreshold: gapLimit, minScoreRatio: ratio } = config.ranking;
  let kept = results;
  if (ratio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * ratio;
      kept = kept.filter((page) => page.finalScore >= floor);
    }
  }
  if (!(gapLimit > 0) || kept.length <= 1) return kept;
  const cutAt = kept.findIndex((page, idx) => {
    if (idx === 0) return false;
    const previous = kept[idx - 1].finalScore;
    // Relative gaps are only measured against a positive predecessor.
    return previous > 0 && (previous - page.finalScore) / previous >= gapLimit;
  });
  return cutAt === -1 ? kept : kept.slice(0, cutAt);
}
18058
+
18059
+ // src/search/related-pages.ts
18060
/**
 * Path similarity of two URLs based on their shared leading segments.
 *
 * Both URLs are split on "/" with empty segments ignored. The score is
 * `2 * sharedPrefixLength / (segmentsA + segmentsB)`; comparison stops at
 * the first differing segment, so only a common prefix counts.
 *
 * @returns 1 when both URLs have no segments, 0 when exactly one has none,
 *   otherwise a value between 0 and 1.
 */
function diceScore(urlA, urlB) {
  const partsA = urlA.split("/").filter(Boolean);
  const partsB = urlB.split("/").filter(Boolean);
  const totalParts = partsA.length + partsB.length;
  if (totalParts === 0) return 1;
  if (partsA.length === 0 || partsB.length === 0) return 0;
  let prefixLength = 0;
  while (prefixLength < partsA.length && prefixLength < partsB.length && partsA[prefixLength] === partsB[prefixLength]) {
    prefixLength += 1;
  }
  return 2 * prefixLength / totalParts;
}
18076
/**
 * Combines the three relatedness signals into one score: a flat 0.5 when the
 * pages are linked, plus 0.3 × the path score and 0.2 × the semantic score.
 */
function compositeScore(isLinked, dice, semantic) {
  const linkPart = isLinked ? 0.5 : 0;
  const pathPart = 0.3 * dice;
  const semanticPart = 0.2 * semantic;
  return linkPart + pathPart + semanticPart;
}
18079
/**
 * Picks the single label that best describes how two pages relate.
 * Precedence: outgoing link, then incoming link, then "sibling" when the
 * path score is above 0.4, otherwise "semantic".
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  let relationship = "semantic";
  if (isOutgoing) {
    relationship = "outgoing_link";
  } else if (isIncoming) {
    relationship = "incoming_link";
  } else if (dice > 0.4) {
    relationship = "sibling";
  }
  return relationship;
}
18085
+
18086
+ // src/utils/structured-meta.ts
18087
/** Keys must look like identifiers: a letter or underscore, then letters, digits or underscores. */
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;

/**
 * Reports whether `key` is safe to interpolate into a filter expression.
 *
 * @param key - Candidate metadata key.
 * @returns `true` when the key matches VALID_KEY_RE.
 */
function validateMetaKey(key) {
  return VALID_KEY_RE.test(key);
}

/**
 * Converts the raw string content of a structured-meta entry to the value
 * type named by `dataType`.
 *
 * - "number" / "date": a finite number when the content parses as one,
 *   otherwise the content unchanged. Blank content is never coerced.
 *   (For "date" the content is presumably epoch milliseconds; only numeric
 *   strings are converted.)
 * - "boolean": `true` only for the exact string "true".
 * - "string[]": comma-separated entries, trimmed, with empty entries removed.
 * - anything else: the content unchanged.
 *
 * @param content - Raw string value.
 * @param dataType - One of the type names above.
 * @returns The converted value, or `content` itself when it cannot be converted.
 */
function parseMetaValue(content, dataType) {
  switch (dataType) {
    case "number":
    case "date": {
      // Number("") and Number("  ") are 0, which would turn a missing value
      // into a real zero; treat blank content as not numeric.
      if (typeof content === "string" && content.trim() === "") return content;
      const parsed = Number(content);
      return Number.isFinite(parsed) ? parsed : content;
    }
    case "boolean":
      return content === "true";
    case "string[]":
      // Drop empty entries produced by doubled or trailing commas.
      return content ? content.split(",").map((s) => s.trim()).filter(Boolean) : [];
    default:
      return content;
  }
}

/**
 * Doubles single quotes so the value can sit inside a single-quoted literal.
 * NOTE(review): confirm the filter grammar treats '' as an escaped quote.
 */
function escapeFilterValue(s) {
  return s.replace(/'/g, "''");
}

/**
 * Builds a filter expression over `meta.<key>` fields from a plain object.
 *
 * Strings become `meta.key CONTAINS '<escaped>'`; booleans and finite numbers
 * become `meta.key = <value>`. Entries with an invalid key, a non-finite
 * number, or any other value type are skipped so they can never inject raw
 * text into the expression.
 *
 * @param filters - Object mapping metadata keys to string, number or boolean values.
 * @returns Clauses joined with " AND ", or "" when no clause was produced.
 */
function buildMetaFilterString(filters) {
  const clauses = [];
  for (const [key, value] of Object.entries(filters)) {
    if (!validateMetaKey(key)) continue;
    const field = `meta.${key}`;
    if (typeof value === "string") {
      clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
    } else if (typeof value === "boolean" || typeof value === "number" && Number.isFinite(value)) {
      clauses.push(`${field} = ${value}`);
    }
  }
  return clauses.join(" AND ");
}
17624
18127
 
17625
18128
  // src/search/engine.ts
18129
/**
 * Schema for optional per-request ranking overrides.
 *
 * Every field is optional. Object schemas drop keys they do not declare, so a
 * ranking setting can only be overridden if it is listed here. The freshness
 * and anchor-text settings are listed because the ranking functions in this
 * file read `enableFreshnessBoost`, `freshnessDecayRate`, `weights.freshness`,
 * `enableAnchorTextBoost` and `weights.anchorText` from the ranking config.
 */
var rankingOverridesSchema = zod.z.object({
  ranking: zod.z.object({
    enableIncomingLinkBoost: zod.z.boolean().optional(),
    enableDepthBoost: zod.z.boolean().optional(),
    enableFreshnessBoost: zod.z.boolean().optional(),
    enableAnchorTextBoost: zod.z.boolean().optional(),
    aggregationCap: zod.z.number().int().positive().optional(),
    aggregationDecay: zod.z.number().min(0).max(1).optional(),
    minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
    minScoreRatio: zod.z.number().min(0).max(1).optional(),
    scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
    // Used as a multiplier in 1 / (1 + days * rate); a negative rate could
    // make that denominator zero or negative.
    freshnessDecayRate: zod.z.number().min(0).optional(),
    weights: zod.z.object({
      incomingLinks: zod.z.number().optional(),
      depth: zod.z.number().optional(),
      aggregation: zod.z.number().optional(),
      titleMatch: zod.z.number().optional(),
      freshness: zod.z.number().optional(),
      anchorText: zod.z.number().optional()
    }).optional()
  }).optional(),
  search: zod.z.object({
    pageSearchWeight: zod.z.number().min(0).max(1).optional()
  }).optional()
}).optional();
17626
18149
  var requestSchema = zod.z.object({
17627
18150
  q: zod.z.string().trim().min(1),
17628
18151
  topK: zod.z.number().int().positive().max(100).optional(),
17629
18152
  scope: zod.z.string().optional(),
17630
18153
  pathPrefix: zod.z.string().optional(),
17631
18154
  tags: zod.z.array(zod.z.string()).optional(),
17632
- groupBy: zod.z.enum(["page", "chunk"]).optional()
18155
+ filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
18156
+ groupBy: zod.z.enum(["page", "chunk"]).optional(),
18157
+ maxSubResults: zod.z.number().int().positive().max(20).optional(),
18158
+ debug: zod.z.boolean().optional(),
18159
+ rankingOverrides: rankingOverridesSchema
17633
18160
  });
17634
- var SearchEngine = class _SearchEngine {
17635
- cwd;
17636
- config;
17637
- store;
18161
// Page-count limit for site-structure handling (2000); its use lies outside this section.
var MAX_SITE_STRUCTURE_PAGES = 2000;

/**
 * Creates an empty tree node for `url` at the given depth: no title or route
 * file, not marked as indexed, and no children.
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
18165
/**
 * Builds a URL hierarchy from a flat list of pages.
 *
 * Each page URL is normalised and split into segments. A node is created for
 * every ancestor path that does not exist yet, and the node for the full path
 * receives the page's title and route file and is marked as indexed.
 * Children are sorted by URL and `childCount` is set on every node.
 *
 * Nodes are always keyed by the path rebuilt from the segments, so a
 * normalised URL with duplicate or trailing slashes still resolves to its
 * node instead of missing the map.
 *
 * @param pages - Objects with `url`, `title` and `routeFile`.
 * @param pathPrefix - Optional path; when given, the matching subtree is
 *   returned, or an empty placeholder node when no such path exists.
 * @returns The root node ("/"), or the subtree / placeholder for `pathPrefix`.
 */
function buildTree(pages, pathPrefix) {
  const nodeMap = /* @__PURE__ */ new Map();
  const root2 = makeNode("/", 0);
  nodeMap.set("/", root2);
  for (const page of pages) {
    const segments = normalizeUrlPath(page.url).split("/").filter(Boolean);
    // Walk down from the root, creating and attaching missing ancestors.
    // A URL without segments leaves `node` pointing at the root.
    let node = root2;
    let partialUrl = "";
    for (let i = 0; i < segments.length; i++) {
      partialUrl = `${partialUrl}/${segments[i]}`;
      let child = nodeMap.get(partialUrl);
      if (!child) {
        child = makeNode(partialUrl, i + 1);
        nodeMap.set(partialUrl, child);
        node.children.push(child);
      }
      node = child;
    }
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  }
  const sortAndCount = (node) => {
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      sortAndCount(child);
    }
  };
  sortAndCount(root2);
  if (!pathPrefix) return root2;
  const normalizedPrefix = normalizeUrlPath(pathPrefix);
  const prefixSegments = normalizedPrefix.split("/").filter(Boolean);
  // Look the prefix up under the same segment-derived key the nodes use.
  const subtreeRoot = nodeMap.get(`/${prefixSegments.join("/")}`);
  return subtreeRoot ?? makeNode(normalizedPrefix, prefixSegments.length);
}
18214
/**
 * Returns a copy of `base` with request-level overrides applied to its
 * `search`, `ranking` and `ranking.weights` sections. `base` is not mutated.
 *
 * Override keys whose value is `undefined` are ignored. A plain object spread
 * would copy them and replace a configured value with `undefined`, which then
 * turns into NaN in the score arithmetic.
 *
 * @param base - Full config; must contain `search`, `ranking` and `ranking.weights`.
 * @param overrides - Object with optional `search` and `ranking` sections.
 * @returns New config object; sections other than `search` and `ranking` are
 *   shared with `base`.
 */
function mergeRankingOverrides(base, overrides) {
  // Keep only the entries that actually carry a value.
  const definedOnly = (section) => Object.fromEntries(
    Object.entries(section ?? {}).filter(([, value]) => value !== void 0)
  );
  return {
    ...base,
    search: {
      ...base.search,
      ...definedOnly(overrides.search)
    },
    ranking: {
      ...base.ranking,
      ...definedOnly(overrides.ranking),
      // Weights are merged key by key rather than replaced wholesale.
      weights: {
        ...base.ranking.weights,
        ...definedOnly(overrides.ranking?.weights)
      }
    }
  };
}
18231
+ var SearchEngine = class _SearchEngine {
18232
+ cwd;
18233
+ config;
18234
+ store;
17638
18235
  constructor(options) {
17639
18236
  this.cwd = options.cwd;
17640
18237
  this.config = options.config;
@@ -17660,125 +18257,203 @@ var SearchEngine = class _SearchEngine {
17660
18257
  }
17661
18258
  const input = parsed.data;
17662
18259
  const totalStart = process.hrtime.bigint();
18260
+ const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
17663
18261
  const resolvedScope = resolveScope(this.config, input.scope);
17664
18262
  const topK = input.topK ?? 10;
18263
+ const maxSubResults = input.maxSubResults ?? 5;
17665
18264
  const groupByPage = (input.groupBy ?? "page") === "page";
17666
- const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
17667
- const filterParts = [];
17668
- if (input.pathPrefix) {
17669
- const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
17670
- filterParts.push(`url GLOB '${prefix}*'`);
17671
- }
17672
- if (input.tags && input.tags.length > 0) {
17673
- for (const tag of input.tags) {
17674
- filterParts.push(`tags GLOB '*${tag}*'`);
18265
+ const queryText = input.q;
18266
+ const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
18267
+ const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
18268
+ const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
18269
+ const metaFilter = metaFilterStr || void 0;
18270
+ const applyPagePostFilters = (hits) => {
18271
+ let filtered = hits;
18272
+ if (pathPrefix) {
18273
+ filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
18274
+ }
18275
+ if (filterTags) {
18276
+ filtered = filtered.filter(
18277
+ (h) => filterTags.every((tag) => h.tags.includes(tag))
18278
+ );
17675
18279
  }
17676
- }
17677
- const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
17678
- const useDualSearch = this.config.search.dualSearch && groupByPage;
18280
+ return filtered;
18281
+ };
18282
+ const applyChunkPostFilters = (hits) => {
18283
+ let filtered = hits;
18284
+ if (filterTags) {
18285
+ filtered = filtered.filter(
18286
+ (h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
18287
+ );
18288
+ }
18289
+ return filtered;
18290
+ };
17679
18291
  const searchStart = process.hrtime.bigint();
17680
- let ranked;
17681
- if (useDualSearch) {
17682
- const chunkLimit = Math.max(topK * 10, 100);
17683
- const pageLimit = 20;
17684
- const [pageHits, chunkHits] = await Promise.all([
17685
- this.store.searchPages(
17686
- input.q,
17687
- {
17688
- limit: pageLimit,
17689
- semanticWeight: this.config.search.semanticWeight,
17690
- inputEnrichment: this.config.search.inputEnrichment,
17691
- filter
17692
- },
17693
- resolvedScope
17694
- ),
17695
- this.store.search(
17696
- input.q,
17697
- {
17698
- limit: chunkLimit,
17699
- semanticWeight: this.config.search.semanticWeight,
17700
- inputEnrichment: this.config.search.inputEnrichment,
17701
- reranking: false,
17702
- filter
17703
- },
18292
+ if (groupByPage) {
18293
+ const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
18294
+ const pageLimit = Math.max(topK * 2, 20);
18295
+ const pageHits = await this.store.searchPagesByText(
18296
+ queryText,
18297
+ { limit: pageLimit * fetchMultiplier, filter: metaFilter },
18298
+ resolvedScope
18299
+ );
18300
+ const filteredPages = applyPagePostFilters(pageHits);
18301
+ let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
18302
+ rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
18303
+ const topPages = rankedPages.slice(0, topK);
18304
+ const chunkPromises = topPages.map(
18305
+ (page) => this.store.searchChunksByUrl(
18306
+ queryText,
18307
+ page.url,
18308
+ { limit: maxSubResults, filter: metaFilter },
17704
18309
  resolvedScope
17705
- )
17706
- ]);
17707
- const rankedChunks = rankHits(chunkHits, this.config, input.q);
17708
- ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
18310
+ ).then((chunks) => applyChunkPostFilters(chunks))
18311
+ );
18312
+ const allChunks = await Promise.all(chunkPromises);
18313
+ const searchMs = hrTimeMs(searchStart);
18314
+ const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
18315
+ return {
18316
+ q: input.q,
18317
+ scope: resolvedScope.scopeName,
18318
+ results,
18319
+ meta: {
18320
+ timingsMs: {
18321
+ search: Math.round(searchMs),
18322
+ total: Math.round(hrTimeMs(totalStart))
18323
+ }
18324
+ }
18325
+ };
17709
18326
  } else {
18327
+ const candidateK = Math.max(50, topK);
18328
+ const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
17710
18329
  const hits = await this.store.search(
17711
- input.q,
17712
- {
17713
- limit: candidateK,
17714
- semanticWeight: this.config.search.semanticWeight,
17715
- inputEnrichment: this.config.search.inputEnrichment,
17716
- reranking: this.config.search.reranking,
17717
- filter
17718
- },
18330
+ queryText,
18331
+ { limit: candidateK * fetchMultiplier, filter: metaFilter },
17719
18332
  resolvedScope
17720
18333
  );
17721
- ranked = rankHits(hits, this.config, input.q);
17722
- }
17723
- const searchMs = hrTimeMs(searchStart);
17724
- const results = this.buildResults(ranked, topK, groupByPage, input.q);
17725
- return {
17726
- q: input.q,
17727
- scope: resolvedScope.scopeName,
17728
- results,
17729
- meta: {
17730
- timingsMs: {
17731
- search: Math.round(searchMs),
17732
- total: Math.round(hrTimeMs(totalStart))
18334
+ let filtered = hits;
18335
+ if (pathPrefix) {
18336
+ filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
18337
+ }
18338
+ if (filterTags) {
18339
+ filtered = filtered.filter(
18340
+ (h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
18341
+ );
18342
+ }
18343
+ const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
18344
+ const searchMs = hrTimeMs(searchStart);
18345
+ const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
18346
+ return {
18347
+ q: input.q,
18348
+ scope: resolvedScope.scopeName,
18349
+ results,
18350
+ meta: {
18351
+ timingsMs: {
18352
+ search: Math.round(searchMs),
18353
+ total: Math.round(hrTimeMs(totalStart))
18354
+ }
17733
18355
  }
18356
+ };
18357
+ }
18358
+ }
18359
+ buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
18360
+ return rankedPages.map((page, i) => {
18361
+ const chunks = allChunks[i] ?? [];
18362
+ const bestChunk = chunks[0];
18363
+ const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
18364
+ const result = {
18365
+ url: page.url,
18366
+ title: page.title,
18367
+ sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
18368
+ snippet,
18369
+ chunkText: bestChunk?.metadata.chunkText || void 0,
18370
+ score: Number(page.finalScore.toFixed(6)),
18371
+ routeFile: page.routeFile,
18372
+ chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
18373
+ sectionTitle: c.metadata.sectionTitle || void 0,
18374
+ snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
18375
+ chunkText: c.metadata.chunkText || void 0,
18376
+ headingPath: c.metadata.headingPath,
18377
+ score: Number(c.score.toFixed(6))
18378
+ })) : void 0
18379
+ };
18380
+ if (debug && page.breakdown) {
18381
+ result.breakdown = {
18382
+ baseScore: page.breakdown.baseScore,
18383
+ incomingLinkBoost: page.breakdown.incomingLinkBoost,
18384
+ depthBoost: page.breakdown.depthBoost,
18385
+ titleMatchBoost: page.breakdown.titleMatchBoost,
18386
+ freshnessBoost: page.breakdown.freshnessBoost,
18387
+ anchorTextMatchBoost: 0
18388
+ };
17734
18389
  }
17735
- };
18390
+ return result;
18391
+ });
17736
18392
  }
17737
- ensureSnippet(hit) {
18393
+ ensureSnippet(hit, query) {
18394
+ const chunkText = hit.hit.metadata.chunkText;
18395
+ if (query && chunkText) return queryAwareExcerpt(chunkText, query);
17738
18396
  const snippet = hit.hit.metadata.snippet;
17739
18397
  if (snippet && snippet.length >= 30) return snippet;
17740
- const chunkText = hit.hit.metadata.chunkText;
17741
18398
  if (chunkText) return toSnippet(chunkText);
17742
18399
  return snippet || "";
17743
18400
  }
17744
- buildResults(ordered, topK, groupByPage, _query) {
18401
+ buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
18402
+ const cfg = config ?? this.config;
17745
18403
  if (groupByPage) {
17746
- let pages = aggregateByPage(ordered, this.config);
17747
- pages = trimByScoreGap(pages, this.config);
17748
- const minRatio = this.config.ranking.minChunkScoreRatio;
18404
+ let pages = aggregateByPage(ordered, cfg);
18405
+ pages = trimByScoreGap(pages, cfg);
18406
+ const minRatio = cfg.ranking.minChunkScoreRatio;
17749
18407
  return pages.slice(0, topK).map((page) => {
17750
18408
  const bestScore = page.bestChunk.finalScore;
17751
18409
  const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
17752
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
17753
- return {
18410
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
18411
+ const result = {
17754
18412
  url: page.url,
17755
18413
  title: page.title,
17756
18414
  sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
17757
- snippet: this.ensureSnippet(page.bestChunk),
18415
+ snippet: this.ensureSnippet(page.bestChunk, query),
18416
+ chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
17758
18417
  score: Number(page.pageScore.toFixed(6)),
17759
18418
  routeFile: page.routeFile,
17760
- chunks: meaningful.length > 1 ? meaningful.map((c) => ({
18419
+ chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
17761
18420
  sectionTitle: c.hit.metadata.sectionTitle || void 0,
17762
- snippet: this.ensureSnippet(c),
18421
+ snippet: this.ensureSnippet(c, query),
18422
+ chunkText: c.hit.metadata.chunkText || void 0,
17763
18423
  headingPath: c.hit.metadata.headingPath,
17764
18424
  score: Number(c.finalScore.toFixed(6))
17765
18425
  })) : void 0
17766
18426
  };
18427
+ if (debug && page.bestChunk.breakdown) {
18428
+ result.breakdown = page.bestChunk.breakdown;
18429
+ }
18430
+ return result;
17767
18431
  });
17768
18432
  } else {
17769
18433
  let filtered = ordered;
17770
- const minScore = this.config.ranking.minScore;
17771
- if (minScore > 0) {
17772
- filtered = ordered.filter((entry) => entry.finalScore >= minScore);
17773
- }
17774
- return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
17775
- url: hit.metadata.url,
17776
- title: hit.metadata.title,
17777
- sectionTitle: hit.metadata.sectionTitle || void 0,
17778
- snippet: this.ensureSnippet({ hit, finalScore }),
17779
- score: Number(finalScore.toFixed(6)),
17780
- routeFile: hit.metadata.routeFile
17781
- }));
18434
+ const minScoreRatio = cfg.ranking.minScoreRatio;
18435
+ if (minScoreRatio > 0 && ordered.length > 0) {
18436
+ const topScore = ordered[0].finalScore;
18437
+ if (Number.isFinite(topScore) && topScore > 0) {
18438
+ const threshold = topScore * minScoreRatio;
18439
+ filtered = ordered.filter((entry) => entry.finalScore >= threshold);
18440
+ }
18441
+ }
18442
+ return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
18443
+ const result = {
18444
+ url: hit.metadata.url,
18445
+ title: hit.metadata.title,
18446
+ sectionTitle: hit.metadata.sectionTitle || void 0,
18447
+ snippet: this.ensureSnippet({ hit, finalScore }, query),
18448
+ chunkText: hit.metadata.chunkText || void 0,
18449
+ score: Number(finalScore.toFixed(6)),
18450
+ routeFile: hit.metadata.routeFile
18451
+ };
18452
+ if (debug && breakdown) {
18453
+ result.breakdown = breakdown;
18454
+ }
18455
+ return result;
18456
+ });
17782
18457
  }
17783
18458
  }
17784
18459
  async getPage(pathOrUrl, scope) {
@@ -17804,6 +18479,116 @@ var SearchEngine = class _SearchEngine {
17804
18479
  markdown: page.markdown
17805
18480
  };
17806
18481
  }
18482
+ async listPages(opts) {
18483
+ const resolvedScope = resolveScope(this.config, opts?.scope);
18484
+ const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
18485
+ return this.store.listPages(resolvedScope, {
18486
+ cursor: opts?.cursor,
18487
+ limit: opts?.limit,
18488
+ pathPrefix
18489
+ });
18490
+ }
18491
+ async getSiteStructure(opts) {
18492
+ const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
18493
+ const allPages = [];
18494
+ let cursor;
18495
+ let truncated = false;
18496
+ do {
18497
+ const result = await this.listPages({
18498
+ pathPrefix: opts?.pathPrefix,
18499
+ scope: opts?.scope,
18500
+ cursor,
18501
+ limit: 200
18502
+ });
18503
+ allPages.push(...result.pages);
18504
+ cursor = result.nextCursor;
18505
+ if (allPages.length >= maxPages) {
18506
+ truncated = allPages.length > maxPages || !!cursor;
18507
+ allPages.length = maxPages;
18508
+ break;
18509
+ }
18510
+ } while (cursor);
18511
+ const root2 = buildTree(allPages, opts?.pathPrefix);
18512
+ return {
18513
+ root: root2,
18514
+ totalPages: allPages.length,
18515
+ truncated
18516
+ };
18517
+ }
18518
+ async getRelatedPages(pathOrUrl, opts) {
18519
+ const resolvedScope = resolveScope(this.config, opts?.scope);
18520
+ const urlPath = this.resolveInputPath(pathOrUrl);
18521
+ const topK = Math.min(opts?.topK ?? 10, 25);
18522
+ const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
18523
+ if (!source) {
18524
+ throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
18525
+ }
18526
+ const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
18527
+ const semanticHits = await this.store.searchPagesByVector(
18528
+ source.vector,
18529
+ { limit: 50 },
18530
+ resolvedScope
18531
+ );
18532
+ const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
18533
+ const semanticScoreMap = /* @__PURE__ */ new Map();
18534
+ for (const hit of filteredHits) {
18535
+ semanticScoreMap.set(hit.url, hit.score);
18536
+ }
18537
+ const candidateUrls = /* @__PURE__ */ new Set();
18538
+ for (const hit of filteredHits) {
18539
+ candidateUrls.add(hit.url);
18540
+ }
18541
+ for (const url of sourceOutgoing) {
18542
+ if (url !== urlPath) candidateUrls.add(url);
18543
+ }
18544
+ const missingUrls = [...sourceOutgoing].filter(
18545
+ (u) => u !== urlPath && !semanticScoreMap.has(u)
18546
+ );
18547
+ const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
18548
+ const metaMap = /* @__PURE__ */ new Map();
18549
+ for (const hit of filteredHits) {
18550
+ metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
18551
+ }
18552
+ for (const p of fetchedPages) {
18553
+ metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
18554
+ }
18555
+ const semanticUrls = filteredHits.map((h) => h.url);
18556
+ if (semanticUrls.length > 0) {
18557
+ const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
18558
+ for (const p of semanticPageData) {
18559
+ const existing = metaMap.get(p.url);
18560
+ if (existing) {
18561
+ existing.outgoingLinkUrls = p.outgoingLinkUrls;
18562
+ }
18563
+ }
18564
+ }
18565
+ const candidates = [];
18566
+ for (const url of candidateUrls) {
18567
+ const meta = metaMap.get(url);
18568
+ if (!meta) continue;
18569
+ const isOutgoing = sourceOutgoing.has(url);
18570
+ const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
18571
+ const isLinked = isOutgoing || isIncoming;
18572
+ const dice = diceScore(urlPath, url);
18573
+ const semantic = semanticScoreMap.get(url) ?? 0;
18574
+ const score = compositeScore(isLinked, dice, semantic);
18575
+ const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
18576
+ candidates.push({
18577
+ url,
18578
+ title: meta.title,
18579
+ score: Number(score.toFixed(6)),
18580
+ relationshipType,
18581
+ routeFile: meta.routeFile
18582
+ });
18583
+ }
18584
+ candidates.sort((a, b) => b.score - a.score);
18585
+ const results = candidates.slice(0, topK);
18586
+ return {
18587
+ sourceUrl: urlPath,
18588
+ scope: resolvedScope.scopeName,
18589
+ relatedPages: results
18590
+ };
18591
+ }
17807
18592
  async health() {
17808
18593
  return this.store.health();
17809
18594
  }
@@ -17819,6 +18604,215 @@ var SearchEngine = class _SearchEngine {
17819
18604
  }
17820
18605
  };
17821
18606
 
18607
+ // src/mcp/server.ts
18608
/**
 * Builds an MCP server exposing the search engine as tools:
 * search, get_page, list_pages, get_site_structure, find_source_file and
 * get_related_pages.
 *
 * @param engine Object providing search, getPage, listPages,
 *   getSiteStructure and getRelatedPages (each awaited by the tool handlers).
 * @returns The configured McpServer instance (not yet connected to a transport).
 */
function createServer(engine) {
  const { z } = zod;
  const server = new mcp_js.McpServer({
    name: "searchsocket-mcp",
    version: "0.1.0"
  });
  // Wraps a payload as the single text content item every tool returns.
  // `indent` is forwarded to JSON.stringify (undefined = compact output).
  const textResult = (payload, indent) => ({
    content: [
      {
        type: "text",
        text: JSON.stringify(payload, null, indent)
      }
    ]
  });
  server.registerTool(
    "search",
    {
      description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(100).optional(),
        pathPrefix: z.string().optional(),
        tags: z.array(z.string()).optional(),
        filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
        groupBy: z.enum(["page", "chunk"]).optional(),
        maxSubResults: z.number().int().positive().max(20).optional()
      },
      outputSchema: {
        q: z.string(),
        scope: z.string(),
        results: z.array(z.object({
          url: z.string(),
          title: z.string(),
          sectionTitle: z.string().optional(),
          snippet: z.string(),
          // chunkText is part of every search result (and is promised by the
          // description above), so it is declared here to keep the advertised
          // schema in line with structuredContent.
          chunkText: z.string().optional(),
          score: z.number(),
          routeFile: z.string(),
          chunks: z.array(z.object({
            sectionTitle: z.string().optional(),
            snippet: z.string(),
            chunkText: z.string().optional(),
            headingPath: z.array(z.string()),
            score: z.number()
          })).optional()
        })),
        meta: z.object({
          timingsMs: z.object({
            search: z.number(),
            total: z.number()
          })
        })
      }
    },
    async (input) => {
      const result = await engine.search({
        q: input.query,
        topK: input.topK,
        scope: input.scope,
        pathPrefix: input.pathPrefix,
        tags: input.tags,
        filters: input.filters,
        groupBy: input.groupBy,
        maxSubResults: input.maxSubResults
      });
      return {
        ...textResult(result, 2),
        structuredContent: result
      };
    }
  );
  server.registerTool(
    "get_page",
    {
      description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const page = await engine.getPage(input.pathOrUrl, input.scope);
      return textResult(page, 2);
    }
  );
  server.registerTool(
    "list_pages",
    {
      description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        cursor: z.string().optional(),
        limit: z.number().int().positive().max(200).optional(),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const result = await engine.listPages({
        pathPrefix: input.pathPrefix,
        cursor: input.cursor,
        limit: input.limit,
        scope: input.scope
      });
      return textResult(result, 2);
    }
  );
  server.registerTool(
    "get_site_structure",
    {
      description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        scope: z.string().optional(),
        maxPages: z.number().int().positive().max(2e3).optional()
      }
    },
    async (input) => {
      const result = await engine.getSiteStructure({
        pathPrefix: input.pathPrefix,
        scope: input.scope,
        maxPages: input.maxPages
      });
      return textResult(result, 2);
    }
  );
  server.registerTool(
    "find_source_file",
    {
      description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      // Only the single best match is needed to locate the source file.
      const result = await engine.search({
        q: input.query,
        topK: 1,
        scope: input.scope
      });
      if (result.results.length === 0) {
        return textResult({
          error: "No matching content found for the given query."
        });
      }
      const { url, routeFile, sectionTitle, snippet } = result.results[0];
      return textResult({ url, routeFile, sectionTitle, snippet });
    }
  );
  server.registerTool(
    "get_related_pages",
    {
      description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(25).optional()
      }
    },
    async (input) => {
      const result = await engine.getRelatedPages(input.pathOrUrl, {
        topK: input.topK,
        scope: input.scope
      });
      return textResult(result, 2);
    }
  );
  return server;
}
18815
+
17822
18816
  // src/sveltekit/handle.ts
17823
18817
  var InMemoryRateLimiter = class {
17824
18818
  constructor(windowMs, max) {
@@ -17847,7 +18841,13 @@ function searchsocketHandle(options = {}) {
17847
18841
  let enginePromise = null;
17848
18842
  let configPromise = null;
17849
18843
  let apiPath = options.path;
18844
+ let llmsServePath = null;
18845
+ let serveMarkdownVariants = false;
18846
+ let mcpPath;
18847
+ let mcpApiKey;
18848
+ let mcpEnableJsonResponse = true;
17850
18849
  let rateLimiter = null;
18850
+ let notConfigured = false;
17851
18851
  const getConfig = async () => {
17852
18852
  if (!configPromise) {
17853
18853
  let configP;
@@ -17864,6 +18864,13 @@ function searchsocketHandle(options = {}) {
17864
18864
  }
17865
18865
  configPromise = configP.then((config) => {
17866
18866
  apiPath = apiPath ?? config.api.path;
18867
+ mcpPath = config.mcp.handle.path;
18868
+ mcpApiKey = config.mcp.handle.apiKey;
18869
+ mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
18870
+ if (config.llmsTxt.enable) {
18871
+ llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
18872
+ serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
18873
+ }
17867
18874
  if (config.api.rateLimit && !isServerless()) {
17868
18875
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
17869
18876
  }
@@ -17873,59 +18880,109 @@ function searchsocketHandle(options = {}) {
17873
18880
  return configPromise;
17874
18881
  };
17875
18882
  const getEngine = async () => {
18883
+ if (notConfigured) {
18884
+ throw new SearchSocketError(
18885
+ "SEARCH_NOT_CONFIGURED",
18886
+ "Search is not configured. Set the required Upstash environment variables to enable search.",
18887
+ 503
18888
+ );
18889
+ }
17876
18890
  if (!enginePromise) {
17877
18891
  const config = await getConfig();
17878
18892
  enginePromise = SearchEngine.create({
17879
18893
  cwd: options.cwd,
17880
18894
  config
18895
+ }).catch((error) => {
18896
+ enginePromise = null;
18897
+ if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
18898
+ notConfigured = true;
18899
+ throw new SearchSocketError(
18900
+ "SEARCH_NOT_CONFIGURED",
18901
+ "Search is not configured. Set the required Upstash environment variables to enable search.",
18902
+ 503
18903
+ );
18904
+ }
18905
+ throw error;
17881
18906
  });
17882
18907
  }
17883
18908
  return enginePromise;
17884
18909
  };
17885
18910
  const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
17886
18911
  return async ({ event, resolve }) => {
17887
- if (apiPath && event.url.pathname !== apiPath) {
17888
- return resolve(event);
18912
+ if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
18913
+ const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
18914
+ if (mcpPath && event.url.pathname === mcpPath) {
18915
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18916
+ }
18917
+ if (mcpPath) {
18918
+ if (serveMarkdownVariants && isMarkdownVariant) ; else {
18919
+ return resolve(event);
18920
+ }
18921
+ } else {
18922
+ if (configPromise || options.config || options.rawConfig) {
18923
+ await getConfig();
18924
+ if (mcpPath && event.url.pathname === mcpPath) {
18925
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18926
+ }
18927
+ if (!(serveMarkdownVariants && isMarkdownVariant)) {
18928
+ return resolve(event);
18929
+ }
18930
+ } else {
18931
+ return resolve(event);
18932
+ }
18933
+ }
17889
18934
  }
17890
18935
  const config = await getConfig();
18936
+ if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
18937
+ const cwd = options.cwd ?? process.cwd();
18938
+ const filePath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
18939
+ try {
18940
+ const content = await fs9__default.default.readFile(filePath, "utf8");
18941
+ return new Response(content, {
18942
+ status: 200,
18943
+ headers: { "content-type": "text/plain; charset=utf-8" }
18944
+ });
18945
+ } catch {
18946
+ return resolve(event);
18947
+ }
18948
+ }
18949
+ if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
18950
+ let rawPath;
18951
+ try {
18952
+ rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
18953
+ } catch {
18954
+ return resolve(event);
18955
+ }
18956
+ const scope = event.url.searchParams?.get("scope") ?? void 0;
18957
+ try {
18958
+ const engine = await getEngine();
18959
+ const page = await engine.getPage(rawPath, scope);
18960
+ return new Response(page.markdown, {
18961
+ status: 200,
18962
+ headers: { "content-type": "text/markdown; charset=utf-8" }
18963
+ });
18964
+ } catch (error) {
18965
+ if (error instanceof SearchSocketError && error.status === 404) {
18966
+ return resolve(event);
18967
+ }
18968
+ throw error;
18969
+ }
18970
+ }
18971
+ if (mcpPath && event.url.pathname === mcpPath) {
18972
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18973
+ }
17891
18974
  const targetPath = apiPath ?? config.api.path;
17892
- if (event.url.pathname !== targetPath) {
18975
+ if (!isApiPath(event.url.pathname, targetPath)) {
17893
18976
  return resolve(event);
17894
18977
  }
17895
- if (event.request.method === "OPTIONS") {
18978
+ const subPath = event.url.pathname.slice(targetPath.length);
18979
+ const method = event.request.method;
18980
+ if (method === "OPTIONS") {
17896
18981
  return new Response(null, {
17897
18982
  status: 204,
17898
18983
  headers: buildCorsHeaders(event.request, config)
17899
18984
  });
17900
18985
  }
17901
- if (event.request.method !== "POST") {
17902
- return withCors(
17903
- new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
17904
- status: 405,
17905
- headers: {
17906
- "content-type": "application/json"
17907
- }
17908
- }),
17909
- event.request,
17910
- config
17911
- );
17912
- }
17913
- const contentLength = Number(event.request.headers.get("content-length") ?? 0);
17914
- if (contentLength > bodyLimit) {
17915
- return withCors(
17916
- new Response(
17917
- JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
17918
- {
17919
- status: 413,
17920
- headers: {
17921
- "content-type": "application/json"
17922
- }
17923
- }
17924
- ),
17925
- event.request,
17926
- config
17927
- );
17928
- }
17929
18986
  if (rateLimiter) {
17930
18987
  const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
17931
18988
  if (!rateLimiter.check(ip)) {
@@ -17945,39 +19002,32 @@ function searchsocketHandle(options = {}) {
17945
19002
  }
17946
19003
  }
17947
19004
  try {
17948
- let rawBody;
17949
- if (typeof event.request.text === "function") {
17950
- rawBody = await event.request.text();
17951
- } else {
17952
- let parsedFallback;
17953
- try {
17954
- parsedFallback = await event.request.json();
17955
- } catch (error) {
17956
- if (error instanceof SyntaxError) {
17957
- throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
17958
- }
17959
- throw error;
19005
+ if (method === "GET") {
19006
+ if (subPath === "" || subPath === "/") {
19007
+ return await handleGetSearch(event, config, getEngine);
19008
+ }
19009
+ if (subPath === "/health") {
19010
+ return await handleGetHealth(event, config, getEngine);
19011
+ }
19012
+ if (subPath.startsWith("/pages/")) {
19013
+ return await handleGetPage(event, config, getEngine, subPath);
17960
19014
  }
17961
- rawBody = JSON.stringify(parsedFallback);
19015
+ return withCors(
19016
+ new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
19017
+ status: 404,
19018
+ headers: { "content-type": "application/json" }
19019
+ }),
19020
+ event.request,
19021
+ config
19022
+ );
17962
19023
  }
17963
- if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
17964
- throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
19024
+ if (method === "POST" && (subPath === "" || subPath === "/")) {
19025
+ return await handlePostSearch(event, config, getEngine, bodyLimit);
17965
19026
  }
17966
- let body;
17967
- try {
17968
- body = JSON.parse(rawBody);
17969
- } catch {
17970
- throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
17971
- }
17972
- const engine = await getEngine();
17973
- const searchRequest = body;
17974
- const result = await engine.search(searchRequest);
17975
19027
  return withCors(
17976
- new Response(JSON.stringify(result), {
17977
- status: 200,
17978
- headers: {
17979
- "content-type": "application/json"
17980
- }
19028
+ new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
19029
+ status: 405,
19030
+ headers: { "content-type": "application/json" }
17981
19031
  }),
17982
19032
  event.request,
17983
19033
  config
@@ -17998,6 +19048,183 @@ function searchsocketHandle(options = {}) {
17998
19048
  }
17999
19049
  };
18000
19050
  }
19051
/**
 * Tells whether a request path addresses the search API.
 *
 * @param pathname Request path.
 * @param apiPath API base path.
 * @returns true when `pathname` equals `apiPath` or lies below it
 *   (i.e. starts with `apiPath` followed by "/").
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  return pathname.startsWith(`${apiPath}/`);
}
19054
/**
 * Handles `GET <apiPath>?q=...`: builds a search request from the query
 * string, runs it and returns the JSON result with CORS headers applied.
 *
 * Recognised parameters: q (required), topK, scope, pathPrefix, groupBy
 * ("page" | "chunk"), maxSubResults (1..20) and repeated `tags`.
 *
 * @param event Request event; `event.url.searchParams` and `event.request` are read.
 * @param config Resolved config, forwarded to withCors.
 * @param getEngine Async factory returning the search engine.
 * @returns A 200 JSON Response.
 * @throws SearchSocketError (INVALID_REQUEST, 400) on a missing or invalid parameter.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;
  // Strict integer reader: returns undefined when the parameter is absent and
  // NaN when it is not a plain run of decimal digits. Number.parseInt alone
  // would accept "5abc", "1.9" or "1e3" as valid integers.
  const readIntParam = (name) => {
    const raw = params.get(name);
    if (raw === null) return void 0;
    const text = raw.trim();
    if (!/^\d+$/.test(text)) return Number.NaN;
    const value = Number.parseInt(text, 10);
    return Number.isSafeInteger(value) ? value : Number.NaN;
  };
  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };
  const topK = readIntParam("topK");
  if (topK !== void 0) {
    if (Number.isNaN(topK) || topK < 1) {
      throw new SearchSocketError("INVALID_REQUEST", "topK must be a positive integer", 400);
    }
    searchRequest.topK = topK;
  }
  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;
  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;
  const groupBy = params.get("groupBy");
  // An absent or empty groupBy is ignored rather than rejected.
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }
  const maxSubResults = readIntParam("maxSubResults");
  if (maxSubResults !== void 0) {
    if (Number.isNaN(maxSubResults) || maxSubResults < 1 || maxSubResults > 20) {
      throw new SearchSocketError("INVALID_REQUEST", "maxSubResults must be a positive integer between 1 and 20", 400);
    }
    searchRequest.maxSubResults = maxSubResults;
  }
  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;
  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
19101
/**
 * Handles `GET <apiPath>/health`: returns the engine's health report as JSON
 * with CORS headers applied.
 *
 * @param event Request event; `event.request` is forwarded to withCors.
 * @param config Resolved config, forwarded to withCors.
 * @param getEngine Async factory returning the search engine.
 * @returns A 200 JSON Response.
 */
async function handleGetHealth(event, config, getEngine) {
  const engine = await getEngine();
  const report = await engine.health();
  const response = new Response(JSON.stringify(report), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19113
/**
 * Handles `GET <apiPath>/pages/<path>`: looks up an indexed page and returns
 * it as JSON with CORS headers applied.
 *
 * @param event Request event; the optional `scope` query parameter is read.
 * @param config Resolved config, forwarded to withCors.
 * @param getEngine Async factory returning the search engine.
 * @param subPath Request path relative to the API base (begins with "/pages").
 * @returns A 200 JSON Response.
 * @throws SearchSocketError (INVALID_REQUEST, 400) when the path is not valid
 *   percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Strip the "/pages" prefix; what remains (with its leading "/") is the page path.
  const encodedPath = subPath.slice("/pages".length);
  let pagePath;
  try {
    pagePath = decodeURIComponent(encodedPath);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
  }
  const scope = event.url.searchParams?.get("scope") ?? void 0;
  const engine = await getEngine();
  const page = await engine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19133
/**
 * Handles `POST <apiPath>`: reads a JSON search request from the body, runs
 * it and returns the JSON result with CORS headers applied.
 *
 * The size limit is enforced twice: first against the declared
 * content-length header, then against the byte length of the body actually
 * read.
 *
 * @param event Request event; `event.request` supplies headers and body.
 * @param config Resolved config, forwarded to withCors.
 * @param getEngine Async factory returning the search engine.
 * @param bodyLimit Maximum accepted body size in bytes.
 * @returns A 200 JSON Response.
 * @throws SearchSocketError (INVALID_REQUEST) with status 413 for an
 *   oversized body or 400 for malformed JSON.
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  }
  let rawBody;
  if (typeof event.request.text !== "function") {
    // Request objects without text(): parse via json() and re-serialise so
    // the size check below still applies.
    let parsedFallback;
    try {
      parsedFallback = await event.request.json();
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
      }
      throw error;
    }
    rawBody = JSON.stringify(parsedFallback);
  } else {
    rawBody = await event.request.text();
  }
  const actualBytes = Buffer.byteLength(rawBody, "utf8");
  if (actualBytes > bodyLimit) {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  }
  let searchRequest;
  try {
    searchRequest = JSON.parse(rawBody);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
  }
  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19174
/**
 * Serves one MCP request over the web-standard streamable HTTP transport.
 *
 * @param event Request event; `event.request` is authenticated and then
 *   handed to the transport.
 * @param apiKey Optional shared secret. When set, the request must carry
 *   `Authorization: Bearer <apiKey>`.
 * @param enableJsonResponse Forwarded to the transport; when truthy the
 *   transport and server are closed as soon as the response is produced.
 * @param getEngine Async factory returning the search engine.
 * @returns The transport's Response, a 401 JSON-RPC error when
 *   authentication fails, or a 500 JSON-RPC error when handling throws.
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    // HTTP authentication scheme names are case-insensitive, so "bearer" and
    // "BEARER" must be accepted as well as "Bearer".
    const hasBearerScheme = authHeader.slice(0, 7).toLowerCase() === "bearer ";
    const token = hasBearerScheme ? authHeader.slice(7) : "";
    const tokenBuf = Buffer.from(token);
    const keyBuf = Buffer.from(apiKey);
    // timingSafeEqual throws on length mismatch, hence the explicit length check first.
    if (tokenBuf.length !== keyBuf.length || !crypto.timingSafeEqual(tokenBuf, keyBuf)) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }
  // No session id generator is supplied; a fresh transport and server are
  // created for every request.
  const transport = new webStandardStreamableHttp_js.WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      // NOTE(review): in non-JSON mode the transport is left open, presumably
      // because the response is still streaming — confirm against the SDK.
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the
    // original error reported below.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status: 500, headers: { "content-type": "application/json" } }
    );
  }
}
18001
19228
  function buildCorsHeaders(request, config) {
18002
19229
  const allowOrigins = config.api.cors.allowOrigins;
18003
19230
  if (!allowOrigins || allowOrigins.length === 0) {
@@ -18010,7 +19237,7 @@ function buildCorsHeaders(request, config) {
18010
19237
  }
18011
19238
  return {
18012
19239
  "access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
18013
- "access-control-allow-methods": "POST, OPTIONS",
19240
+ "access-control-allow-methods": "GET, POST, OPTIONS",
18014
19241
  "access-control-allow-headers": "content-type"
18015
19242
  };
18016
19243
  }
@@ -18057,6 +19284,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
18057
19284
  if (normalizeText(current.text)) {
18058
19285
  sections.push({
18059
19286
  sectionTitle: current.sectionTitle,
19287
+ headingLevel: current.headingLevel,
18060
19288
  headingPath: current.headingPath,
18061
19289
  text: current.text.trim()
18062
19290
  });
@@ -18075,6 +19303,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
18075
19303
  headingStack.length = level;
18076
19304
  current = {
18077
19305
  sectionTitle: title,
19306
+ headingLevel: level,
18078
19307
  headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
18079
19308
  text: `${line}
18080
19309
  `
@@ -18210,6 +19439,7 @@ function splitSection(section, config) {
18210
19439
  return [
18211
19440
  {
18212
19441
  sectionTitle: section.sectionTitle,
19442
+ headingLevel: section.headingLevel,
18213
19443
  headingPath: section.headingPath,
18214
19444
  chunkText: text
18215
19445
  }
@@ -18260,6 +19490,7 @@ ${chunk}`;
18260
19490
  }
18261
19491
  return merged.map((chunkText) => ({
18262
19492
  sectionTitle: section.sectionTitle,
19493
+ headingLevel: section.headingLevel,
18263
19494
  headingPath: section.headingPath,
18264
19495
  chunkText
18265
19496
  }));
@@ -18275,6 +19506,18 @@ function buildSummaryChunkText(page) {
18275
19506
  }
18276
19507
  return parts.join("\n\n");
18277
19508
  }
19509
/**
 * Builds the heading-aware title string for a section chunk.
 *
 * @param {object} chunk - Reads `title`, `sectionTitle`, `headingLevel` and `headingPath`.
 * @returns {string | undefined} `undefined` when the chunk has no section title or
 *   no heading level; otherwise the page title followed by an em dash and either
 *   the section title or the joined heading path (with the section title appended
 *   when it is not already the last path entry).
 */
function buildEmbeddingTitle(chunk) {
  const { title, sectionTitle, headingLevel } = chunk;
  if (!sectionTitle || headingLevel === void 0) {
    return void 0;
  }
  const trail = chunk.headingPath;
  if (trail.length <= 1) {
    return `${title} \u2014 ${sectionTitle}`;
  }
  const joinedTrail = trail.join(" > ");
  const endsWithSection = trail[trail.length - 1] === sectionTitle;
  return endsWithSection
    ? `${title} \u2014 ${joinedTrail}`
    : `${title} \u2014 ${joinedTrail} > ${sectionTitle}`;
}
18278
19521
  function buildEmbeddingText(chunk, prependTitle) {
18279
19522
  if (!prependTitle) return chunk.chunkText;
18280
19523
  const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
@@ -18305,10 +19548,14 @@ function chunkPage(page, config, scope) {
18305
19548
  tags: page.tags,
18306
19549
  contentHash: "",
18307
19550
  description: page.description,
18308
- keywords: page.keywords
19551
+ keywords: page.keywords,
19552
+ publishedAt: page.publishedAt,
19553
+ incomingAnchorText: page.incomingAnchorText,
19554
+ meta: page.meta
18309
19555
  };
18310
19556
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
18311
- summaryChunk.contentHash = sha256(normalizeText(embeddingText));
19557
+ const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
19558
+ summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
18312
19559
  chunks.push(summaryChunk);
18313
19560
  }
18314
19561
  const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
@@ -18325,6 +19572,7 @@ function chunkPage(page, config, scope) {
18325
19572
  path: page.url,
18326
19573
  title: page.title,
18327
19574
  sectionTitle: entry.sectionTitle,
19575
+ headingLevel: entry.headingLevel,
18328
19576
  headingPath: entry.headingPath,
18329
19577
  chunkText: entry.chunkText,
18330
19578
  snippet: toSnippet(entry.chunkText),
@@ -18334,10 +19582,16 @@ function chunkPage(page, config, scope) {
18334
19582
  tags: page.tags,
18335
19583
  contentHash: "",
18336
19584
  description: page.description,
18337
- keywords: page.keywords
19585
+ keywords: page.keywords,
19586
+ publishedAt: page.publishedAt,
19587
+ incomingAnchorText: page.incomingAnchorText,
19588
+ meta: page.meta
18338
19589
  };
18339
19590
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
18340
- chunk.contentHash = sha256(normalizeText(embeddingText));
19591
+ const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
19592
+ const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
19593
+ const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
19594
+ chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
18341
19595
  chunks.push(chunk);
18342
19596
  }
18343
19597
  return chunks;
@@ -19170,6 +20424,69 @@ function gfm(turndownService) {
19170
20424
  }
19171
20425
 
19172
20426
  // src/indexing/extractor.ts
20427
/**
 * Converts a date-like value to a finite millisecond timestamp.
 *
 * @param {unknown} value - A `Date`, a string accepted by `new Date(...)`, or a number.
 * @returns {number | undefined} The timestamp, or `undefined` for null/undefined,
 *   unsupported types, invalid dates and non-finite numbers. Numbers are returned
 *   unchanged.
 */
function normalizeDateToMs(value) {
  if (value == null) return void 0;
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else {
    switch (typeof value) {
      case "string":
        ms = new Date(value).getTime();
        break;
      case "number":
        ms = value;
        break;
      default:
        return void 0;
    }
  }
  return Number.isFinite(ms) ? ms : void 0;
}
20442
// Frontmatter keys probed for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = [
  "date",
  "publishedAt",
  "updatedAt",
  "published_at",
  "updated_at"
];
/**
 * Returns the first frontmatter date field that normalizes to a timestamp.
 *
 * @param {object} data - Parsed frontmatter object.
 * @returns {number | undefined} Result of `normalizeDateToMs` for the first field
 *   in FRONTMATTER_DATE_FIELDS that yields a value, else `undefined`.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let i = 0; i < FRONTMATTER_DATE_FIELDS.length; i += 1) {
    const timestamp = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[i]]);
    if (timestamp === void 0) continue;
    return timestamp;
  }
  return void 0;
}
20450
/**
 * Finds a publication timestamp in an HTML document.
 *
 * Sources are tried in this order and the first value that `normalizeDateToMs`
 * accepts wins:
 *   1. `datePublished` of JSON-LD nodes (top-level object, top-level array
 *      entries, and the `@graph` array of a top-level object)
 *   2. `<meta property="article:published_time">`
 *   3. `<meta itemprop="datePublished">` / `<time itemprop="datePublished">`
 *   4. the first `<time datetime>` element
 *
 * @param {Function} $ - Cheerio-style query function for the loaded document.
 * @returns {number | undefined} Millisecond timestamp, or `undefined` when no
 *   source yields a valid date.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    let parsed;
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      parsed = JSON.parse(raw);
    } catch {
      // Malformed JSON-LD is ignored; later scripts and the meta-tag fallbacks
      // still get a chance.
      continue;
    }
    const candidates = [];
    if (Array.isArray(parsed)) {
      candidates.push(...parsed);
    } else if (parsed && typeof parsed === "object") {
      candidates.push(parsed);
      if (Array.isArray(parsed["@graph"])) {
        candidates.push(...parsed["@graph"]);
      }
    }
    for (const candidate of candidates) {
      // JSON-LD arrays may contain null or primitive entries. Skip them instead
      // of throwing, so one bad entry cannot hide a valid date in a later one.
      if (candidate === null || typeof candidate !== "object") continue;
      const val = normalizeDateToMs(candidate.datePublished);
      if (val !== void 0) return val;
    }
  }
  // Markup fallbacks are evaluated lazily, in priority order.
  const fallbackReaders = [
    () => $('meta[property="article:published_time"]').attr("content")?.trim(),
    () => $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim(),
    () => $("time[datetime]").first().attr("datetime")?.trim()
  ];
  for (const readDate of fallbackReaders) {
    const text = readDate();
    if (!text) continue;
    const val = normalizeDateToMs(text);
    if (val !== void 0) return val;
  }
  return void 0;
}
19173
20490
  function hasTopLevelNoindexComment(markdown) {
19174
20491
  const lines = markdown.split(/\r?\n/);
19175
20492
  let inFence = false;
@@ -19185,6 +20502,97 @@ function hasTopLevelNoindexComment(markdown) {
19185
20502
  }
19186
20503
  return false;
19187
20504
  }
20505
// Generic single-word alt texts that say nothing about the image.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image", "photo", "picture", "icon",
  "logo", "banner", "screenshot", "thumbnail",
  "img", "graphic", "illustration", "spacer",
  "pixel", "placeholder", "avatar", "background"
]);
// Matches text that ends in an image file extension (optionally with a query string).
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
/**
 * Decides whether an image alt text is descriptive enough to keep.
 *
 * @param {string} alt - Raw alt attribute value.
 * @returns {boolean} `false` when the trimmed text is shorter than five
 *   characters, looks like an image file name, or is one of GARBAGE_ALT_WORDS
 *   (case-insensitive); `true` otherwise.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  return text.length >= 5
    && !IMAGE_EXT_RE.test(text)
    && !GARBAGE_ALT_WORDS.has(text.toLowerCase());
}
20531
/**
 * Picks the best textual stand-in for an image.
 *
 * Priority: the image's own description attribute, then the enclosing
 * `<figure>`'s description attribute, then a meaningful alt text (joined with
 * the figcaption by an em dash when both exist), then the figcaption alone.
 *
 * @param {object} img - Cheerio-style wrapper around the `<img>` element.
 * @param {Function} $ - Query function (not used by this function).
 * @param {string} imageDescAttr - Name of the description attribute to look for.
 * @returns {string | null} The chosen text, or `null` when nothing usable exists.
 */
function resolveImageText(img, $, imageDescAttr) {
  const ownDescription = img.attr(imageDescAttr)?.trim();
  if (ownDescription) return ownDescription;
  const figure = img.closest("figure");
  const insideFigure = Boolean(figure.length);
  if (insideFigure) {
    const figureDescription = figure.attr(imageDescAttr)?.trim();
    if (figureDescription) return figureDescription;
  }
  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  if (isMeaningfulAlt(alt)) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption || null;
}
20552
// Link texts that carry no information about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here", "click", "click here", "read more",
  "link", "this", "more"
]);
/**
 * Normalizes link anchor text.
 *
 * @param {string} raw - Text content of the link.
 * @returns {string} Whitespace-collapsed, trimmed, lower-cased text capped at
 *   100 characters; the empty string when the result is shorter than three
 *   characters or is one of STOP_ANCHORS.
 */
function normalizeAnchorText(raw) {
  const text = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const unusable = text.length < 3 || STOP_ANCHORS.has(text);
  if (unusable) return "";
  return text.length > 100 ? text.slice(0, 100) : text;
}
20568
/**
 * Escapes `&`, `<` and `>` so the text can be embedded as HTML element content.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with the three characters replaced by their entities.
 */
function escapeHtml(text) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;" };
  return text.replace(/[&<>]/g, (ch) => entities[ch]);
}
20571
/**
 * Replaces images under `root2` with their textual description.
 *
 * `<picture>` elements are handled first (using their first `<img>`), then every
 * remaining `<img>`. An element with resolvable text becomes
 * `<span>escaped text</span>` and the figcaption of its enclosing `<figure>` is
 * removed; an element without text is removed.
 *
 * @param {object} root2 - Cheerio-style selection to search within.
 * @param {Function} $ - Query function used to wrap raw elements.
 * @param {string} imageDescAttr - Description attribute passed to `resolveImageText`.
 * @returns {void}
 */
function preprocessImages(root2, $, imageDescAttr) {
  // `target` is the element swapped out; `img` is where the text is read from.
  const swapForText = (target, img) => {
    const enclosingFigure = target.closest("figure");
    const text = img.length ? resolveImageText(img, $, imageDescAttr) : null;
    if (!text) {
      target.remove();
      return;
    }
    if (enclosingFigure.length) enclosingFigure.find("figcaption").remove();
    target.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };
  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    swapForText(picture, picture.find("img").first());
  });
  root2.find("img").each((_i, el) => {
    const img = $(el);
    swapForText(img, img);
  });
}
19188
20596
  function extractFromHtml(url, html, config) {
19189
20597
  const $ = cheerio.load(html);
19190
20598
  const normalizedUrl = normalizeUrlPath(url);
@@ -19210,6 +20618,20 @@ function extractFromHtml(url, html, config) {
19210
20618
  if (weight === 0) {
19211
20619
  return null;
19212
20620
  }
20621
+ if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
20622
+ return null;
20623
+ }
20624
+ const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
20625
+ const meta = {};
20626
+ $('meta[name^="searchsocket:"]').each((_i, el) => {
20627
+ const name = $(el).attr("name") ?? "";
20628
+ const key = name.slice("searchsocket:".length);
20629
+ if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
20630
+ const content = $(el).attr("content") ?? "";
20631
+ const dataType = $(el).attr("data-type") ?? "string";
20632
+ meta[key] = parseMetaValue(content, dataType);
20633
+ });
20634
+ const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
19213
20635
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19214
20636
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19215
20637
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19221,7 +20643,9 @@ function extractFromHtml(url, html, config) {
19221
20643
  root2.find(selector).remove();
19222
20644
  }
19223
20645
  root2.find(`[${config.extract.ignoreAttr}]`).remove();
20646
+ preprocessImages(root2, $, config.extract.imageDescAttr);
19224
20647
  const outgoingLinks = [];
20648
+ const seenLinkKeys = /* @__PURE__ */ new Set();
19225
20649
  root2.find("a[href]").each((_index, node) => {
19226
20650
  const href = $(node).attr("href");
19227
20651
  if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
@@ -19232,7 +20656,19 @@ function extractFromHtml(url, html, config) {
19232
20656
  if (!["http:", "https:"].includes(parsed.protocol)) {
19233
20657
  return;
19234
20658
  }
19235
- outgoingLinks.push(normalizeUrlPath(parsed.pathname));
20659
+ const url2 = normalizeUrlPath(parsed.pathname);
20660
+ let anchorText = normalizeAnchorText($(node).text());
20661
+ if (!anchorText) {
20662
+ const imgAlt = $(node).find("img").first().attr("alt") ?? "";
20663
+ if (isMeaningfulAlt(imgAlt)) {
20664
+ anchorText = normalizeAnchorText(imgAlt);
20665
+ }
20666
+ }
20667
+ const key = `${url2}|${anchorText}`;
20668
+ if (!seenLinkKeys.has(key)) {
20669
+ seenLinkKeys.add(key);
20670
+ outgoingLinks.push({ url: url2, anchorText });
20671
+ }
19236
20672
  } catch {
19237
20673
  }
19238
20674
  });
@@ -19257,16 +20693,25 @@ function extractFromHtml(url, html, config) {
19257
20693
  return null;
19258
20694
  }
19259
20695
  const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
20696
+ const publishedAt = extractPublishedAtFromHtml($);
20697
+ if (componentTags) {
20698
+ const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
20699
+ for (const t of extraTags) {
20700
+ if (!tags.includes(t)) tags.push(t);
20701
+ }
20702
+ }
19260
20703
  return {
19261
20704
  url: normalizeUrlPath(url),
19262
20705
  title,
19263
20706
  markdown,
19264
- outgoingLinks: [...new Set(outgoingLinks)],
20707
+ outgoingLinks,
19265
20708
  noindex: false,
19266
20709
  tags,
19267
20710
  description,
19268
20711
  keywords,
19269
- weight
20712
+ weight,
20713
+ publishedAt,
20714
+ meta: Object.keys(meta).length > 0 ? meta : void 0
19270
20715
  };
19271
20716
  }
19272
20717
  function extractFromMarkdown(url, markdown, title) {
@@ -19287,6 +20732,24 @@ function extractFromMarkdown(url, markdown, title) {
19287
20732
  if (mdWeight === 0) {
19288
20733
  return null;
19289
20734
  }
20735
+ let mdMeta;
20736
+ const rawMeta = searchsocketMeta?.meta;
20737
+ if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
20738
+ const metaObj = {};
20739
+ for (const [key, val] of Object.entries(rawMeta)) {
20740
+ if (!validateMetaKey(key)) continue;
20741
+ if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
20742
+ metaObj[key] = val;
20743
+ } else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
20744
+ metaObj[key] = val;
20745
+ } else if (val instanceof Date) {
20746
+ metaObj[key] = val.getTime();
20747
+ }
20748
+ }
20749
+ if (Object.keys(metaObj).length > 0) {
20750
+ mdMeta = metaObj;
20751
+ }
20752
+ }
19290
20753
  const content = parsed.content;
19291
20754
  const normalized = normalizeMarkdown(content);
19292
20755
  if (!normalizeText(normalized)) {
@@ -19301,6 +20764,7 @@ function extractFromMarkdown(url, markdown, title) {
19301
20764
  fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
19302
20765
  }
19303
20766
  if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
20767
+ const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
19304
20768
  return {
19305
20769
  url: normalizeUrlPath(url),
19306
20770
  title: resolvedTitle,
@@ -19310,7 +20774,9 @@ function extractFromMarkdown(url, markdown, title) {
19310
20774
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19311
20775
  description: fmDescription,
19312
20776
  keywords: fmKeywords,
19313
- weight: mdWeight
20777
+ weight: mdWeight,
20778
+ publishedAt,
20779
+ meta: mdMeta
19314
20780
  };
19315
20781
  }
19316
20782
  function segmentToRegex(segment) {
@@ -19473,7 +20939,7 @@ async function parseManifest(cwd, outputDir) {
19473
20939
  const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
19474
20940
  let content;
19475
20941
  try {
19476
- content = await fs3__default.default.readFile(manifestPath, "utf8");
20942
+ content = await fs9__default.default.readFile(manifestPath, "utf8");
19477
20943
  } catch {
19478
20944
  throw new SearchSocketError(
19479
20945
  "BUILD_MANIFEST_NOT_FOUND",
@@ -19784,6 +21250,125 @@ function filePathToUrl(filePath, baseDir) {
19784
21250
  const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
19785
21251
  return normalizeUrlPath(noExt || "/");
19786
21252
  }
21253
// SvelteKit route files: +page / +layout / +error, optionally followed by an
// "@segment" layout-reset suffix. The segment may be empty ("+page@.svelte"
// resets to the root layout), and the "+" must start the file name so that an
// ordinary component whose name merely contains "+page" is not mistaken for a route.
var ROUTE_FILE_RE = /(?:^|[\\/])\+(page|layout|error)(@[^.]*)?\.svelte$/;
/**
 * Tells whether a path points at a reusable Svelte component rather than a
 * SvelteKit route file.
 *
 * @param {string} filePath - File path or bare file name.
 * @returns {boolean} `true` for `.svelte` files that are not route files.
 */
function isSvelteComponentFile(filePath) {
  if (!filePath.endsWith(".svelte")) return false;
  return !ROUTE_FILE_RE.test(filePath);
}
21258
/**
 * Extracts documentation metadata from Svelte component source text.
 *
 * The description comes from the first `<!-- @component ... -->` comment. Props
 * come from the first `let { ... } = $props()` destructuring: rest elements are
 * skipped, renamed props (`name: alias`) are reported under their external name,
 * and default-value source text is captured verbatim. Types are looked up in a
 * named type/interface declared in the same source or in an inline `{ ... }`
 * annotation.
 *
 * @param {string} source - Full component source.
 * @returns {{ description: string | undefined, props: Array<{ name: string, type?: string, default?: string }> }}
 */
function extractSvelteComponentMeta(source) {
  const docComment = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
  const description = docComment?.[1]?.trim() || void 0;
  const propsDeclaration = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsDeclaration) {
    return { description, props: [] };
  }
  const [, destructured, rawAnnotation] = propsDeclaration;
  const annotation = rawAnnotation?.trim();
  let typeByProp;
  if (annotation) {
    if (/^[A-Z]\w*$/.test(annotation)) {
      // A bare capitalized identifier refers to a type declared in the same source.
      typeByProp = resolveTypeReference(source, annotation);
    } else if (annotation.startsWith("{")) {
      typeByProp = parseInlineTypeAnnotation(annotation);
    }
  }
  const props = [];
  for (const rawEntry of splitDestructureBlock(destructured)) {
    const entry = rawEntry.trim();
    if (entry === "" || entry.startsWith("...")) continue;
    // `name: alias [= default]` is tried first, then `name = default`.
    const matched = entry.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/)
      ?? entry.match(/^(\w+)\s*=\s*([\s\S]+)$/);
    let name;
    let defaultSource;
    if (matched) {
      name = matched[1];
      defaultSource = matched[2]?.trim();
    } else {
      name = entry.match(/^(\w+)/)?.[1] ?? entry;
    }
    const prop = { name };
    const type = typeByProp?.get(name);
    if (type) prop.type = type;
    if (defaultSource) prop.default = defaultSource;
    props.push(prop);
  }
  return { description, props };
}
21303
/**
 * Splits the inside of a destructuring pattern into its top-level entries.
 *
 * Commas nested inside brackets/braces/parentheses or inside string and template
 * literals (for example a default value of `"a, b"`) do not split an entry.
 * Entries are returned untrimmed; a trailing whitespace-only entry is dropped.
 *
 * @param {string} block - Text between the outer `{` and `}` of the pattern.
 * @returns {string[]} The top-level entries in source order.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let quote = null;
  let escaped = false;
  let current = "";
  for (const ch of block) {
    if (quote !== null) {
      current += ch;
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === quote) {
        quote = null;
      } else if (ch === "\n" && quote !== "`") {
        // '…' and "…" literals cannot span lines; resetting here keeps a stray
        // apostrophe (e.g. inside a comment) from swallowing the rest of the block.
        quote = null;
      }
      continue;
    }
    if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
      continue;
    }
    current += ch;
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
    } else if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      // Clamp so an unmatched closer cannot disable splitting for the remainder.
      depth = Math.max(0, depth - 1);
    }
  }
  if (current.trim()) entries.push(current);
  return entries;
}
21324
/**
 * Locates `interface <typeName> [extends …] { … }` or `type <typeName> = { … }`
 * in `source` and parses the members of its body.
 *
 * Only members written directly in the body are returned; members inherited
 * through `extends` are not resolved.
 *
 * @param {string} source - Source text to search.
 * @param {string} typeName - Name of the interface or type alias.
 * @returns {Map<string, string> | undefined} Result of `parseTypeMembers` for the
 *   body, or `undefined` when no declaration is found or its braces are unbalanced.
 */
function resolveTypeReference(source, typeName) {
  // Escape the name so it is always matched literally inside the pattern.
  const escapedName = typeName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const startRe = new RegExp(
    `(?:\\binterface\\s+${escapedName}(?:\\s+extends\\s+[^{]+)?\\s*|\\btype\\s+${escapedName}\\s*=\\s*)\\{`
  );
  const startMatch = source.match(startRe);
  if (!startMatch || startMatch.index === void 0) return void 0;
  const bodyStart = startMatch.index + startMatch[0].length;
  // Walk forward to the brace that closes the declaration's opening brace.
  let depth = 1;
  let i = bodyStart;
  while (i < source.length && depth > 0) {
    if (source[i] === "{") depth++;
    else if (source[i] === "}") depth--;
    i++;
  }
  if (depth !== 0) return void 0;
  const body = source.slice(bodyStart, i - 1);
  return parseTypeMembers(body);
}
21340
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`.
 *
 * @param {string} annotation - Annotation text; a leading `{` and a trailing `}`
 *   are stripped when present.
 * @returns {Map<string, string>} Result of `parseTypeMembers` for the inner text.
 */
function parseInlineTypeAnnotation(annotation) {
  let inner = annotation;
  if (inner.startsWith("{")) inner = inner.slice(1);
  if (inner.endsWith("}")) inner = inner.slice(0, -1);
  return parseTypeMembers(inner);
}
21344
/**
 * Parses the body of an object type into a map of member name -> type text.
 *
 * Members are separated by `;` or a newline, but only at the top nesting level,
 * so a nested object type such as `opts: { a: number; b: string }` stays one
 * member instead of leaking `b` as a separate top-level entry. Comments are
 * removed first so that brackets inside them cannot unbalance the nesting count.
 * A multi-line type is collapsed onto one line and a trailing comma is dropped.
 *
 * @param {string} body - Text between the braces of the type declaration.
 * @returns {Map<string, string>} Member names (without the optional `?`) mapped
 *   to their type text, in declaration order.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();
  // Line comments are only recognized at the start of the text or after
  // whitespace, so "//" inside a literal such as "http://…" is left alone.
  const code = body.replace(/\/\*[\s\S]*?\*\//g, "").replace(/(^|\s)\/\/[^\n]*/g, "$1");
  const members = [];
  let depth = 0;
  let current = "";
  for (const ch of code) {
    if ((ch === ";" || ch === "\n") && depth === 0) {
      members.push(current);
      current = "";
      continue;
    }
    if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      depth = Math.max(0, depth - 1);
    }
    current += ch;
  }
  members.push(current);
  for (const rawMember of members) {
    const member = rawMember.trim();
    if (!member) continue;
    const memberMatch = member.match(/^(\w+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      const typeText = memberMatch[2].replace(/\s*\n\s*/g, " ").replace(/,\s*$/, "").trim();
      map.set(memberMatch[1], typeText);
    }
  }
  return map;
}
21355
/**
 * Renders component metadata as one line of indexable text.
 *
 * @param {string} componentName - Component name used in the leading sentence.
 * @param {{ description?: string, props: Array<{ name: string, type?: string, default?: string }> }} meta
 * @returns {string} `"<name> component."` followed by the description and a
 *   `Props: …` sentence, joined by spaces; the empty string when there is
 *   neither a description nor any prop.
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  if (!description && props.length === 0) return "";
  const sentences = [`${componentName} component.`];
  if (description) {
    sentences.push(description);
  }
  if (props.length > 0) {
    const rendered = [];
    for (const prop of props) {
      const typePart = prop.type ? ` (${prop.type})` : "";
      const defaultPart = prop.default ? ` default: ${prop.default}` : "";
      rendered.push(`${prop.name}${typePart}${defaultPart}`);
    }
    sentences.push(`Props: ${rendered.join(", ")}.`);
  }
  return sentences.join(" ");
}
19787
21372
/**
 * Reduces Svelte source to plain text: `<script>` and `<style>` blocks are
 * dropped, remaining tags and `{…}` expressions become spaces, and whitespace
 * is collapsed.
 *
 * @param {string} source - Svelte component or page source.
 * @returns {string} The trimmed plain-text content.
 */
function normalizeSvelteToMarkdown(source) {
  const withoutScripts = source.replace(/<script[\s\S]*?<\/script>/g, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/g, "");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const withoutExpressions = withoutTags.replace(/\{[^}]+\}/g, " ");
  return withoutExpressions.replace(/\s+/g, " ").trim();
}
@@ -19802,13 +21387,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
19802
21387
  const selected = typeof limit === "number" ? files.slice(0, limit) : files;
19803
21388
  const pages = [];
19804
21389
  for (const filePath of selected) {
19805
- const raw = await fs3__default.default.readFile(filePath, "utf8");
19806
- const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
21390
+ const raw = await fs9__default.default.readFile(filePath, "utf8");
21391
+ let markdown;
21392
+ let tags;
21393
+ if (filePath.endsWith(".md")) {
21394
+ markdown = raw;
21395
+ } else if (isSvelteComponentFile(filePath)) {
21396
+ const componentName = path__default.default.basename(filePath, ".svelte");
21397
+ const meta = extractSvelteComponentMeta(raw);
21398
+ const componentMarkdown = buildComponentMarkdown(componentName, meta);
21399
+ const templateContent = normalizeSvelteToMarkdown(raw);
21400
+ markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
21401
+ tags = ["component"];
21402
+ } else {
21403
+ markdown = normalizeSvelteToMarkdown(raw);
21404
+ }
19807
21405
  pages.push({
19808
21406
  url: filePathToUrl(filePath, baseDir),
19809
21407
  markdown,
19810
21408
  sourcePath: path__default.default.relative(cwd, filePath).replace(/\\/g, "/"),
19811
- outgoingLinks: []
21409
+ outgoingLinks: [],
21410
+ ...tags ? { tags } : {}
19812
21411
  });
19813
21412
  }
19814
21413
  return pages;
@@ -19938,7 +21537,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
19938
21537
  const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
19939
21538
  const pages = [];
19940
21539
  for (const filePath of selected) {
19941
- const html = await fs3__default.default.readFile(filePath, "utf8");
21540
+ const html = await fs9__default.default.readFile(filePath, "utf8");
19942
21541
  pages.push({
19943
21542
  url: staticHtmlFileToUrl(filePath, outputDir),
19944
21543
  html,
@@ -20001,7 +21600,7 @@ function isBlockedByRobots(urlPath, rules3) {
20001
21600
  }
20002
21601
  async function loadRobotsTxtFromDir(dir) {
20003
21602
  try {
20004
- const content = await fs3__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
21603
+ const content = await fs9__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
20005
21604
  return parseRobotsTxt(content);
20006
21605
  } catch {
20007
21606
  return null;
@@ -20018,6 +21617,81 @@ async function fetchRobotsTxt(baseUrl) {
20018
21617
  return null;
20019
21618
  }
20020
21619
  }
21620
/**
 * Resolves a page URL against an optional base URL.
 *
 * @param {string} pageUrl - Page URL or path.
 * @param {string | undefined} baseUrl - Base URL; when falsy `pageUrl` is returned as is.
 * @returns {string} The absolute href, or `pageUrl` unchanged when there is no
 *   base or the combination cannot be parsed as a URL.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  if (baseUrl) {
    try {
      return new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable combination: fall through to the unresolved URL.
    }
  }
  return pageUrl;
}
21628
/**
 * Renders the llms.txt index: a heading, an optional blockquote description and
 * a "Pages" list of links.
 *
 * The `/llms.txt` and `/llms-full.txt` pages are excluded. Pages are ordered by
 * ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Reads `url`, `title`, `description`, `depth`
 *   and `incomingLinks` of each page.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` (title fallback) and `project.baseUrl`.
 * @returns {string} The document text, ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const summary = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;
  const out = [`# ${heading}`];
  if (summary) {
    out.push("", `> ${summary}`);
  }
  // filter() returns a fresh array, so sorting it in place leaves `pages` untouched.
  const listed = pages
    .filter((page) => page.url !== "/llms.txt" && page.url !== "/llms-full.txt")
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));
  if (listed.length > 0) {
    out.push("", "## Pages", "");
    for (const page of listed) {
      const link = `- [${page.title}](${resolvePageUrl(page.url, baseUrl)})`;
      out.push(page.description ? `${link}: ${page.description}` : link);
    }
  }
  out.push("");
  return out.join("\n");
}
21657
/**
 * Renders the llms-full.txt document: a heading, an optional blockquote
 * description, and one `---`-separated section per page containing a linked
 * title and the page's trimmed markdown.
 *
 * The `/llms.txt` and `/llms-full.txt` pages are excluded. Pages are ordered by
 * ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Reads `url`, `title`, `markdown`, `depth` and
 *   `incomingLinks` of each page.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` (title fallback) and `project.baseUrl`.
 * @returns {string} The document text, ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const summary = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;
  const out = [`# ${heading}`];
  if (summary) {
    out.push("", `> ${summary}`);
  }
  // filter() returns a fresh array, so sorting it in place leaves `pages` untouched.
  const included = pages
    .filter((page) => page.url !== "/llms.txt" && page.url !== "/llms-full.txt")
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));
  included.forEach((page) => {
    const href = resolvePageUrl(page.url, baseUrl);
    out.push("", "---", "", `## [${page.title}](${href})`, "", page.markdown.trim());
  });
  out.push("");
  return out.join("\n");
}
21680
/**
 * Writes llms.txt and, when `config.llmsTxt.generateFull` is set, the companion
 * "-full" file next to it.
 *
 * The companion path is derived by inserting "-full" before the file extension
 * (`llms.txt` -> `llms-full.txt`). This also holds for output paths that do not
 * end in ".txt", so the full document can never be written over the index file.
 *
 * @param {Array<object>} pages - Pages passed to the generators.
 * @param {object} config - Reads `llmsTxt.outputPath` and `llmsTxt.generateFull`;
 *   also passed to the generators.
 * @param {string} cwd - Directory that `outputPath` is resolved against.
 * @param {{ info(message: string): void }} logger3 - Receives one info line per written file.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path__default.default.dirname(outputPath);
  await fs9__default.default.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs9__default.default.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (config.llmsTxt.generateFull) {
    const parsedOutput = path__default.default.parse(outputPath);
    const fullPath = path__default.default.join(
      parsedOutput.dir,
      `${parsedOutput.name}-full${parsedOutput.ext}`
    );
    const fullContent = generateLlmsFullTxt(pages, config);
    await fs9__default.default.writeFile(fullPath, fullContent, "utf8");
    const relativeFull = path__default.default.relative(cwd, fullPath);
    logger3.info(`Generated llms-full.txt at ${relativeFull}`);
  }
}
20021
21695
 
20022
21696
  // src/indexing/pipeline.ts
20023
21697
  function buildPageSummary(page, maxChars = 3500) {
@@ -20036,16 +21710,33 @@ function buildPageSummary(page, maxChars = 3500) {
20036
21710
  if (joined.length <= maxChars) return joined;
20037
21711
  return joined.slice(0, maxChars).trim();
20038
21712
  }
21713
/**
 * Computes a hash over the page fields listed below, so a change in any of them
 * yields a different hash.
 *
 * Hashed fields, joined by "|": title, description, sorted keywords, sorted
 * tags, markdown, `String(outgoingLinks)`, publishedAt, incomingAnchorText,
 * sorted outgoingLinkUrls, and `meta` serialized with sorted top-level keys.
 * List fields are sorted on a copy, so their original order does not matter.
 *
 * @param {object} page - Page record; `tags` must be an array, the other list
 *   fields may be missing.
 * @returns {string} Result of `sha256` over the joined fields.
 */
function buildPageContentHash(page) {
  const sortedCsv = (values) => [...values].sort().join(",");
  const metaJson = page.meta
    ? JSON.stringify(page.meta, Object.keys(page.meta).sort())
    : "";
  const fingerprint = [
    page.title,
    page.description ?? "",
    sortedCsv(page.keywords ?? []),
    sortedCsv(page.tags),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    sortedCsv(page.outgoingLinkUrls ?? []),
    metaJson
  ].join("|");
  return sha256(fingerprint);
}
20039
21728
  var IndexPipeline = class _IndexPipeline {
20040
21729
  cwd;
20041
21730
  config;
20042
21731
  store;
20043
21732
  logger;
21733
+ hooks;
20044
21734
  constructor(options) {
20045
21735
  this.cwd = options.cwd;
20046
21736
  this.config = options.config;
20047
21737
  this.store = options.store;
20048
21738
  this.logger = options.logger;
21739
+ this.hooks = options.hooks;
20049
21740
  }
20050
21741
  static async create(options = {}) {
20051
21742
  const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
@@ -20055,7 +21746,8 @@ var IndexPipeline = class _IndexPipeline {
20055
21746
  cwd,
20056
21747
  config,
20057
21748
  store,
20058
- logger: options.logger ?? new Logger()
21749
+ logger: options.logger ?? new Logger(),
21750
+ hooks: options.hooks ?? {}
20059
21751
  });
20060
21752
  }
20061
21753
  getConfig() {
@@ -20076,7 +21768,7 @@ var IndexPipeline = class _IndexPipeline {
20076
21768
  const scope = resolveScope(this.config, options.scopeOverride);
20077
21769
  ensureStateDirs(this.cwd, this.config.state.dir);
20078
21770
  const sourceMode = options.sourceOverride ?? this.config.source.mode;
20079
- this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
21771
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
20080
21772
  if (options.force) {
20081
21773
  this.logger.info("Force mode enabled \u2014 full rebuild");
20082
21774
  }
@@ -20085,8 +21777,9 @@ var IndexPipeline = class _IndexPipeline {
20085
21777
  }
20086
21778
  const manifestStart = stageStart();
20087
21779
  const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
21780
+ const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
20088
21781
  stageEnd("manifest", manifestStart);
20089
- this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
21782
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes, ${existingPageHashes.size} existing page hashes loaded`);
20090
21783
  const sourceStart = stageStart();
20091
21784
  this.logger.info(`Loading pages (source: ${sourceMode})...`);
20092
21785
  let sourcePages;
@@ -20163,11 +21856,61 @@ var IndexPipeline = class _IndexPipeline {
20163
21856
  );
20164
21857
  continue;
20165
21858
  }
20166
- extractedPages.push(extracted);
21859
+ if (sourcePage.tags && sourcePage.tags.length > 0) {
21860
+ extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
21861
+ }
21862
+ let accepted;
21863
+ if (this.hooks.transformPage) {
21864
+ const transformed = await this.hooks.transformPage(extracted);
21865
+ if (transformed === null) {
21866
+ this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
21867
+ continue;
21868
+ }
21869
+ accepted = transformed;
21870
+ } else {
21871
+ accepted = extracted;
21872
+ }
21873
+ extractedPages.push(accepted);
20167
21874
  this.logger.event("page_extracted", {
20168
- url: extracted.url
21875
+ url: accepted.url
20169
21876
  });
20170
21877
  }
21878
+ const customRecords = options.customRecords ?? [];
21879
+ if (customRecords.length > 0) {
21880
+ this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
21881
+ for (const record of customRecords) {
21882
+ const normalizedUrl = normalizeUrlPath(record.url);
21883
+ const normalized = normalizeMarkdown(record.content);
21884
+ if (!normalized.trim()) {
21885
+ this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
21886
+ continue;
21887
+ }
21888
+ const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
21889
+ const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
21890
+ const extracted = {
21891
+ url: normalizedUrl,
21892
+ title: record.title,
21893
+ markdown: normalized,
21894
+ outgoingLinks: [],
21895
+ noindex: false,
21896
+ tags,
21897
+ weight: record.weight
21898
+ };
21899
+ let accepted;
21900
+ if (this.hooks.transformPage) {
21901
+ const transformed = await this.hooks.transformPage(extracted);
21902
+ if (transformed === null) {
21903
+ this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
21904
+ continue;
21905
+ }
21906
+ accepted = transformed;
21907
+ } else {
21908
+ accepted = extracted;
21909
+ }
21910
+ extractedPages.push(accepted);
21911
+ this.logger.event("page_extracted", { url: accepted.url, custom: true });
21912
+ }
21913
+ }
20171
21914
  extractedPages.sort((a, b) => a.url.localeCompare(b.url));
20172
21915
  const uniquePages = [];
20173
21916
  const seenUrls = /* @__PURE__ */ new Set();
@@ -20200,15 +21943,28 @@ var IndexPipeline = class _IndexPipeline {
20200
21943
  const linkStart = stageStart();
20201
21944
  const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20202
21945
  const incomingLinkCount = /* @__PURE__ */ new Map();
21946
+ const incomingAnchorTexts = /* @__PURE__ */ new Map();
20203
21947
  for (const page of indexablePages) {
20204
21948
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20205
21949
  }
20206
21950
  for (const page of indexablePages) {
20207
- for (const outgoing of page.outgoingLinks) {
21951
+ const seenForCount = /* @__PURE__ */ new Set();
21952
+ const seenForAnchor = /* @__PURE__ */ new Set();
21953
+ for (const { url: outgoing, anchorText } of page.outgoingLinks) {
20208
21954
  if (!pageSet.has(outgoing)) {
20209
21955
  continue;
20210
21956
  }
20211
- incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
21957
+ if (!seenForCount.has(outgoing)) {
21958
+ seenForCount.add(outgoing);
21959
+ incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
21960
+ }
21961
+ if (anchorText && !seenForAnchor.has(outgoing)) {
21962
+ seenForAnchor.add(outgoing);
21963
+ if (!incomingAnchorTexts.has(outgoing)) {
21964
+ incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
21965
+ }
21966
+ incomingAnchorTexts.get(outgoing).add(anchorText);
21967
+ }
20212
21968
  }
20213
21969
  }
20214
21970
  stageEnd("links", linkStart);
@@ -20227,6 +21983,15 @@ var IndexPipeline = class _IndexPipeline {
20227
21983
  });
20228
21984
  }
20229
21985
  }
21986
+ for (const record of customRecords) {
21987
+ const normalizedUrl = normalizeUrlPath(record.url);
21988
+ if (!precomputedRoutes.has(normalizedUrl)) {
21989
+ precomputedRoutes.set(normalizedUrl, {
21990
+ routeFile: "",
21991
+ routeResolution: "exact"
21992
+ });
21993
+ }
21994
+ }
20230
21995
  for (const page of indexablePages) {
20231
21996
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20232
21997
  if (routeMatch.routeResolution === "best-effort") {
@@ -20244,6 +22009,17 @@ var IndexPipeline = class _IndexPipeline {
20244
22009
  } else {
20245
22010
  routeExact += 1;
20246
22011
  }
22012
+ const anchorSet = incomingAnchorTexts.get(page.url);
22013
+ let incomingAnchorText;
22014
+ if (anchorSet && anchorSet.size > 0) {
22015
+ let joined = "";
22016
+ for (const phrase of anchorSet) {
22017
+ const next2 = joined ? `${joined} ${phrase}` : phrase;
22018
+ if (next2.length > 500) break;
22019
+ joined = next2;
22020
+ }
22021
+ incomingAnchorText = joined || void 0;
22022
+ }
20247
22023
  const indexedPage = {
20248
22024
  url: page.url,
20249
22025
  title: page.title,
@@ -20253,40 +22029,113 @@ var IndexPipeline = class _IndexPipeline {
20253
22029
  generatedAt: nowIso(),
20254
22030
  incomingLinks: incomingLinkCount.get(page.url) ?? 0,
20255
22031
  outgoingLinks: page.outgoingLinks.length,
22032
+ outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
20256
22033
  depth: getUrlDepth(page.url),
20257
22034
  tags: page.tags,
20258
22035
  markdown: page.markdown,
20259
22036
  description: page.description,
20260
- keywords: page.keywords
22037
+ keywords: page.keywords,
22038
+ publishedAt: page.publishedAt,
22039
+ incomingAnchorText,
22040
+ meta: page.meta
20261
22041
  };
20262
22042
  pages.push(indexedPage);
20263
22043
  this.logger.event("page_indexed", { url: page.url });
20264
22044
  }
22045
+ const pageRecords = pages.map((p) => {
22046
+ const summary = buildPageSummary(p);
22047
+ return {
22048
+ url: p.url,
22049
+ title: p.title,
22050
+ markdown: p.markdown,
22051
+ projectId: scope.projectId,
22052
+ scopeName: scope.scopeName,
22053
+ routeFile: p.routeFile,
22054
+ routeResolution: p.routeResolution,
22055
+ incomingLinks: p.incomingLinks,
22056
+ outgoingLinks: p.outgoingLinks,
22057
+ outgoingLinkUrls: p.outgoingLinkUrls,
22058
+ depth: p.depth,
22059
+ tags: p.tags,
22060
+ indexedAt: p.generatedAt,
22061
+ summary,
22062
+ description: p.description,
22063
+ keywords: p.keywords,
22064
+ contentHash: buildPageContentHash(p),
22065
+ publishedAt: p.publishedAt,
22066
+ meta: p.meta
22067
+ };
22068
+ });
22069
+ const currentPageUrls = new Set(pageRecords.map((r) => r.url));
22070
+ const changedPages = pageRecords.filter(
22071
+ (r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
22072
+ );
22073
+ const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
20265
22074
  if (!options.dryRun) {
20266
- const pageRecords = pages.map((p) => {
20267
- const summary = buildPageSummary(p);
20268
- return {
20269
- url: p.url,
20270
- title: p.title,
20271
- markdown: p.markdown,
20272
- projectId: scope.projectId,
20273
- scopeName: scope.scopeName,
20274
- routeFile: p.routeFile,
20275
- routeResolution: p.routeResolution,
20276
- incomingLinks: p.incomingLinks,
20277
- outgoingLinks: p.outgoingLinks,
20278
- depth: p.depth,
20279
- tags: p.tags,
20280
- indexedAt: p.generatedAt,
20281
- summary,
20282
- description: p.description,
20283
- keywords: p.keywords
20284
- };
20285
- });
20286
- await this.store.deletePages(scope);
20287
- await this.store.upsertPages(pageRecords, scope);
22075
+ if (options.force) {
22076
+ await this.store.deletePages(scope);
22077
+ this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
22078
+ const pageDocs = pageRecords.map((r) => ({
22079
+ id: r.url,
22080
+ data: r.summary ?? r.title,
22081
+ metadata: {
22082
+ title: r.title,
22083
+ url: r.url,
22084
+ description: r.description ?? "",
22085
+ keywords: r.keywords ?? [],
22086
+ summary: r.summary ?? "",
22087
+ tags: r.tags,
22088
+ markdown: r.markdown,
22089
+ routeFile: r.routeFile,
22090
+ routeResolution: r.routeResolution,
22091
+ incomingLinks: r.incomingLinks,
22092
+ outgoingLinks: r.outgoingLinks,
22093
+ outgoingLinkUrls: r.outgoingLinkUrls ?? [],
22094
+ depth: r.depth,
22095
+ indexedAt: r.indexedAt,
22096
+ contentHash: r.contentHash ?? "",
22097
+ publishedAt: r.publishedAt ?? null,
22098
+ ...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
22099
+ }
22100
+ }));
22101
+ await this.store.upsertPages(pageDocs, scope);
22102
+ } else {
22103
+ if (changedPages.length > 0) {
22104
+ this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
22105
+ const pageDocs = changedPages.map((r) => ({
22106
+ id: r.url,
22107
+ data: r.summary ?? r.title,
22108
+ metadata: {
22109
+ title: r.title,
22110
+ url: r.url,
22111
+ description: r.description ?? "",
22112
+ keywords: r.keywords ?? [],
22113
+ summary: r.summary ?? "",
22114
+ tags: r.tags,
22115
+ markdown: r.markdown,
22116
+ routeFile: r.routeFile,
22117
+ routeResolution: r.routeResolution,
22118
+ incomingLinks: r.incomingLinks,
22119
+ outgoingLinks: r.outgoingLinks,
22120
+ outgoingLinkUrls: r.outgoingLinkUrls ?? [],
22121
+ depth: r.depth,
22122
+ indexedAt: r.indexedAt,
22123
+ contentHash: r.contentHash ?? "",
22124
+ publishedAt: r.publishedAt ?? null,
22125
+ ...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
22126
+ }
22127
+ }));
22128
+ await this.store.upsertPages(pageDocs, scope);
22129
+ }
22130
+ if (deletedPageUrls.length > 0) {
22131
+ await this.store.deletePagesByIds(deletedPageUrls, scope);
22132
+ }
22133
+ }
20288
22134
  }
22135
+ const pagesChanged = options.force ? pageRecords.length : changedPages.length;
22136
+ const pagesDeleted = deletedPageUrls.length;
20289
22137
  stageEnd("pages", pagesStart);
22138
+ this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
20290
22139
  this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
20291
22140
  const chunkStart = stageStart();
20292
22141
  this.logger.info("Chunking pages...");
@@ -20295,6 +22144,18 @@ var IndexPipeline = class _IndexPipeline {
20295
22144
  if (typeof maxChunks === "number") {
20296
22145
  chunks = chunks.slice(0, maxChunks);
20297
22146
  }
22147
+ if (this.hooks.transformChunk) {
22148
+ const transformed = [];
22149
+ for (const chunk of chunks) {
22150
+ const result = await this.hooks.transformChunk(chunk);
22151
+ if (result === null) {
22152
+ this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
22153
+ continue;
22154
+ }
22155
+ transformed.push(result);
22156
+ }
22157
+ chunks = transformed;
22158
+ }
20298
22159
  for (const chunk of chunks) {
20299
22160
  this.logger.event("chunked", {
20300
22161
  url: chunk.url,
@@ -20307,7 +22168,7 @@ var IndexPipeline = class _IndexPipeline {
20307
22168
  for (const chunk of chunks) {
20308
22169
  currentChunkMap.set(chunk.chunkKey, chunk);
20309
22170
  }
20310
- const changedChunks = chunks.filter((chunk) => {
22171
+ let changedChunks = chunks.filter((chunk) => {
20311
22172
  if (options.force) {
20312
22173
  return true;
20313
22174
  }
@@ -20321,36 +22182,43 @@ var IndexPipeline = class _IndexPipeline {
20321
22182
  return existingHash !== chunk.contentHash;
20322
22183
  });
20323
22184
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
22185
+ if (this.hooks.beforeIndex) {
22186
+ changedChunks = await this.hooks.beforeIndex(changedChunks);
22187
+ }
20324
22188
  this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
20325
22189
  const upsertStart = stageStart();
20326
22190
  let documentsUpserted = 0;
20327
22191
  if (!options.dryRun && changedChunks.length > 0) {
20328
- this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
20329
- const UPSTASH_CONTENT_LIMIT = 4096;
22192
+ this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
20330
22193
  const docs = changedChunks.map((chunk) => {
20331
- const title = chunk.title;
20332
- const sectionTitle = chunk.sectionTitle ?? "";
20333
- const url = chunk.url;
20334
- const tags = chunk.tags.join(",");
20335
- const headingPath = chunk.headingPath.join(" > ");
20336
- const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
20337
- const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
20338
- const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
22194
+ const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
22195
+ if (embeddingText.length > 2e3) {
22196
+ this.logger.warn(
22197
+ `Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
22198
+ );
22199
+ }
20339
22200
  return {
20340
22201
  id: chunk.chunkKey,
20341
- content: { title, sectionTitle, text, url, tags, headingPath },
22202
+ data: embeddingText,
20342
22203
  metadata: {
20343
- projectId: scope.projectId,
20344
- scopeName: scope.scopeName,
22204
+ url: chunk.url,
20345
22205
  path: chunk.path,
22206
+ title: chunk.title,
22207
+ sectionTitle: chunk.sectionTitle ?? "",
22208
+ headingPath: chunk.headingPath.join(" > "),
20346
22209
  snippet: chunk.snippet,
22210
+ chunkText: embeddingText,
22211
+ tags: chunk.tags,
20347
22212
  ordinal: chunk.ordinal,
20348
22213
  contentHash: chunk.contentHash,
20349
22214
  depth: chunk.depth,
20350
22215
  incomingLinks: chunk.incomingLinks,
20351
22216
  routeFile: chunk.routeFile,
20352
22217
  description: chunk.description ?? "",
20353
- keywords: (chunk.keywords ?? []).join(",")
22218
+ keywords: chunk.keywords ?? [],
22219
+ publishedAt: chunk.publishedAt ?? null,
22220
+ incomingAnchorText: chunk.incomingAnchorText ?? "",
22221
+ ...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
20354
22222
  }
20355
22223
  };
20356
22224
  });
@@ -20368,9 +22236,16 @@ var IndexPipeline = class _IndexPipeline {
20368
22236
  } else {
20369
22237
  this.logger.info("No chunks to upsert \u2014 all up to date");
20370
22238
  }
22239
+ if (this.config.llmsTxt.enable && !options.dryRun) {
22240
+ const llmsStart = stageStart();
22241
+ await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
22242
+ stageEnd("llms_txt", llmsStart);
22243
+ }
20371
22244
  this.logger.info("Done.");
20372
- return {
22245
+ const stats = {
20373
22246
  pagesProcessed: pages.length,
22247
+ pagesChanged,
22248
+ pagesDeleted,
20374
22249
  chunksTotal: chunks.length,
20375
22250
  chunksChanged: changedChunks.length,
20376
22251
  documentsUpserted,
@@ -20379,6 +22254,10 @@ var IndexPipeline = class _IndexPipeline {
20379
22254
  routeBestEffort,
20380
22255
  stageTimingsMs
20381
22256
  };
22257
+ if (this.hooks.afterIndex) {
22258
+ await this.hooks.afterIndex(stats);
22259
+ }
22260
+ return stats;
20382
22261
  }
20383
22262
  };
20384
22263
 
@@ -20400,9 +22279,6 @@ function shouldRunAutoIndex(options) {
20400
22279
  if (explicit && /^(1|true|yes)$/i.test(explicit)) {
20401
22280
  return true;
20402
22281
  }
20403
- if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
20404
- return true;
20405
- }
20406
22282
  return false;
20407
22283
  }
20408
22284
  function searchsocketVitePlugin(options = {}) {
@@ -20427,7 +22303,8 @@ function searchsocketVitePlugin(options = {}) {
20427
22303
  const pipeline = await IndexPipeline.create({
20428
22304
  cwd,
20429
22305
  configPath: options.configPath,
20430
- logger: logger3
22306
+ logger: logger3,
22307
+ hooks: options.hooks
20431
22308
  });
20432
22309
  const stats = await pipeline.run({
20433
22310
  changedOnly: options.changedOnly ?? true,