searchsocket 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/sveltekit.js CHANGED
@@ -1,14 +1,20 @@
1
- import fs from 'fs';
1
+ import { timingSafeEqual, createHash } from 'crypto';
2
+ import fs9 from 'fs/promises';
2
3
  import path from 'path';
4
+ import { WebStandardStreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js';
5
+ import fs from 'fs';
3
6
  import { createJiti } from 'jiti';
4
7
  import { z } from 'zod';
8
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
9
+ import '@modelcontextprotocol/sdk/server/stdio.js';
10
+ import '@modelcontextprotocol/sdk/server/streamableHttp.js';
11
+ import '@modelcontextprotocol/sdk/server/express.js';
5
12
  import { execSync, spawn } from 'child_process';
6
- import { createHash } from 'crypto';
13
+ import { FusionAlgorithm, QueryMode } from '@upstash/vector';
7
14
  import { load } from 'cheerio';
8
15
  import matter from 'gray-matter';
9
16
  import fg from 'fast-glob';
10
17
  import pLimit from 'p-limit';
11
- import fs3 from 'fs/promises';
12
18
  import net from 'net';
13
19
  import { gunzipSync } from 'zlib';
14
20
 
@@ -5009,32 +5015,32 @@ var require_URL = __commonJS({
5009
5015
  else
5010
5016
  return basepath.substring(0, lastslash + 1) + refpath;
5011
5017
  }
5012
- function remove_dot_segments(path13) {
5013
- if (!path13) return path13;
5018
+ function remove_dot_segments(path14) {
5019
+ if (!path14) return path14;
5014
5020
  var output = "";
5015
- while (path13.length > 0) {
5016
- if (path13 === "." || path13 === "..") {
5017
- path13 = "";
5021
+ while (path14.length > 0) {
5022
+ if (path14 === "." || path14 === "..") {
5023
+ path14 = "";
5018
5024
  break;
5019
5025
  }
5020
- var twochars = path13.substring(0, 2);
5021
- var threechars = path13.substring(0, 3);
5022
- var fourchars = path13.substring(0, 4);
5026
+ var twochars = path14.substring(0, 2);
5027
+ var threechars = path14.substring(0, 3);
5028
+ var fourchars = path14.substring(0, 4);
5023
5029
  if (threechars === "../") {
5024
- path13 = path13.substring(3);
5030
+ path14 = path14.substring(3);
5025
5031
  } else if (twochars === "./") {
5026
- path13 = path13.substring(2);
5032
+ path14 = path14.substring(2);
5027
5033
  } else if (threechars === "/./") {
5028
- path13 = "/" + path13.substring(3);
5029
- } else if (twochars === "/." && path13.length === 2) {
5030
- path13 = "/";
5031
- } else if (fourchars === "/../" || threechars === "/.." && path13.length === 3) {
5032
- path13 = "/" + path13.substring(4);
5034
+ path14 = "/" + path14.substring(3);
5035
+ } else if (twochars === "/." && path14.length === 2) {
5036
+ path14 = "/";
5037
+ } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5038
+ path14 = "/" + path14.substring(4);
5033
5039
  output = output.replace(/\/?[^\/]*$/, "");
5034
5040
  } else {
5035
- var segment = path13.match(/(\/?([^\/]*))/)[0];
5041
+ var segment = path14.match(/(\/?([^\/]*))/)[0];
5036
5042
  output += segment;
5037
- path13 = path13.substring(segment.length);
5043
+ path14 = path14.substring(segment.length);
5038
5044
  }
5039
5045
  }
5040
5046
  return output;
@@ -16630,6 +16636,7 @@ var searchSocketConfigSchema = z.object({
16630
16636
  dropSelectors: z.array(z.string()).optional(),
16631
16637
  ignoreAttr: z.string().optional(),
16632
16638
  noindexAttr: z.string().optional(),
16639
+ imageDescAttr: z.string().optional(),
16633
16640
  respectRobotsNoindex: z.boolean().optional()
16634
16641
  }).optional(),
16635
16642
  transform: z.object({
@@ -16645,35 +16652,48 @@ var searchSocketConfigSchema = z.object({
16645
16652
  headingPathDepth: z.number().int().positive().optional(),
16646
16653
  dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
16647
16654
  prependTitle: z.boolean().optional(),
16648
- pageSummaryChunk: z.boolean().optional()
16655
+ pageSummaryChunk: z.boolean().optional(),
16656
+ weightHeadings: z.boolean().optional()
16649
16657
  }).optional(),
16650
16658
  upstash: z.object({
16651
16659
  url: z.string().url().optional(),
16652
16660
  token: z.string().min(1).optional(),
16653
16661
  urlEnv: z.string().min(1).optional(),
16654
- tokenEnv: z.string().min(1).optional()
16662
+ tokenEnv: z.string().min(1).optional(),
16663
+ namespaces: z.object({
16664
+ pages: z.string().min(1).optional(),
16665
+ chunks: z.string().min(1).optional()
16666
+ }).optional()
16667
+ }).optional(),
16668
+ embedding: z.object({
16669
+ model: z.string().optional(),
16670
+ dimensions: z.number().int().positive().optional(),
16671
+ taskType: z.string().optional(),
16672
+ batchSize: z.number().int().positive().optional()
16655
16673
  }).optional(),
16656
16674
  search: z.object({
16657
- semanticWeight: z.number().min(0).max(1).optional(),
16658
- inputEnrichment: z.boolean().optional(),
16659
- reranking: z.boolean().optional(),
16660
16675
  dualSearch: z.boolean().optional(),
16661
16676
  pageSearchWeight: z.number().min(0).max(1).optional()
16662
16677
  }).optional(),
16663
16678
  ranking: z.object({
16664
16679
  enableIncomingLinkBoost: z.boolean().optional(),
16665
16680
  enableDepthBoost: z.boolean().optional(),
16681
+ enableFreshnessBoost: z.boolean().optional(),
16682
+ freshnessDecayRate: z.number().positive().optional(),
16683
+ enableAnchorTextBoost: z.boolean().optional(),
16666
16684
  pageWeights: z.record(z.string(), z.number().min(0)).optional(),
16667
16685
  aggregationCap: z.number().int().positive().optional(),
16668
16686
  aggregationDecay: z.number().min(0).max(1).optional(),
16669
16687
  minChunkScoreRatio: z.number().min(0).max(1).optional(),
16670
- minScore: z.number().min(0).max(1).optional(),
16688
+ minScoreRatio: z.number().min(0).max(1).optional(),
16671
16689
  scoreGapThreshold: z.number().min(0).max(1).optional(),
16672
16690
  weights: z.object({
16673
16691
  incomingLinks: z.number().optional(),
16674
16692
  depth: z.number().optional(),
16675
16693
  aggregation: z.number().optional(),
16676
- titleMatch: z.number().optional()
16694
+ titleMatch: z.number().optional(),
16695
+ freshness: z.number().optional(),
16696
+ anchorText: z.number().optional()
16677
16697
  }).optional()
16678
16698
  }).optional(),
16679
16699
  api: z.object({
@@ -16688,12 +16708,28 @@ var searchSocketConfigSchema = z.object({
16688
16708
  }).optional(),
16689
16709
  mcp: z.object({
16690
16710
  enable: z.boolean().optional(),
16711
+ access: z.enum(["public", "private"]).optional(),
16691
16712
  transport: z.enum(["stdio", "http"]).optional(),
16692
16713
  http: z.object({
16693
16714
  port: z.number().int().positive().optional(),
16694
- path: z.string().optional()
16715
+ path: z.string().optional(),
16716
+ apiKey: z.string().min(1).optional(),
16717
+ apiKeyEnv: z.string().min(1).optional()
16718
+ }).optional(),
16719
+ handle: z.object({
16720
+ path: z.string().optional(),
16721
+ apiKey: z.string().min(1).optional(),
16722
+ enableJsonResponse: z.boolean().optional()
16695
16723
  }).optional()
16696
16724
  }).optional(),
16725
+ llmsTxt: z.object({
16726
+ enable: z.boolean().optional(),
16727
+ outputPath: z.string().optional(),
16728
+ title: z.string().optional(),
16729
+ description: z.string().optional(),
16730
+ generateFull: z.boolean().optional(),
16731
+ serveMarkdownVariants: z.boolean().optional()
16732
+ }).optional(),
16697
16733
  state: z.object({
16698
16734
  dir: z.string().optional()
16699
16735
  }).optional()
@@ -16732,6 +16768,7 @@ function createDefaultConfig(projectId) {
16732
16768
  dropSelectors: DEFAULT_DROP_SELECTORS,
16733
16769
  ignoreAttr: "data-search-ignore",
16734
16770
  noindexAttr: "data-search-noindex",
16771
+ imageDescAttr: "data-search-description",
16735
16772
  respectRobotsNoindex: true
16736
16773
  },
16737
16774
  transform: {
@@ -16741,39 +16778,52 @@ function createDefaultConfig(projectId) {
16741
16778
  },
16742
16779
  chunking: {
16743
16780
  strategy: "hybrid",
16744
- maxChars: 2200,
16781
+ maxChars: 1500,
16745
16782
  overlapChars: 200,
16746
16783
  minChars: 250,
16747
16784
  headingPathDepth: 3,
16748
16785
  dontSplitInside: ["code", "table", "blockquote"],
16749
16786
  prependTitle: true,
16750
- pageSummaryChunk: true
16787
+ pageSummaryChunk: true,
16788
+ weightHeadings: true
16751
16789
  },
16752
16790
  upstash: {
16753
- urlEnv: "UPSTASH_SEARCH_REST_URL",
16754
- tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
16791
+ urlEnv: "UPSTASH_VECTOR_REST_URL",
16792
+ tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
16793
+ namespaces: {
16794
+ pages: "pages",
16795
+ chunks: "chunks"
16796
+ }
16797
+ },
16798
+ embedding: {
16799
+ model: "bge-large-en-v1.5",
16800
+ dimensions: 1024,
16801
+ taskType: "RETRIEVAL_DOCUMENT",
16802
+ batchSize: 100
16755
16803
  },
16756
16804
  search: {
16757
- semanticWeight: 0.75,
16758
- inputEnrichment: true,
16759
- reranking: true,
16760
16805
  dualSearch: true,
16761
16806
  pageSearchWeight: 0.3
16762
16807
  },
16763
16808
  ranking: {
16764
16809
  enableIncomingLinkBoost: true,
16765
16810
  enableDepthBoost: true,
16811
+ enableFreshnessBoost: false,
16812
+ freshnessDecayRate: 1e-3,
16813
+ enableAnchorTextBoost: false,
16766
16814
  pageWeights: {},
16767
16815
  aggregationCap: 5,
16768
16816
  aggregationDecay: 0.5,
16769
16817
  minChunkScoreRatio: 0.5,
16770
- minScore: 0.3,
16818
+ minScoreRatio: 0.7,
16771
16819
  scoreGapThreshold: 0.4,
16772
16820
  weights: {
16773
16821
  incomingLinks: 0.05,
16774
16822
  depth: 0.03,
16775
16823
  aggregation: 0.1,
16776
- titleMatch: 0.15
16824
+ titleMatch: 0.15,
16825
+ freshness: 0.1,
16826
+ anchorText: 0.1
16777
16827
  }
16778
16828
  },
16779
16829
  api: {
@@ -16784,12 +16834,23 @@ function createDefaultConfig(projectId) {
16784
16834
  },
16785
16835
  mcp: {
16786
16836
  enable: process.env.NODE_ENV !== "production",
16837
+ access: "private",
16787
16838
  transport: "stdio",
16788
16839
  http: {
16789
16840
  port: 3338,
16790
16841
  path: "/mcp"
16842
+ },
16843
+ handle: {
16844
+ path: "/api/mcp",
16845
+ enableJsonResponse: true
16791
16846
  }
16792
16847
  },
16848
+ llmsTxt: {
16849
+ enable: false,
16850
+ outputPath: "static/llms.txt",
16851
+ generateFull: true,
16852
+ serveMarkdownVariants: false
16853
+ },
16793
16854
  state: {
16794
16855
  dir: ".searchsocket"
16795
16856
  }
@@ -16917,7 +16978,15 @@ ${issues}`
16917
16978
  },
16918
16979
  upstash: {
16919
16980
  ...defaults.upstash,
16920
- ...parsed.upstash
16981
+ ...parsed.upstash,
16982
+ namespaces: {
16983
+ ...defaults.upstash.namespaces,
16984
+ ...parsed.upstash?.namespaces
16985
+ }
16986
+ },
16987
+ embedding: {
16988
+ ...defaults.embedding,
16989
+ ...parsed.embedding
16921
16990
  },
16922
16991
  search: {
16923
16992
  ...defaults.search,
@@ -16954,8 +17023,16 @@ ${issues}`
16954
17023
  http: {
16955
17024
  ...defaults.mcp.http,
16956
17025
  ...parsed.mcp?.http
17026
+ },
17027
+ handle: {
17028
+ ...defaults.mcp.handle,
17029
+ ...parsed.mcp?.handle
16957
17030
  }
16958
17031
  },
17032
+ llmsTxt: {
17033
+ ...defaults.llmsTxt,
17034
+ ...parsed.llmsTxt
17035
+ },
16959
17036
  state: {
16960
17037
  ...defaults.state,
16961
17038
  ...parsed.state
@@ -16975,6 +17052,15 @@ ${issues}`
16975
17052
  maxDepth: 10
16976
17053
  };
16977
17054
  }
17055
+ if (merged.mcp.access === "public") {
17056
+ const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
17057
+ if (!resolvedKey) {
17058
+ throw new SearchSocketError(
17059
+ "CONFIG_MISSING",
17060
+ '`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
17061
+ );
17062
+ }
17063
+ }
16978
17064
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
16979
17065
  throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
16980
17066
  }
@@ -17023,13 +17109,84 @@ function normalizeMarkdown(input) {
17023
17109
  function sanitizeScopeName(scopeName) {
17024
17110
  return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
17025
17111
  }
17112
+ function markdownToPlain(markdown) {
17113
+ return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
17114
+ }
17026
17115
  function toSnippet(markdown, maxLen = 220) {
17027
- const plain = markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
17116
+ const plain = markdownToPlain(markdown);
17028
17117
  if (plain.length <= maxLen) {
17029
17118
  return plain;
17030
17119
  }
17031
17120
  return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
17032
17121
  }
17122
+ function queryAwareExcerpt(markdown, query, maxLen = 220) {
17123
+ const plain = markdownToPlain(markdown);
17124
+ if (plain.length <= maxLen) return plain;
17125
+ const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
17126
+ if (tokens.length === 0) return toSnippet(markdown, maxLen);
17127
+ const positions = [];
17128
+ for (let ti = 0; ti < tokens.length; ti++) {
17129
+ const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
17130
+ const re = new RegExp(escaped, "gi");
17131
+ let m;
17132
+ while ((m = re.exec(plain)) !== null) {
17133
+ positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
17134
+ }
17135
+ }
17136
+ if (positions.length === 0) return toSnippet(markdown, maxLen);
17137
+ positions.sort((a, b) => a.start - b.start);
17138
+ let bestUniqueCount = 0;
17139
+ let bestTotalCount = 0;
17140
+ let bestLeft = 0;
17141
+ let bestRight = 0;
17142
+ let left = 0;
17143
+ const tokenCounts = /* @__PURE__ */ new Map();
17144
+ for (let right = 0; right < positions.length; right++) {
17145
+ tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
17146
+ while (positions[right].end - positions[left].start > maxLen && left < right) {
17147
+ const leftToken = positions[left].tokenIdx;
17148
+ const cnt = tokenCounts.get(leftToken) - 1;
17149
+ if (cnt === 0) tokenCounts.delete(leftToken);
17150
+ else tokenCounts.set(leftToken, cnt);
17151
+ left++;
17152
+ }
17153
+ const uniqueCount = tokenCounts.size;
17154
+ const totalCount = right - left + 1;
17155
+ if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
17156
+ bestUniqueCount = uniqueCount;
17157
+ bestTotalCount = totalCount;
17158
+ bestLeft = left;
17159
+ bestRight = right;
17160
+ }
17161
+ }
17162
+ const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
17163
+ let start = Math.max(0, mid - Math.floor(maxLen / 2));
17164
+ let end = Math.min(plain.length, start + maxLen);
17165
+ start = Math.max(0, end - maxLen);
17166
+ if (start > 0) {
17167
+ const spaceIdx = plain.lastIndexOf(" ", start);
17168
+ if (spaceIdx > start - 30) {
17169
+ start = spaceIdx + 1;
17170
+ }
17171
+ }
17172
+ if (end < plain.length) {
17173
+ const spaceIdx = plain.indexOf(" ", end);
17174
+ if (spaceIdx !== -1 && spaceIdx < end + 30) {
17175
+ end = spaceIdx;
17176
+ }
17177
+ }
17178
+ let excerpt = plain.slice(start, end);
17179
+ if (excerpt.length > Math.ceil(maxLen * 1.2)) {
17180
+ excerpt = excerpt.slice(0, maxLen);
17181
+ const lastSpace = excerpt.lastIndexOf(" ");
17182
+ if (lastSpace > maxLen * 0.5) {
17183
+ excerpt = excerpt.slice(0, lastSpace);
17184
+ }
17185
+ }
17186
+ const prefix = start > 0 ? "\u2026" : "";
17187
+ const suffix = end < plain.length ? "\u2026" : "";
17188
+ return `${prefix}${excerpt}${suffix}`;
17189
+ }
17033
17190
  function extractFirstParagraph(markdown) {
17034
17191
  const lines = markdown.split("\n");
17035
17192
  let inFence = false;
@@ -17136,162 +17293,288 @@ function joinUrl(baseUrl, route) {
17136
17293
  const routePart = ensureLeadingSlash(route);
17137
17294
  return `${base}${routePart}`;
17138
17295
  }
17139
-
17140
- // src/vector/upstash.ts
17141
- function chunkIndexName(scope) {
17142
- return `${scope.projectId}--${scope.scopeName}`;
17143
- }
17144
- function pageIndexName(scope) {
17145
- return `${scope.projectId}--${scope.scopeName}--pages`;
17146
- }
17147
17296
  var UpstashSearchStore = class {
17148
- client;
17297
+ index;
17298
+ pagesNs;
17299
+ chunksNs;
17149
17300
  constructor(opts) {
17150
- this.client = opts.client;
17151
- }
17152
- chunkIndex(scope) {
17153
- return this.client.index(chunkIndexName(scope));
17154
- }
17155
- pageIndex(scope) {
17156
- return this.client.index(pageIndexName(scope));
17301
+ this.index = opts.index;
17302
+ this.pagesNs = opts.index.namespace(opts.pagesNamespace);
17303
+ this.chunksNs = opts.index.namespace(opts.chunksNamespace);
17157
17304
  }
17158
17305
  async upsertChunks(chunks, scope) {
17159
17306
  if (chunks.length === 0) return;
17160
- const index = this.chunkIndex(scope);
17161
17307
  const BATCH_SIZE = 100;
17162
17308
  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
17163
17309
  const batch = chunks.slice(i, i + BATCH_SIZE);
17164
- await index.upsert(batch);
17165
- }
17166
- }
17167
- async search(query, opts, scope) {
17168
- const index = this.chunkIndex(scope);
17169
- const results = await index.search({
17170
- query,
17171
- limit: opts.limit,
17172
- semanticWeight: opts.semanticWeight,
17173
- inputEnrichment: opts.inputEnrichment,
17174
- reranking: opts.reranking,
17175
- filter: opts.filter
17310
+ await this.chunksNs.upsert(
17311
+ batch.map((c) => ({
17312
+ id: c.id,
17313
+ data: c.data,
17314
+ metadata: {
17315
+ ...c.metadata,
17316
+ projectId: scope.projectId,
17317
+ scopeName: scope.scopeName,
17318
+ type: c.metadata.type || "chunk"
17319
+ }
17320
+ }))
17321
+ );
17322
+ }
17323
+ }
17324
+ async search(data, opts, scope) {
17325
+ const filterParts = [
17326
+ `projectId = '${scope.projectId}'`,
17327
+ `scopeName = '${scope.scopeName}'`
17328
+ ];
17329
+ if (opts.filter) {
17330
+ filterParts.push(opts.filter);
17331
+ }
17332
+ const results = await this.chunksNs.query({
17333
+ data,
17334
+ topK: opts.limit,
17335
+ includeMetadata: true,
17336
+ filter: filterParts.join(" AND "),
17337
+ queryMode: QueryMode.HYBRID,
17338
+ fusionAlgorithm: FusionAlgorithm.DBSF
17339
+ });
17340
+ return results.map((doc) => ({
17341
+ id: String(doc.id),
17342
+ score: doc.score,
17343
+ metadata: {
17344
+ projectId: doc.metadata?.projectId ?? "",
17345
+ scopeName: doc.metadata?.scopeName ?? "",
17346
+ url: doc.metadata?.url ?? "",
17347
+ path: doc.metadata?.path ?? "",
17348
+ title: doc.metadata?.title ?? "",
17349
+ sectionTitle: doc.metadata?.sectionTitle ?? "",
17350
+ headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
17351
+ snippet: doc.metadata?.snippet ?? "",
17352
+ chunkText: doc.metadata?.chunkText ?? "",
17353
+ ordinal: doc.metadata?.ordinal ?? 0,
17354
+ contentHash: doc.metadata?.contentHash ?? "",
17355
+ depth: doc.metadata?.depth ?? 0,
17356
+ incomingLinks: doc.metadata?.incomingLinks ?? 0,
17357
+ routeFile: doc.metadata?.routeFile ?? "",
17358
+ tags: doc.metadata?.tags ?? [],
17359
+ description: doc.metadata?.description || void 0,
17360
+ keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
17361
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
17362
+ incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
17363
+ }
17364
+ }));
17365
+ }
17366
+ async searchChunksByUrl(data, url, opts, scope) {
17367
+ const filterParts = [
17368
+ `projectId = '${scope.projectId}'`,
17369
+ `scopeName = '${scope.scopeName}'`,
17370
+ `url = '${url}'`
17371
+ ];
17372
+ if (opts.filter) {
17373
+ filterParts.push(opts.filter);
17374
+ }
17375
+ const results = await this.chunksNs.query({
17376
+ data,
17377
+ topK: opts.limit,
17378
+ includeMetadata: true,
17379
+ filter: filterParts.join(" AND "),
17380
+ queryMode: QueryMode.HYBRID,
17381
+ fusionAlgorithm: FusionAlgorithm.DBSF
17176
17382
  });
17177
17383
  return results.map((doc) => ({
17178
- id: doc.id,
17384
+ id: String(doc.id),
17179
17385
  score: doc.score,
17180
17386
  metadata: {
17181
17387
  projectId: doc.metadata?.projectId ?? "",
17182
17388
  scopeName: doc.metadata?.scopeName ?? "",
17183
- url: doc.content.url,
17389
+ url: doc.metadata?.url ?? "",
17184
17390
  path: doc.metadata?.path ?? "",
17185
- title: doc.content.title,
17186
- sectionTitle: doc.content.sectionTitle,
17187
- headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
17391
+ title: doc.metadata?.title ?? "",
17392
+ sectionTitle: doc.metadata?.sectionTitle ?? "",
17393
+ headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
17188
17394
  snippet: doc.metadata?.snippet ?? "",
17189
- chunkText: doc.content.text,
17395
+ chunkText: doc.metadata?.chunkText ?? "",
17190
17396
  ordinal: doc.metadata?.ordinal ?? 0,
17191
17397
  contentHash: doc.metadata?.contentHash ?? "",
17192
17398
  depth: doc.metadata?.depth ?? 0,
17193
17399
  incomingLinks: doc.metadata?.incomingLinks ?? 0,
17194
17400
  routeFile: doc.metadata?.routeFile ?? "",
17195
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17401
+ tags: doc.metadata?.tags ?? [],
17196
17402
  description: doc.metadata?.description || void 0,
17197
- keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
17403
+ keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
17404
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
17405
+ incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
17198
17406
  }
17199
17407
  }));
17200
17408
  }
17201
- async searchPages(query, opts, scope) {
17202
- const index = this.pageIndex(scope);
17409
+ async searchPagesByText(data, opts, scope) {
17410
+ return this.queryPages({ data }, opts, scope);
17411
+ }
17412
+ async searchPagesByVector(vector, opts, scope) {
17413
+ return this.queryPages({ vector }, opts, scope);
17414
+ }
17415
+ async queryPages(input, opts, scope) {
17416
+ const filterParts = [
17417
+ `projectId = '${scope.projectId}'`,
17418
+ `scopeName = '${scope.scopeName}'`
17419
+ ];
17420
+ if (opts.filter) {
17421
+ filterParts.push(opts.filter);
17422
+ }
17203
17423
  let results;
17204
17424
  try {
17205
- results = await index.search({
17206
- query,
17207
- limit: opts.limit,
17208
- semanticWeight: opts.semanticWeight,
17209
- inputEnrichment: opts.inputEnrichment,
17210
- reranking: true,
17211
- filter: opts.filter
17425
+ results = await this.pagesNs.query({
17426
+ ...input,
17427
+ topK: opts.limit,
17428
+ includeMetadata: true,
17429
+ filter: filterParts.join(" AND "),
17430
+ queryMode: QueryMode.HYBRID,
17431
+ fusionAlgorithm: FusionAlgorithm.DBSF
17212
17432
  });
17213
17433
  } catch {
17214
17434
  return [];
17215
17435
  }
17216
17436
  return results.map((doc) => ({
17217
- id: doc.id,
17437
+ id: String(doc.id),
17218
17438
  score: doc.score,
17219
- title: doc.content.title,
17220
- url: doc.content.url,
17221
- description: doc.content.description ?? "",
17222
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17439
+ title: doc.metadata?.title ?? "",
17440
+ url: doc.metadata?.url ?? "",
17441
+ description: doc.metadata?.description ?? "",
17442
+ tags: doc.metadata?.tags ?? [],
17223
17443
  depth: doc.metadata?.depth ?? 0,
17224
17444
  incomingLinks: doc.metadata?.incomingLinks ?? 0,
17225
- routeFile: doc.metadata?.routeFile ?? ""
17445
+ routeFile: doc.metadata?.routeFile ?? "",
17446
+ publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
17226
17447
  }));
17227
17448
  }
17228
- async deleteByIds(ids, scope) {
17449
+ async deleteByIds(ids, _scope) {
17229
17450
  if (ids.length === 0) return;
17230
- const index = this.chunkIndex(scope);
17231
- const BATCH_SIZE = 500;
17451
+ const BATCH_SIZE = 100;
17232
17452
  for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17233
17453
  const batch = ids.slice(i, i + BATCH_SIZE);
17234
- await index.delete(batch);
17454
+ await this.chunksNs.delete(batch);
17235
17455
  }
17236
17456
  }
17237
17457
  async deleteScope(scope) {
17238
- try {
17239
- const chunkIdx = this.chunkIndex(scope);
17240
- await chunkIdx.deleteIndex();
17241
- } catch {
17242
- }
17243
- try {
17244
- const pageIdx = this.pageIndex(scope);
17245
- await pageIdx.deleteIndex();
17246
- } catch {
17458
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17459
+ const ids = [];
17460
+ let cursor = "0";
17461
+ try {
17462
+ for (; ; ) {
17463
+ const result = await ns.range({
17464
+ cursor,
17465
+ limit: 100,
17466
+ includeMetadata: true
17467
+ });
17468
+ for (const doc of result.vectors) {
17469
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17470
+ ids.push(String(doc.id));
17471
+ }
17472
+ }
17473
+ if (!result.nextCursor || result.nextCursor === "0") break;
17474
+ cursor = result.nextCursor;
17475
+ }
17476
+ } catch {
17477
+ }
17478
+ if (ids.length > 0) {
17479
+ const BATCH_SIZE = 100;
17480
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17481
+ const batch = ids.slice(i, i + BATCH_SIZE);
17482
+ await ns.delete(batch);
17483
+ }
17484
+ }
17247
17485
  }
17248
17486
  }
17249
17487
  async listScopes(projectId) {
17250
- const allIndexes = await this.client.listIndexes();
17251
- const prefix = `${projectId}--`;
17252
- const scopeNames = /* @__PURE__ */ new Set();
17253
- for (const name of allIndexes) {
17254
- if (name.startsWith(prefix) && !name.endsWith("--pages")) {
17255
- const scopeName = name.slice(prefix.length);
17256
- scopeNames.add(scopeName);
17257
- }
17258
- }
17259
- const scopes = [];
17260
- for (const scopeName of scopeNames) {
17261
- const scope = {
17262
- projectId,
17263
- scopeName,
17264
- scopeId: `${projectId}:${scopeName}`
17265
- };
17488
+ const scopeMap = /* @__PURE__ */ new Map();
17489
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17490
+ let cursor = "0";
17266
17491
  try {
17267
- const info = await this.chunkIndex(scope).info();
17268
- scopes.push({
17269
- projectId,
17270
- scopeName,
17271
- lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
17272
- documentCount: info.documentCount
17273
- });
17492
+ for (; ; ) {
17493
+ const result = await ns.range({
17494
+ cursor,
17495
+ limit: 100,
17496
+ includeMetadata: true
17497
+ });
17498
+ for (const doc of result.vectors) {
17499
+ if (doc.metadata?.projectId === projectId) {
17500
+ const scopeName = doc.metadata.scopeName ?? "";
17501
+ scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
17502
+ }
17503
+ }
17504
+ if (!result.nextCursor || result.nextCursor === "0") break;
17505
+ cursor = result.nextCursor;
17506
+ }
17274
17507
  } catch {
17275
- scopes.push({
17276
- projectId,
17277
- scopeName,
17278
- lastIndexedAt: "unknown",
17279
- documentCount: 0
17280
- });
17281
17508
  }
17282
17509
  }
17283
- return scopes;
17510
+ return [...scopeMap.entries()].map(([scopeName, count]) => ({
17511
+ projectId,
17512
+ scopeName,
17513
+ lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
17514
+ documentCount: count
17515
+ }));
17284
17516
  }
17285
17517
  async getContentHashes(scope) {
17286
17518
  const map = /* @__PURE__ */ new Map();
17287
- const index = this.chunkIndex(scope);
17288
17519
  let cursor = "0";
17289
17520
  try {
17290
17521
  for (; ; ) {
17291
- const result = await index.range({ cursor, limit: 100 });
17292
- for (const doc of result.documents) {
17293
- if (doc.metadata?.contentHash) {
17294
- map.set(doc.id, doc.metadata.contentHash);
17522
+ const result = await this.chunksNs.range({
17523
+ cursor,
17524
+ limit: 100,
17525
+ includeMetadata: true
17526
+ });
17527
+ for (const doc of result.vectors) {
17528
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17529
+ map.set(String(doc.id), doc.metadata.contentHash);
17530
+ }
17531
+ }
17532
+ if (!result.nextCursor || result.nextCursor === "0") break;
17533
+ cursor = result.nextCursor;
17534
+ }
17535
+ } catch {
17536
+ }
17537
+ return map;
17538
+ }
17539
+ async listPages(scope, opts) {
17540
+ const cursor = opts?.cursor ?? "0";
17541
+ const limit = opts?.limit ?? 50;
17542
+ try {
17543
+ const result = await this.pagesNs.range({
17544
+ cursor,
17545
+ limit,
17546
+ includeMetadata: true
17547
+ });
17548
+ const pages = result.vectors.filter(
17549
+ (doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
17550
+ ).map((doc) => ({
17551
+ url: doc.metadata?.url ?? "",
17552
+ title: doc.metadata?.title ?? "",
17553
+ description: doc.metadata?.description ?? "",
17554
+ routeFile: doc.metadata?.routeFile ?? ""
17555
+ }));
17556
+ const response = { pages };
17557
+ if (result.nextCursor && result.nextCursor !== "0") {
17558
+ response.nextCursor = result.nextCursor;
17559
+ }
17560
+ return response;
17561
+ } catch {
17562
+ return { pages: [] };
17563
+ }
17564
+ }
17565
+ async getPageHashes(scope) {
17566
+ const map = /* @__PURE__ */ new Map();
17567
+ let cursor = "0";
17568
+ try {
17569
+ for (; ; ) {
17570
+ const result = await this.pagesNs.range({
17571
+ cursor,
17572
+ limit: 100,
17573
+ includeMetadata: true
17574
+ });
17575
+ for (const doc of result.vectors) {
17576
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
17577
+ map.set(String(doc.id), doc.metadata.contentHash);
17295
17578
  }
17296
17579
  }
17297
17580
  if (!result.nextCursor || result.nextCursor === "0") break;
@@ -17301,47 +17584,43 @@ var UpstashSearchStore = class {
17301
17584
  }
17302
17585
  return map;
17303
17586
  }
17587
+ async deletePagesByIds(ids, _scope) {
17588
+ if (ids.length === 0) return;
17589
+ const BATCH_SIZE = 50;
17590
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17591
+ const batch = ids.slice(i, i + BATCH_SIZE);
17592
+ await this.pagesNs.delete(batch);
17593
+ }
17594
+ }
17304
17595
  async upsertPages(pages, scope) {
17305
17596
  if (pages.length === 0) return;
17306
- const index = this.pageIndex(scope);
17307
17597
  const BATCH_SIZE = 50;
17308
17598
  for (let i = 0; i < pages.length; i += BATCH_SIZE) {
17309
17599
  const batch = pages.slice(i, i + BATCH_SIZE);
17310
- const docs = batch.map((p) => ({
17311
- id: p.url,
17312
- content: {
17313
- title: p.title,
17314
- url: p.url,
17315
- type: "page",
17316
- description: p.description ?? "",
17317
- keywords: (p.keywords ?? []).join(","),
17318
- summary: p.summary ?? "",
17319
- tags: p.tags.join(",")
17320
- },
17321
- metadata: {
17322
- markdown: p.markdown,
17323
- projectId: p.projectId,
17324
- scopeName: p.scopeName,
17325
- routeFile: p.routeFile,
17326
- routeResolution: p.routeResolution,
17327
- incomingLinks: p.incomingLinks,
17328
- outgoingLinks: p.outgoingLinks,
17329
- depth: p.depth,
17330
- indexedAt: p.indexedAt
17331
- }
17332
- }));
17333
- await index.upsert(docs);
17600
+ await this.pagesNs.upsert(
17601
+ batch.map((p) => ({
17602
+ id: p.id,
17603
+ data: p.data,
17604
+ metadata: {
17605
+ ...p.metadata,
17606
+ projectId: scope.projectId,
17607
+ scopeName: scope.scopeName,
17608
+ type: "page"
17609
+ }
17610
+ }))
17611
+ );
17334
17612
  }
17335
17613
  }
17336
17614
  async getPage(url, scope) {
17337
- const index = this.pageIndex(scope);
17338
17615
  try {
17339
- const results = await index.fetch([url]);
17616
+ const results = await this.pagesNs.fetch([url], {
17617
+ includeMetadata: true
17618
+ });
17340
17619
  const doc = results[0];
17341
- if (!doc) return null;
17620
+ if (!doc || !doc.metadata) return null;
17342
17621
  return {
17343
- url: doc.content.url,
17344
- title: doc.content.title,
17622
+ url: doc.metadata.url,
17623
+ title: doc.metadata.title,
17345
17624
  markdown: doc.metadata.markdown,
17346
17625
  projectId: doc.metadata.projectId,
17347
17626
  scopeName: doc.metadata.scopeName,
@@ -17349,27 +17628,86 @@ var UpstashSearchStore = class {
17349
17628
  routeResolution: doc.metadata.routeResolution,
17350
17629
  incomingLinks: doc.metadata.incomingLinks,
17351
17630
  outgoingLinks: doc.metadata.outgoingLinks,
17631
+ outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
17352
17632
  depth: doc.metadata.depth,
17353
- tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
17633
+ tags: doc.metadata.tags ?? [],
17354
17634
  indexedAt: doc.metadata.indexedAt,
17355
- summary: doc.content.summary || void 0,
17356
- description: doc.content.description || void 0,
17357
- keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
17635
+ summary: doc.metadata.summary || void 0,
17636
+ description: doc.metadata.description || void 0,
17637
+ keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
17638
+ publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
17358
17639
  };
17359
17640
  } catch {
17360
17641
  return null;
17361
17642
  }
17362
17643
  }
17644
+ async fetchPageWithVector(url, scope) {
17645
+ try {
17646
+ const results = await this.pagesNs.fetch([url], {
17647
+ includeMetadata: true,
17648
+ includeVectors: true
17649
+ });
17650
+ const doc = results[0];
17651
+ if (!doc || !doc.metadata || !doc.vector) return null;
17652
+ if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
17653
+ return null;
17654
+ }
17655
+ return { metadata: doc.metadata, vector: doc.vector };
17656
+ } catch {
17657
+ return null;
17658
+ }
17659
+ }
17660
+ async fetchPagesBatch(urls, scope) {
17661
+ if (urls.length === 0) return [];
17662
+ try {
17663
+ const results = await this.pagesNs.fetch(urls, {
17664
+ includeMetadata: true
17665
+ });
17666
+ const out = [];
17667
+ for (const doc of results) {
17668
+ if (!doc || !doc.metadata) continue;
17669
+ if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
17670
+ continue;
17671
+ }
17672
+ out.push({
17673
+ url: doc.metadata.url,
17674
+ title: doc.metadata.title,
17675
+ routeFile: doc.metadata.routeFile,
17676
+ outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
17677
+ });
17678
+ }
17679
+ return out;
17680
+ } catch {
17681
+ return [];
17682
+ }
17683
+ }
17363
17684
  async deletePages(scope) {
17685
+ const ids = [];
17686
+ let cursor = "0";
17364
17687
  try {
17365
- const index = this.pageIndex(scope);
17366
- await index.reset();
17688
+ for (; ; ) {
17689
+ const result = await this.pagesNs.range({
17690
+ cursor,
17691
+ limit: 100,
17692
+ includeMetadata: true
17693
+ });
17694
+ for (const doc of result.vectors) {
17695
+ if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
17696
+ ids.push(String(doc.id));
17697
+ }
17698
+ }
17699
+ if (!result.nextCursor || result.nextCursor === "0") break;
17700
+ cursor = result.nextCursor;
17701
+ }
17367
17702
  } catch {
17368
17703
  }
17704
+ if (ids.length > 0) {
17705
+ await this.deletePagesByIds(ids, scope);
17706
+ }
17369
17707
  }
17370
17708
  async health() {
17371
17709
  try {
17372
- await this.client.info();
17710
+ await this.index.info();
17373
17711
  return { ok: true };
17374
17712
  } catch (error) {
17375
17713
  return {
@@ -17379,14 +17717,31 @@ var UpstashSearchStore = class {
17379
17717
  }
17380
17718
  }
17381
17719
  async dropAllIndexes(projectId) {
17382
- const allIndexes = await this.client.listIndexes();
17383
- const prefix = `${projectId}--`;
17384
- for (const name of allIndexes) {
17385
- if (name.startsWith(prefix)) {
17386
- try {
17387
- const index = this.client.index(name);
17388
- await index.deleteIndex();
17389
- } catch {
17720
+ for (const ns of [this.chunksNs, this.pagesNs]) {
17721
+ const ids = [];
17722
+ let cursor = "0";
17723
+ try {
17724
+ for (; ; ) {
17725
+ const result = await ns.range({
17726
+ cursor,
17727
+ limit: 100,
17728
+ includeMetadata: true
17729
+ });
17730
+ for (const doc of result.vectors) {
17731
+ if (doc.metadata?.projectId === projectId) {
17732
+ ids.push(String(doc.id));
17733
+ }
17734
+ }
17735
+ if (!result.nextCursor || result.nextCursor === "0") break;
17736
+ cursor = result.nextCursor;
17737
+ }
17738
+ } catch {
17739
+ }
17740
+ if (ids.length > 0) {
17741
+ const BATCH_SIZE = 100;
17742
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
17743
+ const batch = ids.slice(i, i + BATCH_SIZE);
17744
+ await ns.delete(batch);
17390
17745
  }
17391
17746
  }
17392
17747
  }
@@ -17400,12 +17755,16 @@ async function createUpstashStore(config) {
17400
17755
  if (!url || !token) {
17401
17756
  throw new SearchSocketError(
17402
17757
  "VECTOR_BACKEND_UNAVAILABLE",
17403
- `Missing Upstash Search credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
17758
+ `Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
17404
17759
  );
17405
17760
  }
17406
- const { Search } = await import('@upstash/search');
17407
- const client = new Search({ url, token });
17408
- return new UpstashSearchStore({ client });
17761
+ const { Index } = await import('@upstash/vector');
17762
+ const index = new Index({ url, token });
17763
+ return new UpstashSearchStore({
17764
+ index,
17765
+ pagesNamespace: config.upstash.namespaces.pages,
17766
+ chunksNamespace: config.upstash.namespaces.chunks
17767
+ });
17409
17768
  }
17410
17769
 
17411
17770
  // src/utils/pattern.ts
@@ -17448,29 +17807,65 @@ function nonNegativeOrZero(value) {
17448
17807
  function normalizeForTitleMatch(text) {
17449
17808
  return text.toLowerCase().replace(/[^a-z0-9\s]/g, "").replace(/\s+/g, " ").trim();
17450
17809
  }
17451
- function rankHits(hits, config, query) {
17810
+ function rankHits(hits, config, query, debug) {
17452
17811
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
17453
17812
  const titleMatchWeight = config.ranking.weights.titleMatch;
17454
17813
  return hits.map((hit) => {
17455
- let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17814
+ const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17815
+ let score = baseScore;
17816
+ let incomingLinkBoostValue = 0;
17456
17817
  if (config.ranking.enableIncomingLinkBoost) {
17457
17818
  const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
17458
- score += incomingBoost * config.ranking.weights.incomingLinks;
17819
+ incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
17820
+ score += incomingLinkBoostValue;
17459
17821
  }
17822
+ let depthBoostValue = 0;
17460
17823
  if (config.ranking.enableDepthBoost) {
17461
17824
  const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
17462
- score += depthBoost * config.ranking.weights.depth;
17825
+ depthBoostValue = depthBoost * config.ranking.weights.depth;
17826
+ score += depthBoostValue;
17463
17827
  }
17828
+ let titleMatchBoostValue = 0;
17464
17829
  if (normalizedQuery && titleMatchWeight > 0) {
17465
17830
  const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
17466
17831
  if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
17467
- score += titleMatchWeight;
17832
+ titleMatchBoostValue = titleMatchWeight;
17833
+ score += titleMatchBoostValue;
17468
17834
  }
17469
17835
  }
17470
- return {
17836
+ let freshnessBoostValue = 0;
17837
+ if (config.ranking.enableFreshnessBoost) {
17838
+ const publishedAt = hit.metadata.publishedAt;
17839
+ if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
17840
+ const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
17841
+ const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
17842
+ freshnessBoostValue = decay * config.ranking.weights.freshness;
17843
+ score += freshnessBoostValue;
17844
+ }
17845
+ }
17846
+ let anchorTextMatchBoostValue = 0;
17847
+ if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
17848
+ const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
17849
+ if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
17850
+ anchorTextMatchBoostValue = config.ranking.weights.anchorText;
17851
+ score += anchorTextMatchBoostValue;
17852
+ }
17853
+ }
17854
+ const result = {
17471
17855
  hit,
17472
17856
  finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
17473
17857
  };
17858
+ if (debug) {
17859
+ result.breakdown = {
17860
+ baseScore,
17861
+ incomingLinkBoost: incomingLinkBoostValue,
17862
+ depthBoost: depthBoostValue,
17863
+ titleMatchBoost: titleMatchBoostValue,
17864
+ freshnessBoost: freshnessBoostValue,
17865
+ anchorTextMatchBoost: anchorTextMatchBoostValue
17866
+ };
17867
+ }
17868
+ return result;
17474
17869
  }).sort((a, b) => {
17475
17870
  const delta = b.finalScore - a.finalScore;
17476
17871
  return Number.isNaN(delta) ? 0 : delta;
@@ -17479,12 +17874,13 @@ function rankHits(hits, config, query) {
17479
17874
  function trimByScoreGap(results, config) {
17480
17875
  if (results.length === 0) return results;
17481
17876
  const threshold = config.ranking.scoreGapThreshold;
17482
- const minScore = config.ranking.minScore;
17483
- if (minScore > 0 && results.length > 0) {
17484
- const sortedScores = results.map((r) => r.pageScore).sort((a, b) => a - b);
17485
- const mid = Math.floor(sortedScores.length / 2);
17486
- const median = sortedScores.length % 2 === 0 ? (sortedScores[mid - 1] + sortedScores[mid]) / 2 : sortedScores[mid];
17487
- if (median < minScore) return [];
17877
+ const minScoreRatio = config.ranking.minScoreRatio;
17878
+ if (minScoreRatio > 0 && results.length > 0) {
17879
+ const topScore = results[0].pageScore;
17880
+ if (Number.isFinite(topScore) && topScore > 0) {
17881
+ const minThreshold = topScore * minScoreRatio;
17882
+ results = results.filter((r) => r.pageScore >= minThreshold);
17883
+ }
17488
17884
  }
17489
17885
  if (threshold > 0 && results.length > 1) {
17490
17886
  for (let i = 1; i < results.length; i++) {
@@ -17554,79 +17950,280 @@ function aggregateByPage(ranked, config) {
17554
17950
  return Number.isNaN(delta) ? 0 : delta;
17555
17951
  });
17556
17952
  }
17557
- function mergePageAndChunkResults(pageHits, rankedChunks, config) {
17558
- if (pageHits.length === 0) return rankedChunks;
17559
- const w = config.search.pageSearchWeight;
17560
- const pageScoreMap = /* @__PURE__ */ new Map();
17561
- for (const ph of pageHits) {
17562
- pageScoreMap.set(ph.url, ph);
17563
- }
17564
- const pagesWithChunks = /* @__PURE__ */ new Set();
17565
- const merged = rankedChunks.map((ranked) => {
17566
- const url = ranked.hit.metadata.url;
17567
- const pageHit = pageScoreMap.get(url);
17568
- if (pageHit) {
17569
- pagesWithChunks.add(url);
17570
- const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
17571
- return {
17572
- hit: ranked.hit,
17573
- finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
17574
- };
17953
+ function rankPageHits(pageHits, config, query, debug) {
17954
+ const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
17955
+ const titleMatchWeight = config.ranking.weights.titleMatch;
17956
+ return pageHits.map((hit) => {
17957
+ const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
17958
+ let score = baseScore;
17959
+ let incomingLinkBoostValue = 0;
17960
+ if (config.ranking.enableIncomingLinkBoost) {
17961
+ const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
17962
+ incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
17963
+ score += incomingLinkBoostValue;
17575
17964
  }
17576
- return ranked;
17577
- });
17578
- for (const [url, pageHit] of pageScoreMap) {
17579
- if (pagesWithChunks.has(url)) continue;
17580
- const syntheticScore = pageHit.score * w;
17581
- const syntheticHit = {
17582
- id: `page:${url}`,
17583
- score: pageHit.score,
17584
- metadata: {
17585
- projectId: "",
17586
- scopeName: "",
17587
- url: pageHit.url,
17588
- path: pageHit.url,
17589
- title: pageHit.title,
17590
- sectionTitle: "",
17591
- headingPath: [],
17592
- snippet: pageHit.description || pageHit.title,
17593
- chunkText: pageHit.description || pageHit.title,
17594
- ordinal: 0,
17595
- contentHash: "",
17596
- depth: pageHit.depth,
17597
- incomingLinks: pageHit.incomingLinks,
17598
- routeFile: pageHit.routeFile,
17599
- tags: pageHit.tags
17965
+ let depthBoostValue = 0;
17966
+ if (config.ranking.enableDepthBoost) {
17967
+ const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
17968
+ depthBoostValue = depthBoost * config.ranking.weights.depth;
17969
+ score += depthBoostValue;
17970
+ }
17971
+ let titleMatchBoostValue = 0;
17972
+ if (normalizedQuery && titleMatchWeight > 0) {
17973
+ const normalizedTitle = normalizeForTitleMatch(hit.title);
17974
+ if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
17975
+ titleMatchBoostValue = titleMatchWeight;
17976
+ score += titleMatchBoostValue;
17977
+ }
17978
+ }
17979
+ let freshnessBoostValue = 0;
17980
+ if (config.ranking.enableFreshnessBoost) {
17981
+ const publishedAt = hit.publishedAt;
17982
+ if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
17983
+ const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
17984
+ const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
17985
+ freshnessBoostValue = decay * config.ranking.weights.freshness;
17986
+ score += freshnessBoostValue;
17600
17987
  }
17988
+ }
17989
+ const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
17990
+ if (pageWeight !== 1) {
17991
+ score *= pageWeight;
17992
+ }
17993
+ const result = {
17994
+ url: hit.url,
17995
+ title: hit.title,
17996
+ description: hit.description,
17997
+ routeFile: hit.routeFile,
17998
+ depth: hit.depth,
17999
+ incomingLinks: hit.incomingLinks,
18000
+ tags: hit.tags,
18001
+ baseScore,
18002
+ finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
18003
+ publishedAt: hit.publishedAt
17601
18004
  };
17602
- merged.push({
17603
- hit: syntheticHit,
17604
- finalScore: Number.isFinite(syntheticScore) ? syntheticScore : 0
17605
- });
17606
- }
17607
- return merged.sort((a, b) => {
18005
+ if (debug) {
18006
+ result.breakdown = {
18007
+ baseScore,
18008
+ pageWeight,
18009
+ incomingLinkBoost: incomingLinkBoostValue,
18010
+ depthBoost: depthBoostValue,
18011
+ titleMatchBoost: titleMatchBoostValue,
18012
+ freshnessBoost: freshnessBoostValue
18013
+ };
18014
+ }
18015
+ return result;
18016
+ }).filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0).sort((a, b) => {
17608
18017
  const delta = b.finalScore - a.finalScore;
17609
18018
  return Number.isNaN(delta) ? 0 : delta;
17610
18019
  });
17611
18020
  }
18021
+ function trimPagesByScoreGap(results, config) {
18022
+ if (results.length === 0) return results;
18023
+ const threshold = config.ranking.scoreGapThreshold;
18024
+ const minScoreRatio = config.ranking.minScoreRatio;
18025
+ if (minScoreRatio > 0 && results.length > 0) {
18026
+ const topScore = results[0].finalScore;
18027
+ if (Number.isFinite(topScore) && topScore > 0) {
18028
+ const minThreshold = topScore * minScoreRatio;
18029
+ results = results.filter((r) => r.finalScore >= minThreshold);
18030
+ }
18031
+ }
18032
+ if (threshold > 0 && results.length > 1) {
18033
+ for (let i = 1; i < results.length; i++) {
18034
+ const prev = results[i - 1].finalScore;
18035
+ const current = results[i].finalScore;
18036
+ if (prev > 0) {
18037
+ const gap = (prev - current) / prev;
18038
+ if (gap >= threshold) {
18039
+ return results.slice(0, i);
18040
+ }
18041
+ }
18042
+ }
18043
+ }
18044
+ return results;
18045
+ }
18046
+
18047
+ // src/search/related-pages.ts
18048
+ function diceScore(urlA, urlB) {
18049
+ const segmentsA = urlA.split("/").filter(Boolean);
18050
+ const segmentsB = urlB.split("/").filter(Boolean);
18051
+ if (segmentsA.length === 0 && segmentsB.length === 0) return 1;
18052
+ if (segmentsA.length === 0 || segmentsB.length === 0) return 0;
18053
+ let shared = 0;
18054
+ const minLen = Math.min(segmentsA.length, segmentsB.length);
18055
+ for (let i = 0; i < minLen; i++) {
18056
+ if (segmentsA[i] === segmentsB[i]) {
18057
+ shared++;
18058
+ } else {
18059
+ break;
18060
+ }
18061
+ }
18062
+ return 2 * shared / (segmentsA.length + segmentsB.length);
18063
+ }
18064
+ function compositeScore(isLinked, dice, semantic) {
18065
+ return (isLinked ? 0.5 : 0) + 0.3 * dice + 0.2 * semantic;
18066
+ }
18067
+ function dominantRelationshipType(isOutgoing, isIncoming, dice) {
18068
+ if (isOutgoing) return "outgoing_link";
18069
+ if (isIncoming) return "incoming_link";
18070
+ if (dice > 0.4) return "sibling";
18071
+ return "semantic";
18072
+ }
18073
+
18074
+ // src/utils/structured-meta.ts
18075
+ var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
18076
+ function validateMetaKey(key) {
18077
+ return VALID_KEY_RE.test(key);
18078
+ }
18079
+ function parseMetaValue(content, dataType) {
18080
+ switch (dataType) {
18081
+ case "number": {
18082
+ const n = Number(content);
18083
+ return Number.isFinite(n) ? n : content;
18084
+ }
18085
+ case "boolean":
18086
+ return content === "true";
18087
+ case "string[]":
18088
+ return content ? content.split(",").map((s) => s.trim()) : [];
18089
+ case "date": {
18090
+ const ms = Number(content);
18091
+ return Number.isFinite(ms) ? ms : content;
18092
+ }
18093
+ default:
18094
+ return content;
18095
+ }
18096
+ }
18097
+ function escapeFilterValue(s) {
18098
+ return s.replace(/'/g, "''");
18099
+ }
18100
+ function buildMetaFilterString(filters) {
18101
+ const clauses = [];
18102
+ for (const [key, value] of Object.entries(filters)) {
18103
+ if (!validateMetaKey(key)) continue;
18104
+ const field = `meta.${key}`;
18105
+ if (typeof value === "string") {
18106
+ clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
18107
+ } else if (typeof value === "boolean") {
18108
+ clauses.push(`${field} = ${value}`);
18109
+ } else {
18110
+ clauses.push(`${field} = ${value}`);
18111
+ }
18112
+ }
18113
+ return clauses.join(" AND ");
18114
+ }
17612
18115
 
17613
18116
  // src/search/engine.ts
18117
+ var rankingOverridesSchema = z.object({
18118
+ ranking: z.object({
18119
+ enableIncomingLinkBoost: z.boolean().optional(),
18120
+ enableDepthBoost: z.boolean().optional(),
18121
+ aggregationCap: z.number().int().positive().optional(),
18122
+ aggregationDecay: z.number().min(0).max(1).optional(),
18123
+ minChunkScoreRatio: z.number().min(0).max(1).optional(),
18124
+ minScoreRatio: z.number().min(0).max(1).optional(),
18125
+ scoreGapThreshold: z.number().min(0).max(1).optional(),
18126
+ weights: z.object({
18127
+ incomingLinks: z.number().optional(),
18128
+ depth: z.number().optional(),
18129
+ aggregation: z.number().optional(),
18130
+ titleMatch: z.number().optional()
18131
+ }).optional()
18132
+ }).optional(),
18133
+ search: z.object({
18134
+ pageSearchWeight: z.number().min(0).max(1).optional()
18135
+ }).optional()
18136
+ }).optional();
17614
18137
  var requestSchema = z.object({
17615
18138
  q: z.string().trim().min(1),
17616
18139
  topK: z.number().int().positive().max(100).optional(),
17617
18140
  scope: z.string().optional(),
17618
18141
  pathPrefix: z.string().optional(),
17619
18142
  tags: z.array(z.string()).optional(),
17620
- groupBy: z.enum(["page", "chunk"]).optional()
18143
+ filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
18144
+ groupBy: z.enum(["page", "chunk"]).optional(),
18145
+ maxSubResults: z.number().int().positive().max(20).optional(),
18146
+ debug: z.boolean().optional(),
18147
+ rankingOverrides: rankingOverridesSchema
17621
18148
  });
17622
- var SearchEngine = class _SearchEngine {
17623
- cwd;
17624
- config;
17625
- store;
17626
- constructor(options) {
17627
- this.cwd = options.cwd;
17628
- this.config = options.config;
17629
- this.store = options.store;
18149
+ var MAX_SITE_STRUCTURE_PAGES = 2e3;
18150
+ function makeNode(url, depth) {
18151
+ return { url, title: "", depth, routeFile: "", isIndexed: false, childCount: 0, children: [] };
18152
+ }
18153
+ function buildTree(pages, pathPrefix) {
18154
+ const nodeMap = /* @__PURE__ */ new Map();
18155
+ const root2 = makeNode("/", 0);
18156
+ nodeMap.set("/", root2);
18157
+ for (const page of pages) {
18158
+ const normalized = normalizeUrlPath(page.url);
18159
+ const segments = normalized.split("/").filter(Boolean);
18160
+ if (segments.length === 0) {
18161
+ root2.title = page.title;
18162
+ root2.routeFile = page.routeFile;
18163
+ root2.isIndexed = true;
18164
+ continue;
18165
+ }
18166
+ for (let i = 1; i <= segments.length; i++) {
18167
+ const partialUrl = "/" + segments.slice(0, i).join("/");
18168
+ if (!nodeMap.has(partialUrl)) {
18169
+ nodeMap.set(partialUrl, makeNode(partialUrl, i));
18170
+ }
18171
+ }
18172
+ const node = nodeMap.get(normalized);
18173
+ node.title = page.title;
18174
+ node.routeFile = page.routeFile;
18175
+ node.isIndexed = true;
18176
+ }
18177
+ for (const [url, node] of nodeMap) {
18178
+ if (url === "/") continue;
18179
+ const segments = url.split("/").filter(Boolean);
18180
+ const parentUrl = segments.length === 1 ? "/" : "/" + segments.slice(0, -1).join("/");
18181
+ const parent = nodeMap.get(parentUrl) ?? root2;
18182
+ parent.children.push(node);
18183
+ }
18184
+ const sortAndCount = (node) => {
18185
+ node.children.sort((a, b) => a.url.localeCompare(b.url));
18186
+ node.childCount = node.children.length;
18187
+ for (const child of node.children) {
18188
+ sortAndCount(child);
18189
+ }
18190
+ };
18191
+ sortAndCount(root2);
18192
+ if (pathPrefix) {
18193
+ const normalizedPrefix = normalizeUrlPath(pathPrefix);
18194
+ const subtreeRoot = nodeMap.get(normalizedPrefix);
18195
+ if (subtreeRoot) {
18196
+ return subtreeRoot;
18197
+ }
18198
+ return makeNode(normalizedPrefix, normalizedPrefix.split("/").filter(Boolean).length);
18199
+ }
18200
+ return root2;
18201
+ }
18202
+ function mergeRankingOverrides(base, overrides) {
18203
+ return {
18204
+ ...base,
18205
+ search: {
18206
+ ...base.search,
18207
+ ...overrides.search
18208
+ },
18209
+ ranking: {
18210
+ ...base.ranking,
18211
+ ...overrides.ranking,
18212
+ weights: {
18213
+ ...base.ranking.weights,
18214
+ ...overrides.ranking?.weights
18215
+ }
18216
+ }
18217
+ };
18218
+ }
18219
+ var SearchEngine = class _SearchEngine {
18220
+ cwd;
18221
+ config;
18222
+ store;
18223
+ constructor(options) {
18224
+ this.cwd = options.cwd;
18225
+ this.config = options.config;
18226
+ this.store = options.store;
17630
18227
  }
17631
18228
  static async create(options = {}) {
17632
18229
  const cwd = path.resolve(options.cwd ?? process.cwd());
@@ -17648,125 +18245,203 @@ var SearchEngine = class _SearchEngine {
17648
18245
  }
17649
18246
  const input = parsed.data;
17650
18247
  const totalStart = process.hrtime.bigint();
18248
+ const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
17651
18249
  const resolvedScope = resolveScope(this.config, input.scope);
17652
18250
  const topK = input.topK ?? 10;
18251
+ const maxSubResults = input.maxSubResults ?? 5;
17653
18252
  const groupByPage = (input.groupBy ?? "page") === "page";
17654
- const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
17655
- const filterParts = [];
17656
- if (input.pathPrefix) {
17657
- const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
17658
- filterParts.push(`url GLOB '${prefix}*'`);
17659
- }
17660
- if (input.tags && input.tags.length > 0) {
17661
- for (const tag of input.tags) {
17662
- filterParts.push(`tags GLOB '*${tag}*'`);
18253
+ const queryText = input.q;
18254
+ const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
18255
+ const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
18256
+ const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
18257
+ const metaFilter = metaFilterStr || void 0;
18258
+ const applyPagePostFilters = (hits) => {
18259
+ let filtered = hits;
18260
+ if (pathPrefix) {
18261
+ filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
18262
+ }
18263
+ if (filterTags) {
18264
+ filtered = filtered.filter(
18265
+ (h) => filterTags.every((tag) => h.tags.includes(tag))
18266
+ );
17663
18267
  }
17664
- }
17665
- const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
17666
- const useDualSearch = this.config.search.dualSearch && groupByPage;
18268
+ return filtered;
18269
+ };
18270
+ const applyChunkPostFilters = (hits) => {
18271
+ let filtered = hits;
18272
+ if (filterTags) {
18273
+ filtered = filtered.filter(
18274
+ (h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
18275
+ );
18276
+ }
18277
+ return filtered;
18278
+ };
17667
18279
  const searchStart = process.hrtime.bigint();
17668
- let ranked;
17669
- if (useDualSearch) {
17670
- const chunkLimit = Math.max(topK * 10, 100);
17671
- const pageLimit = 20;
17672
- const [pageHits, chunkHits] = await Promise.all([
17673
- this.store.searchPages(
17674
- input.q,
17675
- {
17676
- limit: pageLimit,
17677
- semanticWeight: this.config.search.semanticWeight,
17678
- inputEnrichment: this.config.search.inputEnrichment,
17679
- filter
17680
- },
17681
- resolvedScope
17682
- ),
17683
- this.store.search(
17684
- input.q,
17685
- {
17686
- limit: chunkLimit,
17687
- semanticWeight: this.config.search.semanticWeight,
17688
- inputEnrichment: this.config.search.inputEnrichment,
17689
- reranking: false,
17690
- filter
17691
- },
18280
+ if (groupByPage) {
18281
+ const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
18282
+ const pageLimit = Math.max(topK * 2, 20);
18283
+ const pageHits = await this.store.searchPagesByText(
18284
+ queryText,
18285
+ { limit: pageLimit * fetchMultiplier, filter: metaFilter },
18286
+ resolvedScope
18287
+ );
18288
+ const filteredPages = applyPagePostFilters(pageHits);
18289
+ let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
18290
+ rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
18291
+ const topPages = rankedPages.slice(0, topK);
18292
+ const chunkPromises = topPages.map(
18293
+ (page) => this.store.searchChunksByUrl(
18294
+ queryText,
18295
+ page.url,
18296
+ { limit: maxSubResults, filter: metaFilter },
17692
18297
  resolvedScope
17693
- )
17694
- ]);
17695
- const rankedChunks = rankHits(chunkHits, this.config, input.q);
17696
- ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
18298
+ ).then((chunks) => applyChunkPostFilters(chunks))
18299
+ );
18300
+ const allChunks = await Promise.all(chunkPromises);
18301
+ const searchMs = hrTimeMs(searchStart);
18302
+ const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
18303
+ return {
18304
+ q: input.q,
18305
+ scope: resolvedScope.scopeName,
18306
+ results,
18307
+ meta: {
18308
+ timingsMs: {
18309
+ search: Math.round(searchMs),
18310
+ total: Math.round(hrTimeMs(totalStart))
18311
+ }
18312
+ }
18313
+ };
17697
18314
  } else {
18315
+ const candidateK = Math.max(50, topK);
18316
+ const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
17698
18317
  const hits = await this.store.search(
17699
- input.q,
17700
- {
17701
- limit: candidateK,
17702
- semanticWeight: this.config.search.semanticWeight,
17703
- inputEnrichment: this.config.search.inputEnrichment,
17704
- reranking: this.config.search.reranking,
17705
- filter
17706
- },
18318
+ queryText,
18319
+ { limit: candidateK * fetchMultiplier, filter: metaFilter },
17707
18320
  resolvedScope
17708
18321
  );
17709
- ranked = rankHits(hits, this.config, input.q);
17710
- }
17711
- const searchMs = hrTimeMs(searchStart);
17712
- const results = this.buildResults(ranked, topK, groupByPage, input.q);
17713
- return {
17714
- q: input.q,
17715
- scope: resolvedScope.scopeName,
17716
- results,
17717
- meta: {
17718
- timingsMs: {
17719
- search: Math.round(searchMs),
17720
- total: Math.round(hrTimeMs(totalStart))
18322
+ let filtered = hits;
18323
+ if (pathPrefix) {
18324
+ filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
18325
+ }
18326
+ if (filterTags) {
18327
+ filtered = filtered.filter(
18328
+ (h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
18329
+ );
18330
+ }
18331
+ const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
18332
+ const searchMs = hrTimeMs(searchStart);
18333
+ const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
18334
+ return {
18335
+ q: input.q,
18336
+ scope: resolvedScope.scopeName,
18337
+ results,
18338
+ meta: {
18339
+ timingsMs: {
18340
+ search: Math.round(searchMs),
18341
+ total: Math.round(hrTimeMs(totalStart))
18342
+ }
17721
18343
  }
18344
+ };
18345
+ }
18346
+ }
18347
+ buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
18348
+ return rankedPages.map((page, i) => {
18349
+ const chunks = allChunks[i] ?? [];
18350
+ const bestChunk = chunks[0];
18351
+ const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
18352
+ const result = {
18353
+ url: page.url,
18354
+ title: page.title,
18355
+ sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
18356
+ snippet,
18357
+ chunkText: bestChunk?.metadata.chunkText || void 0,
18358
+ score: Number(page.finalScore.toFixed(6)),
18359
+ routeFile: page.routeFile,
18360
+ chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
18361
+ sectionTitle: c.metadata.sectionTitle || void 0,
18362
+ snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
18363
+ chunkText: c.metadata.chunkText || void 0,
18364
+ headingPath: c.metadata.headingPath,
18365
+ score: Number(c.score.toFixed(6))
18366
+ })) : void 0
18367
+ };
18368
+ if (debug && page.breakdown) {
18369
+ result.breakdown = {
18370
+ baseScore: page.breakdown.baseScore,
18371
+ incomingLinkBoost: page.breakdown.incomingLinkBoost,
18372
+ depthBoost: page.breakdown.depthBoost,
18373
+ titleMatchBoost: page.breakdown.titleMatchBoost,
18374
+ freshnessBoost: page.breakdown.freshnessBoost,
18375
+ anchorTextMatchBoost: 0
18376
+ };
17722
18377
  }
17723
- };
18378
+ return result;
18379
+ });
17724
18380
  }
17725
- ensureSnippet(hit) {
18381
+ ensureSnippet(hit, query) {
18382
+ const chunkText = hit.hit.metadata.chunkText;
18383
+ if (query && chunkText) return queryAwareExcerpt(chunkText, query);
17726
18384
  const snippet = hit.hit.metadata.snippet;
17727
18385
  if (snippet && snippet.length >= 30) return snippet;
17728
- const chunkText = hit.hit.metadata.chunkText;
17729
18386
  if (chunkText) return toSnippet(chunkText);
17730
18387
  return snippet || "";
17731
18388
  }
17732
- buildResults(ordered, topK, groupByPage, _query) {
18389
+ buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
18390
+ const cfg = config ?? this.config;
17733
18391
  if (groupByPage) {
17734
- let pages = aggregateByPage(ordered, this.config);
17735
- pages = trimByScoreGap(pages, this.config);
17736
- const minRatio = this.config.ranking.minChunkScoreRatio;
18392
+ let pages = aggregateByPage(ordered, cfg);
18393
+ pages = trimByScoreGap(pages, cfg);
18394
+ const minRatio = cfg.ranking.minChunkScoreRatio;
17737
18395
  return pages.slice(0, topK).map((page) => {
17738
18396
  const bestScore = page.bestChunk.finalScore;
17739
18397
  const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
17740
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
17741
- return {
18398
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
18399
+ const result = {
17742
18400
  url: page.url,
17743
18401
  title: page.title,
17744
18402
  sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
17745
- snippet: this.ensureSnippet(page.bestChunk),
18403
+ snippet: this.ensureSnippet(page.bestChunk, query),
18404
+ chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
17746
18405
  score: Number(page.pageScore.toFixed(6)),
17747
18406
  routeFile: page.routeFile,
17748
- chunks: meaningful.length > 1 ? meaningful.map((c) => ({
18407
+ chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
17749
18408
  sectionTitle: c.hit.metadata.sectionTitle || void 0,
17750
- snippet: this.ensureSnippet(c),
18409
+ snippet: this.ensureSnippet(c, query),
18410
+ chunkText: c.hit.metadata.chunkText || void 0,
17751
18411
  headingPath: c.hit.metadata.headingPath,
17752
18412
  score: Number(c.finalScore.toFixed(6))
17753
18413
  })) : void 0
17754
18414
  };
18415
+ if (debug && page.bestChunk.breakdown) {
18416
+ result.breakdown = page.bestChunk.breakdown;
18417
+ }
18418
+ return result;
17755
18419
  });
17756
18420
  } else {
17757
18421
  let filtered = ordered;
17758
- const minScore = this.config.ranking.minScore;
17759
- if (minScore > 0) {
17760
- filtered = ordered.filter((entry) => entry.finalScore >= minScore);
17761
- }
17762
- return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
17763
- url: hit.metadata.url,
17764
- title: hit.metadata.title,
17765
- sectionTitle: hit.metadata.sectionTitle || void 0,
17766
- snippet: this.ensureSnippet({ hit, finalScore }),
17767
- score: Number(finalScore.toFixed(6)),
17768
- routeFile: hit.metadata.routeFile
17769
- }));
18422
+ const minScoreRatio = cfg.ranking.minScoreRatio;
18423
+ if (minScoreRatio > 0 && ordered.length > 0) {
18424
+ const topScore = ordered[0].finalScore;
18425
+ if (Number.isFinite(topScore) && topScore > 0) {
18426
+ const threshold = topScore * minScoreRatio;
18427
+ filtered = ordered.filter((entry) => entry.finalScore >= threshold);
18428
+ }
18429
+ }
18430
+ return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
18431
+ const result = {
18432
+ url: hit.metadata.url,
18433
+ title: hit.metadata.title,
18434
+ sectionTitle: hit.metadata.sectionTitle || void 0,
18435
+ snippet: this.ensureSnippet({ hit, finalScore }, query),
18436
+ chunkText: hit.metadata.chunkText || void 0,
18437
+ score: Number(finalScore.toFixed(6)),
18438
+ routeFile: hit.metadata.routeFile
18439
+ };
18440
+ if (debug && breakdown) {
18441
+ result.breakdown = breakdown;
18442
+ }
18443
+ return result;
18444
+ });
17770
18445
  }
17771
18446
  }
17772
18447
  async getPage(pathOrUrl, scope) {
@@ -17792,6 +18467,116 @@ var SearchEngine = class _SearchEngine {
17792
18467
  markdown: page.markdown
17793
18468
  };
17794
18469
  }
18470
+ async listPages(opts) {
18471
+ const resolvedScope = resolveScope(this.config, opts?.scope);
18472
+ const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
18473
+ return this.store.listPages(resolvedScope, {
18474
+ cursor: opts?.cursor,
18475
+ limit: opts?.limit,
18476
+ pathPrefix
18477
+ });
18478
+ }
18479
+ async getSiteStructure(opts) {
18480
+ const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
18481
+ const allPages = [];
18482
+ let cursor;
18483
+ let truncated = false;
18484
+ do {
18485
+ const result = await this.listPages({
18486
+ pathPrefix: opts?.pathPrefix,
18487
+ scope: opts?.scope,
18488
+ cursor,
18489
+ limit: 200
18490
+ });
18491
+ allPages.push(...result.pages);
18492
+ cursor = result.nextCursor;
18493
+ if (allPages.length >= maxPages) {
18494
+ truncated = allPages.length > maxPages || !!cursor;
18495
+ allPages.length = maxPages;
18496
+ break;
18497
+ }
18498
+ } while (cursor);
18499
+ const root2 = buildTree(allPages, opts?.pathPrefix);
18500
+ return {
18501
+ root: root2,
18502
+ totalPages: allPages.length,
18503
+ truncated
18504
+ };
18505
+ }
18506
+ async getRelatedPages(pathOrUrl, opts) {
18507
+ const resolvedScope = resolveScope(this.config, opts?.scope);
18508
+ const urlPath = this.resolveInputPath(pathOrUrl);
18509
+ const topK = Math.min(opts?.topK ?? 10, 25);
18510
+ const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
18511
+ if (!source) {
18512
+ throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
18513
+ }
18514
+ const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
18515
+ const semanticHits = await this.store.searchPagesByVector(
18516
+ source.vector,
18517
+ { limit: 50 },
18518
+ resolvedScope
18519
+ );
18520
+ const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
18521
+ const semanticScoreMap = /* @__PURE__ */ new Map();
18522
+ for (const hit of filteredHits) {
18523
+ semanticScoreMap.set(hit.url, hit.score);
18524
+ }
18525
+ const candidateUrls = /* @__PURE__ */ new Set();
18526
+ for (const hit of filteredHits) {
18527
+ candidateUrls.add(hit.url);
18528
+ }
18529
+ for (const url of sourceOutgoing) {
18530
+ if (url !== urlPath) candidateUrls.add(url);
18531
+ }
18532
+ const missingUrls = [...sourceOutgoing].filter(
18533
+ (u) => u !== urlPath && !semanticScoreMap.has(u)
18534
+ );
18535
+ const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
18536
+ const metaMap = /* @__PURE__ */ new Map();
18537
+ for (const hit of filteredHits) {
18538
+ metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
18539
+ }
18540
+ for (const p of fetchedPages) {
18541
+ metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
18542
+ }
18543
+ const semanticUrls = filteredHits.map((h) => h.url);
18544
+ if (semanticUrls.length > 0) {
18545
+ const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
18546
+ for (const p of semanticPageData) {
18547
+ const existing = metaMap.get(p.url);
18548
+ if (existing) {
18549
+ existing.outgoingLinkUrls = p.outgoingLinkUrls;
18550
+ }
18551
+ }
18552
+ }
18553
+ const candidates = [];
18554
+ for (const url of candidateUrls) {
18555
+ const meta = metaMap.get(url);
18556
+ if (!meta) continue;
18557
+ const isOutgoing = sourceOutgoing.has(url);
18558
+ const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
18559
+ const isLinked = isOutgoing || isIncoming;
18560
+ const dice = diceScore(urlPath, url);
18561
+ const semantic = semanticScoreMap.get(url) ?? 0;
18562
+ const score = compositeScore(isLinked, dice, semantic);
18563
+ const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
18564
+ candidates.push({
18565
+ url,
18566
+ title: meta.title,
18567
+ score: Number(score.toFixed(6)),
18568
+ relationshipType,
18569
+ routeFile: meta.routeFile
18570
+ });
18571
+ }
18572
+ candidates.sort((a, b) => b.score - a.score);
18573
+ const results = candidates.slice(0, topK);
18574
+ return {
18575
+ sourceUrl: urlPath,
18576
+ scope: resolvedScope.scopeName,
18577
+ relatedPages: results
18578
+ };
18579
+ }
17795
18580
  // Reports backend health by returning the result of the store's own health() call unchanged.
  async health() {
17796
18581
  return this.store.health();
17797
18582
  }
@@ -17807,6 +18592,215 @@ var SearchEngine = class _SearchEngine {
17807
18592
  }
17808
18593
  };
17809
18594
 
18595
+ // src/mcp/server.ts
18596
/**
 * Builds an MCP server exposing the search engine as six tools: `search`,
 * `get_page`, `list_pages`, `get_site_structure`, `find_source_file` and
 * `get_related_pages`.
 *
 * Every tool answers with a single text content item holding the JSON-encoded
 * engine result; `search` additionally returns the result as structured
 * content described by its output schema.
 *
 * @param {object} engine - Object providing `search`, `getPage`, `listPages`,
 *   `getSiteStructure` and `getRelatedPages`.
 * @returns {McpServer} The configured (not yet connected) server.
 */
function createServer(engine) {
  const server = new McpServer({
    name: "searchsocket-mcp",
    version: "0.1.0"
  });

  // Wraps a value as the pretty-printed JSON text payload most tools return.
  const jsonContent = (value) => [
    {
      type: "text",
      text: JSON.stringify(value, null, 2)
    }
  ];

  server.registerTool(
    "search",
    {
      description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(100).optional(),
        pathPrefix: z.string().optional(),
        tags: z.array(z.string()).optional(),
        filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
        groupBy: z.enum(["page", "chunk"]).optional(),
        maxSubResults: z.number().int().positive().max(20).optional()
      },
      outputSchema: {
        q: z.string(),
        scope: z.string(),
        results: z.array(z.object({
          url: z.string(),
          title: z.string(),
          sectionTitle: z.string().optional(),
          snippet: z.string(),
          // Declared because the description promises it and results carry it;
          // optional because it is omitted when the chunk has no stored text.
          chunkText: z.string().optional(),
          score: z.number(),
          routeFile: z.string(),
          chunks: z.array(z.object({
            sectionTitle: z.string().optional(),
            snippet: z.string(),
            chunkText: z.string().optional(),
            headingPath: z.array(z.string()),
            score: z.number()
          })).optional()
        })),
        meta: z.object({
          timingsMs: z.object({
            search: z.number(),
            total: z.number()
          })
        })
      }
    },
    async (input) => {
      const result = await engine.search({
        q: input.query,
        topK: input.topK,
        scope: input.scope,
        pathPrefix: input.pathPrefix,
        tags: input.tags,
        filters: input.filters,
        groupBy: input.groupBy,
        maxSubResults: input.maxSubResults
      });
      return {
        content: jsonContent(result),
        structuredContent: result
      };
    }
  );

  server.registerTool(
    "get_page",
    {
      description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const page = await engine.getPage(input.pathOrUrl, input.scope);
      return { content: jsonContent(page) };
    }
  );

  server.registerTool(
    "list_pages",
    {
      description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        cursor: z.string().optional(),
        limit: z.number().int().positive().max(200).optional(),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const result = await engine.listPages({
        pathPrefix: input.pathPrefix,
        cursor: input.cursor,
        limit: input.limit,
        scope: input.scope
      });
      return { content: jsonContent(result) };
    }
  );

  server.registerTool(
    "get_site_structure",
    {
      description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        scope: z.string().optional(),
        maxPages: z.number().int().positive().max(2e3).optional()
      }
    },
    async (input) => {
      const result = await engine.getSiteStructure({
        pathPrefix: input.pathPrefix,
        scope: input.scope,
        maxPages: input.maxPages
      });
      return { content: jsonContent(result) };
    }
  );

  server.registerTool(
    "find_source_file",
    {
      description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      // Only the single best match is of interest here.
      const result = await engine.search({
        q: input.query,
        topK: 1,
        scope: input.scope
      });
      if (result.results.length === 0) {
        return {
          content: [
            {
              type: "text",
              text: JSON.stringify({
                error: "No matching content found for the given query."
              })
            }
          ]
        };
      }
      const match = result.results[0];
      const { url, routeFile, sectionTitle, snippet } = match;
      return {
        content: [
          {
            type: "text",
            text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
          }
        ]
      };
    }
  );

  server.registerTool(
    "get_related_pages",
    {
      description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(25).optional()
      }
    },
    async (input) => {
      const result = await engine.getRelatedPages(input.pathOrUrl, {
        topK: input.topK,
        scope: input.scope
      });
      return { content: jsonContent(result) };
    }
  );

  return server;
}
18803
+
17810
18804
  // src/sveltekit/handle.ts
17811
18805
  var InMemoryRateLimiter = class {
17812
18806
  constructor(windowMs, max) {
@@ -17835,7 +18829,13 @@ function searchsocketHandle(options = {}) {
17835
18829
  let enginePromise = null;
17836
18830
  let configPromise = null;
17837
18831
  let apiPath = options.path;
18832
+ let llmsServePath = null;
18833
+ let serveMarkdownVariants = false;
18834
+ let mcpPath;
18835
+ let mcpApiKey;
18836
+ let mcpEnableJsonResponse = true;
17838
18837
  let rateLimiter = null;
18838
+ let notConfigured = false;
17839
18839
  const getConfig = async () => {
17840
18840
  if (!configPromise) {
17841
18841
  let configP;
@@ -17852,6 +18852,13 @@ function searchsocketHandle(options = {}) {
17852
18852
  }
17853
18853
  configPromise = configP.then((config) => {
17854
18854
  apiPath = apiPath ?? config.api.path;
18855
+ mcpPath = config.mcp.handle.path;
18856
+ mcpApiKey = config.mcp.handle.apiKey;
18857
+ mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
18858
+ if (config.llmsTxt.enable) {
18859
+ llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
18860
+ serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
18861
+ }
17855
18862
  if (config.api.rateLimit && !isServerless()) {
17856
18863
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
17857
18864
  }
@@ -17861,59 +18868,109 @@ function searchsocketHandle(options = {}) {
17861
18868
  return configPromise;
17862
18869
  };
17863
18870
  const getEngine = async () => {
18871
+ if (notConfigured) {
18872
+ throw new SearchSocketError(
18873
+ "SEARCH_NOT_CONFIGURED",
18874
+ "Search is not configured. Set the required Upstash environment variables to enable search.",
18875
+ 503
18876
+ );
18877
+ }
17864
18878
  if (!enginePromise) {
17865
18879
  const config = await getConfig();
17866
18880
  enginePromise = SearchEngine.create({
17867
18881
  cwd: options.cwd,
17868
18882
  config
18883
+ }).catch((error) => {
18884
+ enginePromise = null;
18885
+ if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
18886
+ notConfigured = true;
18887
+ throw new SearchSocketError(
18888
+ "SEARCH_NOT_CONFIGURED",
18889
+ "Search is not configured. Set the required Upstash environment variables to enable search.",
18890
+ 503
18891
+ );
18892
+ }
18893
+ throw error;
17869
18894
  });
17870
18895
  }
17871
18896
  return enginePromise;
17872
18897
  };
17873
18898
  const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
17874
18899
  return async ({ event, resolve }) => {
17875
- if (apiPath && event.url.pathname !== apiPath) {
17876
- return resolve(event);
18900
+ if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
18901
+ const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
18902
+ if (mcpPath && event.url.pathname === mcpPath) {
18903
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18904
+ }
18905
+ if (mcpPath) {
18906
+ if (serveMarkdownVariants && isMarkdownVariant) ; else {
18907
+ return resolve(event);
18908
+ }
18909
+ } else {
18910
+ if (configPromise || options.config || options.rawConfig) {
18911
+ await getConfig();
18912
+ if (mcpPath && event.url.pathname === mcpPath) {
18913
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18914
+ }
18915
+ if (!(serveMarkdownVariants && isMarkdownVariant)) {
18916
+ return resolve(event);
18917
+ }
18918
+ } else {
18919
+ return resolve(event);
18920
+ }
18921
+ }
17877
18922
  }
17878
18923
  const config = await getConfig();
18924
+ if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
18925
+ const cwd = options.cwd ?? process.cwd();
18926
+ const filePath = path.resolve(cwd, config.llmsTxt.outputPath);
18927
+ try {
18928
+ const content = await fs9.readFile(filePath, "utf8");
18929
+ return new Response(content, {
18930
+ status: 200,
18931
+ headers: { "content-type": "text/plain; charset=utf-8" }
18932
+ });
18933
+ } catch {
18934
+ return resolve(event);
18935
+ }
18936
+ }
18937
+ if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
18938
+ let rawPath;
18939
+ try {
18940
+ rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
18941
+ } catch {
18942
+ return resolve(event);
18943
+ }
18944
+ const scope = event.url.searchParams?.get("scope") ?? void 0;
18945
+ try {
18946
+ const engine = await getEngine();
18947
+ const page = await engine.getPage(rawPath, scope);
18948
+ return new Response(page.markdown, {
18949
+ status: 200,
18950
+ headers: { "content-type": "text/markdown; charset=utf-8" }
18951
+ });
18952
+ } catch (error) {
18953
+ if (error instanceof SearchSocketError && error.status === 404) {
18954
+ return resolve(event);
18955
+ }
18956
+ throw error;
18957
+ }
18958
+ }
18959
+ if (mcpPath && event.url.pathname === mcpPath) {
18960
+ return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
18961
+ }
17879
18962
  const targetPath = apiPath ?? config.api.path;
17880
- if (event.url.pathname !== targetPath) {
18963
+ if (!isApiPath(event.url.pathname, targetPath)) {
17881
18964
  return resolve(event);
17882
18965
  }
17883
- if (event.request.method === "OPTIONS") {
18966
+ const subPath = event.url.pathname.slice(targetPath.length);
18967
+ const method = event.request.method;
18968
+ if (method === "OPTIONS") {
17884
18969
  return new Response(null, {
17885
18970
  status: 204,
17886
18971
  headers: buildCorsHeaders(event.request, config)
17887
18972
  });
17888
18973
  }
17889
- if (event.request.method !== "POST") {
17890
- return withCors(
17891
- new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
17892
- status: 405,
17893
- headers: {
17894
- "content-type": "application/json"
17895
- }
17896
- }),
17897
- event.request,
17898
- config
17899
- );
17900
- }
17901
- const contentLength = Number(event.request.headers.get("content-length") ?? 0);
17902
- if (contentLength > bodyLimit) {
17903
- return withCors(
17904
- new Response(
17905
- JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
17906
- {
17907
- status: 413,
17908
- headers: {
17909
- "content-type": "application/json"
17910
- }
17911
- }
17912
- ),
17913
- event.request,
17914
- config
17915
- );
17916
- }
17917
18974
  if (rateLimiter) {
17918
18975
  const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
17919
18976
  if (!rateLimiter.check(ip)) {
@@ -17933,39 +18990,32 @@ function searchsocketHandle(options = {}) {
17933
18990
  }
17934
18991
  }
17935
18992
  try {
17936
- let rawBody;
17937
- if (typeof event.request.text === "function") {
17938
- rawBody = await event.request.text();
17939
- } else {
17940
- let parsedFallback;
17941
- try {
17942
- parsedFallback = await event.request.json();
17943
- } catch (error) {
17944
- if (error instanceof SyntaxError) {
17945
- throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
17946
- }
17947
- throw error;
18993
+ if (method === "GET") {
18994
+ if (subPath === "" || subPath === "/") {
18995
+ return await handleGetSearch(event, config, getEngine);
17948
18996
  }
17949
- rawBody = JSON.stringify(parsedFallback);
18997
+ if (subPath === "/health") {
18998
+ return await handleGetHealth(event, config, getEngine);
18999
+ }
19000
+ if (subPath.startsWith("/pages/")) {
19001
+ return await handleGetPage(event, config, getEngine, subPath);
19002
+ }
19003
+ return withCors(
19004
+ new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
19005
+ status: 404,
19006
+ headers: { "content-type": "application/json" }
19007
+ }),
19008
+ event.request,
19009
+ config
19010
+ );
17950
19011
  }
17951
- if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
17952
- throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
19012
+ if (method === "POST" && (subPath === "" || subPath === "/")) {
19013
+ return await handlePostSearch(event, config, getEngine, bodyLimit);
17953
19014
  }
17954
- let body;
17955
- try {
17956
- body = JSON.parse(rawBody);
17957
- } catch {
17958
- throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
17959
- }
17960
- const engine = await getEngine();
17961
- const searchRequest = body;
17962
- const result = await engine.search(searchRequest);
17963
19015
  return withCors(
17964
- new Response(JSON.stringify(result), {
17965
- status: 200,
17966
- headers: {
17967
- "content-type": "application/json"
17968
- }
19016
+ new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
19017
+ status: 405,
19018
+ headers: { "content-type": "application/json" }
17969
19019
  }),
17970
19020
  event.request,
17971
19021
  config
@@ -17986,6 +19036,183 @@ function searchsocketHandle(options = {}) {
17986
19036
  }
17987
19037
  };
17988
19038
  }
19039
/**
 * Tells whether a request path belongs to the search API.
 *
 * @param {string} pathname - Request path to classify.
 * @param {string} apiPath - Mount path of the API.
 * @returns {boolean} True for the mount path itself or anything nested under `apiPath + "/"`.
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  // Requiring the "/" separator keeps sibling paths that merely share the prefix out.
  const nestedPrefix = `${apiPath}/`;
  return pathname.startsWith(nestedPrefix);
}
19042
/**
 * Serves `GET <apiPath>?q=...` by translating query-string parameters into a
 * search request and returning the engine result as JSON with CORS headers.
 *
 * Parameters read: `q` (required), `topK`, `scope`, `pathPrefix`, `groupBy`
 * ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`. Metadata
 * `filters` are not read from the query string.
 *
 * @param {object} event - Request event; `event.url.searchParams` and `event.request` are used.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} 200 JSON response.
 * @throws {SearchSocketError} INVALID_REQUEST / 400 on a missing `q` or a malformed parameter.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;

  // Reads an optional positive-integer parameter. Only plain decimal digits are
  // accepted: `Number.parseInt` alone would let "5abc" or "2.5" through as 5 / 2.
  const readPositiveInt = (name, message, max = Number.MAX_SAFE_INTEGER) => {
    const raw = params.get(name);
    if (raw === null) return void 0;
    const text = raw.trim();
    const value = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isSafeInteger(value) || value < 1 || value > max) {
      throw new SearchSocketError("INVALID_REQUEST", message, 400);
    }
    return value;
  };

  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = readPositiveInt("topK", "topK must be a positive integer");
  if (topK !== void 0) searchRequest.topK = topK;

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = readPositiveInt(
    "maxSubResults",
    "maxSubResults must be a positive integer between 1 and 20",
    20
  );
  if (maxSubResults !== void 0) searchRequest.maxSubResults = maxSubResults;

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
19089
/**
 * Serves the health endpoint: resolves the engine, asks it for its health
 * report and returns that report as a 200 JSON response with CORS headers.
 *
 * @param {object} event - Request event; only `event.request` is used (for CORS).
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>}
 */
async function handleGetHealth(event, config, getEngine) {
  const searchEngine = await getEngine();
  const report = await searchEngine.health();

  const payload = JSON.stringify(report);
  const response = new Response(payload, {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19101
/**
 * Serves `GET <apiPath>/pages/<path>`: decodes the page path that follows
 * "/pages", fetches the page from the engine and returns it as JSON with CORS
 * headers.
 *
 * @param {object} event - Request event; `event.url.searchParams` supplies the optional `scope`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @param {string} subPath - Request path relative to the API mount, starting with "/pages".
 * @returns {Promise<Response>} 200 JSON response.
 * @throws {SearchSocketError} INVALID_REQUEST / 400 when the path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Everything after the "/pages" segment (leading slash included) is the page path.
  const encodedPath = subPath.slice("/pages".length);

  const decodePath = () => {
    try {
      return decodeURIComponent(encodedPath);
    } catch {
      throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
    }
  };
  const pagePath = decodePath();

  const scope = event.url.searchParams?.get("scope") ?? void 0;
  const searchEngine = await getEngine();
  const page = await searchEngine.getPage(pagePath, scope);

  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19121
/**
 * Serves `POST <apiPath>`: reads the JSON body, enforces the size limit,
 * passes the parsed body to the engine as the search request and returns the
 * result as JSON with CORS headers.
 *
 * The size limit is checked twice: first against the declared
 * `content-length`, then against the byte length of the body actually read.
 *
 * @param {object} event - Request event; `event.request` supplies headers and body.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response.
 * @throws {SearchSocketError} INVALID_REQUEST / 413 when the body is too large,
 *   INVALID_REQUEST / 400 when it is not valid JSON.
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const tooLarge = () => new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  const malformed = () => new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);

  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw tooLarge();
  }

  let bodyText;
  if (typeof event.request.text !== "function") {
    // Request objects without text(): parse via json() and re-serialize so the
    // size and syntax checks below still run on a string.
    let fallbackValue;
    try {
      fallbackValue = await event.request.json();
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw malformed();
      }
      throw error;
    }
    bodyText = JSON.stringify(fallbackValue);
  } else {
    bodyText = await event.request.text();
  }

  const actualBytes = Buffer.byteLength(bodyText, "utf8");
  if (actualBytes > bodyLimit) {
    throw tooLarge();
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(bodyText);
  } catch {
    throw malformed();
  }

  const searchEngine = await getEngine();
  const outcome = await searchEngine.search(searchRequest);
  const response = new Response(JSON.stringify(outcome), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
19162
/**
 * Handles one MCP request over the streamable-HTTP transport.
 *
 * A fresh transport (created without a session id generator) and server are
 * built for every request. When `enableJsonResponse` is set, both are closed
 * as soon as the response has been produced; otherwise they are left open.
 *
 * @param {object} event - Request event; `event.request` is handed to the transport.
 * @param {string} [apiKey] - When set, requests must send `Authorization: Bearer <apiKey>`.
 * @param {boolean} enableJsonResponse - Passed to the transport; also decides the eager close.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} The transport's response, a 401 JSON-RPC error
 *   on a bad or missing key, or a JSON-RPC internal error if setup or handling throws.
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Compare fixed-size digests instead of the raw strings: a length
    // pre-check would reveal the key length through timing, and
    // timingSafeEqual requires equally sized buffers.
    const digest = (value) => createHash("sha256").update(value, "utf8").digest();
    if (!timingSafeEqual(digest(token), digest(apiKey))) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }

  const transport = new WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the original error.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    // Keep the HTTP status of domain errors (e.g. 503 when search is not
    // configured) rather than reporting everything as a 500.
    const status = error instanceof SearchSocketError && Number.isInteger(error.status) ? error.status : 500;
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status, headers: { "content-type": "application/json" } }
    );
  }
}
17989
19216
  function buildCorsHeaders(request, config) {
17990
19217
  const allowOrigins = config.api.cors.allowOrigins;
17991
19218
  if (!allowOrigins || allowOrigins.length === 0) {
@@ -17998,7 +19225,7 @@ function buildCorsHeaders(request, config) {
17998
19225
  }
17999
19226
  return {
18000
19227
  "access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
18001
- "access-control-allow-methods": "POST, OPTIONS",
19228
+ "access-control-allow-methods": "GET, POST, OPTIONS",
18002
19229
  "access-control-allow-headers": "content-type"
18003
19230
  };
18004
19231
  }
@@ -18045,6 +19272,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
18045
19272
  if (normalizeText(current.text)) {
18046
19273
  sections.push({
18047
19274
  sectionTitle: current.sectionTitle,
19275
+ headingLevel: current.headingLevel,
18048
19276
  headingPath: current.headingPath,
18049
19277
  text: current.text.trim()
18050
19278
  });
@@ -18063,6 +19291,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
18063
19291
  headingStack.length = level;
18064
19292
  current = {
18065
19293
  sectionTitle: title,
19294
+ headingLevel: level,
18066
19295
  headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
18067
19296
  text: `${line}
18068
19297
  `
@@ -18198,6 +19427,7 @@ function splitSection(section, config) {
18198
19427
  return [
18199
19428
  {
18200
19429
  sectionTitle: section.sectionTitle,
19430
+ headingLevel: section.headingLevel,
18201
19431
  headingPath: section.headingPath,
18202
19432
  chunkText: text
18203
19433
  }
@@ -18248,6 +19478,7 @@ ${chunk}`;
18248
19478
  }
18249
19479
  return merged.map((chunkText) => ({
18250
19480
  sectionTitle: section.sectionTitle,
19481
+ headingLevel: section.headingLevel,
18251
19482
  headingPath: section.headingPath,
18252
19483
  chunkText
18253
19484
  }));
@@ -18263,6 +19494,18 @@ function buildSummaryChunkText(page) {
18263
19494
  }
18264
19495
  return parts.join("\n\n");
18265
19496
  }
19497
/**
 * Builds the heading-aware title string for a chunk.
 *
 * @param {object} chunk - Chunk with `title`, `sectionTitle`, `headingLevel` and `headingPath`.
 * @returns {string|undefined} "<title> — <heading trail>", or undefined when
 *   the chunk has no section title or no heading level.
 */
function buildEmbeddingTitle(chunk) {
  const { sectionTitle, headingLevel } = chunk;
  if (!sectionTitle || headingLevel === void 0) {
    return void 0;
  }

  const { headingPath } = chunk;
  let trail;
  if (headingPath.length > 1) {
    trail = [...headingPath];
    // Append the section's own title only when the path does not already end with it.
    if (headingPath[headingPath.length - 1] !== sectionTitle) {
      trail.push(sectionTitle);
    }
  } else {
    trail = [sectionTitle];
  }
  return `${chunk.title} \u2014 ${trail.join(" > ")}`;
}
18266
19509
  function buildEmbeddingText(chunk, prependTitle) {
18267
19510
  if (!prependTitle) return chunk.chunkText;
18268
19511
  const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
@@ -18293,10 +19536,14 @@ function chunkPage(page, config, scope) {
18293
19536
  tags: page.tags,
18294
19537
  contentHash: "",
18295
19538
  description: page.description,
18296
- keywords: page.keywords
19539
+ keywords: page.keywords,
19540
+ publishedAt: page.publishedAt,
19541
+ incomingAnchorText: page.incomingAnchorText,
19542
+ meta: page.meta
18297
19543
  };
18298
19544
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
18299
- summaryChunk.contentHash = sha256(normalizeText(embeddingText));
19545
+ const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
19546
+ summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
18300
19547
  chunks.push(summaryChunk);
18301
19548
  }
18302
19549
  const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
@@ -18313,6 +19560,7 @@ function chunkPage(page, config, scope) {
18313
19560
  path: page.url,
18314
19561
  title: page.title,
18315
19562
  sectionTitle: entry.sectionTitle,
19563
+ headingLevel: entry.headingLevel,
18316
19564
  headingPath: entry.headingPath,
18317
19565
  chunkText: entry.chunkText,
18318
19566
  snippet: toSnippet(entry.chunkText),
@@ -18322,10 +19570,16 @@ function chunkPage(page, config, scope) {
18322
19570
  tags: page.tags,
18323
19571
  contentHash: "",
18324
19572
  description: page.description,
18325
- keywords: page.keywords
19573
+ keywords: page.keywords,
19574
+ publishedAt: page.publishedAt,
19575
+ incomingAnchorText: page.incomingAnchorText,
19576
+ meta: page.meta
18326
19577
  };
18327
19578
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
18328
- chunk.contentHash = sha256(normalizeText(embeddingText));
19579
+ const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
19580
+ const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
19581
+ const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
19582
+ chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
18329
19583
  chunks.push(chunk);
18330
19584
  }
18331
19585
  return chunks;
@@ -19158,6 +20412,69 @@ function gfm(turndownService) {
19158
20412
  }
19159
20413
 
19160
20414
  // src/indexing/extractor.ts
20415
/**
 * Coerce a date-like value into epoch milliseconds.
 *
 * Date instances contribute their `getTime()`, strings are parsed with the
 * `Date` constructor, and numbers are taken as-is. Any other input, and any
 * result that is not a finite number, yields `undefined`.
 *
 * @param {unknown} value Candidate date value.
 * @returns {number | undefined} Epoch milliseconds, or `undefined`.
 */
function normalizeDateToMs(value) {
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else if (typeof value === "string") {
    ms = new Date(value).getTime();
  } else if (typeof value === "number") {
    ms = value;
  } else {
    // null, undefined, booleans, plain objects, ...
    return undefined;
  }
  return Number.isFinite(ms) ? ms : undefined;
}
20430
// Frontmatter keys probed for a date, in priority order (first valid one wins).
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];
/**
 * Return the first frontmatter date field that normalizes to a timestamp.
 *
 * @param {Record<string, unknown>} data Parsed frontmatter object.
 * @returns {number | undefined} Epoch milliseconds, or `undefined` when no
 *   listed field holds a usable date.
 */
function extractPublishedAtFromFrontmatter(data) {
  let index = 0;
  while (index < FRONTMATTER_DATE_FIELDS.length) {
    const ms = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[index]]);
    if (ms !== undefined) {
      return ms;
    }
    index += 1;
  }
  return undefined;
}
20438
/**
 * Look for a publication timestamp in a parsed HTML document.
 *
 * Sources are tried in this order, and the first valid date wins:
 *   1. JSON-LD `datePublished` (a top-level object, the entries of a top-level
 *      array, or the entries of a top-level object's `@graph` array),
 *   2. `<meta property="article:published_time">`,
 *   3. `itemprop="datePublished"` on a `<meta>` (content) or `<time>` (datetime),
 *   4. the first `<time datetime>` element.
 *
 * @param $ Cheerio-style selector function bound to the document.
 * @returns {number | undefined} Epoch milliseconds, or `undefined` when no
 *   source yields a parseable date.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const candidates = [];
      if (Array.isArray(parsed)) {
        candidates.push(...parsed);
      } else if (parsed && typeof parsed === "object") {
        candidates.push(parsed);
        if (Array.isArray(parsed["@graph"])) {
          candidates.push(...parsed["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // Arrays may legally contain null/primitive entries. Skip them instead
        // of throwing on property access, which would abandon the remaining
        // candidates of this script block.
        if (candidate === null || typeof candidate !== "object") continue;
        const val = normalizeDateToMs(candidate.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Best effort: malformed JSON-LD is ignored and the next source is tried.
    }
  }
  const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
  if (ogTime) {
    const val = normalizeDateToMs(ogTime);
    if (val !== void 0) return val;
  }
  const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
  if (itempropDate) {
    const val = normalizeDateToMs(itempropDate);
    if (val !== void 0) return val;
  }
  const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
  if (timeEl) {
    const val = normalizeDateToMs(timeEl);
    if (val !== void 0) return val;
  }
  return void 0;
}
19161
20478
  function hasTopLevelNoindexComment(markdown) {
19162
20479
  const lines = markdown.split(/\r?\n/);
19163
20480
  let inFence = false;
@@ -19173,6 +20490,97 @@ function hasTopLevelNoindexComment(markdown) {
19173
20490
  }
19174
20491
  return false;
19175
20492
  }
20493
// Generic single-word alt texts that carry no information about the image.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image", "photo", "picture", "icon",
  "logo", "banner", "screenshot", "thumbnail",
  "img", "graphic", "illustration", "spacer",
  "pixel", "placeholder", "avatar", "background"
]);
// Matches text that ends in an image file extension (optionally with a query string).
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
/**
 * Decide whether an `alt` text is descriptive enough to index.
 *
 * Rejected: text shorter than five characters after trimming, text that looks
 * like an image file name, and text that is exactly one of the generic words
 * in GARBAGE_ALT_WORDS (case-insensitive).
 *
 * @param {string} alt Raw alt attribute value.
 * @returns {boolean} `true` when the text should be kept.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  const tooShort = text.length < 5; // also covers the empty string
  const looksLikeFileName = IMAGE_EXT_RE.test(text);
  const isGenericWord = GARBAGE_ALT_WORDS.has(text.toLowerCase());
  return !(tooShort || looksLikeFileName || isGenericWord);
}
20519
/**
 * Pick the best textual stand-in for an image.
 *
 * Priority: the image's own description attribute, then the enclosing
 * `<figure>`'s description attribute, then a meaningful alt text (joined with
 * the figure caption by an em dash when both exist), then the caption alone.
 *
 * @param img Cheerio-style wrapper around the `<img>` element.
 * @param $ Selector function (unused here; kept for the call signature).
 * @param {string} imageDescAttr Name of the description attribute to read.
 * @returns {string | null} The chosen text, or `null` when nothing is usable.
 */
function resolveImageText(img, $, imageDescAttr) {
  const ownDescription = img.attr(imageDescAttr)?.trim();
  if (ownDescription) return ownDescription;

  const figure = img.closest("figure");
  const hasFigure = figure.length > 0;
  if (hasFigure) {
    const figureDescription = figure.attr(imageDescAttr)?.trim();
    if (figureDescription) return figureDescription;
  }

  const alt = img.attr("alt")?.trim() ?? "";
  const caption = hasFigure ? figure.find("figcaption").first().text().trim() : "";
  // A meaningful alt is never empty, so "" doubles as "no usable alt".
  const usableAlt = isMeaningfulAlt(alt) ? alt : "";

  if (usableAlt && caption) {
    return `${usableAlt} \u2014 ${caption}`;
  }
  return usableAlt || caption || null;
}
20540
// Link texts that say nothing about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here", "click", "click here", "read more",
  "link", "this", "more"
]);
/**
 * Normalize link text for use as an anchor phrase.
 *
 * Whitespace runs are collapsed, the text is trimmed and lower-cased. Texts
 * shorter than three characters or listed in STOP_ANCHORS become "". The
 * result is capped at 100 characters.
 *
 * @param {string} raw Raw anchor text.
 * @returns {string} Normalized phrase, or "" when the text is not useful.
 */
function normalizeAnchorText(raw) {
  const text = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const unusable = text.length < 3 || STOP_ANCHORS.has(text);
  if (unusable) {
    return "";
  }
  return text.length > 100 ? text.slice(0, 100) : text;
}
20556
/**
 * Escape `&`, `<` and `>` so the text can be embedded as HTML element content.
 * Quotes are left untouched.
 *
 * @param {string} text Plain text.
 * @returns {string} Text with the three characters replaced by entities.
 */
function escapeHtml(text) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;" };
  return text.replace(/[&<>]/g, (ch) => entities[ch]);
}
20559
/**
 * Replace images under `root2` with their textual description.
 *
 * `<picture>` elements are handled first (using their first inner `<img>`),
 * then the remaining `<img>` elements. An element with usable text is replaced
 * by a `<span>` holding the escaped text, and any `<figcaption>` of its
 * enclosing `<figure>` is removed. An element without usable text is removed.
 *
 * @param root2 Cheerio-style selection to search within (mutated in place).
 * @param $ Selector function used to wrap raw elements.
 * @param {string} imageDescAttr Attribute name passed on to resolveImageText.
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Shared tail of both passes: swap the node for its text or drop it.
  const swapForText = (node, text) => {
    if (!text) {
      node.remove();
      return;
    }
    const figure = node.closest("figure");
    if (figure.length) figure.find("figcaption").remove();
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };

  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    const innerImg = picture.find("img").first();
    const text = innerImg.length ? resolveImageText(innerImg, $, imageDescAttr) : null;
    swapForText(picture, text);
  });

  root2.find("img").each((_i, el) => {
    const img = $(el);
    swapForText(img, resolveImageText(img, $, imageDescAttr));
  });
}
19176
20584
  function extractFromHtml(url, html, config) {
19177
20585
  const $ = load(html);
19178
20586
  const normalizedUrl = normalizeUrlPath(url);
@@ -19198,6 +20606,20 @@ function extractFromHtml(url, html, config) {
19198
20606
  if (weight === 0) {
19199
20607
  return null;
19200
20608
  }
20609
+ if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
20610
+ return null;
20611
+ }
20612
+ const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
20613
+ const meta = {};
20614
+ $('meta[name^="searchsocket:"]').each((_i, el) => {
20615
+ const name = $(el).attr("name") ?? "";
20616
+ const key = name.slice("searchsocket:".length);
20617
+ if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
20618
+ const content = $(el).attr("content") ?? "";
20619
+ const dataType = $(el).attr("data-type") ?? "string";
20620
+ meta[key] = parseMetaValue(content, dataType);
20621
+ });
20622
+ const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
19201
20623
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19202
20624
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19203
20625
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19209,7 +20631,9 @@ function extractFromHtml(url, html, config) {
19209
20631
  root2.find(selector).remove();
19210
20632
  }
19211
20633
  root2.find(`[${config.extract.ignoreAttr}]`).remove();
20634
+ preprocessImages(root2, $, config.extract.imageDescAttr);
19212
20635
  const outgoingLinks = [];
20636
+ const seenLinkKeys = /* @__PURE__ */ new Set();
19213
20637
  root2.find("a[href]").each((_index, node) => {
19214
20638
  const href = $(node).attr("href");
19215
20639
  if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
@@ -19220,7 +20644,19 @@ function extractFromHtml(url, html, config) {
19220
20644
  if (!["http:", "https:"].includes(parsed.protocol)) {
19221
20645
  return;
19222
20646
  }
19223
- outgoingLinks.push(normalizeUrlPath(parsed.pathname));
20647
+ const url2 = normalizeUrlPath(parsed.pathname);
20648
+ let anchorText = normalizeAnchorText($(node).text());
20649
+ if (!anchorText) {
20650
+ const imgAlt = $(node).find("img").first().attr("alt") ?? "";
20651
+ if (isMeaningfulAlt(imgAlt)) {
20652
+ anchorText = normalizeAnchorText(imgAlt);
20653
+ }
20654
+ }
20655
+ const key = `${url2}|${anchorText}`;
20656
+ if (!seenLinkKeys.has(key)) {
20657
+ seenLinkKeys.add(key);
20658
+ outgoingLinks.push({ url: url2, anchorText });
20659
+ }
19224
20660
  } catch {
19225
20661
  }
19226
20662
  });
@@ -19245,16 +20681,25 @@ function extractFromHtml(url, html, config) {
19245
20681
  return null;
19246
20682
  }
19247
20683
  const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
20684
+ const publishedAt = extractPublishedAtFromHtml($);
20685
+ if (componentTags) {
20686
+ const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
20687
+ for (const t of extraTags) {
20688
+ if (!tags.includes(t)) tags.push(t);
20689
+ }
20690
+ }
19248
20691
  return {
19249
20692
  url: normalizeUrlPath(url),
19250
20693
  title,
19251
20694
  markdown,
19252
- outgoingLinks: [...new Set(outgoingLinks)],
20695
+ outgoingLinks,
19253
20696
  noindex: false,
19254
20697
  tags,
19255
20698
  description,
19256
20699
  keywords,
19257
- weight
20700
+ weight,
20701
+ publishedAt,
20702
+ meta: Object.keys(meta).length > 0 ? meta : void 0
19258
20703
  };
19259
20704
  }
19260
20705
  function extractFromMarkdown(url, markdown, title) {
@@ -19275,6 +20720,24 @@ function extractFromMarkdown(url, markdown, title) {
19275
20720
  if (mdWeight === 0) {
19276
20721
  return null;
19277
20722
  }
20723
+ let mdMeta;
20724
+ const rawMeta = searchsocketMeta?.meta;
20725
+ if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
20726
+ const metaObj = {};
20727
+ for (const [key, val] of Object.entries(rawMeta)) {
20728
+ if (!validateMetaKey(key)) continue;
20729
+ if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
20730
+ metaObj[key] = val;
20731
+ } else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
20732
+ metaObj[key] = val;
20733
+ } else if (val instanceof Date) {
20734
+ metaObj[key] = val.getTime();
20735
+ }
20736
+ }
20737
+ if (Object.keys(metaObj).length > 0) {
20738
+ mdMeta = metaObj;
20739
+ }
20740
+ }
19278
20741
  const content = parsed.content;
19279
20742
  const normalized = normalizeMarkdown(content);
19280
20743
  if (!normalizeText(normalized)) {
@@ -19289,6 +20752,7 @@ function extractFromMarkdown(url, markdown, title) {
19289
20752
  fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
19290
20753
  }
19291
20754
  if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
20755
+ const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
19292
20756
  return {
19293
20757
  url: normalizeUrlPath(url),
19294
20758
  title: resolvedTitle,
@@ -19298,7 +20762,9 @@ function extractFromMarkdown(url, markdown, title) {
19298
20762
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19299
20763
  description: fmDescription,
19300
20764
  keywords: fmKeywords,
19301
- weight: mdWeight
20765
+ weight: mdWeight,
20766
+ publishedAt,
20767
+ meta: mdMeta
19302
20768
  };
19303
20769
  }
19304
20770
  function segmentToRegex(segment) {
@@ -19461,7 +20927,7 @@ async function parseManifest(cwd, outputDir) {
19461
20927
  const manifestPath = path.resolve(cwd, outputDir, "server", "manifest-full.js");
19462
20928
  let content;
19463
20929
  try {
19464
- content = await fs3.readFile(manifestPath, "utf8");
20930
+ content = await fs9.readFile(manifestPath, "utf8");
19465
20931
  } catch {
19466
20932
  throw new SearchSocketError(
19467
20933
  "BUILD_MANIFEST_NOT_FOUND",
@@ -19772,6 +21238,125 @@ function filePathToUrl(filePath, baseDir) {
19772
21238
  const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
19773
21239
  return normalizeUrlPath(noExt || "/");
19774
21240
  }
21241
// SvelteKit route files: +page / +layout / +error, optionally with an "@..." suffix.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;
/**
 * Tell whether a path points at a plain Svelte component, i.e. a `.svelte`
 * file that is not a SvelteKit route file.
 *
 * @param {string} filePath Path to test.
 * @returns {boolean}
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
21246
/**
 * Extract documentation metadata from a Svelte component's source text.
 *
 * - `description` comes from the first `<!-- @component ... -->` comment.
 * - `props` come from the first `let { ... } = $props()` destructuring. Each
 *   prop gets its name, its default expression (when one is written), and its
 *   type when the destructuring is annotated with either a locally declared
 *   `interface`/`type` name or an inline object type literal.
 *
 * Rest elements (`...rest`) are ignored.
 *
 * @param {string} source Full text of the `.svelte` file.
 * @returns {{ description: string | undefined, props: Array<{ name: string, type?: string, default?: string }> }}
 */
function extractSvelteComponentMeta(source) {
  const componentMatch = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
  const description = componentMatch?.[1]?.trim() || void 0;
  // The annotation group accepts an inline object type (`: { ... }`) as well as
  // any annotation that does not start with `=`, `;` or `{`. Without the first
  // alternative an inline type could never be captured, and the lazy
  // destructure group would stretch over the type literal instead.
  const propsMatch = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*(\{[\s\S]*?\}|[^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  const props = [];
  if (propsMatch) {
    const destructureBlock = propsMatch[1];
    const typeAnnotation = propsMatch[2]?.trim();
    let resolvedTypeMap;
    if (typeAnnotation && /^[A-Z]\w*$/.test(typeAnnotation)) {
      // Bare identifier: look for a matching interface/type in the same file.
      resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
    } else if (typeAnnotation && typeAnnotation.startsWith("{")) {
      resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
    }
    const propEntries = splitDestructureBlock(destructureBlock);
    for (const entry of propEntries) {
      const trimmed = entry.trim();
      if (!trimmed || trimmed.startsWith("...")) continue;
      let propName;
      let defaultValue;
      // `name: alias` or `name: alias = default` — the external prop name is
      // the part before the colon.
      const renameMatch = trimmed.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/);
      if (renameMatch) {
        propName = renameMatch[1];
        defaultValue = renameMatch[2]?.trim();
      } else {
        const defaultMatch = trimmed.match(/^(\w+)\s*=\s*([\s\S]+)$/);
        if (defaultMatch) {
          propName = defaultMatch[1];
          defaultValue = defaultMatch[2]?.trim();
        } else {
          propName = trimmed.match(/^(\w+)/)?.[1] ?? trimmed;
        }
      }
      const propType = resolvedTypeMap?.get(propName);
      props.push({
        name: propName,
        ...propType ? { type: propType } : {},
        ...defaultValue ? { default: defaultValue } : {}
      });
    }
  }
  return { description, props };
}
21291
/**
 * Split the inside of a destructuring pattern into its top-level entries.
 *
 * Entries are separated by commas that are not nested inside `{}`, `[]` or
 * `()`. Commas and brackets inside string/template literals are treated as
 * plain text, and `//` and block comments are dropped, so neither can split
 * or unbalance an entry. Entries are returned untrimmed; a trailing entry that
 * is only whitespace is omitted.
 *
 * @param {string} block Text between the braces of `let { ... }`.
 * @returns {string[]} The entries, in source order.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let current = "";
  let i = 0;
  while (i < block.length) {
    const ch = block[i];
    const next = block[i + 1];
    if (ch === "/" && next === "/") {
      // Line comment: skip up to (not including) the newline.
      const eol = block.indexOf("\n", i);
      i = eol === -1 ? block.length : eol;
      continue;
    }
    if (ch === "/" && next === "*") {
      const close = block.indexOf("*/", i + 2);
      i = close === -1 ? block.length : close + 2;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      // Copy the whole literal verbatim, honouring backslash escapes.
      let end = i + 1;
      while (end < block.length && block[end] !== ch) {
        end += block[end] === "\\" ? 2 : 1;
      }
      end = Math.min(end + 1, block.length);
      current += block.slice(i, end);
      i = end;
      continue;
    }
    if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      depth--;
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      current += ch;
    }
    i++;
  }
  if (current.trim()) entries.push(current);
  return entries;
}
21312
/**
 * Find `interface <typeName> {` or `type <typeName> = {` in `source` and parse
 * the members of its body.
 *
 * The body ends at the brace that balances the opening one. Braces are counted
 * textually, with no awareness of strings or comments.
 *
 * @param {string} source Text to search.
 * @param {string} typeName Type identifier; interpolated into a RegExp as-is.
 * @returns The result of parseTypeMembers for the body, or `undefined` when
 *   the declaration is not found or its braces never balance.
 */
function resolveTypeReference(source, typeName) {
  const opener = new RegExp(`(?:interface\\s+${typeName}\\s*|type\\s+${typeName}\\s*=\\s*)\\{`).exec(source);
  if (opener === null) return undefined;

  const bodyStart = opener.index + opener[0].length;
  let open = 1;
  for (let pos = bodyStart; pos < source.length; pos += 1) {
    const ch = source[pos];
    if (ch === "{") {
      open += 1;
    } else if (ch === "}") {
      open -= 1;
      if (open === 0) {
        // `pos` is the balancing brace; the body excludes it.
        return parseTypeMembers(source.slice(bodyStart, pos));
      }
    }
  }
  return undefined;
}
21328
/**
 * Parse an inline object type such as `{ a: string; b?: number }`.
 *
 * One leading `{` and one trailing `}` are stripped (when present) and the
 * remainder is handed to parseTypeMembers.
 *
 * @param {string} annotation The type literal text.
 * @returns The result of parseTypeMembers for the inner text.
 */
function parseInlineTypeAnnotation(annotation) {
  let inner = annotation;
  if (inner.startsWith("{")) {
    inner = inner.slice(1);
  }
  if (inner.endsWith("}")) {
    inner = inner.slice(0, -1);
  }
  return parseTypeMembers(inner);
}
21332
/**
 * Parse the body of an object type / interface into a name -> type-text map.
 *
 * Members are separated by `;`, `,` or a newline, but only at the top nesting
 * level: separators inside `{}`, `[]`, `()` or `<>` belong to the member's
 * type. Nested and multi-line object types, function types and generics are
 * therefore kept whole. String literals are copied verbatim, and `//` and
 * block comments are discarded. Line breaks inside a member's type are
 * collapsed to single spaces. Entries that do not look like `name: type` or
 * `name?: type` are skipped.
 *
 * @param {string} body Text between the braces of the type.
 * @returns {Map<string, string>} Member name to type text, in source order.
 */
function parseTypeMembers(body) {
  const members = [];
  let current = "";
  let depth = 0;
  let i = 0;
  while (i < body.length) {
    const ch = body[i];
    const pair = body.slice(i, i + 2);
    if (pair === "//") {
      // Line comment: skip up to (not including) the newline.
      const eol = body.indexOf("\n", i);
      i = eol === -1 ? body.length : eol;
      continue;
    }
    if (pair === "/*") {
      const close = body.indexOf("*/", i + 2);
      i = close === -1 ? body.length : close + 2;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      // String literal type: copy verbatim so its contents are never split.
      let end = i + 1;
      while (end < body.length && body[end] !== ch) {
        end += body[end] === "\\" ? 2 : 1;
      }
      end = Math.min(end + 1, body.length);
      current += body.slice(i, end);
      i = end;
      continue;
    }
    if (ch === "{" || ch === "[" || ch === "(" || ch === "<") {
      depth += 1;
    } else if (ch === "}" || ch === "]" || ch === ")" || (ch === ">" && body[i - 1] !== "=")) {
      // The `>` of an arrow (`=>`) is not a closing angle bracket.
      depth = Math.max(0, depth - 1);
    }
    if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
    } else {
      current += ch;
    }
    i += 1;
  }
  members.push(current);

  const map = /* @__PURE__ */ new Map();
  for (const raw of members) {
    const memberMatch = raw.trim().match(/^(\w+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      map.set(memberMatch[1], memberMatch[2].replace(/\s*\n\s*/g, " ").trim());
    }
  }
  return map;
}
21343
/**
 * Render component metadata as a single line of descriptive text.
 *
 * Output shape: "<Name> component. <description> Props: a (type) default: x, b."
 * Returns "" when there is neither a description nor any prop.
 *
 * @param {string} componentName Component name.
 * @param {{ description?: string, props: Array<{ name: string, type?: string, default?: string }> }} meta
 * @returns {string}
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  const hasProps = props.length > 0;
  if (!description && !hasProps) return "";

  const sentences = [`${componentName} component.`];
  if (description) {
    sentences.push(description);
  }
  if (hasProps) {
    const rendered = props.map(({ name, type, default: fallback }) => {
      const typePart = type ? ` (${type})` : "";
      const defaultPart = fallback ? ` default: ${fallback}` : "";
      return `${name}${typePart}${defaultPart}`;
    });
    sentences.push(`Props: ${rendered.join(", ")}.`);
  }
  return sentences.join(" ");
}
19775
21360
  function normalizeSvelteToMarkdown(source) {
19776
21361
  return source.replace(/<script[\s\S]*?<\/script>/g, "").replace(/<style[\s\S]*?<\/style>/g, "").replace(/<[^>]+>/g, " ").replace(/\{[^}]+\}/g, " ").replace(/\s+/g, " ").trim();
19777
21362
  }
@@ -19790,13 +21375,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
19790
21375
  const selected = typeof limit === "number" ? files.slice(0, limit) : files;
19791
21376
  const pages = [];
19792
21377
  for (const filePath of selected) {
19793
- const raw = await fs3.readFile(filePath, "utf8");
19794
- const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
21378
+ const raw = await fs9.readFile(filePath, "utf8");
21379
+ let markdown;
21380
+ let tags;
21381
+ if (filePath.endsWith(".md")) {
21382
+ markdown = raw;
21383
+ } else if (isSvelteComponentFile(filePath)) {
21384
+ const componentName = path.basename(filePath, ".svelte");
21385
+ const meta = extractSvelteComponentMeta(raw);
21386
+ const componentMarkdown = buildComponentMarkdown(componentName, meta);
21387
+ const templateContent = normalizeSvelteToMarkdown(raw);
21388
+ markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
21389
+ tags = ["component"];
21390
+ } else {
21391
+ markdown = normalizeSvelteToMarkdown(raw);
21392
+ }
19795
21393
  pages.push({
19796
21394
  url: filePathToUrl(filePath, baseDir),
19797
21395
  markdown,
19798
21396
  sourcePath: path.relative(cwd, filePath).replace(/\\/g, "/"),
19799
- outgoingLinks: []
21397
+ outgoingLinks: [],
21398
+ ...tags ? { tags } : {}
19800
21399
  });
19801
21400
  }
19802
21401
  return pages;
@@ -19926,7 +21525,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
19926
21525
  const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
19927
21526
  const pages = [];
19928
21527
  for (const filePath of selected) {
19929
- const html = await fs3.readFile(filePath, "utf8");
21528
+ const html = await fs9.readFile(filePath, "utf8");
19930
21529
  pages.push({
19931
21530
  url: staticHtmlFileToUrl(filePath, outputDir),
19932
21531
  html,
@@ -19989,7 +21588,7 @@ function isBlockedByRobots(urlPath, rules3) {
19989
21588
  }
19990
21589
  async function loadRobotsTxtFromDir(dir) {
19991
21590
  try {
19992
- const content = await fs3.readFile(path.join(dir, "robots.txt"), "utf8");
21591
+ const content = await fs9.readFile(path.join(dir, "robots.txt"), "utf8");
19993
21592
  return parseRobotsTxt(content);
19994
21593
  } catch {
19995
21594
  return null;
@@ -20006,6 +21605,81 @@ async function fetchRobotsTxt(baseUrl) {
20006
21605
  return null;
20007
21606
  }
20008
21607
  }
21608
/**
 * Resolve a page URL against an optional base URL.
 *
 * @param {string} pageUrl Page URL or path.
 * @param {string | undefined} baseUrl Base to resolve against.
 * @returns {string} The absolute `href`, or `pageUrl` unchanged when there is
 *   no base or the combination cannot be parsed as a URL.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  let resolved = pageUrl;
  if (baseUrl) {
    try {
      resolved = new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable base/URL combination: fall back to the raw page URL.
    }
  }
  return resolved;
}
21616
/**
 * Build the contents of an `llms.txt` index.
 *
 * Layout: an `# <title>` heading (the configured llmsTxt title, else the
 * project id), an optional `> description` block quote, then a `## Pages`
 * list with one markdown link per page. The pages `/llms.txt` and
 * `/llms-full.txt` are excluded. Pages are ordered by ascending depth, then by
 * descending incoming-link count.
 *
 * @param {Array<object>} pages Pages; reads url, title, description, depth, incomingLinks.
 * @param {object} config Reads `llmsTxt.title`, `llmsTxt.description`, `project.id`, `project.baseUrl`.
 * @returns {string} File contents ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const blurb = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;

  const out = [`# ${heading}`];
  if (blurb) {
    out.push("", `> ${blurb}`);
  }

  const ordered = pages.filter((p) => !(p.url === "/llms.txt" || p.url === "/llms-full.txt"));
  ordered.sort((a, b) => {
    if (a.depth !== b.depth) return a.depth - b.depth;
    return b.incomingLinks - a.incomingLinks;
  });

  if (ordered.length > 0) {
    out.push("", "## Pages", "");
    for (const page of ordered) {
      const url = resolvePageUrl(page.url, baseUrl);
      const link = `- [${page.title}](${url})`;
      out.push(page.description ? `${link}: ${page.description}` : link);
    }
  }

  out.push("");
  return out.join("\n");
}
21645
/**
 * Build the contents of `llms-full.txt`: the same heading and optional block
 * quote as the index, followed by every page's trimmed markdown under a
 * `## [title](url)` heading, with `---` between sections.
 *
 * The pages `/llms.txt` and `/llms-full.txt` are excluded. Pages are ordered
 * by ascending depth, then by descending incoming-link count.
 *
 * @param {Array<object>} pages Pages; reads url, title, markdown, depth, incomingLinks.
 * @param {object} config Reads `llmsTxt.title`, `llmsTxt.description`, `project.id`, `project.baseUrl`.
 * @returns {string} File contents ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const blurb = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;

  const out = [`# ${heading}`];
  if (blurb) {
    out.push("", `> ${blurb}`);
  }

  const ordered = pages.filter((p) => !(p.url === "/llms.txt" || p.url === "/llms-full.txt"));
  ordered.sort((a, b) => {
    if (a.depth !== b.depth) return a.depth - b.depth;
    return b.incomingLinks - a.incomingLinks;
  });

  for (const page of ordered) {
    const url = resolvePageUrl(page.url, baseUrl);
    out.push("", "---", "", `## [${page.title}](${url})`, "", page.markdown.trim());
  }

  out.push("");
  return out.join("\n");
}
21668
/**
 * Write the llms.txt index and, when `config.llmsTxt.generateFull` is set, the
 * companion "-full" file next to it.
 *
 * The companion path is derived by inserting `-full` before the output file's
 * extension (`llms.txt` -> `llms-full.txt`). It therefore always differs from
 * the index path, even when the configured output does not end in `.txt`, so
 * the full file can never overwrite the index.
 *
 * @param {Array<object>} pages Pages passed through to the generators.
 * @param {object} config Reads `llmsTxt.outputPath` and `llmsTxt.generateFull`.
 * @param {string} cwd Directory that `outputPath` is resolved against.
 * @param logger3 Logger exposing `info(message)`.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path.dirname(outputPath);
  await fs9.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs9.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (config.llmsTxt.generateFull) {
    const { dir, name, ext } = path.parse(outputPath);
    const fullPath = path.join(dir, `${name}-full${ext}`);
    const fullContent = generateLlmsFullTxt(pages, config);
    await fs9.writeFile(fullPath, fullContent, "utf8");
    const relativeFull = path.relative(cwd, fullPath);
    logger3.info(`Generated llms-full.txt at ${relativeFull}`);
  }
}
20009
21683
 
20010
21684
  // src/indexing/pipeline.ts
20011
21685
  function buildPageSummary(page, maxChars = 3500) {
@@ -20024,16 +21698,33 @@ function buildPageSummary(page, maxChars = 3500) {
20024
21698
  if (joined.length <= maxChars) return joined;
20025
21699
  return joined.slice(0, maxChars).trim();
20026
21700
  }
21701
/**
 * Compute a hash over the page fields listed below, so that a change to any
 * of them produces a different value.
 *
 * The fields are joined with "|" in a fixed order. List-valued fields are
 * sorted first so ordering alone does not change the hash, and `meta` is
 * serialized with its keys in sorted order.
 *
 * @param {object} page Reads title, description, keywords, tags, markdown,
 *   outgoingLinks, publishedAt, incomingAnchorText, outgoingLinkUrls, meta.
 * @returns The value returned by `sha256` for the joined string.
 */
function buildPageContentHash(page) {
  const sortedCsv = (list) => list.slice().sort().join(",");
  const metaJson = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";

  const fingerprint = [
    page.title,
    page.description ?? "",
    sortedCsv(page.keywords ?? []),
    sortedCsv(page.tags),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    sortedCsv(page.outgoingLinkUrls ?? []),
    metaJson
  ].join("|");

  return sha256(fingerprint);
}
20027
21716
  var IndexPipeline = class _IndexPipeline {
20028
21717
  cwd;
20029
21718
  config;
20030
21719
  store;
20031
21720
  logger;
21721
+ hooks;
20032
21722
  constructor(options) {
20033
21723
  this.cwd = options.cwd;
20034
21724
  this.config = options.config;
20035
21725
  this.store = options.store;
20036
21726
  this.logger = options.logger;
21727
+ this.hooks = options.hooks;
20037
21728
  }
20038
21729
  static async create(options = {}) {
20039
21730
  const cwd = path.resolve(options.cwd ?? process.cwd());
@@ -20043,7 +21734,8 @@ var IndexPipeline = class _IndexPipeline {
20043
21734
  cwd,
20044
21735
  config,
20045
21736
  store,
20046
- logger: options.logger ?? new Logger()
21737
+ logger: options.logger ?? new Logger(),
21738
+ hooks: options.hooks ?? {}
20047
21739
  });
20048
21740
  }
20049
21741
  getConfig() {
@@ -20064,7 +21756,7 @@ var IndexPipeline = class _IndexPipeline {
20064
21756
  const scope = resolveScope(this.config, options.scopeOverride);
20065
21757
  ensureStateDirs(this.cwd, this.config.state.dir);
20066
21758
  const sourceMode = options.sourceOverride ?? this.config.source.mode;
20067
- this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
21759
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
20068
21760
  if (options.force) {
20069
21761
  this.logger.info("Force mode enabled \u2014 full rebuild");
20070
21762
  }
@@ -20073,8 +21765,9 @@ var IndexPipeline = class _IndexPipeline {
20073
21765
  }
20074
21766
  const manifestStart = stageStart();
20075
21767
  const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
21768
+ const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
20076
21769
  stageEnd("manifest", manifestStart);
20077
- this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
21770
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes, ${existingPageHashes.size} existing page hashes loaded`);
20078
21771
  const sourceStart = stageStart();
20079
21772
  this.logger.info(`Loading pages (source: ${sourceMode})...`);
20080
21773
  let sourcePages;
@@ -20151,11 +21844,61 @@ var IndexPipeline = class _IndexPipeline {
20151
21844
  );
20152
21845
  continue;
20153
21846
  }
20154
- extractedPages.push(extracted);
21847
+ if (sourcePage.tags && sourcePage.tags.length > 0) {
21848
+ extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
21849
+ }
21850
+ let accepted;
21851
+ if (this.hooks.transformPage) {
21852
+ const transformed = await this.hooks.transformPage(extracted);
21853
+ if (transformed === null) {
21854
+ this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
21855
+ continue;
21856
+ }
21857
+ accepted = transformed;
21858
+ } else {
21859
+ accepted = extracted;
21860
+ }
21861
+ extractedPages.push(accepted);
20155
21862
  this.logger.event("page_extracted", {
20156
- url: extracted.url
21863
+ url: accepted.url
20157
21864
  });
20158
21865
  }
21866
+ const customRecords = options.customRecords ?? [];
21867
+ if (customRecords.length > 0) {
21868
+ this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
21869
+ for (const record of customRecords) {
21870
+ const normalizedUrl = normalizeUrlPath(record.url);
21871
+ const normalized = normalizeMarkdown(record.content);
21872
+ if (!normalized.trim()) {
21873
+ this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
21874
+ continue;
21875
+ }
21876
+ const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
21877
+ const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
21878
+ const extracted = {
21879
+ url: normalizedUrl,
21880
+ title: record.title,
21881
+ markdown: normalized,
21882
+ outgoingLinks: [],
21883
+ noindex: false,
21884
+ tags,
21885
+ weight: record.weight
21886
+ };
21887
+ let accepted;
21888
+ if (this.hooks.transformPage) {
21889
+ const transformed = await this.hooks.transformPage(extracted);
21890
+ if (transformed === null) {
21891
+ this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
21892
+ continue;
21893
+ }
21894
+ accepted = transformed;
21895
+ } else {
21896
+ accepted = extracted;
21897
+ }
21898
+ extractedPages.push(accepted);
21899
+ this.logger.event("page_extracted", { url: accepted.url, custom: true });
21900
+ }
21901
+ }
20159
21902
  extractedPages.sort((a, b) => a.url.localeCompare(b.url));
20160
21903
  const uniquePages = [];
20161
21904
  const seenUrls = /* @__PURE__ */ new Set();
@@ -20188,15 +21931,28 @@ var IndexPipeline = class _IndexPipeline {
20188
21931
  const linkStart = stageStart();
20189
21932
  const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20190
21933
  const incomingLinkCount = /* @__PURE__ */ new Map();
21934
+ const incomingAnchorTexts = /* @__PURE__ */ new Map();
20191
21935
  for (const page of indexablePages) {
20192
21936
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20193
21937
  }
20194
21938
  for (const page of indexablePages) {
20195
- for (const outgoing of page.outgoingLinks) {
21939
+ const seenForCount = /* @__PURE__ */ new Set();
21940
+ const seenForAnchor = /* @__PURE__ */ new Set();
21941
+ for (const { url: outgoing, anchorText } of page.outgoingLinks) {
20196
21942
  if (!pageSet.has(outgoing)) {
20197
21943
  continue;
20198
21944
  }
20199
- incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
21945
+ if (!seenForCount.has(outgoing)) {
21946
+ seenForCount.add(outgoing);
21947
+ incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
21948
+ }
21949
+ if (anchorText && !seenForAnchor.has(outgoing)) {
21950
+ seenForAnchor.add(outgoing);
21951
+ if (!incomingAnchorTexts.has(outgoing)) {
21952
+ incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
21953
+ }
21954
+ incomingAnchorTexts.get(outgoing).add(anchorText);
21955
+ }
20200
21956
  }
20201
21957
  }
20202
21958
  stageEnd("links", linkStart);
@@ -20215,6 +21971,15 @@ var IndexPipeline = class _IndexPipeline {
20215
21971
  });
20216
21972
  }
20217
21973
  }
21974
+ for (const record of customRecords) {
21975
+ const normalizedUrl = normalizeUrlPath(record.url);
21976
+ if (!precomputedRoutes.has(normalizedUrl)) {
21977
+ precomputedRoutes.set(normalizedUrl, {
21978
+ routeFile: "",
21979
+ routeResolution: "exact"
21980
+ });
21981
+ }
21982
+ }
20218
21983
  for (const page of indexablePages) {
20219
21984
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20220
21985
  if (routeMatch.routeResolution === "best-effort") {
@@ -20232,6 +21997,17 @@ var IndexPipeline = class _IndexPipeline {
20232
21997
  } else {
20233
21998
  routeExact += 1;
20234
21999
  }
22000
+ const anchorSet = incomingAnchorTexts.get(page.url);
22001
+ let incomingAnchorText;
22002
+ if (anchorSet && anchorSet.size > 0) {
22003
+ let joined = "";
22004
+ for (const phrase of anchorSet) {
22005
+ const next2 = joined ? `${joined} ${phrase}` : phrase;
22006
+ if (next2.length > 500) break;
22007
+ joined = next2;
22008
+ }
22009
+ incomingAnchorText = joined || void 0;
22010
+ }
20235
22011
  const indexedPage = {
20236
22012
  url: page.url,
20237
22013
  title: page.title,
@@ -20241,40 +22017,113 @@ var IndexPipeline = class _IndexPipeline {
20241
22017
  generatedAt: nowIso(),
20242
22018
  incomingLinks: incomingLinkCount.get(page.url) ?? 0,
20243
22019
  outgoingLinks: page.outgoingLinks.length,
22020
+ outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
20244
22021
  depth: getUrlDepth(page.url),
20245
22022
  tags: page.tags,
20246
22023
  markdown: page.markdown,
20247
22024
  description: page.description,
20248
- keywords: page.keywords
22025
+ keywords: page.keywords,
22026
+ publishedAt: page.publishedAt,
22027
+ incomingAnchorText,
22028
+ meta: page.meta
20249
22029
  };
20250
22030
  pages.push(indexedPage);
20251
22031
  this.logger.event("page_indexed", { url: page.url });
20252
22032
  }
22033
+ const pageRecords = pages.map((p) => {
22034
+ const summary = buildPageSummary(p);
22035
+ return {
22036
+ url: p.url,
22037
+ title: p.title,
22038
+ markdown: p.markdown,
22039
+ projectId: scope.projectId,
22040
+ scopeName: scope.scopeName,
22041
+ routeFile: p.routeFile,
22042
+ routeResolution: p.routeResolution,
22043
+ incomingLinks: p.incomingLinks,
22044
+ outgoingLinks: p.outgoingLinks,
22045
+ outgoingLinkUrls: p.outgoingLinkUrls,
22046
+ depth: p.depth,
22047
+ tags: p.tags,
22048
+ indexedAt: p.generatedAt,
22049
+ summary,
22050
+ description: p.description,
22051
+ keywords: p.keywords,
22052
+ contentHash: buildPageContentHash(p),
22053
+ publishedAt: p.publishedAt,
22054
+ meta: p.meta
22055
+ };
22056
+ });
22057
+ const currentPageUrls = new Set(pageRecords.map((r) => r.url));
22058
+ const changedPages = pageRecords.filter(
22059
+ (r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
22060
+ );
22061
+ const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
20253
22062
  if (!options.dryRun) {
20254
- const pageRecords = pages.map((p) => {
20255
- const summary = buildPageSummary(p);
20256
- return {
20257
- url: p.url,
20258
- title: p.title,
20259
- markdown: p.markdown,
20260
- projectId: scope.projectId,
20261
- scopeName: scope.scopeName,
20262
- routeFile: p.routeFile,
20263
- routeResolution: p.routeResolution,
20264
- incomingLinks: p.incomingLinks,
20265
- outgoingLinks: p.outgoingLinks,
20266
- depth: p.depth,
20267
- tags: p.tags,
20268
- indexedAt: p.generatedAt,
20269
- summary,
20270
- description: p.description,
20271
- keywords: p.keywords
20272
- };
20273
- });
20274
- await this.store.deletePages(scope);
20275
- await this.store.upsertPages(pageRecords, scope);
22063
+ if (options.force) {
22064
+ await this.store.deletePages(scope);
22065
+ this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
22066
+ const pageDocs = pageRecords.map((r) => ({
22067
+ id: r.url,
22068
+ data: r.summary ?? r.title,
22069
+ metadata: {
22070
+ title: r.title,
22071
+ url: r.url,
22072
+ description: r.description ?? "",
22073
+ keywords: r.keywords ?? [],
22074
+ summary: r.summary ?? "",
22075
+ tags: r.tags,
22076
+ markdown: r.markdown,
22077
+ routeFile: r.routeFile,
22078
+ routeResolution: r.routeResolution,
22079
+ incomingLinks: r.incomingLinks,
22080
+ outgoingLinks: r.outgoingLinks,
22081
+ outgoingLinkUrls: r.outgoingLinkUrls ?? [],
22082
+ depth: r.depth,
22083
+ indexedAt: r.indexedAt,
22084
+ contentHash: r.contentHash ?? "",
22085
+ publishedAt: r.publishedAt ?? null,
22086
+ ...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
22087
+ }
22088
+ }));
22089
+ await this.store.upsertPages(pageDocs, scope);
22090
+ } else {
22091
+ if (changedPages.length > 0) {
22092
+ this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
22093
+ const pageDocs = changedPages.map((r) => ({
22094
+ id: r.url,
22095
+ data: r.summary ?? r.title,
22096
+ metadata: {
22097
+ title: r.title,
22098
+ url: r.url,
22099
+ description: r.description ?? "",
22100
+ keywords: r.keywords ?? [],
22101
+ summary: r.summary ?? "",
22102
+ tags: r.tags,
22103
+ markdown: r.markdown,
22104
+ routeFile: r.routeFile,
22105
+ routeResolution: r.routeResolution,
22106
+ incomingLinks: r.incomingLinks,
22107
+ outgoingLinks: r.outgoingLinks,
22108
+ outgoingLinkUrls: r.outgoingLinkUrls ?? [],
22109
+ depth: r.depth,
22110
+ indexedAt: r.indexedAt,
22111
+ contentHash: r.contentHash ?? "",
22112
+ publishedAt: r.publishedAt ?? null,
22113
+ ...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
22114
+ }
22115
+ }));
22116
+ await this.store.upsertPages(pageDocs, scope);
22117
+ }
22118
+ if (deletedPageUrls.length > 0) {
22119
+ await this.store.deletePagesByIds(deletedPageUrls, scope);
22120
+ }
22121
+ }
20276
22122
  }
22123
+ const pagesChanged = options.force ? pageRecords.length : changedPages.length;
22124
+ const pagesDeleted = deletedPageUrls.length;
20277
22125
  stageEnd("pages", pagesStart);
22126
+ this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
20278
22127
  this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
20279
22128
  const chunkStart = stageStart();
20280
22129
  this.logger.info("Chunking pages...");
@@ -20283,6 +22132,18 @@ var IndexPipeline = class _IndexPipeline {
20283
22132
  if (typeof maxChunks === "number") {
20284
22133
  chunks = chunks.slice(0, maxChunks);
20285
22134
  }
22135
+ if (this.hooks.transformChunk) {
22136
+ const transformed = [];
22137
+ for (const chunk of chunks) {
22138
+ const result = await this.hooks.transformChunk(chunk);
22139
+ if (result === null) {
22140
+ this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
22141
+ continue;
22142
+ }
22143
+ transformed.push(result);
22144
+ }
22145
+ chunks = transformed;
22146
+ }
20286
22147
  for (const chunk of chunks) {
20287
22148
  this.logger.event("chunked", {
20288
22149
  url: chunk.url,
@@ -20295,7 +22156,7 @@ var IndexPipeline = class _IndexPipeline {
20295
22156
  for (const chunk of chunks) {
20296
22157
  currentChunkMap.set(chunk.chunkKey, chunk);
20297
22158
  }
20298
- const changedChunks = chunks.filter((chunk) => {
22159
+ let changedChunks = chunks.filter((chunk) => {
20299
22160
  if (options.force) {
20300
22161
  return true;
20301
22162
  }
@@ -20309,36 +22170,43 @@ var IndexPipeline = class _IndexPipeline {
20309
22170
  return existingHash !== chunk.contentHash;
20310
22171
  });
20311
22172
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
22173
+ if (this.hooks.beforeIndex) {
22174
+ changedChunks = await this.hooks.beforeIndex(changedChunks);
22175
+ }
20312
22176
  this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
20313
22177
  const upsertStart = stageStart();
20314
22178
  let documentsUpserted = 0;
20315
22179
  if (!options.dryRun && changedChunks.length > 0) {
20316
- this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
20317
- const UPSTASH_CONTENT_LIMIT = 4096;
22180
+ this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
20318
22181
  const docs = changedChunks.map((chunk) => {
20319
- const title = chunk.title;
20320
- const sectionTitle = chunk.sectionTitle ?? "";
20321
- const url = chunk.url;
20322
- const tags = chunk.tags.join(",");
20323
- const headingPath = chunk.headingPath.join(" > ");
20324
- const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
20325
- const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
20326
- const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
22182
+ const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
22183
+ if (embeddingText.length > 2e3) {
22184
+ this.logger.warn(
22185
+ `Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
22186
+ );
22187
+ }
20327
22188
  return {
20328
22189
  id: chunk.chunkKey,
20329
- content: { title, sectionTitle, text, url, tags, headingPath },
22190
+ data: embeddingText,
20330
22191
  metadata: {
20331
- projectId: scope.projectId,
20332
- scopeName: scope.scopeName,
22192
+ url: chunk.url,
20333
22193
  path: chunk.path,
22194
+ title: chunk.title,
22195
+ sectionTitle: chunk.sectionTitle ?? "",
22196
+ headingPath: chunk.headingPath.join(" > "),
20334
22197
  snippet: chunk.snippet,
22198
+ chunkText: embeddingText,
22199
+ tags: chunk.tags,
20335
22200
  ordinal: chunk.ordinal,
20336
22201
  contentHash: chunk.contentHash,
20337
22202
  depth: chunk.depth,
20338
22203
  incomingLinks: chunk.incomingLinks,
20339
22204
  routeFile: chunk.routeFile,
20340
22205
  description: chunk.description ?? "",
20341
- keywords: (chunk.keywords ?? []).join(",")
22206
+ keywords: chunk.keywords ?? [],
22207
+ publishedAt: chunk.publishedAt ?? null,
22208
+ incomingAnchorText: chunk.incomingAnchorText ?? "",
22209
+ ...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
20342
22210
  }
20343
22211
  };
20344
22212
  });
@@ -20356,9 +22224,16 @@ var IndexPipeline = class _IndexPipeline {
20356
22224
  } else {
20357
22225
  this.logger.info("No chunks to upsert \u2014 all up to date");
20358
22226
  }
22227
+ if (this.config.llmsTxt.enable && !options.dryRun) {
22228
+ const llmsStart = stageStart();
22229
+ await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
22230
+ stageEnd("llms_txt", llmsStart);
22231
+ }
20359
22232
  this.logger.info("Done.");
20360
- return {
22233
+ const stats = {
20361
22234
  pagesProcessed: pages.length,
22235
+ pagesChanged,
22236
+ pagesDeleted,
20362
22237
  chunksTotal: chunks.length,
20363
22238
  chunksChanged: changedChunks.length,
20364
22239
  documentsUpserted,
@@ -20367,6 +22242,10 @@ var IndexPipeline = class _IndexPipeline {
20367
22242
  routeBestEffort,
20368
22243
  stageTimingsMs
20369
22244
  };
22245
+ if (this.hooks.afterIndex) {
22246
+ await this.hooks.afterIndex(stats);
22247
+ }
22248
+ return stats;
20370
22249
  }
20371
22250
  };
20372
22251
 
@@ -20388,9 +22267,6 @@ function shouldRunAutoIndex(options) {
20388
22267
  if (explicit && /^(1|true|yes)$/i.test(explicit)) {
20389
22268
  return true;
20390
22269
  }
20391
- if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
20392
- return true;
20393
- }
20394
22270
  return false;
20395
22271
  }
20396
22272
  function searchsocketVitePlugin(options = {}) {
@@ -20415,7 +22291,8 @@ function searchsocketVitePlugin(options = {}) {
20415
22291
  const pipeline = await IndexPipeline.create({
20416
22292
  cwd,
20417
22293
  configPath: options.configPath,
20418
- logger: logger3
22294
+ logger: logger3,
22295
+ hooks: options.hooks
20419
22296
  });
20420
22297
  const stats = await pipeline.run({
20421
22298
  changedOnly: options.changedOnly ?? true,