searchsocket 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3335 -492
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2378 -475
- package/dist/index.d.cts +113 -40
- package/dist/index.d.ts +113 -40
- package/dist/index.js +2378 -475
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2430 -494
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2416 -480
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +28 -11
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/index.cjs
CHANGED
|
@@ -5,18 +5,20 @@ var path = require('path');
|
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
7
|
var child_process = require('child_process');
|
|
8
|
+
var vector = require('@upstash/vector');
|
|
8
9
|
var crypto = require('crypto');
|
|
9
10
|
var cheerio = require('cheerio');
|
|
10
11
|
var matter = require('gray-matter');
|
|
11
12
|
var fg = require('fast-glob');
|
|
12
13
|
var pLimit = require('p-limit');
|
|
13
|
-
var
|
|
14
|
+
var fs8 = require('fs/promises');
|
|
14
15
|
var net = require('net');
|
|
15
16
|
var zlib = require('zlib');
|
|
16
17
|
var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
|
|
17
18
|
var stdio_js = require('@modelcontextprotocol/sdk/server/stdio.js');
|
|
18
19
|
var streamableHttp_js = require('@modelcontextprotocol/sdk/server/streamableHttp.js');
|
|
19
20
|
var express_js = require('@modelcontextprotocol/sdk/server/express.js');
|
|
21
|
+
var webStandardStreamableHttp_js = require('@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js');
|
|
20
22
|
|
|
21
23
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
22
24
|
|
|
@@ -25,7 +27,7 @@ var path__default = /*#__PURE__*/_interopDefault(path);
|
|
|
25
27
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
26
28
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
27
29
|
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
28
|
-
var
|
|
30
|
+
var fs8__default = /*#__PURE__*/_interopDefault(fs8);
|
|
29
31
|
var net__default = /*#__PURE__*/_interopDefault(net);
|
|
30
32
|
|
|
31
33
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
@@ -5025,32 +5027,32 @@ var require_URL = __commonJS({
|
|
|
5025
5027
|
else
|
|
5026
5028
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5027
5029
|
}
|
|
5028
|
-
function remove_dot_segments(
|
|
5029
|
-
if (!
|
|
5030
|
+
function remove_dot_segments(path14) {
|
|
5031
|
+
if (!path14) return path14;
|
|
5030
5032
|
var output = "";
|
|
5031
|
-
while (
|
|
5032
|
-
if (
|
|
5033
|
-
|
|
5033
|
+
while (path14.length > 0) {
|
|
5034
|
+
if (path14 === "." || path14 === "..") {
|
|
5035
|
+
path14 = "";
|
|
5034
5036
|
break;
|
|
5035
5037
|
}
|
|
5036
|
-
var twochars =
|
|
5037
|
-
var threechars =
|
|
5038
|
-
var fourchars =
|
|
5038
|
+
var twochars = path14.substring(0, 2);
|
|
5039
|
+
var threechars = path14.substring(0, 3);
|
|
5040
|
+
var fourchars = path14.substring(0, 4);
|
|
5039
5041
|
if (threechars === "../") {
|
|
5040
|
-
|
|
5042
|
+
path14 = path14.substring(3);
|
|
5041
5043
|
} else if (twochars === "./") {
|
|
5042
|
-
|
|
5044
|
+
path14 = path14.substring(2);
|
|
5043
5045
|
} else if (threechars === "/./") {
|
|
5044
|
-
|
|
5045
|
-
} else if (twochars === "/." &&
|
|
5046
|
-
|
|
5047
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5048
|
-
|
|
5046
|
+
path14 = "/" + path14.substring(3);
|
|
5047
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5048
|
+
path14 = "/";
|
|
5049
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5050
|
+
path14 = "/" + path14.substring(4);
|
|
5049
5051
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5050
5052
|
} else {
|
|
5051
|
-
var segment =
|
|
5053
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5052
5054
|
output += segment;
|
|
5053
|
-
|
|
5055
|
+
path14 = path14.substring(segment.length);
|
|
5054
5056
|
}
|
|
5055
5057
|
}
|
|
5056
5058
|
return output;
|
|
@@ -16646,6 +16648,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16646
16648
|
dropSelectors: zod.z.array(zod.z.string()).optional(),
|
|
16647
16649
|
ignoreAttr: zod.z.string().optional(),
|
|
16648
16650
|
noindexAttr: zod.z.string().optional(),
|
|
16651
|
+
imageDescAttr: zod.z.string().optional(),
|
|
16649
16652
|
respectRobotsNoindex: zod.z.boolean().optional()
|
|
16650
16653
|
}).optional(),
|
|
16651
16654
|
transform: zod.z.object({
|
|
@@ -16661,35 +16664,48 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16661
16664
|
headingPathDepth: zod.z.number().int().positive().optional(),
|
|
16662
16665
|
dontSplitInside: zod.z.array(zod.z.enum(["code", "table", "blockquote"])).optional(),
|
|
16663
16666
|
prependTitle: zod.z.boolean().optional(),
|
|
16664
|
-
pageSummaryChunk: zod.z.boolean().optional()
|
|
16667
|
+
pageSummaryChunk: zod.z.boolean().optional(),
|
|
16668
|
+
weightHeadings: zod.z.boolean().optional()
|
|
16665
16669
|
}).optional(),
|
|
16666
16670
|
upstash: zod.z.object({
|
|
16667
16671
|
url: zod.z.string().url().optional(),
|
|
16668
16672
|
token: zod.z.string().min(1).optional(),
|
|
16669
16673
|
urlEnv: zod.z.string().min(1).optional(),
|
|
16670
|
-
tokenEnv: zod.z.string().min(1).optional()
|
|
16674
|
+
tokenEnv: zod.z.string().min(1).optional(),
|
|
16675
|
+
namespaces: zod.z.object({
|
|
16676
|
+
pages: zod.z.string().min(1).optional(),
|
|
16677
|
+
chunks: zod.z.string().min(1).optional()
|
|
16678
|
+
}).optional()
|
|
16679
|
+
}).optional(),
|
|
16680
|
+
embedding: zod.z.object({
|
|
16681
|
+
model: zod.z.string().optional(),
|
|
16682
|
+
dimensions: zod.z.number().int().positive().optional(),
|
|
16683
|
+
taskType: zod.z.string().optional(),
|
|
16684
|
+
batchSize: zod.z.number().int().positive().optional()
|
|
16671
16685
|
}).optional(),
|
|
16672
16686
|
search: zod.z.object({
|
|
16673
|
-
semanticWeight: zod.z.number().min(0).max(1).optional(),
|
|
16674
|
-
inputEnrichment: zod.z.boolean().optional(),
|
|
16675
|
-
reranking: zod.z.boolean().optional(),
|
|
16676
16687
|
dualSearch: zod.z.boolean().optional(),
|
|
16677
16688
|
pageSearchWeight: zod.z.number().min(0).max(1).optional()
|
|
16678
16689
|
}).optional(),
|
|
16679
16690
|
ranking: zod.z.object({
|
|
16680
16691
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16681
16692
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16693
|
+
enableFreshnessBoost: zod.z.boolean().optional(),
|
|
16694
|
+
freshnessDecayRate: zod.z.number().positive().optional(),
|
|
16695
|
+
enableAnchorTextBoost: zod.z.boolean().optional(),
|
|
16682
16696
|
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16683
16697
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16684
16698
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16685
16699
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16686
|
-
|
|
16700
|
+
minScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16687
16701
|
scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
|
|
16688
16702
|
weights: zod.z.object({
|
|
16689
16703
|
incomingLinks: zod.z.number().optional(),
|
|
16690
16704
|
depth: zod.z.number().optional(),
|
|
16691
16705
|
aggregation: zod.z.number().optional(),
|
|
16692
|
-
titleMatch: zod.z.number().optional()
|
|
16706
|
+
titleMatch: zod.z.number().optional(),
|
|
16707
|
+
freshness: zod.z.number().optional(),
|
|
16708
|
+
anchorText: zod.z.number().optional()
|
|
16693
16709
|
}).optional()
|
|
16694
16710
|
}).optional(),
|
|
16695
16711
|
api: zod.z.object({
|
|
@@ -16704,12 +16720,28 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16704
16720
|
}).optional(),
|
|
16705
16721
|
mcp: zod.z.object({
|
|
16706
16722
|
enable: zod.z.boolean().optional(),
|
|
16723
|
+
access: zod.z.enum(["public", "private"]).optional(),
|
|
16707
16724
|
transport: zod.z.enum(["stdio", "http"]).optional(),
|
|
16708
16725
|
http: zod.z.object({
|
|
16709
16726
|
port: zod.z.number().int().positive().optional(),
|
|
16710
|
-
path: zod.z.string().optional()
|
|
16727
|
+
path: zod.z.string().optional(),
|
|
16728
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16729
|
+
apiKeyEnv: zod.z.string().min(1).optional()
|
|
16730
|
+
}).optional(),
|
|
16731
|
+
handle: zod.z.object({
|
|
16732
|
+
path: zod.z.string().optional(),
|
|
16733
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16734
|
+
enableJsonResponse: zod.z.boolean().optional()
|
|
16711
16735
|
}).optional()
|
|
16712
16736
|
}).optional(),
|
|
16737
|
+
llmsTxt: zod.z.object({
|
|
16738
|
+
enable: zod.z.boolean().optional(),
|
|
16739
|
+
outputPath: zod.z.string().optional(),
|
|
16740
|
+
title: zod.z.string().optional(),
|
|
16741
|
+
description: zod.z.string().optional(),
|
|
16742
|
+
generateFull: zod.z.boolean().optional(),
|
|
16743
|
+
serveMarkdownVariants: zod.z.boolean().optional()
|
|
16744
|
+
}).optional(),
|
|
16713
16745
|
state: zod.z.object({
|
|
16714
16746
|
dir: zod.z.string().optional()
|
|
16715
16747
|
}).optional()
|
|
@@ -16748,6 +16780,7 @@ function createDefaultConfig(projectId) {
|
|
|
16748
16780
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16749
16781
|
ignoreAttr: "data-search-ignore",
|
|
16750
16782
|
noindexAttr: "data-search-noindex",
|
|
16783
|
+
imageDescAttr: "data-search-description",
|
|
16751
16784
|
respectRobotsNoindex: true
|
|
16752
16785
|
},
|
|
16753
16786
|
transform: {
|
|
@@ -16757,39 +16790,52 @@ function createDefaultConfig(projectId) {
|
|
|
16757
16790
|
},
|
|
16758
16791
|
chunking: {
|
|
16759
16792
|
strategy: "hybrid",
|
|
16760
|
-
maxChars:
|
|
16793
|
+
maxChars: 1500,
|
|
16761
16794
|
overlapChars: 200,
|
|
16762
16795
|
minChars: 250,
|
|
16763
16796
|
headingPathDepth: 3,
|
|
16764
16797
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16765
16798
|
prependTitle: true,
|
|
16766
|
-
pageSummaryChunk: true
|
|
16799
|
+
pageSummaryChunk: true,
|
|
16800
|
+
weightHeadings: true
|
|
16767
16801
|
},
|
|
16768
16802
|
upstash: {
|
|
16769
|
-
urlEnv: "
|
|
16770
|
-
tokenEnv: "
|
|
16803
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16804
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16805
|
+
namespaces: {
|
|
16806
|
+
pages: "pages",
|
|
16807
|
+
chunks: "chunks"
|
|
16808
|
+
}
|
|
16809
|
+
},
|
|
16810
|
+
embedding: {
|
|
16811
|
+
model: "bge-large-en-v1.5",
|
|
16812
|
+
dimensions: 1024,
|
|
16813
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16814
|
+
batchSize: 100
|
|
16771
16815
|
},
|
|
16772
16816
|
search: {
|
|
16773
|
-
semanticWeight: 0.75,
|
|
16774
|
-
inputEnrichment: true,
|
|
16775
|
-
reranking: true,
|
|
16776
16817
|
dualSearch: true,
|
|
16777
16818
|
pageSearchWeight: 0.3
|
|
16778
16819
|
},
|
|
16779
16820
|
ranking: {
|
|
16780
16821
|
enableIncomingLinkBoost: true,
|
|
16781
16822
|
enableDepthBoost: true,
|
|
16823
|
+
enableFreshnessBoost: false,
|
|
16824
|
+
freshnessDecayRate: 1e-3,
|
|
16825
|
+
enableAnchorTextBoost: false,
|
|
16782
16826
|
pageWeights: {},
|
|
16783
16827
|
aggregationCap: 5,
|
|
16784
16828
|
aggregationDecay: 0.5,
|
|
16785
16829
|
minChunkScoreRatio: 0.5,
|
|
16786
|
-
|
|
16830
|
+
minScoreRatio: 0.7,
|
|
16787
16831
|
scoreGapThreshold: 0.4,
|
|
16788
16832
|
weights: {
|
|
16789
16833
|
incomingLinks: 0.05,
|
|
16790
16834
|
depth: 0.03,
|
|
16791
16835
|
aggregation: 0.1,
|
|
16792
|
-
titleMatch: 0.15
|
|
16836
|
+
titleMatch: 0.15,
|
|
16837
|
+
freshness: 0.1,
|
|
16838
|
+
anchorText: 0.1
|
|
16793
16839
|
}
|
|
16794
16840
|
},
|
|
16795
16841
|
api: {
|
|
@@ -16800,12 +16846,23 @@ function createDefaultConfig(projectId) {
|
|
|
16800
16846
|
},
|
|
16801
16847
|
mcp: {
|
|
16802
16848
|
enable: process.env.NODE_ENV !== "production",
|
|
16849
|
+
access: "private",
|
|
16803
16850
|
transport: "stdio",
|
|
16804
16851
|
http: {
|
|
16805
16852
|
port: 3338,
|
|
16806
16853
|
path: "/mcp"
|
|
16854
|
+
},
|
|
16855
|
+
handle: {
|
|
16856
|
+
path: "/api/mcp",
|
|
16857
|
+
enableJsonResponse: true
|
|
16807
16858
|
}
|
|
16808
16859
|
},
|
|
16860
|
+
llmsTxt: {
|
|
16861
|
+
enable: false,
|
|
16862
|
+
outputPath: "static/llms.txt",
|
|
16863
|
+
generateFull: true,
|
|
16864
|
+
serveMarkdownVariants: false
|
|
16865
|
+
},
|
|
16809
16866
|
state: {
|
|
16810
16867
|
dir: ".searchsocket"
|
|
16811
16868
|
}
|
|
@@ -16933,7 +16990,15 @@ ${issues}`
|
|
|
16933
16990
|
},
|
|
16934
16991
|
upstash: {
|
|
16935
16992
|
...defaults.upstash,
|
|
16936
|
-
...parsed.upstash
|
|
16993
|
+
...parsed.upstash,
|
|
16994
|
+
namespaces: {
|
|
16995
|
+
...defaults.upstash.namespaces,
|
|
16996
|
+
...parsed.upstash?.namespaces
|
|
16997
|
+
}
|
|
16998
|
+
},
|
|
16999
|
+
embedding: {
|
|
17000
|
+
...defaults.embedding,
|
|
17001
|
+
...parsed.embedding
|
|
16937
17002
|
},
|
|
16938
17003
|
search: {
|
|
16939
17004
|
...defaults.search,
|
|
@@ -16970,8 +17035,16 @@ ${issues}`
|
|
|
16970
17035
|
http: {
|
|
16971
17036
|
...defaults.mcp.http,
|
|
16972
17037
|
...parsed.mcp?.http
|
|
17038
|
+
},
|
|
17039
|
+
handle: {
|
|
17040
|
+
...defaults.mcp.handle,
|
|
17041
|
+
...parsed.mcp?.handle
|
|
16973
17042
|
}
|
|
16974
17043
|
},
|
|
17044
|
+
llmsTxt: {
|
|
17045
|
+
...defaults.llmsTxt,
|
|
17046
|
+
...parsed.llmsTxt
|
|
17047
|
+
},
|
|
16975
17048
|
state: {
|
|
16976
17049
|
...defaults.state,
|
|
16977
17050
|
...parsed.state
|
|
@@ -16991,6 +17064,15 @@ ${issues}`
|
|
|
16991
17064
|
maxDepth: 10
|
|
16992
17065
|
};
|
|
16993
17066
|
}
|
|
17067
|
+
if (merged.mcp.access === "public") {
|
|
17068
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17069
|
+
if (!resolvedKey) {
|
|
17070
|
+
throw new SearchSocketError(
|
|
17071
|
+
"CONFIG_MISSING",
|
|
17072
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17073
|
+
);
|
|
17074
|
+
}
|
|
17075
|
+
}
|
|
16994
17076
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16995
17077
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16996
17078
|
}
|
|
@@ -17054,13 +17136,84 @@ function normalizeMarkdown(input) {
|
|
|
17054
17136
|
/**
 * Turn an arbitrary scope name into a lowercase slug.
 *
 * The input is lowercased, every run of characters outside `a-z`, `0-9`,
 * `.`, `_` and `-` is collapsed into a single dash, leading and trailing
 * dashes are stripped, and at most the first 80 characters are kept.
 *
 * @param {string} scopeName - Raw scope name.
 * @returns {string} The sanitized name (may be empty).
 */
function sanitizeScopeName(scopeName) {
  const lowered = scopeName.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9._-]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  return trimmed.slice(0, 80);
}
|
|
17139
|
+
/**
 * Reduce a markdown string to a single line of plain text.
 *
 * Steps, in order:
 *  1. fenced code blocks (``` ... ```) are replaced by a space;
 *  2. inline code keeps its content but loses the backticks;
 *  3. the characters `#`, `>`, `*`, `_`, `|` and `-` are replaced by spaces
 *     (this also affects those characters inside ordinary words);
 *  4. whitespace runs collapse to one space and the result is trimmed.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string} Plain-text rendering.
 */
function markdownToPlain(markdown) {
  const withoutFences = markdown.replace(/```[\s\S]*?```/g, " ");
  const withoutBackticks = withoutFences.replace(/`([^`]+)`/g, "$1");
  const withoutMarkers = withoutBackticks.replace(/[#>*_|\-]/g, " ");
  const collapsed = withoutMarkers.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
17057
17142
|
/**
 * Build a short plain-text snippet from the start of a markdown string.
 *
 * The markdown is flattened with `markdownToPlain`. Text that already fits
 * is returned unchanged; otherwise the first `maxLen - 1` characters are
 * kept, trimmed, and followed by a single ellipsis character.
 *
 * @param {string} markdown - Markdown source.
 * @param {number} [maxLen=220] - Maximum snippet length including the ellipsis.
 * @returns {string} The snippet.
 */
function toSnippet(markdown, maxLen = 220) {
  const plain = markdownToPlain(markdown);
  const fits = plain.length <= maxLen;
  if (fits) return plain;
  // Reserve one character for the ellipsis; never slice with a negative end.
  const keep = Math.max(0, maxLen - 1);
  const head = plain.slice(0, keep).trim();
  return head + "\u2026";
}
|
|
17149
|
+
/**
 * Build a plain-text excerpt of roughly `maxLen` characters centred on the
 * region of the text that contains the most query terms.
 *
 * The markdown is flattened with `markdownToPlain`. If it already fits, it is
 * returned as is. If the query has no token of at least two characters, or
 * no token occurs in the text, the result falls back to `toSnippet`.
 * Otherwise a sliding window over all token occurrences picks the span that
 * covers the most distinct tokens (ties broken by total occurrences), and an
 * excerpt is cut around the middle of that span, nudged to word boundaries.
 * An ellipsis is added on each side where text was cut.
 *
 * @param {string} markdown - Markdown source.
 * @param {string} query - User query; split on whitespace, matched case-insensitively.
 * @param {number} [maxLen=220] - Target excerpt length (boundary snapping may exceed it by up to 20%).
 * @returns {string} The excerpt.
 */
function queryAwareExcerpt(markdown, query, maxLen = 220) {
  const plain = markdownToPlain(markdown);
  if (plain.length <= maxLen) return plain;
  const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
  if (tokens.length === 0) return toSnippet(markdown, maxLen);

  // Collect every occurrence of every token, remembering which token it was.
  const positions = [];
  for (let ti = 0; ti < tokens.length; ti++) {
    // Escape regex metacharacters so the token is matched literally.
    const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const re = new RegExp(escaped, "gi");
    let m;
    while ((m = re.exec(plain)) !== null) {
      positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
    }
  }
  if (positions.length === 0) return toSnippet(markdown, maxLen);
  positions.sort((a, b) => a.start - b.start);

  // Sliding window over occurrences: keep the window no wider than maxLen and
  // remember the one with the most distinct tokens, then the most occurrences.
  let bestUniqueCount = 0;
  let bestTotalCount = 0;
  let bestLeft = 0;
  let bestRight = 0;
  let left = 0;
  const tokenCounts = new Map();
  for (let right = 0; right < positions.length; right++) {
    const rightToken = positions[right].tokenIdx;
    tokenCounts.set(rightToken, (tokenCounts.get(rightToken) ?? 0) + 1);
    while (positions[right].end - positions[left].start > maxLen && left < right) {
      const leftToken = positions[left].tokenIdx;
      const cnt = tokenCounts.get(leftToken) - 1;
      if (cnt === 0) tokenCounts.delete(leftToken);
      else tokenCounts.set(leftToken, cnt);
      left++;
    }
    const uniqueCount = tokenCounts.size;
    const totalCount = right - left + 1;
    if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
      bestUniqueCount = uniqueCount;
      bestTotalCount = totalCount;
      bestLeft = left;
      bestRight = right;
    }
  }

  // Centre a maxLen-wide slice on the best window, clamped to the text.
  const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
  let start = Math.max(0, mid - Math.floor(maxLen / 2));
  let end = Math.min(plain.length, start + maxLen);
  start = Math.max(0, end - maxLen);

  // Snap both edges outward to a nearby space (within 30 characters).
  if (start > 0) {
    const spaceIdx = plain.lastIndexOf(" ", start);
    if (spaceIdx > start - 30) {
      start = spaceIdx + 1;
    }
  }
  if (end < plain.length) {
    const spaceIdx = plain.indexOf(" ", end);
    if (spaceIdx !== -1 && spaceIdx < end + 30) {
      end = spaceIdx;
    }
  }

  let excerpt = plain.slice(start, end);
  if (excerpt.length > Math.ceil(maxLen * 1.2)) {
    // Snapping overshot the tolerance: cut back to maxLen, preferring a word break.
    excerpt = excerpt.slice(0, maxLen);
    const lastSpace = excerpt.lastIndexOf(" ");
    if (lastSpace > maxLen * 0.5) {
      excerpt = excerpt.slice(0, lastSpace);
    }
    // Keep `end` in step with the text actually kept, so the trailing
    // ellipsis below reflects this truncation as well.
    end = start + excerpt.length;
  }
  const prefix = start > 0 ? "\u2026" : "";
  const suffix = end < plain.length ? "\u2026" : "";
  return `${prefix}${excerpt}${suffix}`;
}
|
|
17064
17217
|
function extractFirstParagraph(markdown) {
|
|
17065
17218
|
const lines = markdown.split("\n");
|
|
17066
17219
|
let inFence = false;
|
|
@@ -17121,162 +17274,342 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
17121
17274
|
fs__default.default.mkdirSync(statePath, { recursive: true });
|
|
17122
17275
|
return { statePath };
|
|
17123
17276
|
}
|
|
17124
|
-
|
|
17125
|
-
// src/vector/upstash.ts
|
|
17126
|
-
function chunkIndexName(scope) {
|
|
17127
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17128
|
-
}
|
|
17129
|
-
function pageIndexName(scope) {
|
|
17130
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17131
|
-
}
|
|
17132
17277
|
var UpstashSearchStore = class {
|
|
17133
|
-
|
|
17278
|
+
index;
|
|
17279
|
+
pagesNs;
|
|
17280
|
+
chunksNs;
|
|
17134
17281
|
constructor(opts) {
|
|
17135
|
-
this.
|
|
17136
|
-
|
|
17137
|
-
|
|
17138
|
-
return this.client.index(chunkIndexName(scope));
|
|
17139
|
-
}
|
|
17140
|
-
pageIndex(scope) {
|
|
17141
|
-
return this.client.index(pageIndexName(scope));
|
|
17282
|
+
this.index = opts.index;
|
|
17283
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17284
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17142
17285
|
}
|
|
17143
17286
|
async upsertChunks(chunks, scope) {
|
|
17144
17287
|
if (chunks.length === 0) return;
|
|
17145
|
-
const
|
|
17146
|
-
const BATCH_SIZE = 100;
|
|
17288
|
+
const BATCH_SIZE = 90;
|
|
17147
17289
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17148
17290
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17149
|
-
await
|
|
17150
|
-
|
|
17151
|
-
|
|
17152
|
-
|
|
17153
|
-
|
|
17154
|
-
|
|
17155
|
-
|
|
17156
|
-
|
|
17157
|
-
|
|
17158
|
-
|
|
17159
|
-
|
|
17160
|
-
|
|
17291
|
+
await this.chunksNs.upsert(
|
|
17292
|
+
batch.map((c) => ({
|
|
17293
|
+
id: c.id,
|
|
17294
|
+
data: c.data,
|
|
17295
|
+
metadata: {
|
|
17296
|
+
...c.metadata,
|
|
17297
|
+
projectId: scope.projectId,
|
|
17298
|
+
scopeName: scope.scopeName,
|
|
17299
|
+
type: c.metadata.type || "chunk"
|
|
17300
|
+
}
|
|
17301
|
+
}))
|
|
17302
|
+
);
|
|
17303
|
+
}
|
|
17304
|
+
}
|
|
17305
|
+
async search(data, opts, scope) {
|
|
17306
|
+
const filterParts = [
|
|
17307
|
+
`projectId = '${scope.projectId}'`,
|
|
17308
|
+
`scopeName = '${scope.scopeName}'`
|
|
17309
|
+
];
|
|
17310
|
+
if (opts.filter) {
|
|
17311
|
+
filterParts.push(opts.filter);
|
|
17312
|
+
}
|
|
17313
|
+
const results = await this.chunksNs.query({
|
|
17314
|
+
data,
|
|
17315
|
+
topK: opts.limit,
|
|
17316
|
+
includeMetadata: true,
|
|
17317
|
+
filter: filterParts.join(" AND "),
|
|
17318
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17319
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17320
|
+
});
|
|
17321
|
+
return results.map((doc) => ({
|
|
17322
|
+
id: String(doc.id),
|
|
17323
|
+
score: doc.score,
|
|
17324
|
+
metadata: {
|
|
17325
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17326
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17327
|
+
url: doc.metadata?.url ?? "",
|
|
17328
|
+
path: doc.metadata?.path ?? "",
|
|
17329
|
+
title: doc.metadata?.title ?? "",
|
|
17330
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17331
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17332
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17333
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17334
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17335
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17336
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17337
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17338
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17339
|
+
tags: doc.metadata?.tags ?? [],
|
|
17340
|
+
description: doc.metadata?.description || void 0,
|
|
17341
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17342
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17343
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17344
|
+
}
|
|
17345
|
+
}));
|
|
17346
|
+
}
|
|
17347
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17348
|
+
const filterParts = [
|
|
17349
|
+
`projectId = '${scope.projectId}'`,
|
|
17350
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17351
|
+
`url = '${url}'`
|
|
17352
|
+
];
|
|
17353
|
+
if (opts.filter) {
|
|
17354
|
+
filterParts.push(opts.filter);
|
|
17355
|
+
}
|
|
17356
|
+
const results = await this.chunksNs.query({
|
|
17357
|
+
data,
|
|
17358
|
+
topK: opts.limit,
|
|
17359
|
+
includeMetadata: true,
|
|
17360
|
+
filter: filterParts.join(" AND "),
|
|
17361
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17362
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17161
17363
|
});
|
|
17162
17364
|
return results.map((doc) => ({
|
|
17163
|
-
id: doc.id,
|
|
17365
|
+
id: String(doc.id),
|
|
17164
17366
|
score: doc.score,
|
|
17165
17367
|
metadata: {
|
|
17166
17368
|
projectId: doc.metadata?.projectId ?? "",
|
|
17167
17369
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17168
|
-
url: doc.
|
|
17370
|
+
url: doc.metadata?.url ?? "",
|
|
17169
17371
|
path: doc.metadata?.path ?? "",
|
|
17170
|
-
title: doc.
|
|
17171
|
-
sectionTitle: doc.
|
|
17172
|
-
headingPath: doc.
|
|
17372
|
+
title: doc.metadata?.title ?? "",
|
|
17373
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17374
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17173
17375
|
snippet: doc.metadata?.snippet ?? "",
|
|
17174
|
-
chunkText: doc.
|
|
17376
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17175
17377
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17176
17378
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17177
17379
|
depth: doc.metadata?.depth ?? 0,
|
|
17178
17380
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17179
17381
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17180
|
-
tags: doc.
|
|
17382
|
+
tags: doc.metadata?.tags ?? [],
|
|
17181
17383
|
description: doc.metadata?.description || void 0,
|
|
17182
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17384
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17385
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17386
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17183
17387
|
}
|
|
17184
17388
|
}));
|
|
17185
17389
|
}
|
|
17186
|
-
async
|
|
17187
|
-
|
|
17390
|
+
async searchPagesByText(data, opts, scope) {
|
|
17391
|
+
return this.queryPages({ data }, opts, scope);
|
|
17392
|
+
}
|
|
17393
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17394
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17395
|
+
}
|
|
17396
|
+
async queryPages(input, opts, scope) {
|
|
17397
|
+
const filterParts = [
|
|
17398
|
+
`projectId = '${scope.projectId}'`,
|
|
17399
|
+
`scopeName = '${scope.scopeName}'`
|
|
17400
|
+
];
|
|
17401
|
+
if (opts.filter) {
|
|
17402
|
+
filterParts.push(opts.filter);
|
|
17403
|
+
}
|
|
17188
17404
|
let results;
|
|
17189
17405
|
try {
|
|
17190
|
-
results = await
|
|
17191
|
-
|
|
17192
|
-
|
|
17193
|
-
|
|
17194
|
-
|
|
17195
|
-
|
|
17196
|
-
|
|
17406
|
+
results = await this.pagesNs.query({
|
|
17407
|
+
...input,
|
|
17408
|
+
topK: opts.limit,
|
|
17409
|
+
includeMetadata: true,
|
|
17410
|
+
filter: filterParts.join(" AND "),
|
|
17411
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17412
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17197
17413
|
});
|
|
17198
17414
|
} catch {
|
|
17199
17415
|
return [];
|
|
17200
17416
|
}
|
|
17201
17417
|
return results.map((doc) => ({
|
|
17202
|
-
id: doc.id,
|
|
17418
|
+
id: String(doc.id),
|
|
17203
17419
|
score: doc.score,
|
|
17204
|
-
title: doc.
|
|
17205
|
-
url: doc.
|
|
17206
|
-
description: doc.
|
|
17207
|
-
tags: doc.
|
|
17420
|
+
title: doc.metadata?.title ?? "",
|
|
17421
|
+
url: doc.metadata?.url ?? "",
|
|
17422
|
+
description: doc.metadata?.description ?? "",
|
|
17423
|
+
tags: doc.metadata?.tags ?? [],
|
|
17208
17424
|
depth: doc.metadata?.depth ?? 0,
|
|
17209
17425
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17210
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17426
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17427
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17211
17428
|
}));
|
|
17212
17429
|
}
|
|
17213
|
-
async deleteByIds(ids,
|
|
17430
|
+
async deleteByIds(ids, _scope) {
|
|
17214
17431
|
if (ids.length === 0) return;
|
|
17215
|
-
const
|
|
17216
|
-
const BATCH_SIZE = 500;
|
|
17432
|
+
const BATCH_SIZE = 90;
|
|
17217
17433
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17218
17434
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17219
|
-
await
|
|
17435
|
+
await this.chunksNs.delete(batch);
|
|
17220
17436
|
}
|
|
17221
17437
|
}
|
|
17222
17438
|
async deleteScope(scope) {
|
|
17223
|
-
|
|
17224
|
-
const
|
|
17225
|
-
|
|
17226
|
-
|
|
17227
|
-
|
|
17228
|
-
|
|
17229
|
-
|
|
17230
|
-
|
|
17231
|
-
|
|
17439
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17440
|
+
const ids = [];
|
|
17441
|
+
let cursor = "0";
|
|
17442
|
+
try {
|
|
17443
|
+
for (; ; ) {
|
|
17444
|
+
const result = await ns.range({
|
|
17445
|
+
cursor,
|
|
17446
|
+
limit: 100,
|
|
17447
|
+
includeMetadata: true
|
|
17448
|
+
});
|
|
17449
|
+
for (const doc of result.vectors) {
|
|
17450
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17451
|
+
ids.push(String(doc.id));
|
|
17452
|
+
}
|
|
17453
|
+
}
|
|
17454
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17455
|
+
cursor = result.nextCursor;
|
|
17456
|
+
}
|
|
17457
|
+
} catch {
|
|
17458
|
+
}
|
|
17459
|
+
if (ids.length > 0) {
|
|
17460
|
+
const BATCH_SIZE = 90;
|
|
17461
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17462
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17463
|
+
await ns.delete(batch);
|
|
17464
|
+
}
|
|
17465
|
+
}
|
|
17232
17466
|
}
|
|
17233
17467
|
}
|
|
17234
17468
|
async listScopes(projectId) {
|
|
17235
|
-
const
|
|
17236
|
-
const
|
|
17237
|
-
|
|
17238
|
-
|
|
17239
|
-
|
|
17240
|
-
|
|
17241
|
-
|
|
17242
|
-
|
|
17243
|
-
|
|
17244
|
-
|
|
17245
|
-
|
|
17246
|
-
|
|
17247
|
-
|
|
17248
|
-
|
|
17249
|
-
|
|
17250
|
-
|
|
17469
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17470
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17471
|
+
let cursor = "0";
|
|
17472
|
+
try {
|
|
17473
|
+
for (; ; ) {
|
|
17474
|
+
const result = await ns.range({
|
|
17475
|
+
cursor,
|
|
17476
|
+
limit: 100,
|
|
17477
|
+
includeMetadata: true
|
|
17478
|
+
});
|
|
17479
|
+
for (const doc of result.vectors) {
|
|
17480
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17481
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17482
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17483
|
+
}
|
|
17484
|
+
}
|
|
17485
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17486
|
+
cursor = result.nextCursor;
|
|
17487
|
+
}
|
|
17488
|
+
} catch {
|
|
17489
|
+
}
|
|
17490
|
+
}
|
|
17491
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17492
|
+
projectId,
|
|
17493
|
+
scopeName,
|
|
17494
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17495
|
+
documentCount: count
|
|
17496
|
+
}));
|
|
17497
|
+
}
|
|
17498
|
+
async getContentHashes(scope) {
|
|
17499
|
+
return this.scanHashes(this.chunksNs, scope);
|
|
17500
|
+
}
|
|
17501
|
+
/**
|
|
17502
|
+
* Fetch content hashes for a specific set of chunk keys using direct fetch()
|
|
17503
|
+
* instead of range(). This avoids potential issues with range() returning
|
|
17504
|
+
* vectors from the wrong namespace on hybrid indexes.
|
|
17505
|
+
*/
|
|
17506
|
+
async fetchContentHashesForKeys(keys, scope) {
|
|
17507
|
+
const map = /* @__PURE__ */ new Map();
|
|
17508
|
+
if (keys.length === 0) return map;
|
|
17509
|
+
const BATCH_SIZE = 90;
|
|
17510
|
+
for (let i = 0; i < keys.length; i += BATCH_SIZE) {
|
|
17511
|
+
const batch = keys.slice(i, i + BATCH_SIZE);
|
|
17251
17512
|
try {
|
|
17252
|
-
const
|
|
17253
|
-
|
|
17254
|
-
projectId,
|
|
17255
|
-
scopeName,
|
|
17256
|
-
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17257
|
-
documentCount: info.documentCount
|
|
17513
|
+
const results = await this.chunksNs.fetch(batch, {
|
|
17514
|
+
includeMetadata: true
|
|
17258
17515
|
});
|
|
17516
|
+
for (const doc of results) {
|
|
17517
|
+
if (doc && doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17518
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17519
|
+
}
|
|
17520
|
+
}
|
|
17259
17521
|
} catch {
|
|
17260
|
-
|
|
17261
|
-
|
|
17262
|
-
|
|
17263
|
-
|
|
17264
|
-
|
|
17522
|
+
}
|
|
17523
|
+
}
|
|
17524
|
+
return map;
|
|
17525
|
+
}
|
|
17526
|
+
/**
|
|
17527
|
+
* Scan all IDs in the chunks namespace for this scope.
|
|
17528
|
+
* Used for deletion detection (finding stale chunk keys).
|
|
17529
|
+
*/
|
|
17530
|
+
async scanChunkIds(scope) {
|
|
17531
|
+
const ids = /* @__PURE__ */ new Set();
|
|
17532
|
+
let cursor = "0";
|
|
17533
|
+
try {
|
|
17534
|
+
for (; ; ) {
|
|
17535
|
+
const result = await this.chunksNs.range({
|
|
17536
|
+
cursor,
|
|
17537
|
+
limit: 100,
|
|
17538
|
+
includeMetadata: true
|
|
17265
17539
|
});
|
|
17540
|
+
for (const doc of result.vectors) {
|
|
17541
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17542
|
+
ids.add(String(doc.id));
|
|
17543
|
+
}
|
|
17544
|
+
}
|
|
17545
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17546
|
+
cursor = result.nextCursor;
|
|
17266
17547
|
}
|
|
17548
|
+
} catch {
|
|
17267
17549
|
}
|
|
17268
|
-
return
|
|
17550
|
+
return ids;
|
|
17269
17551
|
}
|
|
17270
|
-
async
|
|
17552
|
+
async scanHashes(ns, scope) {
|
|
17553
|
+
const map = /* @__PURE__ */ new Map();
|
|
17554
|
+
let cursor = "0";
|
|
17555
|
+
try {
|
|
17556
|
+
for (; ; ) {
|
|
17557
|
+
const result = await ns.range({
|
|
17558
|
+
cursor,
|
|
17559
|
+
limit: 100,
|
|
17560
|
+
includeMetadata: true
|
|
17561
|
+
});
|
|
17562
|
+
for (const doc of result.vectors) {
|
|
17563
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17564
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17565
|
+
}
|
|
17566
|
+
}
|
|
17567
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17568
|
+
cursor = result.nextCursor;
|
|
17569
|
+
}
|
|
17570
|
+
} catch {
|
|
17571
|
+
}
|
|
17572
|
+
return map;
|
|
17573
|
+
}
|
|
17574
|
+
async listPages(scope, opts) {
|
|
17575
|
+
const cursor = opts?.cursor ?? "0";
|
|
17576
|
+
const limit = opts?.limit ?? 50;
|
|
17577
|
+
try {
|
|
17578
|
+
const result = await this.pagesNs.range({
|
|
17579
|
+
cursor,
|
|
17580
|
+
limit,
|
|
17581
|
+
includeMetadata: true
|
|
17582
|
+
});
|
|
17583
|
+
const pages = result.vectors.filter(
|
|
17584
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17585
|
+
).map((doc) => ({
|
|
17586
|
+
url: doc.metadata?.url ?? "",
|
|
17587
|
+
title: doc.metadata?.title ?? "",
|
|
17588
|
+
description: doc.metadata?.description ?? "",
|
|
17589
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17590
|
+
}));
|
|
17591
|
+
const response = { pages };
|
|
17592
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17593
|
+
response.nextCursor = result.nextCursor;
|
|
17594
|
+
}
|
|
17595
|
+
return response;
|
|
17596
|
+
} catch {
|
|
17597
|
+
return { pages: [] };
|
|
17598
|
+
}
|
|
17599
|
+
}
|
|
17600
|
+
async getPageHashes(scope) {
|
|
17271
17601
|
const map = /* @__PURE__ */ new Map();
|
|
17272
|
-
const index = this.chunkIndex(scope);
|
|
17273
17602
|
let cursor = "0";
|
|
17274
17603
|
try {
|
|
17275
17604
|
for (; ; ) {
|
|
17276
|
-
const result = await
|
|
17277
|
-
|
|
17278
|
-
|
|
17279
|
-
|
|
17605
|
+
const result = await this.pagesNs.range({
|
|
17606
|
+
cursor,
|
|
17607
|
+
limit: 100,
|
|
17608
|
+
includeMetadata: true
|
|
17609
|
+
});
|
|
17610
|
+
for (const doc of result.vectors) {
|
|
17611
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17612
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17280
17613
|
}
|
|
17281
17614
|
}
|
|
17282
17615
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17286,47 +17619,43 @@ var UpstashSearchStore = class {
|
|
|
17286
17619
|
}
|
|
17287
17620
|
return map;
|
|
17288
17621
|
}
|
|
17622
|
+
async deletePagesByIds(ids, _scope) {
|
|
17623
|
+
if (ids.length === 0) return;
|
|
17624
|
+
const BATCH_SIZE = 90;
|
|
17625
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17626
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17627
|
+
await this.pagesNs.delete(batch);
|
|
17628
|
+
}
|
|
17629
|
+
}
|
|
17289
17630
|
async upsertPages(pages, scope) {
|
|
17290
17631
|
if (pages.length === 0) return;
|
|
17291
|
-
const
|
|
17292
|
-
const BATCH_SIZE = 50;
|
|
17632
|
+
const BATCH_SIZE = 90;
|
|
17293
17633
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17294
17634
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17295
|
-
|
|
17296
|
-
|
|
17297
|
-
|
|
17298
|
-
|
|
17299
|
-
|
|
17300
|
-
|
|
17301
|
-
|
|
17302
|
-
|
|
17303
|
-
|
|
17304
|
-
|
|
17305
|
-
}
|
|
17306
|
-
|
|
17307
|
-
markdown: p.markdown,
|
|
17308
|
-
projectId: p.projectId,
|
|
17309
|
-
scopeName: p.scopeName,
|
|
17310
|
-
routeFile: p.routeFile,
|
|
17311
|
-
routeResolution: p.routeResolution,
|
|
17312
|
-
incomingLinks: p.incomingLinks,
|
|
17313
|
-
outgoingLinks: p.outgoingLinks,
|
|
17314
|
-
depth: p.depth,
|
|
17315
|
-
indexedAt: p.indexedAt
|
|
17316
|
-
}
|
|
17317
|
-
}));
|
|
17318
|
-
await index.upsert(docs);
|
|
17635
|
+
await this.pagesNs.upsert(
|
|
17636
|
+
batch.map((p) => ({
|
|
17637
|
+
id: p.id,
|
|
17638
|
+
data: p.data,
|
|
17639
|
+
metadata: {
|
|
17640
|
+
...p.metadata,
|
|
17641
|
+
projectId: scope.projectId,
|
|
17642
|
+
scopeName: scope.scopeName,
|
|
17643
|
+
type: "page"
|
|
17644
|
+
}
|
|
17645
|
+
}))
|
|
17646
|
+
);
|
|
17319
17647
|
}
|
|
17320
17648
|
}
|
|
17321
17649
|
async getPage(url, scope) {
|
|
17322
|
-
const index = this.pageIndex(scope);
|
|
17323
17650
|
try {
|
|
17324
|
-
const results = await
|
|
17651
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17652
|
+
includeMetadata: true
|
|
17653
|
+
});
|
|
17325
17654
|
const doc = results[0];
|
|
17326
|
-
if (!doc) return null;
|
|
17655
|
+
if (!doc || !doc.metadata) return null;
|
|
17327
17656
|
return {
|
|
17328
|
-
url: doc.
|
|
17329
|
-
title: doc.
|
|
17657
|
+
url: doc.metadata.url,
|
|
17658
|
+
title: doc.metadata.title,
|
|
17330
17659
|
markdown: doc.metadata.markdown,
|
|
17331
17660
|
projectId: doc.metadata.projectId,
|
|
17332
17661
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17334,27 +17663,86 @@ var UpstashSearchStore = class {
|
|
|
17334
17663
|
routeResolution: doc.metadata.routeResolution,
|
|
17335
17664
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17336
17665
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17666
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17337
17667
|
depth: doc.metadata.depth,
|
|
17338
|
-
tags: doc.
|
|
17668
|
+
tags: doc.metadata.tags ?? [],
|
|
17339
17669
|
indexedAt: doc.metadata.indexedAt,
|
|
17340
|
-
summary: doc.
|
|
17341
|
-
description: doc.
|
|
17342
|
-
keywords: doc.
|
|
17670
|
+
summary: doc.metadata.summary || void 0,
|
|
17671
|
+
description: doc.metadata.description || void 0,
|
|
17672
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17673
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17343
17674
|
};
|
|
17344
17675
|
} catch {
|
|
17345
17676
|
return null;
|
|
17346
17677
|
}
|
|
17347
17678
|
}
|
|
17679
|
+
async fetchPageWithVector(url, scope) {
|
|
17680
|
+
try {
|
|
17681
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17682
|
+
includeMetadata: true,
|
|
17683
|
+
includeVectors: true
|
|
17684
|
+
});
|
|
17685
|
+
const doc = results[0];
|
|
17686
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17687
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17688
|
+
return null;
|
|
17689
|
+
}
|
|
17690
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17691
|
+
} catch {
|
|
17692
|
+
return null;
|
|
17693
|
+
}
|
|
17694
|
+
}
|
|
17695
|
+
async fetchPagesBatch(urls, scope) {
|
|
17696
|
+
if (urls.length === 0) return [];
|
|
17697
|
+
try {
|
|
17698
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17699
|
+
includeMetadata: true
|
|
17700
|
+
});
|
|
17701
|
+
const out = [];
|
|
17702
|
+
for (const doc of results) {
|
|
17703
|
+
if (!doc || !doc.metadata) continue;
|
|
17704
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17705
|
+
continue;
|
|
17706
|
+
}
|
|
17707
|
+
out.push({
|
|
17708
|
+
url: doc.metadata.url,
|
|
17709
|
+
title: doc.metadata.title,
|
|
17710
|
+
routeFile: doc.metadata.routeFile,
|
|
17711
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17712
|
+
});
|
|
17713
|
+
}
|
|
17714
|
+
return out;
|
|
17715
|
+
} catch {
|
|
17716
|
+
return [];
|
|
17717
|
+
}
|
|
17718
|
+
}
|
|
17348
17719
|
async deletePages(scope) {
|
|
17720
|
+
const ids = [];
|
|
17721
|
+
let cursor = "0";
|
|
17349
17722
|
try {
|
|
17350
|
-
|
|
17351
|
-
|
|
17723
|
+
for (; ; ) {
|
|
17724
|
+
const result = await this.pagesNs.range({
|
|
17725
|
+
cursor,
|
|
17726
|
+
limit: 100,
|
|
17727
|
+
includeMetadata: true
|
|
17728
|
+
});
|
|
17729
|
+
for (const doc of result.vectors) {
|
|
17730
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17731
|
+
ids.push(String(doc.id));
|
|
17732
|
+
}
|
|
17733
|
+
}
|
|
17734
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17735
|
+
cursor = result.nextCursor;
|
|
17736
|
+
}
|
|
17352
17737
|
} catch {
|
|
17353
17738
|
}
|
|
17739
|
+
if (ids.length > 0) {
|
|
17740
|
+
await this.deletePagesByIds(ids, scope);
|
|
17741
|
+
}
|
|
17354
17742
|
}
|
|
17355
17743
|
async health() {
|
|
17356
17744
|
try {
|
|
17357
|
-
await this.
|
|
17745
|
+
await this.index.info();
|
|
17358
17746
|
return { ok: true };
|
|
17359
17747
|
} catch (error) {
|
|
17360
17748
|
return {
|
|
@@ -17364,14 +17752,31 @@ var UpstashSearchStore = class {
|
|
|
17364
17752
|
}
|
|
17365
17753
|
}
|
|
17366
17754
|
async dropAllIndexes(projectId) {
|
|
17367
|
-
const
|
|
17368
|
-
|
|
17369
|
-
|
|
17370
|
-
|
|
17371
|
-
|
|
17372
|
-
const
|
|
17373
|
-
|
|
17374
|
-
|
|
17755
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17756
|
+
const ids = [];
|
|
17757
|
+
let cursor = "0";
|
|
17758
|
+
try {
|
|
17759
|
+
for (; ; ) {
|
|
17760
|
+
const result = await ns.range({
|
|
17761
|
+
cursor,
|
|
17762
|
+
limit: 100,
|
|
17763
|
+
includeMetadata: true
|
|
17764
|
+
});
|
|
17765
|
+
for (const doc of result.vectors) {
|
|
17766
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17767
|
+
ids.push(String(doc.id));
|
|
17768
|
+
}
|
|
17769
|
+
}
|
|
17770
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17771
|
+
cursor = result.nextCursor;
|
|
17772
|
+
}
|
|
17773
|
+
} catch {
|
|
17774
|
+
}
|
|
17775
|
+
if (ids.length > 0) {
|
|
17776
|
+
const BATCH_SIZE = 90;
|
|
17777
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17778
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17779
|
+
await ns.delete(batch);
|
|
17375
17780
|
}
|
|
17376
17781
|
}
|
|
17377
17782
|
}
|
|
@@ -17385,12 +17790,16 @@ async function createUpstashStore(config) {
|
|
|
17385
17790
|
if (!url || !token) {
|
|
17386
17791
|
throw new SearchSocketError(
|
|
17387
17792
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17388
|
-
`Missing Upstash
|
|
17793
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17389
17794
|
);
|
|
17390
17795
|
}
|
|
17391
|
-
const {
|
|
17392
|
-
const
|
|
17393
|
-
return new UpstashSearchStore({
|
|
17796
|
+
const { Index } = await import('@upstash/vector');
|
|
17797
|
+
const index = new Index({ url, token });
|
|
17798
|
+
return new UpstashSearchStore({
|
|
17799
|
+
index,
|
|
17800
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17801
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17802
|
+
});
|
|
17394
17803
|
}
|
|
17395
17804
|
function sha1(input) {
|
|
17396
17805
|
return crypto.createHash("sha1").update(input).digest("hex");
|
|
@@ -17458,6 +17867,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17458
17867
|
if (normalizeText(current.text)) {
|
|
17459
17868
|
sections.push({
|
|
17460
17869
|
sectionTitle: current.sectionTitle,
|
|
17870
|
+
headingLevel: current.headingLevel,
|
|
17461
17871
|
headingPath: current.headingPath,
|
|
17462
17872
|
text: current.text.trim()
|
|
17463
17873
|
});
|
|
@@ -17476,6 +17886,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17476
17886
|
headingStack.length = level;
|
|
17477
17887
|
current = {
|
|
17478
17888
|
sectionTitle: title,
|
|
17889
|
+
headingLevel: level,
|
|
17479
17890
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
17480
17891
|
text: `${line}
|
|
17481
17892
|
`
|
|
@@ -17611,6 +18022,7 @@ function splitSection(section, config) {
|
|
|
17611
18022
|
return [
|
|
17612
18023
|
{
|
|
17613
18024
|
sectionTitle: section.sectionTitle,
|
|
18025
|
+
headingLevel: section.headingLevel,
|
|
17614
18026
|
headingPath: section.headingPath,
|
|
17615
18027
|
chunkText: text
|
|
17616
18028
|
}
|
|
@@ -17661,6 +18073,7 @@ ${chunk}`;
|
|
|
17661
18073
|
}
|
|
17662
18074
|
return merged.map((chunkText) => ({
|
|
17663
18075
|
sectionTitle: section.sectionTitle,
|
|
18076
|
+
headingLevel: section.headingLevel,
|
|
17664
18077
|
headingPath: section.headingPath,
|
|
17665
18078
|
chunkText
|
|
17666
18079
|
}));
|
|
@@ -17676,6 +18089,18 @@ function buildSummaryChunkText(page) {
|
|
|
17676
18089
|
}
|
|
17677
18090
|
return parts.join("\n\n");
|
|
17678
18091
|
}
|
|
18092
|
+
/**
 * Build the title string used when heading weighting is enabled.
 *
 * @param chunk - object with `title`, `sectionTitle`, `headingLevel` and
 *   `headingPath` (array of heading strings).
 * @returns undefined when the chunk has no section title or no heading level;
 *   otherwise "<title> — <heading path joined by ' > '>", with the section
 *   title appended when it is not already the last path entry. With a path of
 *   at most one entry the result is "<title> — <sectionTitle>".
 */
function buildEmbeddingTitle(chunk) {
  const { title, sectionTitle, headingLevel } = chunk;
  if (!sectionTitle || headingLevel === undefined) return undefined;
  const trail = chunk.headingPath;
  if (!(trail.length > 1)) {
    return `${title} \u2014 ${sectionTitle}`;
  }
  const joined = trail.join(" > ");
  const endsWithSection = trail[trail.length - 1] === sectionTitle;
  const suffix = endsWithSection ? joined : `${joined} > ${sectionTitle}`;
  return `${title} \u2014 ${suffix}`;
}
|
|
17679
18104
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
17680
18105
|
if (!prependTitle) return chunk.chunkText;
|
|
17681
18106
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -17706,10 +18131,14 @@ function chunkPage(page, config, scope) {
|
|
|
17706
18131
|
tags: page.tags,
|
|
17707
18132
|
contentHash: "",
|
|
17708
18133
|
description: page.description,
|
|
17709
|
-
keywords: page.keywords
|
|
18134
|
+
keywords: page.keywords,
|
|
18135
|
+
publishedAt: page.publishedAt,
|
|
18136
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18137
|
+
meta: page.meta
|
|
17710
18138
|
};
|
|
17711
18139
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
17712
|
-
|
|
18140
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18141
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
17713
18142
|
chunks.push(summaryChunk);
|
|
17714
18143
|
}
|
|
17715
18144
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -17726,6 +18155,7 @@ function chunkPage(page, config, scope) {
|
|
|
17726
18155
|
path: page.url,
|
|
17727
18156
|
title: page.title,
|
|
17728
18157
|
sectionTitle: entry.sectionTitle,
|
|
18158
|
+
headingLevel: entry.headingLevel,
|
|
17729
18159
|
headingPath: entry.headingPath,
|
|
17730
18160
|
chunkText: entry.chunkText,
|
|
17731
18161
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -17735,10 +18165,16 @@ function chunkPage(page, config, scope) {
|
|
|
17735
18165
|
tags: page.tags,
|
|
17736
18166
|
contentHash: "",
|
|
17737
18167
|
description: page.description,
|
|
17738
|
-
keywords: page.keywords
|
|
18168
|
+
keywords: page.keywords,
|
|
18169
|
+
publishedAt: page.publishedAt,
|
|
18170
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18171
|
+
meta: page.meta
|
|
17739
18172
|
};
|
|
17740
18173
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
17741
|
-
|
|
18174
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
18175
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18176
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
18177
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
17742
18178
|
chunks.push(chunk);
|
|
17743
18179
|
}
|
|
17744
18180
|
return chunks;
|
|
@@ -18570,7 +19006,112 @@ function gfm(turndownService) {
|
|
|
18570
19006
|
]);
|
|
18571
19007
|
}
|
|
18572
19008
|
|
|
19009
|
+
// src/utils/structured-meta.ts
|
|
19010
|
+
// A meta key must start with a letter or underscore and may continue with
// letters, digits or underscores only.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;

/**
 * Tell whether `key` is an acceptable structured-meta key.
 *
 * @param key - candidate key.
 * @returns true when `key` matches VALID_KEY_RE.
 */
function validateMetaKey(key) {
  const accepted = VALID_KEY_RE.test(key);
  return accepted;
}
|
|
19014
|
+
/**
 * Convert the raw `content` string of a meta tag into a typed value.
 *
 * @param content - raw attribute text.
 * @param dataType - "number", "boolean", "string[]", "date"; anything else
 *   is treated as a plain string.
 * @returns
 *   - "number" / "date": the finite number the text parses to, otherwise the
 *     original text unchanged;
 *   - "boolean": true only for the exact text "true";
 *   - "string[]": comma-separated entries, trimmed, empty entries dropped;
 *   - default: `content` unchanged.
 */
function parseMetaValue(content, dataType) {
  switch (dataType) {
    case "number":
    case "date": {
      // Number("") and Number("   ") evaluate to 0, which would silently turn
      // a blank attribute into 0 (or the Unix epoch for dates). Treat blank
      // text like any other unparseable value and hand it back untouched.
      if (typeof content === "string" && content.trim() === "") return content;
      const parsed = Number(content);
      return Number.isFinite(parsed) ? parsed : content;
    }
    case "boolean":
      return content === "true";
    case "string[]":
      // Drop empty entries ("a,,b", trailing commas) the same way the other
      // comma-separated lists in this file are parsed.
      return content ? content.split(",").map((s) => s.trim()).filter(Boolean) : [];
    default:
      return content;
  }
}
|
|
19032
|
+
/**
 * Escape a string for use inside a single-quoted filter literal by doubling
 * every single quote.
 *
 * @param s - raw string value.
 * @returns `s` with each `'` replaced by `''`.
 */
function escapeFilterValue(s) {
  const doubledQuote = "''";
  const escaped = s.replace(/'/g, doubledQuote);
  return escaped;
}
|
|
19035
|
+
/**
 * Turn a `{ key: value }` filter object into a filter expression over the
 * `meta.*` fields, joining the individual clauses with " AND ".
 *
 * - string values  -> `meta.key CONTAINS '<escaped value>'`
 * - boolean values -> `meta.key = true|false`
 * - finite numbers -> `meta.key = <number>`
 *
 * Entries with an invalid key are skipped. Entries whose value is not a
 * string, boolean or finite number (NaN, Infinity, arrays, objects, null, …)
 * are skipped as well: interpolating them unescaped would yield a malformed
 * clause or let arbitrary text into the filter.
 *
 * @param filters - filter object; null / undefined is treated as empty.
 * @returns the filter string, or "" when no clause was produced.
 */
function buildMetaFilterString(filters) {
  const clauses = [];
  for (const [key, value] of Object.entries(filters ?? {})) {
    if (!validateMetaKey(key)) continue;
    const field = `meta.${key}`;
    if (typeof value === "string") {
      clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
    } else if (typeof value === "boolean" || (typeof value === "number" && Number.isFinite(value))) {
      clauses.push(`${field} = ${value}`);
    }
  }
  return clauses.join(" AND ");
}
|
|
19050
|
+
|
|
18573
19051
|
// src/indexing/extractor.ts
|
|
19052
|
+
/**
 * Normalize a date-like value to a millisecond timestamp.
 *
 * @param value - a Date, a string (parsed with `new Date(value)`), or a
 *   number (used as-is).
 * @returns the timestamp when it is a finite number; undefined for
 *   null/undefined, unsupported types, invalid dates and non-finite numbers.
 */
function normalizeDateToMs(value) {
  let timestamp;
  if (value instanceof Date) {
    timestamp = value.getTime();
  } else if (typeof value === "string") {
    timestamp = new Date(value).getTime();
  } else if (typeof value === "number") {
    timestamp = value;
  } else {
    return undefined;
  }
  // Invalid dates surface as NaN; reject those together with ±Infinity.
  return Number.isFinite(timestamp) ? timestamp : undefined;
}
|
|
19067
|
+
// Frontmatter keys probed for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];

/**
 * Pick the first usable date from a frontmatter object.
 *
 * @param data - parsed frontmatter.
 * @returns the millisecond timestamp of the first field in
 *   FRONTMATTER_DATE_FIELDS that `normalizeDateToMs` accepts, or undefined
 *   when none does.
 */
function extractPublishedAtFromFrontmatter(data) {
  let timestamp;
  // `some` stops at the first field that yields a value, so later fields
  // are never read once a date has been found.
  FRONTMATTER_DATE_FIELDS.some((fieldName) => {
    timestamp = normalizeDateToMs(data[fieldName]);
    return timestamp !== undefined;
  });
  return timestamp;
}
|
|
19075
|
+
/**
 * Find a publication date in an HTML document.
 *
 * Sources are tried in this order; the first one that `normalizeDateToMs`
 * accepts wins:
 *   1. `datePublished` in any `<script type="application/ld+json">` block
 *      (top-level object, array elements, and their `@graph` entries);
 *   2. `<meta property="article:published_time" content>`;
 *   3. `<meta itemprop="datePublished" content>` or
 *      `<time itemprop="datePublished" datetime>`;
 *   4. the first `<time datetime>` element.
 *
 * @param $ - document query function (called with selectors and with script
 *   elements; results are used via `.length`, `.html()`, `.attr()`, `.first()`).
 * @returns millisecond timestamp, or undefined when nothing usable is found.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const roots = Array.isArray(parsed) ? parsed : [parsed];
      const candidates = [];
      for (const root of roots) {
        if (!root || typeof root !== "object") continue;
        candidates.push(root);
        if (Array.isArray(root["@graph"])) {
          candidates.push(...root["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // Skip null / primitive entries instead of letting the property
        // access throw, which would abandon the remaining candidates of
        // this script block.
        if (!candidate || typeof candidate !== "object") continue;
        const val = normalizeDateToMs(candidate.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Malformed JSON-LD: ignore this block and keep looking.
    }
  }
  const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
  if (ogTime) {
    const val = normalizeDateToMs(ogTime);
    if (val !== void 0) return val;
  }
  const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
  if (itempropDate) {
    const val = normalizeDateToMs(itempropDate);
    if (val !== void 0) return val;
  }
  const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
  if (timeEl) {
    const val = normalizeDateToMs(timeEl);
    if (val !== void 0) return val;
  }
  return void 0;
}
|
|
18574
19115
|
function hasTopLevelNoindexComment(markdown) {
|
|
18575
19116
|
const lines = markdown.split(/\r?\n/);
|
|
18576
19117
|
let inFence = false;
|
|
@@ -18586,6 +19127,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
18586
19127
|
}
|
|
18587
19128
|
return false;
|
|
18588
19129
|
}
|
|
19130
|
+
// Alt texts that consist of nothing but one of these words say nothing about
// the image and are treated as noise.
var GARBAGE_ALT_WORDS = new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);

// Matches text ending in an image file extension, optionally followed by a
// query string.
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;

/**
 * Decide whether an image's alt text is descriptive enough to keep.
 *
 * @param alt - raw alt attribute text.
 * @returns false when the trimmed text is shorter than 5 characters, looks
 *   like an image file name, or is exactly one of GARBAGE_ALT_WORDS
 *   (case-insensitive); true otherwise.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  const longEnough = text.length >= 5;
  return longEnough && !IMAGE_EXT_RE.test(text) && !GARBAGE_ALT_WORDS.has(text.toLowerCase());
}
|
|
19156
|
+
/**
 * Work out the text that should stand in for an image.
 *
 * Priority:
 *   1. the image's own `imageDescAttr` attribute;
 *   2. the same attribute on the enclosing <figure>, if any;
 *   3. "<alt> — <figcaption>" when the alt is meaningful and a caption exists;
 *   4. the alt alone when meaningful;
 *   5. the figcaption alone;
 *   6. null.
 *
 * @param img - wrapped <img> element (uses `.attr()` and `.closest()`).
 * @param $ - document query function; not used by this function.
 * @param imageDescAttr - name of the explicit description attribute.
 * @returns the replacement text, or null when nothing usable exists.
 */
function resolveImageText(img, $, imageDescAttr) {
  const explicit = img.attr(imageDescAttr)?.trim();
  if (explicit) return explicit;

  const figure = img.closest("figure");
  const insideFigure = Boolean(figure.length);
  const figureDesc = insideFigure ? figure.attr(imageDescAttr)?.trim() : undefined;
  if (figureDesc) return figureDesc;

  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  // A meaningful alt is never empty, so "" can stand for "no usable alt".
  const usableAlt = isMeaningfulAlt(alt) ? alt : "";
  if (usableAlt && caption) {
    return `${usableAlt} \u2014 ${caption}`;
  }
  return usableAlt || caption || null;
}
|
|
19177
|
+
// Link texts that carry no information about the link target.
var STOP_ANCHORS = new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);

/**
 * Normalize link text for use as anchor text.
 *
 * Collapses whitespace runs to single spaces, trims and lower-cases.
 *
 * @param raw - raw link text.
 * @returns "" when the normalized text is shorter than 3 characters or is
 *   one of STOP_ANCHORS; otherwise the normalized text cut to 100 characters.
 */
function normalizeAnchorText(raw) {
  const cleaned = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const rejected = cleaned.length < 3 || STOP_ANCHORS.has(cleaned);
  // slice() leaves texts of 100 characters or fewer unchanged.
  return rejected ? "" : cleaned.slice(0, 100);
}
|
|
19193
|
+
/**
 * Escape text for safe insertion into HTML markup.
 *
 * `&` is replaced first so the ampersands introduced by the `<` / `>`
 * replacements are not escaped a second time.
 *
 * @param text - plain text.
 * @returns `text` with `&`, `<` and `>` replaced by `&amp;`, `&lt;`, `&gt;`.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
19196
|
+
/**
 * Replace images under `root2` with their textual description.
 *
 * <picture> elements are processed first (using their first inner <img>),
 * then every remaining <img>. An element with resolvable text is swapped for
 * `<span>text</span>` and the <figcaption> of its enclosing <figure> is
 * removed; an element without text is removed.
 *
 * @param root2 - wrapped root element to search in (mutated in place).
 * @param $ - document query function used to wrap raw elements.
 * @param imageDescAttr - attribute name passed on to `resolveImageText`.
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Swap `node` for a text span, or drop it when there is no text.
  const substitute = (node, text) => {
    if (!text) {
      node.remove();
      return;
    }
    const figure = node.closest("figure");
    if (figure.length) figure.find("figcaption").remove();
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };

  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    const innerImg = picture.find("img").first();
    const text = innerImg.length ? resolveImageText(innerImg, $, imageDescAttr) : null;
    substitute(picture, text);
  });

  root2.find("img").each((_i, el) => {
    const img = $(el);
    substitute(img, resolveImageText(img, $, imageDescAttr));
  });
}
|
|
18589
19221
|
function extractFromHtml(url, html, config) {
|
|
18590
19222
|
const $ = cheerio.load(html);
|
|
18591
19223
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -18611,6 +19243,20 @@ function extractFromHtml(url, html, config) {
|
|
|
18611
19243
|
if (weight === 0) {
|
|
18612
19244
|
return null;
|
|
18613
19245
|
}
|
|
19246
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
19247
|
+
return null;
|
|
19248
|
+
}
|
|
19249
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
19250
|
+
const meta = {};
|
|
19251
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
19252
|
+
const name = $(el).attr("name") ?? "";
|
|
19253
|
+
const key = name.slice("searchsocket:".length);
|
|
19254
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
19255
|
+
const content = $(el).attr("content") ?? "";
|
|
19256
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
19257
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
19258
|
+
});
|
|
19259
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
18614
19260
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
18615
19261
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
18616
19262
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -18622,7 +19268,9 @@ function extractFromHtml(url, html, config) {
|
|
|
18622
19268
|
root2.find(selector).remove();
|
|
18623
19269
|
}
|
|
18624
19270
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
19271
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
18625
19272
|
const outgoingLinks = [];
|
|
19273
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
18626
19274
|
root2.find("a[href]").each((_index, node) => {
|
|
18627
19275
|
const href = $(node).attr("href");
|
|
18628
19276
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -18633,7 +19281,19 @@ function extractFromHtml(url, html, config) {
|
|
|
18633
19281
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
18634
19282
|
return;
|
|
18635
19283
|
}
|
|
18636
|
-
|
|
19284
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
19285
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
19286
|
+
if (!anchorText) {
|
|
19287
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
19288
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
19289
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
19290
|
+
}
|
|
19291
|
+
}
|
|
19292
|
+
const key = `${url2}|${anchorText}`;
|
|
19293
|
+
if (!seenLinkKeys.has(key)) {
|
|
19294
|
+
seenLinkKeys.add(key);
|
|
19295
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
19296
|
+
}
|
|
18637
19297
|
} catch {
|
|
18638
19298
|
}
|
|
18639
19299
|
});
|
|
@@ -18658,16 +19318,25 @@ function extractFromHtml(url, html, config) {
|
|
|
18658
19318
|
return null;
|
|
18659
19319
|
}
|
|
18660
19320
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
19321
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
19322
|
+
if (componentTags) {
|
|
19323
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
19324
|
+
for (const t of extraTags) {
|
|
19325
|
+
if (!tags.includes(t)) tags.push(t);
|
|
19326
|
+
}
|
|
19327
|
+
}
|
|
18661
19328
|
return {
|
|
18662
19329
|
url: normalizeUrlPath(url),
|
|
18663
19330
|
title,
|
|
18664
19331
|
markdown,
|
|
18665
|
-
outgoingLinks
|
|
19332
|
+
outgoingLinks,
|
|
18666
19333
|
noindex: false,
|
|
18667
19334
|
tags,
|
|
18668
19335
|
description,
|
|
18669
19336
|
keywords,
|
|
18670
|
-
weight
|
|
19337
|
+
weight,
|
|
19338
|
+
publishedAt,
|
|
19339
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
18671
19340
|
};
|
|
18672
19341
|
}
|
|
18673
19342
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -18688,6 +19357,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18688
19357
|
if (mdWeight === 0) {
|
|
18689
19358
|
return null;
|
|
18690
19359
|
}
|
|
19360
|
+
let mdMeta;
|
|
19361
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
19362
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
19363
|
+
const metaObj = {};
|
|
19364
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
19365
|
+
if (!validateMetaKey(key)) continue;
|
|
19366
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
19367
|
+
metaObj[key] = val;
|
|
19368
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
19369
|
+
metaObj[key] = val;
|
|
19370
|
+
} else if (val instanceof Date) {
|
|
19371
|
+
metaObj[key] = val.getTime();
|
|
19372
|
+
}
|
|
19373
|
+
}
|
|
19374
|
+
if (Object.keys(metaObj).length > 0) {
|
|
19375
|
+
mdMeta = metaObj;
|
|
19376
|
+
}
|
|
19377
|
+
}
|
|
18691
19378
|
const content = parsed.content;
|
|
18692
19379
|
const normalized = normalizeMarkdown(content);
|
|
18693
19380
|
if (!normalizeText(normalized)) {
|
|
@@ -18702,6 +19389,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18702
19389
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
18703
19390
|
}
|
|
18704
19391
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
19392
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
18705
19393
|
return {
|
|
18706
19394
|
url: normalizeUrlPath(url),
|
|
18707
19395
|
title: resolvedTitle,
|
|
@@ -18711,7 +19399,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18711
19399
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
18712
19400
|
description: fmDescription,
|
|
18713
19401
|
keywords: fmKeywords,
|
|
18714
|
-
weight: mdWeight
|
|
19402
|
+
weight: mdWeight,
|
|
19403
|
+
publishedAt,
|
|
19404
|
+
meta: mdMeta
|
|
18715
19405
|
};
|
|
18716
19406
|
}
|
|
18717
19407
|
function segmentToRegex(segment) {
|
|
@@ -18906,7 +19596,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
18906
19596
|
const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
18907
19597
|
let content;
|
|
18908
19598
|
try {
|
|
18909
|
-
content = await
|
|
19599
|
+
content = await fs8__default.default.readFile(manifestPath, "utf8");
|
|
18910
19600
|
} catch {
|
|
18911
19601
|
throw new SearchSocketError(
|
|
18912
19602
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19217,13 +19907,132 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19217
19907
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19218
19908
|
return normalizeUrlPath(noExt || "/");
|
|
19219
19909
|
}
|
|
19220
|
-
|
|
19221
|
-
|
|
19910
|
+
// SvelteKit route files: "+page", "+layout" or "+error", optionally followed by
// a layout-reset suffix. The suffix may be a named segment ("@item", "@(app)",
// "@[id]") or a bare "@" that resets to the root layout, so zero characters
// after "@" must be accepted as well.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]*)?\.svelte$/;

/**
 * Tells whether a path points at a reusable Svelte component rather than a
 * SvelteKit route file.
 *
 * @param {string} filePath - Path (absolute or relative) of the file to test.
 * @returns {boolean} `true` for `*.svelte` files that are not
 *   `+page` / `+layout` / `+error` route files (including their `@` reset
 *   variants); `false` for everything else.
 */
function isSvelteComponentFile(filePath) {
  if (!filePath.endsWith(".svelte")) return false;
  return !ROUTE_FILE_RE.test(filePath);
}
|
|
19223
|
-
|
|
19224
|
-
const
|
|
19225
|
-
|
|
19226
|
-
|
|
19915
|
+
/**
 * Extracts lightweight documentation metadata from Svelte component source.
 *
 * - `description` comes from a `<!-- @component ... -->` HTML comment.
 * - `props` comes from a `let { ... } = $props()` destructuring; each entry
 *   carries the prop name plus, when available, its type (looked up from the
 *   type annotation on the destructuring) and its default-value source text.
 *
 * Rest elements (`...rest`) are ignored. For renamed props (`class: className`)
 * the external name (`class`) is reported.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {{ description: string | undefined, props: Array<{ name: string, type?: string, default?: string }> }}
 */
function extractSvelteComponentMeta(source) {
  const docComment = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
  const description = docComment?.[1]?.trim() || void 0;
  const props = [];

  const propsDecl = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsDecl) {
    return { description, props };
  }

  // A bare capitalised identifier refers to a named interface/type alias in
  // the same file; a leading "{" is an inline object type.
  const annotation = propsDecl[2]?.trim();
  let typeByProp;
  if (annotation) {
    if (/^[A-Z]\w*$/.test(annotation)) {
      typeByProp = resolveTypeReference(source, annotation);
    } else if (annotation.startsWith("{")) {
      typeByProp = parseInlineTypeAnnotation(annotation);
    }
  }

  for (const rawEntry of splitDestructureBlock(propsDecl[1])) {
    const entry = rawEntry.trim();
    if (entry === "" || entry.startsWith("...")) continue;

    // Try `name: alias [= default]` first, then `name = default`.
    const parsed = entry.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/) ?? entry.match(/^(\w+)\s*=\s*([\s\S]+)$/);
    const name = parsed ? parsed[1] : (entry.match(/^(\w+)/)?.[1] ?? entry);
    const fallback = parsed?.[2]?.trim();

    const prop = { name };
    const type = typeByProp?.get(name);
    if (type) prop.type = type;
    if (fallback) prop.default = fallback;
    props.push(prop);
  }

  return { description, props };
}
|
|
19960
|
+
/**
 * Splits the inside of a destructuring pattern (the text between `{` and `}`)
 * into its top-level, comma-separated entries.
 *
 * Commas nested inside brackets (`{}`, `[]`, `()`) or inside string/template
 * literals are not treated as separators, so default values such as
 * `b = "x, y"` or `c = fn(1, 2)` stay in one piece. Brackets that appear
 * inside string literals do not affect nesting depth.
 *
 * @param {string} block - Destructuring body without the outer braces.
 * @returns {string[]} Entries exactly as written (not trimmed). A trailing
 *   entry that is empty or whitespace-only is dropped.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let quote = null;
  let current = "";
  for (let i = 0; i < block.length; i++) {
    const ch = block[i];
    if (quote) {
      current += ch;
      if (ch === "\\" && i + 1 < block.length) {
        // Keep the escaped character attached so `\"` cannot end the literal.
        current += block[++i];
      } else if (ch === quote) {
        quote = null;
      }
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      // Never go negative: one stray closer must not disable all later splits.
      depth = Math.max(0, depth - 1);
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  if (current.trim()) entries.push(current);
  return entries;
}
|
|
19981
|
+
/**
 * Finds `interface <typeName> { ... }` or `type <typeName> = { ... }` in the
 * given source text and returns its members as parsed by `parseTypeMembers`.
 *
 * The declaration keyword and the type name are matched on word boundaries so
 * that, for example, `PropsExtra` is not mistaken for `Props`, and an
 * interface header may carry an `extends` clause or type parameters before the
 * opening brace. Only members declared directly in the body are returned;
 * inherited members are not resolved.
 *
 * Brace matching is purely textual: braces inside string literals or comments
 * in the body are counted like any other brace.
 *
 * @param {string} source - Source text to search.
 * @param {string} typeName - Name of the interface or type alias.
 * @returns {Map<string, string> | undefined} Member name to type text, or
 *   `undefined` when no declaration is found or its braces are unbalanced.
 */
function resolveTypeReference(source, typeName) {
  // The name is used inside a RegExp, so neutralise any metacharacters.
  const escapedName = typeName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const startRe = new RegExp(
    `\\b(?:interface\\s+${escapedName}\\b[^{;=]*|type\\s+${escapedName}\\s*=\\s*)\\{`
  );
  const startMatch = source.match(startRe);
  if (!startMatch || startMatch.index === void 0) return void 0;
  const bodyStart = startMatch.index + startMatch[0].length;
  // Walk forward until the brace that opened the body is closed again.
  let depth = 1;
  let i = bodyStart;
  while (i < source.length && depth > 0) {
    if (source[i] === "{") depth++;
    else if (source[i] === "}") depth--;
    i++;
  }
  if (depth !== 0) return void 0;
  // `i` is one past the closing brace; exclude the brace itself.
  const body = source.slice(bodyStart, i - 1);
  return parseTypeMembers(body);
}
|
|
19997
|
+
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`.
 * Strips one leading `{` and one trailing `}` (when present) and hands the
 * remainder to `parseTypeMembers`.
 *
 * @param {string} annotation - Type annotation text.
 * @returns {Map<string, string>} Whatever `parseTypeMembers` returns for the body.
 */
function parseInlineTypeAnnotation(annotation) {
  let body = annotation;
  if (body.startsWith("{")) {
    body = body.slice(1);
  }
  if (body.endsWith("}")) {
    body = body.slice(0, -1);
  }
  return parseTypeMembers(body);
}
|
|
20001
|
+
/**
 * Parses the body of an object type / interface into a map of member name to
 * type text.
 *
 * Members may be separated by `;`, `,` or a newline. Separators are only
 * honoured at the top level: anything nested inside `{}`, `()`, `[]`, `<>` or
 * a string literal stays part of the enclosing member, so nested object types
 * do not leak their members into the result. `//` and block comments are
 * skipped. Whitespace inside a type is collapsed to single spaces.
 *
 * Members that do not look like `name: type` / `name?: type` (methods, index
 * signatures, modifiers such as `readonly`) are ignored.
 *
 * @param {string} body - Text between the outer braces of the type.
 * @returns {Map<string, string>} Member name to type text, in source order.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();
  const members = [];
  let depth = 0;
  let quote = null;
  let current = "";
  for (let i = 0; i < body.length; i++) {
    const ch = body[i];
    if (quote) {
      current += ch;
      if (ch === "\\" && i + 1 < body.length) {
        current += body[++i];
      } else if (ch === quote) {
        quote = null;
      }
      continue;
    }
    if (ch === "/" && body[i + 1] === "/") {
      // Line comment: skip to (but not past) the newline so it still separates.
      const eol = body.indexOf("\n", i);
      i = (eol === -1 ? body.length : eol) - 1;
      continue;
    }
    if (ch === "/" && body[i + 1] === "*") {
      const end = body.indexOf("*/", i + 2);
      i = end === -1 ? body.length : end + 1;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
      continue;
    }
    if (ch === "{" || ch === "(" || ch === "[" || ch === "<") {
      depth++;
    } else if (ch === "}" || ch === ")" || ch === "]" || (ch === ">" && body[i - 1] !== "=")) {
      // The ">" of an arrow ("=>") does not close a generic argument list.
      depth = Math.max(0, depth - 1);
    } else if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
      continue;
    }
    current += ch;
  }
  members.push(current);
  for (const raw of members) {
    const member = raw.replace(/\s+/g, " ").trim();
    if (!member) continue;
    const memberMatch = member.match(/^(\w+)\??\s*:\s*(.+)$/);
    if (memberMatch) {
      map.set(memberMatch[1], memberMatch[2].trim());
    }
  }
  return map;
}
|
|
20012
|
+
/**
 * Renders component metadata as a single line of searchable text:
 * "<Name> component. <description> Props: a (type) default: x, b."
 *
 * @param {string} componentName - Display name of the component.
 * @param {{ description?: string, props: Array<{ name: string, type?: string, default?: string }> }} meta
 * @returns {string} The rendered text, or "" when there is neither a
 *   description nor any props.
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  const hasProps = props.length > 0;
  if (!description && !hasProps) {
    return "";
  }

  const sentences = [`${componentName} component.`];
  if (description) {
    sentences.push(description);
  }
  if (hasProps) {
    const rendered = [];
    for (const { name, type, default: fallback } of props) {
      const typePart = type ? ` (${type})` : "";
      const defaultPart = fallback ? ` default: ${fallback}` : "";
      rendered.push(`${name}${typePart}${defaultPart}`);
    }
    sentences.push(`Props: ${rendered.join(", ")}.`);
  }
  return sentences.join(" ");
}
|
|
20029
|
+
/**
 * Reduces Svelte source to its plain text content.
 *
 * Steps, in order: drop `<script>` blocks, drop `<style>` blocks, replace
 * remaining tags with a space, replace `{...}` template expressions with a
 * space, collapse whitespace, trim.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {string} Whitespace-normalised text of the template.
 */
function normalizeSvelteToMarkdown(source) {
  const withoutScripts = source.replace(/<script[\s\S]*?<\/script>/g, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/g, "");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const withoutExpressions = withoutTags.replace(/\{[^}]+\}/g, " ");
  const collapsed = withoutExpressions.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
20032
|
+
async function loadContentFilesPages(cwd, config, maxPages) {
|
|
20033
|
+
const contentConfig = config.source.contentFiles;
|
|
20034
|
+
if (!contentConfig) {
|
|
20035
|
+
throw new Error("content-files config is missing");
|
|
19227
20036
|
}
|
|
19228
20037
|
const baseDir = path__default.default.resolve(cwd, contentConfig.baseDir);
|
|
19229
20038
|
const files = await fg__default.default(contentConfig.globs, {
|
|
@@ -19235,13 +20044,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19235
20044
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19236
20045
|
const pages = [];
|
|
19237
20046
|
for (const filePath of selected) {
|
|
19238
|
-
const raw = await
|
|
19239
|
-
|
|
20047
|
+
const raw = await fs8__default.default.readFile(filePath, "utf8");
|
|
20048
|
+
let markdown;
|
|
20049
|
+
let tags;
|
|
20050
|
+
if (filePath.endsWith(".md")) {
|
|
20051
|
+
markdown = raw;
|
|
20052
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
20053
|
+
const componentName = path__default.default.basename(filePath, ".svelte");
|
|
20054
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
20055
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
20056
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
20057
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
20058
|
+
tags = ["component"];
|
|
20059
|
+
} else {
|
|
20060
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
20061
|
+
}
|
|
19240
20062
|
pages.push({
|
|
19241
20063
|
url: filePathToUrl(filePath, baseDir),
|
|
19242
20064
|
markdown,
|
|
19243
20065
|
sourcePath: path__default.default.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19244
|
-
outgoingLinks: []
|
|
20066
|
+
outgoingLinks: [],
|
|
20067
|
+
...tags ? { tags } : {}
|
|
19245
20068
|
});
|
|
19246
20069
|
}
|
|
19247
20070
|
return pages;
|
|
@@ -19371,7 +20194,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19371
20194
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19372
20195
|
const pages = [];
|
|
19373
20196
|
for (const filePath of selected) {
|
|
19374
|
-
const html = await
|
|
20197
|
+
const html = await fs8__default.default.readFile(filePath, "utf8");
|
|
19375
20198
|
pages.push({
|
|
19376
20199
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19377
20200
|
html,
|
|
@@ -19434,7 +20257,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
19434
20257
|
}
|
|
19435
20258
|
async function loadRobotsTxtFromDir(dir) {
|
|
19436
20259
|
try {
|
|
19437
|
-
const content = await
|
|
20260
|
+
const content = await fs8__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
|
|
19438
20261
|
return parseRobotsTxt(content);
|
|
19439
20262
|
} catch {
|
|
19440
20263
|
return null;
|
|
@@ -19462,29 +20285,65 @@ function nonNegativeOrZero(value) {
|
|
|
19462
20285
|
/**
 * Normalises text for loose substring matching of queries against titles and
 * anchor text: lower-cases it, removes every character that is not a letter,
 * combining mark, digit or whitespace, collapses whitespace runs to a single
 * space and trims the ends.
 *
 * Letters and digits are recognised via Unicode property escapes, so
 * non-ASCII text (accented Latin, CJK, Cyrillic, ...) survives normalisation
 * instead of being stripped to an empty string. Results for pure ASCII input
 * are unchanged.
 *
 * @param {string} text - Text to normalise.
 * @returns {string} The normalised text (possibly empty).
 */
function normalizeForTitleMatch(text) {
  return text.toLowerCase().replace(/[^\p{L}\p{M}\p{N}\s]/gu, "").replace(/\s+/g, " ").trim();
}
|
|
19465
|
-
function rankHits(hits, config, query) {
|
|
20288
|
+
function rankHits(hits, config, query, debug) {
|
|
19466
20289
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
19467
20290
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
19468
20291
|
return hits.map((hit) => {
|
|
19469
|
-
|
|
20292
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20293
|
+
let score = baseScore;
|
|
20294
|
+
let incomingLinkBoostValue = 0;
|
|
19470
20295
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
19471
20296
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
19472
|
-
|
|
20297
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
20298
|
+
score += incomingLinkBoostValue;
|
|
19473
20299
|
}
|
|
20300
|
+
let depthBoostValue = 0;
|
|
19474
20301
|
if (config.ranking.enableDepthBoost) {
|
|
19475
20302
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
19476
|
-
|
|
20303
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
20304
|
+
score += depthBoostValue;
|
|
19477
20305
|
}
|
|
20306
|
+
let titleMatchBoostValue = 0;
|
|
19478
20307
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
19479
20308
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
19480
20309
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
19481
|
-
|
|
20310
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
20311
|
+
score += titleMatchBoostValue;
|
|
19482
20312
|
}
|
|
19483
20313
|
}
|
|
19484
|
-
|
|
20314
|
+
let freshnessBoostValue = 0;
|
|
20315
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
20316
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
20317
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
20318
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
20319
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
20320
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
20321
|
+
score += freshnessBoostValue;
|
|
20322
|
+
}
|
|
20323
|
+
}
|
|
20324
|
+
let anchorTextMatchBoostValue = 0;
|
|
20325
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
20326
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
20327
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
20328
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
20329
|
+
score += anchorTextMatchBoostValue;
|
|
20330
|
+
}
|
|
20331
|
+
}
|
|
20332
|
+
const result = {
|
|
19485
20333
|
hit,
|
|
19486
20334
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
19487
20335
|
};
|
|
20336
|
+
if (debug) {
|
|
20337
|
+
result.breakdown = {
|
|
20338
|
+
baseScore,
|
|
20339
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
20340
|
+
depthBoost: depthBoostValue,
|
|
20341
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
20342
|
+
freshnessBoost: freshnessBoostValue,
|
|
20343
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
20344
|
+
};
|
|
20345
|
+
}
|
|
20346
|
+
return result;
|
|
19488
20347
|
}).sort((a, b) => {
|
|
19489
20348
|
const delta = b.finalScore - a.finalScore;
|
|
19490
20349
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -19493,12 +20352,13 @@ function rankHits(hits, config, query) {
|
|
|
19493
20352
|
function trimByScoreGap(results, config) {
|
|
19494
20353
|
if (results.length === 0) return results;
|
|
19495
20354
|
const threshold = config.ranking.scoreGapThreshold;
|
|
19496
|
-
const
|
|
19497
|
-
if (
|
|
19498
|
-
const
|
|
19499
|
-
|
|
19500
|
-
|
|
19501
|
-
|
|
20355
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
20356
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
20357
|
+
const topScore = results[0].pageScore;
|
|
20358
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
20359
|
+
const minThreshold = topScore * minScoreRatio;
|
|
20360
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
20361
|
+
}
|
|
19502
20362
|
}
|
|
19503
20363
|
if (threshold > 0 && results.length > 1) {
|
|
19504
20364
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -19568,61 +20428,99 @@ function aggregateByPage(ranked, config) {
|
|
|
19568
20428
|
return Number.isNaN(delta) ? 0 : delta;
|
|
19569
20429
|
});
|
|
19570
20430
|
}
|
|
19571
|
-
function
|
|
19572
|
-
|
|
19573
|
-
const
|
|
19574
|
-
|
|
19575
|
-
|
|
19576
|
-
|
|
19577
|
-
|
|
19578
|
-
|
|
19579
|
-
|
|
19580
|
-
|
|
19581
|
-
|
|
19582
|
-
if (pageHit) {
|
|
19583
|
-
pagesWithChunks.add(url);
|
|
19584
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
19585
|
-
return {
|
|
19586
|
-
hit: ranked.hit,
|
|
19587
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
19588
|
-
};
|
|
20431
|
+
/**
 * Scores page-level search hits and returns them best-first.
 *
 * Starting from the hit's own score (non-finite scores count as -Infinity),
 * the enabled boosts are added in this order: incoming links, URL depth,
 * title match against the query, freshness. The sum is then multiplied by the
 * page weight returned by `findPageWeight`. Pages whose weight is exactly 0
 * are removed from the result.
 *
 * @param {Array<object>} pageHits - Hits with `url`, `title`, `score`, `depth`,
 *   `incomingLinks`, `tags` and optionally `description`, `routeFile`,
 *   `publishedAt`.
 * @param {object} config - Resolved config; only `config.ranking` is read.
 * @param {string} [query] - Search query used for the title-match boost.
 * @param {boolean} [debug] - When truthy, each result carries a `breakdown`
 *   object listing the individual score components.
 * @returns {Array<object>} Ranked pages sorted by descending `finalScore`.
 */
function rankPageHits(pageHits, config, query, debug) {
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
  const titleMatchWeight = config.ranking.weights.titleMatch;

  const scored = [];
  for (const hit of pageHits) {
    const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    let score = baseScore;
    const boosts = { incomingLink: 0, depth: 0, titleMatch: 0, freshness: 0 };
    // Records a boost for the breakdown and adds it to the running score.
    const applyBoost = (kind, amount) => {
      boosts[kind] = amount;
      score += amount;
    };

    if (config.ranking.enableIncomingLinkBoost) {
      const linkFactor = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
      applyBoost("incomingLink", linkFactor * config.ranking.weights.incomingLinks);
    }

    if (config.ranking.enableDepthBoost) {
      const depthFactor = 1 / (1 + nonNegativeOrZero(hit.depth));
      applyBoost("depth", depthFactor * config.ranking.weights.depth);
    }

    if (normalizedQuery && titleMatchWeight > 0) {
      const normalizedTitle = normalizeForTitleMatch(hit.title);
      const bothNonEmpty = normalizedQuery.length > 0 && normalizedTitle.length > 0;
      if (bothNonEmpty && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
        applyBoost("titleMatch", titleMatchWeight);
      }
    }

    if (config.ranking.enableFreshnessBoost) {
      const publishedAt = hit.publishedAt;
      if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
        // 864e5 ms = one day; future dates count as "today".
        const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
        const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
        applyBoost("freshness", decay * config.ranking.weights.freshness);
      }
    }

    const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
    if (pageWeight !== 1) {
      score *= pageWeight;
    }

    const ranked = {
      url: hit.url,
      title: hit.title,
      description: hit.description,
      routeFile: hit.routeFile,
      depth: hit.depth,
      incomingLinks: hit.incomingLinks,
      tags: hit.tags,
      baseScore,
      finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
      publishedAt: hit.publishedAt
    };
    if (debug) {
      ranked.breakdown = {
        baseScore,
        pageWeight,
        incomingLinkBoost: boosts.incomingLink,
        depthBoost: boosts.depth,
        titleMatchBoost: boosts.titleMatch,
        freshnessBoost: boosts.freshness
      };
    }
    scored.push(ranked);
  }

  const visible = scored.filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0);
  return visible.sort((a, b) => {
    const delta = b.finalScore - a.finalScore;
    // -Infinity minus -Infinity is NaN; treat such pairs as equal.
    if (Number.isNaN(delta)) return 0;
    return delta;
  });
}
|
|
20499
|
+
/**
 * Trims a best-first list of ranked pages in two passes.
 *
 * 1. When `minScoreRatio > 0` and the top score is finite and positive, pages
 *    scoring below `topScore * minScoreRatio` are removed.
 * 2. When `scoreGapThreshold > 0`, the list is cut just before the first page
 *    whose relative drop from its predecessor, `(prev - current) / prev`,
 *    reaches the threshold. Predecessors with a non-positive score are skipped.
 *
 * @param {Array<{ finalScore: number }>} results - Pages sorted by descending score.
 * @param {object} config - Reads `config.ranking.scoreGapThreshold` and
 *   `config.ranking.minScoreRatio`.
 * @returns {Array<{ finalScore: number }>} The trimmed list; the input array
 *   itself is returned when nothing is removed.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const { scoreGapThreshold: threshold, minScoreRatio } = config.ranking;

  let kept = results;
  if (minScoreRatio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * minScoreRatio;
      kept = kept.filter((page) => page.finalScore >= floor);
    }
  }

  if (!(threshold > 0) || kept.length <= 1) {
    return kept;
  }

  const cutAt = kept.findIndex((page, index) => {
    if (index === 0) return false;
    const prev = kept[index - 1].finalScore;
    return prev > 0 && (prev - page.finalScore) / prev >= threshold;
  });
  return cutAt === -1 ? kept : kept.slice(0, cutAt);
}
|
|
19626
20524
|
|
|
19627
20525
|
// src/utils/time.ts
|
|
19628
20526
|
function nowIso() {
|
|
@@ -19631,6 +20529,81 @@ function nowIso() {
|
|
|
19631
20529
|
/**
 * Milliseconds elapsed since a `process.hrtime.bigint()` reading.
 *
 * @param {bigint} start - Earlier reading of `process.hrtime.bigint()` (nanoseconds).
 * @returns {number} Elapsed time in milliseconds (fractional).
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
|
|
20532
|
+
/**
 * Resolves a page URL against the project's base URL.
 *
 * @param {string} pageUrl - Page path or URL.
 * @param {string} [baseUrl] - Base URL to resolve against.
 * @returns {string} The absolute URL, or `pageUrl` unchanged when no base URL
 *   is given or the pair cannot be parsed as a URL.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  if (baseUrl) {
    try {
      const absolute = new URL(pageUrl, baseUrl);
      return absolute.href;
    } catch {
      // Unparseable base/page combination: fall through to the raw page URL.
    }
  }
  return pageUrl;
}
|
|
20540
|
+
/**
 * Builds the contents of `llms.txt`: a Markdown index of the site's pages.
 *
 * Layout: an H1 with the configured title (falling back to the project id), an
 * optional blockquote description, then a "Pages" section with one bullet per
 * page, `- [title](url)` or `- [title](url): description`. Pages are ordered
 * by ascending depth, then by descending incoming-link count. The llms files
 * themselves (`/llms.txt`, `/llms-full.txt`) are never listed.
 *
 * @param {Array<object>} pages - Pages with `url`, `title`, `depth`,
 *   `incomingLinks` and optionally `description`.
 * @param {object} config - Reads `config.llmsTxt` and `config.project`.
 * @returns {string} File contents, ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const { llmsTxt, project } = config;
  const out = [`# ${llmsTxt.title ?? project.id}`];
  if (llmsTxt.description) {
    out.push("", `> ${llmsTxt.description}`);
  }

  const ordered = pages
    .filter((p) => p.url !== "/llms.txt" && p.url !== "/llms-full.txt")
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));

  if (ordered.length > 0) {
    out.push("", "## Pages", "");
    for (const page of ordered) {
      const link = `- [${page.title}](${resolvePageUrl(page.url, project.baseUrl)})`;
      out.push(page.description ? `${link}: ${page.description}` : link);
    }
  }

  out.push("");
  return out.join("\n");
}
|
|
20569
|
+
/**
 * Builds the contents of `llms-full.txt`: every page's Markdown in one file.
 *
 * Layout: an H1 with the configured title (falling back to the project id), an
 * optional blockquote description, then for each page a `---` rule, an H2 link
 * heading and the page's trimmed Markdown. Pages are ordered by ascending
 * depth, then by descending incoming-link count; `/llms.txt` and
 * `/llms-full.txt` are excluded.
 *
 * @param {Array<object>} pages - Pages with `url`, `title`, `depth`,
 *   `incomingLinks` and `markdown`.
 * @param {object} config - Reads `config.llmsTxt` and `config.project`.
 * @returns {string} File contents, ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const { llmsTxt, project } = config;
  const out = [`# ${llmsTxt.title ?? project.id}`];
  if (llmsTxt.description) {
    out.push("", `> ${llmsTxt.description}`);
  }

  const ordered = pages
    .filter((p) => p.url !== "/llms.txt" && p.url !== "/llms-full.txt")
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));

  for (const page of ordered) {
    const href = resolvePageUrl(page.url, project.baseUrl);
    out.push("", "---", "", `## [${page.title}](${href})`, "", page.markdown.trim());
  }

  out.push("");
  return out.join("\n");
}
|
|
20592
|
+
/**
 * Writes `llms.txt` (and, when `config.llmsTxt.generateFull` is set, the
 * companion full-content file) to disk, creating the output directory first.
 *
 * The companion file is named by inserting `-full` before the extension of the
 * configured output path (`llms.txt` becomes `llms-full.txt`). The name is
 * derived from the parsed path rather than by replacing a literal `.txt`
 * suffix, so an output path with a different or missing extension still gets
 * a distinct companion file and the index file is not overwritten.
 *
 * @param {Array<object>} pages - Indexed pages passed through to the generators.
 * @param {object} config - Reads `config.llmsTxt.outputPath` and
 *   `config.llmsTxt.generateFull`; also forwarded to the generators.
 * @param {string} cwd - Directory that `outputPath` is resolved against.
 * @param {{ info(message: string): void }} logger3 - Logger for progress messages.
 * @returns {Promise<void>} Resolves once all files are written; rejects with
 *   the underlying filesystem error.
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path__default.default.dirname(outputPath);
  await fs8__default.default.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs8__default.default.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (config.llmsTxt.generateFull) {
    const { dir, name, ext } = path__default.default.parse(outputPath);
    const fullPath = path__default.default.join(dir, `${name}-full${ext}`);
    const fullContent = generateLlmsFullTxt(pages, config);
    await fs8__default.default.writeFile(fullPath, fullContent, "utf8");
    const relativeFull = path__default.default.relative(cwd, fullPath);
    logger3.info(`Generated llms-full.txt at ${relativeFull}`);
  }
}
|
|
19634
20607
|
|
|
19635
20608
|
// src/indexing/pipeline.ts
|
|
19636
20609
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -19649,16 +20622,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
19649
20622
|
if (joined.length <= maxChars) return joined;
|
|
19650
20623
|
return joined.slice(0, maxChars).trim();
|
|
19651
20624
|
}
|
|
20625
|
+
/**
 * Computes a content fingerprint for an indexed page so unchanged pages can be
 * detected between runs.
 *
 * The fingerprint is `sha256` over the following fields joined with "|", in
 * this order: title, description, sorted keywords, sorted tags, markdown,
 * outgoingLinks (stringified), publishedAt, incomingAnchorText, sorted
 * outgoing link URLs, and `meta` serialised as JSON with its keys sorted.
 * Missing optional fields contribute an empty string.
 *
 * @param {object} page - Indexed page record.
 * @returns {*} Whatever `sha256` returns for the joined fingerprint string.
 */
function buildPageContentHash(page) {
  // Copy before sorting so the page's own arrays are left untouched.
  const sortedCsv = (list) => list.slice().sort().join(",");
  const metaJson = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";

  const fingerprint = [
    page.title,
    page.description ?? "",
    sortedCsv(page.keywords ?? []),
    sortedCsv(page.tags),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    sortedCsv(page.outgoingLinkUrls ?? []),
    metaJson
  ].join("|");

  return sha256(fingerprint);
}
|
|
19652
20640
|
var IndexPipeline = class _IndexPipeline {
|
|
19653
20641
|
cwd;
|
|
19654
20642
|
config;
|
|
19655
20643
|
store;
|
|
19656
20644
|
logger;
|
|
20645
|
+
hooks;
|
|
19657
20646
|
constructor(options) {
|
|
19658
20647
|
this.cwd = options.cwd;
|
|
19659
20648
|
this.config = options.config;
|
|
19660
20649
|
this.store = options.store;
|
|
19661
20650
|
this.logger = options.logger;
|
|
20651
|
+
this.hooks = options.hooks;
|
|
19662
20652
|
}
|
|
19663
20653
|
static async create(options = {}) {
|
|
19664
20654
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
@@ -19668,7 +20658,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19668
20658
|
cwd,
|
|
19669
20659
|
config,
|
|
19670
20660
|
store,
|
|
19671
|
-
logger: options.logger ?? new Logger()
|
|
20661
|
+
logger: options.logger ?? new Logger(),
|
|
20662
|
+
hooks: options.hooks ?? {}
|
|
19672
20663
|
});
|
|
19673
20664
|
}
|
|
19674
20665
|
getConfig() {
|
|
@@ -19689,7 +20680,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19689
20680
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19690
20681
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
19691
20682
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19692
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
20683
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
19693
20684
|
if (options.force) {
|
|
19694
20685
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19695
20686
|
}
|
|
@@ -19697,9 +20688,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19697
20688
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19698
20689
|
}
|
|
19699
20690
|
const manifestStart = stageStart();
|
|
19700
|
-
const
|
|
20691
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
19701
20692
|
stageEnd("manifest", manifestStart);
|
|
19702
|
-
this.logger.debug(`Manifest: ${
|
|
20693
|
+
this.logger.debug(`Manifest: ${existingPageHashes.size} existing page hashes loaded`);
|
|
19703
20694
|
const sourceStart = stageStart();
|
|
19704
20695
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19705
20696
|
let sourcePages;
|
|
@@ -19776,11 +20767,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19776
20767
|
);
|
|
19777
20768
|
continue;
|
|
19778
20769
|
}
|
|
19779
|
-
|
|
20770
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
20771
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
20772
|
+
}
|
|
20773
|
+
let accepted;
|
|
20774
|
+
if (this.hooks.transformPage) {
|
|
20775
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20776
|
+
if (transformed === null) {
|
|
20777
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
20778
|
+
continue;
|
|
20779
|
+
}
|
|
20780
|
+
accepted = transformed;
|
|
20781
|
+
} else {
|
|
20782
|
+
accepted = extracted;
|
|
20783
|
+
}
|
|
20784
|
+
extractedPages.push(accepted);
|
|
19780
20785
|
this.logger.event("page_extracted", {
|
|
19781
|
-
url:
|
|
20786
|
+
url: accepted.url
|
|
19782
20787
|
});
|
|
19783
20788
|
}
|
|
20789
|
+
const customRecords = options.customRecords ?? [];
|
|
20790
|
+
if (customRecords.length > 0) {
|
|
20791
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
20792
|
+
for (const record of customRecords) {
|
|
20793
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20794
|
+
const normalized = normalizeMarkdown(record.content);
|
|
20795
|
+
if (!normalized.trim()) {
|
|
20796
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
20797
|
+
continue;
|
|
20798
|
+
}
|
|
20799
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
20800
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
20801
|
+
const extracted = {
|
|
20802
|
+
url: normalizedUrl,
|
|
20803
|
+
title: record.title,
|
|
20804
|
+
markdown: normalized,
|
|
20805
|
+
outgoingLinks: [],
|
|
20806
|
+
noindex: false,
|
|
20807
|
+
tags,
|
|
20808
|
+
weight: record.weight
|
|
20809
|
+
};
|
|
20810
|
+
let accepted;
|
|
20811
|
+
if (this.hooks.transformPage) {
|
|
20812
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20813
|
+
if (transformed === null) {
|
|
20814
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
20815
|
+
continue;
|
|
20816
|
+
}
|
|
20817
|
+
accepted = transformed;
|
|
20818
|
+
} else {
|
|
20819
|
+
accepted = extracted;
|
|
20820
|
+
}
|
|
20821
|
+
extractedPages.push(accepted);
|
|
20822
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
20823
|
+
}
|
|
20824
|
+
}
|
|
19784
20825
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
19785
20826
|
const uniquePages = [];
|
|
19786
20827
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -19813,15 +20854,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19813
20854
|
const linkStart = stageStart();
|
|
19814
20855
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19815
20856
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20857
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
19816
20858
|
for (const page of indexablePages) {
|
|
19817
20859
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19818
20860
|
}
|
|
19819
20861
|
for (const page of indexablePages) {
|
|
19820
|
-
|
|
20862
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
20863
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
20864
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
19821
20865
|
if (!pageSet.has(outgoing)) {
|
|
19822
20866
|
continue;
|
|
19823
20867
|
}
|
|
19824
|
-
|
|
20868
|
+
if (!seenForCount.has(outgoing)) {
|
|
20869
|
+
seenForCount.add(outgoing);
|
|
20870
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
20871
|
+
}
|
|
20872
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
20873
|
+
seenForAnchor.add(outgoing);
|
|
20874
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
20875
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
20876
|
+
}
|
|
20877
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
20878
|
+
}
|
|
19825
20879
|
}
|
|
19826
20880
|
}
|
|
19827
20881
|
stageEnd("links", linkStart);
|
|
@@ -19840,6 +20894,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19840
20894
|
});
|
|
19841
20895
|
}
|
|
19842
20896
|
}
|
|
20897
|
+
for (const record of customRecords) {
|
|
20898
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20899
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
20900
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
20901
|
+
routeFile: "",
|
|
20902
|
+
routeResolution: "exact"
|
|
20903
|
+
});
|
|
20904
|
+
}
|
|
20905
|
+
}
|
|
19843
20906
|
for (const page of indexablePages) {
|
|
19844
20907
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19845
20908
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -19857,6 +20920,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19857
20920
|
} else {
|
|
19858
20921
|
routeExact += 1;
|
|
19859
20922
|
}
|
|
20923
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
20924
|
+
let incomingAnchorText;
|
|
20925
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
20926
|
+
let joined = "";
|
|
20927
|
+
for (const phrase of anchorSet) {
|
|
20928
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
20929
|
+
if (next2.length > 500) break;
|
|
20930
|
+
joined = next2;
|
|
20931
|
+
}
|
|
20932
|
+
incomingAnchorText = joined || void 0;
|
|
20933
|
+
}
|
|
19860
20934
|
const indexedPage = {
|
|
19861
20935
|
url: page.url,
|
|
19862
20936
|
title: page.title,
|
|
@@ -19866,40 +20940,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19866
20940
|
generatedAt: nowIso(),
|
|
19867
20941
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
19868
20942
|
outgoingLinks: page.outgoingLinks.length,
|
|
20943
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
19869
20944
|
depth: getUrlDepth(page.url),
|
|
19870
20945
|
tags: page.tags,
|
|
19871
20946
|
markdown: page.markdown,
|
|
19872
20947
|
description: page.description,
|
|
19873
|
-
keywords: page.keywords
|
|
20948
|
+
keywords: page.keywords,
|
|
20949
|
+
publishedAt: page.publishedAt,
|
|
20950
|
+
incomingAnchorText,
|
|
20951
|
+
meta: page.meta
|
|
19874
20952
|
};
|
|
19875
20953
|
pages.push(indexedPage);
|
|
19876
20954
|
this.logger.event("page_indexed", { url: page.url });
|
|
19877
20955
|
}
|
|
20956
|
+
const pageRecords = pages.map((p) => {
|
|
20957
|
+
const summary = buildPageSummary(p);
|
|
20958
|
+
return {
|
|
20959
|
+
url: p.url,
|
|
20960
|
+
title: p.title,
|
|
20961
|
+
markdown: p.markdown,
|
|
20962
|
+
projectId: scope.projectId,
|
|
20963
|
+
scopeName: scope.scopeName,
|
|
20964
|
+
routeFile: p.routeFile,
|
|
20965
|
+
routeResolution: p.routeResolution,
|
|
20966
|
+
incomingLinks: p.incomingLinks,
|
|
20967
|
+
outgoingLinks: p.outgoingLinks,
|
|
20968
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
20969
|
+
depth: p.depth,
|
|
20970
|
+
tags: p.tags,
|
|
20971
|
+
indexedAt: p.generatedAt,
|
|
20972
|
+
summary,
|
|
20973
|
+
description: p.description,
|
|
20974
|
+
keywords: p.keywords,
|
|
20975
|
+
contentHash: buildPageContentHash(p),
|
|
20976
|
+
publishedAt: p.publishedAt,
|
|
20977
|
+
meta: p.meta
|
|
20978
|
+
};
|
|
20979
|
+
});
|
|
20980
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
20981
|
+
const changedPages = pageRecords.filter(
|
|
20982
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
20983
|
+
);
|
|
20984
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
19878
20985
|
if (!options.dryRun) {
|
|
19879
|
-
|
|
19880
|
-
|
|
19881
|
-
|
|
19882
|
-
|
|
19883
|
-
|
|
19884
|
-
|
|
19885
|
-
|
|
19886
|
-
|
|
19887
|
-
|
|
19888
|
-
|
|
19889
|
-
|
|
19890
|
-
|
|
19891
|
-
|
|
19892
|
-
|
|
19893
|
-
|
|
19894
|
-
|
|
19895
|
-
|
|
19896
|
-
|
|
19897
|
-
|
|
19898
|
-
|
|
19899
|
-
|
|
19900
|
-
|
|
20986
|
+
if (options.force) {
|
|
20987
|
+
await this.store.deletePages(scope);
|
|
20988
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
20989
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
20990
|
+
id: r.url,
|
|
20991
|
+
data: r.summary ?? r.title,
|
|
20992
|
+
metadata: {
|
|
20993
|
+
title: r.title,
|
|
20994
|
+
url: r.url,
|
|
20995
|
+
description: r.description ?? "",
|
|
20996
|
+
keywords: r.keywords ?? [],
|
|
20997
|
+
summary: r.summary ?? "",
|
|
20998
|
+
tags: r.tags,
|
|
20999
|
+
markdown: r.markdown,
|
|
21000
|
+
routeFile: r.routeFile,
|
|
21001
|
+
routeResolution: r.routeResolution,
|
|
21002
|
+
incomingLinks: r.incomingLinks,
|
|
21003
|
+
outgoingLinks: r.outgoingLinks,
|
|
21004
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
21005
|
+
depth: r.depth,
|
|
21006
|
+
indexedAt: r.indexedAt,
|
|
21007
|
+
contentHash: r.contentHash ?? "",
|
|
21008
|
+
publishedAt: r.publishedAt ?? null,
|
|
21009
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
21010
|
+
}
|
|
21011
|
+
}));
|
|
21012
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
21013
|
+
} else {
|
|
21014
|
+
if (changedPages.length > 0) {
|
|
21015
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
21016
|
+
const pageDocs = changedPages.map((r) => ({
|
|
21017
|
+
id: r.url,
|
|
21018
|
+
data: r.summary ?? r.title,
|
|
21019
|
+
metadata: {
|
|
21020
|
+
title: r.title,
|
|
21021
|
+
url: r.url,
|
|
21022
|
+
description: r.description ?? "",
|
|
21023
|
+
keywords: r.keywords ?? [],
|
|
21024
|
+
summary: r.summary ?? "",
|
|
21025
|
+
tags: r.tags,
|
|
21026
|
+
markdown: r.markdown,
|
|
21027
|
+
routeFile: r.routeFile,
|
|
21028
|
+
routeResolution: r.routeResolution,
|
|
21029
|
+
incomingLinks: r.incomingLinks,
|
|
21030
|
+
outgoingLinks: r.outgoingLinks,
|
|
21031
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
21032
|
+
depth: r.depth,
|
|
21033
|
+
indexedAt: r.indexedAt,
|
|
21034
|
+
contentHash: r.contentHash ?? "",
|
|
21035
|
+
publishedAt: r.publishedAt ?? null,
|
|
21036
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
21037
|
+
}
|
|
21038
|
+
}));
|
|
21039
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
21040
|
+
}
|
|
21041
|
+
if (deletedPageUrls.length > 0) {
|
|
21042
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
21043
|
+
}
|
|
21044
|
+
}
|
|
19901
21045
|
}
|
|
21046
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
21047
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
19902
21048
|
stageEnd("pages", pagesStart);
|
|
21049
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
19903
21050
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
19904
21051
|
const chunkStart = stageStart();
|
|
19905
21052
|
this.logger.info("Chunking pages...");
|
|
@@ -19908,6 +21055,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19908
21055
|
if (typeof maxChunks === "number") {
|
|
19909
21056
|
chunks = chunks.slice(0, maxChunks);
|
|
19910
21057
|
}
|
|
21058
|
+
if (this.hooks.transformChunk) {
|
|
21059
|
+
const transformed = [];
|
|
21060
|
+
for (const chunk of chunks) {
|
|
21061
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
21062
|
+
if (result === null) {
|
|
21063
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
21064
|
+
continue;
|
|
21065
|
+
}
|
|
21066
|
+
transformed.push(result);
|
|
21067
|
+
}
|
|
21068
|
+
chunks = transformed;
|
|
21069
|
+
}
|
|
19911
21070
|
for (const chunk of chunks) {
|
|
19912
21071
|
this.logger.event("chunked", {
|
|
19913
21072
|
url: chunk.url,
|
|
@@ -19920,7 +21079,12 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19920
21079
|
for (const chunk of chunks) {
|
|
19921
21080
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
19922
21081
|
}
|
|
19923
|
-
const
|
|
21082
|
+
const chunkHashStart = stageStart();
|
|
21083
|
+
const currentChunkKeys = chunks.map((c) => c.chunkKey);
|
|
21084
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.fetchContentHashesForKeys(currentChunkKeys, scope);
|
|
21085
|
+
stageEnd("chunk_hashes", chunkHashStart);
|
|
21086
|
+
this.logger.debug(`Fetched ${existingHashes.size} existing chunk hashes for ${currentChunkKeys.length} current keys`);
|
|
21087
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
19924
21088
|
if (options.force) {
|
|
19925
21089
|
return true;
|
|
19926
21090
|
}
|
|
@@ -19933,37 +21097,45 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19933
21097
|
}
|
|
19934
21098
|
return existingHash !== chunk.contentHash;
|
|
19935
21099
|
});
|
|
19936
|
-
const
|
|
21100
|
+
const existingChunkIds = options.force ? /* @__PURE__ */ new Set() : await this.store.scanChunkIds(scope);
|
|
21101
|
+
const deletes = [...existingChunkIds].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
21102
|
+
if (this.hooks.beforeIndex) {
|
|
21103
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
21104
|
+
}
|
|
19937
21105
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19938
21106
|
const upsertStart = stageStart();
|
|
19939
21107
|
let documentsUpserted = 0;
|
|
19940
21108
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
19941
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
19942
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
21109
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
19943
21110
|
const docs = changedChunks.map((chunk) => {
|
|
19944
|
-
const
|
|
19945
|
-
|
|
19946
|
-
|
|
19947
|
-
|
|
19948
|
-
|
|
19949
|
-
|
|
19950
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
19951
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
21111
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
21112
|
+
if (embeddingText.length > 2e3) {
|
|
21113
|
+
this.logger.warn(
|
|
21114
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
21115
|
+
);
|
|
21116
|
+
}
|
|
19952
21117
|
return {
|
|
19953
21118
|
id: chunk.chunkKey,
|
|
19954
|
-
|
|
21119
|
+
data: embeddingText,
|
|
19955
21120
|
metadata: {
|
|
19956
|
-
|
|
19957
|
-
scopeName: scope.scopeName,
|
|
21121
|
+
url: chunk.url,
|
|
19958
21122
|
path: chunk.path,
|
|
21123
|
+
title: chunk.title,
|
|
21124
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
21125
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
19959
21126
|
snippet: chunk.snippet,
|
|
21127
|
+
chunkText: embeddingText,
|
|
21128
|
+
tags: chunk.tags,
|
|
19960
21129
|
ordinal: chunk.ordinal,
|
|
19961
21130
|
contentHash: chunk.contentHash,
|
|
19962
21131
|
depth: chunk.depth,
|
|
19963
21132
|
incomingLinks: chunk.incomingLinks,
|
|
19964
21133
|
routeFile: chunk.routeFile,
|
|
19965
21134
|
description: chunk.description ?? "",
|
|
19966
|
-
keywords:
|
|
21135
|
+
keywords: chunk.keywords ?? [],
|
|
21136
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
21137
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
21138
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
19967
21139
|
}
|
|
19968
21140
|
};
|
|
19969
21141
|
});
|
|
@@ -19981,9 +21153,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19981
21153
|
} else {
|
|
19982
21154
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
19983
21155
|
}
|
|
21156
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
21157
|
+
const llmsStart = stageStart();
|
|
21158
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
21159
|
+
stageEnd("llms_txt", llmsStart);
|
|
21160
|
+
}
|
|
19984
21161
|
this.logger.info("Done.");
|
|
19985
|
-
|
|
21162
|
+
const stats = {
|
|
19986
21163
|
pagesProcessed: pages.length,
|
|
21164
|
+
pagesChanged,
|
|
21165
|
+
pagesDeleted,
|
|
19987
21166
|
chunksTotal: chunks.length,
|
|
19988
21167
|
chunksChanged: changedChunks.length,
|
|
19989
21168
|
documentsUpserted,
|
|
@@ -19992,16 +21171,143 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19992
21171
|
routeBestEffort,
|
|
19993
21172
|
stageTimingsMs
|
|
19994
21173
|
};
|
|
21174
|
+
if (this.hooks.afterIndex) {
|
|
21175
|
+
await this.hooks.afterIndex(stats);
|
|
21176
|
+
}
|
|
21177
|
+
return stats;
|
|
19995
21178
|
}
|
|
19996
21179
|
};
|
|
21180
|
+
|
|
21181
|
+
// src/search/related-pages.ts
|
|
21182
|
+
/**
 * Scores how closely two URL paths are related by their shared leading segments.
 *
 * Each path is split on "/" with empty segments dropped. The score is
 * `2 * sharedPrefixLength / (segmentCountA + segmentCountB)`, where the shared
 * prefix is the run of identical segments starting at position 0 (comparison
 * stops at the first mismatch).
 *
 * @param {string} urlA - First URL path.
 * @param {string} urlB - Second URL path.
 * @returns {number} 1 when neither path has segments, 0 when exactly one has
 *   none, otherwise a value between 0 and 1.
 */
function diceScore(urlA, urlB) {
  const toSegments = (url) => url.split("/").filter(Boolean);
  const partsA = toSegments(urlA);
  const partsB = toSegments(urlB);
  const totalSegments = partsA.length + partsB.length;
  if (totalSegments === 0) return 1;
  if (partsA.length === 0 || partsB.length === 0) return 0;
  let prefixLength = 0;
  while (
    prefixLength < partsA.length &&
    prefixLength < partsB.length &&
    partsA[prefixLength] === partsB[prefixLength]
  ) {
    prefixLength += 1;
  }
  return 2 * prefixLength / totalSegments;
}
|
|
21198
|
+
/**
 * Combines link, path-similarity and semantic signals into one score.
 *
 * @param {boolean} isLinked - Whether the pages are linked; contributes 0.5 when truthy.
 * @param {number} dice - Path-similarity score, weighted by 0.3.
 * @param {number} semantic - Semantic similarity score, weighted by 0.2.
 * @returns {number} The weighted sum of the three signals.
 */
function compositeScore(isLinked, dice, semantic) {
  const linkContribution = isLinked ? 0.5 : 0;
  // Same left-to-right addition as before so floating-point results are unchanged.
  return linkContribution + 0.3 * dice + 0.2 * semantic;
}
|
|
21201
|
+
/**
 * Picks the single label that best describes how two pages relate.
 *
 * Precedence: outgoing link, then incoming link, then path similarity
 * (`dice` strictly above 0.4), otherwise semantic.
 *
 * @param {boolean} isOutgoing - Truthy when an outgoing-link relationship exists.
 * @param {boolean} isIncoming - Truthy when an incoming-link relationship exists.
 * @param {number} dice - Path-similarity score.
 * @returns {"outgoing_link"|"incoming_link"|"sibling"|"semantic"} The relationship label.
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  if (isOutgoing) {
    return "outgoing_link";
  }
  if (isIncoming) {
    return "incoming_link";
  }
  return dice > 0.4 ? "sibling" : "semantic";
}
|
|
21207
|
+
|
|
21208
|
+
// src/search/engine.ts
|
|
21209
|
+
/**
 * Optional per-request overrides for ranking and search settings.
 * Every field is optional; ratio-like fields are numbers constrained to 0..1.
 */
var rankingOverridesSchema = (() => {
  const { z } = zod;
  // Each factory returns a fresh schema instance, as in the inline form.
  const optionalFlag = () => z.boolean().optional();
  const optionalRatio = () => z.number().min(0).max(1).optional();
  const optionalNumber = () => z.number().optional();

  const weights = z.object({
    incomingLinks: optionalNumber(),
    depth: optionalNumber(),
    aggregation: optionalNumber(),
    titleMatch: optionalNumber()
  }).optional();

  const ranking = z.object({
    enableIncomingLinkBoost: optionalFlag(),
    enableDepthBoost: optionalFlag(),
    aggregationCap: z.number().int().positive().optional(),
    aggregationDecay: optionalRatio(),
    minChunkScoreRatio: optionalRatio(),
    minScoreRatio: optionalRatio(),
    scoreGapThreshold: optionalRatio(),
    weights
  }).optional();

  const search = z.object({
    pageSearchWeight: optionalRatio()
  }).optional();

  return z.object({ ranking, search }).optional();
})();
|
|
19997
21229
|
/**
 * Validation schema for a search request.
 * Only `q` is required (trimmed, at least one character); everything else is optional.
 */
var requestSchema = (() => {
  const { z } = zod;
  // Metadata filter values may be strings, numbers or booleans.
  const filterValue = z.union([z.string(), z.number(), z.boolean()]);
  return z.object({
    q: z.string().trim().min(1),
    topK: z.number().int().positive().max(100).optional(),
    scope: z.string().optional(),
    pathPrefix: z.string().optional(),
    tags: z.array(z.string()).optional(),
    filters: z.record(z.string(), filterValue).optional(),
    groupBy: z.enum(["page", "chunk"]).optional(),
    maxSubResults: z.number().int().positive().max(20).optional(),
    debug: z.boolean().optional(),
    rankingOverrides: rankingOverridesSchema
  });
})();
|
|
21241
|
+
/** Default and upper bound for the number of pages collected by getSiteStructure. */
var MAX_SITE_STRUCTURE_PAGES = 2000;

/**
 * Creates an empty site-structure tree node.
 *
 * @param {string} url - URL path the node represents.
 * @param {number} depth - Number of path segments from the root.
 * @returns {{url: string, title: string, depth: number, routeFile: string, isIndexed: boolean, childCount: number, children: Array}}
 *   A node with blank title/routeFile, not marked as indexed, and no children.
 */
function makeNode(url, depth) {
  const node = { url };
  node.title = "";
  node.depth = depth;
  node.routeFile = "";
  node.isIndexed = false;
  node.childCount = 0;
  // A fresh array per node so siblings never share children.
  node.children = [];
  return node;
}
|
|
21245
|
+
/**
 * Builds a URL-path tree from a flat list of indexed pages.
 *
 * Every page URL is normalized and split into segments; a node is created for
 * each path prefix, so intermediate paths that have no page of their own still
 * appear (with `isIndexed: false`). Children are sorted by URL and each node's
 * `childCount` is set to its number of direct children.
 *
 * Nodes are stored and looked up under one canonical key rebuilt from the
 * segments ("/" + segments joined by "/"). A normalized URL that still carries
 * a trailing or duplicate slash therefore resolves to its node instead of
 * missing the map and throwing.
 *
 * @param {Array<{url: string, title: string, routeFile: string}>} pages - Pages to place in the tree.
 * @param {string} [pathPrefix] - When given, the subtree rooted at this path is
 *   returned instead of the whole tree.
 * @returns {object} The root node, or the subtree root for `pathPrefix`. If no
 *   node exists for the prefix, an empty node for it is returned.
 */
function buildTree(pages, pathPrefix) {
  const toSegments = (urlPath) => urlPath.split("/").filter(Boolean);
  const nodeMap = /* @__PURE__ */ new Map();
  const root2 = makeNode("/", 0);
  nodeMap.set("/", root2);

  for (const page of pages) {
    const segments = toSegments(normalizeUrlPath(page.url));
    // Walk down from the root, creating and linking missing ancestors on the way.
    let node = root2;
    let partialUrl = "";
    segments.forEach((segment, index) => {
      partialUrl += `/${segment}`;
      let child = nodeMap.get(partialUrl);
      if (!child) {
        child = makeNode(partialUrl, index + 1);
        nodeMap.set(partialUrl, child);
        node.children.push(child);
      }
      node = child;
    });
    // With zero segments `node` is still the root, i.e. the page is the site root.
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  }

  const sortAndCount = (node) => {
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      sortAndCount(child);
    }
  };
  sortAndCount(root2);

  if (pathPrefix) {
    const normalizedPrefix = normalizeUrlPath(pathPrefix);
    const prefixSegments = toSegments(normalizedPrefix);
    // Same canonical key form as used when inserting nodes above.
    const subtreeRoot = nodeMap.get("/" + prefixSegments.join("/"));
    return subtreeRoot ?? makeNode(normalizedPrefix, prefixSegments.length);
  }
  return root2;
}
|
|
21294
|
+
/**
 * Returns a copy of `base` with the `search`, `ranking` and `ranking.weights`
 * sections shallow-merged with the matching sections of `overrides`.
 * Override values win; `base` is not modified.
 *
 * @param {object} base - Configuration with `search`, `ranking` and `ranking.weights` objects.
 * @param {object} overrides - Partial overrides; `search`, `ranking` and `ranking.weights` may be absent.
 * @returns {object} A new configuration object.
 */
function mergeRankingOverrides(base, overrides) {
  const search = { ...base.search, ...overrides.search };
  const weights = { ...base.ranking.weights, ...overrides.ranking?.weights };
  // `weights` comes last so a partial override cannot replace the merged weight set.
  const ranking = { ...base.ranking, ...overrides.ranking, weights };
  return { ...base, search, ranking };
}
|
|
20005
21311
|
var SearchEngine = class _SearchEngine {
|
|
20006
21312
|
cwd;
|
|
20007
21313
|
config;
|
|
@@ -20031,125 +21337,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
20031
21337
|
}
|
|
20032
21338
|
const input = parsed.data;
|
|
20033
21339
|
const totalStart = process.hrtime.bigint();
|
|
21340
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
20034
21341
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20035
21342
|
const topK = input.topK ?? 10;
|
|
21343
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
20036
21344
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20037
|
-
const
|
|
20038
|
-
const
|
|
20039
|
-
|
|
20040
|
-
|
|
20041
|
-
|
|
20042
|
-
|
|
20043
|
-
|
|
20044
|
-
|
|
20045
|
-
|
|
21345
|
+
const queryText = input.q;
|
|
21346
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
21347
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
21348
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
21349
|
+
const metaFilter = metaFilterStr || void 0;
|
|
21350
|
+
const applyPagePostFilters = (hits) => {
|
|
21351
|
+
let filtered = hits;
|
|
21352
|
+
if (pathPrefix) {
|
|
21353
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
21354
|
+
}
|
|
21355
|
+
if (filterTags) {
|
|
21356
|
+
filtered = filtered.filter(
|
|
21357
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
21358
|
+
);
|
|
20046
21359
|
}
|
|
20047
|
-
|
|
20048
|
-
|
|
20049
|
-
const
|
|
21360
|
+
return filtered;
|
|
21361
|
+
};
|
|
21362
|
+
const applyChunkPostFilters = (hits) => {
|
|
21363
|
+
let filtered = hits;
|
|
21364
|
+
if (filterTags) {
|
|
21365
|
+
filtered = filtered.filter(
|
|
21366
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21367
|
+
);
|
|
21368
|
+
}
|
|
21369
|
+
return filtered;
|
|
21370
|
+
};
|
|
20050
21371
|
const searchStart = process.hrtime.bigint();
|
|
20051
|
-
|
|
20052
|
-
|
|
20053
|
-
const
|
|
20054
|
-
const
|
|
20055
|
-
|
|
20056
|
-
|
|
20057
|
-
|
|
20058
|
-
|
|
20059
|
-
|
|
20060
|
-
|
|
20061
|
-
|
|
20062
|
-
|
|
20063
|
-
|
|
20064
|
-
|
|
20065
|
-
|
|
20066
|
-
|
|
20067
|
-
|
|
20068
|
-
{
|
|
20069
|
-
limit: chunkLimit,
|
|
20070
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20071
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20072
|
-
reranking: false,
|
|
20073
|
-
filter
|
|
20074
|
-
},
|
|
21372
|
+
if (groupByPage) {
|
|
21373
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
21374
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
21375
|
+
const pageHits = await this.store.searchPagesByText(
|
|
21376
|
+
queryText,
|
|
21377
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
21378
|
+
resolvedScope
|
|
21379
|
+
);
|
|
21380
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
21381
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
21382
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
21383
|
+
const topPages = rankedPages.slice(0, topK);
|
|
21384
|
+
const chunkPromises = topPages.map(
|
|
21385
|
+
(page) => this.store.searchChunksByUrl(
|
|
21386
|
+
queryText,
|
|
21387
|
+
page.url,
|
|
21388
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
20075
21389
|
resolvedScope
|
|
20076
|
-
)
|
|
20077
|
-
|
|
20078
|
-
const
|
|
20079
|
-
|
|
21390
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
21391
|
+
);
|
|
21392
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
21393
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21394
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
21395
|
+
return {
|
|
21396
|
+
q: input.q,
|
|
21397
|
+
scope: resolvedScope.scopeName,
|
|
21398
|
+
results,
|
|
21399
|
+
meta: {
|
|
21400
|
+
timingsMs: {
|
|
21401
|
+
search: Math.round(searchMs),
|
|
21402
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21403
|
+
}
|
|
21404
|
+
}
|
|
21405
|
+
};
|
|
20080
21406
|
} else {
|
|
21407
|
+
const candidateK = Math.max(50, topK);
|
|
21408
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
20081
21409
|
const hits = await this.store.search(
|
|
20082
|
-
|
|
20083
|
-
{
|
|
20084
|
-
limit: candidateK,
|
|
20085
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20086
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20087
|
-
reranking: this.config.search.reranking,
|
|
20088
|
-
filter
|
|
20089
|
-
},
|
|
21410
|
+
queryText,
|
|
21411
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
20090
21412
|
resolvedScope
|
|
20091
21413
|
);
|
|
20092
|
-
|
|
20093
|
-
|
|
20094
|
-
|
|
20095
|
-
|
|
20096
|
-
|
|
20097
|
-
|
|
20098
|
-
|
|
20099
|
-
|
|
20100
|
-
|
|
20101
|
-
|
|
20102
|
-
|
|
20103
|
-
|
|
21414
|
+
let filtered = hits;
|
|
21415
|
+
if (pathPrefix) {
|
|
21416
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
21417
|
+
}
|
|
21418
|
+
if (filterTags) {
|
|
21419
|
+
filtered = filtered.filter(
|
|
21420
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21421
|
+
);
|
|
21422
|
+
}
|
|
21423
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
21424
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21425
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
21426
|
+
return {
|
|
21427
|
+
q: input.q,
|
|
21428
|
+
scope: resolvedScope.scopeName,
|
|
21429
|
+
results,
|
|
21430
|
+
meta: {
|
|
21431
|
+
timingsMs: {
|
|
21432
|
+
search: Math.round(searchMs),
|
|
21433
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21434
|
+
}
|
|
20104
21435
|
}
|
|
21436
|
+
};
|
|
21437
|
+
}
|
|
21438
|
+
}
|
|
21439
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
21440
|
+
return rankedPages.map((page, i) => {
|
|
21441
|
+
const chunks = allChunks[i] ?? [];
|
|
21442
|
+
const bestChunk = chunks[0];
|
|
21443
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
21444
|
+
const result = {
|
|
21445
|
+
url: page.url,
|
|
21446
|
+
title: page.title,
|
|
21447
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
21448
|
+
snippet,
|
|
21449
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
21450
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
21451
|
+
routeFile: page.routeFile,
|
|
21452
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
21453
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
21454
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
21455
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
21456
|
+
headingPath: c.metadata.headingPath,
|
|
21457
|
+
score: Number(c.score.toFixed(6))
|
|
21458
|
+
})) : void 0
|
|
21459
|
+
};
|
|
21460
|
+
if (debug && page.breakdown) {
|
|
21461
|
+
result.breakdown = {
|
|
21462
|
+
baseScore: page.breakdown.baseScore,
|
|
21463
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
21464
|
+
depthBoost: page.breakdown.depthBoost,
|
|
21465
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
21466
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
21467
|
+
anchorTextMatchBoost: 0
|
|
21468
|
+
};
|
|
20105
21469
|
}
|
|
20106
|
-
|
|
21470
|
+
return result;
|
|
21471
|
+
});
|
|
20107
21472
|
}
|
|
20108
|
-
ensureSnippet(hit) {
|
|
21473
|
+
ensureSnippet(hit, query) {
|
|
21474
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
21475
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
20109
21476
|
const snippet = hit.hit.metadata.snippet;
|
|
20110
21477
|
if (snippet && snippet.length >= 30) return snippet;
|
|
20111
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
20112
21478
|
if (chunkText) return toSnippet(chunkText);
|
|
20113
21479
|
return snippet || "";
|
|
20114
21480
|
}
|
|
20115
|
-
buildResults(ordered, topK, groupByPage,
|
|
21481
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
21482
|
+
const cfg = config ?? this.config;
|
|
20116
21483
|
if (groupByPage) {
|
|
20117
|
-
let pages = aggregateByPage(ordered,
|
|
20118
|
-
pages = trimByScoreGap(pages,
|
|
20119
|
-
const minRatio =
|
|
21484
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
21485
|
+
pages = trimByScoreGap(pages, cfg);
|
|
21486
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
20120
21487
|
return pages.slice(0, topK).map((page) => {
|
|
20121
21488
|
const bestScore = page.bestChunk.finalScore;
|
|
20122
21489
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20123
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
20124
|
-
|
|
21490
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
21491
|
+
const result = {
|
|
20125
21492
|
url: page.url,
|
|
20126
21493
|
title: page.title,
|
|
20127
21494
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
20128
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
21495
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
21496
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
20129
21497
|
score: Number(page.pageScore.toFixed(6)),
|
|
20130
21498
|
routeFile: page.routeFile,
|
|
20131
|
-
chunks: meaningful.length
|
|
21499
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
20132
21500
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
20133
|
-
snippet: this.ensureSnippet(c),
|
|
21501
|
+
snippet: this.ensureSnippet(c, query),
|
|
21502
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
20134
21503
|
headingPath: c.hit.metadata.headingPath,
|
|
20135
21504
|
score: Number(c.finalScore.toFixed(6))
|
|
20136
21505
|
})) : void 0
|
|
20137
21506
|
};
|
|
21507
|
+
if (debug && page.bestChunk.breakdown) {
|
|
21508
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
21509
|
+
}
|
|
21510
|
+
return result;
|
|
20138
21511
|
});
|
|
20139
21512
|
} else {
|
|
20140
21513
|
let filtered = ordered;
|
|
20141
|
-
const
|
|
20142
|
-
if (
|
|
20143
|
-
|
|
20144
|
-
|
|
20145
|
-
|
|
20146
|
-
|
|
20147
|
-
|
|
20148
|
-
|
|
20149
|
-
|
|
20150
|
-
|
|
20151
|
-
|
|
20152
|
-
|
|
21514
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
21515
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
21516
|
+
const topScore = ordered[0].finalScore;
|
|
21517
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
21518
|
+
const threshold = topScore * minScoreRatio;
|
|
21519
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
21520
|
+
}
|
|
21521
|
+
}
|
|
21522
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
21523
|
+
const result = {
|
|
21524
|
+
url: hit.metadata.url,
|
|
21525
|
+
title: hit.metadata.title,
|
|
21526
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
21527
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
21528
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
21529
|
+
score: Number(finalScore.toFixed(6)),
|
|
21530
|
+
routeFile: hit.metadata.routeFile
|
|
21531
|
+
};
|
|
21532
|
+
if (debug && breakdown) {
|
|
21533
|
+
result.breakdown = breakdown;
|
|
21534
|
+
}
|
|
21535
|
+
return result;
|
|
21536
|
+
});
|
|
20153
21537
|
}
|
|
20154
21538
|
}
|
|
20155
21539
|
async getPage(pathOrUrl, scope) {
|
|
@@ -20175,6 +21559,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
20175
21559
|
markdown: page.markdown
|
|
20176
21560
|
};
|
|
20177
21561
|
}
|
|
21562
|
+
async listPages(opts) {
|
|
21563
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21564
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
21565
|
+
return this.store.listPages(resolvedScope, {
|
|
21566
|
+
cursor: opts?.cursor,
|
|
21567
|
+
limit: opts?.limit,
|
|
21568
|
+
pathPrefix
|
|
21569
|
+
});
|
|
21570
|
+
}
|
|
21571
|
+
async getSiteStructure(opts) {
|
|
21572
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
21573
|
+
const allPages = [];
|
|
21574
|
+
let cursor;
|
|
21575
|
+
let truncated = false;
|
|
21576
|
+
do {
|
|
21577
|
+
const result = await this.listPages({
|
|
21578
|
+
pathPrefix: opts?.pathPrefix,
|
|
21579
|
+
scope: opts?.scope,
|
|
21580
|
+
cursor,
|
|
21581
|
+
limit: 200
|
|
21582
|
+
});
|
|
21583
|
+
allPages.push(...result.pages);
|
|
21584
|
+
cursor = result.nextCursor;
|
|
21585
|
+
if (allPages.length >= maxPages) {
|
|
21586
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
21587
|
+
allPages.length = maxPages;
|
|
21588
|
+
break;
|
|
21589
|
+
}
|
|
21590
|
+
} while (cursor);
|
|
21591
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
21592
|
+
return {
|
|
21593
|
+
root: root2,
|
|
21594
|
+
totalPages: allPages.length,
|
|
21595
|
+
truncated
|
|
21596
|
+
};
|
|
21597
|
+
}
|
|
21598
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
21599
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21600
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
21601
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
21602
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
21603
|
+
if (!source) {
|
|
21604
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
21605
|
+
}
|
|
21606
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
21607
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
21608
|
+
source.vector,
|
|
21609
|
+
{ limit: 50 },
|
|
21610
|
+
resolvedScope
|
|
21611
|
+
);
|
|
21612
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
21613
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
21614
|
+
for (const hit of filteredHits) {
|
|
21615
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
21616
|
+
}
|
|
21617
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
21618
|
+
for (const hit of filteredHits) {
|
|
21619
|
+
candidateUrls.add(hit.url);
|
|
21620
|
+
}
|
|
21621
|
+
for (const url of sourceOutgoing) {
|
|
21622
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
21623
|
+
}
|
|
21624
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
21625
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
21626
|
+
);
|
|
21627
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
21628
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
21629
|
+
for (const hit of filteredHits) {
|
|
21630
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
21631
|
+
}
|
|
21632
|
+
for (const p of fetchedPages) {
|
|
21633
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
21634
|
+
}
|
|
21635
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
21636
|
+
if (semanticUrls.length > 0) {
|
|
21637
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
21638
|
+
for (const p of semanticPageData) {
|
|
21639
|
+
const existing = metaMap.get(p.url);
|
|
21640
|
+
if (existing) {
|
|
21641
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
21642
|
+
}
|
|
21643
|
+
}
|
|
21644
|
+
}
|
|
21645
|
+
const candidates = [];
|
|
21646
|
+
for (const url of candidateUrls) {
|
|
21647
|
+
const meta = metaMap.get(url);
|
|
21648
|
+
if (!meta) continue;
|
|
21649
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
21650
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
21651
|
+
const isLinked = isOutgoing || isIncoming;
|
|
21652
|
+
const dice = diceScore(urlPath, url);
|
|
21653
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
21654
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
21655
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
21656
|
+
candidates.push({
|
|
21657
|
+
url,
|
|
21658
|
+
title: meta.title,
|
|
21659
|
+
score: Number(score.toFixed(6)),
|
|
21660
|
+
relationshipType,
|
|
21661
|
+
routeFile: meta.routeFile
|
|
21662
|
+
});
|
|
21663
|
+
}
|
|
21664
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
21665
|
+
const results = candidates.slice(0, topK);
|
|
21666
|
+
return {
|
|
21667
|
+
sourceUrl: urlPath,
|
|
21668
|
+
scope: resolvedScope.scopeName,
|
|
21669
|
+
relatedPages: results
|
|
21670
|
+
};
|
|
21671
|
+
}
|
|
20178
21672
|
async health() {
|
|
20179
21673
|
return this.store.health();
|
|
20180
21674
|
}
|
|
@@ -20197,14 +21691,40 @@ function createServer(engine) {
|
|
|
20197
21691
|
server.registerTool(
|
|
20198
21692
|
"search",
|
|
20199
21693
|
{
|
|
20200
|
-
description:
|
|
21694
|
+
description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
|
|
20201
21695
|
inputSchema: {
|
|
20202
21696
|
query: zod.z.string().min(1),
|
|
20203
21697
|
scope: zod.z.string().optional(),
|
|
20204
21698
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
20205
21699
|
pathPrefix: zod.z.string().optional(),
|
|
20206
21700
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20207
|
-
|
|
21701
|
+
filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
|
|
21702
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
21703
|
+
maxSubResults: zod.z.number().int().positive().max(20).optional()
|
|
21704
|
+
},
|
|
21705
|
+
outputSchema: {
|
|
21706
|
+
q: zod.z.string(),
|
|
21707
|
+
scope: zod.z.string(),
|
|
21708
|
+
results: zod.z.array(zod.z.object({
|
|
21709
|
+
url: zod.z.string(),
|
|
21710
|
+
title: zod.z.string(),
|
|
21711
|
+
sectionTitle: zod.z.string().optional(),
|
|
21712
|
+
snippet: zod.z.string(),
|
|
21713
|
+
score: zod.z.number(),
|
|
21714
|
+
routeFile: zod.z.string(),
|
|
21715
|
+
chunks: zod.z.array(zod.z.object({
|
|
21716
|
+
sectionTitle: zod.z.string().optional(),
|
|
21717
|
+
snippet: zod.z.string(),
|
|
21718
|
+
headingPath: zod.z.array(zod.z.string()),
|
|
21719
|
+
score: zod.z.number()
|
|
21720
|
+
})).optional()
|
|
21721
|
+
})),
|
|
21722
|
+
meta: zod.z.object({
|
|
21723
|
+
timingsMs: zod.z.object({
|
|
21724
|
+
search: zod.z.number(),
|
|
21725
|
+
total: zod.z.number()
|
|
21726
|
+
})
|
|
21727
|
+
})
|
|
20208
21728
|
}
|
|
20209
21729
|
},
|
|
20210
21730
|
async (input) => {
|
|
@@ -20214,7 +21734,9 @@ function createServer(engine) {
|
|
|
20214
21734
|
scope: input.scope,
|
|
20215
21735
|
pathPrefix: input.pathPrefix,
|
|
20216
21736
|
tags: input.tags,
|
|
20217
|
-
|
|
21737
|
+
filters: input.filters,
|
|
21738
|
+
groupBy: input.groupBy,
|
|
21739
|
+
maxSubResults: input.maxSubResults
|
|
20218
21740
|
});
|
|
20219
21741
|
return {
|
|
20220
21742
|
content: [
|
|
@@ -20222,7 +21744,8 @@ function createServer(engine) {
|
|
|
20222
21744
|
type: "text",
|
|
20223
21745
|
text: JSON.stringify(result, null, 2)
|
|
20224
21746
|
}
|
|
20225
|
-
]
|
|
21747
|
+
],
|
|
21748
|
+
structuredContent: result
|
|
20226
21749
|
};
|
|
20227
21750
|
}
|
|
20228
21751
|
);
|
|
@@ -20247,8 +21770,134 @@ function createServer(engine) {
|
|
|
20247
21770
|
};
|
|
20248
21771
|
}
|
|
20249
21772
|
);
|
|
21773
|
+
server.registerTool(
|
|
21774
|
+
"list_pages",
|
|
21775
|
+
{
|
|
21776
|
+
description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
|
|
21777
|
+
inputSchema: {
|
|
21778
|
+
pathPrefix: zod.z.string().optional(),
|
|
21779
|
+
cursor: zod.z.string().optional(),
|
|
21780
|
+
limit: zod.z.number().int().positive().max(200).optional(),
|
|
21781
|
+
scope: zod.z.string().optional()
|
|
21782
|
+
}
|
|
21783
|
+
},
|
|
21784
|
+
async (input) => {
|
|
21785
|
+
const result = await engine.listPages({
|
|
21786
|
+
pathPrefix: input.pathPrefix,
|
|
21787
|
+
cursor: input.cursor,
|
|
21788
|
+
limit: input.limit,
|
|
21789
|
+
scope: input.scope
|
|
21790
|
+
});
|
|
21791
|
+
return {
|
|
21792
|
+
content: [
|
|
21793
|
+
{
|
|
21794
|
+
type: "text",
|
|
21795
|
+
text: JSON.stringify(result, null, 2)
|
|
21796
|
+
}
|
|
21797
|
+
]
|
|
21798
|
+
};
|
|
21799
|
+
}
|
|
21800
|
+
);
|
|
21801
|
+
server.registerTool(
|
|
21802
|
+
"get_site_structure",
|
|
21803
|
+
{
|
|
21804
|
+
description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
|
|
21805
|
+
inputSchema: {
|
|
21806
|
+
pathPrefix: zod.z.string().optional(),
|
|
21807
|
+
scope: zod.z.string().optional(),
|
|
21808
|
+
maxPages: zod.z.number().int().positive().max(2e3).optional()
|
|
21809
|
+
}
|
|
21810
|
+
},
|
|
21811
|
+
async (input) => {
|
|
21812
|
+
const result = await engine.getSiteStructure({
|
|
21813
|
+
pathPrefix: input.pathPrefix,
|
|
21814
|
+
scope: input.scope,
|
|
21815
|
+
maxPages: input.maxPages
|
|
21816
|
+
});
|
|
21817
|
+
return {
|
|
21818
|
+
content: [
|
|
21819
|
+
{
|
|
21820
|
+
type: "text",
|
|
21821
|
+
text: JSON.stringify(result, null, 2)
|
|
21822
|
+
}
|
|
21823
|
+
]
|
|
21824
|
+
};
|
|
21825
|
+
}
|
|
21826
|
+
);
|
|
21827
|
+
server.registerTool(
|
|
21828
|
+
"find_source_file",
|
|
21829
|
+
{
|
|
21830
|
+
description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
|
|
21831
|
+
inputSchema: {
|
|
21832
|
+
query: zod.z.string().min(1),
|
|
21833
|
+
scope: zod.z.string().optional()
|
|
21834
|
+
}
|
|
21835
|
+
},
|
|
21836
|
+
async (input) => {
|
|
21837
|
+
const result = await engine.search({
|
|
21838
|
+
q: input.query,
|
|
21839
|
+
topK: 1,
|
|
21840
|
+
scope: input.scope
|
|
21841
|
+
});
|
|
21842
|
+
if (result.results.length === 0) {
|
|
21843
|
+
return {
|
|
21844
|
+
content: [
|
|
21845
|
+
{
|
|
21846
|
+
type: "text",
|
|
21847
|
+
text: JSON.stringify({
|
|
21848
|
+
error: "No matching content found for the given query."
|
|
21849
|
+
})
|
|
21850
|
+
}
|
|
21851
|
+
]
|
|
21852
|
+
};
|
|
21853
|
+
}
|
|
21854
|
+
const match = result.results[0];
|
|
21855
|
+
const { url, routeFile, sectionTitle, snippet } = match;
|
|
21856
|
+
return {
|
|
21857
|
+
content: [
|
|
21858
|
+
{
|
|
21859
|
+
type: "text",
|
|
21860
|
+
text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
|
|
21861
|
+
}
|
|
21862
|
+
]
|
|
21863
|
+
};
|
|
21864
|
+
}
|
|
21865
|
+
);
|
|
21866
|
+
server.registerTool(
|
|
21867
|
+
"get_related_pages",
|
|
21868
|
+
{
|
|
21869
|
+
description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
|
|
21870
|
+
inputSchema: {
|
|
21871
|
+
pathOrUrl: zod.z.string().min(1),
|
|
21872
|
+
scope: zod.z.string().optional(),
|
|
21873
|
+
topK: zod.z.number().int().positive().max(25).optional()
|
|
21874
|
+
}
|
|
21875
|
+
},
|
|
21876
|
+
async (input) => {
|
|
21877
|
+
const result = await engine.getRelatedPages(input.pathOrUrl, {
|
|
21878
|
+
topK: input.topK,
|
|
21879
|
+
scope: input.scope
|
|
21880
|
+
});
|
|
21881
|
+
return {
|
|
21882
|
+
content: [
|
|
21883
|
+
{
|
|
21884
|
+
type: "text",
|
|
21885
|
+
text: JSON.stringify(result, null, 2)
|
|
21886
|
+
}
|
|
21887
|
+
]
|
|
21888
|
+
};
|
|
21889
|
+
}
|
|
21890
|
+
);
|
|
20250
21891
|
return server;
|
|
20251
21892
|
}
|
|
21893
|
+
/**
 * Resolve the MCP HTTP API key from config.
 *
 * An inline `config.mcp.http.apiKey` wins; otherwise the environment variable
 * named by `config.mcp.http.apiKeyEnv` is read. Empty strings count as "not
 * set" in both places, so an empty inline key no longer hides a configured
 * environment variable (the original `??` only fell back on null/undefined).
 *
 * @param {object} config - Resolved config; reads `config.mcp.http`.
 * @returns {string|undefined} The key, or undefined when none is configured.
 */
function resolveApiKey(config) {
  const { apiKey, apiKeyEnv } = config.mcp.http;
  if (apiKey) {
    return apiKey;
  }
  const fromEnv = apiKeyEnv ? process.env[apiKeyEnv] : void 0;
  return fromEnv || void 0;
}
|
|
21896
|
+
/**
 * Compare a presented API key against the expected one in constant time.
 *
 * Both values are hashed with SHA-256 first so the buffers handed to
 * `crypto.timingSafeEqual` always have equal length, whatever the input sizes.
 *
 * @param {string|ArrayBufferView} provided - Key supplied by the client.
 * @param {string|ArrayBufferView} expected - Configured key.
 * @returns {boolean} True only when both inputs are hashable and identical.
 */
function verifyApiKey(provided, expected) {
  // hash.update() throws on anything but strings / buffer views; treat such
  // input as a failed comparison rather than surfacing a TypeError.
  const hashable = (value) => typeof value === "string" || ArrayBuffer.isView(value);
  if (!hashable(provided) || !hashable(expected)) {
    return false;
  }
  const sha256 = (value) => crypto.createHash("sha256").update(value).digest();
  return crypto.timingSafeEqual(sha256(provided), sha256(expected));
}
|
|
20252
21901
|
function redirectConsoleToStderr() {
|
|
20253
21902
|
console.log = (...args) => {
|
|
20254
21903
|
process.stderr.write(`[LOG] ${args.map(String).join(" ")}
|
|
@@ -20263,7 +21912,22 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20263
21912
|
const app = express_js.createMcpExpressApp();
|
|
20264
21913
|
const port = opts.httpPort ?? config.mcp.http.port;
|
|
20265
21914
|
const endpointPath = opts.httpPath ?? config.mcp.http.path;
|
|
21915
|
+
const isPublic = config.mcp.access === "public";
|
|
21916
|
+
const host = isPublic ? "0.0.0.0" : "127.0.0.1";
|
|
21917
|
+
const apiKey = isPublic ? resolveApiKey(config) : void 0;
|
|
20266
21918
|
app.post(endpointPath, async (req, res) => {
|
|
21919
|
+
if (isPublic && apiKey) {
|
|
21920
|
+
const authHeader = req.headers["authorization"];
|
|
21921
|
+
const provided = (authHeader?.startsWith("Bearer ") ? authHeader.slice(7) : void 0) ?? req.headers["x-api-key"] ?? "";
|
|
21922
|
+
if (!provided || !verifyApiKey(provided, apiKey)) {
|
|
21923
|
+
res.status(401).json({
|
|
21924
|
+
jsonrpc: "2.0",
|
|
21925
|
+
error: { code: -32001, message: "Unauthorized" },
|
|
21926
|
+
id: null
|
|
21927
|
+
});
|
|
21928
|
+
return;
|
|
21929
|
+
}
|
|
21930
|
+
}
|
|
20267
21931
|
const server = serverFactory();
|
|
20268
21932
|
const transport = new streamableHttp_js.StreamableHTTPServerTransport({
|
|
20269
21933
|
sessionIdGenerator: void 0
|
|
@@ -20313,9 +21977,12 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20313
21977
|
);
|
|
20314
21978
|
});
|
|
20315
21979
|
await new Promise((resolve, reject) => {
|
|
20316
|
-
const instance = app.listen(port,
|
|
20317
|
-
process.stderr.write(`SearchSocket MCP HTTP server listening on http
|
|
21980
|
+
const instance = app.listen(port, host, () => {
|
|
21981
|
+
process.stderr.write(`SearchSocket MCP HTTP server listening on http://${host}:${port}${endpointPath}
|
|
20318
21982
|
`);
|
|
21983
|
+
if (isPublic) {
|
|
21984
|
+
process.stderr.write("WARNING: Server is in public mode. Ensure HTTPS is configured via a reverse proxy for production use.\n");
|
|
21985
|
+
}
|
|
20319
21986
|
resolve();
|
|
20320
21987
|
});
|
|
20321
21988
|
instance.once("error", reject);
|
|
@@ -20330,6 +21997,13 @@ async function runMcpServer(options = {}) {
|
|
|
20330
21997
|
cwd: options.cwd,
|
|
20331
21998
|
configPath: options.configPath
|
|
20332
21999
|
});
|
|
22000
|
+
if (options.access) config.mcp.access = options.access;
|
|
22001
|
+
if (options.apiKey) config.mcp.http.apiKey = options.apiKey;
|
|
22002
|
+
if (config.mcp.access === "public" && !resolveApiKey(config)) {
|
|
22003
|
+
throw new Error(
|
|
22004
|
+
'MCP access is "public" but no API key is configured. Pass --api-key or set mcp.http.apiKey / mcp.http.apiKeyEnv in config.'
|
|
22005
|
+
);
|
|
22006
|
+
}
|
|
20333
22007
|
const resolvedTransport = options.transport ?? config.mcp.transport;
|
|
20334
22008
|
if (resolvedTransport === "stdio") {
|
|
20335
22009
|
redirectConsoleToStderr();
|
|
@@ -20347,8 +22021,6 @@ async function runMcpServer(options = {}) {
|
|
|
20347
22021
|
const stdioTransport = new stdio_js.StdioServerTransport();
|
|
20348
22022
|
await server.connect(stdioTransport);
|
|
20349
22023
|
}
|
|
20350
|
-
|
|
20351
|
-
// src/sveltekit/handle.ts
|
|
20352
22024
|
var InMemoryRateLimiter = class {
|
|
20353
22025
|
constructor(windowMs, max) {
|
|
20354
22026
|
this.windowMs = windowMs;
|
|
@@ -20376,7 +22048,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20376
22048
|
let enginePromise = null;
|
|
20377
22049
|
let configPromise = null;
|
|
20378
22050
|
let apiPath = options.path;
|
|
22051
|
+
let llmsServePath = null;
|
|
22052
|
+
let serveMarkdownVariants = false;
|
|
22053
|
+
let mcpPath;
|
|
22054
|
+
let mcpApiKey;
|
|
22055
|
+
let mcpEnableJsonResponse = true;
|
|
20379
22056
|
let rateLimiter = null;
|
|
22057
|
+
let notConfigured = false;
|
|
20380
22058
|
const getConfig = async () => {
|
|
20381
22059
|
if (!configPromise) {
|
|
20382
22060
|
let configP;
|
|
@@ -20393,6 +22071,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20393
22071
|
}
|
|
20394
22072
|
configPromise = configP.then((config) => {
|
|
20395
22073
|
apiPath = apiPath ?? config.api.path;
|
|
22074
|
+
mcpPath = config.mcp.handle.path;
|
|
22075
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
22076
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
22077
|
+
if (config.llmsTxt.enable) {
|
|
22078
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
22079
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
22080
|
+
}
|
|
20396
22081
|
if (config.api.rateLimit && !isServerless()) {
|
|
20397
22082
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20398
22083
|
}
|
|
@@ -20402,59 +22087,109 @@ function searchsocketHandle(options = {}) {
|
|
|
20402
22087
|
return configPromise;
|
|
20403
22088
|
};
|
|
20404
22089
|
const getEngine = async () => {
|
|
22090
|
+
if (notConfigured) {
|
|
22091
|
+
throw new SearchSocketError(
|
|
22092
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22093
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22094
|
+
503
|
|
22095
|
+
);
|
|
22096
|
+
}
|
|
20405
22097
|
if (!enginePromise) {
|
|
20406
22098
|
const config = await getConfig();
|
|
20407
22099
|
enginePromise = SearchEngine.create({
|
|
20408
22100
|
cwd: options.cwd,
|
|
20409
22101
|
config
|
|
22102
|
+
}).catch((error) => {
|
|
22103
|
+
enginePromise = null;
|
|
22104
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
22105
|
+
notConfigured = true;
|
|
22106
|
+
throw new SearchSocketError(
|
|
22107
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22108
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22109
|
+
503
|
|
22110
|
+
);
|
|
22111
|
+
}
|
|
22112
|
+
throw error;
|
|
20410
22113
|
});
|
|
20411
22114
|
}
|
|
20412
22115
|
return enginePromise;
|
|
20413
22116
|
};
|
|
20414
22117
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
20415
22118
|
return async ({ event, resolve }) => {
|
|
20416
|
-
if (apiPath && event.url.pathname !==
|
|
20417
|
-
|
|
22119
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
22120
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
22121
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22122
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22123
|
+
}
|
|
22124
|
+
if (mcpPath) {
|
|
22125
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
22126
|
+
return resolve(event);
|
|
22127
|
+
}
|
|
22128
|
+
} else {
|
|
22129
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
22130
|
+
await getConfig();
|
|
22131
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22132
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22133
|
+
}
|
|
22134
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
22135
|
+
return resolve(event);
|
|
22136
|
+
}
|
|
22137
|
+
} else {
|
|
22138
|
+
return resolve(event);
|
|
22139
|
+
}
|
|
22140
|
+
}
|
|
20418
22141
|
}
|
|
20419
22142
|
const config = await getConfig();
|
|
22143
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
22144
|
+
const cwd = options.cwd ?? process.cwd();
|
|
22145
|
+
const filePath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
|
|
22146
|
+
try {
|
|
22147
|
+
const content = await fs8__default.default.readFile(filePath, "utf8");
|
|
22148
|
+
return new Response(content, {
|
|
22149
|
+
status: 200,
|
|
22150
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
22151
|
+
});
|
|
22152
|
+
} catch {
|
|
22153
|
+
return resolve(event);
|
|
22154
|
+
}
|
|
22155
|
+
}
|
|
22156
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
22157
|
+
let rawPath;
|
|
22158
|
+
try {
|
|
22159
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
22160
|
+
} catch {
|
|
22161
|
+
return resolve(event);
|
|
22162
|
+
}
|
|
22163
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
22164
|
+
try {
|
|
22165
|
+
const engine = await getEngine();
|
|
22166
|
+
const page = await engine.getPage(rawPath, scope);
|
|
22167
|
+
return new Response(page.markdown, {
|
|
22168
|
+
status: 200,
|
|
22169
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
22170
|
+
});
|
|
22171
|
+
} catch (error) {
|
|
22172
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
22173
|
+
return resolve(event);
|
|
22174
|
+
}
|
|
22175
|
+
throw error;
|
|
22176
|
+
}
|
|
22177
|
+
}
|
|
22178
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22179
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22180
|
+
}
|
|
20420
22181
|
const targetPath = apiPath ?? config.api.path;
|
|
20421
|
-
if (event.url.pathname
|
|
22182
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
20422
22183
|
return resolve(event);
|
|
20423
22184
|
}
|
|
20424
|
-
|
|
22185
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
22186
|
+
const method = event.request.method;
|
|
22187
|
+
if (method === "OPTIONS") {
|
|
20425
22188
|
return new Response(null, {
|
|
20426
22189
|
status: 204,
|
|
20427
22190
|
headers: buildCorsHeaders(event.request, config)
|
|
20428
22191
|
});
|
|
20429
22192
|
}
|
|
20430
|
-
if (event.request.method !== "POST") {
|
|
20431
|
-
return withCors(
|
|
20432
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
20433
|
-
status: 405,
|
|
20434
|
-
headers: {
|
|
20435
|
-
"content-type": "application/json"
|
|
20436
|
-
}
|
|
20437
|
-
}),
|
|
20438
|
-
event.request,
|
|
20439
|
-
config
|
|
20440
|
-
);
|
|
20441
|
-
}
|
|
20442
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
20443
|
-
if (contentLength > bodyLimit) {
|
|
20444
|
-
return withCors(
|
|
20445
|
-
new Response(
|
|
20446
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
20447
|
-
{
|
|
20448
|
-
status: 413,
|
|
20449
|
-
headers: {
|
|
20450
|
-
"content-type": "application/json"
|
|
20451
|
-
}
|
|
20452
|
-
}
|
|
20453
|
-
),
|
|
20454
|
-
event.request,
|
|
20455
|
-
config
|
|
20456
|
-
);
|
|
20457
|
-
}
|
|
20458
22193
|
if (rateLimiter) {
|
|
20459
22194
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
20460
22195
|
if (!rateLimiter.check(ip)) {
|
|
@@ -20474,39 +22209,32 @@ function searchsocketHandle(options = {}) {
|
|
|
20474
22209
|
}
|
|
20475
22210
|
}
|
|
20476
22211
|
try {
|
|
20477
|
-
|
|
20478
|
-
|
|
20479
|
-
|
|
20480
|
-
} else {
|
|
20481
|
-
let parsedFallback;
|
|
20482
|
-
try {
|
|
20483
|
-
parsedFallback = await event.request.json();
|
|
20484
|
-
} catch (error) {
|
|
20485
|
-
if (error instanceof SyntaxError) {
|
|
20486
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20487
|
-
}
|
|
20488
|
-
throw error;
|
|
22212
|
+
if (method === "GET") {
|
|
22213
|
+
if (subPath === "" || subPath === "/") {
|
|
22214
|
+
return await handleGetSearch(event, config, getEngine);
|
|
20489
22215
|
}
|
|
20490
|
-
|
|
22216
|
+
if (subPath === "/health") {
|
|
22217
|
+
return await handleGetHealth(event, config, getEngine);
|
|
22218
|
+
}
|
|
22219
|
+
if (subPath.startsWith("/pages/")) {
|
|
22220
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
22221
|
+
}
|
|
22222
|
+
return withCors(
|
|
22223
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
22224
|
+
status: 404,
|
|
22225
|
+
headers: { "content-type": "application/json" }
|
|
22226
|
+
}),
|
|
22227
|
+
event.request,
|
|
22228
|
+
config
|
|
22229
|
+
);
|
|
20491
22230
|
}
|
|
20492
|
-
if (
|
|
20493
|
-
|
|
22231
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
22232
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
20494
22233
|
}
|
|
20495
|
-
let body;
|
|
20496
|
-
try {
|
|
20497
|
-
body = JSON.parse(rawBody);
|
|
20498
|
-
} catch {
|
|
20499
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20500
|
-
}
|
|
20501
|
-
const engine = await getEngine();
|
|
20502
|
-
const searchRequest = body;
|
|
20503
|
-
const result = await engine.search(searchRequest);
|
|
20504
22234
|
return withCors(
|
|
20505
|
-
new Response(JSON.stringify(
|
|
20506
|
-
status:
|
|
20507
|
-
headers: {
|
|
20508
|
-
"content-type": "application/json"
|
|
20509
|
-
}
|
|
22235
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
22236
|
+
status: 405,
|
|
22237
|
+
headers: { "content-type": "application/json" }
|
|
20510
22238
|
}),
|
|
20511
22239
|
event.request,
|
|
20512
22240
|
config
|
|
@@ -20527,6 +22255,183 @@ function searchsocketHandle(options = {}) {
|
|
|
20527
22255
|
}
|
|
20528
22256
|
};
|
|
20529
22257
|
}
|
|
22258
|
+
/**
 * Tell whether a request path is the API root or one of its sub-paths.
 *
 * @param {string} pathname - Request pathname.
 * @param {string} apiPath - Configured API base path.
 * @returns {boolean} True for an exact match or for anything below `apiPath + "/"`.
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  // Require the "/" separator so "/api/searchx" does not match "/api/search".
  return pathname.startsWith(`${apiPath}/`);
}
|
|
22261
|
+
/**
 * Handle `GET <apiPath>?q=...` by translating query parameters into a search
 * request and returning the engine result as JSON with CORS headers.
 *
 * Recognized parameters: q (required), topK, scope, pathPrefix, groupBy
 * ("page" | "chunk"), maxSubResults (1-20) and repeated `tags`.
 *
 * @param {object} event - Request event; reads `event.url.searchParams` and `event.request`.
 * @param {object} config - Resolved config, passed to withCors.
 * @param {() => Promise<object>} getEngine - Lazy engine accessor.
 * @returns {Promise<Response>} 200 JSON response.
 * @throws {SearchSocketError} INVALID_REQUEST (400) on missing or invalid parameters.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;
  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  // Strict integer parsing: bare Number.parseInt accepts "5abc" and "1.5",
  // which contradicts the "must be a positive integer" errors below.
  // Returns NaN for anything that is not a plain run of digits.
  const parseWholeNumber = (raw) => {
    const text = raw.trim();
    if (!/^\d+$/.test(text)) {
      return Number.NaN;
    }
    const value = Number.parseInt(text, 10);
    return Number.isSafeInteger(value) ? value : Number.NaN;
  };
  const searchRequest = { q };
  const topK = params.get("topK");
  if (topK !== null) {
    const parsed = parseWholeNumber(topK);
    if (Number.isNaN(parsed) || parsed < 1) {
      throw new SearchSocketError("INVALID_REQUEST", "topK must be a positive integer", 400);
    }
    searchRequest.topK = parsed;
  }
  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;
  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;
  const groupBy = params.get("groupBy");
  // An empty groupBy value is ignored rather than rejected.
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }
  const maxSubResults = params.get("maxSubResults");
  if (maxSubResults !== null) {
    const parsed = parseWholeNumber(maxSubResults);
    if (Number.isNaN(parsed) || parsed < 1 || parsed > 20) {
      throw new SearchSocketError("INVALID_REQUEST", "maxSubResults must be a positive integer between 1 and 20", 400);
    }
    searchRequest.maxSubResults = parsed;
  }
  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;
  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
22308
|
+
/**
 * Handle `GET <apiPath>/health`: return the engine's health report as JSON.
 *
 * @param {object} event - Request event; only `event.request` is read (for CORS).
 * @param {object} config - Resolved config, passed to withCors.
 * @param {() => Promise<object>} getEngine - Lazy engine accessor.
 * @returns {Promise<Response>} 200 JSON response wrapped with CORS headers.
 */
async function handleGetHealth(event, config, getEngine) {
  const searchEngine = await getEngine();
  const status = await searchEngine.health();
  const response = new Response(JSON.stringify(status), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22320
|
+
/**
 * Handle `GET <apiPath>/pages/<path>`: look up one indexed page and return it as JSON.
 *
 * @param {object} event - Request event; reads `event.url.searchParams` and `event.request`.
 * @param {object} config - Resolved config, passed to withCors.
 * @param {() => Promise<object>} getEngine - Lazy engine accessor.
 * @param {string} subPath - Path below the API root, starting with "/pages".
 * @returns {Promise<Response>} 200 JSON response wrapped with CORS headers.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Strip the "/pages" prefix; what remains keeps its leading "/".
  const encodedPath = subPath.slice("/pages".length);
  let pagePath;
  try {
    pagePath = decodeURIComponent(encodedPath);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
  }
  const scope = event.url.searchParams?.get("scope") ?? void 0;
  const searchEngine = await getEngine();
  const page = await searchEngine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22340
|
+
/**
 * Handle `POST <apiPath>`: read a JSON search request from the body, run it
 * and return the engine result as JSON with CORS headers.
 *
 * The size limit is enforced twice: once against the declared content-length
 * header and once against the actual byte length of the body text.
 *
 * @param {object} event - Request event; reads `event.request`.
 * @param {object} config - Resolved config, passed to withCors.
 * @param {() => Promise<object>} getEngine - Lazy engine accessor.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response.
 * @throws {SearchSocketError} INVALID_REQUEST with 413 (too large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const bodyTooLarge = () => new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  const malformedJson = () => new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);

  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw bodyTooLarge();
  }

  let rawBody;
  if (typeof event.request.text !== "function") {
    // Request objects without text(): parse via json() and re-serialize so the
    // byte-length check below still applies.
    let parsedFallback;
    try {
      parsedFallback = await event.request.json();
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw malformedJson();
      }
      throw error;
    }
    rawBody = JSON.stringify(parsedFallback);
  } else {
    rawBody = await event.request.text();
  }

  if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
    throw bodyTooLarge();
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(rawBody);
  } catch {
    throw malformedJson();
  }

  const searchEngine = await getEngine();
  const searchResult = await searchEngine.search(searchRequest);
  const response = new Response(JSON.stringify(searchResult), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22381
|
+
/**
 * Serve one MCP request through a per-request server and a stateless
 * web-standard streamable HTTP transport.
 *
 * @param {object} event - Request event; reads `event.request`.
 * @param {string|undefined} apiKey - When set, requests must carry `Authorization: Bearer <apiKey>`.
 * @param {boolean} enableJsonResponse - Forwarded to the transport; when true the
 *   transport and server are closed as soon as the response has been produced.
 * @param {() => Promise<object>} getEngine - Lazy engine accessor.
 * @returns {Promise<Response>} The transport's response, a 401 JSON-RPC error
 *   on a bad or missing key, or a 500 JSON-RPC error when handling throws.
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  const jsonRpcError = (status, code, message) => new Response(
    JSON.stringify({
      jsonrpc: "2.0",
      error: { code, message },
      id: null
    }),
    { status, headers: { "content-type": "application/json" } }
  );
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Compare SHA-256 digests so timingSafeEqual always sees equal-length
    // buffers. The earlier length pre-check leaked the key length and was
    // inconsistent with verifyApiKey, which already hashes before comparing.
    const sha256 = (value) => crypto.createHash("sha256").update(value).digest();
    if (!token || !crypto.timingSafeEqual(sha256(token), sha256(apiKey))) {
      return jsonRpcError(401, -32001, "Unauthorized");
    }
  }
  const transport = new webStandardStreamableHttp_js.WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    // Only the JSON path tears down here; otherwise transport and server are
    // left open (presumably because the response is still streaming; confirm).
    if (enableJsonResponse) {
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the original error.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    return jsonRpcError(
      500,
      -32603,
      error instanceof Error ? error.message : "Internal server error"
    );
  }
}
|
|
20530
22435
|
function buildCorsHeaders(request, config) {
|
|
20531
22436
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
20532
22437
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -20539,7 +22444,7 @@ function buildCorsHeaders(request, config) {
|
|
|
20539
22444
|
}
|
|
20540
22445
|
return {
|
|
20541
22446
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
20542
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
22447
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
20543
22448
|
"access-control-allow-headers": "content-type"
|
|
20544
22449
|
};
|
|
20545
22450
|
}
|
|
@@ -20575,9 +22480,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20575
22480
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20576
22481
|
return true;
|
|
20577
22482
|
}
|
|
20578
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20579
|
-
return true;
|
|
20580
|
-
}
|
|
20581
22483
|
return false;
|
|
20582
22484
|
}
|
|
20583
22485
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20602,7 +22504,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20602
22504
|
const pipeline = await IndexPipeline.create({
|
|
20603
22505
|
cwd,
|
|
20604
22506
|
configPath: options.configPath,
|
|
20605
|
-
logger: logger3
|
|
22507
|
+
logger: logger3,
|
|
22508
|
+
hooks: options.hooks
|
|
20606
22509
|
});
|
|
20607
22510
|
const stats = await pipeline.run({
|
|
20608
22511
|
changedOnly: options.changedOnly ?? true,
|