searchsocket 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3308 -524
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2310 -466
- package/dist/index.d.cts +101 -40
- package/dist/index.d.ts +101 -40
- package/dist/index.js +2310 -466
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2342 -465
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2344 -467
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +20 -2
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/index.cjs
CHANGED
|
@@ -5,18 +5,20 @@ var path = require('path');
|
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
7
|
var child_process = require('child_process');
|
|
8
|
+
var vector = require('@upstash/vector');
|
|
8
9
|
var crypto = require('crypto');
|
|
9
10
|
var cheerio = require('cheerio');
|
|
10
11
|
var matter = require('gray-matter');
|
|
11
12
|
var fg = require('fast-glob');
|
|
12
13
|
var pLimit = require('p-limit');
|
|
13
|
-
var
|
|
14
|
+
var fs8 = require('fs/promises');
|
|
14
15
|
var net = require('net');
|
|
15
16
|
var zlib = require('zlib');
|
|
16
17
|
var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
|
|
17
18
|
var stdio_js = require('@modelcontextprotocol/sdk/server/stdio.js');
|
|
18
19
|
var streamableHttp_js = require('@modelcontextprotocol/sdk/server/streamableHttp.js');
|
|
19
20
|
var express_js = require('@modelcontextprotocol/sdk/server/express.js');
|
|
21
|
+
var webStandardStreamableHttp_js = require('@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js');
|
|
20
22
|
|
|
21
23
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
22
24
|
|
|
@@ -25,7 +27,7 @@ var path__default = /*#__PURE__*/_interopDefault(path);
|
|
|
25
27
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
26
28
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
27
29
|
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
28
|
-
var
|
|
30
|
+
var fs8__default = /*#__PURE__*/_interopDefault(fs8);
|
|
29
31
|
var net__default = /*#__PURE__*/_interopDefault(net);
|
|
30
32
|
|
|
31
33
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
@@ -5025,32 +5027,32 @@ var require_URL = __commonJS({
|
|
|
5025
5027
|
else
|
|
5026
5028
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5027
5029
|
}
|
|
5028
|
-
function remove_dot_segments(
|
|
5029
|
-
if (!
|
|
5030
|
+
function remove_dot_segments(path14) {
|
|
5031
|
+
if (!path14) return path14;
|
|
5030
5032
|
var output = "";
|
|
5031
|
-
while (
|
|
5032
|
-
if (
|
|
5033
|
-
|
|
5033
|
+
while (path14.length > 0) {
|
|
5034
|
+
if (path14 === "." || path14 === "..") {
|
|
5035
|
+
path14 = "";
|
|
5034
5036
|
break;
|
|
5035
5037
|
}
|
|
5036
|
-
var twochars =
|
|
5037
|
-
var threechars =
|
|
5038
|
-
var fourchars =
|
|
5038
|
+
var twochars = path14.substring(0, 2);
|
|
5039
|
+
var threechars = path14.substring(0, 3);
|
|
5040
|
+
var fourchars = path14.substring(0, 4);
|
|
5039
5041
|
if (threechars === "../") {
|
|
5040
|
-
|
|
5042
|
+
path14 = path14.substring(3);
|
|
5041
5043
|
} else if (twochars === "./") {
|
|
5042
|
-
|
|
5044
|
+
path14 = path14.substring(2);
|
|
5043
5045
|
} else if (threechars === "/./") {
|
|
5044
|
-
|
|
5045
|
-
} else if (twochars === "/." &&
|
|
5046
|
-
|
|
5047
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5048
|
-
|
|
5046
|
+
path14 = "/" + path14.substring(3);
|
|
5047
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5048
|
+
path14 = "/";
|
|
5049
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5050
|
+
path14 = "/" + path14.substring(4);
|
|
5049
5051
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5050
5052
|
} else {
|
|
5051
|
-
var segment =
|
|
5053
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5052
5054
|
output += segment;
|
|
5053
|
-
|
|
5055
|
+
path14 = path14.substring(segment.length);
|
|
5054
5056
|
}
|
|
5055
5057
|
}
|
|
5056
5058
|
return output;
|
|
@@ -16646,6 +16648,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16646
16648
|
dropSelectors: zod.z.array(zod.z.string()).optional(),
|
|
16647
16649
|
ignoreAttr: zod.z.string().optional(),
|
|
16648
16650
|
noindexAttr: zod.z.string().optional(),
|
|
16651
|
+
imageDescAttr: zod.z.string().optional(),
|
|
16649
16652
|
respectRobotsNoindex: zod.z.boolean().optional()
|
|
16650
16653
|
}).optional(),
|
|
16651
16654
|
transform: zod.z.object({
|
|
@@ -16661,35 +16664,48 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16661
16664
|
headingPathDepth: zod.z.number().int().positive().optional(),
|
|
16662
16665
|
dontSplitInside: zod.z.array(zod.z.enum(["code", "table", "blockquote"])).optional(),
|
|
16663
16666
|
prependTitle: zod.z.boolean().optional(),
|
|
16664
|
-
pageSummaryChunk: zod.z.boolean().optional()
|
|
16667
|
+
pageSummaryChunk: zod.z.boolean().optional(),
|
|
16668
|
+
weightHeadings: zod.z.boolean().optional()
|
|
16665
16669
|
}).optional(),
|
|
16666
16670
|
upstash: zod.z.object({
|
|
16667
16671
|
url: zod.z.string().url().optional(),
|
|
16668
16672
|
token: zod.z.string().min(1).optional(),
|
|
16669
16673
|
urlEnv: zod.z.string().min(1).optional(),
|
|
16670
|
-
tokenEnv: zod.z.string().min(1).optional()
|
|
16674
|
+
tokenEnv: zod.z.string().min(1).optional(),
|
|
16675
|
+
namespaces: zod.z.object({
|
|
16676
|
+
pages: zod.z.string().min(1).optional(),
|
|
16677
|
+
chunks: zod.z.string().min(1).optional()
|
|
16678
|
+
}).optional()
|
|
16679
|
+
}).optional(),
|
|
16680
|
+
embedding: zod.z.object({
|
|
16681
|
+
model: zod.z.string().optional(),
|
|
16682
|
+
dimensions: zod.z.number().int().positive().optional(),
|
|
16683
|
+
taskType: zod.z.string().optional(),
|
|
16684
|
+
batchSize: zod.z.number().int().positive().optional()
|
|
16671
16685
|
}).optional(),
|
|
16672
16686
|
search: zod.z.object({
|
|
16673
|
-
semanticWeight: zod.z.number().min(0).max(1).optional(),
|
|
16674
|
-
inputEnrichment: zod.z.boolean().optional(),
|
|
16675
|
-
reranking: zod.z.boolean().optional(),
|
|
16676
16687
|
dualSearch: zod.z.boolean().optional(),
|
|
16677
16688
|
pageSearchWeight: zod.z.number().min(0).max(1).optional()
|
|
16678
16689
|
}).optional(),
|
|
16679
16690
|
ranking: zod.z.object({
|
|
16680
16691
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16681
16692
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16693
|
+
enableFreshnessBoost: zod.z.boolean().optional(),
|
|
16694
|
+
freshnessDecayRate: zod.z.number().positive().optional(),
|
|
16695
|
+
enableAnchorTextBoost: zod.z.boolean().optional(),
|
|
16682
16696
|
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16683
16697
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16684
16698
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16685
16699
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16686
|
-
|
|
16700
|
+
minScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16687
16701
|
scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
|
|
16688
16702
|
weights: zod.z.object({
|
|
16689
16703
|
incomingLinks: zod.z.number().optional(),
|
|
16690
16704
|
depth: zod.z.number().optional(),
|
|
16691
16705
|
aggregation: zod.z.number().optional(),
|
|
16692
|
-
titleMatch: zod.z.number().optional()
|
|
16706
|
+
titleMatch: zod.z.number().optional(),
|
|
16707
|
+
freshness: zod.z.number().optional(),
|
|
16708
|
+
anchorText: zod.z.number().optional()
|
|
16693
16709
|
}).optional()
|
|
16694
16710
|
}).optional(),
|
|
16695
16711
|
api: zod.z.object({
|
|
@@ -16704,12 +16720,28 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16704
16720
|
}).optional(),
|
|
16705
16721
|
mcp: zod.z.object({
|
|
16706
16722
|
enable: zod.z.boolean().optional(),
|
|
16723
|
+
access: zod.z.enum(["public", "private"]).optional(),
|
|
16707
16724
|
transport: zod.z.enum(["stdio", "http"]).optional(),
|
|
16708
16725
|
http: zod.z.object({
|
|
16709
16726
|
port: zod.z.number().int().positive().optional(),
|
|
16710
|
-
path: zod.z.string().optional()
|
|
16727
|
+
path: zod.z.string().optional(),
|
|
16728
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16729
|
+
apiKeyEnv: zod.z.string().min(1).optional()
|
|
16730
|
+
}).optional(),
|
|
16731
|
+
handle: zod.z.object({
|
|
16732
|
+
path: zod.z.string().optional(),
|
|
16733
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16734
|
+
enableJsonResponse: zod.z.boolean().optional()
|
|
16711
16735
|
}).optional()
|
|
16712
16736
|
}).optional(),
|
|
16737
|
+
llmsTxt: zod.z.object({
|
|
16738
|
+
enable: zod.z.boolean().optional(),
|
|
16739
|
+
outputPath: zod.z.string().optional(),
|
|
16740
|
+
title: zod.z.string().optional(),
|
|
16741
|
+
description: zod.z.string().optional(),
|
|
16742
|
+
generateFull: zod.z.boolean().optional(),
|
|
16743
|
+
serveMarkdownVariants: zod.z.boolean().optional()
|
|
16744
|
+
}).optional(),
|
|
16713
16745
|
state: zod.z.object({
|
|
16714
16746
|
dir: zod.z.string().optional()
|
|
16715
16747
|
}).optional()
|
|
@@ -16748,6 +16780,7 @@ function createDefaultConfig(projectId) {
|
|
|
16748
16780
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16749
16781
|
ignoreAttr: "data-search-ignore",
|
|
16750
16782
|
noindexAttr: "data-search-noindex",
|
|
16783
|
+
imageDescAttr: "data-search-description",
|
|
16751
16784
|
respectRobotsNoindex: true
|
|
16752
16785
|
},
|
|
16753
16786
|
transform: {
|
|
@@ -16757,39 +16790,52 @@ function createDefaultConfig(projectId) {
|
|
|
16757
16790
|
},
|
|
16758
16791
|
chunking: {
|
|
16759
16792
|
strategy: "hybrid",
|
|
16760
|
-
maxChars:
|
|
16793
|
+
maxChars: 1500,
|
|
16761
16794
|
overlapChars: 200,
|
|
16762
16795
|
minChars: 250,
|
|
16763
16796
|
headingPathDepth: 3,
|
|
16764
16797
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16765
16798
|
prependTitle: true,
|
|
16766
|
-
pageSummaryChunk: true
|
|
16799
|
+
pageSummaryChunk: true,
|
|
16800
|
+
weightHeadings: true
|
|
16767
16801
|
},
|
|
16768
16802
|
upstash: {
|
|
16769
|
-
urlEnv: "
|
|
16770
|
-
tokenEnv: "
|
|
16803
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16804
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16805
|
+
namespaces: {
|
|
16806
|
+
pages: "pages",
|
|
16807
|
+
chunks: "chunks"
|
|
16808
|
+
}
|
|
16809
|
+
},
|
|
16810
|
+
embedding: {
|
|
16811
|
+
model: "bge-large-en-v1.5",
|
|
16812
|
+
dimensions: 1024,
|
|
16813
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16814
|
+
batchSize: 100
|
|
16771
16815
|
},
|
|
16772
16816
|
search: {
|
|
16773
|
-
semanticWeight: 0.75,
|
|
16774
|
-
inputEnrichment: true,
|
|
16775
|
-
reranking: true,
|
|
16776
16817
|
dualSearch: true,
|
|
16777
16818
|
pageSearchWeight: 0.3
|
|
16778
16819
|
},
|
|
16779
16820
|
ranking: {
|
|
16780
16821
|
enableIncomingLinkBoost: true,
|
|
16781
16822
|
enableDepthBoost: true,
|
|
16823
|
+
enableFreshnessBoost: false,
|
|
16824
|
+
freshnessDecayRate: 1e-3,
|
|
16825
|
+
enableAnchorTextBoost: false,
|
|
16782
16826
|
pageWeights: {},
|
|
16783
16827
|
aggregationCap: 5,
|
|
16784
16828
|
aggregationDecay: 0.5,
|
|
16785
16829
|
minChunkScoreRatio: 0.5,
|
|
16786
|
-
|
|
16830
|
+
minScoreRatio: 0.7,
|
|
16787
16831
|
scoreGapThreshold: 0.4,
|
|
16788
16832
|
weights: {
|
|
16789
16833
|
incomingLinks: 0.05,
|
|
16790
16834
|
depth: 0.03,
|
|
16791
16835
|
aggregation: 0.1,
|
|
16792
|
-
titleMatch: 0.15
|
|
16836
|
+
titleMatch: 0.15,
|
|
16837
|
+
freshness: 0.1,
|
|
16838
|
+
anchorText: 0.1
|
|
16793
16839
|
}
|
|
16794
16840
|
},
|
|
16795
16841
|
api: {
|
|
@@ -16800,12 +16846,23 @@ function createDefaultConfig(projectId) {
|
|
|
16800
16846
|
},
|
|
16801
16847
|
mcp: {
|
|
16802
16848
|
enable: process.env.NODE_ENV !== "production",
|
|
16849
|
+
access: "private",
|
|
16803
16850
|
transport: "stdio",
|
|
16804
16851
|
http: {
|
|
16805
16852
|
port: 3338,
|
|
16806
16853
|
path: "/mcp"
|
|
16854
|
+
},
|
|
16855
|
+
handle: {
|
|
16856
|
+
path: "/api/mcp",
|
|
16857
|
+
enableJsonResponse: true
|
|
16807
16858
|
}
|
|
16808
16859
|
},
|
|
16860
|
+
llmsTxt: {
|
|
16861
|
+
enable: false,
|
|
16862
|
+
outputPath: "static/llms.txt",
|
|
16863
|
+
generateFull: true,
|
|
16864
|
+
serveMarkdownVariants: false
|
|
16865
|
+
},
|
|
16809
16866
|
state: {
|
|
16810
16867
|
dir: ".searchsocket"
|
|
16811
16868
|
}
|
|
@@ -16933,7 +16990,15 @@ ${issues}`
|
|
|
16933
16990
|
},
|
|
16934
16991
|
upstash: {
|
|
16935
16992
|
...defaults.upstash,
|
|
16936
|
-
...parsed.upstash
|
|
16993
|
+
...parsed.upstash,
|
|
16994
|
+
namespaces: {
|
|
16995
|
+
...defaults.upstash.namespaces,
|
|
16996
|
+
...parsed.upstash?.namespaces
|
|
16997
|
+
}
|
|
16998
|
+
},
|
|
16999
|
+
embedding: {
|
|
17000
|
+
...defaults.embedding,
|
|
17001
|
+
...parsed.embedding
|
|
16937
17002
|
},
|
|
16938
17003
|
search: {
|
|
16939
17004
|
...defaults.search,
|
|
@@ -16970,8 +17035,16 @@ ${issues}`
|
|
|
16970
17035
|
http: {
|
|
16971
17036
|
...defaults.mcp.http,
|
|
16972
17037
|
...parsed.mcp?.http
|
|
17038
|
+
},
|
|
17039
|
+
handle: {
|
|
17040
|
+
...defaults.mcp.handle,
|
|
17041
|
+
...parsed.mcp?.handle
|
|
16973
17042
|
}
|
|
16974
17043
|
},
|
|
17044
|
+
llmsTxt: {
|
|
17045
|
+
...defaults.llmsTxt,
|
|
17046
|
+
...parsed.llmsTxt
|
|
17047
|
+
},
|
|
16975
17048
|
state: {
|
|
16976
17049
|
...defaults.state,
|
|
16977
17050
|
...parsed.state
|
|
@@ -16991,6 +17064,15 @@ ${issues}`
|
|
|
16991
17064
|
maxDepth: 10
|
|
16992
17065
|
};
|
|
16993
17066
|
}
|
|
17067
|
+
if (merged.mcp.access === "public") {
|
|
17068
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17069
|
+
if (!resolvedKey) {
|
|
17070
|
+
throw new SearchSocketError(
|
|
17071
|
+
"CONFIG_MISSING",
|
|
17072
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17073
|
+
);
|
|
17074
|
+
}
|
|
17075
|
+
}
|
|
16994
17076
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16995
17077
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16996
17078
|
}
|
|
@@ -17054,13 +17136,84 @@ function normalizeMarkdown(input) {
|
|
|
17054
17136
|
function sanitizeScopeName(scopeName) {
|
|
17055
17137
|
return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
|
|
17056
17138
|
}
|
|
17139
|
+
function markdownToPlain(markdown) {
|
|
17140
|
+
return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
|
|
17141
|
+
}
|
|
17057
17142
|
function toSnippet(markdown, maxLen = 220) {
|
|
17058
|
-
const plain = markdown
|
|
17143
|
+
const plain = markdownToPlain(markdown);
|
|
17059
17144
|
if (plain.length <= maxLen) {
|
|
17060
17145
|
return plain;
|
|
17061
17146
|
}
|
|
17062
17147
|
return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
|
|
17063
17148
|
}
|
|
17149
|
+
function queryAwareExcerpt(markdown, query, maxLen = 220) {
|
|
17150
|
+
const plain = markdownToPlain(markdown);
|
|
17151
|
+
if (plain.length <= maxLen) return plain;
|
|
17152
|
+
const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
|
|
17153
|
+
if (tokens.length === 0) return toSnippet(markdown, maxLen);
|
|
17154
|
+
const positions = [];
|
|
17155
|
+
for (let ti = 0; ti < tokens.length; ti++) {
|
|
17156
|
+
const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
17157
|
+
const re = new RegExp(escaped, "gi");
|
|
17158
|
+
let m;
|
|
17159
|
+
while ((m = re.exec(plain)) !== null) {
|
|
17160
|
+
positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
|
|
17161
|
+
}
|
|
17162
|
+
}
|
|
17163
|
+
if (positions.length === 0) return toSnippet(markdown, maxLen);
|
|
17164
|
+
positions.sort((a, b) => a.start - b.start);
|
|
17165
|
+
let bestUniqueCount = 0;
|
|
17166
|
+
let bestTotalCount = 0;
|
|
17167
|
+
let bestLeft = 0;
|
|
17168
|
+
let bestRight = 0;
|
|
17169
|
+
let left = 0;
|
|
17170
|
+
const tokenCounts = /* @__PURE__ */ new Map();
|
|
17171
|
+
for (let right = 0; right < positions.length; right++) {
|
|
17172
|
+
tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
|
|
17173
|
+
while (positions[right].end - positions[left].start > maxLen && left < right) {
|
|
17174
|
+
const leftToken = positions[left].tokenIdx;
|
|
17175
|
+
const cnt = tokenCounts.get(leftToken) - 1;
|
|
17176
|
+
if (cnt === 0) tokenCounts.delete(leftToken);
|
|
17177
|
+
else tokenCounts.set(leftToken, cnt);
|
|
17178
|
+
left++;
|
|
17179
|
+
}
|
|
17180
|
+
const uniqueCount = tokenCounts.size;
|
|
17181
|
+
const totalCount = right - left + 1;
|
|
17182
|
+
if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
|
|
17183
|
+
bestUniqueCount = uniqueCount;
|
|
17184
|
+
bestTotalCount = totalCount;
|
|
17185
|
+
bestLeft = left;
|
|
17186
|
+
bestRight = right;
|
|
17187
|
+
}
|
|
17188
|
+
}
|
|
17189
|
+
const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
|
|
17190
|
+
let start = Math.max(0, mid - Math.floor(maxLen / 2));
|
|
17191
|
+
let end = Math.min(plain.length, start + maxLen);
|
|
17192
|
+
start = Math.max(0, end - maxLen);
|
|
17193
|
+
if (start > 0) {
|
|
17194
|
+
const spaceIdx = plain.lastIndexOf(" ", start);
|
|
17195
|
+
if (spaceIdx > start - 30) {
|
|
17196
|
+
start = spaceIdx + 1;
|
|
17197
|
+
}
|
|
17198
|
+
}
|
|
17199
|
+
if (end < plain.length) {
|
|
17200
|
+
const spaceIdx = plain.indexOf(" ", end);
|
|
17201
|
+
if (spaceIdx !== -1 && spaceIdx < end + 30) {
|
|
17202
|
+
end = spaceIdx;
|
|
17203
|
+
}
|
|
17204
|
+
}
|
|
17205
|
+
let excerpt = plain.slice(start, end);
|
|
17206
|
+
if (excerpt.length > Math.ceil(maxLen * 1.2)) {
|
|
17207
|
+
excerpt = excerpt.slice(0, maxLen);
|
|
17208
|
+
const lastSpace = excerpt.lastIndexOf(" ");
|
|
17209
|
+
if (lastSpace > maxLen * 0.5) {
|
|
17210
|
+
excerpt = excerpt.slice(0, lastSpace);
|
|
17211
|
+
}
|
|
17212
|
+
}
|
|
17213
|
+
const prefix = start > 0 ? "\u2026" : "";
|
|
17214
|
+
const suffix = end < plain.length ? "\u2026" : "";
|
|
17215
|
+
return `${prefix}${excerpt}${suffix}`;
|
|
17216
|
+
}
|
|
17064
17217
|
function extractFirstParagraph(markdown) {
|
|
17065
17218
|
const lines = markdown.split("\n");
|
|
17066
17219
|
let inFence = false;
|
|
@@ -17121,162 +17274,288 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
17121
17274
|
fs__default.default.mkdirSync(statePath, { recursive: true });
|
|
17122
17275
|
return { statePath };
|
|
17123
17276
|
}
|
|
17124
|
-
|
|
17125
|
-
// src/vector/upstash.ts
|
|
17126
|
-
function chunkIndexName(scope) {
|
|
17127
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17128
|
-
}
|
|
17129
|
-
function pageIndexName(scope) {
|
|
17130
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17131
|
-
}
|
|
17132
17277
|
var UpstashSearchStore = class {
|
|
17133
|
-
|
|
17278
|
+
index;
|
|
17279
|
+
pagesNs;
|
|
17280
|
+
chunksNs;
|
|
17134
17281
|
constructor(opts) {
|
|
17135
|
-
this.
|
|
17136
|
-
|
|
17137
|
-
|
|
17138
|
-
return this.client.index(chunkIndexName(scope));
|
|
17139
|
-
}
|
|
17140
|
-
pageIndex(scope) {
|
|
17141
|
-
return this.client.index(pageIndexName(scope));
|
|
17282
|
+
this.index = opts.index;
|
|
17283
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17284
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17142
17285
|
}
|
|
17143
17286
|
async upsertChunks(chunks, scope) {
|
|
17144
17287
|
if (chunks.length === 0) return;
|
|
17145
|
-
const index = this.chunkIndex(scope);
|
|
17146
17288
|
const BATCH_SIZE = 100;
|
|
17147
17289
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17148
17290
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17149
|
-
await
|
|
17150
|
-
|
|
17151
|
-
|
|
17152
|
-
|
|
17153
|
-
|
|
17154
|
-
|
|
17155
|
-
|
|
17156
|
-
|
|
17157
|
-
|
|
17158
|
-
|
|
17159
|
-
|
|
17160
|
-
|
|
17291
|
+
await this.chunksNs.upsert(
|
|
17292
|
+
batch.map((c) => ({
|
|
17293
|
+
id: c.id,
|
|
17294
|
+
data: c.data,
|
|
17295
|
+
metadata: {
|
|
17296
|
+
...c.metadata,
|
|
17297
|
+
projectId: scope.projectId,
|
|
17298
|
+
scopeName: scope.scopeName,
|
|
17299
|
+
type: c.metadata.type || "chunk"
|
|
17300
|
+
}
|
|
17301
|
+
}))
|
|
17302
|
+
);
|
|
17303
|
+
}
|
|
17304
|
+
}
|
|
17305
|
+
async search(data, opts, scope) {
|
|
17306
|
+
const filterParts = [
|
|
17307
|
+
`projectId = '${scope.projectId}'`,
|
|
17308
|
+
`scopeName = '${scope.scopeName}'`
|
|
17309
|
+
];
|
|
17310
|
+
if (opts.filter) {
|
|
17311
|
+
filterParts.push(opts.filter);
|
|
17312
|
+
}
|
|
17313
|
+
const results = await this.chunksNs.query({
|
|
17314
|
+
data,
|
|
17315
|
+
topK: opts.limit,
|
|
17316
|
+
includeMetadata: true,
|
|
17317
|
+
filter: filterParts.join(" AND "),
|
|
17318
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17319
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17320
|
+
});
|
|
17321
|
+
return results.map((doc) => ({
|
|
17322
|
+
id: String(doc.id),
|
|
17323
|
+
score: doc.score,
|
|
17324
|
+
metadata: {
|
|
17325
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17326
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17327
|
+
url: doc.metadata?.url ?? "",
|
|
17328
|
+
path: doc.metadata?.path ?? "",
|
|
17329
|
+
title: doc.metadata?.title ?? "",
|
|
17330
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17331
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17332
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17333
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17334
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17335
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17336
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17337
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17338
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17339
|
+
tags: doc.metadata?.tags ?? [],
|
|
17340
|
+
description: doc.metadata?.description || void 0,
|
|
17341
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17342
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17343
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17344
|
+
}
|
|
17345
|
+
}));
|
|
17346
|
+
}
|
|
17347
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17348
|
+
const filterParts = [
|
|
17349
|
+
`projectId = '${scope.projectId}'`,
|
|
17350
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17351
|
+
`url = '${url}'`
|
|
17352
|
+
];
|
|
17353
|
+
if (opts.filter) {
|
|
17354
|
+
filterParts.push(opts.filter);
|
|
17355
|
+
}
|
|
17356
|
+
const results = await this.chunksNs.query({
|
|
17357
|
+
data,
|
|
17358
|
+
topK: opts.limit,
|
|
17359
|
+
includeMetadata: true,
|
|
17360
|
+
filter: filterParts.join(" AND "),
|
|
17361
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17362
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17161
17363
|
});
|
|
17162
17364
|
return results.map((doc) => ({
|
|
17163
|
-
id: doc.id,
|
|
17365
|
+
id: String(doc.id),
|
|
17164
17366
|
score: doc.score,
|
|
17165
17367
|
metadata: {
|
|
17166
17368
|
projectId: doc.metadata?.projectId ?? "",
|
|
17167
17369
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17168
|
-
url: doc.
|
|
17370
|
+
url: doc.metadata?.url ?? "",
|
|
17169
17371
|
path: doc.metadata?.path ?? "",
|
|
17170
|
-
title: doc.
|
|
17171
|
-
sectionTitle: doc.
|
|
17172
|
-
headingPath: doc.
|
|
17372
|
+
title: doc.metadata?.title ?? "",
|
|
17373
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17374
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17173
17375
|
snippet: doc.metadata?.snippet ?? "",
|
|
17174
|
-
chunkText: doc.
|
|
17376
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17175
17377
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17176
17378
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17177
17379
|
depth: doc.metadata?.depth ?? 0,
|
|
17178
17380
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17179
17381
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17180
|
-
tags: doc.
|
|
17382
|
+
tags: doc.metadata?.tags ?? [],
|
|
17181
17383
|
description: doc.metadata?.description || void 0,
|
|
17182
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17384
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17385
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17386
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17183
17387
|
}
|
|
17184
17388
|
}));
|
|
17185
17389
|
}
|
|
17186
|
-
async
|
|
17187
|
-
|
|
17390
|
+
async searchPagesByText(data, opts, scope) {
|
|
17391
|
+
return this.queryPages({ data }, opts, scope);
|
|
17392
|
+
}
|
|
17393
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17394
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17395
|
+
}
|
|
17396
|
+
async queryPages(input, opts, scope) {
|
|
17397
|
+
const filterParts = [
|
|
17398
|
+
`projectId = '${scope.projectId}'`,
|
|
17399
|
+
`scopeName = '${scope.scopeName}'`
|
|
17400
|
+
];
|
|
17401
|
+
if (opts.filter) {
|
|
17402
|
+
filterParts.push(opts.filter);
|
|
17403
|
+
}
|
|
17188
17404
|
let results;
|
|
17189
17405
|
try {
|
|
17190
|
-
results = await
|
|
17191
|
-
|
|
17192
|
-
|
|
17193
|
-
|
|
17194
|
-
|
|
17195
|
-
|
|
17196
|
-
|
|
17406
|
+
results = await this.pagesNs.query({
|
|
17407
|
+
...input,
|
|
17408
|
+
topK: opts.limit,
|
|
17409
|
+
includeMetadata: true,
|
|
17410
|
+
filter: filterParts.join(" AND "),
|
|
17411
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17412
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17197
17413
|
});
|
|
17198
17414
|
} catch {
|
|
17199
17415
|
return [];
|
|
17200
17416
|
}
|
|
17201
17417
|
return results.map((doc) => ({
|
|
17202
|
-
id: doc.id,
|
|
17418
|
+
id: String(doc.id),
|
|
17203
17419
|
score: doc.score,
|
|
17204
|
-
title: doc.
|
|
17205
|
-
url: doc.
|
|
17206
|
-
description: doc.
|
|
17207
|
-
tags: doc.
|
|
17420
|
+
title: doc.metadata?.title ?? "",
|
|
17421
|
+
url: doc.metadata?.url ?? "",
|
|
17422
|
+
description: doc.metadata?.description ?? "",
|
|
17423
|
+
tags: doc.metadata?.tags ?? [],
|
|
17208
17424
|
depth: doc.metadata?.depth ?? 0,
|
|
17209
17425
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17210
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17426
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17427
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17211
17428
|
}));
|
|
17212
17429
|
}
|
|
17213
|
-
async deleteByIds(ids,
|
|
17430
|
+
async deleteByIds(ids, _scope) {
|
|
17214
17431
|
if (ids.length === 0) return;
|
|
17215
|
-
const
|
|
17216
|
-
const BATCH_SIZE = 500;
|
|
17432
|
+
const BATCH_SIZE = 100;
|
|
17217
17433
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17218
17434
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17219
|
-
await
|
|
17435
|
+
await this.chunksNs.delete(batch);
|
|
17220
17436
|
}
|
|
17221
17437
|
}
|
|
17222
17438
|
async deleteScope(scope) {
|
|
17223
|
-
|
|
17224
|
-
const
|
|
17225
|
-
|
|
17226
|
-
|
|
17227
|
-
|
|
17228
|
-
|
|
17229
|
-
|
|
17230
|
-
|
|
17231
|
-
|
|
17439
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17440
|
+
const ids = [];
|
|
17441
|
+
let cursor = "0";
|
|
17442
|
+
try {
|
|
17443
|
+
for (; ; ) {
|
|
17444
|
+
const result = await ns.range({
|
|
17445
|
+
cursor,
|
|
17446
|
+
limit: 100,
|
|
17447
|
+
includeMetadata: true
|
|
17448
|
+
});
|
|
17449
|
+
for (const doc of result.vectors) {
|
|
17450
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17451
|
+
ids.push(String(doc.id));
|
|
17452
|
+
}
|
|
17453
|
+
}
|
|
17454
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17455
|
+
cursor = result.nextCursor;
|
|
17456
|
+
}
|
|
17457
|
+
} catch {
|
|
17458
|
+
}
|
|
17459
|
+
if (ids.length > 0) {
|
|
17460
|
+
const BATCH_SIZE = 100;
|
|
17461
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17462
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17463
|
+
await ns.delete(batch);
|
|
17464
|
+
}
|
|
17465
|
+
}
|
|
17232
17466
|
}
|
|
17233
17467
|
}
|
|
17234
17468
|
async listScopes(projectId) {
|
|
17235
|
-
const
|
|
17236
|
-
const
|
|
17237
|
-
|
|
17238
|
-
for (const name of allIndexes) {
|
|
17239
|
-
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
17240
|
-
const scopeName = name.slice(prefix.length);
|
|
17241
|
-
scopeNames.add(scopeName);
|
|
17242
|
-
}
|
|
17243
|
-
}
|
|
17244
|
-
const scopes = [];
|
|
17245
|
-
for (const scopeName of scopeNames) {
|
|
17246
|
-
const scope = {
|
|
17247
|
-
projectId,
|
|
17248
|
-
scopeName,
|
|
17249
|
-
scopeId: `${projectId}:${scopeName}`
|
|
17250
|
-
};
|
|
17469
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17470
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17471
|
+
let cursor = "0";
|
|
17251
17472
|
try {
|
|
17252
|
-
|
|
17253
|
-
|
|
17254
|
-
|
|
17255
|
-
|
|
17256
|
-
|
|
17257
|
-
|
|
17258
|
-
|
|
17473
|
+
for (; ; ) {
|
|
17474
|
+
const result = await ns.range({
|
|
17475
|
+
cursor,
|
|
17476
|
+
limit: 100,
|
|
17477
|
+
includeMetadata: true
|
|
17478
|
+
});
|
|
17479
|
+
for (const doc of result.vectors) {
|
|
17480
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17481
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17482
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17483
|
+
}
|
|
17484
|
+
}
|
|
17485
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17486
|
+
cursor = result.nextCursor;
|
|
17487
|
+
}
|
|
17259
17488
|
} catch {
|
|
17260
|
-
scopes.push({
|
|
17261
|
-
projectId,
|
|
17262
|
-
scopeName,
|
|
17263
|
-
lastIndexedAt: "unknown",
|
|
17264
|
-
documentCount: 0
|
|
17265
|
-
});
|
|
17266
17489
|
}
|
|
17267
17490
|
}
|
|
17268
|
-
return
|
|
17491
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17492
|
+
projectId,
|
|
17493
|
+
scopeName,
|
|
17494
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17495
|
+
documentCount: count
|
|
17496
|
+
}));
|
|
17269
17497
|
}
|
|
17270
17498
|
async getContentHashes(scope) {
|
|
17271
17499
|
const map = /* @__PURE__ */ new Map();
|
|
17272
|
-
const index = this.chunkIndex(scope);
|
|
17273
17500
|
let cursor = "0";
|
|
17274
17501
|
try {
|
|
17275
17502
|
for (; ; ) {
|
|
17276
|
-
const result = await
|
|
17277
|
-
|
|
17278
|
-
|
|
17279
|
-
|
|
17503
|
+
const result = await this.chunksNs.range({
|
|
17504
|
+
cursor,
|
|
17505
|
+
limit: 100,
|
|
17506
|
+
includeMetadata: true
|
|
17507
|
+
});
|
|
17508
|
+
for (const doc of result.vectors) {
|
|
17509
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17510
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17511
|
+
}
|
|
17512
|
+
}
|
|
17513
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17514
|
+
cursor = result.nextCursor;
|
|
17515
|
+
}
|
|
17516
|
+
} catch {
|
|
17517
|
+
}
|
|
17518
|
+
return map;
|
|
17519
|
+
}
|
|
17520
|
+
async listPages(scope, opts) {
|
|
17521
|
+
const cursor = opts?.cursor ?? "0";
|
|
17522
|
+
const limit = opts?.limit ?? 50;
|
|
17523
|
+
try {
|
|
17524
|
+
const result = await this.pagesNs.range({
|
|
17525
|
+
cursor,
|
|
17526
|
+
limit,
|
|
17527
|
+
includeMetadata: true
|
|
17528
|
+
});
|
|
17529
|
+
const pages = result.vectors.filter(
|
|
17530
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17531
|
+
).map((doc) => ({
|
|
17532
|
+
url: doc.metadata?.url ?? "",
|
|
17533
|
+
title: doc.metadata?.title ?? "",
|
|
17534
|
+
description: doc.metadata?.description ?? "",
|
|
17535
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17536
|
+
}));
|
|
17537
|
+
const response = { pages };
|
|
17538
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17539
|
+
response.nextCursor = result.nextCursor;
|
|
17540
|
+
}
|
|
17541
|
+
return response;
|
|
17542
|
+
} catch {
|
|
17543
|
+
return { pages: [] };
|
|
17544
|
+
}
|
|
17545
|
+
}
|
|
17546
|
+
async getPageHashes(scope) {
|
|
17547
|
+
const map = /* @__PURE__ */ new Map();
|
|
17548
|
+
let cursor = "0";
|
|
17549
|
+
try {
|
|
17550
|
+
for (; ; ) {
|
|
17551
|
+
const result = await this.pagesNs.range({
|
|
17552
|
+
cursor,
|
|
17553
|
+
limit: 100,
|
|
17554
|
+
includeMetadata: true
|
|
17555
|
+
});
|
|
17556
|
+
for (const doc of result.vectors) {
|
|
17557
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17558
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17280
17559
|
}
|
|
17281
17560
|
}
|
|
17282
17561
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17286,47 +17565,43 @@ var UpstashSearchStore = class {
|
|
|
17286
17565
|
}
|
|
17287
17566
|
return map;
|
|
17288
17567
|
}
|
|
17568
|
+
async deletePagesByIds(ids, _scope) {
|
|
17569
|
+
if (ids.length === 0) return;
|
|
17570
|
+
const BATCH_SIZE = 50;
|
|
17571
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17572
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17573
|
+
await this.pagesNs.delete(batch);
|
|
17574
|
+
}
|
|
17575
|
+
}
|
|
17289
17576
|
async upsertPages(pages, scope) {
|
|
17290
17577
|
if (pages.length === 0) return;
|
|
17291
|
-
const index = this.pageIndex(scope);
|
|
17292
17578
|
const BATCH_SIZE = 50;
|
|
17293
17579
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17294
17580
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17295
|
-
|
|
17296
|
-
|
|
17297
|
-
|
|
17298
|
-
|
|
17299
|
-
|
|
17300
|
-
|
|
17301
|
-
|
|
17302
|
-
|
|
17303
|
-
|
|
17304
|
-
|
|
17305
|
-
}
|
|
17306
|
-
|
|
17307
|
-
markdown: p.markdown,
|
|
17308
|
-
projectId: p.projectId,
|
|
17309
|
-
scopeName: p.scopeName,
|
|
17310
|
-
routeFile: p.routeFile,
|
|
17311
|
-
routeResolution: p.routeResolution,
|
|
17312
|
-
incomingLinks: p.incomingLinks,
|
|
17313
|
-
outgoingLinks: p.outgoingLinks,
|
|
17314
|
-
depth: p.depth,
|
|
17315
|
-
indexedAt: p.indexedAt
|
|
17316
|
-
}
|
|
17317
|
-
}));
|
|
17318
|
-
await index.upsert(docs);
|
|
17581
|
+
await this.pagesNs.upsert(
|
|
17582
|
+
batch.map((p) => ({
|
|
17583
|
+
id: p.id,
|
|
17584
|
+
data: p.data,
|
|
17585
|
+
metadata: {
|
|
17586
|
+
...p.metadata,
|
|
17587
|
+
projectId: scope.projectId,
|
|
17588
|
+
scopeName: scope.scopeName,
|
|
17589
|
+
type: "page"
|
|
17590
|
+
}
|
|
17591
|
+
}))
|
|
17592
|
+
);
|
|
17319
17593
|
}
|
|
17320
17594
|
}
|
|
17321
17595
|
async getPage(url, scope) {
|
|
17322
|
-
const index = this.pageIndex(scope);
|
|
17323
17596
|
try {
|
|
17324
|
-
const results = await
|
|
17597
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17598
|
+
includeMetadata: true
|
|
17599
|
+
});
|
|
17325
17600
|
const doc = results[0];
|
|
17326
|
-
if (!doc) return null;
|
|
17601
|
+
if (!doc || !doc.metadata) return null;
|
|
17327
17602
|
return {
|
|
17328
|
-
url: doc.
|
|
17329
|
-
title: doc.
|
|
17603
|
+
url: doc.metadata.url,
|
|
17604
|
+
title: doc.metadata.title,
|
|
17330
17605
|
markdown: doc.metadata.markdown,
|
|
17331
17606
|
projectId: doc.metadata.projectId,
|
|
17332
17607
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17334,27 +17609,86 @@ var UpstashSearchStore = class {
|
|
|
17334
17609
|
routeResolution: doc.metadata.routeResolution,
|
|
17335
17610
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17336
17611
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17612
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17337
17613
|
depth: doc.metadata.depth,
|
|
17338
|
-
tags: doc.
|
|
17614
|
+
tags: doc.metadata.tags ?? [],
|
|
17339
17615
|
indexedAt: doc.metadata.indexedAt,
|
|
17340
|
-
summary: doc.
|
|
17341
|
-
description: doc.
|
|
17342
|
-
keywords: doc.
|
|
17616
|
+
summary: doc.metadata.summary || void 0,
|
|
17617
|
+
description: doc.metadata.description || void 0,
|
|
17618
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17619
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17343
17620
|
};
|
|
17344
17621
|
} catch {
|
|
17345
17622
|
return null;
|
|
17346
17623
|
}
|
|
17347
17624
|
}
|
|
17625
|
+
async fetchPageWithVector(url, scope) {
|
|
17626
|
+
try {
|
|
17627
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17628
|
+
includeMetadata: true,
|
|
17629
|
+
includeVectors: true
|
|
17630
|
+
});
|
|
17631
|
+
const doc = results[0];
|
|
17632
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17633
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17634
|
+
return null;
|
|
17635
|
+
}
|
|
17636
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17637
|
+
} catch {
|
|
17638
|
+
return null;
|
|
17639
|
+
}
|
|
17640
|
+
}
|
|
17641
|
+
async fetchPagesBatch(urls, scope) {
|
|
17642
|
+
if (urls.length === 0) return [];
|
|
17643
|
+
try {
|
|
17644
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17645
|
+
includeMetadata: true
|
|
17646
|
+
});
|
|
17647
|
+
const out = [];
|
|
17648
|
+
for (const doc of results) {
|
|
17649
|
+
if (!doc || !doc.metadata) continue;
|
|
17650
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17651
|
+
continue;
|
|
17652
|
+
}
|
|
17653
|
+
out.push({
|
|
17654
|
+
url: doc.metadata.url,
|
|
17655
|
+
title: doc.metadata.title,
|
|
17656
|
+
routeFile: doc.metadata.routeFile,
|
|
17657
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17658
|
+
});
|
|
17659
|
+
}
|
|
17660
|
+
return out;
|
|
17661
|
+
} catch {
|
|
17662
|
+
return [];
|
|
17663
|
+
}
|
|
17664
|
+
}
|
|
17348
17665
|
async deletePages(scope) {
|
|
17666
|
+
const ids = [];
|
|
17667
|
+
let cursor = "0";
|
|
17349
17668
|
try {
|
|
17350
|
-
|
|
17351
|
-
|
|
17669
|
+
for (; ; ) {
|
|
17670
|
+
const result = await this.pagesNs.range({
|
|
17671
|
+
cursor,
|
|
17672
|
+
limit: 100,
|
|
17673
|
+
includeMetadata: true
|
|
17674
|
+
});
|
|
17675
|
+
for (const doc of result.vectors) {
|
|
17676
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17677
|
+
ids.push(String(doc.id));
|
|
17678
|
+
}
|
|
17679
|
+
}
|
|
17680
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17681
|
+
cursor = result.nextCursor;
|
|
17682
|
+
}
|
|
17352
17683
|
} catch {
|
|
17353
17684
|
}
|
|
17685
|
+
if (ids.length > 0) {
|
|
17686
|
+
await this.deletePagesByIds(ids, scope);
|
|
17687
|
+
}
|
|
17354
17688
|
}
|
|
17355
17689
|
async health() {
|
|
17356
17690
|
try {
|
|
17357
|
-
await this.
|
|
17691
|
+
await this.index.info();
|
|
17358
17692
|
return { ok: true };
|
|
17359
17693
|
} catch (error) {
|
|
17360
17694
|
return {
|
|
@@ -17364,14 +17698,31 @@ var UpstashSearchStore = class {
|
|
|
17364
17698
|
}
|
|
17365
17699
|
}
|
|
17366
17700
|
async dropAllIndexes(projectId) {
|
|
17367
|
-
const
|
|
17368
|
-
|
|
17369
|
-
|
|
17370
|
-
|
|
17371
|
-
|
|
17372
|
-
const
|
|
17373
|
-
|
|
17374
|
-
|
|
17701
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17702
|
+
const ids = [];
|
|
17703
|
+
let cursor = "0";
|
|
17704
|
+
try {
|
|
17705
|
+
for (; ; ) {
|
|
17706
|
+
const result = await ns.range({
|
|
17707
|
+
cursor,
|
|
17708
|
+
limit: 100,
|
|
17709
|
+
includeMetadata: true
|
|
17710
|
+
});
|
|
17711
|
+
for (const doc of result.vectors) {
|
|
17712
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17713
|
+
ids.push(String(doc.id));
|
|
17714
|
+
}
|
|
17715
|
+
}
|
|
17716
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17717
|
+
cursor = result.nextCursor;
|
|
17718
|
+
}
|
|
17719
|
+
} catch {
|
|
17720
|
+
}
|
|
17721
|
+
if (ids.length > 0) {
|
|
17722
|
+
const BATCH_SIZE = 100;
|
|
17723
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17724
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17725
|
+
await ns.delete(batch);
|
|
17375
17726
|
}
|
|
17376
17727
|
}
|
|
17377
17728
|
}
|
|
@@ -17385,12 +17736,16 @@ async function createUpstashStore(config) {
|
|
|
17385
17736
|
if (!url || !token) {
|
|
17386
17737
|
throw new SearchSocketError(
|
|
17387
17738
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17388
|
-
`Missing Upstash
|
|
17739
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17389
17740
|
);
|
|
17390
17741
|
}
|
|
17391
|
-
const {
|
|
17392
|
-
const
|
|
17393
|
-
return new UpstashSearchStore({
|
|
17742
|
+
const { Index } = await import('@upstash/vector');
|
|
17743
|
+
const index = new Index({ url, token });
|
|
17744
|
+
return new UpstashSearchStore({
|
|
17745
|
+
index,
|
|
17746
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17747
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17748
|
+
});
|
|
17394
17749
|
}
|
|
17395
17750
|
function sha1(input) {
|
|
17396
17751
|
return crypto.createHash("sha1").update(input).digest("hex");
|
|
@@ -17458,6 +17813,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17458
17813
|
if (normalizeText(current.text)) {
|
|
17459
17814
|
sections.push({
|
|
17460
17815
|
sectionTitle: current.sectionTitle,
|
|
17816
|
+
headingLevel: current.headingLevel,
|
|
17461
17817
|
headingPath: current.headingPath,
|
|
17462
17818
|
text: current.text.trim()
|
|
17463
17819
|
});
|
|
@@ -17476,6 +17832,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17476
17832
|
headingStack.length = level;
|
|
17477
17833
|
current = {
|
|
17478
17834
|
sectionTitle: title,
|
|
17835
|
+
headingLevel: level,
|
|
17479
17836
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
17480
17837
|
text: `${line}
|
|
17481
17838
|
`
|
|
@@ -17611,6 +17968,7 @@ function splitSection(section, config) {
|
|
|
17611
17968
|
return [
|
|
17612
17969
|
{
|
|
17613
17970
|
sectionTitle: section.sectionTitle,
|
|
17971
|
+
headingLevel: section.headingLevel,
|
|
17614
17972
|
headingPath: section.headingPath,
|
|
17615
17973
|
chunkText: text
|
|
17616
17974
|
}
|
|
@@ -17661,6 +18019,7 @@ ${chunk}`;
|
|
|
17661
18019
|
}
|
|
17662
18020
|
return merged.map((chunkText) => ({
|
|
17663
18021
|
sectionTitle: section.sectionTitle,
|
|
18022
|
+
headingLevel: section.headingLevel,
|
|
17664
18023
|
headingPath: section.headingPath,
|
|
17665
18024
|
chunkText
|
|
17666
18025
|
}));
|
|
@@ -17676,6 +18035,18 @@ function buildSummaryChunkText(page) {
|
|
|
17676
18035
|
}
|
|
17677
18036
|
return parts.join("\n\n");
|
|
17678
18037
|
}
|
|
18038
|
+
/**
 * Builds the "<page title> — <heading trail>" string for a chunk.
 *
 * @param {{title: string, sectionTitle?: string, headingLevel?: number, headingPath: string[]}} chunk
 * @returns {string | undefined} `undefined` when the chunk has no section title
 *   or no heading level. With a heading path of two or more entries the trail
 *   is the path joined by " > ", with the section title appended when it is
 *   not already the last path entry; otherwise the trail is the section title.
 */
function buildEmbeddingTitle(chunk) {
  const { title, sectionTitle, headingLevel } = chunk;
  if (!sectionTitle || headingLevel === undefined) return undefined;
  const headingPath = chunk.headingPath;
  if (!(headingPath.length > 1)) {
    return `${title} \u2014 ${sectionTitle}`;
  }
  const lastHeading = headingPath[headingPath.length - 1];
  const trail = lastHeading === sectionTitle ? headingPath : [...headingPath, sectionTitle];
  return `${title} \u2014 ${trail.join(" > ")}`;
}
|
|
17679
18050
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
17680
18051
|
if (!prependTitle) return chunk.chunkText;
|
|
17681
18052
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -17706,10 +18077,14 @@ function chunkPage(page, config, scope) {
|
|
|
17706
18077
|
tags: page.tags,
|
|
17707
18078
|
contentHash: "",
|
|
17708
18079
|
description: page.description,
|
|
17709
|
-
keywords: page.keywords
|
|
18080
|
+
keywords: page.keywords,
|
|
18081
|
+
publishedAt: page.publishedAt,
|
|
18082
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18083
|
+
meta: page.meta
|
|
17710
18084
|
};
|
|
17711
18085
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
17712
|
-
|
|
18086
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18087
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
17713
18088
|
chunks.push(summaryChunk);
|
|
17714
18089
|
}
|
|
17715
18090
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -17726,6 +18101,7 @@ function chunkPage(page, config, scope) {
|
|
|
17726
18101
|
path: page.url,
|
|
17727
18102
|
title: page.title,
|
|
17728
18103
|
sectionTitle: entry.sectionTitle,
|
|
18104
|
+
headingLevel: entry.headingLevel,
|
|
17729
18105
|
headingPath: entry.headingPath,
|
|
17730
18106
|
chunkText: entry.chunkText,
|
|
17731
18107
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -17735,10 +18111,16 @@ function chunkPage(page, config, scope) {
|
|
|
17735
18111
|
tags: page.tags,
|
|
17736
18112
|
contentHash: "",
|
|
17737
18113
|
description: page.description,
|
|
17738
|
-
keywords: page.keywords
|
|
18114
|
+
keywords: page.keywords,
|
|
18115
|
+
publishedAt: page.publishedAt,
|
|
18116
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18117
|
+
meta: page.meta
|
|
17739
18118
|
};
|
|
17740
18119
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
17741
|
-
|
|
18120
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
18121
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18122
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
18123
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
17742
18124
|
chunks.push(chunk);
|
|
17743
18125
|
}
|
|
17744
18126
|
return chunks;
|
|
@@ -18570,7 +18952,112 @@ function gfm(turndownService) {
|
|
|
18570
18952
|
]);
|
|
18571
18953
|
}
|
|
18572
18954
|
|
|
18955
|
+
// src/utils/structured-meta.ts
|
|
18956
|
+
// Identifier-shaped keys only: a letter or underscore, then letters, digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
/**
 * Tells whether `key` is acceptable as a structured-meta key.
 *
 * @param {string} key
 * @returns {boolean} `true` when the whole key matches `VALID_KEY_RE`.
 */
function validateMetaKey(key) {
  const match = VALID_KEY_RE.exec(key);
  return match !== null;
}
|
|
18960
|
+
/**
 * Converts the raw `content` string of a structured-meta tag to the value
 * type named by `dataType`.
 *
 * - "number" / "date": the finite number the string parses to; the original
 *   string when it is empty, whitespace-only or not a finite number.
 * - "boolean": `true` only for the exact string "true".
 * - "string[]": comma-separated entries, trimmed, with empty entries dropped.
 * - anything else: `content` unchanged.
 *
 * @param {string} content - Raw attribute value.
 * @param {string} dataType - Declared type of the value.
 * @returns {string | number | boolean | string[]}
 */
function parseMetaValue(content, dataType) {
  const toFiniteNumber = (raw) => {
    // Number("") and Number("   ") are 0; a blank value must not become 0.
    if (typeof raw === "string" && raw.trim() === "") return raw;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : raw;
  };
  switch (dataType) {
    case "number":
    case "date":
      return toFiniteNumber(content);
    case "boolean":
      return content === "true";
    case "string[]":
      // Drop blanks produced by stray commas ("a,,b" or "a, b,").
      return content ? content.split(",").map((s) => s.trim()).filter(Boolean) : [];
    default:
      return content;
  }
}
|
|
18978
|
+
/**
 * Doubles every single quote in `s` so the value can be placed inside a
 * single-quoted filter literal.
 *
 * @param {string} s
 * @returns {string}
 */
function escapeFilterValue(s) {
  const pieces = s.split("'");
  return pieces.join("''");
}
|
|
18981
|
+
/**
 * Turns a `{ key: value }` filter object into a filter expression over
 * `meta.<key>` fields, joined with " AND ".
 *
 * Keys rejected by `validateMetaKey` are skipped. String values become
 * `meta.key CONTAINS '<escaped value>'`; every other value is interpolated
 * as `meta.key = <value>`.
 *
 * @param {Record<string, string | number | boolean>} filters
 * @returns {string} Empty string when no clause was produced.
 */
function buildMetaFilterString(filters) {
  const toClause = ([key, value]) => {
    const operand = typeof value === "string" ? `CONTAINS '${escapeFilterValue(value)}'` : `= ${value}`;
    return `meta.${key} ${operand}`;
  };
  return Object.entries(filters)
    .filter(([key]) => validateMetaKey(key))
    .map(toClause)
    .join(" AND ");
}
|
|
18996
|
+
|
|
18573
18997
|
// src/indexing/extractor.ts
|
|
18998
|
+
/**
 * Normalises a date-like value to epoch milliseconds.
 *
 * @param {unknown} value - A `Date`, a string accepted by `new Date(...)`, or a number.
 * @returns {number | undefined} The finite millisecond value, or `undefined`
 *   for null/undefined, invalid dates, non-finite numbers and other types.
 */
function normalizeDateToMs(value) {
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else if (typeof value === "string") {
    ms = new Date(value).getTime();
  } else if (typeof value === "number") {
    ms = value;
  }
  // `ms` stays undefined for unsupported inputs, which is not finite either.
  return Number.isFinite(ms) ? ms : undefined;
}
|
|
19013
|
+
// Frontmatter keys probed for a date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];
/**
 * Returns the first usable date found in the frontmatter, as epoch milliseconds.
 *
 * @param {Record<string, unknown>} data - Parsed frontmatter.
 * @returns {number | undefined} Value of the first field in
 *   `FRONTMATTER_DATE_FIELDS` that `normalizeDateToMs` accepts, else `undefined`.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let i = 0; i < FRONTMATTER_DATE_FIELDS.length; i += 1) {
    const ms = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[i]]);
    if (ms !== undefined) {
      return ms;
    }
  }
  return undefined;
}
|
|
19021
|
+
/**
 * Looks for a publication date in a loaded HTML document and returns it as
 * epoch milliseconds.
 *
 * Sources are tried in this order; the first one that `normalizeDateToMs`
 * accepts wins:
 *   1. `datePublished` in JSON-LD scripts (top-level object, top-level array,
 *      or entries of a top-level object's `@graph` array)
 *   2. `<meta property="article:published_time">`
 *   3. `<meta itemprop="datePublished">` or `<time itemprop="datePublished">`
 *   4. the first `<time datetime>` element
 *
 * @param {Function} $ - Document query function (selector -> selection), as
 *   produced by `cheerio.load` in this file.
 * @returns {number | undefined}
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const candidates = [];
      if (Array.isArray(parsed)) {
        candidates.push(...parsed);
      } else if (parsed && typeof parsed === "object") {
        candidates.push(parsed);
        if (Array.isArray(parsed["@graph"])) {
          candidates.push(...parsed["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // Optional access: a null entry must not abort the remaining
        // candidates of this script via the catch below.
        const val = normalizeDateToMs(candidate?.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Malformed JSON-LD is ignored; later scripts and the meta/time
      // fallbacks are still tried.
    }
  }
  const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
  if (ogTime) {
    const val = normalizeDateToMs(ogTime);
    if (val !== void 0) return val;
  }
  const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
  if (itempropDate) {
    const val = normalizeDateToMs(itempropDate);
    if (val !== void 0) return val;
  }
  const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
  if (timeEl) {
    const val = normalizeDateToMs(timeEl);
    if (val !== void 0) return val;
  }
  return void 0;
}
|
|
18574
19061
|
function hasTopLevelNoindexComment(markdown) {
|
|
18575
19062
|
const lines = markdown.split(/\r?\n/);
|
|
18576
19063
|
let inFence = false;
|
|
@@ -18586,6 +19073,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
18586
19073
|
}
|
|
18587
19074
|
return false;
|
|
18588
19075
|
}
|
|
19076
|
+
// Alt texts that only say "this is an image" and carry no searchable meaning.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);
// Alt text that is just an image file name, optionally with a query string.
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
/**
 * Decides whether an image's alt text is worth indexing.
 *
 * @param {string} alt - Raw alt attribute value.
 * @returns {boolean} `false` when the trimmed text is shorter than five
 *   characters, ends in an image file extension, or is one of
 *   `GARBAGE_ALT_WORDS` (case-insensitive); `true` otherwise.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  const longEnough = text.length >= 5;
  return longEnough && !IMAGE_EXT_RE.test(text) && !GARBAGE_ALT_WORDS.has(text.toLowerCase());
}
|
|
19102
|
+
/**
 * Picks the text that should stand in for an image.
 *
 * Priority: the image's own `imageDescAttr` attribute, then the same
 * attribute on the closest `<figure>`, then meaningful alt text and/or the
 * figure's first `<figcaption>` (joined with an em dash when both exist).
 *
 * @param {object} img - Selection wrapping the `<img>` element.
 * @param {Function} $ - Document query function (not used here).
 * @param {string} imageDescAttr - Name of the description attribute.
 * @returns {string | null} `null` when no usable text exists.
 */
function resolveImageText(img, $, imageDescAttr) {
  const ownDescription = img.attr(imageDescAttr)?.trim();
  if (ownDescription) return ownDescription;
  const figure = img.closest("figure");
  const insideFigure = Boolean(figure.length);
  if (insideFigure) {
    const figureDescription = figure.attr(imageDescAttr)?.trim();
    if (figureDescription) return figureDescription;
  }
  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  const parts = [];
  if (isMeaningfulAlt(alt)) parts.push(alt);
  if (caption) parts.push(caption);
  return parts.length > 0 ? parts.join(" \u2014 ") : null;
}
|
|
19123
|
+
// Link labels that say nothing about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);
/**
 * Normalises link text: collapses whitespace, trims, lower-cases.
 *
 * @param {string} raw - Text content of the anchor.
 * @returns {string} The normalised text cut to at most 100 characters, or ""
 *   when it is shorter than three characters or listed in `STOP_ANCHORS`.
 */
function normalizeAnchorText(raw) {
  const text = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const unusable = text.length < 3 || STOP_ANCHORS.has(text);
  return unusable ? "" : text.slice(0, 100);
}
|
|
19139
|
+
/**
 * Escapes the characters that are significant in HTML text content so that
 * `text` can be placed between tags without being parsed as markup.
 *
 * `&` is replaced first so that the entities produced for `<` and `>` are not
 * escaped a second time.
 *
 * @param {string} text - Plain text.
 * @returns {string} Text with `&`, `<` and `>` replaced by `&amp;`, `&lt;` and `&gt;`.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
19142
|
+
/**
 * Replaces images under `root2` with their descriptive text before the HTML
 * is converted further.
 *
 * `<picture>` elements are handled first (using their first `<img>`), then
 * every remaining `<img>`. An element with usable text becomes a `<span>`
 * holding the escaped text, and the `<figcaption>` of its enclosing
 * `<figure>` is removed; an element without usable text is removed.
 *
 * @param {object} root2 - Selection to search within; modified in place.
 * @param {Function} $ - Document query function used to wrap elements.
 * @param {string} imageDescAttr - Name of the image description attribute.
 * @returns {void}
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Swaps `target` for the text resolved from `img`, or drops it.
  const substitute = (target, img) => {
    const enclosingFigure = target.closest("figure");
    const text = img.length ? resolveImageText(img, $, imageDescAttr) : null;
    if (!text) {
      target.remove();
      return;
    }
    if (enclosingFigure.length) enclosingFigure.find("figcaption").remove();
    target.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };
  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    substitute(picture, picture.find("img").first());
  });
  root2.find("img").each((_i, el) => {
    const img = $(el);
    substitute(img, img);
  });
}
|
|
18589
19167
|
function extractFromHtml(url, html, config) {
|
|
18590
19168
|
const $ = cheerio.load(html);
|
|
18591
19169
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -18611,6 +19189,20 @@ function extractFromHtml(url, html, config) {
|
|
|
18611
19189
|
if (weight === 0) {
|
|
18612
19190
|
return null;
|
|
18613
19191
|
}
|
|
19192
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
19193
|
+
return null;
|
|
19194
|
+
}
|
|
19195
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
19196
|
+
const meta = {};
|
|
19197
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
19198
|
+
const name = $(el).attr("name") ?? "";
|
|
19199
|
+
const key = name.slice("searchsocket:".length);
|
|
19200
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
19201
|
+
const content = $(el).attr("content") ?? "";
|
|
19202
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
19203
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
19204
|
+
});
|
|
19205
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
18614
19206
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
18615
19207
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
18616
19208
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -18622,7 +19214,9 @@ function extractFromHtml(url, html, config) {
|
|
|
18622
19214
|
root2.find(selector).remove();
|
|
18623
19215
|
}
|
|
18624
19216
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
19217
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
18625
19218
|
const outgoingLinks = [];
|
|
19219
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
18626
19220
|
root2.find("a[href]").each((_index, node) => {
|
|
18627
19221
|
const href = $(node).attr("href");
|
|
18628
19222
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -18633,7 +19227,19 @@ function extractFromHtml(url, html, config) {
|
|
|
18633
19227
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
18634
19228
|
return;
|
|
18635
19229
|
}
|
|
18636
|
-
|
|
19230
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
19231
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
19232
|
+
if (!anchorText) {
|
|
19233
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
19234
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
19235
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
19236
|
+
}
|
|
19237
|
+
}
|
|
19238
|
+
const key = `${url2}|${anchorText}`;
|
|
19239
|
+
if (!seenLinkKeys.has(key)) {
|
|
19240
|
+
seenLinkKeys.add(key);
|
|
19241
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
19242
|
+
}
|
|
18637
19243
|
} catch {
|
|
18638
19244
|
}
|
|
18639
19245
|
});
|
|
@@ -18658,16 +19264,25 @@ function extractFromHtml(url, html, config) {
|
|
|
18658
19264
|
return null;
|
|
18659
19265
|
}
|
|
18660
19266
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
19267
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
19268
|
+
if (componentTags) {
|
|
19269
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
19270
|
+
for (const t of extraTags) {
|
|
19271
|
+
if (!tags.includes(t)) tags.push(t);
|
|
19272
|
+
}
|
|
19273
|
+
}
|
|
18661
19274
|
return {
|
|
18662
19275
|
url: normalizeUrlPath(url),
|
|
18663
19276
|
title,
|
|
18664
19277
|
markdown,
|
|
18665
|
-
outgoingLinks
|
|
19278
|
+
outgoingLinks,
|
|
18666
19279
|
noindex: false,
|
|
18667
19280
|
tags,
|
|
18668
19281
|
description,
|
|
18669
19282
|
keywords,
|
|
18670
|
-
weight
|
|
19283
|
+
weight,
|
|
19284
|
+
publishedAt,
|
|
19285
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
18671
19286
|
};
|
|
18672
19287
|
}
|
|
18673
19288
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -18688,6 +19303,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18688
19303
|
if (mdWeight === 0) {
|
|
18689
19304
|
return null;
|
|
18690
19305
|
}
|
|
19306
|
+
let mdMeta;
|
|
19307
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
19308
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
19309
|
+
const metaObj = {};
|
|
19310
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
19311
|
+
if (!validateMetaKey(key)) continue;
|
|
19312
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
19313
|
+
metaObj[key] = val;
|
|
19314
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
19315
|
+
metaObj[key] = val;
|
|
19316
|
+
} else if (val instanceof Date) {
|
|
19317
|
+
metaObj[key] = val.getTime();
|
|
19318
|
+
}
|
|
19319
|
+
}
|
|
19320
|
+
if (Object.keys(metaObj).length > 0) {
|
|
19321
|
+
mdMeta = metaObj;
|
|
19322
|
+
}
|
|
19323
|
+
}
|
|
18691
19324
|
const content = parsed.content;
|
|
18692
19325
|
const normalized = normalizeMarkdown(content);
|
|
18693
19326
|
if (!normalizeText(normalized)) {
|
|
@@ -18702,6 +19335,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18702
19335
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
18703
19336
|
}
|
|
18704
19337
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
19338
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
18705
19339
|
return {
|
|
18706
19340
|
url: normalizeUrlPath(url),
|
|
18707
19341
|
title: resolvedTitle,
|
|
@@ -18711,7 +19345,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18711
19345
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
18712
19346
|
description: fmDescription,
|
|
18713
19347
|
keywords: fmKeywords,
|
|
18714
|
-
weight: mdWeight
|
|
19348
|
+
weight: mdWeight,
|
|
19349
|
+
publishedAt,
|
|
19350
|
+
meta: mdMeta
|
|
18715
19351
|
};
|
|
18716
19352
|
}
|
|
18717
19353
|
function segmentToRegex(segment) {
|
|
@@ -18906,7 +19542,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
18906
19542
|
const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
18907
19543
|
let content;
|
|
18908
19544
|
try {
|
|
18909
|
-
content = await
|
|
19545
|
+
content = await fs8__default.default.readFile(manifestPath, "utf8");
|
|
18910
19546
|
} catch {
|
|
18911
19547
|
throw new SearchSocketError(
|
|
18912
19548
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19217,6 +19853,125 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19217
19853
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19218
19854
|
return normalizeUrlPath(noExt || "/");
|
|
19219
19855
|
}
|
|
19856
|
+
// Route files: +page.svelte, +layout.svelte, +error.svelte, optionally with an "@..." suffix before the extension.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;
/**
 * Tells whether `filePath` is a `.svelte` file that is not a route file.
 *
 * @param {string} filePath
 * @returns {boolean} `true` for `.svelte` files that do not match `ROUTE_FILE_RE`.
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
|
|
19861
|
+
/**
 * Extracts documentation metadata from Svelte component source text.
 *
 * - `description`: the body of the first `<!-- @component ... -->` comment.
 * - `props`: one entry per name destructured from `$props()` in a
 *   `let { ... } = $props()` statement. Rest elements (`...rest`) are skipped.
 *   `type` is looked up in the map returned by `resolveTypeReference` (when the
 *   annotation is a capitalised identifier) or `parseInlineTypeAnnotation`
 *   (when the annotation starts with "{"); `default` is the text after `=`.
 *
 * @param {string} source - Component source code.
 * @returns {{description: string | undefined, props: Array<{name: string, type?: string, default?: string}>}}
 */
function extractSvelteComponentMeta(source) {
  const description = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/)?.[1]?.trim() || undefined;
  const propsMatch = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsMatch) {
    return { description, props: [] };
  }
  const destructureBlock = propsMatch[1];
  const typeAnnotation = propsMatch[2]?.trim();
  let resolvedTypeMap;
  if (typeAnnotation) {
    if (/^[A-Z]\w*$/.test(typeAnnotation)) {
      resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
    } else if (typeAnnotation.startsWith("{")) {
      resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
    }
  }
  const props = [];
  for (const entry of splitDestructureBlock(destructureBlock)) {
    const trimmed = entry.trim();
    if (!trimmed || trimmed.startsWith("...")) continue;
    // "name: alias" / "name: alias = default" first, then "name = default".
    const renamed = trimmed.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/);
    const matched = renamed ?? trimmed.match(/^(\w+)\s*=\s*([\s\S]+)$/);
    const propName = matched ? matched[1] : (trimmed.match(/^(\w+)/)?.[1] ?? trimmed);
    const defaultValue = matched?.[2]?.trim();
    const propType = resolvedTypeMap?.get(propName);
    const prop = { name: propName };
    if (propType) prop.type = propType;
    if (defaultValue) prop.default = defaultValue;
    props.push(prop);
  }
  return { description, props };
}
|
|
19906
|
+
/**
 * Splits the inside of a destructuring pattern into its top-level entries.
 *
 * Commas nested inside `{}`, `[]` or `()` do not split. String literals are copied
 * verbatim so commas or brackets inside a default such as `sep = ", "` are ignored,
 * and `//` / block comments are dropped so their punctuation cannot unbalance the scan.
 * Entries are returned untrimmed; an empty slot between two commas is kept, while a
 * blank trailing remainder is discarded.
 *
 * @param {string} block - Text between the outer braces of the pattern.
 * @returns {string[]} Top-level entries in source order.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let current = "";
  let quote = "";
  for (let i = 0; i < block.length; i++) {
    const ch = block[i];
    if (quote) {
      current += ch;
      if (ch === "\\" && i + 1 < block.length) {
        // Keep the escaped character so an escaped quote does not end the literal.
        current += block[++i];
      } else if (ch === quote) {
        quote = "";
      }
      continue;
    }
    if (ch === "/" && block[i + 1] === "/") {
      // Skip to just before the newline so the newline itself is still copied.
      const lineEnd = block.indexOf("\n", i);
      i = (lineEnd === -1 ? block.length : lineEnd) - 1;
      continue;
    }
    if (ch === "/" && block[i + 1] === "*") {
      const commentEnd = block.indexOf("*/", i + 2);
      i = commentEnd === -1 ? block.length : commentEnd + 1;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      depth--;
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  if (current.trim()) entries.push(current);
  return entries;
}
|
|
19927
|
+
/**
 * Looks up an object-shaped type declared in `source` under `typeName` and returns
 * its members as parsed by `parseTypeMembers`.
 *
 * Recognised forms are `interface Name { ... }` and `type Name = { ... }`, each
 * optionally with a generic parameter list, plus `interface Name extends Base { ... }`.
 * The body is delimited by counting braces from the opening `{`.
 *
 * @param {string} source - Text to search (the component source).
 * @param {string} typeName - Identifier of the interface or type alias.
 * @returns {Map<string, string>|undefined} Member map, or `undefined` when the
 *   declaration is missing or its braces never balance.
 */
function resolveTypeReference(source, typeName) {
  // The name is interpolated into a pattern, so neutralise any regex metacharacters.
  const name = typeName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const generics = "(?:\\s*<[^{}]*>)?";
  const startRe = new RegExp(
    `(?:\\binterface\\s+${name}${generics}(?:\\s+extends\\s+[^{};]+)?\\s*|\\btype\\s+${name}${generics}\\s*=\\s*)\\{`
  );
  const startMatch = source.match(startRe);
  if (!startMatch || startMatch.index === void 0) return void 0;
  const bodyStart = startMatch.index + startMatch[0].length;
  let depth = 1;
  let i = bodyStart;
  while (i < source.length && depth > 0) {
    if (source[i] === "{") depth++;
    else if (source[i] === "}") depth--;
    i++;
  }
  if (depth !== 0) return void 0;
  // `i` sits one past the closing brace; exclude the brace itself from the body.
  const body = source.slice(bodyStart, i - 1);
  return parseTypeMembers(body);
}
|
|
19943
|
+
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`.
 * One leading `{` and one trailing `}` are removed when present, and the remainder
 * is handed to `parseTypeMembers`.
 *
 * @param {string} annotation - Annotation text, expected to be already trimmed.
 * @returns {Map<string, string>} Whatever `parseTypeMembers` returns for the inner text.
 */
function parseInlineTypeAnnotation(annotation) {
  const from = annotation.startsWith("{") ? 1 : 0;
  const to = annotation.endsWith("}") ? annotation.length - 1 : annotation.length;
  return parseTypeMembers(annotation.slice(from, to));
}
|
|
19947
|
+
/**
 * Parses the body of an object type into a `name -> type text` map.
 *
 * Members may be separated by `;`, `,` or a newline, but only at the top nesting
 * level: separators inside `{}`, `[]`, `()`, `<>` or string literals belong to the
 * member's type. Comments are ignored, an optional `readonly` modifier and the `?`
 * marker are dropped from the name, and whitespace inside a multi-line type is
 * collapsed to single spaces. Text that is not of the form `name: type` is skipped.
 *
 * @param {string} body - Text between the braces of the interface / type literal.
 * @returns {Map<string, string>} Member names mapped to their type text, in source order.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();
  for (const rawMember of splitTypeMembers(body)) {
    const member = rawMember.trim();
    if (!member) continue;
    const memberMatch = member.match(/^(?:readonly\s+)?(\w+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      map.set(memberMatch[1], memberMatch[2].replace(/\s+/g, " ").trim());
    }
  }
  return map;
}
/** Splits a type body on top-level `;`, `,` and newlines, dropping comments. */
function splitTypeMembers(body) {
  const members = [];
  let depth = 0;
  let quote = "";
  let current = "";
  for (let i = 0; i < body.length; i++) {
    const ch = body[i];
    if (quote) {
      current += ch;
      if (ch === "\\" && i + 1 < body.length) {
        current += body[++i];
      } else if (ch === quote) {
        quote = "";
      }
      continue;
    }
    if (ch === "/" && body[i + 1] === "/") {
      // Stop just before the newline so it still acts as a member separator.
      const lineEnd = body.indexOf("\n", i);
      i = (lineEnd === -1 ? body.length : lineEnd) - 1;
      continue;
    }
    if (ch === "/" && body[i + 1] === "*") {
      const commentEnd = body.indexOf("*/", i + 2);
      i = commentEnd === -1 ? body.length : commentEnd + 1;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
      continue;
    }
    if (ch === "{" || ch === "[" || ch === "(" || ch === "<") {
      depth++;
    } else if (ch === "}" || ch === "]" || ch === ")" || (ch === ">" && body[i - 1] !== "=")) {
      // `>` closes a generic list unless it is the tail of an arrow `=>`.
      depth = Math.max(0, depth - 1);
    }
    if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  members.push(current);
  return members;
}
|
|
19958
|
+
/**
 * Renders component metadata as one line of prose for indexing.
 *
 * Output shape: `"<Name> component. <description> Props: a (type) default: x, b."`.
 * The description sentence and the props sentence are each included only when
 * present; when the component has neither, an empty string is returned.
 *
 * @param {string} componentName - Display name of the component.
 * @param {{ description?: string, props: Array<{name: string, type?: string, default?: string}> }} meta
 * @returns {string} The rendered sentence(s), or `""`.
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  const hasProps = props.length > 0;
  if (!description && !hasProps) return "";
  const sentences = [`${componentName} component.`];
  if (description) sentences.push(description);
  if (hasProps) {
    const rendered = props
      .map(({ name, type, default: fallback }) => {
        const typePart = type ? ` (${type})` : "";
        const defaultPart = fallback ? ` default: ${fallback}` : "";
        return `${name}${typePart}${defaultPart}`;
      })
      .join(", ");
    sentences.push(`Props: ${rendered}.`);
  }
  return sentences.join(" ");
}
|
|
19220
19975
|
/**
 * Reduces Svelte source to its visible text.
 *
 * Removes `<script>` and `<style>` blocks (any letter case), HTML comments as whole
 * units (so a `>` inside a comment cannot leak the comment's tail into the output),
 * all remaining tags, and `{...}` template expressions, then collapses whitespace.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {string} Single-line plain text.
 */
function normalizeSvelteToMarkdown(source) {
  return source
    .replace(/<script\b[\s\S]*?<\/script\s*>/gi, "")
    .replace(/<style\b[\s\S]*?<\/style\s*>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<[^>]+>/g, " ")
    .replace(/\{[^}]+\}/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
@@ -19235,13 +19990,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19235
19990
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19236
19991
|
const pages = [];
|
|
19237
19992
|
for (const filePath of selected) {
|
|
19238
|
-
const raw = await
|
|
19239
|
-
|
|
19993
|
+
const raw = await fs8__default.default.readFile(filePath, "utf8");
|
|
19994
|
+
let markdown;
|
|
19995
|
+
let tags;
|
|
19996
|
+
if (filePath.endsWith(".md")) {
|
|
19997
|
+
markdown = raw;
|
|
19998
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
19999
|
+
const componentName = path__default.default.basename(filePath, ".svelte");
|
|
20000
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
20001
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
20002
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
20003
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
20004
|
+
tags = ["component"];
|
|
20005
|
+
} else {
|
|
20006
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
20007
|
+
}
|
|
19240
20008
|
pages.push({
|
|
19241
20009
|
url: filePathToUrl(filePath, baseDir),
|
|
19242
20010
|
markdown,
|
|
19243
20011
|
sourcePath: path__default.default.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19244
|
-
outgoingLinks: []
|
|
20012
|
+
outgoingLinks: [],
|
|
20013
|
+
...tags ? { tags } : {}
|
|
19245
20014
|
});
|
|
19246
20015
|
}
|
|
19247
20016
|
return pages;
|
|
@@ -19371,7 +20140,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19371
20140
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19372
20141
|
const pages = [];
|
|
19373
20142
|
for (const filePath of selected) {
|
|
19374
|
-
const html = await
|
|
20143
|
+
const html = await fs8__default.default.readFile(filePath, "utf8");
|
|
19375
20144
|
pages.push({
|
|
19376
20145
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19377
20146
|
html,
|
|
@@ -19434,7 +20203,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
19434
20203
|
}
|
|
19435
20204
|
async function loadRobotsTxtFromDir(dir) {
|
|
19436
20205
|
try {
|
|
19437
|
-
const content = await
|
|
20206
|
+
const content = await fs8__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
|
|
19438
20207
|
return parseRobotsTxt(content);
|
|
19439
20208
|
} catch {
|
|
19440
20209
|
return null;
|
|
@@ -19462,29 +20231,65 @@ function nonNegativeOrZero(value) {
|
|
|
19462
20231
|
/**
 * Canonicalises text for substring comparison between a query and a title.
 *
 * Lower-cases the text, removes everything that is not a letter, combining mark,
 * digit or whitespace, and collapses whitespace. Letters and digits are recognised
 * through Unicode properties, so accented and non-Latin text is kept; NFC
 * normalisation makes composed and decomposed spellings compare equal. ASCII input
 * yields the same result as an ASCII-only filter.
 *
 * @param {string} text - Raw query or title.
 * @returns {string} Normalised text, possibly empty.
 */
function normalizeForTitleMatch(text) {
  return text
    .normalize("NFC")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
19465
|
-
function rankHits(hits, config, query) {
|
|
20234
|
+
function rankHits(hits, config, query, debug) {
|
|
19466
20235
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
19467
20236
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
19468
20237
|
return hits.map((hit) => {
|
|
19469
|
-
|
|
20238
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20239
|
+
let score = baseScore;
|
|
20240
|
+
let incomingLinkBoostValue = 0;
|
|
19470
20241
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
19471
20242
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
19472
|
-
|
|
20243
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
20244
|
+
score += incomingLinkBoostValue;
|
|
19473
20245
|
}
|
|
20246
|
+
let depthBoostValue = 0;
|
|
19474
20247
|
if (config.ranking.enableDepthBoost) {
|
|
19475
20248
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
19476
|
-
|
|
20249
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
20250
|
+
score += depthBoostValue;
|
|
19477
20251
|
}
|
|
20252
|
+
let titleMatchBoostValue = 0;
|
|
19478
20253
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
19479
20254
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
19480
20255
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
19481
|
-
|
|
20256
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
20257
|
+
score += titleMatchBoostValue;
|
|
19482
20258
|
}
|
|
19483
20259
|
}
|
|
19484
|
-
|
|
20260
|
+
let freshnessBoostValue = 0;
|
|
20261
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
20262
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
20263
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
20264
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
20265
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
20266
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
20267
|
+
score += freshnessBoostValue;
|
|
20268
|
+
}
|
|
20269
|
+
}
|
|
20270
|
+
let anchorTextMatchBoostValue = 0;
|
|
20271
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
20272
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
20273
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
20274
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
20275
|
+
score += anchorTextMatchBoostValue;
|
|
20276
|
+
}
|
|
20277
|
+
}
|
|
20278
|
+
const result = {
|
|
19485
20279
|
hit,
|
|
19486
20280
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
19487
20281
|
};
|
|
20282
|
+
if (debug) {
|
|
20283
|
+
result.breakdown = {
|
|
20284
|
+
baseScore,
|
|
20285
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
20286
|
+
depthBoost: depthBoostValue,
|
|
20287
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
20288
|
+
freshnessBoost: freshnessBoostValue,
|
|
20289
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
20290
|
+
};
|
|
20291
|
+
}
|
|
20292
|
+
return result;
|
|
19488
20293
|
}).sort((a, b) => {
|
|
19489
20294
|
const delta = b.finalScore - a.finalScore;
|
|
19490
20295
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -19493,12 +20298,13 @@ function rankHits(hits, config, query) {
|
|
|
19493
20298
|
function trimByScoreGap(results, config) {
|
|
19494
20299
|
if (results.length === 0) return results;
|
|
19495
20300
|
const threshold = config.ranking.scoreGapThreshold;
|
|
19496
|
-
const
|
|
19497
|
-
if (
|
|
19498
|
-
const
|
|
19499
|
-
|
|
19500
|
-
|
|
19501
|
-
|
|
20301
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
20302
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
20303
|
+
const topScore = results[0].pageScore;
|
|
20304
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
20305
|
+
const minThreshold = topScore * minScoreRatio;
|
|
20306
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
20307
|
+
}
|
|
19502
20308
|
}
|
|
19503
20309
|
if (threshold > 0 && results.length > 1) {
|
|
19504
20310
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -19568,61 +20374,99 @@ function aggregateByPage(ranked, config) {
|
|
|
19568
20374
|
return Number.isNaN(delta) ? 0 : delta;
|
|
19569
20375
|
});
|
|
19570
20376
|
}
|
|
19571
|
-
function
|
|
19572
|
-
|
|
19573
|
-
const
|
|
19574
|
-
|
|
19575
|
-
|
|
19576
|
-
|
|
19577
|
-
|
|
19578
|
-
|
|
19579
|
-
|
|
19580
|
-
|
|
19581
|
-
|
|
19582
|
-
if (pageHit) {
|
|
19583
|
-
pagesWithChunks.add(url);
|
|
19584
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
19585
|
-
return {
|
|
19586
|
-
hit: ranked.hit,
|
|
19587
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
19588
|
-
};
|
|
20377
|
+
/**
 * Scores page-level hits and returns them sorted by descending final score.
 *
 * Starting from the hit's own score (negative infinity when not finite), the enabled
 * boosts are added: incoming links (`log(1 + n)` times its weight), depth
 * (`1 / (1 + depth)` times its weight), title match (flat weight when the normalised
 * query contains the normalised title or vice versa) and freshness (reciprocal decay
 * over days since `publishedAt`). The sum is then multiplied by the page weight
 * returned by `findPageWeight`. Pages whose weight is exactly 0 are excluded.
 *
 * The page weight is looked up once per hit and zero-weight pages are skipped before
 * any scoring work; the returned list is the same as scoring first and filtering after.
 *
 * @param {Array<object>} pageHits - Hits exposing `url`, `title`, `description`, `routeFile`,
 *   `depth`, `incomingLinks`, `tags`, `score` and optionally `publishedAt`.
 * @param {object} config - Resolved configuration; only `config.ranking` is read.
 * @param {string} [query] - Search query used for the title-match boost.
 * @param {boolean} [debug] - When truthy, each result carries a `breakdown` object.
 * @returns {Array<object>} Ranked pages with `baseScore` and `finalScore`.
 */
function rankPageHits(pageHits, config, query, debug) {
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
  const titleMatchWeight = config.ranking.weights.titleMatch;
  const ranked = [];
  for (const hit of pageHits) {
    const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
    if (pageWeight === 0) continue;
    const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    let score = baseScore;
    let incomingLinkBoostValue = 0;
    if (config.ranking.enableIncomingLinkBoost) {
      const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
      incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
      score += incomingLinkBoostValue;
    }
    let depthBoostValue = 0;
    if (config.ranking.enableDepthBoost) {
      const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
      depthBoostValue = depthBoost * config.ranking.weights.depth;
      score += depthBoostValue;
    }
    let titleMatchBoostValue = 0;
    if (normalizedQuery && titleMatchWeight > 0) {
      const normalizedTitle = normalizeForTitleMatch(hit.title);
      if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
        titleMatchBoostValue = titleMatchWeight;
        score += titleMatchBoostValue;
      }
    }
    let freshnessBoostValue = 0;
    if (config.ranking.enableFreshnessBoost) {
      const publishedAt = hit.publishedAt;
      if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
        // 864e5 ms = one day; future dates are clamped to "published today".
        const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
        const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
        freshnessBoostValue = decay * config.ranking.weights.freshness;
        score += freshnessBoostValue;
      }
    }
    if (pageWeight !== 1) {
      score *= pageWeight;
    }
    const result = {
      url: hit.url,
      title: hit.title,
      description: hit.description,
      routeFile: hit.routeFile,
      depth: hit.depth,
      incomingLinks: hit.incomingLinks,
      tags: hit.tags,
      baseScore,
      finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
      publishedAt: hit.publishedAt
    };
    if (debug) {
      result.breakdown = {
        baseScore,
        pageWeight,
        incomingLinkBoost: incomingLinkBoostValue,
        depthBoost: depthBoostValue,
        titleMatchBoost: titleMatchBoostValue,
        freshnessBoost: freshnessBoostValue
      };
    }
    ranked.push(result);
  }
  return ranked.sort((a, b) => {
    const delta = b.finalScore - a.finalScore;
    // Two non-finite scores subtract to NaN; treat them as equal to keep the sort stable.
    return Number.isNaN(delta) ? 0 : delta;
  });
}
|
|
20445
|
+
/**
 * Cuts the tail of an already-ranked page list using two configurable rules.
 *
 * 1. Ratio floor: when `minScoreRatio > 0` and the top score is a positive finite
 *    number, entries scoring below `top * minScoreRatio` are removed.
 * 2. Gap cut: when `scoreGapThreshold > 0`, the list is truncated at the first entry
 *    whose relative drop from its predecessor, `(prev - current) / prev`, reaches the
 *    threshold. Only predecessors with a positive score are considered.
 *
 * @param {Array<{finalScore: number}>} results - Pages sorted by descending `finalScore`.
 * @param {object} config - Reads `config.ranking.scoreGapThreshold` and `config.ranking.minScoreRatio`.
 * @returns {Array<{finalScore: number}>} The input array itself when nothing was removed,
 *   otherwise a new, shorter array.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const gapLimit = config.ranking.scoreGapThreshold;
  const ratio = config.ranking.minScoreRatio;
  let kept = results;
  if (ratio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * ratio;
      kept = kept.filter((entry) => entry.finalScore >= floor);
    }
  }
  if (!(gapLimit > 0) || kept.length <= 1) return kept;
  for (let index = 1; index < kept.length; index++) {
    const previous = kept[index - 1].finalScore;
    if (!(previous > 0)) continue;
    const relativeDrop = (previous - kept[index].finalScore) / previous;
    if (relativeDrop >= gapLimit) return kept.slice(0, index);
  }
  return kept;
}
|
|
19626
20470
|
|
|
19627
20471
|
// src/utils/time.ts
|
|
19628
20472
|
function nowIso() {
|
|
@@ -19631,6 +20475,81 @@ function nowIso() {
|
|
|
19631
20475
|
/**
 * Milliseconds elapsed since `start`.
 *
 * @param {bigint} start - An earlier reading of `process.hrtime.bigint()` (nanoseconds).
 * @returns {number} Elapsed time in milliseconds, as a floating-point number.
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
|
|
20478
|
+
/**
 * Resolves a page URL against the project's base URL.
 *
 * @param {string} pageUrl - Page URL or path.
 * @param {string} [baseUrl] - Base URL; when falsy, `pageUrl` is returned untouched.
 * @returns {string} The resolved absolute `href`, or `pageUrl` unchanged when there is
 *   no base URL or the `URL` constructor rejects the pair.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  if (baseUrl) {
    try {
      return new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable base/page combination: fall back to the raw page URL below.
    }
  }
  return pageUrl;
}
|
|
20486
|
+
/**
 * Builds the text of an `llms.txt` index: an H1 title, an optional blockquote
 * description, and a "## Pages" list with one `- [title](url): description` item per page.
 *
 * Pages whose URL is `/llms.txt` or `/llms-full.txt` are left out. Pages are ordered
 * by ascending `depth`, then by descending `incomingLinks`. Page titles and
 * descriptions are collapsed onto a single line so a multi-line value cannot break
 * out of its list item.
 *
 * @param {Array<{url: string, title: string, description?: string, depth: number, incomingLinks: number}>} pages
 * @param {object} config - Reads `config.llmsTxt.title`, `config.llmsTxt.description`,
 *   `config.project.id` and `config.project.baseUrl`.
 * @returns {string} File contents, ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const title = config.llmsTxt.title ?? config.project.id;
  const description = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;
  const toSingleLine = (text) => String(text).replace(/\s+/g, " ").trim();
  const lines = [`# ${title}`];
  if (description) {
    lines.push("", `> ${description}`);
  }
  const filtered = pages.filter(
    (p) => p.url !== "/llms.txt" && p.url !== "/llms-full.txt"
  );
  const sorted = [...filtered].sort((a, b) => {
    if (a.depth !== b.depth) return a.depth - b.depth;
    return b.incomingLinks - a.incomingLinks;
  });
  if (sorted.length > 0) {
    lines.push("", "## Pages", "");
    for (const page of sorted) {
      const url = resolvePageUrl(page.url, baseUrl);
      const pageTitle = toSingleLine(page.title);
      const pageDescription = page.description ? toSingleLine(page.description) : "";
      if (pageDescription) {
        lines.push(`- [${pageTitle}](${url}): ${pageDescription}`);
      } else {
        lines.push(`- [${pageTitle}](${url})`);
      }
    }
  }
  lines.push("");
  return lines.join("\n");
}
|
|
20515
|
+
/**
 * Builds the text of `llms-full.txt`: an H1 title, an optional blockquote description,
 * then for every page a `---` rule, a linked H2 heading and the page's trimmed markdown.
 *
 * Pages whose URL is `/llms.txt` or `/llms-full.txt` are skipped. Pages are ordered by
 * ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<{url: string, title: string, markdown: string, depth: number, incomingLinks: number}>} pages
 * @param {object} config - Reads `config.llmsTxt.title`, `config.llmsTxt.description`,
 *   `config.project.id` and `config.project.baseUrl`.
 * @returns {string} File contents, ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const blurb = config.llmsTxt.description;
  const siteBase = config.project.baseUrl;
  const out = [`# ${heading}`];
  if (blurb) out.push("", `> ${blurb}`);
  // `filter` already returns a fresh array, so sorting it in place leaves `pages` untouched.
  const ordered = pages
    .filter((entry) => entry.url !== "/llms.txt" && entry.url !== "/llms-full.txt")
    .sort((left, right) =>
      left.depth !== right.depth
        ? left.depth - right.depth
        : right.incomingLinks - left.incomingLinks
    );
  for (const entry of ordered) {
    const href = resolvePageUrl(entry.url, siteBase);
    out.push("", "---", "", `## [${entry.title}](${href})`, "");
    out.push(entry.markdown.trim());
  }
  out.push("");
  return out.join("\n");
}
|
|
20538
|
+
/**
 * Writes the `llms.txt` index and, when `config.llmsTxt.generateFull` is set, the
 * companion full-content file next to it.
 *
 * The companion's name is the index name with `-full` inserted before the extension
 * (`llms.txt` -> `llms-full.txt`). For an index path that does not end in `.txt` the
 * same rule is applied through `path.parse`, so the companion never resolves to the
 * index path itself and cannot overwrite the file that was just written.
 *
 * @param {Array<object>} pages - Pages passed through to the generators.
 * @param {object} config - Reads `config.llmsTxt.outputPath` and `config.llmsTxt.generateFull`.
 * @param {string} cwd - Directory against which `outputPath` is resolved.
 * @param {{ info(message: string): void }} logger3 - Receives one message per written file.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path__default.default.dirname(outputPath);
  await fs8__default.default.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs8__default.default.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (config.llmsTxt.generateFull) {
    let fullPath;
    if (outputPath.endsWith(".txt")) {
      fullPath = outputPath.replace(/\.txt$/, "-full.txt");
    } else {
      const { dir, name, ext } = path__default.default.parse(outputPath);
      fullPath = path__default.default.join(dir, `${name}-full${ext}`);
    }
    const fullContent = generateLlmsFullTxt(pages, config);
    await fs8__default.default.writeFile(fullPath, fullContent, "utf8");
    const relativeFull = path__default.default.relative(cwd, fullPath);
    logger3.info(`Generated llms-full.txt at ${relativeFull}`);
  }
}
|
|
19634
20553
|
|
|
19635
20554
|
// src/indexing/pipeline.ts
|
|
19636
20555
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -19649,16 +20568,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
19649
20568
|
if (joined.length <= maxChars) return joined;
|
|
19650
20569
|
return joined.slice(0, maxChars).trim();
|
|
19651
20570
|
}
|
|
20571
|
+
/**
 * Computes a fingerprint of an indexed page, used to detect changes between runs.
 *
 * The fingerprint is `sha256` over these fields joined with `|`, in this order:
 * title, description, sorted keywords, sorted tags, markdown, outgoingLinks,
 * publishedAt, incomingAnchorText, sorted outgoingLinkUrls, and `meta` serialised as
 * JSON. Missing optional fields contribute an empty string.
 *
 * `meta` is serialised with its sorted top-level keys passed as the `JSON.stringify`
 * replacer array, which fixes the key order. Per `JSON.stringify` semantics a
 * replacer array is also a property allow-list at every nesting depth.
 * NOTE(review): nested objects inside `meta` would therefore only keep keys that also
 * exist at the top level — confirm `meta` values are flat.
 *
 * @param {object} page - Indexed page record; `page.tags` must be an array.
 * @returns {string} Whatever `sha256` returns for the joined fields.
 */
function buildPageContentHash(page) {
  const sortedCsv = (values) => values.slice().sort().join(",");
  const fingerprint = [
    page.title,
    page.description ?? "",
    sortedCsv(page.keywords ?? []),
    sortedCsv(page.tags),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    sortedCsv(page.outgoingLinkUrls ?? []),
    page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : ""
  ].join("|");
  return sha256(fingerprint);
}
|
|
19652
20586
|
var IndexPipeline = class _IndexPipeline {
|
|
19653
20587
|
cwd;
|
|
19654
20588
|
config;
|
|
19655
20589
|
store;
|
|
19656
20590
|
logger;
|
|
20591
|
+
hooks;
|
|
19657
20592
|
constructor(options) {
|
|
19658
20593
|
this.cwd = options.cwd;
|
|
19659
20594
|
this.config = options.config;
|
|
19660
20595
|
this.store = options.store;
|
|
19661
20596
|
this.logger = options.logger;
|
|
20597
|
+
this.hooks = options.hooks;
|
|
19662
20598
|
}
|
|
19663
20599
|
static async create(options = {}) {
|
|
19664
20600
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
@@ -19668,7 +20604,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19668
20604
|
cwd,
|
|
19669
20605
|
config,
|
|
19670
20606
|
store,
|
|
19671
|
-
logger: options.logger ?? new Logger()
|
|
20607
|
+
logger: options.logger ?? new Logger(),
|
|
20608
|
+
hooks: options.hooks ?? {}
|
|
19672
20609
|
});
|
|
19673
20610
|
}
|
|
19674
20611
|
getConfig() {
|
|
@@ -19689,7 +20626,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19689
20626
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19690
20627
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
19691
20628
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19692
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
20629
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
19693
20630
|
if (options.force) {
|
|
19694
20631
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19695
20632
|
}
|
|
@@ -19698,8 +20635,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19698
20635
|
}
|
|
19699
20636
|
const manifestStart = stageStart();
|
|
19700
20637
|
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
20638
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
19701
20639
|
stageEnd("manifest", manifestStart);
|
|
19702
|
-
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
20640
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes, ${existingPageHashes.size} existing page hashes loaded`);
|
|
19703
20641
|
const sourceStart = stageStart();
|
|
19704
20642
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19705
20643
|
let sourcePages;
|
|
@@ -19776,11 +20714,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19776
20714
|
);
|
|
19777
20715
|
continue;
|
|
19778
20716
|
}
|
|
19779
|
-
|
|
20717
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
20718
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
20719
|
+
}
|
|
20720
|
+
let accepted;
|
|
20721
|
+
if (this.hooks.transformPage) {
|
|
20722
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20723
|
+
if (transformed === null) {
|
|
20724
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
20725
|
+
continue;
|
|
20726
|
+
}
|
|
20727
|
+
accepted = transformed;
|
|
20728
|
+
} else {
|
|
20729
|
+
accepted = extracted;
|
|
20730
|
+
}
|
|
20731
|
+
extractedPages.push(accepted);
|
|
19780
20732
|
this.logger.event("page_extracted", {
|
|
19781
|
-
url:
|
|
20733
|
+
url: accepted.url
|
|
19782
20734
|
});
|
|
19783
20735
|
}
|
|
20736
|
+
const customRecords = options.customRecords ?? [];
|
|
20737
|
+
if (customRecords.length > 0) {
|
|
20738
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
20739
|
+
for (const record of customRecords) {
|
|
20740
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20741
|
+
const normalized = normalizeMarkdown(record.content);
|
|
20742
|
+
if (!normalized.trim()) {
|
|
20743
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
20744
|
+
continue;
|
|
20745
|
+
}
|
|
20746
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
20747
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
20748
|
+
const extracted = {
|
|
20749
|
+
url: normalizedUrl,
|
|
20750
|
+
title: record.title,
|
|
20751
|
+
markdown: normalized,
|
|
20752
|
+
outgoingLinks: [],
|
|
20753
|
+
noindex: false,
|
|
20754
|
+
tags,
|
|
20755
|
+
weight: record.weight
|
|
20756
|
+
};
|
|
20757
|
+
let accepted;
|
|
20758
|
+
if (this.hooks.transformPage) {
|
|
20759
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20760
|
+
if (transformed === null) {
|
|
20761
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
20762
|
+
continue;
|
|
20763
|
+
}
|
|
20764
|
+
accepted = transformed;
|
|
20765
|
+
} else {
|
|
20766
|
+
accepted = extracted;
|
|
20767
|
+
}
|
|
20768
|
+
extractedPages.push(accepted);
|
|
20769
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
20770
|
+
}
|
|
20771
|
+
}
|
|
19784
20772
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
19785
20773
|
const uniquePages = [];
|
|
19786
20774
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -19813,15 +20801,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19813
20801
|
const linkStart = stageStart();
|
|
19814
20802
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19815
20803
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20804
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
19816
20805
|
for (const page of indexablePages) {
|
|
19817
20806
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19818
20807
|
}
|
|
19819
20808
|
for (const page of indexablePages) {
|
|
19820
|
-
|
|
20809
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
20810
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
20811
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
19821
20812
|
if (!pageSet.has(outgoing)) {
|
|
19822
20813
|
continue;
|
|
19823
20814
|
}
|
|
19824
|
-
|
|
20815
|
+
if (!seenForCount.has(outgoing)) {
|
|
20816
|
+
seenForCount.add(outgoing);
|
|
20817
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
20818
|
+
}
|
|
20819
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
20820
|
+
seenForAnchor.add(outgoing);
|
|
20821
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
20822
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
20823
|
+
}
|
|
20824
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
20825
|
+
}
|
|
19825
20826
|
}
|
|
19826
20827
|
}
|
|
19827
20828
|
stageEnd("links", linkStart);
|
|
@@ -19840,6 +20841,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19840
20841
|
});
|
|
19841
20842
|
}
|
|
19842
20843
|
}
|
|
20844
|
+
for (const record of customRecords) {
|
|
20845
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20846
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
20847
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
20848
|
+
routeFile: "",
|
|
20849
|
+
routeResolution: "exact"
|
|
20850
|
+
});
|
|
20851
|
+
}
|
|
20852
|
+
}
|
|
19843
20853
|
for (const page of indexablePages) {
|
|
19844
20854
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19845
20855
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -19857,6 +20867,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19857
20867
|
} else {
|
|
19858
20868
|
routeExact += 1;
|
|
19859
20869
|
}
|
|
20870
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
20871
|
+
let incomingAnchorText;
|
|
20872
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
20873
|
+
let joined = "";
|
|
20874
|
+
for (const phrase of anchorSet) {
|
|
20875
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
20876
|
+
if (next2.length > 500) break;
|
|
20877
|
+
joined = next2;
|
|
20878
|
+
}
|
|
20879
|
+
incomingAnchorText = joined || void 0;
|
|
20880
|
+
}
|
|
19860
20881
|
const indexedPage = {
|
|
19861
20882
|
url: page.url,
|
|
19862
20883
|
title: page.title,
|
|
@@ -19866,40 +20887,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19866
20887
|
generatedAt: nowIso(),
|
|
19867
20888
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
19868
20889
|
outgoingLinks: page.outgoingLinks.length,
|
|
20890
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
19869
20891
|
depth: getUrlDepth(page.url),
|
|
19870
20892
|
tags: page.tags,
|
|
19871
20893
|
markdown: page.markdown,
|
|
19872
20894
|
description: page.description,
|
|
19873
|
-
keywords: page.keywords
|
|
20895
|
+
keywords: page.keywords,
|
|
20896
|
+
publishedAt: page.publishedAt,
|
|
20897
|
+
incomingAnchorText,
|
|
20898
|
+
meta: page.meta
|
|
19874
20899
|
};
|
|
19875
20900
|
pages.push(indexedPage);
|
|
19876
20901
|
this.logger.event("page_indexed", { url: page.url });
|
|
19877
20902
|
}
|
|
20903
|
+
const pageRecords = pages.map((p) => {
|
|
20904
|
+
const summary = buildPageSummary(p);
|
|
20905
|
+
return {
|
|
20906
|
+
url: p.url,
|
|
20907
|
+
title: p.title,
|
|
20908
|
+
markdown: p.markdown,
|
|
20909
|
+
projectId: scope.projectId,
|
|
20910
|
+
scopeName: scope.scopeName,
|
|
20911
|
+
routeFile: p.routeFile,
|
|
20912
|
+
routeResolution: p.routeResolution,
|
|
20913
|
+
incomingLinks: p.incomingLinks,
|
|
20914
|
+
outgoingLinks: p.outgoingLinks,
|
|
20915
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
20916
|
+
depth: p.depth,
|
|
20917
|
+
tags: p.tags,
|
|
20918
|
+
indexedAt: p.generatedAt,
|
|
20919
|
+
summary,
|
|
20920
|
+
description: p.description,
|
|
20921
|
+
keywords: p.keywords,
|
|
20922
|
+
contentHash: buildPageContentHash(p),
|
|
20923
|
+
publishedAt: p.publishedAt,
|
|
20924
|
+
meta: p.meta
|
|
20925
|
+
};
|
|
20926
|
+
});
|
|
20927
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
20928
|
+
const changedPages = pageRecords.filter(
|
|
20929
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
20930
|
+
);
|
|
20931
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
19878
20932
|
if (!options.dryRun) {
|
|
19879
|
-
|
|
19880
|
-
|
|
19881
|
-
|
|
19882
|
-
|
|
19883
|
-
|
|
19884
|
-
|
|
19885
|
-
|
|
19886
|
-
|
|
19887
|
-
|
|
19888
|
-
|
|
19889
|
-
|
|
19890
|
-
|
|
19891
|
-
|
|
19892
|
-
|
|
19893
|
-
|
|
19894
|
-
|
|
19895
|
-
|
|
19896
|
-
|
|
19897
|
-
|
|
19898
|
-
|
|
19899
|
-
|
|
19900
|
-
|
|
20933
|
+
if (options.force) {
|
|
20934
|
+
await this.store.deletePages(scope);
|
|
20935
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
20936
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
20937
|
+
id: r.url,
|
|
20938
|
+
data: r.summary ?? r.title,
|
|
20939
|
+
metadata: {
|
|
20940
|
+
title: r.title,
|
|
20941
|
+
url: r.url,
|
|
20942
|
+
description: r.description ?? "",
|
|
20943
|
+
keywords: r.keywords ?? [],
|
|
20944
|
+
summary: r.summary ?? "",
|
|
20945
|
+
tags: r.tags,
|
|
20946
|
+
markdown: r.markdown,
|
|
20947
|
+
routeFile: r.routeFile,
|
|
20948
|
+
routeResolution: r.routeResolution,
|
|
20949
|
+
incomingLinks: r.incomingLinks,
|
|
20950
|
+
outgoingLinks: r.outgoingLinks,
|
|
20951
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
20952
|
+
depth: r.depth,
|
|
20953
|
+
indexedAt: r.indexedAt,
|
|
20954
|
+
contentHash: r.contentHash ?? "",
|
|
20955
|
+
publishedAt: r.publishedAt ?? null,
|
|
20956
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
20957
|
+
}
|
|
20958
|
+
}));
|
|
20959
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
20960
|
+
} else {
|
|
20961
|
+
if (changedPages.length > 0) {
|
|
20962
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
20963
|
+
const pageDocs = changedPages.map((r) => ({
|
|
20964
|
+
id: r.url,
|
|
20965
|
+
data: r.summary ?? r.title,
|
|
20966
|
+
metadata: {
|
|
20967
|
+
title: r.title,
|
|
20968
|
+
url: r.url,
|
|
20969
|
+
description: r.description ?? "",
|
|
20970
|
+
keywords: r.keywords ?? [],
|
|
20971
|
+
summary: r.summary ?? "",
|
|
20972
|
+
tags: r.tags,
|
|
20973
|
+
markdown: r.markdown,
|
|
20974
|
+
routeFile: r.routeFile,
|
|
20975
|
+
routeResolution: r.routeResolution,
|
|
20976
|
+
incomingLinks: r.incomingLinks,
|
|
20977
|
+
outgoingLinks: r.outgoingLinks,
|
|
20978
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
20979
|
+
depth: r.depth,
|
|
20980
|
+
indexedAt: r.indexedAt,
|
|
20981
|
+
contentHash: r.contentHash ?? "",
|
|
20982
|
+
publishedAt: r.publishedAt ?? null,
|
|
20983
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
20984
|
+
}
|
|
20985
|
+
}));
|
|
20986
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
20987
|
+
}
|
|
20988
|
+
if (deletedPageUrls.length > 0) {
|
|
20989
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
20990
|
+
}
|
|
20991
|
+
}
|
|
19901
20992
|
}
|
|
20993
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
20994
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
19902
20995
|
stageEnd("pages", pagesStart);
|
|
20996
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
19903
20997
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
19904
20998
|
const chunkStart = stageStart();
|
|
19905
20999
|
this.logger.info("Chunking pages...");
|
|
@@ -19908,6 +21002,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19908
21002
|
if (typeof maxChunks === "number") {
|
|
19909
21003
|
chunks = chunks.slice(0, maxChunks);
|
|
19910
21004
|
}
|
|
21005
|
+
if (this.hooks.transformChunk) {
|
|
21006
|
+
const transformed = [];
|
|
21007
|
+
for (const chunk of chunks) {
|
|
21008
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
21009
|
+
if (result === null) {
|
|
21010
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
21011
|
+
continue;
|
|
21012
|
+
}
|
|
21013
|
+
transformed.push(result);
|
|
21014
|
+
}
|
|
21015
|
+
chunks = transformed;
|
|
21016
|
+
}
|
|
19911
21017
|
for (const chunk of chunks) {
|
|
19912
21018
|
this.logger.event("chunked", {
|
|
19913
21019
|
url: chunk.url,
|
|
@@ -19920,7 +21026,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19920
21026
|
for (const chunk of chunks) {
|
|
19921
21027
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
19922
21028
|
}
|
|
19923
|
-
|
|
21029
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
19924
21030
|
if (options.force) {
|
|
19925
21031
|
return true;
|
|
19926
21032
|
}
|
|
@@ -19934,36 +21040,43 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19934
21040
|
return existingHash !== chunk.contentHash;
|
|
19935
21041
|
});
|
|
19936
21042
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
21043
|
+
if (this.hooks.beforeIndex) {
|
|
21044
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
21045
|
+
}
|
|
19937
21046
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19938
21047
|
const upsertStart = stageStart();
|
|
19939
21048
|
let documentsUpserted = 0;
|
|
19940
21049
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
19941
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
19942
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
21050
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
19943
21051
|
const docs = changedChunks.map((chunk) => {
|
|
19944
|
-
const
|
|
19945
|
-
|
|
19946
|
-
|
|
19947
|
-
|
|
19948
|
-
|
|
19949
|
-
|
|
19950
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
19951
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
21052
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
21053
|
+
if (embeddingText.length > 2e3) {
|
|
21054
|
+
this.logger.warn(
|
|
21055
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
21056
|
+
);
|
|
21057
|
+
}
|
|
19952
21058
|
return {
|
|
19953
21059
|
id: chunk.chunkKey,
|
|
19954
|
-
|
|
21060
|
+
data: embeddingText,
|
|
19955
21061
|
metadata: {
|
|
19956
|
-
|
|
19957
|
-
scopeName: scope.scopeName,
|
|
21062
|
+
url: chunk.url,
|
|
19958
21063
|
path: chunk.path,
|
|
21064
|
+
title: chunk.title,
|
|
21065
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
21066
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
19959
21067
|
snippet: chunk.snippet,
|
|
21068
|
+
chunkText: embeddingText,
|
|
21069
|
+
tags: chunk.tags,
|
|
19960
21070
|
ordinal: chunk.ordinal,
|
|
19961
21071
|
contentHash: chunk.contentHash,
|
|
19962
21072
|
depth: chunk.depth,
|
|
19963
21073
|
incomingLinks: chunk.incomingLinks,
|
|
19964
21074
|
routeFile: chunk.routeFile,
|
|
19965
21075
|
description: chunk.description ?? "",
|
|
19966
|
-
keywords:
|
|
21076
|
+
keywords: chunk.keywords ?? [],
|
|
21077
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
21078
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
21079
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
19967
21080
|
}
|
|
19968
21081
|
};
|
|
19969
21082
|
});
|
|
@@ -19981,9 +21094,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19981
21094
|
} else {
|
|
19982
21095
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
19983
21096
|
}
|
|
21097
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
21098
|
+
const llmsStart = stageStart();
|
|
21099
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
21100
|
+
stageEnd("llms_txt", llmsStart);
|
|
21101
|
+
}
|
|
19984
21102
|
this.logger.info("Done.");
|
|
19985
|
-
|
|
21103
|
+
const stats = {
|
|
19986
21104
|
pagesProcessed: pages.length,
|
|
21105
|
+
pagesChanged,
|
|
21106
|
+
pagesDeleted,
|
|
19987
21107
|
chunksTotal: chunks.length,
|
|
19988
21108
|
chunksChanged: changedChunks.length,
|
|
19989
21109
|
documentsUpserted,
|
|
@@ -19992,16 +21112,143 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19992
21112
|
routeBestEffort,
|
|
19993
21113
|
stageTimingsMs
|
|
19994
21114
|
};
|
|
21115
|
+
if (this.hooks.afterIndex) {
|
|
21116
|
+
await this.hooks.afterIndex(stats);
|
|
21117
|
+
}
|
|
21118
|
+
return stats;
|
|
19995
21119
|
}
|
|
19996
21120
|
};
|
|
21121
|
+
|
|
21122
|
+
// src/search/related-pages.ts
|
|
21123
|
+
function diceScore(urlA, urlB) {
|
|
21124
|
+
const segmentsA = urlA.split("/").filter(Boolean);
|
|
21125
|
+
const segmentsB = urlB.split("/").filter(Boolean);
|
|
21126
|
+
if (segmentsA.length === 0 && segmentsB.length === 0) return 1;
|
|
21127
|
+
if (segmentsA.length === 0 || segmentsB.length === 0) return 0;
|
|
21128
|
+
let shared = 0;
|
|
21129
|
+
const minLen = Math.min(segmentsA.length, segmentsB.length);
|
|
21130
|
+
for (let i = 0; i < minLen; i++) {
|
|
21131
|
+
if (segmentsA[i] === segmentsB[i]) {
|
|
21132
|
+
shared++;
|
|
21133
|
+
} else {
|
|
21134
|
+
break;
|
|
21135
|
+
}
|
|
21136
|
+
}
|
|
21137
|
+
return 2 * shared / (segmentsA.length + segmentsB.length);
|
|
21138
|
+
}
|
|
21139
|
+
function compositeScore(isLinked, dice, semantic) {
|
|
21140
|
+
return (isLinked ? 0.5 : 0) + 0.3 * dice + 0.2 * semantic;
|
|
21141
|
+
}
|
|
21142
|
+
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
|
|
21143
|
+
if (isOutgoing) return "outgoing_link";
|
|
21144
|
+
if (isIncoming) return "incoming_link";
|
|
21145
|
+
if (dice > 0.4) return "sibling";
|
|
21146
|
+
return "semantic";
|
|
21147
|
+
}
|
|
21148
|
+
|
|
21149
|
+
// src/search/engine.ts
|
|
21150
|
+
var rankingOverridesSchema = zod.z.object({
|
|
21151
|
+
ranking: zod.z.object({
|
|
21152
|
+
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
21153
|
+
enableDepthBoost: zod.z.boolean().optional(),
|
|
21154
|
+
aggregationCap: zod.z.number().int().positive().optional(),
|
|
21155
|
+
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
21156
|
+
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
21157
|
+
minScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
21158
|
+
scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
|
|
21159
|
+
weights: zod.z.object({
|
|
21160
|
+
incomingLinks: zod.z.number().optional(),
|
|
21161
|
+
depth: zod.z.number().optional(),
|
|
21162
|
+
aggregation: zod.z.number().optional(),
|
|
21163
|
+
titleMatch: zod.z.number().optional()
|
|
21164
|
+
}).optional()
|
|
21165
|
+
}).optional(),
|
|
21166
|
+
search: zod.z.object({
|
|
21167
|
+
pageSearchWeight: zod.z.number().min(0).max(1).optional()
|
|
21168
|
+
}).optional()
|
|
21169
|
+
}).optional();
|
|
19997
21170
|
var requestSchema = zod.z.object({
|
|
19998
21171
|
q: zod.z.string().trim().min(1),
|
|
19999
21172
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
20000
21173
|
scope: zod.z.string().optional(),
|
|
20001
21174
|
pathPrefix: zod.z.string().optional(),
|
|
20002
21175
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20003
|
-
|
|
21176
|
+
filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
|
|
21177
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
21178
|
+
maxSubResults: zod.z.number().int().positive().max(20).optional(),
|
|
21179
|
+
debug: zod.z.boolean().optional(),
|
|
21180
|
+
rankingOverrides: rankingOverridesSchema
|
|
20004
21181
|
});
|
|
21182
|
+
var MAX_SITE_STRUCTURE_PAGES = 2e3;
|
|
21183
|
+
function makeNode(url, depth) {
|
|
21184
|
+
return { url, title: "", depth, routeFile: "", isIndexed: false, childCount: 0, children: [] };
|
|
21185
|
+
}
|
|
21186
|
+
function buildTree(pages, pathPrefix) {
|
|
21187
|
+
const nodeMap = /* @__PURE__ */ new Map();
|
|
21188
|
+
const root2 = makeNode("/", 0);
|
|
21189
|
+
nodeMap.set("/", root2);
|
|
21190
|
+
for (const page of pages) {
|
|
21191
|
+
const normalized = normalizeUrlPath(page.url);
|
|
21192
|
+
const segments = normalized.split("/").filter(Boolean);
|
|
21193
|
+
if (segments.length === 0) {
|
|
21194
|
+
root2.title = page.title;
|
|
21195
|
+
root2.routeFile = page.routeFile;
|
|
21196
|
+
root2.isIndexed = true;
|
|
21197
|
+
continue;
|
|
21198
|
+
}
|
|
21199
|
+
for (let i = 1; i <= segments.length; i++) {
|
|
21200
|
+
const partialUrl = "/" + segments.slice(0, i).join("/");
|
|
21201
|
+
if (!nodeMap.has(partialUrl)) {
|
|
21202
|
+
nodeMap.set(partialUrl, makeNode(partialUrl, i));
|
|
21203
|
+
}
|
|
21204
|
+
}
|
|
21205
|
+
const node = nodeMap.get(normalized);
|
|
21206
|
+
node.title = page.title;
|
|
21207
|
+
node.routeFile = page.routeFile;
|
|
21208
|
+
node.isIndexed = true;
|
|
21209
|
+
}
|
|
21210
|
+
for (const [url, node] of nodeMap) {
|
|
21211
|
+
if (url === "/") continue;
|
|
21212
|
+
const segments = url.split("/").filter(Boolean);
|
|
21213
|
+
const parentUrl = segments.length === 1 ? "/" : "/" + segments.slice(0, -1).join("/");
|
|
21214
|
+
const parent = nodeMap.get(parentUrl) ?? root2;
|
|
21215
|
+
parent.children.push(node);
|
|
21216
|
+
}
|
|
21217
|
+
const sortAndCount = (node) => {
|
|
21218
|
+
node.children.sort((a, b) => a.url.localeCompare(b.url));
|
|
21219
|
+
node.childCount = node.children.length;
|
|
21220
|
+
for (const child of node.children) {
|
|
21221
|
+
sortAndCount(child);
|
|
21222
|
+
}
|
|
21223
|
+
};
|
|
21224
|
+
sortAndCount(root2);
|
|
21225
|
+
if (pathPrefix) {
|
|
21226
|
+
const normalizedPrefix = normalizeUrlPath(pathPrefix);
|
|
21227
|
+
const subtreeRoot = nodeMap.get(normalizedPrefix);
|
|
21228
|
+
if (subtreeRoot) {
|
|
21229
|
+
return subtreeRoot;
|
|
21230
|
+
}
|
|
21231
|
+
return makeNode(normalizedPrefix, normalizedPrefix.split("/").filter(Boolean).length);
|
|
21232
|
+
}
|
|
21233
|
+
return root2;
|
|
21234
|
+
}
|
|
21235
|
+
function mergeRankingOverrides(base, overrides) {
|
|
21236
|
+
return {
|
|
21237
|
+
...base,
|
|
21238
|
+
search: {
|
|
21239
|
+
...base.search,
|
|
21240
|
+
...overrides.search
|
|
21241
|
+
},
|
|
21242
|
+
ranking: {
|
|
21243
|
+
...base.ranking,
|
|
21244
|
+
...overrides.ranking,
|
|
21245
|
+
weights: {
|
|
21246
|
+
...base.ranking.weights,
|
|
21247
|
+
...overrides.ranking?.weights
|
|
21248
|
+
}
|
|
21249
|
+
}
|
|
21250
|
+
};
|
|
21251
|
+
}
|
|
20005
21252
|
var SearchEngine = class _SearchEngine {
|
|
20006
21253
|
cwd;
|
|
20007
21254
|
config;
|
|
@@ -20031,125 +21278,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
20031
21278
|
}
|
|
20032
21279
|
const input = parsed.data;
|
|
20033
21280
|
const totalStart = process.hrtime.bigint();
|
|
21281
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
20034
21282
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20035
21283
|
const topK = input.topK ?? 10;
|
|
21284
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
20036
21285
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20037
|
-
const
|
|
20038
|
-
const
|
|
20039
|
-
|
|
20040
|
-
|
|
20041
|
-
|
|
20042
|
-
|
|
20043
|
-
|
|
20044
|
-
|
|
20045
|
-
|
|
21286
|
+
const queryText = input.q;
|
|
21287
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
21288
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
21289
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
21290
|
+
const metaFilter = metaFilterStr || void 0;
|
|
21291
|
+
const applyPagePostFilters = (hits) => {
|
|
21292
|
+
let filtered = hits;
|
|
21293
|
+
if (pathPrefix) {
|
|
21294
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
21295
|
+
}
|
|
21296
|
+
if (filterTags) {
|
|
21297
|
+
filtered = filtered.filter(
|
|
21298
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
21299
|
+
);
|
|
20046
21300
|
}
|
|
20047
|
-
|
|
20048
|
-
|
|
20049
|
-
const
|
|
21301
|
+
return filtered;
|
|
21302
|
+
};
|
|
21303
|
+
const applyChunkPostFilters = (hits) => {
|
|
21304
|
+
let filtered = hits;
|
|
21305
|
+
if (filterTags) {
|
|
21306
|
+
filtered = filtered.filter(
|
|
21307
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21308
|
+
);
|
|
21309
|
+
}
|
|
21310
|
+
return filtered;
|
|
21311
|
+
};
|
|
20050
21312
|
const searchStart = process.hrtime.bigint();
|
|
20051
|
-
|
|
20052
|
-
|
|
20053
|
-
const
|
|
20054
|
-
const
|
|
20055
|
-
|
|
20056
|
-
|
|
20057
|
-
|
|
20058
|
-
|
|
20059
|
-
|
|
20060
|
-
|
|
20061
|
-
|
|
20062
|
-
|
|
20063
|
-
|
|
20064
|
-
|
|
20065
|
-
|
|
20066
|
-
|
|
20067
|
-
|
|
20068
|
-
{
|
|
20069
|
-
limit: chunkLimit,
|
|
20070
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20071
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20072
|
-
reranking: false,
|
|
20073
|
-
filter
|
|
20074
|
-
},
|
|
21313
|
+
if (groupByPage) {
|
|
21314
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
21315
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
21316
|
+
const pageHits = await this.store.searchPagesByText(
|
|
21317
|
+
queryText,
|
|
21318
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
21319
|
+
resolvedScope
|
|
21320
|
+
);
|
|
21321
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
21322
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
21323
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
21324
|
+
const topPages = rankedPages.slice(0, topK);
|
|
21325
|
+
const chunkPromises = topPages.map(
|
|
21326
|
+
(page) => this.store.searchChunksByUrl(
|
|
21327
|
+
queryText,
|
|
21328
|
+
page.url,
|
|
21329
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
20075
21330
|
resolvedScope
|
|
20076
|
-
)
|
|
20077
|
-
|
|
20078
|
-
const
|
|
20079
|
-
|
|
21331
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
21332
|
+
);
|
|
21333
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
21334
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21335
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
21336
|
+
return {
|
|
21337
|
+
q: input.q,
|
|
21338
|
+
scope: resolvedScope.scopeName,
|
|
21339
|
+
results,
|
|
21340
|
+
meta: {
|
|
21341
|
+
timingsMs: {
|
|
21342
|
+
search: Math.round(searchMs),
|
|
21343
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21344
|
+
}
|
|
21345
|
+
}
|
|
21346
|
+
};
|
|
20080
21347
|
} else {
|
|
21348
|
+
const candidateK = Math.max(50, topK);
|
|
21349
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
20081
21350
|
const hits = await this.store.search(
|
|
20082
|
-
|
|
20083
|
-
{
|
|
20084
|
-
limit: candidateK,
|
|
20085
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20086
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20087
|
-
reranking: this.config.search.reranking,
|
|
20088
|
-
filter
|
|
20089
|
-
},
|
|
21351
|
+
queryText,
|
|
21352
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
20090
21353
|
resolvedScope
|
|
20091
21354
|
);
|
|
20092
|
-
|
|
20093
|
-
|
|
20094
|
-
|
|
20095
|
-
|
|
20096
|
-
|
|
20097
|
-
|
|
20098
|
-
|
|
20099
|
-
|
|
20100
|
-
|
|
20101
|
-
|
|
20102
|
-
|
|
20103
|
-
|
|
21355
|
+
let filtered = hits;
|
|
21356
|
+
if (pathPrefix) {
|
|
21357
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
21358
|
+
}
|
|
21359
|
+
if (filterTags) {
|
|
21360
|
+
filtered = filtered.filter(
|
|
21361
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21362
|
+
);
|
|
21363
|
+
}
|
|
21364
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
21365
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21366
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
21367
|
+
return {
|
|
21368
|
+
q: input.q,
|
|
21369
|
+
scope: resolvedScope.scopeName,
|
|
21370
|
+
results,
|
|
21371
|
+
meta: {
|
|
21372
|
+
timingsMs: {
|
|
21373
|
+
search: Math.round(searchMs),
|
|
21374
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21375
|
+
}
|
|
20104
21376
|
}
|
|
21377
|
+
};
|
|
21378
|
+
}
|
|
21379
|
+
}
|
|
21380
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
21381
|
+
return rankedPages.map((page, i) => {
|
|
21382
|
+
const chunks = allChunks[i] ?? [];
|
|
21383
|
+
const bestChunk = chunks[0];
|
|
21384
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
21385
|
+
const result = {
|
|
21386
|
+
url: page.url,
|
|
21387
|
+
title: page.title,
|
|
21388
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
21389
|
+
snippet,
|
|
21390
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
21391
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
21392
|
+
routeFile: page.routeFile,
|
|
21393
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
21394
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
21395
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
21396
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
21397
|
+
headingPath: c.metadata.headingPath,
|
|
21398
|
+
score: Number(c.score.toFixed(6))
|
|
21399
|
+
})) : void 0
|
|
21400
|
+
};
|
|
21401
|
+
if (debug && page.breakdown) {
|
|
21402
|
+
result.breakdown = {
|
|
21403
|
+
baseScore: page.breakdown.baseScore,
|
|
21404
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
21405
|
+
depthBoost: page.breakdown.depthBoost,
|
|
21406
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
21407
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
21408
|
+
anchorTextMatchBoost: 0
|
|
21409
|
+
};
|
|
20105
21410
|
}
|
|
20106
|
-
|
|
21411
|
+
return result;
|
|
21412
|
+
});
|
|
20107
21413
|
}
|
|
20108
|
-
ensureSnippet(hit) {
|
|
21414
|
+
ensureSnippet(hit, query) {
|
|
21415
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
21416
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
20109
21417
|
const snippet = hit.hit.metadata.snippet;
|
|
20110
21418
|
if (snippet && snippet.length >= 30) return snippet;
|
|
20111
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
20112
21419
|
if (chunkText) return toSnippet(chunkText);
|
|
20113
21420
|
return snippet || "";
|
|
20114
21421
|
}
|
|
20115
|
-
buildResults(ordered, topK, groupByPage,
|
|
21422
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
21423
|
+
const cfg = config ?? this.config;
|
|
20116
21424
|
if (groupByPage) {
|
|
20117
|
-
let pages = aggregateByPage(ordered,
|
|
20118
|
-
pages = trimByScoreGap(pages,
|
|
20119
|
-
const minRatio =
|
|
21425
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
21426
|
+
pages = trimByScoreGap(pages, cfg);
|
|
21427
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
20120
21428
|
return pages.slice(0, topK).map((page) => {
|
|
20121
21429
|
const bestScore = page.bestChunk.finalScore;
|
|
20122
21430
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20123
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
20124
|
-
|
|
21431
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
21432
|
+
const result = {
|
|
20125
21433
|
url: page.url,
|
|
20126
21434
|
title: page.title,
|
|
20127
21435
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
20128
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
21436
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
21437
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
20129
21438
|
score: Number(page.pageScore.toFixed(6)),
|
|
20130
21439
|
routeFile: page.routeFile,
|
|
20131
|
-
chunks: meaningful.length
|
|
21440
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
20132
21441
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
20133
|
-
snippet: this.ensureSnippet(c),
|
|
21442
|
+
snippet: this.ensureSnippet(c, query),
|
|
21443
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
20134
21444
|
headingPath: c.hit.metadata.headingPath,
|
|
20135
21445
|
score: Number(c.finalScore.toFixed(6))
|
|
20136
21446
|
})) : void 0
|
|
20137
21447
|
};
|
|
21448
|
+
if (debug && page.bestChunk.breakdown) {
|
|
21449
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
21450
|
+
}
|
|
21451
|
+
return result;
|
|
20138
21452
|
});
|
|
20139
21453
|
} else {
|
|
20140
21454
|
let filtered = ordered;
|
|
20141
|
-
const
|
|
20142
|
-
if (
|
|
20143
|
-
|
|
20144
|
-
|
|
20145
|
-
|
|
20146
|
-
|
|
20147
|
-
|
|
20148
|
-
|
|
20149
|
-
|
|
20150
|
-
|
|
20151
|
-
|
|
20152
|
-
|
|
21455
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
21456
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
21457
|
+
const topScore = ordered[0].finalScore;
|
|
21458
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
21459
|
+
const threshold = topScore * minScoreRatio;
|
|
21460
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
21461
|
+
}
|
|
21462
|
+
}
|
|
21463
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
21464
|
+
const result = {
|
|
21465
|
+
url: hit.metadata.url,
|
|
21466
|
+
title: hit.metadata.title,
|
|
21467
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
21468
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
21469
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
21470
|
+
score: Number(finalScore.toFixed(6)),
|
|
21471
|
+
routeFile: hit.metadata.routeFile
|
|
21472
|
+
};
|
|
21473
|
+
if (debug && breakdown) {
|
|
21474
|
+
result.breakdown = breakdown;
|
|
21475
|
+
}
|
|
21476
|
+
return result;
|
|
21477
|
+
});
|
|
20153
21478
|
}
|
|
20154
21479
|
}
|
|
20155
21480
|
async getPage(pathOrUrl, scope) {
|
|
@@ -20175,6 +21500,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
20175
21500
|
markdown: page.markdown
|
|
20176
21501
|
};
|
|
20177
21502
|
}
|
|
21503
|
+
async listPages(opts) {
|
|
21504
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21505
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
21506
|
+
return this.store.listPages(resolvedScope, {
|
|
21507
|
+
cursor: opts?.cursor,
|
|
21508
|
+
limit: opts?.limit,
|
|
21509
|
+
pathPrefix
|
|
21510
|
+
});
|
|
21511
|
+
}
|
|
21512
|
+
async getSiteStructure(opts) {
|
|
21513
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
21514
|
+
const allPages = [];
|
|
21515
|
+
let cursor;
|
|
21516
|
+
let truncated = false;
|
|
21517
|
+
do {
|
|
21518
|
+
const result = await this.listPages({
|
|
21519
|
+
pathPrefix: opts?.pathPrefix,
|
|
21520
|
+
scope: opts?.scope,
|
|
21521
|
+
cursor,
|
|
21522
|
+
limit: 200
|
|
21523
|
+
});
|
|
21524
|
+
allPages.push(...result.pages);
|
|
21525
|
+
cursor = result.nextCursor;
|
|
21526
|
+
if (allPages.length >= maxPages) {
|
|
21527
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
21528
|
+
allPages.length = maxPages;
|
|
21529
|
+
break;
|
|
21530
|
+
}
|
|
21531
|
+
} while (cursor);
|
|
21532
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
21533
|
+
return {
|
|
21534
|
+
root: root2,
|
|
21535
|
+
totalPages: allPages.length,
|
|
21536
|
+
truncated
|
|
21537
|
+
};
|
|
21538
|
+
}
|
|
21539
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
21540
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21541
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
21542
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
21543
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
21544
|
+
if (!source) {
|
|
21545
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
21546
|
+
}
|
|
21547
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
21548
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
21549
|
+
source.vector,
|
|
21550
|
+
{ limit: 50 },
|
|
21551
|
+
resolvedScope
|
|
21552
|
+
);
|
|
21553
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
21554
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
21555
|
+
for (const hit of filteredHits) {
|
|
21556
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
21557
|
+
}
|
|
21558
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
21559
|
+
for (const hit of filteredHits) {
|
|
21560
|
+
candidateUrls.add(hit.url);
|
|
21561
|
+
}
|
|
21562
|
+
for (const url of sourceOutgoing) {
|
|
21563
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
21564
|
+
}
|
|
21565
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
21566
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
21567
|
+
);
|
|
21568
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
21569
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
21570
|
+
for (const hit of filteredHits) {
|
|
21571
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
21572
|
+
}
|
|
21573
|
+
for (const p of fetchedPages) {
|
|
21574
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
21575
|
+
}
|
|
21576
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
21577
|
+
if (semanticUrls.length > 0) {
|
|
21578
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
21579
|
+
for (const p of semanticPageData) {
|
|
21580
|
+
const existing = metaMap.get(p.url);
|
|
21581
|
+
if (existing) {
|
|
21582
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
21583
|
+
}
|
|
21584
|
+
}
|
|
21585
|
+
}
|
|
21586
|
+
const candidates = [];
|
|
21587
|
+
for (const url of candidateUrls) {
|
|
21588
|
+
const meta = metaMap.get(url);
|
|
21589
|
+
if (!meta) continue;
|
|
21590
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
21591
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
21592
|
+
const isLinked = isOutgoing || isIncoming;
|
|
21593
|
+
const dice = diceScore(urlPath, url);
|
|
21594
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
21595
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
21596
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
21597
|
+
candidates.push({
|
|
21598
|
+
url,
|
|
21599
|
+
title: meta.title,
|
|
21600
|
+
score: Number(score.toFixed(6)),
|
|
21601
|
+
relationshipType,
|
|
21602
|
+
routeFile: meta.routeFile
|
|
21603
|
+
});
|
|
21604
|
+
}
|
|
21605
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
21606
|
+
const results = candidates.slice(0, topK);
|
|
21607
|
+
return {
|
|
21608
|
+
sourceUrl: urlPath,
|
|
21609
|
+
scope: resolvedScope.scopeName,
|
|
21610
|
+
relatedPages: results
|
|
21611
|
+
};
|
|
21612
|
+
}
|
|
20178
21613
|
async health() {
|
|
20179
21614
|
return this.store.health();
|
|
20180
21615
|
}
|
|
@@ -20197,14 +21632,40 @@ function createServer(engine) {
|
|
|
20197
21632
|
server.registerTool(
|
|
20198
21633
|
"search",
|
|
20199
21634
|
{
|
|
20200
|
-
description:
|
|
21635
|
+
description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
|
|
20201
21636
|
inputSchema: {
|
|
20202
21637
|
query: zod.z.string().min(1),
|
|
20203
21638
|
scope: zod.z.string().optional(),
|
|
20204
21639
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
20205
21640
|
pathPrefix: zod.z.string().optional(),
|
|
20206
21641
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20207
|
-
|
|
21642
|
+
filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
|
|
21643
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
21644
|
+
maxSubResults: zod.z.number().int().positive().max(20).optional()
|
|
21645
|
+
},
|
|
21646
|
+
outputSchema: {
|
|
21647
|
+
q: zod.z.string(),
|
|
21648
|
+
scope: zod.z.string(),
|
|
21649
|
+
results: zod.z.array(zod.z.object({
|
|
21650
|
+
url: zod.z.string(),
|
|
21651
|
+
title: zod.z.string(),
|
|
21652
|
+
sectionTitle: zod.z.string().optional(),
|
|
21653
|
+
snippet: zod.z.string(),
|
|
21654
|
+
score: zod.z.number(),
|
|
21655
|
+
routeFile: zod.z.string(),
|
|
21656
|
+
chunks: zod.z.array(zod.z.object({
|
|
21657
|
+
sectionTitle: zod.z.string().optional(),
|
|
21658
|
+
snippet: zod.z.string(),
|
|
21659
|
+
headingPath: zod.z.array(zod.z.string()),
|
|
21660
|
+
score: zod.z.number()
|
|
21661
|
+
})).optional()
|
|
21662
|
+
})),
|
|
21663
|
+
meta: zod.z.object({
|
|
21664
|
+
timingsMs: zod.z.object({
|
|
21665
|
+
search: zod.z.number(),
|
|
21666
|
+
total: zod.z.number()
|
|
21667
|
+
})
|
|
21668
|
+
})
|
|
20208
21669
|
}
|
|
20209
21670
|
},
|
|
20210
21671
|
async (input) => {
|
|
@@ -20214,7 +21675,9 @@ function createServer(engine) {
|
|
|
20214
21675
|
scope: input.scope,
|
|
20215
21676
|
pathPrefix: input.pathPrefix,
|
|
20216
21677
|
tags: input.tags,
|
|
20217
|
-
|
|
21678
|
+
filters: input.filters,
|
|
21679
|
+
groupBy: input.groupBy,
|
|
21680
|
+
maxSubResults: input.maxSubResults
|
|
20218
21681
|
});
|
|
20219
21682
|
return {
|
|
20220
21683
|
content: [
|
|
@@ -20222,7 +21685,8 @@ function createServer(engine) {
|
|
|
20222
21685
|
type: "text",
|
|
20223
21686
|
text: JSON.stringify(result, null, 2)
|
|
20224
21687
|
}
|
|
20225
|
-
]
|
|
21688
|
+
],
|
|
21689
|
+
structuredContent: result
|
|
20226
21690
|
};
|
|
20227
21691
|
}
|
|
20228
21692
|
);
|
|
@@ -20247,8 +21711,134 @@ function createServer(engine) {
|
|
|
20247
21711
|
};
|
|
20248
21712
|
}
|
|
20249
21713
|
);
|
|
21714
|
+
server.registerTool(
|
|
21715
|
+
"list_pages",
|
|
21716
|
+
{
|
|
21717
|
+
description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
|
|
21718
|
+
inputSchema: {
|
|
21719
|
+
pathPrefix: zod.z.string().optional(),
|
|
21720
|
+
cursor: zod.z.string().optional(),
|
|
21721
|
+
limit: zod.z.number().int().positive().max(200).optional(),
|
|
21722
|
+
scope: zod.z.string().optional()
|
|
21723
|
+
}
|
|
21724
|
+
},
|
|
21725
|
+
async (input) => {
|
|
21726
|
+
const result = await engine.listPages({
|
|
21727
|
+
pathPrefix: input.pathPrefix,
|
|
21728
|
+
cursor: input.cursor,
|
|
21729
|
+
limit: input.limit,
|
|
21730
|
+
scope: input.scope
|
|
21731
|
+
});
|
|
21732
|
+
return {
|
|
21733
|
+
content: [
|
|
21734
|
+
{
|
|
21735
|
+
type: "text",
|
|
21736
|
+
text: JSON.stringify(result, null, 2)
|
|
21737
|
+
}
|
|
21738
|
+
]
|
|
21739
|
+
};
|
|
21740
|
+
}
|
|
21741
|
+
);
|
|
21742
|
+
server.registerTool(
|
|
21743
|
+
"get_site_structure",
|
|
21744
|
+
{
|
|
21745
|
+
description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
|
|
21746
|
+
inputSchema: {
|
|
21747
|
+
pathPrefix: zod.z.string().optional(),
|
|
21748
|
+
scope: zod.z.string().optional(),
|
|
21749
|
+
maxPages: zod.z.number().int().positive().max(2e3).optional()
|
|
21750
|
+
}
|
|
21751
|
+
},
|
|
21752
|
+
async (input) => {
|
|
21753
|
+
const result = await engine.getSiteStructure({
|
|
21754
|
+
pathPrefix: input.pathPrefix,
|
|
21755
|
+
scope: input.scope,
|
|
21756
|
+
maxPages: input.maxPages
|
|
21757
|
+
});
|
|
21758
|
+
return {
|
|
21759
|
+
content: [
|
|
21760
|
+
{
|
|
21761
|
+
type: "text",
|
|
21762
|
+
text: JSON.stringify(result, null, 2)
|
|
21763
|
+
}
|
|
21764
|
+
]
|
|
21765
|
+
};
|
|
21766
|
+
}
|
|
21767
|
+
);
|
|
21768
|
+
server.registerTool(
|
|
21769
|
+
"find_source_file",
|
|
21770
|
+
{
|
|
21771
|
+
description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
|
|
21772
|
+
inputSchema: {
|
|
21773
|
+
query: zod.z.string().min(1),
|
|
21774
|
+
scope: zod.z.string().optional()
|
|
21775
|
+
}
|
|
21776
|
+
},
|
|
21777
|
+
async (input) => {
|
|
21778
|
+
const result = await engine.search({
|
|
21779
|
+
q: input.query,
|
|
21780
|
+
topK: 1,
|
|
21781
|
+
scope: input.scope
|
|
21782
|
+
});
|
|
21783
|
+
if (result.results.length === 0) {
|
|
21784
|
+
return {
|
|
21785
|
+
content: [
|
|
21786
|
+
{
|
|
21787
|
+
type: "text",
|
|
21788
|
+
text: JSON.stringify({
|
|
21789
|
+
error: "No matching content found for the given query."
|
|
21790
|
+
})
|
|
21791
|
+
}
|
|
21792
|
+
]
|
|
21793
|
+
};
|
|
21794
|
+
}
|
|
21795
|
+
const match = result.results[0];
|
|
21796
|
+
const { url, routeFile, sectionTitle, snippet } = match;
|
|
21797
|
+
return {
|
|
21798
|
+
content: [
|
|
21799
|
+
{
|
|
21800
|
+
type: "text",
|
|
21801
|
+
text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
|
|
21802
|
+
}
|
|
21803
|
+
]
|
|
21804
|
+
};
|
|
21805
|
+
}
|
|
21806
|
+
);
|
|
21807
|
+
server.registerTool(
|
|
21808
|
+
"get_related_pages",
|
|
21809
|
+
{
|
|
21810
|
+
description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
|
|
21811
|
+
inputSchema: {
|
|
21812
|
+
pathOrUrl: zod.z.string().min(1),
|
|
21813
|
+
scope: zod.z.string().optional(),
|
|
21814
|
+
topK: zod.z.number().int().positive().max(25).optional()
|
|
21815
|
+
}
|
|
21816
|
+
},
|
|
21817
|
+
async (input) => {
|
|
21818
|
+
const result = await engine.getRelatedPages(input.pathOrUrl, {
|
|
21819
|
+
topK: input.topK,
|
|
21820
|
+
scope: input.scope
|
|
21821
|
+
});
|
|
21822
|
+
return {
|
|
21823
|
+
content: [
|
|
21824
|
+
{
|
|
21825
|
+
type: "text",
|
|
21826
|
+
text: JSON.stringify(result, null, 2)
|
|
21827
|
+
}
|
|
21828
|
+
]
|
|
21829
|
+
};
|
|
21830
|
+
}
|
|
21831
|
+
);
|
|
20250
21832
|
return server;
|
|
20251
21833
|
}
|
|
21834
|
+
/**
 * Resolve the API key for the MCP HTTP transport.
 *
 * An inline `config.mcp.http.apiKey` wins whenever it is not null/undefined
 * (an empty string is returned as-is). Otherwise, when `apiKeyEnv` names an
 * environment variable, that variable's value is returned.
 *
 * @param {object} config - Resolved config; reads `config.mcp.http`.
 * @returns {string | undefined} The key, or undefined when none is available.
 */
function resolveApiKey(config) {
  const { apiKey, apiKeyEnv } = config.mcp.http;
  if (apiKey !== null && apiKey !== undefined) {
    return apiKey;
  }
  if (!apiKeyEnv) {
    return undefined;
  }
  return process.env[apiKeyEnv];
}
|
|
21837
|
+
/**
 * Compare a caller-supplied API key against the expected one.
 *
 * Both values are reduced to SHA-256 digests first, so the two buffers
 * handed to `crypto.timingSafeEqual` always have the same length.
 *
 * @param {string} provided - Key presented by the client.
 * @param {string} expected - Key the server was configured with.
 * @returns {boolean} True when the digests are identical.
 */
function verifyApiKey(provided, expected) {
  const [providedDigest, expectedDigest] = [provided, expected].map(
    (value) => crypto.createHash("sha256").update(value).digest()
  );
  return crypto.timingSafeEqual(providedDigest, expectedDigest);
}
|
|
20252
21842
|
function redirectConsoleToStderr() {
|
|
20253
21843
|
console.log = (...args) => {
|
|
20254
21844
|
process.stderr.write(`[LOG] ${args.map(String).join(" ")}
|
|
@@ -20263,7 +21853,22 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20263
21853
|
const app = express_js.createMcpExpressApp();
|
|
20264
21854
|
const port = opts.httpPort ?? config.mcp.http.port;
|
|
20265
21855
|
const endpointPath = opts.httpPath ?? config.mcp.http.path;
|
|
21856
|
+
const isPublic = config.mcp.access === "public";
|
|
21857
|
+
const host = isPublic ? "0.0.0.0" : "127.0.0.1";
|
|
21858
|
+
const apiKey = isPublic ? resolveApiKey(config) : void 0;
|
|
20266
21859
|
app.post(endpointPath, async (req, res) => {
|
|
21860
|
+
if (isPublic && apiKey) {
|
|
21861
|
+
const authHeader = req.headers["authorization"];
|
|
21862
|
+
const provided = (authHeader?.startsWith("Bearer ") ? authHeader.slice(7) : void 0) ?? req.headers["x-api-key"] ?? "";
|
|
21863
|
+
if (!provided || !verifyApiKey(provided, apiKey)) {
|
|
21864
|
+
res.status(401).json({
|
|
21865
|
+
jsonrpc: "2.0",
|
|
21866
|
+
error: { code: -32001, message: "Unauthorized" },
|
|
21867
|
+
id: null
|
|
21868
|
+
});
|
|
21869
|
+
return;
|
|
21870
|
+
}
|
|
21871
|
+
}
|
|
20267
21872
|
const server = serverFactory();
|
|
20268
21873
|
const transport = new streamableHttp_js.StreamableHTTPServerTransport({
|
|
20269
21874
|
sessionIdGenerator: void 0
|
|
@@ -20313,9 +21918,12 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20313
21918
|
);
|
|
20314
21919
|
});
|
|
20315
21920
|
await new Promise((resolve, reject) => {
|
|
20316
|
-
const instance = app.listen(port,
|
|
20317
|
-
process.stderr.write(`SearchSocket MCP HTTP server listening on http
|
|
21921
|
+
const instance = app.listen(port, host, () => {
|
|
21922
|
+
process.stderr.write(`SearchSocket MCP HTTP server listening on http://${host}:${port}${endpointPath}
|
|
20318
21923
|
`);
|
|
21924
|
+
if (isPublic) {
|
|
21925
|
+
process.stderr.write("WARNING: Server is in public mode. Ensure HTTPS is configured via a reverse proxy for production use.\n");
|
|
21926
|
+
}
|
|
20319
21927
|
resolve();
|
|
20320
21928
|
});
|
|
20321
21929
|
instance.once("error", reject);
|
|
@@ -20330,6 +21938,13 @@ async function runMcpServer(options = {}) {
|
|
|
20330
21938
|
cwd: options.cwd,
|
|
20331
21939
|
configPath: options.configPath
|
|
20332
21940
|
});
|
|
21941
|
+
if (options.access) config.mcp.access = options.access;
|
|
21942
|
+
if (options.apiKey) config.mcp.http.apiKey = options.apiKey;
|
|
21943
|
+
if (config.mcp.access === "public" && !resolveApiKey(config)) {
|
|
21944
|
+
throw new Error(
|
|
21945
|
+
'MCP access is "public" but no API key is configured. Pass --api-key or set mcp.http.apiKey / mcp.http.apiKeyEnv in config.'
|
|
21946
|
+
);
|
|
21947
|
+
}
|
|
20333
21948
|
const resolvedTransport = options.transport ?? config.mcp.transport;
|
|
20334
21949
|
if (resolvedTransport === "stdio") {
|
|
20335
21950
|
redirectConsoleToStderr();
|
|
@@ -20347,8 +21962,6 @@ async function runMcpServer(options = {}) {
|
|
|
20347
21962
|
const stdioTransport = new stdio_js.StdioServerTransport();
|
|
20348
21963
|
await server.connect(stdioTransport);
|
|
20349
21964
|
}
|
|
20350
|
-
|
|
20351
|
-
// src/sveltekit/handle.ts
|
|
20352
21965
|
var InMemoryRateLimiter = class {
|
|
20353
21966
|
constructor(windowMs, max) {
|
|
20354
21967
|
this.windowMs = windowMs;
|
|
@@ -20376,7 +21989,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20376
21989
|
let enginePromise = null;
|
|
20377
21990
|
let configPromise = null;
|
|
20378
21991
|
let apiPath = options.path;
|
|
21992
|
+
let llmsServePath = null;
|
|
21993
|
+
let serveMarkdownVariants = false;
|
|
21994
|
+
let mcpPath;
|
|
21995
|
+
let mcpApiKey;
|
|
21996
|
+
let mcpEnableJsonResponse = true;
|
|
20379
21997
|
let rateLimiter = null;
|
|
21998
|
+
let notConfigured = false;
|
|
20380
21999
|
const getConfig = async () => {
|
|
20381
22000
|
if (!configPromise) {
|
|
20382
22001
|
let configP;
|
|
@@ -20393,6 +22012,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20393
22012
|
}
|
|
20394
22013
|
configPromise = configP.then((config) => {
|
|
20395
22014
|
apiPath = apiPath ?? config.api.path;
|
|
22015
|
+
mcpPath = config.mcp.handle.path;
|
|
22016
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
22017
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
22018
|
+
if (config.llmsTxt.enable) {
|
|
22019
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
22020
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
22021
|
+
}
|
|
20396
22022
|
if (config.api.rateLimit && !isServerless()) {
|
|
20397
22023
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20398
22024
|
}
|
|
@@ -20402,59 +22028,109 @@ function searchsocketHandle(options = {}) {
|
|
|
20402
22028
|
return configPromise;
|
|
20403
22029
|
};
|
|
20404
22030
|
const getEngine = async () => {
|
|
22031
|
+
if (notConfigured) {
|
|
22032
|
+
throw new SearchSocketError(
|
|
22033
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22034
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22035
|
+
503
|
|
22036
|
+
);
|
|
22037
|
+
}
|
|
20405
22038
|
if (!enginePromise) {
|
|
20406
22039
|
const config = await getConfig();
|
|
20407
22040
|
enginePromise = SearchEngine.create({
|
|
20408
22041
|
cwd: options.cwd,
|
|
20409
22042
|
config
|
|
22043
|
+
}).catch((error) => {
|
|
22044
|
+
enginePromise = null;
|
|
22045
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
22046
|
+
notConfigured = true;
|
|
22047
|
+
throw new SearchSocketError(
|
|
22048
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22049
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22050
|
+
503
|
|
22051
|
+
);
|
|
22052
|
+
}
|
|
22053
|
+
throw error;
|
|
20410
22054
|
});
|
|
20411
22055
|
}
|
|
20412
22056
|
return enginePromise;
|
|
20413
22057
|
};
|
|
20414
22058
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
20415
22059
|
return async ({ event, resolve }) => {
|
|
20416
|
-
if (apiPath && event.url.pathname !==
|
|
20417
|
-
|
|
22060
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
22061
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
22062
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22063
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22064
|
+
}
|
|
22065
|
+
if (mcpPath) {
|
|
22066
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
22067
|
+
return resolve(event);
|
|
22068
|
+
}
|
|
22069
|
+
} else {
|
|
22070
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
22071
|
+
await getConfig();
|
|
22072
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22073
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22074
|
+
}
|
|
22075
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
22076
|
+
return resolve(event);
|
|
22077
|
+
}
|
|
22078
|
+
} else {
|
|
22079
|
+
return resolve(event);
|
|
22080
|
+
}
|
|
22081
|
+
}
|
|
20418
22082
|
}
|
|
20419
22083
|
const config = await getConfig();
|
|
22084
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
22085
|
+
const cwd = options.cwd ?? process.cwd();
|
|
22086
|
+
const filePath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
|
|
22087
|
+
try {
|
|
22088
|
+
const content = await fs8__default.default.readFile(filePath, "utf8");
|
|
22089
|
+
return new Response(content, {
|
|
22090
|
+
status: 200,
|
|
22091
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
22092
|
+
});
|
|
22093
|
+
} catch {
|
|
22094
|
+
return resolve(event);
|
|
22095
|
+
}
|
|
22096
|
+
}
|
|
22097
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
22098
|
+
let rawPath;
|
|
22099
|
+
try {
|
|
22100
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
22101
|
+
} catch {
|
|
22102
|
+
return resolve(event);
|
|
22103
|
+
}
|
|
22104
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
22105
|
+
try {
|
|
22106
|
+
const engine = await getEngine();
|
|
22107
|
+
const page = await engine.getPage(rawPath, scope);
|
|
22108
|
+
return new Response(page.markdown, {
|
|
22109
|
+
status: 200,
|
|
22110
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
22111
|
+
});
|
|
22112
|
+
} catch (error) {
|
|
22113
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
22114
|
+
return resolve(event);
|
|
22115
|
+
}
|
|
22116
|
+
throw error;
|
|
22117
|
+
}
|
|
22118
|
+
}
|
|
22119
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22120
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22121
|
+
}
|
|
20420
22122
|
const targetPath = apiPath ?? config.api.path;
|
|
20421
|
-
if (event.url.pathname
|
|
22123
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
20422
22124
|
return resolve(event);
|
|
20423
22125
|
}
|
|
20424
|
-
|
|
22126
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
22127
|
+
const method = event.request.method;
|
|
22128
|
+
if (method === "OPTIONS") {
|
|
20425
22129
|
return new Response(null, {
|
|
20426
22130
|
status: 204,
|
|
20427
22131
|
headers: buildCorsHeaders(event.request, config)
|
|
20428
22132
|
});
|
|
20429
22133
|
}
|
|
20430
|
-
if (event.request.method !== "POST") {
|
|
20431
|
-
return withCors(
|
|
20432
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
20433
|
-
status: 405,
|
|
20434
|
-
headers: {
|
|
20435
|
-
"content-type": "application/json"
|
|
20436
|
-
}
|
|
20437
|
-
}),
|
|
20438
|
-
event.request,
|
|
20439
|
-
config
|
|
20440
|
-
);
|
|
20441
|
-
}
|
|
20442
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
20443
|
-
if (contentLength > bodyLimit) {
|
|
20444
|
-
return withCors(
|
|
20445
|
-
new Response(
|
|
20446
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
20447
|
-
{
|
|
20448
|
-
status: 413,
|
|
20449
|
-
headers: {
|
|
20450
|
-
"content-type": "application/json"
|
|
20451
|
-
}
|
|
20452
|
-
}
|
|
20453
|
-
),
|
|
20454
|
-
event.request,
|
|
20455
|
-
config
|
|
20456
|
-
);
|
|
20457
|
-
}
|
|
20458
22134
|
if (rateLimiter) {
|
|
20459
22135
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
20460
22136
|
if (!rateLimiter.check(ip)) {
|
|
@@ -20474,39 +22150,32 @@ function searchsocketHandle(options = {}) {
|
|
|
20474
22150
|
}
|
|
20475
22151
|
}
|
|
20476
22152
|
try {
|
|
20477
|
-
|
|
20478
|
-
|
|
20479
|
-
|
|
20480
|
-
} else {
|
|
20481
|
-
let parsedFallback;
|
|
20482
|
-
try {
|
|
20483
|
-
parsedFallback = await event.request.json();
|
|
20484
|
-
} catch (error) {
|
|
20485
|
-
if (error instanceof SyntaxError) {
|
|
20486
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20487
|
-
}
|
|
20488
|
-
throw error;
|
|
22153
|
+
if (method === "GET") {
|
|
22154
|
+
if (subPath === "" || subPath === "/") {
|
|
22155
|
+
return await handleGetSearch(event, config, getEngine);
|
|
20489
22156
|
}
|
|
20490
|
-
|
|
20491
|
-
|
|
20492
|
-
|
|
20493
|
-
|
|
22157
|
+
if (subPath === "/health") {
|
|
22158
|
+
return await handleGetHealth(event, config, getEngine);
|
|
22159
|
+
}
|
|
22160
|
+
if (subPath.startsWith("/pages/")) {
|
|
22161
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
22162
|
+
}
|
|
22163
|
+
return withCors(
|
|
22164
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
22165
|
+
status: 404,
|
|
22166
|
+
headers: { "content-type": "application/json" }
|
|
22167
|
+
}),
|
|
22168
|
+
event.request,
|
|
22169
|
+
config
|
|
22170
|
+
);
|
|
20494
22171
|
}
|
|
20495
|
-
|
|
20496
|
-
|
|
20497
|
-
body = JSON.parse(rawBody);
|
|
20498
|
-
} catch {
|
|
20499
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
22172
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
22173
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
20500
22174
|
}
|
|
20501
|
-
const engine = await getEngine();
|
|
20502
|
-
const searchRequest = body;
|
|
20503
|
-
const result = await engine.search(searchRequest);
|
|
20504
22175
|
return withCors(
|
|
20505
|
-
new Response(JSON.stringify(
|
|
20506
|
-
status:
|
|
20507
|
-
headers: {
|
|
20508
|
-
"content-type": "application/json"
|
|
20509
|
-
}
|
|
22176
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
22177
|
+
status: 405,
|
|
22178
|
+
headers: { "content-type": "application/json" }
|
|
20510
22179
|
}),
|
|
20511
22180
|
event.request,
|
|
20512
22181
|
config
|
|
@@ -20527,6 +22196,183 @@ function searchsocketHandle(options = {}) {
|
|
|
20527
22196
|
}
|
|
20528
22197
|
};
|
|
20529
22198
|
}
|
|
22199
|
+
/**
 * Tell whether `pathname` addresses the API mount point.
 *
 * Matches the mount path itself or anything nested below it
 * (`<apiPath>/...`); a mere string prefix such as `<apiPath>foo`
 * does not count.
 *
 * @param {string} pathname - Request path.
 * @param {string} apiPath - Configured API mount path.
 * @returns {boolean}
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  return pathname.startsWith(`${apiPath}/`);
}
|
|
22202
|
+
/**
 * Handle a GET search request: build a search request from the query string,
 * run it through the engine, and return the result as JSON passed through
 * `withCors`.
 *
 * Query parameters read here:
 *   - `q` (required, must not be blank)
 *   - `topK` (integer >= 1)
 *   - `scope`, `pathPrefix` (forwarded verbatim when present)
 *   - `groupBy` ("page" or "chunk"; an empty value is ignored)
 *   - `maxSubResults` (integer between 1 and 20)
 *   - `tags` (repeatable; forwarded as an array when at least one is given)
 *
 * @param {object} event - Request event; reads `event.url.searchParams` and `event.request`.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} 200 JSON response carrying the search result.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when a parameter is missing or invalid.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;

  // Number.parseInt on its own silently accepts trailing garbage and
  // fractions ("5abc" -> 5, "1.5" -> 1). The error messages below promise a
  // positive integer, so only plain decimal digits are accepted.
  const parseStrictInt = (raw) => {
    const text = raw.trim();
    if (!/^\d+$/.test(text)) {
      return Number.NaN;
    }
    const value = Number.parseInt(text, 10);
    return Number.isSafeInteger(value) ? value : Number.NaN;
  };

  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = params.get("topK");
  if (topK !== null) {
    const parsed = parseStrictInt(topK);
    if (Number.isNaN(parsed) || parsed < 1) {
      throw new SearchSocketError("INVALID_REQUEST", "topK must be a positive integer", 400);
    }
    searchRequest.topK = parsed;
  }

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  // A present-but-empty groupBy is treated like an absent one.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = params.get("maxSubResults");
  if (maxSubResults !== null) {
    const parsed = parseStrictInt(maxSubResults);
    if (Number.isNaN(parsed) || parsed < 1 || parsed > 20) {
      throw new SearchSocketError("INVALID_REQUEST", "maxSubResults must be a positive integer between 1 and 20", 400);
    }
    searchRequest.maxSubResults = parsed;
  }

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  // Validation happens before the engine is requested, so bad input never
  // triggers engine creation.
  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
22249
|
+
/**
 * Handle the health sub-route: ask the engine for its health report and
 * return it as a 200 JSON response passed through `withCors`.
 *
 * @param {object} event - Request event; only `event.request` is used.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>}
 */
async function handleGetHealth(event, config, getEngine) {
  const engine = await getEngine();
  const payload = JSON.stringify(await engine.health());
  const response = new Response(payload, {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22261
|
+
/**
 * Handle the `/pages/<path>` sub-route: look up one indexed page and return
 * it as a 200 JSON response passed through `withCors`.
 *
 * The leading `/pages` segment is stripped from `subPath` and the remainder
 * (which keeps its leading slash) is percent-decoded before the lookup.
 *
 * @param {object} event - Request event; reads `event.url.searchParams` and `event.request`.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @param {string} subPath - Request path relative to the API mount point.
 * @returns {Promise<Response>}
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  const pagesPrefix = "/pages";
  const encodedPath = subPath.slice(pagesPrefix.length);

  let pagePath;
  try {
    pagePath = decodeURIComponent(encodedPath);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
  }

  // A missing `scope` parameter (null) is normalized to undefined.
  const scopeParam = event.url.searchParams?.get("scope");
  const scope = scopeParam ?? undefined;

  const engine = await getEngine();
  const page = await engine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22281
|
+
/**
 * Handle a POST search request: read the JSON body, enforce the size limit,
 * run the search, and return the result as a 200 JSON response passed
 * through `withCors`.
 *
 * The size limit is checked twice: once against the declared
 * `content-length` header before reading, and once against the actual
 * UTF-8 byte length of the body after reading.
 *
 * @param {object} event - Request event; reads `event.request`.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>}
 * @throws {SearchSocketError} INVALID_REQUEST with 413 (too large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  }

  let bodyText;
  if (typeof event.request.text !== "function") {
    // Request objects without text(): parse via json() and re-serialize so
    // the size check and parse step below still apply.
    let parsedBody;
    try {
      parsedBody = await event.request.json();
    } catch (error) {
      if (!(error instanceof SyntaxError)) {
        throw error;
      }
      throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
    }
    bodyText = JSON.stringify(parsedBody);
  } else {
    bodyText = await event.request.text();
  }

  const actualBytes = Buffer.byteLength(bodyText, "utf8");
  if (actualBytes > bodyLimit) {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(bodyText);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
  }

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22322
|
+
/**
 * Serve one MCP request arriving through the SvelteKit handle.
 *
 * When `apiKey` is set, the request must carry `Authorization: Bearer <key>`;
 * otherwise a 401 JSON-RPC error is returned before the engine is touched.
 * A fresh transport and MCP server are created for every request.
 *
 * @param {object} event - Request event; reads `event.request`.
 * @param {string | undefined} apiKey - Expected bearer token; falsy disables the check.
 * @param {boolean} enableJsonResponse - Forwarded to the transport. When true,
 *   transport and server are closed as soon as the response has been produced.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} The transport's response, or a JSON-RPC error
 *   response with status 401 (unauthorized) or 500 (any failure while handling).
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Compare SHA-256 digests rather than the raw buffers: the digests always
    // have equal length, so timingSafeEqual never needs a length pre-check
    // that would reveal the key length through timing. This is the same
    // scheme verifyApiKey uses for the standalone HTTP server.
    const digest = (value) => crypto.createHash("sha256").update(value).digest();
    if (!crypto.timingSafeEqual(digest(token), digest(apiKey))) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }
  const transport = new webStandardStreamableHttp_js.WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    // NOTE(review): with enableJsonResponse false nothing is closed here,
    // presumably because the response is still streaming; confirm that
    // cleanup happens elsewhere.
    if (enableJsonResponse) {
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the
    // original error that is reported to the client below.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status: 500, headers: { "content-type": "application/json" } }
    );
  }
}
|
|
20530
22376
|
function buildCorsHeaders(request, config) {
|
|
20531
22377
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
20532
22378
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -20539,7 +22385,7 @@ function buildCorsHeaders(request, config) {
|
|
|
20539
22385
|
}
|
|
20540
22386
|
return {
|
|
20541
22387
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
20542
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
22388
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
20543
22389
|
"access-control-allow-headers": "content-type"
|
|
20544
22390
|
};
|
|
20545
22391
|
}
|
|
@@ -20575,9 +22421,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20575
22421
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20576
22422
|
return true;
|
|
20577
22423
|
}
|
|
20578
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20579
|
-
return true;
|
|
20580
|
-
}
|
|
20581
22424
|
return false;
|
|
20582
22425
|
}
|
|
20583
22426
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20602,7 +22445,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20602
22445
|
const pipeline = await IndexPipeline.create({
|
|
20603
22446
|
cwd,
|
|
20604
22447
|
configPath: options.configPath,
|
|
20605
|
-
logger: logger3
|
|
22448
|
+
logger: logger3,
|
|
22449
|
+
hooks: options.hooks
|
|
20606
22450
|
});
|
|
20607
22451
|
const stats = await pipeline.run({
|
|
20608
22452
|
changedOnly: options.changedOnly ?? true,
|