searchsocket 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3308 -524
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2310 -466
- package/dist/index.d.cts +101 -40
- package/dist/index.d.ts +101 -40
- package/dist/index.js +2310 -466
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2342 -465
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2344 -467
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +20 -2
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/sveltekit.cjs
CHANGED
|
@@ -1,27 +1,33 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var crypto = require('crypto');
|
|
4
|
+
var fs9 = require('fs/promises');
|
|
4
5
|
var path = require('path');
|
|
6
|
+
var webStandardStreamableHttp_js = require('@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js');
|
|
7
|
+
var fs = require('fs');
|
|
5
8
|
var jiti = require('jiti');
|
|
6
9
|
var zod = require('zod');
|
|
10
|
+
var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
|
|
11
|
+
require('@modelcontextprotocol/sdk/server/stdio.js');
|
|
12
|
+
require('@modelcontextprotocol/sdk/server/streamableHttp.js');
|
|
13
|
+
require('@modelcontextprotocol/sdk/server/express.js');
|
|
7
14
|
var child_process = require('child_process');
|
|
8
|
-
var
|
|
15
|
+
var vector = require('@upstash/vector');
|
|
9
16
|
var cheerio = require('cheerio');
|
|
10
17
|
var matter = require('gray-matter');
|
|
11
18
|
var fg = require('fast-glob');
|
|
12
19
|
var pLimit = require('p-limit');
|
|
13
|
-
var fs3 = require('fs/promises');
|
|
14
20
|
var net = require('net');
|
|
15
21
|
var zlib = require('zlib');
|
|
16
22
|
|
|
17
23
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
18
24
|
|
|
19
|
-
var
|
|
25
|
+
var fs9__default = /*#__PURE__*/_interopDefault(fs9);
|
|
20
26
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
27
|
+
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
21
28
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
22
29
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
23
30
|
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
24
|
-
var fs3__default = /*#__PURE__*/_interopDefault(fs3);
|
|
25
31
|
var net__default = /*#__PURE__*/_interopDefault(net);
|
|
26
32
|
|
|
27
33
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
@@ -5021,32 +5027,32 @@ var require_URL = __commonJS({
|
|
|
5021
5027
|
else
|
|
5022
5028
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5023
5029
|
}
|
|
5024
|
-
function remove_dot_segments(
|
|
5025
|
-
if (!
|
|
5030
|
+
function remove_dot_segments(path14) {
|
|
5031
|
+
if (!path14) return path14;
|
|
5026
5032
|
var output = "";
|
|
5027
|
-
while (
|
|
5028
|
-
if (
|
|
5029
|
-
|
|
5033
|
+
while (path14.length > 0) {
|
|
5034
|
+
if (path14 === "." || path14 === "..") {
|
|
5035
|
+
path14 = "";
|
|
5030
5036
|
break;
|
|
5031
5037
|
}
|
|
5032
|
-
var twochars =
|
|
5033
|
-
var threechars =
|
|
5034
|
-
var fourchars =
|
|
5038
|
+
var twochars = path14.substring(0, 2);
|
|
5039
|
+
var threechars = path14.substring(0, 3);
|
|
5040
|
+
var fourchars = path14.substring(0, 4);
|
|
5035
5041
|
if (threechars === "../") {
|
|
5036
|
-
|
|
5042
|
+
path14 = path14.substring(3);
|
|
5037
5043
|
} else if (twochars === "./") {
|
|
5038
|
-
|
|
5044
|
+
path14 = path14.substring(2);
|
|
5039
5045
|
} else if (threechars === "/./") {
|
|
5040
|
-
|
|
5041
|
-
} else if (twochars === "/." &&
|
|
5042
|
-
|
|
5043
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5044
|
-
|
|
5046
|
+
path14 = "/" + path14.substring(3);
|
|
5047
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5048
|
+
path14 = "/";
|
|
5049
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5050
|
+
path14 = "/" + path14.substring(4);
|
|
5045
5051
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5046
5052
|
} else {
|
|
5047
|
-
var segment =
|
|
5053
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5048
5054
|
output += segment;
|
|
5049
|
-
|
|
5055
|
+
path14 = path14.substring(segment.length);
|
|
5050
5056
|
}
|
|
5051
5057
|
}
|
|
5052
5058
|
return output;
|
|
@@ -16642,6 +16648,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16642
16648
|
dropSelectors: zod.z.array(zod.z.string()).optional(),
|
|
16643
16649
|
ignoreAttr: zod.z.string().optional(),
|
|
16644
16650
|
noindexAttr: zod.z.string().optional(),
|
|
16651
|
+
imageDescAttr: zod.z.string().optional(),
|
|
16645
16652
|
respectRobotsNoindex: zod.z.boolean().optional()
|
|
16646
16653
|
}).optional(),
|
|
16647
16654
|
transform: zod.z.object({
|
|
@@ -16657,35 +16664,48 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16657
16664
|
headingPathDepth: zod.z.number().int().positive().optional(),
|
|
16658
16665
|
dontSplitInside: zod.z.array(zod.z.enum(["code", "table", "blockquote"])).optional(),
|
|
16659
16666
|
prependTitle: zod.z.boolean().optional(),
|
|
16660
|
-
pageSummaryChunk: zod.z.boolean().optional()
|
|
16667
|
+
pageSummaryChunk: zod.z.boolean().optional(),
|
|
16668
|
+
weightHeadings: zod.z.boolean().optional()
|
|
16661
16669
|
}).optional(),
|
|
16662
16670
|
upstash: zod.z.object({
|
|
16663
16671
|
url: zod.z.string().url().optional(),
|
|
16664
16672
|
token: zod.z.string().min(1).optional(),
|
|
16665
16673
|
urlEnv: zod.z.string().min(1).optional(),
|
|
16666
|
-
tokenEnv: zod.z.string().min(1).optional()
|
|
16674
|
+
tokenEnv: zod.z.string().min(1).optional(),
|
|
16675
|
+
namespaces: zod.z.object({
|
|
16676
|
+
pages: zod.z.string().min(1).optional(),
|
|
16677
|
+
chunks: zod.z.string().min(1).optional()
|
|
16678
|
+
}).optional()
|
|
16679
|
+
}).optional(),
|
|
16680
|
+
embedding: zod.z.object({
|
|
16681
|
+
model: zod.z.string().optional(),
|
|
16682
|
+
dimensions: zod.z.number().int().positive().optional(),
|
|
16683
|
+
taskType: zod.z.string().optional(),
|
|
16684
|
+
batchSize: zod.z.number().int().positive().optional()
|
|
16667
16685
|
}).optional(),
|
|
16668
16686
|
search: zod.z.object({
|
|
16669
|
-
semanticWeight: zod.z.number().min(0).max(1).optional(),
|
|
16670
|
-
inputEnrichment: zod.z.boolean().optional(),
|
|
16671
|
-
reranking: zod.z.boolean().optional(),
|
|
16672
16687
|
dualSearch: zod.z.boolean().optional(),
|
|
16673
16688
|
pageSearchWeight: zod.z.number().min(0).max(1).optional()
|
|
16674
16689
|
}).optional(),
|
|
16675
16690
|
ranking: zod.z.object({
|
|
16676
16691
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16677
16692
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16693
|
+
enableFreshnessBoost: zod.z.boolean().optional(),
|
|
16694
|
+
freshnessDecayRate: zod.z.number().positive().optional(),
|
|
16695
|
+
enableAnchorTextBoost: zod.z.boolean().optional(),
|
|
16678
16696
|
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16679
16697
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16680
16698
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16681
16699
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16682
|
-
|
|
16700
|
+
minScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16683
16701
|
scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
|
|
16684
16702
|
weights: zod.z.object({
|
|
16685
16703
|
incomingLinks: zod.z.number().optional(),
|
|
16686
16704
|
depth: zod.z.number().optional(),
|
|
16687
16705
|
aggregation: zod.z.number().optional(),
|
|
16688
|
-
titleMatch: zod.z.number().optional()
|
|
16706
|
+
titleMatch: zod.z.number().optional(),
|
|
16707
|
+
freshness: zod.z.number().optional(),
|
|
16708
|
+
anchorText: zod.z.number().optional()
|
|
16689
16709
|
}).optional()
|
|
16690
16710
|
}).optional(),
|
|
16691
16711
|
api: zod.z.object({
|
|
@@ -16700,12 +16720,28 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16700
16720
|
}).optional(),
|
|
16701
16721
|
mcp: zod.z.object({
|
|
16702
16722
|
enable: zod.z.boolean().optional(),
|
|
16723
|
+
access: zod.z.enum(["public", "private"]).optional(),
|
|
16703
16724
|
transport: zod.z.enum(["stdio", "http"]).optional(),
|
|
16704
16725
|
http: zod.z.object({
|
|
16705
16726
|
port: zod.z.number().int().positive().optional(),
|
|
16706
|
-
path: zod.z.string().optional()
|
|
16727
|
+
path: zod.z.string().optional(),
|
|
16728
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16729
|
+
apiKeyEnv: zod.z.string().min(1).optional()
|
|
16730
|
+
}).optional(),
|
|
16731
|
+
handle: zod.z.object({
|
|
16732
|
+
path: zod.z.string().optional(),
|
|
16733
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16734
|
+
enableJsonResponse: zod.z.boolean().optional()
|
|
16707
16735
|
}).optional()
|
|
16708
16736
|
}).optional(),
|
|
16737
|
+
llmsTxt: zod.z.object({
|
|
16738
|
+
enable: zod.z.boolean().optional(),
|
|
16739
|
+
outputPath: zod.z.string().optional(),
|
|
16740
|
+
title: zod.z.string().optional(),
|
|
16741
|
+
description: zod.z.string().optional(),
|
|
16742
|
+
generateFull: zod.z.boolean().optional(),
|
|
16743
|
+
serveMarkdownVariants: zod.z.boolean().optional()
|
|
16744
|
+
}).optional(),
|
|
16709
16745
|
state: zod.z.object({
|
|
16710
16746
|
dir: zod.z.string().optional()
|
|
16711
16747
|
}).optional()
|
|
@@ -16744,6 +16780,7 @@ function createDefaultConfig(projectId) {
|
|
|
16744
16780
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16745
16781
|
ignoreAttr: "data-search-ignore",
|
|
16746
16782
|
noindexAttr: "data-search-noindex",
|
|
16783
|
+
imageDescAttr: "data-search-description",
|
|
16747
16784
|
respectRobotsNoindex: true
|
|
16748
16785
|
},
|
|
16749
16786
|
transform: {
|
|
@@ -16753,39 +16790,52 @@ function createDefaultConfig(projectId) {
|
|
|
16753
16790
|
},
|
|
16754
16791
|
chunking: {
|
|
16755
16792
|
strategy: "hybrid",
|
|
16756
|
-
maxChars:
|
|
16793
|
+
maxChars: 1500,
|
|
16757
16794
|
overlapChars: 200,
|
|
16758
16795
|
minChars: 250,
|
|
16759
16796
|
headingPathDepth: 3,
|
|
16760
16797
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16761
16798
|
prependTitle: true,
|
|
16762
|
-
pageSummaryChunk: true
|
|
16799
|
+
pageSummaryChunk: true,
|
|
16800
|
+
weightHeadings: true
|
|
16763
16801
|
},
|
|
16764
16802
|
upstash: {
|
|
16765
|
-
urlEnv: "
|
|
16766
|
-
tokenEnv: "
|
|
16803
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16804
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16805
|
+
namespaces: {
|
|
16806
|
+
pages: "pages",
|
|
16807
|
+
chunks: "chunks"
|
|
16808
|
+
}
|
|
16809
|
+
},
|
|
16810
|
+
embedding: {
|
|
16811
|
+
model: "bge-large-en-v1.5",
|
|
16812
|
+
dimensions: 1024,
|
|
16813
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16814
|
+
batchSize: 100
|
|
16767
16815
|
},
|
|
16768
16816
|
search: {
|
|
16769
|
-
semanticWeight: 0.75,
|
|
16770
|
-
inputEnrichment: true,
|
|
16771
|
-
reranking: true,
|
|
16772
16817
|
dualSearch: true,
|
|
16773
16818
|
pageSearchWeight: 0.3
|
|
16774
16819
|
},
|
|
16775
16820
|
ranking: {
|
|
16776
16821
|
enableIncomingLinkBoost: true,
|
|
16777
16822
|
enableDepthBoost: true,
|
|
16823
|
+
enableFreshnessBoost: false,
|
|
16824
|
+
freshnessDecayRate: 1e-3,
|
|
16825
|
+
enableAnchorTextBoost: false,
|
|
16778
16826
|
pageWeights: {},
|
|
16779
16827
|
aggregationCap: 5,
|
|
16780
16828
|
aggregationDecay: 0.5,
|
|
16781
16829
|
minChunkScoreRatio: 0.5,
|
|
16782
|
-
|
|
16830
|
+
minScoreRatio: 0.7,
|
|
16783
16831
|
scoreGapThreshold: 0.4,
|
|
16784
16832
|
weights: {
|
|
16785
16833
|
incomingLinks: 0.05,
|
|
16786
16834
|
depth: 0.03,
|
|
16787
16835
|
aggregation: 0.1,
|
|
16788
|
-
titleMatch: 0.15
|
|
16836
|
+
titleMatch: 0.15,
|
|
16837
|
+
freshness: 0.1,
|
|
16838
|
+
anchorText: 0.1
|
|
16789
16839
|
}
|
|
16790
16840
|
},
|
|
16791
16841
|
api: {
|
|
@@ -16796,12 +16846,23 @@ function createDefaultConfig(projectId) {
|
|
|
16796
16846
|
},
|
|
16797
16847
|
mcp: {
|
|
16798
16848
|
enable: process.env.NODE_ENV !== "production",
|
|
16849
|
+
access: "private",
|
|
16799
16850
|
transport: "stdio",
|
|
16800
16851
|
http: {
|
|
16801
16852
|
port: 3338,
|
|
16802
16853
|
path: "/mcp"
|
|
16854
|
+
},
|
|
16855
|
+
handle: {
|
|
16856
|
+
path: "/api/mcp",
|
|
16857
|
+
enableJsonResponse: true
|
|
16803
16858
|
}
|
|
16804
16859
|
},
|
|
16860
|
+
llmsTxt: {
|
|
16861
|
+
enable: false,
|
|
16862
|
+
outputPath: "static/llms.txt",
|
|
16863
|
+
generateFull: true,
|
|
16864
|
+
serveMarkdownVariants: false
|
|
16865
|
+
},
|
|
16805
16866
|
state: {
|
|
16806
16867
|
dir: ".searchsocket"
|
|
16807
16868
|
}
|
|
@@ -16929,7 +16990,15 @@ ${issues}`
|
|
|
16929
16990
|
},
|
|
16930
16991
|
upstash: {
|
|
16931
16992
|
...defaults.upstash,
|
|
16932
|
-
...parsed.upstash
|
|
16993
|
+
...parsed.upstash,
|
|
16994
|
+
namespaces: {
|
|
16995
|
+
...defaults.upstash.namespaces,
|
|
16996
|
+
...parsed.upstash?.namespaces
|
|
16997
|
+
}
|
|
16998
|
+
},
|
|
16999
|
+
embedding: {
|
|
17000
|
+
...defaults.embedding,
|
|
17001
|
+
...parsed.embedding
|
|
16933
17002
|
},
|
|
16934
17003
|
search: {
|
|
16935
17004
|
...defaults.search,
|
|
@@ -16966,8 +17035,16 @@ ${issues}`
|
|
|
16966
17035
|
http: {
|
|
16967
17036
|
...defaults.mcp.http,
|
|
16968
17037
|
...parsed.mcp?.http
|
|
17038
|
+
},
|
|
17039
|
+
handle: {
|
|
17040
|
+
...defaults.mcp.handle,
|
|
17041
|
+
...parsed.mcp?.handle
|
|
16969
17042
|
}
|
|
16970
17043
|
},
|
|
17044
|
+
llmsTxt: {
|
|
17045
|
+
...defaults.llmsTxt,
|
|
17046
|
+
...parsed.llmsTxt
|
|
17047
|
+
},
|
|
16971
17048
|
state: {
|
|
16972
17049
|
...defaults.state,
|
|
16973
17050
|
...parsed.state
|
|
@@ -16987,6 +17064,15 @@ ${issues}`
|
|
|
16987
17064
|
maxDepth: 10
|
|
16988
17065
|
};
|
|
16989
17066
|
}
|
|
17067
|
+
if (merged.mcp.access === "public") {
|
|
17068
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17069
|
+
if (!resolvedKey) {
|
|
17070
|
+
throw new SearchSocketError(
|
|
17071
|
+
"CONFIG_MISSING",
|
|
17072
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17073
|
+
);
|
|
17074
|
+
}
|
|
17075
|
+
}
|
|
16990
17076
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16991
17077
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16992
17078
|
}
|
|
@@ -17035,13 +17121,84 @@ function normalizeMarkdown(input) {
|
|
|
17035
17121
|
function sanitizeScopeName(scopeName) {
|
|
17036
17122
|
return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
|
|
17037
17123
|
}
|
|
17124
|
+
function markdownToPlain(markdown) {
|
|
17125
|
+
return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
|
|
17126
|
+
}
|
|
17038
17127
|
function toSnippet(markdown, maxLen = 220) {
|
|
17039
|
-
const plain = markdown
|
|
17128
|
+
const plain = markdownToPlain(markdown);
|
|
17040
17129
|
if (plain.length <= maxLen) {
|
|
17041
17130
|
return plain;
|
|
17042
17131
|
}
|
|
17043
17132
|
return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
|
|
17044
17133
|
}
|
|
17134
|
+
function queryAwareExcerpt(markdown, query, maxLen = 220) {
|
|
17135
|
+
const plain = markdownToPlain(markdown);
|
|
17136
|
+
if (plain.length <= maxLen) return plain;
|
|
17137
|
+
const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
|
|
17138
|
+
if (tokens.length === 0) return toSnippet(markdown, maxLen);
|
|
17139
|
+
const positions = [];
|
|
17140
|
+
for (let ti = 0; ti < tokens.length; ti++) {
|
|
17141
|
+
const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
17142
|
+
const re = new RegExp(escaped, "gi");
|
|
17143
|
+
let m;
|
|
17144
|
+
while ((m = re.exec(plain)) !== null) {
|
|
17145
|
+
positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
|
|
17146
|
+
}
|
|
17147
|
+
}
|
|
17148
|
+
if (positions.length === 0) return toSnippet(markdown, maxLen);
|
|
17149
|
+
positions.sort((a, b) => a.start - b.start);
|
|
17150
|
+
let bestUniqueCount = 0;
|
|
17151
|
+
let bestTotalCount = 0;
|
|
17152
|
+
let bestLeft = 0;
|
|
17153
|
+
let bestRight = 0;
|
|
17154
|
+
let left = 0;
|
|
17155
|
+
const tokenCounts = /* @__PURE__ */ new Map();
|
|
17156
|
+
for (let right = 0; right < positions.length; right++) {
|
|
17157
|
+
tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
|
|
17158
|
+
while (positions[right].end - positions[left].start > maxLen && left < right) {
|
|
17159
|
+
const leftToken = positions[left].tokenIdx;
|
|
17160
|
+
const cnt = tokenCounts.get(leftToken) - 1;
|
|
17161
|
+
if (cnt === 0) tokenCounts.delete(leftToken);
|
|
17162
|
+
else tokenCounts.set(leftToken, cnt);
|
|
17163
|
+
left++;
|
|
17164
|
+
}
|
|
17165
|
+
const uniqueCount = tokenCounts.size;
|
|
17166
|
+
const totalCount = right - left + 1;
|
|
17167
|
+
if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
|
|
17168
|
+
bestUniqueCount = uniqueCount;
|
|
17169
|
+
bestTotalCount = totalCount;
|
|
17170
|
+
bestLeft = left;
|
|
17171
|
+
bestRight = right;
|
|
17172
|
+
}
|
|
17173
|
+
}
|
|
17174
|
+
const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
|
|
17175
|
+
let start = Math.max(0, mid - Math.floor(maxLen / 2));
|
|
17176
|
+
let end = Math.min(plain.length, start + maxLen);
|
|
17177
|
+
start = Math.max(0, end - maxLen);
|
|
17178
|
+
if (start > 0) {
|
|
17179
|
+
const spaceIdx = plain.lastIndexOf(" ", start);
|
|
17180
|
+
if (spaceIdx > start - 30) {
|
|
17181
|
+
start = spaceIdx + 1;
|
|
17182
|
+
}
|
|
17183
|
+
}
|
|
17184
|
+
if (end < plain.length) {
|
|
17185
|
+
const spaceIdx = plain.indexOf(" ", end);
|
|
17186
|
+
if (spaceIdx !== -1 && spaceIdx < end + 30) {
|
|
17187
|
+
end = spaceIdx;
|
|
17188
|
+
}
|
|
17189
|
+
}
|
|
17190
|
+
let excerpt = plain.slice(start, end);
|
|
17191
|
+
if (excerpt.length > Math.ceil(maxLen * 1.2)) {
|
|
17192
|
+
excerpt = excerpt.slice(0, maxLen);
|
|
17193
|
+
const lastSpace = excerpt.lastIndexOf(" ");
|
|
17194
|
+
if (lastSpace > maxLen * 0.5) {
|
|
17195
|
+
excerpt = excerpt.slice(0, lastSpace);
|
|
17196
|
+
}
|
|
17197
|
+
}
|
|
17198
|
+
const prefix = start > 0 ? "\u2026" : "";
|
|
17199
|
+
const suffix = end < plain.length ? "\u2026" : "";
|
|
17200
|
+
return `${prefix}${excerpt}${suffix}`;
|
|
17201
|
+
}
|
|
17045
17202
|
function extractFirstParagraph(markdown) {
|
|
17046
17203
|
const lines = markdown.split("\n");
|
|
17047
17204
|
let inFence = false;
|
|
@@ -17148,162 +17305,288 @@ function joinUrl(baseUrl, route) {
|
|
|
17148
17305
|
const routePart = ensureLeadingSlash(route);
|
|
17149
17306
|
return `${base}${routePart}`;
|
|
17150
17307
|
}
|
|
17151
|
-
|
|
17152
|
-
// src/vector/upstash.ts
|
|
17153
|
-
function chunkIndexName(scope) {
|
|
17154
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17155
|
-
}
|
|
17156
|
-
function pageIndexName(scope) {
|
|
17157
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17158
|
-
}
|
|
17159
17308
|
var UpstashSearchStore = class {
|
|
17160
|
-
|
|
17309
|
+
index;
|
|
17310
|
+
pagesNs;
|
|
17311
|
+
chunksNs;
|
|
17161
17312
|
constructor(opts) {
|
|
17162
|
-
this.
|
|
17163
|
-
|
|
17164
|
-
|
|
17165
|
-
return this.client.index(chunkIndexName(scope));
|
|
17166
|
-
}
|
|
17167
|
-
pageIndex(scope) {
|
|
17168
|
-
return this.client.index(pageIndexName(scope));
|
|
17313
|
+
this.index = opts.index;
|
|
17314
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17315
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17169
17316
|
}
|
|
17170
17317
|
async upsertChunks(chunks, scope) {
|
|
17171
17318
|
if (chunks.length === 0) return;
|
|
17172
|
-
const index = this.chunkIndex(scope);
|
|
17173
17319
|
const BATCH_SIZE = 100;
|
|
17174
17320
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17175
17321
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17176
|
-
await
|
|
17177
|
-
|
|
17178
|
-
|
|
17179
|
-
|
|
17180
|
-
|
|
17181
|
-
|
|
17182
|
-
|
|
17183
|
-
|
|
17184
|
-
|
|
17185
|
-
|
|
17186
|
-
|
|
17187
|
-
|
|
17322
|
+
await this.chunksNs.upsert(
|
|
17323
|
+
batch.map((c) => ({
|
|
17324
|
+
id: c.id,
|
|
17325
|
+
data: c.data,
|
|
17326
|
+
metadata: {
|
|
17327
|
+
...c.metadata,
|
|
17328
|
+
projectId: scope.projectId,
|
|
17329
|
+
scopeName: scope.scopeName,
|
|
17330
|
+
type: c.metadata.type || "chunk"
|
|
17331
|
+
}
|
|
17332
|
+
}))
|
|
17333
|
+
);
|
|
17334
|
+
}
|
|
17335
|
+
}
|
|
17336
|
+
async search(data, opts, scope) {
|
|
17337
|
+
const filterParts = [
|
|
17338
|
+
`projectId = '${scope.projectId}'`,
|
|
17339
|
+
`scopeName = '${scope.scopeName}'`
|
|
17340
|
+
];
|
|
17341
|
+
if (opts.filter) {
|
|
17342
|
+
filterParts.push(opts.filter);
|
|
17343
|
+
}
|
|
17344
|
+
const results = await this.chunksNs.query({
|
|
17345
|
+
data,
|
|
17346
|
+
topK: opts.limit,
|
|
17347
|
+
includeMetadata: true,
|
|
17348
|
+
filter: filterParts.join(" AND "),
|
|
17349
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17350
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17351
|
+
});
|
|
17352
|
+
return results.map((doc) => ({
|
|
17353
|
+
id: String(doc.id),
|
|
17354
|
+
score: doc.score,
|
|
17355
|
+
metadata: {
|
|
17356
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17357
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17358
|
+
url: doc.metadata?.url ?? "",
|
|
17359
|
+
path: doc.metadata?.path ?? "",
|
|
17360
|
+
title: doc.metadata?.title ?? "",
|
|
17361
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17362
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17363
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17364
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17365
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17366
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17367
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17368
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17369
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17370
|
+
tags: doc.metadata?.tags ?? [],
|
|
17371
|
+
description: doc.metadata?.description || void 0,
|
|
17372
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17373
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17374
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17375
|
+
}
|
|
17376
|
+
}));
|
|
17377
|
+
}
|
|
17378
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17379
|
+
const filterParts = [
|
|
17380
|
+
`projectId = '${scope.projectId}'`,
|
|
17381
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17382
|
+
`url = '${url}'`
|
|
17383
|
+
];
|
|
17384
|
+
if (opts.filter) {
|
|
17385
|
+
filterParts.push(opts.filter);
|
|
17386
|
+
}
|
|
17387
|
+
const results = await this.chunksNs.query({
|
|
17388
|
+
data,
|
|
17389
|
+
topK: opts.limit,
|
|
17390
|
+
includeMetadata: true,
|
|
17391
|
+
filter: filterParts.join(" AND "),
|
|
17392
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17393
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17188
17394
|
});
|
|
17189
17395
|
return results.map((doc) => ({
|
|
17190
|
-
id: doc.id,
|
|
17396
|
+
id: String(doc.id),
|
|
17191
17397
|
score: doc.score,
|
|
17192
17398
|
metadata: {
|
|
17193
17399
|
projectId: doc.metadata?.projectId ?? "",
|
|
17194
17400
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17195
|
-
url: doc.
|
|
17401
|
+
url: doc.metadata?.url ?? "",
|
|
17196
17402
|
path: doc.metadata?.path ?? "",
|
|
17197
|
-
title: doc.
|
|
17198
|
-
sectionTitle: doc.
|
|
17199
|
-
headingPath: doc.
|
|
17403
|
+
title: doc.metadata?.title ?? "",
|
|
17404
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17405
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17200
17406
|
snippet: doc.metadata?.snippet ?? "",
|
|
17201
|
-
chunkText: doc.
|
|
17407
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17202
17408
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17203
17409
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17204
17410
|
depth: doc.metadata?.depth ?? 0,
|
|
17205
17411
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17206
17412
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17207
|
-
tags: doc.
|
|
17413
|
+
tags: doc.metadata?.tags ?? [],
|
|
17208
17414
|
description: doc.metadata?.description || void 0,
|
|
17209
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17415
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17416
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17417
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17210
17418
|
}
|
|
17211
17419
|
}));
|
|
17212
17420
|
}
|
|
17213
|
-
async
|
|
17214
|
-
|
|
17421
|
+
async searchPagesByText(data, opts, scope) {
|
|
17422
|
+
return this.queryPages({ data }, opts, scope);
|
|
17423
|
+
}
|
|
17424
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17425
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17426
|
+
}
|
|
17427
|
+
async queryPages(input, opts, scope) {
|
|
17428
|
+
const filterParts = [
|
|
17429
|
+
`projectId = '${scope.projectId}'`,
|
|
17430
|
+
`scopeName = '${scope.scopeName}'`
|
|
17431
|
+
];
|
|
17432
|
+
if (opts.filter) {
|
|
17433
|
+
filterParts.push(opts.filter);
|
|
17434
|
+
}
|
|
17215
17435
|
let results;
|
|
17216
17436
|
try {
|
|
17217
|
-
results = await
|
|
17218
|
-
|
|
17219
|
-
|
|
17220
|
-
|
|
17221
|
-
|
|
17222
|
-
|
|
17223
|
-
|
|
17437
|
+
results = await this.pagesNs.query({
|
|
17438
|
+
...input,
|
|
17439
|
+
topK: opts.limit,
|
|
17440
|
+
includeMetadata: true,
|
|
17441
|
+
filter: filterParts.join(" AND "),
|
|
17442
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17443
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17224
17444
|
});
|
|
17225
17445
|
} catch {
|
|
17226
17446
|
return [];
|
|
17227
17447
|
}
|
|
17228
17448
|
return results.map((doc) => ({
|
|
17229
|
-
id: doc.id,
|
|
17449
|
+
id: String(doc.id),
|
|
17230
17450
|
score: doc.score,
|
|
17231
|
-
title: doc.
|
|
17232
|
-
url: doc.
|
|
17233
|
-
description: doc.
|
|
17234
|
-
tags: doc.
|
|
17451
|
+
title: doc.metadata?.title ?? "",
|
|
17452
|
+
url: doc.metadata?.url ?? "",
|
|
17453
|
+
description: doc.metadata?.description ?? "",
|
|
17454
|
+
tags: doc.metadata?.tags ?? [],
|
|
17235
17455
|
depth: doc.metadata?.depth ?? 0,
|
|
17236
17456
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17237
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17457
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17458
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17238
17459
|
}));
|
|
17239
17460
|
}
|
|
17240
|
-
async deleteByIds(ids,
|
|
17461
|
+
async deleteByIds(ids, _scope) {
|
|
17241
17462
|
if (ids.length === 0) return;
|
|
17242
|
-
const
|
|
17243
|
-
const BATCH_SIZE = 500;
|
|
17463
|
+
const BATCH_SIZE = 100;
|
|
17244
17464
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17245
17465
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17246
|
-
await
|
|
17466
|
+
await this.chunksNs.delete(batch);
|
|
17247
17467
|
}
|
|
17248
17468
|
}
|
|
17249
17469
|
async deleteScope(scope) {
|
|
17250
|
-
|
|
17251
|
-
const
|
|
17252
|
-
|
|
17253
|
-
|
|
17254
|
-
|
|
17255
|
-
|
|
17256
|
-
|
|
17257
|
-
|
|
17258
|
-
|
|
17470
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17471
|
+
const ids = [];
|
|
17472
|
+
let cursor = "0";
|
|
17473
|
+
try {
|
|
17474
|
+
for (; ; ) {
|
|
17475
|
+
const result = await ns.range({
|
|
17476
|
+
cursor,
|
|
17477
|
+
limit: 100,
|
|
17478
|
+
includeMetadata: true
|
|
17479
|
+
});
|
|
17480
|
+
for (const doc of result.vectors) {
|
|
17481
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17482
|
+
ids.push(String(doc.id));
|
|
17483
|
+
}
|
|
17484
|
+
}
|
|
17485
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17486
|
+
cursor = result.nextCursor;
|
|
17487
|
+
}
|
|
17488
|
+
} catch {
|
|
17489
|
+
}
|
|
17490
|
+
if (ids.length > 0) {
|
|
17491
|
+
const BATCH_SIZE = 100;
|
|
17492
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17493
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17494
|
+
await ns.delete(batch);
|
|
17495
|
+
}
|
|
17496
|
+
}
|
|
17259
17497
|
}
|
|
17260
17498
|
}
|
|
17261
17499
|
async listScopes(projectId) {
|
|
17262
|
-
const
|
|
17263
|
-
const
|
|
17264
|
-
|
|
17265
|
-
for (const name of allIndexes) {
|
|
17266
|
-
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
17267
|
-
const scopeName = name.slice(prefix.length);
|
|
17268
|
-
scopeNames.add(scopeName);
|
|
17269
|
-
}
|
|
17270
|
-
}
|
|
17271
|
-
const scopes = [];
|
|
17272
|
-
for (const scopeName of scopeNames) {
|
|
17273
|
-
const scope = {
|
|
17274
|
-
projectId,
|
|
17275
|
-
scopeName,
|
|
17276
|
-
scopeId: `${projectId}:${scopeName}`
|
|
17277
|
-
};
|
|
17500
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17501
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17502
|
+
let cursor = "0";
|
|
17278
17503
|
try {
|
|
17279
|
-
|
|
17280
|
-
|
|
17281
|
-
|
|
17282
|
-
|
|
17283
|
-
|
|
17284
|
-
|
|
17285
|
-
|
|
17504
|
+
for (; ; ) {
|
|
17505
|
+
const result = await ns.range({
|
|
17506
|
+
cursor,
|
|
17507
|
+
limit: 100,
|
|
17508
|
+
includeMetadata: true
|
|
17509
|
+
});
|
|
17510
|
+
for (const doc of result.vectors) {
|
|
17511
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17512
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17513
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17514
|
+
}
|
|
17515
|
+
}
|
|
17516
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17517
|
+
cursor = result.nextCursor;
|
|
17518
|
+
}
|
|
17286
17519
|
} catch {
|
|
17287
|
-
scopes.push({
|
|
17288
|
-
projectId,
|
|
17289
|
-
scopeName,
|
|
17290
|
-
lastIndexedAt: "unknown",
|
|
17291
|
-
documentCount: 0
|
|
17292
|
-
});
|
|
17293
17520
|
}
|
|
17294
17521
|
}
|
|
17295
|
-
return
|
|
17522
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17523
|
+
projectId,
|
|
17524
|
+
scopeName,
|
|
17525
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17526
|
+
documentCount: count
|
|
17527
|
+
}));
|
|
17296
17528
|
}
|
|
17297
17529
|
async getContentHashes(scope) {
|
|
17298
17530
|
const map = /* @__PURE__ */ new Map();
|
|
17299
|
-
const index = this.chunkIndex(scope);
|
|
17300
17531
|
let cursor = "0";
|
|
17301
17532
|
try {
|
|
17302
17533
|
for (; ; ) {
|
|
17303
|
-
const result = await
|
|
17304
|
-
|
|
17305
|
-
|
|
17306
|
-
|
|
17534
|
+
const result = await this.chunksNs.range({
|
|
17535
|
+
cursor,
|
|
17536
|
+
limit: 100,
|
|
17537
|
+
includeMetadata: true
|
|
17538
|
+
});
|
|
17539
|
+
for (const doc of result.vectors) {
|
|
17540
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17541
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17542
|
+
}
|
|
17543
|
+
}
|
|
17544
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17545
|
+
cursor = result.nextCursor;
|
|
17546
|
+
}
|
|
17547
|
+
} catch {
|
|
17548
|
+
}
|
|
17549
|
+
return map;
|
|
17550
|
+
}
|
|
17551
|
+
async listPages(scope, opts) {
|
|
17552
|
+
const cursor = opts?.cursor ?? "0";
|
|
17553
|
+
const limit = opts?.limit ?? 50;
|
|
17554
|
+
try {
|
|
17555
|
+
const result = await this.pagesNs.range({
|
|
17556
|
+
cursor,
|
|
17557
|
+
limit,
|
|
17558
|
+
includeMetadata: true
|
|
17559
|
+
});
|
|
17560
|
+
const pages = result.vectors.filter(
|
|
17561
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17562
|
+
).map((doc) => ({
|
|
17563
|
+
url: doc.metadata?.url ?? "",
|
|
17564
|
+
title: doc.metadata?.title ?? "",
|
|
17565
|
+
description: doc.metadata?.description ?? "",
|
|
17566
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17567
|
+
}));
|
|
17568
|
+
const response = { pages };
|
|
17569
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17570
|
+
response.nextCursor = result.nextCursor;
|
|
17571
|
+
}
|
|
17572
|
+
return response;
|
|
17573
|
+
} catch {
|
|
17574
|
+
return { pages: [] };
|
|
17575
|
+
}
|
|
17576
|
+
}
|
|
17577
|
+
async getPageHashes(scope) {
|
|
17578
|
+
const map = /* @__PURE__ */ new Map();
|
|
17579
|
+
let cursor = "0";
|
|
17580
|
+
try {
|
|
17581
|
+
for (; ; ) {
|
|
17582
|
+
const result = await this.pagesNs.range({
|
|
17583
|
+
cursor,
|
|
17584
|
+
limit: 100,
|
|
17585
|
+
includeMetadata: true
|
|
17586
|
+
});
|
|
17587
|
+
for (const doc of result.vectors) {
|
|
17588
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17589
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17307
17590
|
}
|
|
17308
17591
|
}
|
|
17309
17592
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17313,47 +17596,43 @@ var UpstashSearchStore = class {
|
|
|
17313
17596
|
}
|
|
17314
17597
|
return map;
|
|
17315
17598
|
}
|
|
17599
|
+
async deletePagesByIds(ids, _scope) {
|
|
17600
|
+
if (ids.length === 0) return;
|
|
17601
|
+
const BATCH_SIZE = 50;
|
|
17602
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17603
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17604
|
+
await this.pagesNs.delete(batch);
|
|
17605
|
+
}
|
|
17606
|
+
}
|
|
17316
17607
|
async upsertPages(pages, scope) {
|
|
17317
17608
|
if (pages.length === 0) return;
|
|
17318
|
-
const index = this.pageIndex(scope);
|
|
17319
17609
|
const BATCH_SIZE = 50;
|
|
17320
17610
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17321
17611
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17322
|
-
|
|
17323
|
-
|
|
17324
|
-
|
|
17325
|
-
|
|
17326
|
-
|
|
17327
|
-
|
|
17328
|
-
|
|
17329
|
-
|
|
17330
|
-
|
|
17331
|
-
|
|
17332
|
-
}
|
|
17333
|
-
|
|
17334
|
-
markdown: p.markdown,
|
|
17335
|
-
projectId: p.projectId,
|
|
17336
|
-
scopeName: p.scopeName,
|
|
17337
|
-
routeFile: p.routeFile,
|
|
17338
|
-
routeResolution: p.routeResolution,
|
|
17339
|
-
incomingLinks: p.incomingLinks,
|
|
17340
|
-
outgoingLinks: p.outgoingLinks,
|
|
17341
|
-
depth: p.depth,
|
|
17342
|
-
indexedAt: p.indexedAt
|
|
17343
|
-
}
|
|
17344
|
-
}));
|
|
17345
|
-
await index.upsert(docs);
|
|
17612
|
+
await this.pagesNs.upsert(
|
|
17613
|
+
batch.map((p) => ({
|
|
17614
|
+
id: p.id,
|
|
17615
|
+
data: p.data,
|
|
17616
|
+
metadata: {
|
|
17617
|
+
...p.metadata,
|
|
17618
|
+
projectId: scope.projectId,
|
|
17619
|
+
scopeName: scope.scopeName,
|
|
17620
|
+
type: "page"
|
|
17621
|
+
}
|
|
17622
|
+
}))
|
|
17623
|
+
);
|
|
17346
17624
|
}
|
|
17347
17625
|
}
|
|
17348
17626
|
async getPage(url, scope) {
|
|
17349
|
-
const index = this.pageIndex(scope);
|
|
17350
17627
|
try {
|
|
17351
|
-
const results = await
|
|
17628
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17629
|
+
includeMetadata: true
|
|
17630
|
+
});
|
|
17352
17631
|
const doc = results[0];
|
|
17353
|
-
if (!doc) return null;
|
|
17632
|
+
if (!doc || !doc.metadata) return null;
|
|
17354
17633
|
return {
|
|
17355
|
-
url: doc.
|
|
17356
|
-
title: doc.
|
|
17634
|
+
url: doc.metadata.url,
|
|
17635
|
+
title: doc.metadata.title,
|
|
17357
17636
|
markdown: doc.metadata.markdown,
|
|
17358
17637
|
projectId: doc.metadata.projectId,
|
|
17359
17638
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17361,27 +17640,86 @@ var UpstashSearchStore = class {
|
|
|
17361
17640
|
routeResolution: doc.metadata.routeResolution,
|
|
17362
17641
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17363
17642
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17643
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17364
17644
|
depth: doc.metadata.depth,
|
|
17365
|
-
tags: doc.
|
|
17645
|
+
tags: doc.metadata.tags ?? [],
|
|
17366
17646
|
indexedAt: doc.metadata.indexedAt,
|
|
17367
|
-
summary: doc.
|
|
17368
|
-
description: doc.
|
|
17369
|
-
keywords: doc.
|
|
17647
|
+
summary: doc.metadata.summary || void 0,
|
|
17648
|
+
description: doc.metadata.description || void 0,
|
|
17649
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17650
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17370
17651
|
};
|
|
17371
17652
|
} catch {
|
|
17372
17653
|
return null;
|
|
17373
17654
|
}
|
|
17374
17655
|
}
|
|
17656
|
+
async fetchPageWithVector(url, scope) {
|
|
17657
|
+
try {
|
|
17658
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17659
|
+
includeMetadata: true,
|
|
17660
|
+
includeVectors: true
|
|
17661
|
+
});
|
|
17662
|
+
const doc = results[0];
|
|
17663
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17664
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17665
|
+
return null;
|
|
17666
|
+
}
|
|
17667
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17668
|
+
} catch {
|
|
17669
|
+
return null;
|
|
17670
|
+
}
|
|
17671
|
+
}
|
|
17672
|
+
async fetchPagesBatch(urls, scope) {
|
|
17673
|
+
if (urls.length === 0) return [];
|
|
17674
|
+
try {
|
|
17675
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17676
|
+
includeMetadata: true
|
|
17677
|
+
});
|
|
17678
|
+
const out = [];
|
|
17679
|
+
for (const doc of results) {
|
|
17680
|
+
if (!doc || !doc.metadata) continue;
|
|
17681
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17682
|
+
continue;
|
|
17683
|
+
}
|
|
17684
|
+
out.push({
|
|
17685
|
+
url: doc.metadata.url,
|
|
17686
|
+
title: doc.metadata.title,
|
|
17687
|
+
routeFile: doc.metadata.routeFile,
|
|
17688
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17689
|
+
});
|
|
17690
|
+
}
|
|
17691
|
+
return out;
|
|
17692
|
+
} catch {
|
|
17693
|
+
return [];
|
|
17694
|
+
}
|
|
17695
|
+
}
|
|
17375
17696
|
async deletePages(scope) {
|
|
17697
|
+
const ids = [];
|
|
17698
|
+
let cursor = "0";
|
|
17376
17699
|
try {
|
|
17377
|
-
|
|
17378
|
-
|
|
17700
|
+
for (; ; ) {
|
|
17701
|
+
const result = await this.pagesNs.range({
|
|
17702
|
+
cursor,
|
|
17703
|
+
limit: 100,
|
|
17704
|
+
includeMetadata: true
|
|
17705
|
+
});
|
|
17706
|
+
for (const doc of result.vectors) {
|
|
17707
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17708
|
+
ids.push(String(doc.id));
|
|
17709
|
+
}
|
|
17710
|
+
}
|
|
17711
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17712
|
+
cursor = result.nextCursor;
|
|
17713
|
+
}
|
|
17379
17714
|
} catch {
|
|
17380
17715
|
}
|
|
17716
|
+
if (ids.length > 0) {
|
|
17717
|
+
await this.deletePagesByIds(ids, scope);
|
|
17718
|
+
}
|
|
17381
17719
|
}
|
|
17382
17720
|
async health() {
|
|
17383
17721
|
try {
|
|
17384
|
-
await this.
|
|
17722
|
+
await this.index.info();
|
|
17385
17723
|
return { ok: true };
|
|
17386
17724
|
} catch (error) {
|
|
17387
17725
|
return {
|
|
@@ -17391,14 +17729,31 @@ var UpstashSearchStore = class {
|
|
|
17391
17729
|
}
|
|
17392
17730
|
}
|
|
17393
17731
|
async dropAllIndexes(projectId) {
|
|
17394
|
-
const
|
|
17395
|
-
|
|
17396
|
-
|
|
17397
|
-
|
|
17398
|
-
|
|
17399
|
-
const
|
|
17400
|
-
|
|
17401
|
-
|
|
17732
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17733
|
+
const ids = [];
|
|
17734
|
+
let cursor = "0";
|
|
17735
|
+
try {
|
|
17736
|
+
for (; ; ) {
|
|
17737
|
+
const result = await ns.range({
|
|
17738
|
+
cursor,
|
|
17739
|
+
limit: 100,
|
|
17740
|
+
includeMetadata: true
|
|
17741
|
+
});
|
|
17742
|
+
for (const doc of result.vectors) {
|
|
17743
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17744
|
+
ids.push(String(doc.id));
|
|
17745
|
+
}
|
|
17746
|
+
}
|
|
17747
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17748
|
+
cursor = result.nextCursor;
|
|
17749
|
+
}
|
|
17750
|
+
} catch {
|
|
17751
|
+
}
|
|
17752
|
+
if (ids.length > 0) {
|
|
17753
|
+
const BATCH_SIZE = 100;
|
|
17754
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17755
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17756
|
+
await ns.delete(batch);
|
|
17402
17757
|
}
|
|
17403
17758
|
}
|
|
17404
17759
|
}
|
|
@@ -17412,12 +17767,16 @@ async function createUpstashStore(config) {
|
|
|
17412
17767
|
if (!url || !token) {
|
|
17413
17768
|
throw new SearchSocketError(
|
|
17414
17769
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17415
|
-
`Missing Upstash
|
|
17770
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17416
17771
|
);
|
|
17417
17772
|
}
|
|
17418
|
-
const {
|
|
17419
|
-
const
|
|
17420
|
-
return new UpstashSearchStore({
|
|
17773
|
+
const { Index } = await import('@upstash/vector');
|
|
17774
|
+
const index = new Index({ url, token });
|
|
17775
|
+
return new UpstashSearchStore({
|
|
17776
|
+
index,
|
|
17777
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17778
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17779
|
+
});
|
|
17421
17780
|
}
|
|
17422
17781
|
|
|
17423
17782
|
// src/utils/pattern.ts
|
|
@@ -17460,29 +17819,65 @@ function nonNegativeOrZero(value) {
|
|
|
17460
17819
|
/**
 * Canonicalize text for loose title comparison: lower-case it, drop every
 * character that is not a-z, 0-9 or whitespace, collapse whitespace runs to a
 * single space, and trim the ends.
 *
 * @param text String to normalize.
 * @returns The normalized string (may be empty).
 */
function normalizeForTitleMatch(text) {
  const lowered = text.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, "");
  const singleSpaced = alphanumericOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
17463
|
-
function rankHits(hits, config, query) {
|
|
17822
|
+
function rankHits(hits, config, query, debug) {
|
|
17464
17823
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
17465
17824
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
17466
17825
|
return hits.map((hit) => {
|
|
17467
|
-
|
|
17826
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
17827
|
+
let score = baseScore;
|
|
17828
|
+
let incomingLinkBoostValue = 0;
|
|
17468
17829
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
17469
17830
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
17470
|
-
|
|
17831
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
17832
|
+
score += incomingLinkBoostValue;
|
|
17471
17833
|
}
|
|
17834
|
+
let depthBoostValue = 0;
|
|
17472
17835
|
if (config.ranking.enableDepthBoost) {
|
|
17473
17836
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
17474
|
-
|
|
17837
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
17838
|
+
score += depthBoostValue;
|
|
17475
17839
|
}
|
|
17840
|
+
let titleMatchBoostValue = 0;
|
|
17476
17841
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
17477
17842
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
17478
17843
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
17479
|
-
|
|
17844
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
17845
|
+
score += titleMatchBoostValue;
|
|
17480
17846
|
}
|
|
17481
17847
|
}
|
|
17482
|
-
|
|
17848
|
+
let freshnessBoostValue = 0;
|
|
17849
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
17850
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
17851
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
17852
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
17853
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
17854
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
17855
|
+
score += freshnessBoostValue;
|
|
17856
|
+
}
|
|
17857
|
+
}
|
|
17858
|
+
let anchorTextMatchBoostValue = 0;
|
|
17859
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
17860
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
17861
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
17862
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
17863
|
+
score += anchorTextMatchBoostValue;
|
|
17864
|
+
}
|
|
17865
|
+
}
|
|
17866
|
+
const result = {
|
|
17483
17867
|
hit,
|
|
17484
17868
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
17485
17869
|
};
|
|
17870
|
+
if (debug) {
|
|
17871
|
+
result.breakdown = {
|
|
17872
|
+
baseScore,
|
|
17873
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
17874
|
+
depthBoost: depthBoostValue,
|
|
17875
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
17876
|
+
freshnessBoost: freshnessBoostValue,
|
|
17877
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
17878
|
+
};
|
|
17879
|
+
}
|
|
17880
|
+
return result;
|
|
17486
17881
|
}).sort((a, b) => {
|
|
17487
17882
|
const delta = b.finalScore - a.finalScore;
|
|
17488
17883
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -17491,12 +17886,13 @@ function rankHits(hits, config, query) {
|
|
|
17491
17886
|
function trimByScoreGap(results, config) {
|
|
17492
17887
|
if (results.length === 0) return results;
|
|
17493
17888
|
const threshold = config.ranking.scoreGapThreshold;
|
|
17494
|
-
const
|
|
17495
|
-
if (
|
|
17496
|
-
const
|
|
17497
|
-
|
|
17498
|
-
|
|
17499
|
-
|
|
17889
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
17890
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
17891
|
+
const topScore = results[0].pageScore;
|
|
17892
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
17893
|
+
const minThreshold = topScore * minScoreRatio;
|
|
17894
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
17895
|
+
}
|
|
17500
17896
|
}
|
|
17501
17897
|
if (threshold > 0 && results.length > 1) {
|
|
17502
17898
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -17566,75 +17962,276 @@ function aggregateByPage(ranked, config) {
|
|
|
17566
17962
|
return Number.isNaN(delta) ? 0 : delta;
|
|
17567
17963
|
});
|
|
17568
17964
|
}
|
|
17569
|
-
function
|
|
17570
|
-
|
|
17571
|
-
const
|
|
17572
|
-
|
|
17573
|
-
|
|
17574
|
-
|
|
17575
|
-
|
|
17576
|
-
|
|
17577
|
-
|
|
17578
|
-
|
|
17579
|
-
|
|
17580
|
-
if (pageHit) {
|
|
17581
|
-
pagesWithChunks.add(url);
|
|
17582
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
17583
|
-
return {
|
|
17584
|
-
hit: ranked.hit,
|
|
17585
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
17586
|
-
};
|
|
17965
|
+
/**
 * Score page-level hits and return them sorted by descending final score.
 *
 * For every hit the base score (`hit.score`, or -Infinity when it is not a
 * finite number) is increased by the boosts enabled in `config.ranking`:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth) * weights.depth
 *   - title match:    weights.titleMatch when the normalized title contains
 *                     the normalized query or vice versa
 *   - freshness:      1 / (1 + daysSincePublished * freshnessDecayRate)
 *                     * weights.freshness, only for a finite numeric `publishedAt`
 * The sum is then multiplied by the page weight found for the hit's URL.
 * Pages whose weight is exactly 0 are excluded from the result.
 *
 * The page weight is looked up once per hit and excluded pages are skipped
 * before any scoring work; the current time is read once so that every hit in
 * a call is aged against the same instant.
 *
 * @param pageHits Array of page hits (`url`, `title`, `score`, `depth`, `incomingLinks`, ...).
 * @param config Configuration; only `config.ranking` is read.
 * @param query Raw query text; a falsy value disables the title-match boost.
 * @param debug When truthy, each result carries a `breakdown` of its score parts.
 * @returns New array of ranked page results.
 */
function rankPageHits(pageHits, config, query, debug) {
  const { ranking } = config;
  const { weights } = ranking;
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
  const titleMatchWeight = weights.titleMatch;
  const now = Date.now();
  const ranked = [];
  for (const hit of pageHits) {
    const pageWeight = findPageWeight(hit.url, ranking.pageWeights);
    // A weight of exactly 0 removes the page from results altogether.
    if (pageWeight === 0) continue;
    const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    let score = baseScore;
    let incomingLinkBoostValue = 0;
    if (ranking.enableIncomingLinkBoost) {
      incomingLinkBoostValue = Math.log(1 + nonNegativeOrZero(hit.incomingLinks)) * weights.incomingLinks;
      score += incomingLinkBoostValue;
    }
    let depthBoostValue = 0;
    if (ranking.enableDepthBoost) {
      depthBoostValue = 1 / (1 + nonNegativeOrZero(hit.depth)) * weights.depth;
      score += depthBoostValue;
    }
    let titleMatchBoostValue = 0;
    if (normalizedQuery && titleMatchWeight > 0) {
      const normalizedTitle = normalizeForTitleMatch(hit.title);
      const titlesOverlap = normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle));
      if (titlesOverlap) {
        titleMatchBoostValue = titleMatchWeight;
        score += titleMatchBoostValue;
      }
    }
    let freshnessBoostValue = 0;
    if (ranking.enableFreshnessBoost) {
      const publishedAt = hit.publishedAt;
      if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
        // 864e5 ms = one day; future timestamps count as zero days old.
        const daysSince = Math.max(0, (now - publishedAt) / 864e5);
        const decay = 1 / (1 + nonNegativeOrZero(daysSince) * ranking.freshnessDecayRate);
        freshnessBoostValue = decay * weights.freshness;
        score += freshnessBoostValue;
      }
    }
    if (pageWeight !== 1) {
      score *= pageWeight;
    }
    const result = {
      url: hit.url,
      title: hit.title,
      description: hit.description,
      routeFile: hit.routeFile,
      depth: hit.depth,
      incomingLinks: hit.incomingLinks,
      tags: hit.tags,
      baseScore,
      finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
      publishedAt: hit.publishedAt
    };
    if (debug) {
      result.breakdown = {
        baseScore,
        pageWeight,
        incomingLinkBoost: incomingLinkBoostValue,
        depthBoost: depthBoostValue,
        titleMatchBoost: titleMatchBoostValue,
        freshnessBoost: freshnessBoostValue
      };
    }
    ranked.push(result);
  }
  return ranked.sort((a, b) => {
    // -Infinity minus -Infinity is NaN; treat such pairs as equal.
    const delta = b.finalScore - a.finalScore;
    return Number.isNaN(delta) ? 0 : delta;
  });
}
|
|
18033
|
+
/**
 * Cut the low-quality tail off an already-sorted list of ranked pages.
 *
 * Two independent rules are applied in order:
 *   1. When `minScoreRatio > 0` and the top score is finite and positive,
 *      drop every page scoring below `topScore * minScoreRatio`.
 *   2. When `scoreGapThreshold > 0`, stop at the first page whose relative
 *      drop from its predecessor, `(prev - current) / prev`, reaches the
 *      threshold (only evaluated while `prev > 0`).
 *
 * @param results Ranked pages, best first; each has a `finalScore`.
 * @param config Configuration; reads `ranking.scoreGapThreshold` and `ranking.minScoreRatio`.
 * @returns The input array itself when nothing was trimmed, otherwise a new array.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const { scoreGapThreshold, minScoreRatio } = config.ranking;
  let kept = results;
  if (minScoreRatio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * minScoreRatio;
      kept = kept.filter((entry) => entry.finalScore >= floor);
    }
  }
  if (!(scoreGapThreshold > 0) || kept.length <= 1) {
    return kept;
  }
  const cutAt = kept.findIndex((entry, idx) => {
    if (idx === 0) return false;
    const previous = kept[idx - 1].finalScore;
    return previous > 0 && (previous - entry.finalScore) / previous >= scoreGapThreshold;
  });
  return cutAt === -1 ? kept : kept.slice(0, cutAt);
}
|
|
18058
|
+
|
|
18059
|
+
// src/search/related-pages.ts
|
|
18060
|
+
/**
 * Path-similarity score between two URL paths.
 *
 * Both paths are split on "/" (empty segments ignored). The score is
 * `2 * sharedPrefixSegments / (segmentsA + segmentsB)`, where only the
 * leading run of identical segments counts as shared.
 *
 * @param urlA First URL path.
 * @param urlB Second URL path.
 * @returns A number in [0, 1]; 1 when both paths have no segments, 0 when
 *   exactly one of them has none.
 */
function diceScore(urlA, urlB) {
  const partsA = urlA.split("/").filter(Boolean);
  const partsB = urlB.split("/").filter(Boolean);
  const total = partsA.length + partsB.length;
  if (total === 0) return 1;
  if (partsA.length === 0 || partsB.length === 0) return 0;
  const limit = Math.min(partsA.length, partsB.length);
  let shared = 0;
  while (shared < limit && partsA[shared] === partsB[shared]) {
    shared += 1;
  }
  return 2 * shared / total;
}
|
|
18076
|
+
/**
 * Blend the three relatedness signals into one score:
 * 0.5 if the pages are linked, plus 0.3 * `dice`, plus 0.2 * `semantic`.
 *
 * @param isLinked Truthy when a link relationship exists.
 * @param dice Path-similarity value.
 * @param semantic Semantic-similarity value.
 * @returns The weighted sum.
 */
function compositeScore(isLinked, dice, semantic) {
  const linkComponent = isLinked ? 0.5 : 0;
  const pathComponent = 0.3 * dice;
  const semanticComponent = 0.2 * semantic;
  return linkComponent + pathComponent + semanticComponent;
}
|
|
18079
|
+
/**
 * Pick the single label that best describes how two pages relate.
 *
 * Precedence: an outgoing link wins over an incoming link, which wins over
 * path similarity (`dice` strictly greater than 0.4), with "semantic" as the
 * fallback.
 *
 * @param isOutgoing Truthy when an outgoing-link relationship applies.
 * @param isIncoming Truthy when an incoming-link relationship applies.
 * @param dice Path-similarity value.
 * @returns "outgoing_link", "incoming_link", "sibling" or "semantic".
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  if (isOutgoing) {
    return "outgoing_link";
  }
  if (isIncoming) {
    return "incoming_link";
  }
  return dice > 0.4 ? "sibling" : "semantic";
}
|
|
18085
|
+
|
|
18086
|
+
// src/utils/structured-meta.ts
|
|
18087
|
+
// Identifier-like keys only: a letter or underscore followed by letters,
// digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
/**
 * Tell whether `key` matches the identifier pattern above.
 *
 * @param key Candidate key.
 * @returns `true` when the whole key matches, otherwise `false`.
 */
function validateMetaKey(key) {
  const isIdentifierLike = VALID_KEY_RE.test(key);
  return isIdentifierLike;
}
|
|
18091
|
+
/**
 * Convert the raw string content of a structured meta entry to the value
 * implied by `dataType`.
 *
 *   - "number" / "date": the numeric value when `content` parses to a finite
 *     number, otherwise `content` unchanged. Blank content is treated as
 *     unparsable: `Number("")` and `Number("  ")` evaluate to 0, which would
 *     otherwise store a bogus 0 (the epoch, for dates).
 *   - "boolean": `true` only for the exact string "true".
 *   - "string[]": comma-separated items, trimmed, with empty items (from
 *     doubled or trailing commas) removed; `[]` for falsy content.
 *   - anything else: `content` unchanged.
 *
 * @param content Raw content value.
 * @param dataType One of "number", "boolean", "string[]", "date", or any other tag.
 * @returns The converted value, or `content` itself when no conversion applies.
 */
function parseMetaValue(content, dataType) {
  switch (dataType) {
    case "number":
    case "date": {
      if (typeof content === "string" && content.trim() === "") {
        return content;
      }
      const parsed = Number(content);
      return Number.isFinite(parsed) ? parsed : content;
    }
    case "boolean":
      return content === "true";
    case "string[]":
      return content ? content.split(",").map((s) => s.trim()).filter((s) => s.length > 0) : [];
    default:
      return content;
  }
}
|
|
18109
|
+
/**
 * Escape a string for embedding inside a single-quoted filter literal by
 * doubling every single quote.
 *
 * @param s String to escape.
 * @returns `s` with each `'` replaced by `''`.
 */
function escapeFilterValue(s) {
  return s.split("'").join("''");
}
|
|
18112
|
+
/**
 * Turn a `{ key: value }` filter object into a filter expression string.
 *
 * Keys that fail `validateMetaKey` are silently skipped. Each remaining key
 * is addressed as `meta.<key>`: string values become
 * `meta.<key> CONTAINS '<escaped value>'`, every other value becomes
 * `meta.<key> = <value>`. Clauses are joined with " AND ".
 *
 * @param filters Object mapping meta keys to string, number or boolean values.
 * @returns The combined filter string; empty when no clause was produced.
 */
function buildMetaFilterString(filters) {
  return Object.entries(filters).filter(([key]) => validateMetaKey(key)).map(([key, value]) => {
    const field = `meta.${key}`;
    return typeof value === "string" ? `${field} CONTAINS '${escapeFilterValue(value)}'` : `${field} = ${value}`;
  }).join(" AND ");
}
|
|
17624
18127
|
|
|
17625
18128
|
// src/search/engine.ts
|
|
18129
|
+
// Optional per-request overrides for ranking and search settings. Every field
// is optional; ratio-like fields are limited to the range [0, 1].
var rankingOverridesSchema = (() => {
  const { z } = zod;
  const optionalFlag = () => z.boolean().optional();
  const optionalNumber = () => z.number().optional();
  const optionalRatio = () => z.number().min(0).max(1).optional();
  const weights = z.object({
    incomingLinks: optionalNumber(),
    depth: optionalNumber(),
    aggregation: optionalNumber(),
    titleMatch: optionalNumber()
  }).optional();
  const ranking = z.object({
    enableIncomingLinkBoost: optionalFlag(),
    enableDepthBoost: optionalFlag(),
    aggregationCap: z.number().int().positive().optional(),
    aggregationDecay: optionalRatio(),
    minChunkScoreRatio: optionalRatio(),
    minScoreRatio: optionalRatio(),
    scoreGapThreshold: optionalRatio(),
    weights
  }).optional();
  const search = z.object({
    pageSearchWeight: optionalRatio()
  }).optional();
  return z.object({ ranking, search }).optional();
})();
|
|
17626
18149
|
// Shape of a search request: `q` is required (trimmed, non-empty); everything
// else is optional. `topK` is capped at 100 and `maxSubResults` at 20.
var requestSchema = (() => {
  const { z } = zod;
  const filterValue = z.union([z.string(), z.number(), z.boolean()]);
  return z.object({
    q: z.string().trim().min(1),
    topK: z.number().int().positive().max(100).optional(),
    scope: z.string().optional(),
    pathPrefix: z.string().optional(),
    tags: z.array(z.string()).optional(),
    filters: z.record(z.string(), filterValue).optional(),
    groupBy: z.enum(["page", "chunk"]).optional(),
    maxSubResults: z.number().int().positive().max(20).optional(),
    debug: z.boolean().optional(),
    rankingOverrides: rankingOverridesSchema
  });
})();
|
|
17634
|
-
var
|
|
17635
|
-
|
|
17636
|
-
|
|
17637
|
-
|
|
18161
|
+
// Limit of 2000. Presumably the maximum number of pages loaded when building the site-structure tree — TODO confirm at its use site, which is not visible in this section.
var MAX_SITE_STRUCTURE_PAGES = 2e3;
|
|
18162
|
+
/**
 * Create an empty tree node for `url` at the given depth.
 *
 * The node starts without title or route file, is marked as not indexed and
 * has its own fresh, empty `children` array.
 *
 * @param url URL path the node represents.
 * @param depth Depth value stored on the node.
 * @returns The new node object.
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
|
|
18165
|
+
/**
 * Arrange a flat list of pages into a URL-path tree.
 *
 * Every page URL is normalized with `normalizeUrlPath` and split into
 * segments. A node is created for each path prefix ("/a", "/a/b", ...), so
 * intermediate paths without a page of their own still appear, with
 * `isIndexed: false`. Nodes that correspond to a page receive its title and
 * route file and are flagged as indexed. Children are sorted by URL using
 * `localeCompare`, and `childCount` holds the number of direct children.
 *
 * @param pages Array of `{ url, title, routeFile }`.
 * @param pathPrefix Optional path; when truthy, the node for that
 *   (normalized) path is returned instead of the root, or a fresh empty node
 *   if no such node exists.
 * @returns The root node for "/", or the subtree root described above.
 */
function buildTree(pages, pathPrefix) {
  const toSegments = (path) => path.split("/").filter(Boolean);
  const markIndexed = (node, page) => {
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  };
  const nodesByUrl = /* @__PURE__ */ new Map();
  const root = makeNode("/", 0);
  nodesByUrl.set("/", root);

  // Pass 1: make sure a node exists for every prefix of every page path.
  for (const page of pages) {
    const normalized = normalizeUrlPath(page.url);
    const segments = toSegments(normalized);
    if (segments.length === 0) {
      markIndexed(root, page);
      continue;
    }
    let partialUrl = "";
    segments.forEach((segment, idx) => {
      partialUrl += `/${segment}`;
      if (!nodesByUrl.has(partialUrl)) {
        nodesByUrl.set(partialUrl, makeNode(partialUrl, idx + 1));
      }
    });
    markIndexed(nodesByUrl.get(normalized), page);
  }

  // Pass 2: attach each node to its parent (the path minus its last segment).
  for (const [url, node] of nodesByUrl) {
    if (url === "/") continue;
    const segments = toSegments(url);
    const parentUrl = segments.length === 1 ? "/" : `/${segments.slice(0, -1).join("/")}`;
    const parent = nodesByUrl.get(parentUrl) ?? root;
    parent.children.push(node);
  }

  // Pass 3: sort children and record child counts across the whole tree.
  const pending = [root];
  while (pending.length > 0) {
    const node = pending.pop();
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      pending.push(child);
    }
  }

  if (!pathPrefix) {
    return root;
  }
  const normalizedPrefix = normalizeUrlPath(pathPrefix);
  const subtreeRoot = nodesByUrl.get(normalizedPrefix);
  if (subtreeRoot) {
    return subtreeRoot;
  }
  return makeNode(normalizedPrefix, toSegments(normalizedPrefix).length);
}
|
|
18214
|
+
/**
 * Produce a copy of `base` with the `search`, `ranking` and `ranking.weights`
 * sections overlaid by the matching sections of `overrides`.
 *
 * `ranking.weights` is merged key by key, so overriding one weight leaves the
 * others intact. Override keys whose value is `undefined` are ignored:
 * spreading them would replace a configured number with `undefined`, which
 * silently disables thresholds and turns boosted scores into NaN. A missing
 * `overrides` argument is treated as "no overrides".
 *
 * Neither argument is mutated. Sections other than `search` and `ranking` are
 * carried over from `base` by reference.
 *
 * @param base Full configuration object with `search` and `ranking.weights`.
 * @param overrides Optional `{ ranking?: { ..., weights? }, search? }`.
 * @returns A new configuration object.
 */
function mergeRankingOverrides(base, overrides) {
  // Keep only the keys that actually carry a value.
  const definedOnly = (section) => Object.fromEntries(
    Object.entries(section ?? {}).filter(([, value]) => value !== void 0)
  );
  return {
    ...base,
    search: {
      ...base.search,
      ...definedOnly(overrides?.search)
    },
    ranking: {
      ...base.ranking,
      ...definedOnly(overrides?.ranking),
      // Written last so a partial `weights` override never replaces the whole map.
      weights: {
        ...base.ranking.weights,
        ...definedOnly(overrides?.ranking?.weights)
      }
    }
  };
}
|
|
18231
|
+
var SearchEngine = class _SearchEngine {
|
|
18232
|
+
cwd;
|
|
18233
|
+
config;
|
|
18234
|
+
store;
|
|
17638
18235
|
constructor(options) {
|
|
17639
18236
|
this.cwd = options.cwd;
|
|
17640
18237
|
this.config = options.config;
|
|
@@ -17660,125 +18257,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
17660
18257
|
}
|
|
17661
18258
|
const input = parsed.data;
|
|
17662
18259
|
const totalStart = process.hrtime.bigint();
|
|
18260
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
17663
18261
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
17664
18262
|
const topK = input.topK ?? 10;
|
|
18263
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
17665
18264
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17666
|
-
const
|
|
17667
|
-
const
|
|
17668
|
-
|
|
17669
|
-
|
|
17670
|
-
|
|
17671
|
-
|
|
17672
|
-
|
|
17673
|
-
|
|
17674
|
-
|
|
18265
|
+
const queryText = input.q;
|
|
18266
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
18267
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
18268
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
18269
|
+
const metaFilter = metaFilterStr || void 0;
|
|
18270
|
+
const applyPagePostFilters = (hits) => {
|
|
18271
|
+
let filtered = hits;
|
|
18272
|
+
if (pathPrefix) {
|
|
18273
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
18274
|
+
}
|
|
18275
|
+
if (filterTags) {
|
|
18276
|
+
filtered = filtered.filter(
|
|
18277
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
18278
|
+
);
|
|
17675
18279
|
}
|
|
17676
|
-
|
|
17677
|
-
|
|
17678
|
-
const
|
|
18280
|
+
return filtered;
|
|
18281
|
+
};
|
|
18282
|
+
const applyChunkPostFilters = (hits) => {
|
|
18283
|
+
let filtered = hits;
|
|
18284
|
+
if (filterTags) {
|
|
18285
|
+
filtered = filtered.filter(
|
|
18286
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18287
|
+
);
|
|
18288
|
+
}
|
|
18289
|
+
return filtered;
|
|
18290
|
+
};
|
|
17679
18291
|
const searchStart = process.hrtime.bigint();
|
|
17680
|
-
|
|
17681
|
-
|
|
17682
|
-
const
|
|
17683
|
-
const
|
|
17684
|
-
|
|
17685
|
-
|
|
17686
|
-
|
|
17687
|
-
|
|
17688
|
-
|
|
17689
|
-
|
|
17690
|
-
|
|
17691
|
-
|
|
17692
|
-
|
|
17693
|
-
|
|
17694
|
-
|
|
17695
|
-
|
|
17696
|
-
|
|
17697
|
-
{
|
|
17698
|
-
limit: chunkLimit,
|
|
17699
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17700
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17701
|
-
reranking: false,
|
|
17702
|
-
filter
|
|
17703
|
-
},
|
|
18292
|
+
if (groupByPage) {
|
|
18293
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
18294
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
18295
|
+
const pageHits = await this.store.searchPagesByText(
|
|
18296
|
+
queryText,
|
|
18297
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
18298
|
+
resolvedScope
|
|
18299
|
+
);
|
|
18300
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
18301
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
18302
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
18303
|
+
const topPages = rankedPages.slice(0, topK);
|
|
18304
|
+
const chunkPromises = topPages.map(
|
|
18305
|
+
(page) => this.store.searchChunksByUrl(
|
|
18306
|
+
queryText,
|
|
18307
|
+
page.url,
|
|
18308
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
17704
18309
|
resolvedScope
|
|
17705
|
-
)
|
|
17706
|
-
|
|
17707
|
-
const
|
|
17708
|
-
|
|
18310
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
18311
|
+
);
|
|
18312
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
18313
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18314
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
18315
|
+
return {
|
|
18316
|
+
q: input.q,
|
|
18317
|
+
scope: resolvedScope.scopeName,
|
|
18318
|
+
results,
|
|
18319
|
+
meta: {
|
|
18320
|
+
timingsMs: {
|
|
18321
|
+
search: Math.round(searchMs),
|
|
18322
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18323
|
+
}
|
|
18324
|
+
}
|
|
18325
|
+
};
|
|
17709
18326
|
} else {
|
|
18327
|
+
const candidateK = Math.max(50, topK);
|
|
18328
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
17710
18329
|
const hits = await this.store.search(
|
|
17711
|
-
|
|
17712
|
-
{
|
|
17713
|
-
limit: candidateK,
|
|
17714
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17715
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17716
|
-
reranking: this.config.search.reranking,
|
|
17717
|
-
filter
|
|
17718
|
-
},
|
|
18330
|
+
queryText,
|
|
18331
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
17719
18332
|
resolvedScope
|
|
17720
18333
|
);
|
|
17721
|
-
|
|
17722
|
-
|
|
17723
|
-
|
|
17724
|
-
|
|
17725
|
-
|
|
17726
|
-
|
|
17727
|
-
|
|
17728
|
-
|
|
17729
|
-
|
|
17730
|
-
|
|
17731
|
-
|
|
17732
|
-
|
|
18334
|
+
let filtered = hits;
|
|
18335
|
+
if (pathPrefix) {
|
|
18336
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
18337
|
+
}
|
|
18338
|
+
if (filterTags) {
|
|
18339
|
+
filtered = filtered.filter(
|
|
18340
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18341
|
+
);
|
|
18342
|
+
}
|
|
18343
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
18344
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18345
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
18346
|
+
return {
|
|
18347
|
+
q: input.q,
|
|
18348
|
+
scope: resolvedScope.scopeName,
|
|
18349
|
+
results,
|
|
18350
|
+
meta: {
|
|
18351
|
+
timingsMs: {
|
|
18352
|
+
search: Math.round(searchMs),
|
|
18353
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18354
|
+
}
|
|
17733
18355
|
}
|
|
18356
|
+
};
|
|
18357
|
+
}
|
|
18358
|
+
}
|
|
18359
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
18360
|
+
return rankedPages.map((page, i) => {
|
|
18361
|
+
const chunks = allChunks[i] ?? [];
|
|
18362
|
+
const bestChunk = chunks[0];
|
|
18363
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
18364
|
+
const result = {
|
|
18365
|
+
url: page.url,
|
|
18366
|
+
title: page.title,
|
|
18367
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
18368
|
+
snippet,
|
|
18369
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
18370
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
18371
|
+
routeFile: page.routeFile,
|
|
18372
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
18373
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
18374
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
18375
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
18376
|
+
headingPath: c.metadata.headingPath,
|
|
18377
|
+
score: Number(c.score.toFixed(6))
|
|
18378
|
+
})) : void 0
|
|
18379
|
+
};
|
|
18380
|
+
if (debug && page.breakdown) {
|
|
18381
|
+
result.breakdown = {
|
|
18382
|
+
baseScore: page.breakdown.baseScore,
|
|
18383
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
18384
|
+
depthBoost: page.breakdown.depthBoost,
|
|
18385
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
18386
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
18387
|
+
anchorTextMatchBoost: 0
|
|
18388
|
+
};
|
|
17734
18389
|
}
|
|
17735
|
-
|
|
18390
|
+
return result;
|
|
18391
|
+
});
|
|
17736
18392
|
}
|
|
17737
|
-
ensureSnippet(hit) {
|
|
18393
|
+
ensureSnippet(hit, query) {
|
|
18394
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
18395
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
17738
18396
|
const snippet = hit.hit.metadata.snippet;
|
|
17739
18397
|
if (snippet && snippet.length >= 30) return snippet;
|
|
17740
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
17741
18398
|
if (chunkText) return toSnippet(chunkText);
|
|
17742
18399
|
return snippet || "";
|
|
17743
18400
|
}
|
|
17744
|
-
buildResults(ordered, topK, groupByPage,
|
|
18401
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
18402
|
+
const cfg = config ?? this.config;
|
|
17745
18403
|
if (groupByPage) {
|
|
17746
|
-
let pages = aggregateByPage(ordered,
|
|
17747
|
-
pages = trimByScoreGap(pages,
|
|
17748
|
-
const minRatio =
|
|
18404
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
18405
|
+
pages = trimByScoreGap(pages, cfg);
|
|
18406
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
17749
18407
|
return pages.slice(0, topK).map((page) => {
|
|
17750
18408
|
const bestScore = page.bestChunk.finalScore;
|
|
17751
18409
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
17752
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
17753
|
-
|
|
18410
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
18411
|
+
const result = {
|
|
17754
18412
|
url: page.url,
|
|
17755
18413
|
title: page.title,
|
|
17756
18414
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
17757
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
18415
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
18416
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
17758
18417
|
score: Number(page.pageScore.toFixed(6)),
|
|
17759
18418
|
routeFile: page.routeFile,
|
|
17760
|
-
chunks: meaningful.length
|
|
18419
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
17761
18420
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
17762
|
-
snippet: this.ensureSnippet(c),
|
|
18421
|
+
snippet: this.ensureSnippet(c, query),
|
|
18422
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
17763
18423
|
headingPath: c.hit.metadata.headingPath,
|
|
17764
18424
|
score: Number(c.finalScore.toFixed(6))
|
|
17765
18425
|
})) : void 0
|
|
17766
18426
|
};
|
|
18427
|
+
if (debug && page.bestChunk.breakdown) {
|
|
18428
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
18429
|
+
}
|
|
18430
|
+
return result;
|
|
17767
18431
|
});
|
|
17768
18432
|
} else {
|
|
17769
18433
|
let filtered = ordered;
|
|
17770
|
-
const
|
|
17771
|
-
if (
|
|
17772
|
-
|
|
17773
|
-
|
|
17774
|
-
|
|
17775
|
-
|
|
17776
|
-
|
|
17777
|
-
|
|
17778
|
-
|
|
17779
|
-
|
|
17780
|
-
|
|
17781
|
-
|
|
18434
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
18435
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
18436
|
+
const topScore = ordered[0].finalScore;
|
|
18437
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
18438
|
+
const threshold = topScore * minScoreRatio;
|
|
18439
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
18440
|
+
}
|
|
18441
|
+
}
|
|
18442
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
18443
|
+
const result = {
|
|
18444
|
+
url: hit.metadata.url,
|
|
18445
|
+
title: hit.metadata.title,
|
|
18446
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
18447
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
18448
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
18449
|
+
score: Number(finalScore.toFixed(6)),
|
|
18450
|
+
routeFile: hit.metadata.routeFile
|
|
18451
|
+
};
|
|
18452
|
+
if (debug && breakdown) {
|
|
18453
|
+
result.breakdown = breakdown;
|
|
18454
|
+
}
|
|
18455
|
+
return result;
|
|
18456
|
+
});
|
|
17782
18457
|
}
|
|
17783
18458
|
}
|
|
17784
18459
|
async getPage(pathOrUrl, scope) {
|
|
@@ -17804,6 +18479,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
17804
18479
|
markdown: page.markdown
|
|
17805
18480
|
};
|
|
17806
18481
|
}
|
|
18482
|
+
async listPages(opts) {
|
|
18483
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18484
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
18485
|
+
return this.store.listPages(resolvedScope, {
|
|
18486
|
+
cursor: opts?.cursor,
|
|
18487
|
+
limit: opts?.limit,
|
|
18488
|
+
pathPrefix
|
|
18489
|
+
});
|
|
18490
|
+
}
|
|
18491
|
+
async getSiteStructure(opts) {
|
|
18492
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
18493
|
+
const allPages = [];
|
|
18494
|
+
let cursor;
|
|
18495
|
+
let truncated = false;
|
|
18496
|
+
do {
|
|
18497
|
+
const result = await this.listPages({
|
|
18498
|
+
pathPrefix: opts?.pathPrefix,
|
|
18499
|
+
scope: opts?.scope,
|
|
18500
|
+
cursor,
|
|
18501
|
+
limit: 200
|
|
18502
|
+
});
|
|
18503
|
+
allPages.push(...result.pages);
|
|
18504
|
+
cursor = result.nextCursor;
|
|
18505
|
+
if (allPages.length >= maxPages) {
|
|
18506
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
18507
|
+
allPages.length = maxPages;
|
|
18508
|
+
break;
|
|
18509
|
+
}
|
|
18510
|
+
} while (cursor);
|
|
18511
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
18512
|
+
return {
|
|
18513
|
+
root: root2,
|
|
18514
|
+
totalPages: allPages.length,
|
|
18515
|
+
truncated
|
|
18516
|
+
};
|
|
18517
|
+
}
|
|
18518
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
18519
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18520
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
18521
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
18522
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
18523
|
+
if (!source) {
|
|
18524
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
18525
|
+
}
|
|
18526
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
18527
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
18528
|
+
source.vector,
|
|
18529
|
+
{ limit: 50 },
|
|
18530
|
+
resolvedScope
|
|
18531
|
+
);
|
|
18532
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
18533
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
18534
|
+
for (const hit of filteredHits) {
|
|
18535
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
18536
|
+
}
|
|
18537
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
18538
|
+
for (const hit of filteredHits) {
|
|
18539
|
+
candidateUrls.add(hit.url);
|
|
18540
|
+
}
|
|
18541
|
+
for (const url of sourceOutgoing) {
|
|
18542
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
18543
|
+
}
|
|
18544
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
18545
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
18546
|
+
);
|
|
18547
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
18548
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
18549
|
+
for (const hit of filteredHits) {
|
|
18550
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
18551
|
+
}
|
|
18552
|
+
for (const p of fetchedPages) {
|
|
18553
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
18554
|
+
}
|
|
18555
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
18556
|
+
if (semanticUrls.length > 0) {
|
|
18557
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
18558
|
+
for (const p of semanticPageData) {
|
|
18559
|
+
const existing = metaMap.get(p.url);
|
|
18560
|
+
if (existing) {
|
|
18561
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
18562
|
+
}
|
|
18563
|
+
}
|
|
18564
|
+
}
|
|
18565
|
+
const candidates = [];
|
|
18566
|
+
for (const url of candidateUrls) {
|
|
18567
|
+
const meta = metaMap.get(url);
|
|
18568
|
+
if (!meta) continue;
|
|
18569
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
18570
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
18571
|
+
const isLinked = isOutgoing || isIncoming;
|
|
18572
|
+
const dice = diceScore(urlPath, url);
|
|
18573
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
18574
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
18575
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
18576
|
+
candidates.push({
|
|
18577
|
+
url,
|
|
18578
|
+
title: meta.title,
|
|
18579
|
+
score: Number(score.toFixed(6)),
|
|
18580
|
+
relationshipType,
|
|
18581
|
+
routeFile: meta.routeFile
|
|
18582
|
+
});
|
|
18583
|
+
}
|
|
18584
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
18585
|
+
const results = candidates.slice(0, topK);
|
|
18586
|
+
return {
|
|
18587
|
+
sourceUrl: urlPath,
|
|
18588
|
+
scope: resolvedScope.scopeName,
|
|
18589
|
+
relatedPages: results
|
|
18590
|
+
};
|
|
18591
|
+
}
|
|
17807
18592
|
async health() {
|
|
17808
18593
|
return this.store.health();
|
|
17809
18594
|
}
|
|
@@ -17819,6 +18604,215 @@ var SearchEngine = class _SearchEngine {
|
|
|
17819
18604
|
}
|
|
17820
18605
|
};
|
|
17821
18606
|
|
|
18607
|
+
// src/mcp/server.ts
|
|
18608
|
+
/**
 * Create an MCP server exposing the search engine as tools.
 *
 * Registered tools: `search`, `get_page`, `list_pages`, `get_site_structure`,
 * `find_source_file`, `get_related_pages`. Every tool answers with a single
 * text content item holding the JSON-serialised result; `search` additionally
 * returns the result as `structuredContent`.
 *
 * @param engine Object providing `search`, `getPage`, `listPages`,
 *   `getSiteStructure` and `getRelatedPages`.
 * @returns The configured `McpServer` instance.
 */
function createServer(engine) {
  // Shared envelope builder. `indent` is forwarded to JSON.stringify; leaving it
  // undefined yields compact JSON.
  const jsonContent = (payload, indent) => [
    {
      type: "text",
      text: JSON.stringify(payload, null, indent)
    }
  ];
  const server = new mcp_js.McpServer({
    name: "searchsocket-mcp",
    version: "0.1.0"
  });
  server.registerTool(
    "search",
    {
      description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
      inputSchema: {
        query: zod.z.string().min(1),
        scope: zod.z.string().optional(),
        topK: zod.z.number().int().positive().max(100).optional(),
        pathPrefix: zod.z.string().optional(),
        tags: zod.z.array(zod.z.string()).optional(),
        filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
        groupBy: zod.z.enum(["page", "chunk"]).optional(),
        maxSubResults: zod.z.number().int().positive().max(20).optional()
      },
      outputSchema: {
        q: zod.z.string(),
        scope: zod.z.string(),
        results: zod.z.array(zod.z.object({
          url: zod.z.string(),
          title: zod.z.string(),
          sectionTitle: zod.z.string().optional(),
          snippet: zod.z.string(),
          // The engine emits chunkText on results and sub-results (and the tool
          // description promises it), so the schema must declare it; otherwise
          // a client validating structuredContent against the published schema
          // may reject the response as having undeclared properties.
          chunkText: zod.z.string().optional(),
          score: zod.z.number(),
          routeFile: zod.z.string(),
          chunks: zod.z.array(zod.z.object({
            sectionTitle: zod.z.string().optional(),
            snippet: zod.z.string(),
            chunkText: zod.z.string().optional(),
            headingPath: zod.z.array(zod.z.string()),
            score: zod.z.number()
          })).optional()
        })),
        meta: zod.z.object({
          timingsMs: zod.z.object({
            search: zod.z.number(),
            total: zod.z.number()
          })
        })
      }
    },
    async (input) => {
      const result = await engine.search({
        q: input.query,
        topK: input.topK,
        scope: input.scope,
        pathPrefix: input.pathPrefix,
        tags: input.tags,
        filters: input.filters,
        groupBy: input.groupBy,
        maxSubResults: input.maxSubResults
      });
      return {
        content: jsonContent(result, 2),
        structuredContent: result
      };
    }
  );
  server.registerTool(
    "get_page",
    {
      description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
      inputSchema: {
        pathOrUrl: zod.z.string().min(1),
        scope: zod.z.string().optional()
      }
    },
    async (input) => {
      const page = await engine.getPage(input.pathOrUrl, input.scope);
      return { content: jsonContent(page, 2) };
    }
  );
  server.registerTool(
    "list_pages",
    {
      description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
      inputSchema: {
        pathPrefix: zod.z.string().optional(),
        cursor: zod.z.string().optional(),
        limit: zod.z.number().int().positive().max(200).optional(),
        scope: zod.z.string().optional()
      }
    },
    async (input) => {
      const result = await engine.listPages({
        pathPrefix: input.pathPrefix,
        cursor: input.cursor,
        limit: input.limit,
        scope: input.scope
      });
      return { content: jsonContent(result, 2) };
    }
  );
  server.registerTool(
    "get_site_structure",
    {
      description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
      inputSchema: {
        pathPrefix: zod.z.string().optional(),
        scope: zod.z.string().optional(),
        maxPages: zod.z.number().int().positive().max(2e3).optional()
      }
    },
    async (input) => {
      const result = await engine.getSiteStructure({
        pathPrefix: input.pathPrefix,
        scope: input.scope,
        maxPages: input.maxPages
      });
      return { content: jsonContent(result, 2) };
    }
  );
  server.registerTool(
    "find_source_file",
    {
      description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
      inputSchema: {
        query: zod.z.string().min(1),
        scope: zod.z.string().optional()
      }
    },
    async (input) => {
      // Only the single best match is needed to locate a source file.
      const result = await engine.search({
        q: input.query,
        topK: 1,
        scope: input.scope
      });
      if (result.results.length === 0) {
        return {
          content: jsonContent({
            error: "No matching content found for the given query."
          })
        };
      }
      const { url, routeFile, sectionTitle, snippet } = result.results[0];
      // This tool has always answered with compact JSON; keep it that way.
      return { content: jsonContent({ url, routeFile, sectionTitle, snippet }) };
    }
  );
  server.registerTool(
    "get_related_pages",
    {
      description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
      inputSchema: {
        pathOrUrl: zod.z.string().min(1),
        scope: zod.z.string().optional(),
        topK: zod.z.number().int().positive().max(25).optional()
      }
    },
    async (input) => {
      const result = await engine.getRelatedPages(input.pathOrUrl, {
        topK: input.topK,
        scope: input.scope
      });
      return { content: jsonContent(result, 2) };
    }
  );
  return server;
}
|
|
18815
|
+
|
|
17822
18816
|
// src/sveltekit/handle.ts
|
|
17823
18817
|
var InMemoryRateLimiter = class {
|
|
17824
18818
|
constructor(windowMs, max) {
|
|
@@ -17847,7 +18841,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17847
18841
|
let enginePromise = null;
|
|
17848
18842
|
let configPromise = null;
|
|
17849
18843
|
let apiPath = options.path;
|
|
18844
|
+
let llmsServePath = null;
|
|
18845
|
+
let serveMarkdownVariants = false;
|
|
18846
|
+
let mcpPath;
|
|
18847
|
+
let mcpApiKey;
|
|
18848
|
+
let mcpEnableJsonResponse = true;
|
|
17850
18849
|
let rateLimiter = null;
|
|
18850
|
+
let notConfigured = false;
|
|
17851
18851
|
const getConfig = async () => {
|
|
17852
18852
|
if (!configPromise) {
|
|
17853
18853
|
let configP;
|
|
@@ -17864,6 +18864,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17864
18864
|
}
|
|
17865
18865
|
configPromise = configP.then((config) => {
|
|
17866
18866
|
apiPath = apiPath ?? config.api.path;
|
|
18867
|
+
mcpPath = config.mcp.handle.path;
|
|
18868
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
18869
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
18870
|
+
if (config.llmsTxt.enable) {
|
|
18871
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
18872
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
18873
|
+
}
|
|
17867
18874
|
if (config.api.rateLimit && !isServerless()) {
|
|
17868
18875
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
17869
18876
|
}
|
|
@@ -17873,59 +18880,109 @@ function searchsocketHandle(options = {}) {
|
|
|
17873
18880
|
return configPromise;
|
|
17874
18881
|
};
|
|
17875
18882
|
const getEngine = async () => {
|
|
18883
|
+
if (notConfigured) {
|
|
18884
|
+
throw new SearchSocketError(
|
|
18885
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18886
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18887
|
+
503
|
|
18888
|
+
);
|
|
18889
|
+
}
|
|
17876
18890
|
if (!enginePromise) {
|
|
17877
18891
|
const config = await getConfig();
|
|
17878
18892
|
enginePromise = SearchEngine.create({
|
|
17879
18893
|
cwd: options.cwd,
|
|
17880
18894
|
config
|
|
18895
|
+
}).catch((error) => {
|
|
18896
|
+
enginePromise = null;
|
|
18897
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
18898
|
+
notConfigured = true;
|
|
18899
|
+
throw new SearchSocketError(
|
|
18900
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18901
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18902
|
+
503
|
|
18903
|
+
);
|
|
18904
|
+
}
|
|
18905
|
+
throw error;
|
|
17881
18906
|
});
|
|
17882
18907
|
}
|
|
17883
18908
|
return enginePromise;
|
|
17884
18909
|
};
|
|
17885
18910
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
17886
18911
|
return async ({ event, resolve }) => {
|
|
17887
|
-
if (apiPath && event.url.pathname !==
|
|
17888
|
-
|
|
18912
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
18913
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
18914
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18915
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18916
|
+
}
|
|
18917
|
+
if (mcpPath) {
|
|
18918
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
18919
|
+
return resolve(event);
|
|
18920
|
+
}
|
|
18921
|
+
} else {
|
|
18922
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
18923
|
+
await getConfig();
|
|
18924
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18925
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18926
|
+
}
|
|
18927
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
18928
|
+
return resolve(event);
|
|
18929
|
+
}
|
|
18930
|
+
} else {
|
|
18931
|
+
return resolve(event);
|
|
18932
|
+
}
|
|
18933
|
+
}
|
|
17889
18934
|
}
|
|
17890
18935
|
const config = await getConfig();
|
|
18936
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
18937
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18938
|
+
const filePath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
|
|
18939
|
+
try {
|
|
18940
|
+
const content = await fs9__default.default.readFile(filePath, "utf8");
|
|
18941
|
+
return new Response(content, {
|
|
18942
|
+
status: 200,
|
|
18943
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
18944
|
+
});
|
|
18945
|
+
} catch {
|
|
18946
|
+
return resolve(event);
|
|
18947
|
+
}
|
|
18948
|
+
}
|
|
18949
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
18950
|
+
let rawPath;
|
|
18951
|
+
try {
|
|
18952
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
18953
|
+
} catch {
|
|
18954
|
+
return resolve(event);
|
|
18955
|
+
}
|
|
18956
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
18957
|
+
try {
|
|
18958
|
+
const engine = await getEngine();
|
|
18959
|
+
const page = await engine.getPage(rawPath, scope);
|
|
18960
|
+
return new Response(page.markdown, {
|
|
18961
|
+
status: 200,
|
|
18962
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
18963
|
+
});
|
|
18964
|
+
} catch (error) {
|
|
18965
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
18966
|
+
return resolve(event);
|
|
18967
|
+
}
|
|
18968
|
+
throw error;
|
|
18969
|
+
}
|
|
18970
|
+
}
|
|
18971
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18972
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18973
|
+
}
|
|
17891
18974
|
const targetPath = apiPath ?? config.api.path;
|
|
17892
|
-
if (event.url.pathname
|
|
18975
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
17893
18976
|
return resolve(event);
|
|
17894
18977
|
}
|
|
17895
|
-
|
|
18978
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
18979
|
+
const method = event.request.method;
|
|
18980
|
+
if (method === "OPTIONS") {
|
|
17896
18981
|
return new Response(null, {
|
|
17897
18982
|
status: 204,
|
|
17898
18983
|
headers: buildCorsHeaders(event.request, config)
|
|
17899
18984
|
});
|
|
17900
18985
|
}
|
|
17901
|
-
if (event.request.method !== "POST") {
|
|
17902
|
-
return withCors(
|
|
17903
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
17904
|
-
status: 405,
|
|
17905
|
-
headers: {
|
|
17906
|
-
"content-type": "application/json"
|
|
17907
|
-
}
|
|
17908
|
-
}),
|
|
17909
|
-
event.request,
|
|
17910
|
-
config
|
|
17911
|
-
);
|
|
17912
|
-
}
|
|
17913
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
17914
|
-
if (contentLength > bodyLimit) {
|
|
17915
|
-
return withCors(
|
|
17916
|
-
new Response(
|
|
17917
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
17918
|
-
{
|
|
17919
|
-
status: 413,
|
|
17920
|
-
headers: {
|
|
17921
|
-
"content-type": "application/json"
|
|
17922
|
-
}
|
|
17923
|
-
}
|
|
17924
|
-
),
|
|
17925
|
-
event.request,
|
|
17926
|
-
config
|
|
17927
|
-
);
|
|
17928
|
-
}
|
|
17929
18986
|
if (rateLimiter) {
|
|
17930
18987
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
17931
18988
|
if (!rateLimiter.check(ip)) {
|
|
@@ -17945,39 +19002,32 @@ function searchsocketHandle(options = {}) {
|
|
|
17945
19002
|
}
|
|
17946
19003
|
}
|
|
17947
19004
|
try {
|
|
17948
|
-
|
|
17949
|
-
|
|
17950
|
-
|
|
17951
|
-
|
|
17952
|
-
|
|
17953
|
-
|
|
17954
|
-
|
|
17955
|
-
|
|
17956
|
-
|
|
17957
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
17958
|
-
}
|
|
17959
|
-
throw error;
|
|
19005
|
+
if (method === "GET") {
|
|
19006
|
+
if (subPath === "" || subPath === "/") {
|
|
19007
|
+
return await handleGetSearch(event, config, getEngine);
|
|
19008
|
+
}
|
|
19009
|
+
if (subPath === "/health") {
|
|
19010
|
+
return await handleGetHealth(event, config, getEngine);
|
|
19011
|
+
}
|
|
19012
|
+
if (subPath.startsWith("/pages/")) {
|
|
19013
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
17960
19014
|
}
|
|
17961
|
-
|
|
19015
|
+
return withCors(
|
|
19016
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
19017
|
+
status: 404,
|
|
19018
|
+
headers: { "content-type": "application/json" }
|
|
19019
|
+
}),
|
|
19020
|
+
event.request,
|
|
19021
|
+
config
|
|
19022
|
+
);
|
|
17962
19023
|
}
|
|
17963
|
-
if (
|
|
17964
|
-
|
|
19024
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
19025
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
17965
19026
|
}
|
|
17966
|
-
let body;
|
|
17967
|
-
try {
|
|
17968
|
-
body = JSON.parse(rawBody);
|
|
17969
|
-
} catch {
|
|
17970
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
17971
|
-
}
|
|
17972
|
-
const engine = await getEngine();
|
|
17973
|
-
const searchRequest = body;
|
|
17974
|
-
const result = await engine.search(searchRequest);
|
|
17975
19027
|
return withCors(
|
|
17976
|
-
new Response(JSON.stringify(
|
|
17977
|
-
status:
|
|
17978
|
-
headers: {
|
|
17979
|
-
"content-type": "application/json"
|
|
17980
|
-
}
|
|
19028
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
19029
|
+
status: 405,
|
|
19030
|
+
headers: { "content-type": "application/json" }
|
|
17981
19031
|
}),
|
|
17982
19032
|
event.request,
|
|
17983
19033
|
config
|
|
@@ -17998,6 +19048,183 @@ function searchsocketHandle(options = {}) {
|
|
|
17998
19048
|
}
|
|
17999
19049
|
};
|
|
18000
19050
|
}
|
|
19051
|
+
/**
 * Reports whether `pathname` is the API mount point itself or a path nested
 * directly beneath it (i.e. followed by a "/" separator).
 *
 * @param {string} pathname - Request path to classify.
 * @param {string} apiPath - Configured API mount path.
 * @returns {boolean} True when the request targets the API.
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) return true;
  // Require the separator so "/api/searchx" is not mistaken for "/api/search".
  return pathname.startsWith(`${apiPath}/`);
}
|
|
19054
|
+
/**
 * Handles `GET <apiPath>?q=...` by translating query-string parameters into a
 * search request, running it, and returning the JSON result via `withCors`.
 *
 * Recognised parameters: `q` (required), `topK`, `scope`, `pathPrefix`,
 * `groupBy` ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`.
 *
 * @param {object} event - Request event exposing `url.searchParams` and `request`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} 200 JSON response containing the search result.
 * @throws {SearchSocketError} INVALID_REQUEST (400) for a missing `q` or any
 *   malformed parameter.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;

  /**
   * Reads an optional integer parameter. Unlike a bare `Number.parseInt`, the
   * whole value must be digits (an optional leading "+" is tolerated), so
   * inputs such as "10abc" or "2.9" are rejected instead of being silently
   * truncated to 10 / 2.
   */
  const readPositiveInt = (name, message, max = Number.POSITIVE_INFINITY) => {
    const raw = params.get(name);
    if (raw === null) return void 0;
    const text = raw.trim();
    const parsed = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed < 1 || parsed > max) {
      throw new SearchSocketError("INVALID_REQUEST", message, 400);
    }
    return parsed;
  };

  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = readPositiveInt("topK", "topK must be a positive integer");
  if (topK !== void 0) searchRequest.topK = topK;

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  // An empty `groupBy=` is treated as "not supplied" rather than as an error.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = readPositiveInt(
    "maxSubResults",
    "maxSubResults must be a positive integer between 1 and 20",
    20
  );
  if (maxSubResults !== void 0) searchRequest.maxSubResults = maxSubResults;

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
19101
|
+
/**
 * Handles `GET <apiPath>/health`: asks the engine for its health report and
 * returns it as a 200 JSON response passed through `withCors`.
 *
 * @param {object} event - Request event; only `event.request` is used.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} JSON response carrying the health report.
 */
async function handleGetHealth(event, config, getEngine) {
  const engine = await getEngine();
  const report = await engine.health();
  const response = new Response(JSON.stringify(report), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19113
|
+
/**
 * Handles `GET <apiPath>/pages/<path>`: decodes the page path that follows the
 * "/pages" prefix, looks the page up in the engine (optionally within the
 * `scope` query parameter) and returns it as JSON via `withCors`.
 *
 * @param {object} event - Request event exposing `url.searchParams` and `request`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @param {string} subPath - Request path relative to the API mount, starting with "/pages/".
 * @returns {Promise<Response>} 200 JSON response with the page payload.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Keep the leading "/" of the page path: only the "/pages" prefix is dropped.
  const encodedPath = subPath.slice("/pages".length);
  let pagePath;
  try {
    pagePath = decodeURIComponent(encodedPath);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
  }
  const scope = event.url.searchParams?.get("scope") ?? void 0;
  const engine = await getEngine();
  const page = await engine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19133
|
+
/**
 * Handles `POST <apiPath>`: reads the JSON body (subject to `bodyLimit`),
 * forwards it to the engine as a search request and returns the JSON result
 * via `withCors`.
 *
 * The size limit is checked twice: first against the declared
 * `content-length` header, then against the actual UTF-8 byte length of the
 * body that was read.
 *
 * @param {object} event - Request event; `event.request` supplies headers and body.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response containing the search result.
 * @throws {SearchSocketError} INVALID_REQUEST with status 413 (too large) or
 *   400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const tooLarge = () => new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  const malformed = () => new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);

  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw tooLarge();
  }

  let rawBody;
  if (typeof event.request.text === "function") {
    rawBody = await event.request.text();
  } else {
    // Request objects without text(): parse via json() and re-serialise so the
    // size check and parsing below run on a string in both cases.
    let parsedFallback;
    try {
      parsedFallback = await event.request.json();
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw malformed();
      }
      throw error;
    }
    rawBody = JSON.stringify(parsedFallback);
  }

  if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
    throw tooLarge();
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(rawBody);
  } catch {
    throw malformed();
  }

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19174
|
+
/**
 * Serves one MCP request over a `WebStandardStreamableHTTPServerTransport`
 * created without a session id generator.
 *
 * When `apiKey` is set, the request must carry `Authorization: Bearer <key>`;
 * the comparison uses `crypto.timingSafeEqual` after a length check. A fresh
 * transport and server are created per request. In JSON-response mode both
 * are closed once the response has been produced.
 *
 * @param {object} event - Request event; `event.request` is passed to the transport.
 * @param {string|undefined} apiKey - Expected bearer token; falsy disables the auth check.
 * @param {boolean} enableJsonResponse - Forwarded to the transport; also triggers eager close.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} The transport's response, or a JSON-RPC error
 *   response with status 401 (unauthorized) or 500 (any failure while serving).
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  const jsonRpcFailure = (status, code, message) => new Response(
    JSON.stringify({
      jsonrpc: "2.0",
      error: { code, message },
      id: null
    }),
    { status, headers: { "content-type": "application/json" } }
  );

  if (apiKey) {
    const header = event.request.headers.get("authorization") ?? "";
    const presented = header.startsWith("Bearer ") ? header.slice(7) : "";
    const presentedBytes = Buffer.from(presented);
    const expectedBytes = Buffer.from(apiKey);
    // timingSafeEqual throws on unequal lengths, so the length check must come first.
    const authorized = presentedBytes.length === expectedBytes.length && crypto.timingSafeEqual(presentedBytes, expectedBytes);
    if (!authorized) {
      return jsonRpcFailure(401, -32001, "Unauthorized");
    }
  }

  const transport = new webStandardStreamableHttp_js.WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the original error.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    const message = error instanceof Error ? error.message : "Internal server error";
    return jsonRpcFailure(500, -32603, message);
  }
}
|
|
18001
19228
|
function buildCorsHeaders(request, config) {
|
|
18002
19229
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
18003
19230
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -18010,7 +19237,7 @@ function buildCorsHeaders(request, config) {
|
|
|
18010
19237
|
}
|
|
18011
19238
|
return {
|
|
18012
19239
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
18013
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
19240
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
18014
19241
|
"access-control-allow-headers": "content-type"
|
|
18015
19242
|
};
|
|
18016
19243
|
}
|
|
@@ -18057,6 +19284,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18057
19284
|
if (normalizeText(current.text)) {
|
|
18058
19285
|
sections.push({
|
|
18059
19286
|
sectionTitle: current.sectionTitle,
|
|
19287
|
+
headingLevel: current.headingLevel,
|
|
18060
19288
|
headingPath: current.headingPath,
|
|
18061
19289
|
text: current.text.trim()
|
|
18062
19290
|
});
|
|
@@ -18075,6 +19303,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18075
19303
|
headingStack.length = level;
|
|
18076
19304
|
current = {
|
|
18077
19305
|
sectionTitle: title,
|
|
19306
|
+
headingLevel: level,
|
|
18078
19307
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
18079
19308
|
text: `${line}
|
|
18080
19309
|
`
|
|
@@ -18210,6 +19439,7 @@ function splitSection(section, config) {
|
|
|
18210
19439
|
return [
|
|
18211
19440
|
{
|
|
18212
19441
|
sectionTitle: section.sectionTitle,
|
|
19442
|
+
headingLevel: section.headingLevel,
|
|
18213
19443
|
headingPath: section.headingPath,
|
|
18214
19444
|
chunkText: text
|
|
18215
19445
|
}
|
|
@@ -18260,6 +19490,7 @@ ${chunk}`;
|
|
|
18260
19490
|
}
|
|
18261
19491
|
return merged.map((chunkText) => ({
|
|
18262
19492
|
sectionTitle: section.sectionTitle,
|
|
19493
|
+
headingLevel: section.headingLevel,
|
|
18263
19494
|
headingPath: section.headingPath,
|
|
18264
19495
|
chunkText
|
|
18265
19496
|
}));
|
|
@@ -18275,6 +19506,18 @@ function buildSummaryChunkText(page) {
|
|
|
18275
19506
|
}
|
|
18276
19507
|
return parts.join("\n\n");
|
|
18277
19508
|
}
|
|
19509
|
+
/**
 * Builds the title string used for a chunk's embedding, combining the page
 * title with the chunk's heading trail.
 *
 * - Returns `undefined` when the chunk has no section title or no heading level.
 * - With more than one heading-path entry, the path is joined with " > "; the
 *   section title is appended only when it is not already the last entry.
 * - Otherwise the result is "<page title> — <section title>".
 *
 * @param {object} chunk - Chunk with `title`, `sectionTitle`, `headingLevel`, `headingPath`.
 * @returns {string|undefined} Embedding title, or `undefined` when not applicable.
 */
function buildEmbeddingTitle(chunk) {
  const { title, sectionTitle, headingLevel, headingPath } = chunk;
  if (!sectionTitle || headingLevel === void 0) return void 0;
  if (headingPath.length <= 1) {
    return `${title} \u2014 ${sectionTitle}`;
  }
  const trail = headingPath.join(" > ");
  const tail = headingPath[headingPath.length - 1];
  return tail === sectionTitle ? `${title} \u2014 ${trail}` : `${title} \u2014 ${trail} > ${sectionTitle}`;
}
|
|
18278
19521
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
18279
19522
|
if (!prependTitle) return chunk.chunkText;
|
|
18280
19523
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -18305,10 +19548,14 @@ function chunkPage(page, config, scope) {
|
|
|
18305
19548
|
tags: page.tags,
|
|
18306
19549
|
contentHash: "",
|
|
18307
19550
|
description: page.description,
|
|
18308
|
-
keywords: page.keywords
|
|
19551
|
+
keywords: page.keywords,
|
|
19552
|
+
publishedAt: page.publishedAt,
|
|
19553
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19554
|
+
meta: page.meta
|
|
18309
19555
|
};
|
|
18310
19556
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18311
|
-
|
|
19557
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19558
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
18312
19559
|
chunks.push(summaryChunk);
|
|
18313
19560
|
}
|
|
18314
19561
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -18325,6 +19572,7 @@ function chunkPage(page, config, scope) {
|
|
|
18325
19572
|
path: page.url,
|
|
18326
19573
|
title: page.title,
|
|
18327
19574
|
sectionTitle: entry.sectionTitle,
|
|
19575
|
+
headingLevel: entry.headingLevel,
|
|
18328
19576
|
headingPath: entry.headingPath,
|
|
18329
19577
|
chunkText: entry.chunkText,
|
|
18330
19578
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -18334,10 +19582,16 @@ function chunkPage(page, config, scope) {
|
|
|
18334
19582
|
tags: page.tags,
|
|
18335
19583
|
contentHash: "",
|
|
18336
19584
|
description: page.description,
|
|
18337
|
-
keywords: page.keywords
|
|
19585
|
+
keywords: page.keywords,
|
|
19586
|
+
publishedAt: page.publishedAt,
|
|
19587
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19588
|
+
meta: page.meta
|
|
18338
19589
|
};
|
|
18339
19590
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18340
|
-
|
|
19591
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
19592
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19593
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
19594
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
18341
19595
|
chunks.push(chunk);
|
|
18342
19596
|
}
|
|
18343
19597
|
return chunks;
|
|
@@ -19170,6 +20424,69 @@ function gfm(turndownService) {
|
|
|
19170
20424
|
}
|
|
19171
20425
|
|
|
19172
20426
|
// src/indexing/extractor.ts
|
|
20427
|
+
/**
 * Converts a date-like value to epoch milliseconds.
 *
 * Accepts `Date` instances, strings (parsed with `new Date(value)`) and
 * numbers (taken as-is). Anything else, or any value that does not yield a
 * finite number, produces `undefined`.
 *
 * @param {unknown} value - Candidate date value.
 * @returns {number|undefined} Finite timestamp in milliseconds, or `undefined`.
 */
function normalizeDateToMs(value) {
  if (value == null) return void 0;
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else if (typeof value === "string") {
    ms = new Date(value).getTime();
  } else if (typeof value === "number") {
    ms = value;
  } else {
    return void 0;
  }
  // Invalid dates yield NaN; NaN and +/-Infinity are both rejected here.
  return Number.isFinite(ms) ? ms : void 0;
}
|
|
20442
|
+
// Frontmatter keys consulted for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];
/**
 * Returns the first usable timestamp found among the known frontmatter date
 * fields, checked in the order of `FRONTMATTER_DATE_FIELDS`.
 *
 * @param {object} data - Parsed frontmatter.
 * @returns {number|undefined} Epoch milliseconds, or `undefined` when no field
 *   holds a value that `normalizeDateToMs` accepts.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let idx = 0; idx < FRONTMATTER_DATE_FIELDS.length; idx++) {
    const timestamp = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[idx]]);
    if (timestamp !== void 0) {
      return timestamp;
    }
  }
  return void 0;
}
|
|
20450
|
+
/**
 * Extracts a publication timestamp from an HTML document.
 *
 * Sources are tried in this order and the first usable value wins:
 *   1. `datePublished` in JSON-LD scripts (top-level object, top-level array,
 *      or entries of a top-level object's `@graph`),
 *   2. `<meta property="article:published_time">`,
 *   3. `<meta itemprop="datePublished">` / `<time itemprop="datePublished">`,
 *   4. the first `<time datetime>` element.
 *
 * @param {Function} $ - Document query function (cheerio-style: selector or node in, wrapped set out).
 * @returns {number|undefined} Epoch milliseconds, or `undefined` when nothing usable is found.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const candidates = [];
      if (Array.isArray(parsed)) {
        candidates.push(...parsed);
      } else if (parsed && typeof parsed === "object") {
        candidates.push(parsed);
        if (Array.isArray(parsed["@graph"])) {
          candidates.push(...parsed["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // `?.` keeps a null entry from throwing into the catch below, which
        // would skip every remaining candidate of this script.
        const val = normalizeDateToMs(candidate?.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Deliberate best-effort: malformed JSON-LD is ignored and the next
      // script (or the meta/time fallbacks) is tried instead.
    }
  }
  const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
  if (ogTime) {
    const val = normalizeDateToMs(ogTime);
    if (val !== void 0) return val;
  }
  const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
  if (itempropDate) {
    const val = normalizeDateToMs(itempropDate);
    if (val !== void 0) return val;
  }
  const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
  if (timeEl) {
    const val = normalizeDateToMs(timeEl);
    if (val !== void 0) return val;
  }
  return void 0;
}
|
|
19173
20490
|
function hasTopLevelNoindexComment(markdown) {
|
|
19174
20491
|
const lines = markdown.split(/\r?\n/);
|
|
19175
20492
|
let inFence = false;
|
|
@@ -19185,6 +20502,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
19185
20502
|
}
|
|
19186
20503
|
return false;
|
|
19187
20504
|
}
|
|
20505
|
+
// Generic single-word alt texts that say nothing about the image's content.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);
// Matches text ending in an image file extension, optionally followed by a query string.
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
/**
 * Decides whether an image's alt text is descriptive enough to keep.
 *
 * Rejected: text shorter than 5 characters after trimming, text that looks
 * like an image filename, and generic words listed in `GARBAGE_ALT_WORDS`
 * (compared case-insensitively).
 *
 * @param {string} alt - Raw alt attribute value.
 * @returns {boolean} True when the alt text should be used.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  if (text.length < 5) return false;
  const looksLikeFilename = IMAGE_EXT_RE.test(text);
  const isGenericWord = GARBAGE_ALT_WORDS.has(text.toLowerCase());
  return !looksLikeFilename && !isGenericWord;
}
|
|
20531
|
+
/**
 * Picks the best textual stand-in for an image.
 *
 * Priority: the description attribute on the image, then the same attribute
 * on an enclosing `<figure>`, then "alt — caption", alt alone, or the
 * figcaption alone. Alt text only counts when `isMeaningfulAlt` accepts it.
 *
 * @param {object} img - Wrapped `<img>` element.
 * @param {Function} $ - Document query function (not used by this function).
 * @param {string} imageDescAttr - Name of the custom description attribute.
 * @returns {string|null} Replacement text, or `null` when nothing usable exists.
 */
function resolveImageText(img, $, imageDescAttr) {
  const ownDescription = img.attr(imageDescAttr)?.trim();
  if (ownDescription) return ownDescription;

  const figure = img.closest("figure");
  const inFigure = Boolean(figure.length);
  if (inFigure) {
    const figureDescription = figure.attr(imageDescAttr)?.trim();
    if (figureDescription) return figureDescription;
  }

  const alt = img.attr("alt")?.trim() ?? "";
  const caption = inFigure ? figure.find("figcaption").first().text().trim() : "";
  const altUsable = isMeaningfulAlt(alt);

  if (altUsable) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption ? caption : null;
}
|
|
20552
|
+
// Link texts that carry no information about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);
/**
 * Normalises link text for use as anchor text: collapses whitespace, trims
 * and lower-cases it. Text shorter than 3 characters or listed in
 * `STOP_ANCHORS` becomes the empty string; anything longer than 100
 * characters is cut to its first 100.
 *
 * @param {string} raw - Raw text content of the link.
 * @returns {string} Normalised anchor text, or "" when it is not useful.
 */
function normalizeAnchorText(raw) {
  const text = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const unusable = text.length < 3 || STOP_ANCHORS.has(text);
  if (unusable) return "";
  return text.length > 100 ? text.slice(0, 100) : text;
}
|
|
20568
|
+
/**
 * Escapes the characters that are significant in HTML text content so the
 * result can be embedded between tags without being parsed as markup.
 *
 * "&" is replaced first so the entities produced for "<" and ">" are not
 * escaped a second time.
 *
 * @param {string} text - Untrusted text.
 * @returns {string} Text with `&`, `<` and `>` replaced by HTML entities.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
20571
|
+
/**
 * Replaces images under `root2` with their textual description, in place.
 *
 * `<picture>` elements are handled first (described by their first `<img>`),
 * then every remaining `<img>`. An element with usable text becomes a
 * `<span>` holding the escaped text, and any `<figcaption>` of an enclosing
 * `<figure>` is removed; an element without usable text is removed outright.
 *
 * @param {object} root2 - Wrapped root element to process.
 * @param {Function} $ - Document query function used to wrap raw nodes.
 * @param {string} imageDescAttr - Name of the custom description attribute.
 * @returns {void}
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Swap `node` for a span with the text resolved from `img`, or drop it.
  const substitute = (node, img) => {
    const parentFigure = node.closest("figure");
    const text = img.length ? resolveImageText(img, $, imageDescAttr) : null;
    if (!text) {
      node.remove();
      return;
    }
    if (parentFigure.length) parentFigure.find("figcaption").remove();
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };

  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    substitute(picture, picture.find("img").first());
  });
  root2.find("img").each((_i, el) => {
    const img = $(el);
    substitute(img, img);
  });
}
|
|
19188
20596
|
function extractFromHtml(url, html, config) {
|
|
19189
20597
|
const $ = cheerio.load(html);
|
|
19190
20598
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -19210,6 +20618,20 @@ function extractFromHtml(url, html, config) {
|
|
|
19210
20618
|
if (weight === 0) {
|
|
19211
20619
|
return null;
|
|
19212
20620
|
}
|
|
20621
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
20622
|
+
return null;
|
|
20623
|
+
}
|
|
20624
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
20625
|
+
const meta = {};
|
|
20626
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
20627
|
+
const name = $(el).attr("name") ?? "";
|
|
20628
|
+
const key = name.slice("searchsocket:".length);
|
|
20629
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
20630
|
+
const content = $(el).attr("content") ?? "";
|
|
20631
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
20632
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
20633
|
+
});
|
|
20634
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
19213
20635
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19214
20636
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19215
20637
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19221,7 +20643,9 @@ function extractFromHtml(url, html, config) {
|
|
|
19221
20643
|
root2.find(selector).remove();
|
|
19222
20644
|
}
|
|
19223
20645
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
20646
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
19224
20647
|
const outgoingLinks = [];
|
|
20648
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
19225
20649
|
root2.find("a[href]").each((_index, node) => {
|
|
19226
20650
|
const href = $(node).attr("href");
|
|
19227
20651
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -19232,7 +20656,19 @@ function extractFromHtml(url, html, config) {
|
|
|
19232
20656
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
19233
20657
|
return;
|
|
19234
20658
|
}
|
|
19235
|
-
|
|
20659
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
20660
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
20661
|
+
if (!anchorText) {
|
|
20662
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
20663
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
20664
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
20665
|
+
}
|
|
20666
|
+
}
|
|
20667
|
+
const key = `${url2}|${anchorText}`;
|
|
20668
|
+
if (!seenLinkKeys.has(key)) {
|
|
20669
|
+
seenLinkKeys.add(key);
|
|
20670
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
20671
|
+
}
|
|
19236
20672
|
} catch {
|
|
19237
20673
|
}
|
|
19238
20674
|
});
|
|
@@ -19257,16 +20693,25 @@ function extractFromHtml(url, html, config) {
|
|
|
19257
20693
|
return null;
|
|
19258
20694
|
}
|
|
19259
20695
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
20696
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
20697
|
+
if (componentTags) {
|
|
20698
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
20699
|
+
for (const t of extraTags) {
|
|
20700
|
+
if (!tags.includes(t)) tags.push(t);
|
|
20701
|
+
}
|
|
20702
|
+
}
|
|
19260
20703
|
return {
|
|
19261
20704
|
url: normalizeUrlPath(url),
|
|
19262
20705
|
title,
|
|
19263
20706
|
markdown,
|
|
19264
|
-
outgoingLinks
|
|
20707
|
+
outgoingLinks,
|
|
19265
20708
|
noindex: false,
|
|
19266
20709
|
tags,
|
|
19267
20710
|
description,
|
|
19268
20711
|
keywords,
|
|
19269
|
-
weight
|
|
20712
|
+
weight,
|
|
20713
|
+
publishedAt,
|
|
20714
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
19270
20715
|
};
|
|
19271
20716
|
}
|
|
19272
20717
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19287,6 +20732,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19287
20732
|
if (mdWeight === 0) {
|
|
19288
20733
|
return null;
|
|
19289
20734
|
}
|
|
20735
|
+
let mdMeta;
|
|
20736
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
20737
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
20738
|
+
const metaObj = {};
|
|
20739
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
20740
|
+
if (!validateMetaKey(key)) continue;
|
|
20741
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
20742
|
+
metaObj[key] = val;
|
|
20743
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
20744
|
+
metaObj[key] = val;
|
|
20745
|
+
} else if (val instanceof Date) {
|
|
20746
|
+
metaObj[key] = val.getTime();
|
|
20747
|
+
}
|
|
20748
|
+
}
|
|
20749
|
+
if (Object.keys(metaObj).length > 0) {
|
|
20750
|
+
mdMeta = metaObj;
|
|
20751
|
+
}
|
|
20752
|
+
}
|
|
19290
20753
|
const content = parsed.content;
|
|
19291
20754
|
const normalized = normalizeMarkdown(content);
|
|
19292
20755
|
if (!normalizeText(normalized)) {
|
|
@@ -19301,6 +20764,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19301
20764
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
19302
20765
|
}
|
|
19303
20766
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
20767
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
19304
20768
|
return {
|
|
19305
20769
|
url: normalizeUrlPath(url),
|
|
19306
20770
|
title: resolvedTitle,
|
|
@@ -19310,7 +20774,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19310
20774
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19311
20775
|
description: fmDescription,
|
|
19312
20776
|
keywords: fmKeywords,
|
|
19313
|
-
weight: mdWeight
|
|
20777
|
+
weight: mdWeight,
|
|
20778
|
+
publishedAt,
|
|
20779
|
+
meta: mdMeta
|
|
19314
20780
|
};
|
|
19315
20781
|
}
|
|
19316
20782
|
function segmentToRegex(segment) {
|
|
@@ -19473,7 +20939,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
19473
20939
|
const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
19474
20940
|
let content;
|
|
19475
20941
|
try {
|
|
19476
|
-
content = await
|
|
20942
|
+
content = await fs9__default.default.readFile(manifestPath, "utf8");
|
|
19477
20943
|
} catch {
|
|
19478
20944
|
throw new SearchSocketError(
|
|
19479
20945
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19784,6 +21250,125 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19784
21250
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19785
21251
|
return normalizeUrlPath(noExt || "/");
|
|
19786
21252
|
}
|
|
21253
|
+
// SvelteKit route files: +page / +layout / +error, optionally with an "@name" suffix.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;
/**
 * Reports whether `filePath` names a `.svelte` file that is not one of the
 * route files matched by `ROUTE_FILE_RE`.
 *
 * @param {string} filePath - Path of the file to classify.
 * @returns {boolean} True for non-route `.svelte` files.
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
|
|
21258
|
+
/**
 * Extracts documentation metadata from Svelte component source text.
 *
 * - `description` comes from an `<!-- @component ... -->` comment, if present.
 * - `props` come from a `let { ... } = $props()` destructuring. Each entry
 *   records the prop name, its default expression (when one is written) and
 *   its type, looked up either in a named interface/type alias declared in
 *   the same source or in an inline `{ ... }` annotation.
 *
 * Rest elements (`...rest`) are skipped.
 *
 * @param {string} source - Full text of the `.svelte` file.
 * @returns {{description: (string|undefined), props: Array<{name: string, type?: string, default?: string}>}}
 */
function extractSvelteComponentMeta(source) {
  const docComment = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
  const description = docComment?.[1]?.trim() || void 0;
  const props = [];

  const propsMatch = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsMatch) {
    return { description, props };
  }

  const destructureBlock = propsMatch[1];
  const typeAnnotation = propsMatch[2]?.trim();

  // Resolve the prop-name -> type lookup from whichever annotation form is used.
  let typeLookup;
  if (typeAnnotation) {
    if (/^[A-Z]\w*$/.test(typeAnnotation)) {
      typeLookup = resolveTypeReference(source, typeAnnotation);
    } else if (typeAnnotation.startsWith("{")) {
      typeLookup = parseInlineTypeAnnotation(typeAnnotation);
    }
  }

  for (const rawEntry of splitDestructureBlock(destructureBlock)) {
    const entry = rawEntry.trim();
    if (!entry || entry.startsWith("...")) continue;

    // "name: alias [= default]" is tried before "name = default".
    const renamed = entry.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/);
    const matched = renamed ? renamed : entry.match(/^(\w+)\s*=\s*([\s\S]+)$/);

    let propName;
    let defaultValue;
    if (matched) {
      propName = matched[1];
      defaultValue = matched[2]?.trim();
    } else {
      propName = entry.match(/^(\w+)/)?.[1] ?? entry;
    }

    const prop = { name: propName };
    const propType = typeLookup?.get(propName);
    if (propType) prop.type = propType;
    if (defaultValue) prop.default = defaultValue;
    props.push(prop);
  }

  return { description, props };
}
|
|
21303
|
+
/**
 * Splits the inside of a destructuring pattern into its top-level entries.
 *
 * Commas separate entries only at nesting depth 0, where depth is tracked
 * across `{}`, `[]` and `()`. String literals (single, double or backtick
 * quoted, with backslash escapes) and `//` / block comments are copied
 * verbatim, so a comma, bracket or quote inside them never affects the split.
 * For example `label = "a, b", size` yields two entries, not three.
 *
 * Entries are returned untrimmed; a trailing whitespace-only entry is dropped.
 * Template literals are treated as plain backtick-delimited strings, so a
 * backtick nested inside `${...}` is not supported.
 *
 * @param {string} block - Text between the braces of the destructuring pattern.
 * @returns {string[]} Top-level entries in source order.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let current = "";
  let quote = null; // delimiter of the string literal being read, if any
  for (let i = 0; i < block.length; i++) {
    const ch = block[i];

    if (quote !== null) {
      current += ch;
      if (ch === "\\" && i + 1 < block.length) {
        // Copy the escaped character too, so an escaped quote cannot close the string.
        current += block[++i];
      } else if (ch === quote) {
        quote = null;
      }
      continue;
    }

    if (ch === "/" && (block[i + 1] === "/" || block[i + 1] === "*")) {
      // Copy a comment through untouched, up to its terminator or the end of input.
      const isLine = block[i + 1] === "/";
      const found = isLine ? block.indexOf("\n", i) : block.indexOf("*/", i + 2);
      let stop = block.length;
      if (found !== -1) stop = isLine ? found : found + 2;
      current += block.slice(i, stop);
      i = stop - 1;
      continue;
    }

    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      depth--;
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  if (current.trim()) entries.push(current);
  return entries;
}
|
|
21324
|
+
/**
 * Finds `interface <typeName> {` or `type <typeName> = {` in `source` and
 * returns its members as a name -> type map.
 *
 * The body is delimited by counting braces from the opening one, so nested
 * object types stay inside the body.
 *
 * @param {string} source - Source text to search.
 * @param {string} typeName - Identifier of the interface or type alias; it is
 *   interpolated into a regular expression without escaping.
 * @returns {Map<string, string>|undefined} Member map, or `undefined` when the
 *   declaration is not found or its braces never balance.
 */
function resolveTypeReference(source, typeName) {
  const declarationRe = new RegExp(`(?:interface\\s+${typeName}\\s*|type\\s+${typeName}\\s*=\\s*)\\{`);
  const declaration = source.match(declarationRe);
  if (!declaration || declaration.index === void 0) return void 0;

  const bodyStart = declaration.index + declaration[0].length;
  let open = 1; // the declaration's own "{" is already consumed
  let cursor = bodyStart;
  for (; cursor < source.length && open > 0; cursor++) {
    const ch = source[cursor];
    if (ch === "{") open++;
    else if (ch === "}") open--;
  }
  if (open !== 0) return void 0;

  // `cursor` sits one past the closing brace; exclude that brace from the body.
  return parseTypeMembers(source.slice(bodyStart, cursor - 1));
}
|
|
21340
|
+
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`
 * into a name -> type map by stripping one leading "{" and one trailing "}"
 * and delegating to `parseTypeMembers`.
 *
 * @param {string} annotation - Annotation text, expected to be brace-wrapped.
 * @returns {Map<string, string>} Member map.
 */
function parseInlineTypeAnnotation(annotation) {
  const withoutOpen = annotation.replace(/^\{/, "");
  const body = withoutOpen.replace(/\}$/, "");
  return parseTypeMembers(body);
}
|
|
21344
|
+
/**
 * Parses the body of an object type into a map of member name to type text.
 *
 * Members are separated by `;`, `,` or a newline, but only when the separator
 * is outside every `{}`, `[]`, `()` and `<>` pair. Nested object types, tuples,
 * function signatures and generic arguments therefore stay attached to their
 * member. Block and line comments are removed first so their contents cannot
 * disturb the nesting count.
 *
 * @param {string} body - Text between the outer braces of the type.
 * @returns {Map<string, string>} Member names (without `?` or `readonly`)
 *   mapped to their type text with whitespace runs collapsed to one space.
 *   Segments that are not `name: type` are ignored.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();
  const withoutComments = body.replace(/\/\*[\s\S]*?\*\//g, " ").replace(/\/\/[^\n]*/g, "");
  const members = [];
  let depth = 0;
  let current = "";
  let prev = "";
  for (const ch of withoutComments) {
    if (ch === "{" || ch === "[" || ch === "(" || ch === "<") {
      depth++;
    } else if (ch === "}" || ch === "]" || ch === ")" || (ch === ">" && prev !== "=")) {
      // The `>` of an arrow (`=>`) is not a closing bracket. Never go below 0
      // so one stray closer cannot disable splitting for the rest of the body.
      if (depth > 0) depth--;
    }
    if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
    } else {
      current += ch;
    }
    prev = ch;
  }
  members.push(current);
  for (const raw of members) {
    const member = raw.trim();
    if (!member) continue;
    const memberMatch = member.match(/^(?:readonly\s+)?([\w$]+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      map.set(memberMatch[1], memberMatch[2].replace(/\s+/g, " ").trim());
    }
  }
  return map;
}
|
|
21355
|
+
/**
 * Builds a one-paragraph text summary of a component from its metadata.
 *
 * @param {string} componentName - Name used in the leading "<name> component." sentence.
 * @param {{ description?: string, props: Array<{ name: string, type?: string, default?: string }> }} meta
 *   Component description and prop list.
 * @returns {string} Space-joined sentences, or "" when there is neither a
 *   description nor any prop.
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  const hasProps = props.length > 0;
  if (!description && !hasProps) return "";

  // Renders one prop as `name (type) default: value`, omitting absent pieces.
  const describeProp = ({ name, type, default: fallback }) => {
    const typePart = type ? ` (${type})` : "";
    const defaultPart = fallback ? ` default: ${fallback}` : "";
    return `${name}${typePart}${defaultPart}`;
  };

  const sentences = [`${componentName} component.`];
  if (description) sentences.push(description);
  if (hasProps) sentences.push(`Props: ${props.map(describeProp).join(", ")}.`);
  return sentences.join(" ");
}
|
|
19787
21372
|
/**
 * Reduces Svelte source to its plain text content.
 *
 * Applied in order: drop `<script>` blocks, drop `<style>` blocks, replace
 * remaining tags with a space, replace `{...}` expressions with a space,
 * collapse whitespace runs, then trim.
 *
 * @param {string} source - Raw Svelte file contents.
 * @returns {string} The remaining text on a single line.
 */
function normalizeSvelteToMarkdown(source) {
  const passes = [
    [/<script[\s\S]*?<\/script>/g, ""],
    [/<style[\s\S]*?<\/style>/g, ""],
    [/<[^>]+>/g, " "],
    [/\{[^}]+\}/g, " "],
    [/\s+/g, " "]
  ];
  let text = source;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
@@ -19802,13 +21387,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19802
21387
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19803
21388
|
const pages = [];
|
|
19804
21389
|
for (const filePath of selected) {
|
|
19805
|
-
const raw = await
|
|
19806
|
-
|
|
21390
|
+
const raw = await fs9__default.default.readFile(filePath, "utf8");
|
|
21391
|
+
let markdown;
|
|
21392
|
+
let tags;
|
|
21393
|
+
if (filePath.endsWith(".md")) {
|
|
21394
|
+
markdown = raw;
|
|
21395
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
21396
|
+
const componentName = path__default.default.basename(filePath, ".svelte");
|
|
21397
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
21398
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
21399
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
21400
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
21401
|
+
tags = ["component"];
|
|
21402
|
+
} else {
|
|
21403
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
21404
|
+
}
|
|
19807
21405
|
pages.push({
|
|
19808
21406
|
url: filePathToUrl(filePath, baseDir),
|
|
19809
21407
|
markdown,
|
|
19810
21408
|
sourcePath: path__default.default.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19811
|
-
outgoingLinks: []
|
|
21409
|
+
outgoingLinks: [],
|
|
21410
|
+
...tags ? { tags } : {}
|
|
19812
21411
|
});
|
|
19813
21412
|
}
|
|
19814
21413
|
return pages;
|
|
@@ -19938,7 +21537,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19938
21537
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19939
21538
|
const pages = [];
|
|
19940
21539
|
for (const filePath of selected) {
|
|
19941
|
-
const html = await
|
|
21540
|
+
const html = await fs9__default.default.readFile(filePath, "utf8");
|
|
19942
21541
|
pages.push({
|
|
19943
21542
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19944
21543
|
html,
|
|
@@ -20001,7 +21600,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
20001
21600
|
}
|
|
20002
21601
|
async function loadRobotsTxtFromDir(dir) {
|
|
20003
21602
|
try {
|
|
20004
|
-
const content = await
|
|
21603
|
+
const content = await fs9__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
|
|
20005
21604
|
return parseRobotsTxt(content);
|
|
20006
21605
|
} catch {
|
|
20007
21606
|
return null;
|
|
@@ -20018,6 +21617,81 @@ async function fetchRobotsTxt(baseUrl) {
|
|
|
20018
21617
|
return null;
|
|
20019
21618
|
}
|
|
20020
21619
|
}
|
|
21620
|
+
/**
 * Resolves a page URL against an optional base URL.
 *
 * @param {string} pageUrl - Page URL or path.
 * @param {string | undefined} baseUrl - Base to resolve against; when falsy the
 *   page URL is returned as given.
 * @returns {string} The absolute `href`, or `pageUrl` unchanged when there is
 *   no base or the `URL` constructor rejects the input.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  let resolved = pageUrl;
  if (baseUrl) {
    try {
      resolved = new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable input: fall back to the page URL as given.
    }
  }
  return resolved;
}
|
|
21628
|
+
/**
 * Renders the llms.txt index: a title, an optional blockquote description and
 * a "## Pages" list with one markdown link per page.
 *
 * Pages at `/llms.txt` and `/llms-full.txt` are left out. The rest are ordered
 * by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Pages; reads `url`, `title`, `description`,
 *   `depth` and `incomingLinks`.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` and `project.baseUrl`.
 * @returns {string} The file contents, ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const { llmsTxt, project } = config;
  const heading = llmsTxt.title ?? project.id;
  const output = [`# ${heading}`];
  if (llmsTxt.description) {
    output.push("", `> ${llmsTxt.description}`);
  }

  const excluded = /* @__PURE__ */ new Set(["/llms.txt", "/llms-full.txt"]);
  // `filter` returns a fresh array, so sorting it leaves the caller's untouched.
  const ordered = pages
    .filter((page) => !excluded.has(page.url))
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));

  if (ordered.length > 0) {
    output.push("", "## Pages", "");
    ordered.forEach((page) => {
      const link = `- [${page.title}](${resolvePageUrl(page.url, project.baseUrl)})`;
      output.push(page.description ? `${link}: ${page.description}` : link);
    });
  }
  output.push("");
  return output.join("\n");
}
|
|
21657
|
+
/**
 * Renders the llms-full.txt document: a title, an optional blockquote
 * description, then one `---`-separated section per page holding a linked
 * heading and the page's trimmed markdown.
 *
 * Pages at `/llms.txt` and `/llms-full.txt` are left out. The rest are ordered
 * by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Pages; reads `url`, `title`, `markdown`,
 *   `depth` and `incomingLinks`.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` and `project.baseUrl`.
 * @returns {string} The file contents, ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const { llmsTxt, project } = config;
  const heading = llmsTxt.title ?? project.id;
  const output = [`# ${heading}`];
  if (llmsTxt.description) {
    output.push("", `> ${llmsTxt.description}`);
  }

  const excluded = /* @__PURE__ */ new Set(["/llms.txt", "/llms-full.txt"]);
  // `filter` returns a fresh array, so sorting it leaves the caller's untouched.
  const ordered = pages
    .filter((page) => !excluded.has(page.url))
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));

  ordered.forEach((page) => {
    const href = resolvePageUrl(page.url, project.baseUrl);
    output.push("", "---", "", `## [${page.title}](${href})`, "");
    output.push(page.markdown.trim());
  });
  output.push("");
  return output.join("\n");
}
|
|
21680
|
+
/**
 * Writes the llms.txt index and, when `config.llmsTxt.generateFull` is set,
 * the companion "-full" document next to it.
 *
 * The companion file name is derived by inserting `-full` before the
 * extension of the output path. This matches the previous
 * `llms.txt` -> `llms-full.txt` behaviour, and also yields a distinct path
 * when the configured output does not end in `.txt`, so the full document can
 * no longer overwrite the index.
 *
 * @param {Array<object>} pages - Pages passed to the generators.
 * @param {object} config - Reads `llmsTxt.outputPath` and `llmsTxt.generateFull`.
 * @param {string} cwd - Directory the output path is resolved against.
 * @param {{ info(message: string): void }} logger3 - Receives one message per file written.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path__default.default.dirname(outputPath);
  await fs9__default.default.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs9__default.default.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (!config.llmsTxt.generateFull) return;

  // Build "<name>-full<ext>" from the parsed path instead of pattern-replacing
  // ".txt", which left the path unchanged for any other extension.
  const { dir, name, ext } = path__default.default.parse(outputPath);
  const fullPath = path__default.default.join(dir, `${name}-full${ext}`);
  const fullContent = generateLlmsFullTxt(pages, config);
  await fs9__default.default.writeFile(fullPath, fullContent, "utf8");
  const relativeFull = path__default.default.relative(cwd, fullPath);
  logger3.info(`Generated llms-full.txt at ${relativeFull}`);
}
|
|
20021
21695
|
|
|
20022
21696
|
// src/indexing/pipeline.ts
|
|
20023
21697
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -20036,16 +21710,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
20036
21710
|
if (joined.length <= maxChars) return joined;
|
|
20037
21711
|
return joined.slice(0, maxChars).trim();
|
|
20038
21712
|
}
|
|
21713
|
+
/**
 * Serializes a value as JSON with object keys sorted at every nesting level,
 * so the output does not depend on property insertion order.
 * Follows JSON.stringify rules: `undefined` and functions are omitted from
 * objects and become `null` inside arrays; `toJSON` is honoured.
 */
function stablePageMetaJson(value) {
  if (value === null || typeof value !== "object") return JSON.stringify(value);
  if (typeof value.toJSON === "function") return stablePageMetaJson(value.toJSON());
  if (Array.isArray(value)) {
    return `[${value.map((item) => stablePageMetaJson(item) ?? "null").join(",")}]`;
  }
  const fields = [];
  for (const key of Object.keys(value).sort()) {
    const encoded = stablePageMetaJson(value[key]);
    if (encoded !== void 0) fields.push(`${JSON.stringify(key)}:${encoded}`);
  }
  return `{${fields.join(",")}}`;
}
/**
 * Computes a hash over the page fields listed below, used to detect whether a
 * page changed since it was last stored.
 *
 * List-valued fields (`keywords`, `tags`, `outgoingLinkUrls`) are sorted
 * copies, so their order does not affect the hash. `meta` is serialized with
 * sorted keys at every depth. The previous array-replacer form of
 * JSON.stringify filtered nested objects by the top-level key list, so nested
 * values were dropped and edits to them went undetected. For flat `meta` the
 * serialized text is the same as before.
 *
 * @param {object} page - Reads `title`, `description`, `keywords`, `tags`,
 *   `markdown`, `outgoingLinks`, `publishedAt`, `incomingAnchorText`,
 *   `outgoingLinkUrls` and `meta`.
 * @returns {string} Result of `sha256` over the `|`-joined fields.
 */
function buildPageContentHash(page) {
  const parts = [
    page.title,
    page.description ?? "",
    (page.keywords ?? []).slice().sort().join(","),
    page.tags.slice().sort().join(","),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    (page.outgoingLinkUrls ?? []).slice().sort().join(","),
    page.meta ? stablePageMetaJson(page.meta) : ""
  ];
  return sha256(parts.join("|"));
}
|
|
20039
21728
|
var IndexPipeline = class _IndexPipeline {
|
|
20040
21729
|
cwd;
|
|
20041
21730
|
config;
|
|
20042
21731
|
store;
|
|
20043
21732
|
logger;
|
|
21733
|
+
hooks;
|
|
20044
21734
|
constructor(options) {
|
|
20045
21735
|
this.cwd = options.cwd;
|
|
20046
21736
|
this.config = options.config;
|
|
20047
21737
|
this.store = options.store;
|
|
20048
21738
|
this.logger = options.logger;
|
|
21739
|
+
this.hooks = options.hooks;
|
|
20049
21740
|
}
|
|
20050
21741
|
static async create(options = {}) {
|
|
20051
21742
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
@@ -20055,7 +21746,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20055
21746
|
cwd,
|
|
20056
21747
|
config,
|
|
20057
21748
|
store,
|
|
20058
|
-
logger: options.logger ?? new Logger()
|
|
21749
|
+
logger: options.logger ?? new Logger(),
|
|
21750
|
+
hooks: options.hooks ?? {}
|
|
20059
21751
|
});
|
|
20060
21752
|
}
|
|
20061
21753
|
getConfig() {
|
|
@@ -20076,7 +21768,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20076
21768
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20077
21769
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
20078
21770
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20079
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
21771
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
20080
21772
|
if (options.force) {
|
|
20081
21773
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20082
21774
|
}
|
|
@@ -20085,8 +21777,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20085
21777
|
}
|
|
20086
21778
|
const manifestStart = stageStart();
|
|
20087
21779
|
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
21780
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
20088
21781
|
stageEnd("manifest", manifestStart);
|
|
20089
|
-
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
21782
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes, ${existingPageHashes.size} existing page hashes loaded`);
|
|
20090
21783
|
const sourceStart = stageStart();
|
|
20091
21784
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20092
21785
|
let sourcePages;
|
|
@@ -20163,11 +21856,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20163
21856
|
);
|
|
20164
21857
|
continue;
|
|
20165
21858
|
}
|
|
20166
|
-
|
|
21859
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
21860
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
21861
|
+
}
|
|
21862
|
+
let accepted;
|
|
21863
|
+
if (this.hooks.transformPage) {
|
|
21864
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21865
|
+
if (transformed === null) {
|
|
21866
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
21867
|
+
continue;
|
|
21868
|
+
}
|
|
21869
|
+
accepted = transformed;
|
|
21870
|
+
} else {
|
|
21871
|
+
accepted = extracted;
|
|
21872
|
+
}
|
|
21873
|
+
extractedPages.push(accepted);
|
|
20167
21874
|
this.logger.event("page_extracted", {
|
|
20168
|
-
url:
|
|
21875
|
+
url: accepted.url
|
|
20169
21876
|
});
|
|
20170
21877
|
}
|
|
21878
|
+
const customRecords = options.customRecords ?? [];
|
|
21879
|
+
if (customRecords.length > 0) {
|
|
21880
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
21881
|
+
for (const record of customRecords) {
|
|
21882
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
21883
|
+
const normalized = normalizeMarkdown(record.content);
|
|
21884
|
+
if (!normalized.trim()) {
|
|
21885
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
21886
|
+
continue;
|
|
21887
|
+
}
|
|
21888
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
21889
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
21890
|
+
const extracted = {
|
|
21891
|
+
url: normalizedUrl,
|
|
21892
|
+
title: record.title,
|
|
21893
|
+
markdown: normalized,
|
|
21894
|
+
outgoingLinks: [],
|
|
21895
|
+
noindex: false,
|
|
21896
|
+
tags,
|
|
21897
|
+
weight: record.weight
|
|
21898
|
+
};
|
|
21899
|
+
let accepted;
|
|
21900
|
+
if (this.hooks.transformPage) {
|
|
21901
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21902
|
+
if (transformed === null) {
|
|
21903
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
21904
|
+
continue;
|
|
21905
|
+
}
|
|
21906
|
+
accepted = transformed;
|
|
21907
|
+
} else {
|
|
21908
|
+
accepted = extracted;
|
|
21909
|
+
}
|
|
21910
|
+
extractedPages.push(accepted);
|
|
21911
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
21912
|
+
}
|
|
21913
|
+
}
|
|
20171
21914
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
20172
21915
|
const uniquePages = [];
|
|
20173
21916
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -20200,15 +21943,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20200
21943
|
const linkStart = stageStart();
|
|
20201
21944
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20202
21945
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
21946
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
20203
21947
|
for (const page of indexablePages) {
|
|
20204
21948
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20205
21949
|
}
|
|
20206
21950
|
for (const page of indexablePages) {
|
|
20207
|
-
|
|
21951
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
21952
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
21953
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
20208
21954
|
if (!pageSet.has(outgoing)) {
|
|
20209
21955
|
continue;
|
|
20210
21956
|
}
|
|
20211
|
-
|
|
21957
|
+
if (!seenForCount.has(outgoing)) {
|
|
21958
|
+
seenForCount.add(outgoing);
|
|
21959
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
21960
|
+
}
|
|
21961
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
21962
|
+
seenForAnchor.add(outgoing);
|
|
21963
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
21964
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
21965
|
+
}
|
|
21966
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
21967
|
+
}
|
|
20212
21968
|
}
|
|
20213
21969
|
}
|
|
20214
21970
|
stageEnd("links", linkStart);
|
|
@@ -20227,6 +21983,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20227
21983
|
});
|
|
20228
21984
|
}
|
|
20229
21985
|
}
|
|
21986
|
+
for (const record of customRecords) {
|
|
21987
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
21988
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
21989
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
21990
|
+
routeFile: "",
|
|
21991
|
+
routeResolution: "exact"
|
|
21992
|
+
});
|
|
21993
|
+
}
|
|
21994
|
+
}
|
|
20230
21995
|
for (const page of indexablePages) {
|
|
20231
21996
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20232
21997
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -20244,6 +22009,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20244
22009
|
} else {
|
|
20245
22010
|
routeExact += 1;
|
|
20246
22011
|
}
|
|
22012
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
22013
|
+
let incomingAnchorText;
|
|
22014
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
22015
|
+
let joined = "";
|
|
22016
|
+
for (const phrase of anchorSet) {
|
|
22017
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
22018
|
+
if (next2.length > 500) break;
|
|
22019
|
+
joined = next2;
|
|
22020
|
+
}
|
|
22021
|
+
incomingAnchorText = joined || void 0;
|
|
22022
|
+
}
|
|
20247
22023
|
const indexedPage = {
|
|
20248
22024
|
url: page.url,
|
|
20249
22025
|
title: page.title,
|
|
@@ -20253,40 +22029,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20253
22029
|
generatedAt: nowIso(),
|
|
20254
22030
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
20255
22031
|
outgoingLinks: page.outgoingLinks.length,
|
|
22032
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
20256
22033
|
depth: getUrlDepth(page.url),
|
|
20257
22034
|
tags: page.tags,
|
|
20258
22035
|
markdown: page.markdown,
|
|
20259
22036
|
description: page.description,
|
|
20260
|
-
keywords: page.keywords
|
|
22037
|
+
keywords: page.keywords,
|
|
22038
|
+
publishedAt: page.publishedAt,
|
|
22039
|
+
incomingAnchorText,
|
|
22040
|
+
meta: page.meta
|
|
20261
22041
|
};
|
|
20262
22042
|
pages.push(indexedPage);
|
|
20263
22043
|
this.logger.event("page_indexed", { url: page.url });
|
|
20264
22044
|
}
|
|
22045
|
+
const pageRecords = pages.map((p) => {
|
|
22046
|
+
const summary = buildPageSummary(p);
|
|
22047
|
+
return {
|
|
22048
|
+
url: p.url,
|
|
22049
|
+
title: p.title,
|
|
22050
|
+
markdown: p.markdown,
|
|
22051
|
+
projectId: scope.projectId,
|
|
22052
|
+
scopeName: scope.scopeName,
|
|
22053
|
+
routeFile: p.routeFile,
|
|
22054
|
+
routeResolution: p.routeResolution,
|
|
22055
|
+
incomingLinks: p.incomingLinks,
|
|
22056
|
+
outgoingLinks: p.outgoingLinks,
|
|
22057
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
22058
|
+
depth: p.depth,
|
|
22059
|
+
tags: p.tags,
|
|
22060
|
+
indexedAt: p.generatedAt,
|
|
22061
|
+
summary,
|
|
22062
|
+
description: p.description,
|
|
22063
|
+
keywords: p.keywords,
|
|
22064
|
+
contentHash: buildPageContentHash(p),
|
|
22065
|
+
publishedAt: p.publishedAt,
|
|
22066
|
+
meta: p.meta
|
|
22067
|
+
};
|
|
22068
|
+
});
|
|
22069
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
22070
|
+
const changedPages = pageRecords.filter(
|
|
22071
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
22072
|
+
);
|
|
22073
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
20265
22074
|
if (!options.dryRun) {
|
|
20266
|
-
|
|
20267
|
-
|
|
20268
|
-
|
|
20269
|
-
|
|
20270
|
-
|
|
20271
|
-
|
|
20272
|
-
|
|
20273
|
-
|
|
20274
|
-
|
|
20275
|
-
|
|
20276
|
-
|
|
20277
|
-
|
|
20278
|
-
|
|
20279
|
-
|
|
20280
|
-
|
|
20281
|
-
|
|
20282
|
-
|
|
20283
|
-
|
|
20284
|
-
|
|
20285
|
-
|
|
20286
|
-
|
|
20287
|
-
|
|
22075
|
+
if (options.force) {
|
|
22076
|
+
await this.store.deletePages(scope);
|
|
22077
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
22078
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
22079
|
+
id: r.url,
|
|
22080
|
+
data: r.summary ?? r.title,
|
|
22081
|
+
metadata: {
|
|
22082
|
+
title: r.title,
|
|
22083
|
+
url: r.url,
|
|
22084
|
+
description: r.description ?? "",
|
|
22085
|
+
keywords: r.keywords ?? [],
|
|
22086
|
+
summary: r.summary ?? "",
|
|
22087
|
+
tags: r.tags,
|
|
22088
|
+
markdown: r.markdown,
|
|
22089
|
+
routeFile: r.routeFile,
|
|
22090
|
+
routeResolution: r.routeResolution,
|
|
22091
|
+
incomingLinks: r.incomingLinks,
|
|
22092
|
+
outgoingLinks: r.outgoingLinks,
|
|
22093
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22094
|
+
depth: r.depth,
|
|
22095
|
+
indexedAt: r.indexedAt,
|
|
22096
|
+
contentHash: r.contentHash ?? "",
|
|
22097
|
+
publishedAt: r.publishedAt ?? null,
|
|
22098
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22099
|
+
}
|
|
22100
|
+
}));
|
|
22101
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22102
|
+
} else {
|
|
22103
|
+
if (changedPages.length > 0) {
|
|
22104
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
22105
|
+
const pageDocs = changedPages.map((r) => ({
|
|
22106
|
+
id: r.url,
|
|
22107
|
+
data: r.summary ?? r.title,
|
|
22108
|
+
metadata: {
|
|
22109
|
+
title: r.title,
|
|
22110
|
+
url: r.url,
|
|
22111
|
+
description: r.description ?? "",
|
|
22112
|
+
keywords: r.keywords ?? [],
|
|
22113
|
+
summary: r.summary ?? "",
|
|
22114
|
+
tags: r.tags,
|
|
22115
|
+
markdown: r.markdown,
|
|
22116
|
+
routeFile: r.routeFile,
|
|
22117
|
+
routeResolution: r.routeResolution,
|
|
22118
|
+
incomingLinks: r.incomingLinks,
|
|
22119
|
+
outgoingLinks: r.outgoingLinks,
|
|
22120
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22121
|
+
depth: r.depth,
|
|
22122
|
+
indexedAt: r.indexedAt,
|
|
22123
|
+
contentHash: r.contentHash ?? "",
|
|
22124
|
+
publishedAt: r.publishedAt ?? null,
|
|
22125
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22126
|
+
}
|
|
22127
|
+
}));
|
|
22128
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22129
|
+
}
|
|
22130
|
+
if (deletedPageUrls.length > 0) {
|
|
22131
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
22132
|
+
}
|
|
22133
|
+
}
|
|
20288
22134
|
}
|
|
22135
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
22136
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
20289
22137
|
stageEnd("pages", pagesStart);
|
|
22138
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
20290
22139
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
20291
22140
|
const chunkStart = stageStart();
|
|
20292
22141
|
this.logger.info("Chunking pages...");
|
|
@@ -20295,6 +22144,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20295
22144
|
if (typeof maxChunks === "number") {
|
|
20296
22145
|
chunks = chunks.slice(0, maxChunks);
|
|
20297
22146
|
}
|
|
22147
|
+
if (this.hooks.transformChunk) {
|
|
22148
|
+
const transformed = [];
|
|
22149
|
+
for (const chunk of chunks) {
|
|
22150
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
22151
|
+
if (result === null) {
|
|
22152
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
22153
|
+
continue;
|
|
22154
|
+
}
|
|
22155
|
+
transformed.push(result);
|
|
22156
|
+
}
|
|
22157
|
+
chunks = transformed;
|
|
22158
|
+
}
|
|
20298
22159
|
for (const chunk of chunks) {
|
|
20299
22160
|
this.logger.event("chunked", {
|
|
20300
22161
|
url: chunk.url,
|
|
@@ -20307,7 +22168,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20307
22168
|
for (const chunk of chunks) {
|
|
20308
22169
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
20309
22170
|
}
|
|
20310
|
-
|
|
22171
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
20311
22172
|
if (options.force) {
|
|
20312
22173
|
return true;
|
|
20313
22174
|
}
|
|
@@ -20321,36 +22182,43 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20321
22182
|
return existingHash !== chunk.contentHash;
|
|
20322
22183
|
});
|
|
20323
22184
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
22185
|
+
if (this.hooks.beforeIndex) {
|
|
22186
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
22187
|
+
}
|
|
20324
22188
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20325
22189
|
const upsertStart = stageStart();
|
|
20326
22190
|
let documentsUpserted = 0;
|
|
20327
22191
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20328
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
20329
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
22192
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
20330
22193
|
const docs = changedChunks.map((chunk) => {
|
|
20331
|
-
const
|
|
20332
|
-
|
|
20333
|
-
|
|
20334
|
-
|
|
20335
|
-
|
|
20336
|
-
|
|
20337
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
20338
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
22194
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
22195
|
+
if (embeddingText.length > 2e3) {
|
|
22196
|
+
this.logger.warn(
|
|
22197
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
22198
|
+
);
|
|
22199
|
+
}
|
|
20339
22200
|
return {
|
|
20340
22201
|
id: chunk.chunkKey,
|
|
20341
|
-
|
|
22202
|
+
data: embeddingText,
|
|
20342
22203
|
metadata: {
|
|
20343
|
-
|
|
20344
|
-
scopeName: scope.scopeName,
|
|
22204
|
+
url: chunk.url,
|
|
20345
22205
|
path: chunk.path,
|
|
22206
|
+
title: chunk.title,
|
|
22207
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
22208
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
20346
22209
|
snippet: chunk.snippet,
|
|
22210
|
+
chunkText: embeddingText,
|
|
22211
|
+
tags: chunk.tags,
|
|
20347
22212
|
ordinal: chunk.ordinal,
|
|
20348
22213
|
contentHash: chunk.contentHash,
|
|
20349
22214
|
depth: chunk.depth,
|
|
20350
22215
|
incomingLinks: chunk.incomingLinks,
|
|
20351
22216
|
routeFile: chunk.routeFile,
|
|
20352
22217
|
description: chunk.description ?? "",
|
|
20353
|
-
keywords:
|
|
22218
|
+
keywords: chunk.keywords ?? [],
|
|
22219
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
22220
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
22221
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
20354
22222
|
}
|
|
20355
22223
|
};
|
|
20356
22224
|
});
|
|
@@ -20368,9 +22236,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20368
22236
|
} else {
|
|
20369
22237
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
20370
22238
|
}
|
|
22239
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
22240
|
+
const llmsStart = stageStart();
|
|
22241
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
22242
|
+
stageEnd("llms_txt", llmsStart);
|
|
22243
|
+
}
|
|
20371
22244
|
this.logger.info("Done.");
|
|
20372
|
-
|
|
22245
|
+
const stats = {
|
|
20373
22246
|
pagesProcessed: pages.length,
|
|
22247
|
+
pagesChanged,
|
|
22248
|
+
pagesDeleted,
|
|
20374
22249
|
chunksTotal: chunks.length,
|
|
20375
22250
|
chunksChanged: changedChunks.length,
|
|
20376
22251
|
documentsUpserted,
|
|
@@ -20379,6 +22254,10 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20379
22254
|
routeBestEffort,
|
|
20380
22255
|
stageTimingsMs
|
|
20381
22256
|
};
|
|
22257
|
+
if (this.hooks.afterIndex) {
|
|
22258
|
+
await this.hooks.afterIndex(stats);
|
|
22259
|
+
}
|
|
22260
|
+
return stats;
|
|
20382
22261
|
}
|
|
20383
22262
|
};
|
|
20384
22263
|
|
|
@@ -20400,9 +22279,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20400
22279
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20401
22280
|
return true;
|
|
20402
22281
|
}
|
|
20403
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20404
|
-
return true;
|
|
20405
|
-
}
|
|
20406
22282
|
return false;
|
|
20407
22283
|
}
|
|
20408
22284
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20427,7 +22303,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20427
22303
|
const pipeline = await IndexPipeline.create({
|
|
20428
22304
|
cwd,
|
|
20429
22305
|
configPath: options.configPath,
|
|
20430
|
-
logger: logger3
|
|
22306
|
+
logger: logger3,
|
|
22307
|
+
hooks: options.hooks
|
|
20431
22308
|
});
|
|
20432
22309
|
const stats = await pipeline.run({
|
|
20433
22310
|
changedOnly: options.changedOnly ?? true,
|