searchsocket 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3335 -492
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2378 -475
- package/dist/index.d.cts +113 -40
- package/dist/index.d.ts +113 -40
- package/dist/index.js +2378 -475
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2430 -494
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2416 -480
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +28 -11
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/sveltekit.cjs
CHANGED
|
@@ -1,27 +1,33 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var crypto = require('crypto');
|
|
4
|
+
var fs9 = require('fs/promises');
|
|
4
5
|
var path = require('path');
|
|
6
|
+
var webStandardStreamableHttp_js = require('@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js');
|
|
7
|
+
var fs = require('fs');
|
|
5
8
|
var jiti = require('jiti');
|
|
6
9
|
var zod = require('zod');
|
|
10
|
+
var mcp_js = require('@modelcontextprotocol/sdk/server/mcp.js');
|
|
11
|
+
require('@modelcontextprotocol/sdk/server/stdio.js');
|
|
12
|
+
require('@modelcontextprotocol/sdk/server/streamableHttp.js');
|
|
13
|
+
require('@modelcontextprotocol/sdk/server/express.js');
|
|
7
14
|
var child_process = require('child_process');
|
|
8
|
-
var
|
|
15
|
+
var vector = require('@upstash/vector');
|
|
9
16
|
var cheerio = require('cheerio');
|
|
10
17
|
var matter = require('gray-matter');
|
|
11
18
|
var fg = require('fast-glob');
|
|
12
19
|
var pLimit = require('p-limit');
|
|
13
|
-
var fs3 = require('fs/promises');
|
|
14
20
|
var net = require('net');
|
|
15
21
|
var zlib = require('zlib');
|
|
16
22
|
|
|
17
23
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
18
24
|
|
|
19
|
-
var
|
|
25
|
+
var fs9__default = /*#__PURE__*/_interopDefault(fs9);
|
|
20
26
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
27
|
+
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
21
28
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
22
29
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
23
30
|
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
24
|
-
var fs3__default = /*#__PURE__*/_interopDefault(fs3);
|
|
25
31
|
var net__default = /*#__PURE__*/_interopDefault(net);
|
|
26
32
|
|
|
27
33
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
@@ -5021,32 +5027,32 @@ var require_URL = __commonJS({
|
|
|
5021
5027
|
else
|
|
5022
5028
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5023
5029
|
}
|
|
5024
|
-
function remove_dot_segments(
|
|
5025
|
-
if (!
|
|
5030
|
+
function remove_dot_segments(path14) {
|
|
5031
|
+
if (!path14) return path14;
|
|
5026
5032
|
var output = "";
|
|
5027
|
-
while (
|
|
5028
|
-
if (
|
|
5029
|
-
|
|
5033
|
+
while (path14.length > 0) {
|
|
5034
|
+
if (path14 === "." || path14 === "..") {
|
|
5035
|
+
path14 = "";
|
|
5030
5036
|
break;
|
|
5031
5037
|
}
|
|
5032
|
-
var twochars =
|
|
5033
|
-
var threechars =
|
|
5034
|
-
var fourchars =
|
|
5038
|
+
var twochars = path14.substring(0, 2);
|
|
5039
|
+
var threechars = path14.substring(0, 3);
|
|
5040
|
+
var fourchars = path14.substring(0, 4);
|
|
5035
5041
|
if (threechars === "../") {
|
|
5036
|
-
|
|
5042
|
+
path14 = path14.substring(3);
|
|
5037
5043
|
} else if (twochars === "./") {
|
|
5038
|
-
|
|
5044
|
+
path14 = path14.substring(2);
|
|
5039
5045
|
} else if (threechars === "/./") {
|
|
5040
|
-
|
|
5041
|
-
} else if (twochars === "/." &&
|
|
5042
|
-
|
|
5043
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5044
|
-
|
|
5046
|
+
path14 = "/" + path14.substring(3);
|
|
5047
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5048
|
+
path14 = "/";
|
|
5049
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5050
|
+
path14 = "/" + path14.substring(4);
|
|
5045
5051
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5046
5052
|
} else {
|
|
5047
|
-
var segment =
|
|
5053
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5048
5054
|
output += segment;
|
|
5049
|
-
|
|
5055
|
+
path14 = path14.substring(segment.length);
|
|
5050
5056
|
}
|
|
5051
5057
|
}
|
|
5052
5058
|
return output;
|
|
@@ -16642,6 +16648,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16642
16648
|
dropSelectors: zod.z.array(zod.z.string()).optional(),
|
|
16643
16649
|
ignoreAttr: zod.z.string().optional(),
|
|
16644
16650
|
noindexAttr: zod.z.string().optional(),
|
|
16651
|
+
imageDescAttr: zod.z.string().optional(),
|
|
16645
16652
|
respectRobotsNoindex: zod.z.boolean().optional()
|
|
16646
16653
|
}).optional(),
|
|
16647
16654
|
transform: zod.z.object({
|
|
@@ -16657,35 +16664,48 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16657
16664
|
headingPathDepth: zod.z.number().int().positive().optional(),
|
|
16658
16665
|
dontSplitInside: zod.z.array(zod.z.enum(["code", "table", "blockquote"])).optional(),
|
|
16659
16666
|
prependTitle: zod.z.boolean().optional(),
|
|
16660
|
-
pageSummaryChunk: zod.z.boolean().optional()
|
|
16667
|
+
pageSummaryChunk: zod.z.boolean().optional(),
|
|
16668
|
+
weightHeadings: zod.z.boolean().optional()
|
|
16661
16669
|
}).optional(),
|
|
16662
16670
|
upstash: zod.z.object({
|
|
16663
16671
|
url: zod.z.string().url().optional(),
|
|
16664
16672
|
token: zod.z.string().min(1).optional(),
|
|
16665
16673
|
urlEnv: zod.z.string().min(1).optional(),
|
|
16666
|
-
tokenEnv: zod.z.string().min(1).optional()
|
|
16674
|
+
tokenEnv: zod.z.string().min(1).optional(),
|
|
16675
|
+
namespaces: zod.z.object({
|
|
16676
|
+
pages: zod.z.string().min(1).optional(),
|
|
16677
|
+
chunks: zod.z.string().min(1).optional()
|
|
16678
|
+
}).optional()
|
|
16679
|
+
}).optional(),
|
|
16680
|
+
embedding: zod.z.object({
|
|
16681
|
+
model: zod.z.string().optional(),
|
|
16682
|
+
dimensions: zod.z.number().int().positive().optional(),
|
|
16683
|
+
taskType: zod.z.string().optional(),
|
|
16684
|
+
batchSize: zod.z.number().int().positive().optional()
|
|
16667
16685
|
}).optional(),
|
|
16668
16686
|
search: zod.z.object({
|
|
16669
|
-
semanticWeight: zod.z.number().min(0).max(1).optional(),
|
|
16670
|
-
inputEnrichment: zod.z.boolean().optional(),
|
|
16671
|
-
reranking: zod.z.boolean().optional(),
|
|
16672
16687
|
dualSearch: zod.z.boolean().optional(),
|
|
16673
16688
|
pageSearchWeight: zod.z.number().min(0).max(1).optional()
|
|
16674
16689
|
}).optional(),
|
|
16675
16690
|
ranking: zod.z.object({
|
|
16676
16691
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16677
16692
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16693
|
+
enableFreshnessBoost: zod.z.boolean().optional(),
|
|
16694
|
+
freshnessDecayRate: zod.z.number().positive().optional(),
|
|
16695
|
+
enableAnchorTextBoost: zod.z.boolean().optional(),
|
|
16678
16696
|
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16679
16697
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16680
16698
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16681
16699
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16682
|
-
|
|
16700
|
+
minScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16683
16701
|
scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
|
|
16684
16702
|
weights: zod.z.object({
|
|
16685
16703
|
incomingLinks: zod.z.number().optional(),
|
|
16686
16704
|
depth: zod.z.number().optional(),
|
|
16687
16705
|
aggregation: zod.z.number().optional(),
|
|
16688
|
-
titleMatch: zod.z.number().optional()
|
|
16706
|
+
titleMatch: zod.z.number().optional(),
|
|
16707
|
+
freshness: zod.z.number().optional(),
|
|
16708
|
+
anchorText: zod.z.number().optional()
|
|
16689
16709
|
}).optional()
|
|
16690
16710
|
}).optional(),
|
|
16691
16711
|
api: zod.z.object({
|
|
@@ -16700,12 +16720,28 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16700
16720
|
}).optional(),
|
|
16701
16721
|
mcp: zod.z.object({
|
|
16702
16722
|
enable: zod.z.boolean().optional(),
|
|
16723
|
+
access: zod.z.enum(["public", "private"]).optional(),
|
|
16703
16724
|
transport: zod.z.enum(["stdio", "http"]).optional(),
|
|
16704
16725
|
http: zod.z.object({
|
|
16705
16726
|
port: zod.z.number().int().positive().optional(),
|
|
16706
|
-
path: zod.z.string().optional()
|
|
16727
|
+
path: zod.z.string().optional(),
|
|
16728
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16729
|
+
apiKeyEnv: zod.z.string().min(1).optional()
|
|
16730
|
+
}).optional(),
|
|
16731
|
+
handle: zod.z.object({
|
|
16732
|
+
path: zod.z.string().optional(),
|
|
16733
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16734
|
+
enableJsonResponse: zod.z.boolean().optional()
|
|
16707
16735
|
}).optional()
|
|
16708
16736
|
}).optional(),
|
|
16737
|
+
llmsTxt: zod.z.object({
|
|
16738
|
+
enable: zod.z.boolean().optional(),
|
|
16739
|
+
outputPath: zod.z.string().optional(),
|
|
16740
|
+
title: zod.z.string().optional(),
|
|
16741
|
+
description: zod.z.string().optional(),
|
|
16742
|
+
generateFull: zod.z.boolean().optional(),
|
|
16743
|
+
serveMarkdownVariants: zod.z.boolean().optional()
|
|
16744
|
+
}).optional(),
|
|
16709
16745
|
state: zod.z.object({
|
|
16710
16746
|
dir: zod.z.string().optional()
|
|
16711
16747
|
}).optional()
|
|
@@ -16744,6 +16780,7 @@ function createDefaultConfig(projectId) {
|
|
|
16744
16780
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16745
16781
|
ignoreAttr: "data-search-ignore",
|
|
16746
16782
|
noindexAttr: "data-search-noindex",
|
|
16783
|
+
imageDescAttr: "data-search-description",
|
|
16747
16784
|
respectRobotsNoindex: true
|
|
16748
16785
|
},
|
|
16749
16786
|
transform: {
|
|
@@ -16753,39 +16790,52 @@ function createDefaultConfig(projectId) {
|
|
|
16753
16790
|
},
|
|
16754
16791
|
chunking: {
|
|
16755
16792
|
strategy: "hybrid",
|
|
16756
|
-
maxChars:
|
|
16793
|
+
maxChars: 1500,
|
|
16757
16794
|
overlapChars: 200,
|
|
16758
16795
|
minChars: 250,
|
|
16759
16796
|
headingPathDepth: 3,
|
|
16760
16797
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16761
16798
|
prependTitle: true,
|
|
16762
|
-
pageSummaryChunk: true
|
|
16799
|
+
pageSummaryChunk: true,
|
|
16800
|
+
weightHeadings: true
|
|
16763
16801
|
},
|
|
16764
16802
|
upstash: {
|
|
16765
|
-
urlEnv: "
|
|
16766
|
-
tokenEnv: "
|
|
16803
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16804
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16805
|
+
namespaces: {
|
|
16806
|
+
pages: "pages",
|
|
16807
|
+
chunks: "chunks"
|
|
16808
|
+
}
|
|
16809
|
+
},
|
|
16810
|
+
embedding: {
|
|
16811
|
+
model: "bge-large-en-v1.5",
|
|
16812
|
+
dimensions: 1024,
|
|
16813
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16814
|
+
batchSize: 100
|
|
16767
16815
|
},
|
|
16768
16816
|
search: {
|
|
16769
|
-
semanticWeight: 0.75,
|
|
16770
|
-
inputEnrichment: true,
|
|
16771
|
-
reranking: true,
|
|
16772
16817
|
dualSearch: true,
|
|
16773
16818
|
pageSearchWeight: 0.3
|
|
16774
16819
|
},
|
|
16775
16820
|
ranking: {
|
|
16776
16821
|
enableIncomingLinkBoost: true,
|
|
16777
16822
|
enableDepthBoost: true,
|
|
16823
|
+
enableFreshnessBoost: false,
|
|
16824
|
+
freshnessDecayRate: 1e-3,
|
|
16825
|
+
enableAnchorTextBoost: false,
|
|
16778
16826
|
pageWeights: {},
|
|
16779
16827
|
aggregationCap: 5,
|
|
16780
16828
|
aggregationDecay: 0.5,
|
|
16781
16829
|
minChunkScoreRatio: 0.5,
|
|
16782
|
-
|
|
16830
|
+
minScoreRatio: 0.7,
|
|
16783
16831
|
scoreGapThreshold: 0.4,
|
|
16784
16832
|
weights: {
|
|
16785
16833
|
incomingLinks: 0.05,
|
|
16786
16834
|
depth: 0.03,
|
|
16787
16835
|
aggregation: 0.1,
|
|
16788
|
-
titleMatch: 0.15
|
|
16836
|
+
titleMatch: 0.15,
|
|
16837
|
+
freshness: 0.1,
|
|
16838
|
+
anchorText: 0.1
|
|
16789
16839
|
}
|
|
16790
16840
|
},
|
|
16791
16841
|
api: {
|
|
@@ -16796,12 +16846,23 @@ function createDefaultConfig(projectId) {
|
|
|
16796
16846
|
},
|
|
16797
16847
|
mcp: {
|
|
16798
16848
|
enable: process.env.NODE_ENV !== "production",
|
|
16849
|
+
access: "private",
|
|
16799
16850
|
transport: "stdio",
|
|
16800
16851
|
http: {
|
|
16801
16852
|
port: 3338,
|
|
16802
16853
|
path: "/mcp"
|
|
16854
|
+
},
|
|
16855
|
+
handle: {
|
|
16856
|
+
path: "/api/mcp",
|
|
16857
|
+
enableJsonResponse: true
|
|
16803
16858
|
}
|
|
16804
16859
|
},
|
|
16860
|
+
llmsTxt: {
|
|
16861
|
+
enable: false,
|
|
16862
|
+
outputPath: "static/llms.txt",
|
|
16863
|
+
generateFull: true,
|
|
16864
|
+
serveMarkdownVariants: false
|
|
16865
|
+
},
|
|
16805
16866
|
state: {
|
|
16806
16867
|
dir: ".searchsocket"
|
|
16807
16868
|
}
|
|
@@ -16929,7 +16990,15 @@ ${issues}`
|
|
|
16929
16990
|
},
|
|
16930
16991
|
upstash: {
|
|
16931
16992
|
...defaults.upstash,
|
|
16932
|
-
...parsed.upstash
|
|
16993
|
+
...parsed.upstash,
|
|
16994
|
+
namespaces: {
|
|
16995
|
+
...defaults.upstash.namespaces,
|
|
16996
|
+
...parsed.upstash?.namespaces
|
|
16997
|
+
}
|
|
16998
|
+
},
|
|
16999
|
+
embedding: {
|
|
17000
|
+
...defaults.embedding,
|
|
17001
|
+
...parsed.embedding
|
|
16933
17002
|
},
|
|
16934
17003
|
search: {
|
|
16935
17004
|
...defaults.search,
|
|
@@ -16966,8 +17035,16 @@ ${issues}`
|
|
|
16966
17035
|
http: {
|
|
16967
17036
|
...defaults.mcp.http,
|
|
16968
17037
|
...parsed.mcp?.http
|
|
17038
|
+
},
|
|
17039
|
+
handle: {
|
|
17040
|
+
...defaults.mcp.handle,
|
|
17041
|
+
...parsed.mcp?.handle
|
|
16969
17042
|
}
|
|
16970
17043
|
},
|
|
17044
|
+
llmsTxt: {
|
|
17045
|
+
...defaults.llmsTxt,
|
|
17046
|
+
...parsed.llmsTxt
|
|
17047
|
+
},
|
|
16971
17048
|
state: {
|
|
16972
17049
|
...defaults.state,
|
|
16973
17050
|
...parsed.state
|
|
@@ -16987,6 +17064,15 @@ ${issues}`
|
|
|
16987
17064
|
maxDepth: 10
|
|
16988
17065
|
};
|
|
16989
17066
|
}
|
|
17067
|
+
if (merged.mcp.access === "public") {
|
|
17068
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17069
|
+
if (!resolvedKey) {
|
|
17070
|
+
throw new SearchSocketError(
|
|
17071
|
+
"CONFIG_MISSING",
|
|
17072
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17073
|
+
);
|
|
17074
|
+
}
|
|
17075
|
+
}
|
|
16990
17076
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16991
17077
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16992
17078
|
}
|
|
@@ -17035,13 +17121,84 @@ function normalizeMarkdown(input) {
|
|
|
17035
17121
|
function sanitizeScopeName(scopeName) {
|
|
17036
17122
|
return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
|
|
17037
17123
|
}
|
|
17124
|
+
function markdownToPlain(markdown) {
|
|
17125
|
+
return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
|
|
17126
|
+
}
|
|
17038
17127
|
function toSnippet(markdown, maxLen = 220) {
|
|
17039
|
-
const plain = markdown
|
|
17128
|
+
const plain = markdownToPlain(markdown);
|
|
17040
17129
|
if (plain.length <= maxLen) {
|
|
17041
17130
|
return plain;
|
|
17042
17131
|
}
|
|
17043
17132
|
return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
|
|
17044
17133
|
}
|
|
17134
|
+
function queryAwareExcerpt(markdown, query, maxLen = 220) {
|
|
17135
|
+
const plain = markdownToPlain(markdown);
|
|
17136
|
+
if (plain.length <= maxLen) return plain;
|
|
17137
|
+
const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
|
|
17138
|
+
if (tokens.length === 0) return toSnippet(markdown, maxLen);
|
|
17139
|
+
const positions = [];
|
|
17140
|
+
for (let ti = 0; ti < tokens.length; ti++) {
|
|
17141
|
+
const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
17142
|
+
const re = new RegExp(escaped, "gi");
|
|
17143
|
+
let m;
|
|
17144
|
+
while ((m = re.exec(plain)) !== null) {
|
|
17145
|
+
positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
|
|
17146
|
+
}
|
|
17147
|
+
}
|
|
17148
|
+
if (positions.length === 0) return toSnippet(markdown, maxLen);
|
|
17149
|
+
positions.sort((a, b) => a.start - b.start);
|
|
17150
|
+
let bestUniqueCount = 0;
|
|
17151
|
+
let bestTotalCount = 0;
|
|
17152
|
+
let bestLeft = 0;
|
|
17153
|
+
let bestRight = 0;
|
|
17154
|
+
let left = 0;
|
|
17155
|
+
const tokenCounts = /* @__PURE__ */ new Map();
|
|
17156
|
+
for (let right = 0; right < positions.length; right++) {
|
|
17157
|
+
tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
|
|
17158
|
+
while (positions[right].end - positions[left].start > maxLen && left < right) {
|
|
17159
|
+
const leftToken = positions[left].tokenIdx;
|
|
17160
|
+
const cnt = tokenCounts.get(leftToken) - 1;
|
|
17161
|
+
if (cnt === 0) tokenCounts.delete(leftToken);
|
|
17162
|
+
else tokenCounts.set(leftToken, cnt);
|
|
17163
|
+
left++;
|
|
17164
|
+
}
|
|
17165
|
+
const uniqueCount = tokenCounts.size;
|
|
17166
|
+
const totalCount = right - left + 1;
|
|
17167
|
+
if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
|
|
17168
|
+
bestUniqueCount = uniqueCount;
|
|
17169
|
+
bestTotalCount = totalCount;
|
|
17170
|
+
bestLeft = left;
|
|
17171
|
+
bestRight = right;
|
|
17172
|
+
}
|
|
17173
|
+
}
|
|
17174
|
+
const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
|
|
17175
|
+
let start = Math.max(0, mid - Math.floor(maxLen / 2));
|
|
17176
|
+
let end = Math.min(plain.length, start + maxLen);
|
|
17177
|
+
start = Math.max(0, end - maxLen);
|
|
17178
|
+
if (start > 0) {
|
|
17179
|
+
const spaceIdx = plain.lastIndexOf(" ", start);
|
|
17180
|
+
if (spaceIdx > start - 30) {
|
|
17181
|
+
start = spaceIdx + 1;
|
|
17182
|
+
}
|
|
17183
|
+
}
|
|
17184
|
+
if (end < plain.length) {
|
|
17185
|
+
const spaceIdx = plain.indexOf(" ", end);
|
|
17186
|
+
if (spaceIdx !== -1 && spaceIdx < end + 30) {
|
|
17187
|
+
end = spaceIdx;
|
|
17188
|
+
}
|
|
17189
|
+
}
|
|
17190
|
+
let excerpt = plain.slice(start, end);
|
|
17191
|
+
if (excerpt.length > Math.ceil(maxLen * 1.2)) {
|
|
17192
|
+
excerpt = excerpt.slice(0, maxLen);
|
|
17193
|
+
const lastSpace = excerpt.lastIndexOf(" ");
|
|
17194
|
+
if (lastSpace > maxLen * 0.5) {
|
|
17195
|
+
excerpt = excerpt.slice(0, lastSpace);
|
|
17196
|
+
}
|
|
17197
|
+
}
|
|
17198
|
+
const prefix = start > 0 ? "\u2026" : "";
|
|
17199
|
+
const suffix = end < plain.length ? "\u2026" : "";
|
|
17200
|
+
return `${prefix}${excerpt}${suffix}`;
|
|
17201
|
+
}
|
|
17045
17202
|
function extractFirstParagraph(markdown) {
|
|
17046
17203
|
const lines = markdown.split("\n");
|
|
17047
17204
|
let inFence = false;
|
|
@@ -17148,162 +17305,342 @@ function joinUrl(baseUrl, route) {
|
|
|
17148
17305
|
const routePart = ensureLeadingSlash(route);
|
|
17149
17306
|
return `${base}${routePart}`;
|
|
17150
17307
|
}
|
|
17151
|
-
|
|
17152
|
-
// src/vector/upstash.ts
|
|
17153
|
-
function chunkIndexName(scope) {
|
|
17154
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17155
|
-
}
|
|
17156
|
-
function pageIndexName(scope) {
|
|
17157
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17158
|
-
}
|
|
17159
17308
|
var UpstashSearchStore = class {
|
|
17160
|
-
|
|
17309
|
+
index;
|
|
17310
|
+
pagesNs;
|
|
17311
|
+
chunksNs;
|
|
17161
17312
|
constructor(opts) {
|
|
17162
|
-
this.
|
|
17163
|
-
|
|
17164
|
-
|
|
17165
|
-
return this.client.index(chunkIndexName(scope));
|
|
17166
|
-
}
|
|
17167
|
-
pageIndex(scope) {
|
|
17168
|
-
return this.client.index(pageIndexName(scope));
|
|
17313
|
+
this.index = opts.index;
|
|
17314
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17315
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17169
17316
|
}
|
|
17170
17317
|
async upsertChunks(chunks, scope) {
|
|
17171
17318
|
if (chunks.length === 0) return;
|
|
17172
|
-
const
|
|
17173
|
-
const BATCH_SIZE = 100;
|
|
17319
|
+
const BATCH_SIZE = 90;
|
|
17174
17320
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17175
17321
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17176
|
-
await
|
|
17177
|
-
|
|
17178
|
-
|
|
17179
|
-
|
|
17180
|
-
|
|
17181
|
-
|
|
17182
|
-
|
|
17183
|
-
|
|
17184
|
-
|
|
17185
|
-
|
|
17186
|
-
|
|
17187
|
-
|
|
17322
|
+
await this.chunksNs.upsert(
|
|
17323
|
+
batch.map((c) => ({
|
|
17324
|
+
id: c.id,
|
|
17325
|
+
data: c.data,
|
|
17326
|
+
metadata: {
|
|
17327
|
+
...c.metadata,
|
|
17328
|
+
projectId: scope.projectId,
|
|
17329
|
+
scopeName: scope.scopeName,
|
|
17330
|
+
type: c.metadata.type || "chunk"
|
|
17331
|
+
}
|
|
17332
|
+
}))
|
|
17333
|
+
);
|
|
17334
|
+
}
|
|
17335
|
+
}
|
|
17336
|
+
async search(data, opts, scope) {
|
|
17337
|
+
const filterParts = [
|
|
17338
|
+
`projectId = '${scope.projectId}'`,
|
|
17339
|
+
`scopeName = '${scope.scopeName}'`
|
|
17340
|
+
];
|
|
17341
|
+
if (opts.filter) {
|
|
17342
|
+
filterParts.push(opts.filter);
|
|
17343
|
+
}
|
|
17344
|
+
const results = await this.chunksNs.query({
|
|
17345
|
+
data,
|
|
17346
|
+
topK: opts.limit,
|
|
17347
|
+
includeMetadata: true,
|
|
17348
|
+
filter: filterParts.join(" AND "),
|
|
17349
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17350
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17351
|
+
});
|
|
17352
|
+
return results.map((doc) => ({
|
|
17353
|
+
id: String(doc.id),
|
|
17354
|
+
score: doc.score,
|
|
17355
|
+
metadata: {
|
|
17356
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17357
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17358
|
+
url: doc.metadata?.url ?? "",
|
|
17359
|
+
path: doc.metadata?.path ?? "",
|
|
17360
|
+
title: doc.metadata?.title ?? "",
|
|
17361
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17362
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17363
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17364
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17365
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17366
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17367
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17368
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17369
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17370
|
+
tags: doc.metadata?.tags ?? [],
|
|
17371
|
+
description: doc.metadata?.description || void 0,
|
|
17372
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17373
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17374
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17375
|
+
}
|
|
17376
|
+
}));
|
|
17377
|
+
}
|
|
17378
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17379
|
+
const filterParts = [
|
|
17380
|
+
`projectId = '${scope.projectId}'`,
|
|
17381
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17382
|
+
`url = '${url}'`
|
|
17383
|
+
];
|
|
17384
|
+
if (opts.filter) {
|
|
17385
|
+
filterParts.push(opts.filter);
|
|
17386
|
+
}
|
|
17387
|
+
const results = await this.chunksNs.query({
|
|
17388
|
+
data,
|
|
17389
|
+
topK: opts.limit,
|
|
17390
|
+
includeMetadata: true,
|
|
17391
|
+
filter: filterParts.join(" AND "),
|
|
17392
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17393
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17188
17394
|
});
|
|
17189
17395
|
return results.map((doc) => ({
|
|
17190
|
-
id: doc.id,
|
|
17396
|
+
id: String(doc.id),
|
|
17191
17397
|
score: doc.score,
|
|
17192
17398
|
metadata: {
|
|
17193
17399
|
projectId: doc.metadata?.projectId ?? "",
|
|
17194
17400
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17195
|
-
url: doc.
|
|
17401
|
+
url: doc.metadata?.url ?? "",
|
|
17196
17402
|
path: doc.metadata?.path ?? "",
|
|
17197
|
-
title: doc.
|
|
17198
|
-
sectionTitle: doc.
|
|
17199
|
-
headingPath: doc.
|
|
17403
|
+
title: doc.metadata?.title ?? "",
|
|
17404
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17405
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17200
17406
|
snippet: doc.metadata?.snippet ?? "",
|
|
17201
|
-
chunkText: doc.
|
|
17407
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17202
17408
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17203
17409
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17204
17410
|
depth: doc.metadata?.depth ?? 0,
|
|
17205
17411
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17206
17412
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17207
|
-
tags: doc.
|
|
17413
|
+
tags: doc.metadata?.tags ?? [],
|
|
17208
17414
|
description: doc.metadata?.description || void 0,
|
|
17209
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17415
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17416
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17417
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17210
17418
|
}
|
|
17211
17419
|
}));
|
|
17212
17420
|
}
|
|
17213
|
-
async
|
|
17214
|
-
|
|
17421
|
+
async searchPagesByText(data, opts, scope) {
|
|
17422
|
+
return this.queryPages({ data }, opts, scope);
|
|
17423
|
+
}
|
|
17424
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17425
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17426
|
+
}
|
|
17427
|
+
async queryPages(input, opts, scope) {
|
|
17428
|
+
const filterParts = [
|
|
17429
|
+
`projectId = '${scope.projectId}'`,
|
|
17430
|
+
`scopeName = '${scope.scopeName}'`
|
|
17431
|
+
];
|
|
17432
|
+
if (opts.filter) {
|
|
17433
|
+
filterParts.push(opts.filter);
|
|
17434
|
+
}
|
|
17215
17435
|
let results;
|
|
17216
17436
|
try {
|
|
17217
|
-
results = await
|
|
17218
|
-
|
|
17219
|
-
|
|
17220
|
-
|
|
17221
|
-
|
|
17222
|
-
|
|
17223
|
-
|
|
17437
|
+
results = await this.pagesNs.query({
|
|
17438
|
+
...input,
|
|
17439
|
+
topK: opts.limit,
|
|
17440
|
+
includeMetadata: true,
|
|
17441
|
+
filter: filterParts.join(" AND "),
|
|
17442
|
+
queryMode: vector.QueryMode.HYBRID,
|
|
17443
|
+
fusionAlgorithm: vector.FusionAlgorithm.DBSF
|
|
17224
17444
|
});
|
|
17225
17445
|
} catch {
|
|
17226
17446
|
return [];
|
|
17227
17447
|
}
|
|
17228
17448
|
return results.map((doc) => ({
|
|
17229
|
-
id: doc.id,
|
|
17449
|
+
id: String(doc.id),
|
|
17230
17450
|
score: doc.score,
|
|
17231
|
-
title: doc.
|
|
17232
|
-
url: doc.
|
|
17233
|
-
description: doc.
|
|
17234
|
-
tags: doc.
|
|
17451
|
+
title: doc.metadata?.title ?? "",
|
|
17452
|
+
url: doc.metadata?.url ?? "",
|
|
17453
|
+
description: doc.metadata?.description ?? "",
|
|
17454
|
+
tags: doc.metadata?.tags ?? [],
|
|
17235
17455
|
depth: doc.metadata?.depth ?? 0,
|
|
17236
17456
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17237
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17457
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17458
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17238
17459
|
}));
|
|
17239
17460
|
}
|
|
17240
|
-
async deleteByIds(ids,
|
|
17461
|
+
async deleteByIds(ids, _scope) {
|
|
17241
17462
|
if (ids.length === 0) return;
|
|
17242
|
-
const
|
|
17243
|
-
const BATCH_SIZE = 500;
|
|
17463
|
+
const BATCH_SIZE = 90;
|
|
17244
17464
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17245
17465
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17246
|
-
await
|
|
17466
|
+
await this.chunksNs.delete(batch);
|
|
17247
17467
|
}
|
|
17248
17468
|
}
|
|
17249
17469
|
async deleteScope(scope) {
|
|
17250
|
-
|
|
17251
|
-
const
|
|
17252
|
-
|
|
17253
|
-
|
|
17254
|
-
|
|
17255
|
-
|
|
17256
|
-
|
|
17257
|
-
|
|
17258
|
-
|
|
17470
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17471
|
+
const ids = [];
|
|
17472
|
+
let cursor = "0";
|
|
17473
|
+
try {
|
|
17474
|
+
for (; ; ) {
|
|
17475
|
+
const result = await ns.range({
|
|
17476
|
+
cursor,
|
|
17477
|
+
limit: 100,
|
|
17478
|
+
includeMetadata: true
|
|
17479
|
+
});
|
|
17480
|
+
for (const doc of result.vectors) {
|
|
17481
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17482
|
+
ids.push(String(doc.id));
|
|
17483
|
+
}
|
|
17484
|
+
}
|
|
17485
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17486
|
+
cursor = result.nextCursor;
|
|
17487
|
+
}
|
|
17488
|
+
} catch {
|
|
17489
|
+
}
|
|
17490
|
+
if (ids.length > 0) {
|
|
17491
|
+
const BATCH_SIZE = 90;
|
|
17492
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17493
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17494
|
+
await ns.delete(batch);
|
|
17495
|
+
}
|
|
17496
|
+
}
|
|
17259
17497
|
}
|
|
17260
17498
|
}
|
|
17261
17499
|
async listScopes(projectId) {
|
|
17262
|
-
const
|
|
17263
|
-
const
|
|
17264
|
-
|
|
17265
|
-
|
|
17266
|
-
|
|
17267
|
-
|
|
17268
|
-
|
|
17269
|
-
|
|
17270
|
-
|
|
17271
|
-
|
|
17272
|
-
|
|
17273
|
-
|
|
17274
|
-
|
|
17275
|
-
|
|
17276
|
-
|
|
17277
|
-
|
|
17500
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17501
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17502
|
+
let cursor = "0";
|
|
17503
|
+
try {
|
|
17504
|
+
for (; ; ) {
|
|
17505
|
+
const result = await ns.range({
|
|
17506
|
+
cursor,
|
|
17507
|
+
limit: 100,
|
|
17508
|
+
includeMetadata: true
|
|
17509
|
+
});
|
|
17510
|
+
for (const doc of result.vectors) {
|
|
17511
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17512
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17513
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17514
|
+
}
|
|
17515
|
+
}
|
|
17516
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17517
|
+
cursor = result.nextCursor;
|
|
17518
|
+
}
|
|
17519
|
+
} catch {
|
|
17520
|
+
}
|
|
17521
|
+
}
|
|
17522
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17523
|
+
projectId,
|
|
17524
|
+
scopeName,
|
|
17525
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17526
|
+
documentCount: count
|
|
17527
|
+
}));
|
|
17528
|
+
}
|
|
17529
|
+
async getContentHashes(scope) {
|
|
17530
|
+
return this.scanHashes(this.chunksNs, scope);
|
|
17531
|
+
}
|
|
17532
|
+
/**
|
|
17533
|
+
* Fetch content hashes for a specific set of chunk keys using direct fetch()
|
|
17534
|
+
* instead of range(). This avoids potential issues with range() returning
|
|
17535
|
+
* vectors from the wrong namespace on hybrid indexes.
|
|
17536
|
+
*/
|
|
17537
|
+
async fetchContentHashesForKeys(keys, scope) {
|
|
17538
|
+
const map = /* @__PURE__ */ new Map();
|
|
17539
|
+
if (keys.length === 0) return map;
|
|
17540
|
+
const BATCH_SIZE = 90;
|
|
17541
|
+
for (let i = 0; i < keys.length; i += BATCH_SIZE) {
|
|
17542
|
+
const batch = keys.slice(i, i + BATCH_SIZE);
|
|
17278
17543
|
try {
|
|
17279
|
-
const
|
|
17280
|
-
|
|
17281
|
-
projectId,
|
|
17282
|
-
scopeName,
|
|
17283
|
-
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17284
|
-
documentCount: info.documentCount
|
|
17544
|
+
const results = await this.chunksNs.fetch(batch, {
|
|
17545
|
+
includeMetadata: true
|
|
17285
17546
|
});
|
|
17547
|
+
for (const doc of results) {
|
|
17548
|
+
if (doc && doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17549
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17550
|
+
}
|
|
17551
|
+
}
|
|
17286
17552
|
} catch {
|
|
17287
|
-
|
|
17288
|
-
|
|
17289
|
-
|
|
17290
|
-
|
|
17291
|
-
|
|
17553
|
+
}
|
|
17554
|
+
}
|
|
17555
|
+
return map;
|
|
17556
|
+
}
|
|
17557
|
+
/**
|
|
17558
|
+
* Scan all IDs in the chunks namespace for this scope.
|
|
17559
|
+
* Used for deletion detection (finding stale chunk keys).
|
|
17560
|
+
*/
|
|
17561
|
+
async scanChunkIds(scope) {
|
|
17562
|
+
const ids = /* @__PURE__ */ new Set();
|
|
17563
|
+
let cursor = "0";
|
|
17564
|
+
try {
|
|
17565
|
+
for (; ; ) {
|
|
17566
|
+
const result = await this.chunksNs.range({
|
|
17567
|
+
cursor,
|
|
17568
|
+
limit: 100,
|
|
17569
|
+
includeMetadata: true
|
|
17292
17570
|
});
|
|
17571
|
+
for (const doc of result.vectors) {
|
|
17572
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17573
|
+
ids.add(String(doc.id));
|
|
17574
|
+
}
|
|
17575
|
+
}
|
|
17576
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17577
|
+
cursor = result.nextCursor;
|
|
17293
17578
|
}
|
|
17579
|
+
} catch {
|
|
17294
17580
|
}
|
|
17295
|
-
return
|
|
17581
|
+
return ids;
|
|
17296
17582
|
}
|
|
17297
|
-
async
|
|
17583
|
+
async scanHashes(ns, scope) {
|
|
17584
|
+
const map = /* @__PURE__ */ new Map();
|
|
17585
|
+
let cursor = "0";
|
|
17586
|
+
try {
|
|
17587
|
+
for (; ; ) {
|
|
17588
|
+
const result = await ns.range({
|
|
17589
|
+
cursor,
|
|
17590
|
+
limit: 100,
|
|
17591
|
+
includeMetadata: true
|
|
17592
|
+
});
|
|
17593
|
+
for (const doc of result.vectors) {
|
|
17594
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17595
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17596
|
+
}
|
|
17597
|
+
}
|
|
17598
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17599
|
+
cursor = result.nextCursor;
|
|
17600
|
+
}
|
|
17601
|
+
} catch {
|
|
17602
|
+
}
|
|
17603
|
+
return map;
|
|
17604
|
+
}
|
|
17605
|
+
async listPages(scope, opts) {
|
|
17606
|
+
const cursor = opts?.cursor ?? "0";
|
|
17607
|
+
const limit = opts?.limit ?? 50;
|
|
17608
|
+
try {
|
|
17609
|
+
const result = await this.pagesNs.range({
|
|
17610
|
+
cursor,
|
|
17611
|
+
limit,
|
|
17612
|
+
includeMetadata: true
|
|
17613
|
+
});
|
|
17614
|
+
const pages = result.vectors.filter(
|
|
17615
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17616
|
+
).map((doc) => ({
|
|
17617
|
+
url: doc.metadata?.url ?? "",
|
|
17618
|
+
title: doc.metadata?.title ?? "",
|
|
17619
|
+
description: doc.metadata?.description ?? "",
|
|
17620
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17621
|
+
}));
|
|
17622
|
+
const response = { pages };
|
|
17623
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17624
|
+
response.nextCursor = result.nextCursor;
|
|
17625
|
+
}
|
|
17626
|
+
return response;
|
|
17627
|
+
} catch {
|
|
17628
|
+
return { pages: [] };
|
|
17629
|
+
}
|
|
17630
|
+
}
|
|
17631
|
+
async getPageHashes(scope) {
|
|
17298
17632
|
const map = /* @__PURE__ */ new Map();
|
|
17299
|
-
const index = this.chunkIndex(scope);
|
|
17300
17633
|
let cursor = "0";
|
|
17301
17634
|
try {
|
|
17302
17635
|
for (; ; ) {
|
|
17303
|
-
const result = await
|
|
17304
|
-
|
|
17305
|
-
|
|
17306
|
-
|
|
17636
|
+
const result = await this.pagesNs.range({
|
|
17637
|
+
cursor,
|
|
17638
|
+
limit: 100,
|
|
17639
|
+
includeMetadata: true
|
|
17640
|
+
});
|
|
17641
|
+
for (const doc of result.vectors) {
|
|
17642
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17643
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17307
17644
|
}
|
|
17308
17645
|
}
|
|
17309
17646
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17313,47 +17650,43 @@ var UpstashSearchStore = class {
|
|
|
17313
17650
|
}
|
|
17314
17651
|
return map;
|
|
17315
17652
|
}
|
|
17653
|
+
async deletePagesByIds(ids, _scope) {
|
|
17654
|
+
if (ids.length === 0) return;
|
|
17655
|
+
const BATCH_SIZE = 90;
|
|
17656
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17657
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17658
|
+
await this.pagesNs.delete(batch);
|
|
17659
|
+
}
|
|
17660
|
+
}
|
|
17316
17661
|
async upsertPages(pages, scope) {
|
|
17317
17662
|
if (pages.length === 0) return;
|
|
17318
|
-
const
|
|
17319
|
-
const BATCH_SIZE = 50;
|
|
17663
|
+
const BATCH_SIZE = 90;
|
|
17320
17664
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17321
17665
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17322
|
-
|
|
17323
|
-
|
|
17324
|
-
|
|
17325
|
-
|
|
17326
|
-
|
|
17327
|
-
|
|
17328
|
-
|
|
17329
|
-
|
|
17330
|
-
|
|
17331
|
-
|
|
17332
|
-
}
|
|
17333
|
-
|
|
17334
|
-
markdown: p.markdown,
|
|
17335
|
-
projectId: p.projectId,
|
|
17336
|
-
scopeName: p.scopeName,
|
|
17337
|
-
routeFile: p.routeFile,
|
|
17338
|
-
routeResolution: p.routeResolution,
|
|
17339
|
-
incomingLinks: p.incomingLinks,
|
|
17340
|
-
outgoingLinks: p.outgoingLinks,
|
|
17341
|
-
depth: p.depth,
|
|
17342
|
-
indexedAt: p.indexedAt
|
|
17343
|
-
}
|
|
17344
|
-
}));
|
|
17345
|
-
await index.upsert(docs);
|
|
17666
|
+
await this.pagesNs.upsert(
|
|
17667
|
+
batch.map((p) => ({
|
|
17668
|
+
id: p.id,
|
|
17669
|
+
data: p.data,
|
|
17670
|
+
metadata: {
|
|
17671
|
+
...p.metadata,
|
|
17672
|
+
projectId: scope.projectId,
|
|
17673
|
+
scopeName: scope.scopeName,
|
|
17674
|
+
type: "page"
|
|
17675
|
+
}
|
|
17676
|
+
}))
|
|
17677
|
+
);
|
|
17346
17678
|
}
|
|
17347
17679
|
}
|
|
17348
17680
|
async getPage(url, scope) {
|
|
17349
|
-
const index = this.pageIndex(scope);
|
|
17350
17681
|
try {
|
|
17351
|
-
const results = await
|
|
17682
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17683
|
+
includeMetadata: true
|
|
17684
|
+
});
|
|
17352
17685
|
const doc = results[0];
|
|
17353
|
-
if (!doc) return null;
|
|
17686
|
+
if (!doc || !doc.metadata) return null;
|
|
17354
17687
|
return {
|
|
17355
|
-
url: doc.
|
|
17356
|
-
title: doc.
|
|
17688
|
+
url: doc.metadata.url,
|
|
17689
|
+
title: doc.metadata.title,
|
|
17357
17690
|
markdown: doc.metadata.markdown,
|
|
17358
17691
|
projectId: doc.metadata.projectId,
|
|
17359
17692
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17361,27 +17694,86 @@ var UpstashSearchStore = class {
|
|
|
17361
17694
|
routeResolution: doc.metadata.routeResolution,
|
|
17362
17695
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17363
17696
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17697
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17364
17698
|
depth: doc.metadata.depth,
|
|
17365
|
-
tags: doc.
|
|
17699
|
+
tags: doc.metadata.tags ?? [],
|
|
17366
17700
|
indexedAt: doc.metadata.indexedAt,
|
|
17367
|
-
summary: doc.
|
|
17368
|
-
description: doc.
|
|
17369
|
-
keywords: doc.
|
|
17701
|
+
summary: doc.metadata.summary || void 0,
|
|
17702
|
+
description: doc.metadata.description || void 0,
|
|
17703
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17704
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17370
17705
|
};
|
|
17371
17706
|
} catch {
|
|
17372
17707
|
return null;
|
|
17373
17708
|
}
|
|
17374
17709
|
}
|
|
17710
|
+
async fetchPageWithVector(url, scope) {
|
|
17711
|
+
try {
|
|
17712
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17713
|
+
includeMetadata: true,
|
|
17714
|
+
includeVectors: true
|
|
17715
|
+
});
|
|
17716
|
+
const doc = results[0];
|
|
17717
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17718
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17719
|
+
return null;
|
|
17720
|
+
}
|
|
17721
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17722
|
+
} catch {
|
|
17723
|
+
return null;
|
|
17724
|
+
}
|
|
17725
|
+
}
|
|
17726
|
+
async fetchPagesBatch(urls, scope) {
|
|
17727
|
+
if (urls.length === 0) return [];
|
|
17728
|
+
try {
|
|
17729
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17730
|
+
includeMetadata: true
|
|
17731
|
+
});
|
|
17732
|
+
const out = [];
|
|
17733
|
+
for (const doc of results) {
|
|
17734
|
+
if (!doc || !doc.metadata) continue;
|
|
17735
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17736
|
+
continue;
|
|
17737
|
+
}
|
|
17738
|
+
out.push({
|
|
17739
|
+
url: doc.metadata.url,
|
|
17740
|
+
title: doc.metadata.title,
|
|
17741
|
+
routeFile: doc.metadata.routeFile,
|
|
17742
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17743
|
+
});
|
|
17744
|
+
}
|
|
17745
|
+
return out;
|
|
17746
|
+
} catch {
|
|
17747
|
+
return [];
|
|
17748
|
+
}
|
|
17749
|
+
}
|
|
17375
17750
|
async deletePages(scope) {
|
|
17751
|
+
const ids = [];
|
|
17752
|
+
let cursor = "0";
|
|
17376
17753
|
try {
|
|
17377
|
-
|
|
17378
|
-
|
|
17754
|
+
for (; ; ) {
|
|
17755
|
+
const result = await this.pagesNs.range({
|
|
17756
|
+
cursor,
|
|
17757
|
+
limit: 100,
|
|
17758
|
+
includeMetadata: true
|
|
17759
|
+
});
|
|
17760
|
+
for (const doc of result.vectors) {
|
|
17761
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17762
|
+
ids.push(String(doc.id));
|
|
17763
|
+
}
|
|
17764
|
+
}
|
|
17765
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17766
|
+
cursor = result.nextCursor;
|
|
17767
|
+
}
|
|
17379
17768
|
} catch {
|
|
17380
17769
|
}
|
|
17770
|
+
if (ids.length > 0) {
|
|
17771
|
+
await this.deletePagesByIds(ids, scope);
|
|
17772
|
+
}
|
|
17381
17773
|
}
|
|
17382
17774
|
async health() {
|
|
17383
17775
|
try {
|
|
17384
|
-
await this.
|
|
17776
|
+
await this.index.info();
|
|
17385
17777
|
return { ok: true };
|
|
17386
17778
|
} catch (error) {
|
|
17387
17779
|
return {
|
|
@@ -17391,14 +17783,31 @@ var UpstashSearchStore = class {
|
|
|
17391
17783
|
}
|
|
17392
17784
|
}
|
|
17393
17785
|
async dropAllIndexes(projectId) {
|
|
17394
|
-
const
|
|
17395
|
-
|
|
17396
|
-
|
|
17397
|
-
|
|
17398
|
-
|
|
17399
|
-
const
|
|
17400
|
-
|
|
17401
|
-
|
|
17786
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17787
|
+
const ids = [];
|
|
17788
|
+
let cursor = "0";
|
|
17789
|
+
try {
|
|
17790
|
+
for (; ; ) {
|
|
17791
|
+
const result = await ns.range({
|
|
17792
|
+
cursor,
|
|
17793
|
+
limit: 100,
|
|
17794
|
+
includeMetadata: true
|
|
17795
|
+
});
|
|
17796
|
+
for (const doc of result.vectors) {
|
|
17797
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17798
|
+
ids.push(String(doc.id));
|
|
17799
|
+
}
|
|
17800
|
+
}
|
|
17801
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17802
|
+
cursor = result.nextCursor;
|
|
17803
|
+
}
|
|
17804
|
+
} catch {
|
|
17805
|
+
}
|
|
17806
|
+
if (ids.length > 0) {
|
|
17807
|
+
const BATCH_SIZE = 90;
|
|
17808
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17809
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17810
|
+
await ns.delete(batch);
|
|
17402
17811
|
}
|
|
17403
17812
|
}
|
|
17404
17813
|
}
|
|
@@ -17412,12 +17821,16 @@ async function createUpstashStore(config) {
|
|
|
17412
17821
|
if (!url || !token) {
|
|
17413
17822
|
throw new SearchSocketError(
|
|
17414
17823
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17415
|
-
`Missing Upstash
|
|
17824
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17416
17825
|
);
|
|
17417
17826
|
}
|
|
17418
|
-
const {
|
|
17419
|
-
const
|
|
17420
|
-
return new UpstashSearchStore({
|
|
17827
|
+
const { Index } = await import('@upstash/vector');
|
|
17828
|
+
const index = new Index({ url, token });
|
|
17829
|
+
return new UpstashSearchStore({
|
|
17830
|
+
index,
|
|
17831
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17832
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17833
|
+
});
|
|
17421
17834
|
}
|
|
17422
17835
|
|
|
17423
17836
|
// src/utils/pattern.ts
|
|
@@ -17460,29 +17873,65 @@ function nonNegativeOrZero(value) {
|
|
|
17460
17873
|
/**
 * Normalize text for title comparison: lowercase it, drop every character
 * that is not a-z, 0-9 or whitespace, collapse whitespace runs to a single
 * space, and trim the ends.
 */
function normalizeForTitleMatch(text) {
  const lowered = text.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, "");
  const singleSpaced = alphanumericOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
17463
|
-
function rankHits(hits, config, query) {
|
|
17876
|
+
function rankHits(hits, config, query, debug) {
|
|
17464
17877
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
17465
17878
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
17466
17879
|
return hits.map((hit) => {
|
|
17467
|
-
|
|
17880
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
17881
|
+
let score = baseScore;
|
|
17882
|
+
let incomingLinkBoostValue = 0;
|
|
17468
17883
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
17469
17884
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
17470
|
-
|
|
17885
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
17886
|
+
score += incomingLinkBoostValue;
|
|
17471
17887
|
}
|
|
17888
|
+
let depthBoostValue = 0;
|
|
17472
17889
|
if (config.ranking.enableDepthBoost) {
|
|
17473
17890
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
17474
|
-
|
|
17891
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
17892
|
+
score += depthBoostValue;
|
|
17475
17893
|
}
|
|
17894
|
+
let titleMatchBoostValue = 0;
|
|
17476
17895
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
17477
17896
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
17478
17897
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
17479
|
-
|
|
17898
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
17899
|
+
score += titleMatchBoostValue;
|
|
17480
17900
|
}
|
|
17481
17901
|
}
|
|
17482
|
-
|
|
17902
|
+
let freshnessBoostValue = 0;
|
|
17903
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
17904
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
17905
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
17906
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
17907
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
17908
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
17909
|
+
score += freshnessBoostValue;
|
|
17910
|
+
}
|
|
17911
|
+
}
|
|
17912
|
+
let anchorTextMatchBoostValue = 0;
|
|
17913
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
17914
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
17915
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
17916
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
17917
|
+
score += anchorTextMatchBoostValue;
|
|
17918
|
+
}
|
|
17919
|
+
}
|
|
17920
|
+
const result = {
|
|
17483
17921
|
hit,
|
|
17484
17922
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
17485
17923
|
};
|
|
17924
|
+
if (debug) {
|
|
17925
|
+
result.breakdown = {
|
|
17926
|
+
baseScore,
|
|
17927
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
17928
|
+
depthBoost: depthBoostValue,
|
|
17929
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
17930
|
+
freshnessBoost: freshnessBoostValue,
|
|
17931
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
17932
|
+
};
|
|
17933
|
+
}
|
|
17934
|
+
return result;
|
|
17486
17935
|
}).sort((a, b) => {
|
|
17487
17936
|
const delta = b.finalScore - a.finalScore;
|
|
17488
17937
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -17491,12 +17940,13 @@ function rankHits(hits, config, query) {
|
|
|
17491
17940
|
function trimByScoreGap(results, config) {
|
|
17492
17941
|
if (results.length === 0) return results;
|
|
17493
17942
|
const threshold = config.ranking.scoreGapThreshold;
|
|
17494
|
-
const
|
|
17495
|
-
if (
|
|
17496
|
-
const
|
|
17497
|
-
|
|
17498
|
-
|
|
17499
|
-
|
|
17943
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
17944
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
17945
|
+
const topScore = results[0].pageScore;
|
|
17946
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
17947
|
+
const minThreshold = topScore * minScoreRatio;
|
|
17948
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
17949
|
+
}
|
|
17500
17950
|
}
|
|
17501
17951
|
if (threshold > 0 && results.length > 1) {
|
|
17502
17952
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -17566,92 +18016,293 @@ function aggregateByPage(ranked, config) {
|
|
|
17566
18016
|
return Number.isNaN(delta) ? 0 : delta;
|
|
17567
18017
|
});
|
|
17568
18018
|
}
|
|
17569
|
-
function
|
|
17570
|
-
|
|
17571
|
-
const
|
|
17572
|
-
|
|
17573
|
-
|
|
17574
|
-
|
|
17575
|
-
|
|
17576
|
-
|
|
17577
|
-
|
|
17578
|
-
|
|
17579
|
-
|
|
17580
|
-
if (pageHit) {
|
|
17581
|
-
pagesWithChunks.add(url);
|
|
17582
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
17583
|
-
return {
|
|
17584
|
-
hit: ranked.hit,
|
|
17585
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
17586
|
-
};
|
|
18019
|
+
/**
 * Score and order page-level search hits.
 *
 * Starting from the hit's own score (non-finite scores count as -Infinity),
 * the enabled boosts are added in this order: incoming links, depth, title
 * match, freshness. The sum is then multiplied by the page weight resolved
 * by findPageWeight(). Pages whose weight is exactly 0 are excluded.
 *
 * The page weight is resolved once per hit, before any scoring, so excluded
 * pages cost nothing and findPageWeight() is not called a second time.
 *
 * @param pageHits - Hits exposing url, title, score, depth, incomingLinks,
 *   publishedAt, description, routeFile and tags.
 * @param config - Configuration; reads `config.ranking` (enable* flags,
 *   weights, freshnessDecayRate, pageWeights).
 * @param query - Raw query text; when empty the title boost is skipped.
 * @param debug - When truthy each result carries a `breakdown` object.
 * @returns New array of ranked pages sorted by finalScore, highest first.
 */
function rankPageHits(pageHits, config, query, debug) {
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
  const titleMatchWeight = config.ranking.weights.titleMatch;
  const ranked = [];
  for (const hit of pageHits) {
    const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
    // A weight of 0 removes the page from the results entirely.
    if (pageWeight === 0) continue;

    const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    let score = baseScore;

    let incomingLinkBoostValue = 0;
    if (config.ranking.enableIncomingLinkBoost) {
      const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
      incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
      score += incomingLinkBoostValue;
    }

    let depthBoostValue = 0;
    if (config.ranking.enableDepthBoost) {
      const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
      depthBoostValue = depthBoost * config.ranking.weights.depth;
      score += depthBoostValue;
    }

    let titleMatchBoostValue = 0;
    if (normalizedQuery && titleMatchWeight > 0) {
      const normalizedTitle = normalizeForTitleMatch(hit.title);
      // Either string containing the other counts as a title match.
      if (normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
        titleMatchBoostValue = titleMatchWeight;
        score += titleMatchBoostValue;
      }
    }

    let freshnessBoostValue = 0;
    if (config.ranking.enableFreshnessBoost) {
      const publishedAt = hit.publishedAt;
      if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
        // 864e5 ms = one day; future timestamps are clamped to "today".
        const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
        const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
        freshnessBoostValue = decay * config.ranking.weights.freshness;
        score += freshnessBoostValue;
      }
    }

    if (pageWeight !== 1) {
      score *= pageWeight;
    }

    const result = {
      url: hit.url,
      title: hit.title,
      description: hit.description,
      routeFile: hit.routeFile,
      depth: hit.depth,
      incomingLinks: hit.incomingLinks,
      tags: hit.tags,
      baseScore,
      finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
      publishedAt: hit.publishedAt
    };
    if (debug) {
      result.breakdown = {
        baseScore,
        pageWeight,
        incomingLinkBoost: incomingLinkBoostValue,
        depthBoost: depthBoostValue,
        titleMatchBoost: titleMatchBoostValue,
        freshnessBoost: freshnessBoostValue
      };
    }
    ranked.push(result);
  }
  return ranked.sort((a, b) => {
    const delta = b.finalScore - a.finalScore;
    // -Infinity minus -Infinity is NaN; treat such pairs as equal.
    return Number.isNaN(delta) ? 0 : delta;
  });
}
|
|
17624
|
-
|
|
17625
|
-
|
|
17626
|
-
|
|
17627
|
-
|
|
17628
|
-
|
|
17629
|
-
|
|
17630
|
-
|
|
17631
|
-
|
|
17632
|
-
|
|
17633
|
-
}
|
|
17634
|
-
|
|
17635
|
-
|
|
17636
|
-
|
|
17637
|
-
|
|
17638
|
-
|
|
17639
|
-
|
|
17640
|
-
|
|
17641
|
-
|
|
17642
|
-
|
|
17643
|
-
|
|
17644
|
-
|
|
17645
|
-
|
|
17646
|
-
|
|
17647
|
-
|
|
17648
|
-
|
|
17649
|
-
|
|
17650
|
-
|
|
17651
|
-
|
|
17652
|
-
|
|
17653
|
-
|
|
17654
|
-
|
|
18087
|
+
/**
 * Cut the low-scoring tail off an already sorted list of ranked pages.
 *
 * Step 1 (minScoreRatio > 0 and a finite, positive top score): keep only
 * entries whose finalScore is at least topScore * minScoreRatio.
 * Step 2 (scoreGapThreshold > 0): walk neighbouring pairs and truncate
 * before the first entry whose relative drop from its predecessor reaches
 * the threshold; pairs whose predecessor score is not positive are skipped.
 *
 * @param results - Ranked pages ordered by finalScore descending.
 * @param config - Reads `ranking.scoreGapThreshold` and `ranking.minScoreRatio`.
 * @returns The same array when nothing was removed, otherwise a new one.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const gapLimit = config.ranking.scoreGapThreshold;
  const ratio = config.ranking.minScoreRatio;

  let kept = results;
  if (ratio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * ratio;
      kept = kept.filter((entry) => entry.finalScore >= floor);
    }
  }

  if (!(gapLimit > 0) || kept.length <= 1) return kept;

  const cut = kept.findIndex((entry, index) => {
    if (index === 0) return false;
    const before = kept[index - 1].finalScore;
    return before > 0 && (before - entry.finalScore) / before >= gapLimit;
  });
  return cut === -1 ? kept : kept.slice(0, cut);
}
|
|
18112
|
+
|
|
18113
|
+
// src/search/related-pages.ts
|
|
18114
|
+
/**
 * Path-similarity score between two URLs, based on their shared leading
 * path segments: 2 * sharedPrefixLength / (segmentsA + segmentsB).
 *
 * Two segment-less paths score 1; if only one side has no segments the
 * score is 0. Comparison stops at the first differing segment.
 */
function diceScore(urlA, urlB) {
  const partsA = urlA.split("/").filter((part) => part !== "");
  const partsB = urlB.split("/").filter((part) => part !== "");
  const total = partsA.length + partsB.length;
  if (total === 0) return 1;
  if (partsA.length === 0 || partsB.length === 0) return 0;

  const limit = Math.min(partsA.length, partsB.length);
  let prefix = 0;
  while (prefix < limit && partsA[prefix] === partsB[prefix]) {
    prefix += 1;
  }
  return 2 * prefix / total;
}
|
|
18130
|
+
/**
 * Blend the three relatedness signals into one score:
 * 0.5 when the pages are linked, plus 0.3 * dice, plus 0.2 * semantic.
 */
function compositeScore(isLinked, dice, semantic) {
  const linkPart = isLinked ? 0.5 : 0;
  const pathPart = 0.3 * dice;
  const semanticPart = 0.2 * semantic;
  // Summation order is kept left-to-right so floating-point results match.
  return linkPart + pathPart + semanticPart;
}
|
|
18133
|
+
/**
 * Pick the single label that best describes how two pages relate.
 * Precedence: outgoing link, then incoming link, then "sibling" when the
 * path score exceeds 0.4, otherwise "semantic".
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  return isOutgoing
    ? "outgoing_link"
    : isIncoming
      ? "incoming_link"
      : dice > 0.4
        ? "sibling"
        : "semantic";
}
|
|
18139
|
+
|
|
18140
|
+
// src/utils/structured-meta.ts
|
|
18141
|
+
// Identifier-shaped keys only: a letter or underscore followed by letters,
// digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
/**
 * Report whether `key` is acceptable as a structured-meta key, i.e. whether
 * it matches VALID_KEY_RE in full.
 */
function validateMetaKey(key) {
  const matched = VALID_KEY_RE.exec(key);
  return matched !== null;
}
|
|
18145
|
+
/**
 * Convert the raw text of a structured-meta value into the declared type.
 *
 * - "number" / "date": the numeric value when `content` parses to a finite
 *   number, otherwise `content` unchanged. Blank or whitespace-only text is
 *   treated as unparseable, because Number("") and Number("  ") both
 *   evaluate to 0, which would otherwise turn a missing value into 0 (or the
 *   epoch, for dates).
 * - "boolean": true only for the exact text "true".
 * - "string[]": comma-separated list, each entry trimmed; entries that are
 *   empty after trimming (from stray or trailing commas) are dropped.
 * - anything else: `content` unchanged.
 *
 * @param content - Raw attribute text.
 * @param dataType - One of "number", "boolean", "string[]", "date", or other.
 * @returns The parsed value as described above.
 */
function parseMetaValue(content, dataType) {
  const toFiniteNumber = (raw) => {
    if (typeof raw === "string" && raw.trim() === "") return raw;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : raw;
  };
  switch (dataType) {
    case "number":
      return toFiniteNumber(content);
    case "boolean":
      return content === "true";
    case "string[]":
      return content ? content.split(",").map((s) => s.trim()).filter((s) => s.length > 0) : [];
    case "date":
      return toFiniteNumber(content);
    default:
      return content;
  }
}
|
|
18163
|
+
/**
 * Escape a string for use inside a single-quoted filter literal by doubling
 * every single quote.
 */
function escapeFilterValue(s) {
  const pieces = s.split("'");
  return pieces.join("''");
}
|
|
18166
|
+
/**
 * Turn a `{ key: value }` filter object into a filter expression over the
 * `meta.*` fields, with clauses joined by " AND ".
 *
 * Keys rejected by validateMetaKey() are skipped. String values become a
 * `CONTAINS '<escaped>'` clause; every other value is rendered as
 * `= <value>`.
 *
 * @param filters - Object mapping meta keys to string, number or boolean.
 * @returns The combined expression, or "" when no clause was produced.
 */
function buildMetaFilterString(filters) {
  return Object.entries(filters)
    .filter(([name]) => validateMetaKey(name))
    .map(([name, raw]) => {
      const field = `meta.${name}`;
      return typeof raw === "string"
        ? `${field} CONTAINS '${escapeFilterValue(raw)}'`
        : `${field} = ${raw}`;
    })
    .join(" AND ");
}
|
|
18181
|
+
|
|
18182
|
+
// src/search/engine.ts
|
|
18183
|
+
// Per-request ranking overrides. The search entry point only applies them
// when the request also sets `debug`. Every key is optional.
// The freshness and anchor-text knobs are listed because the ranking
// functions read them from config.ranking; keys missing from a zod object
// schema are stripped on parse, which would make them impossible to override.
var rankingOverridesSchema = zod.z.object({
  ranking: zod.z.object({
    enableIncomingLinkBoost: zod.z.boolean().optional(),
    enableDepthBoost: zod.z.boolean().optional(),
    enableFreshnessBoost: zod.z.boolean().optional(),
    enableAnchorTextBoost: zod.z.boolean().optional(),
    // NOTE(review): lower bound of 0 assumed; align with the main config schema.
    freshnessDecayRate: zod.z.number().min(0).optional(),
    aggregationCap: zod.z.number().int().positive().optional(),
    aggregationDecay: zod.z.number().min(0).max(1).optional(),
    minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
    minScoreRatio: zod.z.number().min(0).max(1).optional(),
    scoreGapThreshold: zod.z.number().min(0).max(1).optional(),
    weights: zod.z.object({
      incomingLinks: zod.z.number().optional(),
      depth: zod.z.number().optional(),
      aggregation: zod.z.number().optional(),
      titleMatch: zod.z.number().optional(),
      freshness: zod.z.number().optional(),
      anchorText: zod.z.number().optional()
    }).optional()
  }).optional(),
  search: zod.z.object({
    pageSearchWeight: zod.z.number().min(0).max(1).optional()
  }).optional()
}).optional();
// Shape of a search request: `q` is required and must be non-empty after
// trimming; topK is capped at 100 and maxSubResults at 20.
var requestSchema = zod.z.object({
  q: zod.z.string().trim().min(1),
  topK: zod.z.number().int().positive().max(100).optional(),
  scope: zod.z.string().optional(),
  pathPrefix: zod.z.string().optional(),
  tags: zod.z.array(zod.z.string()).optional(),
  filters: zod.z.record(zod.z.string(), zod.z.union([zod.z.string(), zod.z.number(), zod.z.boolean()])).optional(),
  groupBy: zod.z.enum(["page", "chunk"]).optional(),
  maxSubResults: zod.z.number().int().positive().max(20).optional(),
  debug: zod.z.boolean().optional(),
  rankingOverrides: rankingOverridesSchema
});
var MAX_SITE_STRUCTURE_PAGES = 2e3;
|
|
18216
|
+
/**
 * Create an empty site-structure node for `url` at `depth`: no title or
 * route file, not marked as indexed, and no children.
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
|
|
18219
|
+
/**
 * Build a URL-path tree from a flat list of pages.
 *
 * Every path segment gets a node; nodes that correspond to an actual page
 * are marked `isIndexed` and receive that page's title and routeFile.
 * Children are sorted by URL and `childCount` holds the number of direct
 * children.
 *
 * Nodes are stored and looked up under one canonical key,
 * "/" + segments.join("/"). Looking them up by the raw normalized URL would
 * miss (and then throw on the undefined node) whenever normalizeUrlPath()
 * returns anything not already in that exact form, e.g. a trailing slash.
 *
 * @param pages - Items exposing `url`, `title` and `routeFile`.
 * @param pathPrefix - Optional path; when given, the subtree rooted there is
 *   returned, or an empty placeholder node if no such node exists.
 * @returns The root node ("/"), or the requested subtree root.
 */
function buildTree(pages, pathPrefix) {
  const toSegments = (urlPath) => urlPath.split("/").filter(Boolean);
  const toKey = (segments) => "/" + segments.join("/");
  const nodeMap = /* @__PURE__ */ new Map();
  const root2 = makeNode("/", 0);
  nodeMap.set("/", root2);
  for (const page of pages) {
    const segments = toSegments(normalizeUrlPath(page.url));
    if (segments.length === 0) {
      root2.title = page.title;
      root2.routeFile = page.routeFile;
      root2.isIndexed = true;
      continue;
    }
    // Make sure every ancestor path has a node (created unindexed).
    for (let i = 1; i <= segments.length; i++) {
      const partialUrl = toKey(segments.slice(0, i));
      if (!nodeMap.has(partialUrl)) {
        nodeMap.set(partialUrl, makeNode(partialUrl, i));
      }
    }
    const node = nodeMap.get(toKey(segments));
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  }
  // Attach each node to its parent; the parent always exists because
  // ancestors are created above, with the root as a defensive fallback.
  for (const [url, node] of nodeMap) {
    if (url === "/") continue;
    const segments = toSegments(url);
    const parentUrl = segments.length === 1 ? "/" : toKey(segments.slice(0, -1));
    const parent = nodeMap.get(parentUrl) ?? root2;
    parent.children.push(node);
  }
  const sortAndCount = (node) => {
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      sortAndCount(child);
    }
  };
  sortAndCount(root2);
  if (pathPrefix) {
    const normalizedPrefix = normalizeUrlPath(pathPrefix);
    const prefixSegments = toSegments(normalizedPrefix);
    const subtreeRoot = nodeMap.get(toKey(prefixSegments));
    if (subtreeRoot) {
      return subtreeRoot;
    }
    return makeNode(normalizedPrefix, prefixSegments.length);
  }
  return root2;
}
|
|
18268
|
+
/**
 * Produce a copy of `base` with the `search`, `ranking` and
 * `ranking.weights` sections overlaid by `overrides`.
 *
 * Override keys whose value is `undefined` are ignored. Spreading them
 * directly would replace a configured value with `undefined` (optional
 * schema keys can carry an explicit `undefined`), silently disabling a
 * boost or feeding NaN into the score arithmetic.
 *
 * @param base - Full configuration; must have `search`, `ranking` and
 *   `ranking.weights`. Not mutated.
 * @param overrides - Partial `{ search?, ranking?: { ..., weights? } }`.
 * @returns A new configuration object.
 */
function mergeRankingOverrides(base, overrides) {
  const definedOnly = (source) => Object.fromEntries(
    Object.entries(source ?? {}).filter(([, value]) => value !== void 0)
  );
  return {
    ...base,
    search: {
      ...base.search,
      ...definedOnly(overrides.search)
    },
    ranking: {
      ...base.ranking,
      ...definedOnly(overrides.ranking),
      // Weights are merged key by key, never replaced wholesale.
      weights: {
        ...base.ranking.weights,
        ...definedOnly(overrides.ranking?.weights)
      }
    }
  };
}
|
|
18285
|
+
var SearchEngine = class _SearchEngine {
|
|
18286
|
+
cwd;
|
|
18287
|
+
config;
|
|
18288
|
+
store;
|
|
18289
|
+
constructor(options) {
|
|
18290
|
+
this.cwd = options.cwd;
|
|
18291
|
+
this.config = options.config;
|
|
18292
|
+
this.store = options.store;
|
|
18293
|
+
}
|
|
18294
|
+
static async create(options = {}) {
|
|
18295
|
+
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
18296
|
+
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
18297
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
18298
|
+
return new _SearchEngine({
|
|
18299
|
+
cwd,
|
|
18300
|
+
config,
|
|
18301
|
+
store
|
|
18302
|
+
});
|
|
18303
|
+
}
|
|
18304
|
+
getConfig() {
|
|
18305
|
+
return this.config;
|
|
17655
18306
|
}
|
|
17656
18307
|
async search(request) {
|
|
17657
18308
|
const parsed = requestSchema.safeParse(request);
|
|
@@ -17660,125 +18311,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
17660
18311
|
}
|
|
17661
18312
|
const input = parsed.data;
|
|
17662
18313
|
const totalStart = process.hrtime.bigint();
|
|
18314
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
17663
18315
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
17664
18316
|
const topK = input.topK ?? 10;
|
|
18317
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
17665
18318
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17666
|
-
const
|
|
17667
|
-
const
|
|
17668
|
-
|
|
17669
|
-
|
|
17670
|
-
|
|
17671
|
-
|
|
17672
|
-
|
|
17673
|
-
|
|
17674
|
-
|
|
18319
|
+
const queryText = input.q;
|
|
18320
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
18321
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
18322
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
18323
|
+
const metaFilter = metaFilterStr || void 0;
|
|
18324
|
+
const applyPagePostFilters = (hits) => {
|
|
18325
|
+
let filtered = hits;
|
|
18326
|
+
if (pathPrefix) {
|
|
18327
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
18328
|
+
}
|
|
18329
|
+
if (filterTags) {
|
|
18330
|
+
filtered = filtered.filter(
|
|
18331
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
18332
|
+
);
|
|
17675
18333
|
}
|
|
17676
|
-
|
|
17677
|
-
|
|
17678
|
-
const
|
|
18334
|
+
return filtered;
|
|
18335
|
+
};
|
|
18336
|
+
const applyChunkPostFilters = (hits) => {
|
|
18337
|
+
let filtered = hits;
|
|
18338
|
+
if (filterTags) {
|
|
18339
|
+
filtered = filtered.filter(
|
|
18340
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18341
|
+
);
|
|
18342
|
+
}
|
|
18343
|
+
return filtered;
|
|
18344
|
+
};
|
|
17679
18345
|
const searchStart = process.hrtime.bigint();
|
|
17680
|
-
|
|
17681
|
-
|
|
17682
|
-
const
|
|
17683
|
-
const
|
|
17684
|
-
|
|
17685
|
-
|
|
17686
|
-
|
|
17687
|
-
|
|
17688
|
-
|
|
17689
|
-
|
|
17690
|
-
|
|
17691
|
-
|
|
17692
|
-
|
|
17693
|
-
|
|
17694
|
-
|
|
17695
|
-
|
|
17696
|
-
|
|
17697
|
-
{
|
|
17698
|
-
limit: chunkLimit,
|
|
17699
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17700
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17701
|
-
reranking: false,
|
|
17702
|
-
filter
|
|
17703
|
-
},
|
|
18346
|
+
if (groupByPage) {
|
|
18347
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
18348
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
18349
|
+
const pageHits = await this.store.searchPagesByText(
|
|
18350
|
+
queryText,
|
|
18351
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
18352
|
+
resolvedScope
|
|
18353
|
+
);
|
|
18354
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
18355
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
18356
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
18357
|
+
const topPages = rankedPages.slice(0, topK);
|
|
18358
|
+
const chunkPromises = topPages.map(
|
|
18359
|
+
(page) => this.store.searchChunksByUrl(
|
|
18360
|
+
queryText,
|
|
18361
|
+
page.url,
|
|
18362
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
17704
18363
|
resolvedScope
|
|
17705
|
-
)
|
|
17706
|
-
|
|
17707
|
-
const
|
|
17708
|
-
|
|
18364
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
18365
|
+
);
|
|
18366
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
18367
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18368
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
18369
|
+
return {
|
|
18370
|
+
q: input.q,
|
|
18371
|
+
scope: resolvedScope.scopeName,
|
|
18372
|
+
results,
|
|
18373
|
+
meta: {
|
|
18374
|
+
timingsMs: {
|
|
18375
|
+
search: Math.round(searchMs),
|
|
18376
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18377
|
+
}
|
|
18378
|
+
}
|
|
18379
|
+
};
|
|
17709
18380
|
} else {
|
|
18381
|
+
const candidateK = Math.max(50, topK);
|
|
18382
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
17710
18383
|
const hits = await this.store.search(
|
|
17711
|
-
|
|
17712
|
-
{
|
|
17713
|
-
limit: candidateK,
|
|
17714
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17715
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17716
|
-
reranking: this.config.search.reranking,
|
|
17717
|
-
filter
|
|
17718
|
-
},
|
|
18384
|
+
queryText,
|
|
18385
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
17719
18386
|
resolvedScope
|
|
17720
18387
|
);
|
|
17721
|
-
|
|
17722
|
-
|
|
17723
|
-
|
|
17724
|
-
|
|
17725
|
-
|
|
17726
|
-
|
|
17727
|
-
|
|
17728
|
-
|
|
17729
|
-
|
|
17730
|
-
|
|
17731
|
-
|
|
17732
|
-
|
|
18388
|
+
let filtered = hits;
|
|
18389
|
+
if (pathPrefix) {
|
|
18390
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
18391
|
+
}
|
|
18392
|
+
if (filterTags) {
|
|
18393
|
+
filtered = filtered.filter(
|
|
18394
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18395
|
+
);
|
|
18396
|
+
}
|
|
18397
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
18398
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18399
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
18400
|
+
return {
|
|
18401
|
+
q: input.q,
|
|
18402
|
+
scope: resolvedScope.scopeName,
|
|
18403
|
+
results,
|
|
18404
|
+
meta: {
|
|
18405
|
+
timingsMs: {
|
|
18406
|
+
search: Math.round(searchMs),
|
|
18407
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18408
|
+
}
|
|
17733
18409
|
}
|
|
18410
|
+
};
|
|
18411
|
+
}
|
|
18412
|
+
}
|
|
18413
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
18414
|
+
return rankedPages.map((page, i) => {
|
|
18415
|
+
const chunks = allChunks[i] ?? [];
|
|
18416
|
+
const bestChunk = chunks[0];
|
|
18417
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
18418
|
+
const result = {
|
|
18419
|
+
url: page.url,
|
|
18420
|
+
title: page.title,
|
|
18421
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
18422
|
+
snippet,
|
|
18423
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
18424
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
18425
|
+
routeFile: page.routeFile,
|
|
18426
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
18427
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
18428
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
18429
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
18430
|
+
headingPath: c.metadata.headingPath,
|
|
18431
|
+
score: Number(c.score.toFixed(6))
|
|
18432
|
+
})) : void 0
|
|
18433
|
+
};
|
|
18434
|
+
if (debug && page.breakdown) {
|
|
18435
|
+
result.breakdown = {
|
|
18436
|
+
baseScore: page.breakdown.baseScore,
|
|
18437
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
18438
|
+
depthBoost: page.breakdown.depthBoost,
|
|
18439
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
18440
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
18441
|
+
anchorTextMatchBoost: 0
|
|
18442
|
+
};
|
|
17734
18443
|
}
|
|
17735
|
-
|
|
18444
|
+
return result;
|
|
18445
|
+
});
|
|
17736
18446
|
}
|
|
17737
|
-
ensureSnippet(hit) {
|
|
18447
|
+
ensureSnippet(hit, query) {
|
|
18448
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
18449
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
17738
18450
|
const snippet = hit.hit.metadata.snippet;
|
|
17739
18451
|
if (snippet && snippet.length >= 30) return snippet;
|
|
17740
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
17741
18452
|
if (chunkText) return toSnippet(chunkText);
|
|
17742
18453
|
return snippet || "";
|
|
17743
18454
|
}
|
|
17744
|
-
buildResults(ordered, topK, groupByPage,
|
|
18455
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
18456
|
+
const cfg = config ?? this.config;
|
|
17745
18457
|
if (groupByPage) {
|
|
17746
|
-
let pages = aggregateByPage(ordered,
|
|
17747
|
-
pages = trimByScoreGap(pages,
|
|
17748
|
-
const minRatio =
|
|
18458
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
18459
|
+
pages = trimByScoreGap(pages, cfg);
|
|
18460
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
17749
18461
|
return pages.slice(0, topK).map((page) => {
|
|
17750
18462
|
const bestScore = page.bestChunk.finalScore;
|
|
17751
18463
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
17752
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
17753
|
-
|
|
18464
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
18465
|
+
const result = {
|
|
17754
18466
|
url: page.url,
|
|
17755
18467
|
title: page.title,
|
|
17756
18468
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
17757
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
18469
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
18470
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
17758
18471
|
score: Number(page.pageScore.toFixed(6)),
|
|
17759
18472
|
routeFile: page.routeFile,
|
|
17760
|
-
chunks: meaningful.length
|
|
18473
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
17761
18474
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
17762
|
-
snippet: this.ensureSnippet(c),
|
|
18475
|
+
snippet: this.ensureSnippet(c, query),
|
|
18476
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
17763
18477
|
headingPath: c.hit.metadata.headingPath,
|
|
17764
18478
|
score: Number(c.finalScore.toFixed(6))
|
|
17765
18479
|
})) : void 0
|
|
17766
18480
|
};
|
|
18481
|
+
if (debug && page.bestChunk.breakdown) {
|
|
18482
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
18483
|
+
}
|
|
18484
|
+
return result;
|
|
17767
18485
|
});
|
|
17768
18486
|
} else {
|
|
17769
18487
|
let filtered = ordered;
|
|
17770
|
-
const
|
|
17771
|
-
if (
|
|
17772
|
-
|
|
17773
|
-
|
|
17774
|
-
|
|
17775
|
-
|
|
17776
|
-
|
|
17777
|
-
|
|
17778
|
-
|
|
17779
|
-
|
|
17780
|
-
|
|
17781
|
-
|
|
18488
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
18489
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
18490
|
+
const topScore = ordered[0].finalScore;
|
|
18491
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
18492
|
+
const threshold = topScore * minScoreRatio;
|
|
18493
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
18494
|
+
}
|
|
18495
|
+
}
|
|
18496
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
18497
|
+
const result = {
|
|
18498
|
+
url: hit.metadata.url,
|
|
18499
|
+
title: hit.metadata.title,
|
|
18500
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
18501
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
18502
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
18503
|
+
score: Number(finalScore.toFixed(6)),
|
|
18504
|
+
routeFile: hit.metadata.routeFile
|
|
18505
|
+
};
|
|
18506
|
+
if (debug && breakdown) {
|
|
18507
|
+
result.breakdown = breakdown;
|
|
18508
|
+
}
|
|
18509
|
+
return result;
|
|
18510
|
+
});
|
|
17782
18511
|
}
|
|
17783
18512
|
}
|
|
17784
18513
|
async getPage(pathOrUrl, scope) {
|
|
@@ -17804,6 +18533,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
17804
18533
|
markdown: page.markdown
|
|
17805
18534
|
};
|
|
17806
18535
|
}
|
|
18536
|
+
async listPages(opts) {
|
|
18537
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18538
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
18539
|
+
return this.store.listPages(resolvedScope, {
|
|
18540
|
+
cursor: opts?.cursor,
|
|
18541
|
+
limit: opts?.limit,
|
|
18542
|
+
pathPrefix
|
|
18543
|
+
});
|
|
18544
|
+
}
|
|
18545
|
+
async getSiteStructure(opts) {
|
|
18546
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
18547
|
+
const allPages = [];
|
|
18548
|
+
let cursor;
|
|
18549
|
+
let truncated = false;
|
|
18550
|
+
do {
|
|
18551
|
+
const result = await this.listPages({
|
|
18552
|
+
pathPrefix: opts?.pathPrefix,
|
|
18553
|
+
scope: opts?.scope,
|
|
18554
|
+
cursor,
|
|
18555
|
+
limit: 200
|
|
18556
|
+
});
|
|
18557
|
+
allPages.push(...result.pages);
|
|
18558
|
+
cursor = result.nextCursor;
|
|
18559
|
+
if (allPages.length >= maxPages) {
|
|
18560
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
18561
|
+
allPages.length = maxPages;
|
|
18562
|
+
break;
|
|
18563
|
+
}
|
|
18564
|
+
} while (cursor);
|
|
18565
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
18566
|
+
return {
|
|
18567
|
+
root: root2,
|
|
18568
|
+
totalPages: allPages.length,
|
|
18569
|
+
truncated
|
|
18570
|
+
};
|
|
18571
|
+
}
|
|
18572
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
18573
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18574
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
18575
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
18576
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
18577
|
+
if (!source) {
|
|
18578
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
18579
|
+
}
|
|
18580
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
18581
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
18582
|
+
source.vector,
|
|
18583
|
+
{ limit: 50 },
|
|
18584
|
+
resolvedScope
|
|
18585
|
+
);
|
|
18586
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
18587
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
18588
|
+
for (const hit of filteredHits) {
|
|
18589
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
18590
|
+
}
|
|
18591
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
18592
|
+
for (const hit of filteredHits) {
|
|
18593
|
+
candidateUrls.add(hit.url);
|
|
18594
|
+
}
|
|
18595
|
+
for (const url of sourceOutgoing) {
|
|
18596
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
18597
|
+
}
|
|
18598
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
18599
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
18600
|
+
);
|
|
18601
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
18602
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
18603
|
+
for (const hit of filteredHits) {
|
|
18604
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
18605
|
+
}
|
|
18606
|
+
for (const p of fetchedPages) {
|
|
18607
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
18608
|
+
}
|
|
18609
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
18610
|
+
if (semanticUrls.length > 0) {
|
|
18611
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
18612
|
+
for (const p of semanticPageData) {
|
|
18613
|
+
const existing = metaMap.get(p.url);
|
|
18614
|
+
if (existing) {
|
|
18615
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
18616
|
+
}
|
|
18617
|
+
}
|
|
18618
|
+
}
|
|
18619
|
+
const candidates = [];
|
|
18620
|
+
for (const url of candidateUrls) {
|
|
18621
|
+
const meta = metaMap.get(url);
|
|
18622
|
+
if (!meta) continue;
|
|
18623
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
18624
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
18625
|
+
const isLinked = isOutgoing || isIncoming;
|
|
18626
|
+
const dice = diceScore(urlPath, url);
|
|
18627
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
18628
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
18629
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
18630
|
+
candidates.push({
|
|
18631
|
+
url,
|
|
18632
|
+
title: meta.title,
|
|
18633
|
+
score: Number(score.toFixed(6)),
|
|
18634
|
+
relationshipType,
|
|
18635
|
+
routeFile: meta.routeFile
|
|
18636
|
+
});
|
|
18637
|
+
}
|
|
18638
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
18639
|
+
const results = candidates.slice(0, topK);
|
|
18640
|
+
return {
|
|
18641
|
+
sourceUrl: urlPath,
|
|
18642
|
+
scope: resolvedScope.scopeName,
|
|
18643
|
+
relatedPages: results
|
|
18644
|
+
};
|
|
18645
|
+
}
|
|
17807
18646
|
async health() {
|
|
17808
18647
|
return this.store.health();
|
|
17809
18648
|
}
|
|
@@ -17819,6 +18658,215 @@ var SearchEngine = class _SearchEngine {
|
|
|
17819
18658
|
}
|
|
17820
18659
|
};
|
|
17821
18660
|
|
|
18661
|
+
// src/mcp/server.ts
|
|
18662
|
+
/**
 * Create an MCP server exposing the search engine as six tools:
 * `search`, `get_page`, `list_pages`, `get_site_structure`,
 * `find_source_file` and `get_related_pages`.
 *
 * Every tool returns its payload as one JSON text content item; `search`
 * also returns the payload as `structuredContent`, described by its
 * `outputSchema`.
 *
 * Changes from the previous version:
 * - The `search` output schema now declares the optional `chunkText` field
 *   on results and on sub-chunks. The engine emits it and the tool
 *   description advertises it, but the schema did not list it.
 * - The repeated text-content envelope is built by one local helper.
 *
 * @param {object} engine - Object providing `search`, `getPage`, `listPages`,
 *   `getSiteStructure` and `getRelatedPages`.
 * @returns {object} The configured `McpServer` instance.
 */
function createServer(engine) {
  const { z } = zod;

  // Wrap a payload in the text-content envelope. Leaving `indent` undefined
  // produces compact JSON, identical to calling JSON.stringify(payload).
  const asJsonText = (payload, indent) => ({
    content: [
      {
        type: "text",
        text: JSON.stringify(payload, null, indent)
      }
    ]
  });

  const server = new mcp_js.McpServer({
    name: "searchsocket-mcp",
    version: "0.1.0"
  });

  server.registerTool(
    "search",
    {
      description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(100).optional(),
        pathPrefix: z.string().optional(),
        tags: z.array(z.string()).optional(),
        filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
        groupBy: z.enum(["page", "chunk"]).optional(),
        maxSubResults: z.number().int().positive().max(20).optional()
      },
      outputSchema: {
        q: z.string(),
        scope: z.string(),
        results: z.array(z.object({
          url: z.string(),
          title: z.string(),
          sectionTitle: z.string().optional(),
          snippet: z.string(),
          // Optional: the engine omits it when a hit has no chunk text.
          chunkText: z.string().optional(),
          score: z.number(),
          routeFile: z.string(),
          chunks: z.array(z.object({
            sectionTitle: z.string().optional(),
            snippet: z.string(),
            chunkText: z.string().optional(),
            headingPath: z.array(z.string()),
            score: z.number()
          })).optional()
        })),
        meta: z.object({
          timingsMs: z.object({
            search: z.number(),
            total: z.number()
          })
        })
      }
    },
    async (input) => {
      const result = await engine.search({
        q: input.query,
        topK: input.topK,
        scope: input.scope,
        pathPrefix: input.pathPrefix,
        tags: input.tags,
        filters: input.filters,
        groupBy: input.groupBy,
        maxSubResults: input.maxSubResults
      });
      return {
        ...asJsonText(result, 2),
        structuredContent: result
      };
    }
  );

  server.registerTool(
    "get_page",
    {
      description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const page = await engine.getPage(input.pathOrUrl, input.scope);
      return asJsonText(page, 2);
    }
  );

  server.registerTool(
    "list_pages",
    {
      description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        cursor: z.string().optional(),
        limit: z.number().int().positive().max(200).optional(),
        scope: z.string().optional()
      }
    },
    async (input) => {
      const result = await engine.listPages({
        pathPrefix: input.pathPrefix,
        cursor: input.cursor,
        limit: input.limit,
        scope: input.scope
      });
      return asJsonText(result, 2);
    }
  );

  server.registerTool(
    "get_site_structure",
    {
      description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
      inputSchema: {
        pathPrefix: z.string().optional(),
        scope: z.string().optional(),
        maxPages: z.number().int().positive().max(2e3).optional()
      }
    },
    async (input) => {
      const result = await engine.getSiteStructure({
        pathPrefix: input.pathPrefix,
        scope: input.scope,
        maxPages: input.maxPages
      });
      return asJsonText(result, 2);
    }
  );

  server.registerTool(
    "find_source_file",
    {
      description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
      inputSchema: {
        query: z.string().min(1),
        scope: z.string().optional()
      }
    },
    async (input) => {
      // Only the single best match is needed to locate the source file.
      const result = await engine.search({
        q: input.query,
        topK: 1,
        scope: input.scope
      });
      if (result.results.length === 0) {
        return asJsonText({
          error: "No matching content found for the given query."
        });
      }
      const match = result.results[0];
      const { url, routeFile, sectionTitle, snippet } = match;
      return asJsonText({ url, routeFile, sectionTitle, snippet });
    }
  );

  server.registerTool(
    "get_related_pages",
    {
      description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
      inputSchema: {
        pathOrUrl: z.string().min(1),
        scope: z.string().optional(),
        topK: z.number().int().positive().max(25).optional()
      }
    },
    async (input) => {
      const result = await engine.getRelatedPages(input.pathOrUrl, {
        topK: input.topK,
        scope: input.scope
      });
      return asJsonText(result, 2);
    }
  );

  return server;
}
|
|
18869
|
+
|
|
17822
18870
|
// src/sveltekit/handle.ts
|
|
17823
18871
|
var InMemoryRateLimiter = class {
|
|
17824
18872
|
constructor(windowMs, max) {
|
|
@@ -17847,7 +18895,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17847
18895
|
let enginePromise = null;
|
|
17848
18896
|
let configPromise = null;
|
|
17849
18897
|
let apiPath = options.path;
|
|
18898
|
+
let llmsServePath = null;
|
|
18899
|
+
let serveMarkdownVariants = false;
|
|
18900
|
+
let mcpPath;
|
|
18901
|
+
let mcpApiKey;
|
|
18902
|
+
let mcpEnableJsonResponse = true;
|
|
17850
18903
|
let rateLimiter = null;
|
|
18904
|
+
let notConfigured = false;
|
|
17851
18905
|
const getConfig = async () => {
|
|
17852
18906
|
if (!configPromise) {
|
|
17853
18907
|
let configP;
|
|
@@ -17864,6 +18918,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17864
18918
|
}
|
|
17865
18919
|
configPromise = configP.then((config) => {
|
|
17866
18920
|
apiPath = apiPath ?? config.api.path;
|
|
18921
|
+
mcpPath = config.mcp.handle.path;
|
|
18922
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
18923
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
18924
|
+
if (config.llmsTxt.enable) {
|
|
18925
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
18926
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
18927
|
+
}
|
|
17867
18928
|
if (config.api.rateLimit && !isServerless()) {
|
|
17868
18929
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
17869
18930
|
}
|
|
@@ -17873,59 +18934,109 @@ function searchsocketHandle(options = {}) {
|
|
|
17873
18934
|
return configPromise;
|
|
17874
18935
|
};
|
|
17875
18936
|
const getEngine = async () => {
|
|
18937
|
+
if (notConfigured) {
|
|
18938
|
+
throw new SearchSocketError(
|
|
18939
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18940
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18941
|
+
503
|
|
18942
|
+
);
|
|
18943
|
+
}
|
|
17876
18944
|
if (!enginePromise) {
|
|
17877
18945
|
const config = await getConfig();
|
|
17878
18946
|
enginePromise = SearchEngine.create({
|
|
17879
18947
|
cwd: options.cwd,
|
|
17880
18948
|
config
|
|
18949
|
+
}).catch((error) => {
|
|
18950
|
+
enginePromise = null;
|
|
18951
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
18952
|
+
notConfigured = true;
|
|
18953
|
+
throw new SearchSocketError(
|
|
18954
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18955
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18956
|
+
503
|
|
18957
|
+
);
|
|
18958
|
+
}
|
|
18959
|
+
throw error;
|
|
17881
18960
|
});
|
|
17882
18961
|
}
|
|
17883
18962
|
return enginePromise;
|
|
17884
18963
|
};
|
|
17885
18964
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
17886
18965
|
return async ({ event, resolve }) => {
|
|
17887
|
-
if (apiPath && event.url.pathname !==
|
|
17888
|
-
|
|
18966
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
18967
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
18968
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18969
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18970
|
+
}
|
|
18971
|
+
if (mcpPath) {
|
|
18972
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
18973
|
+
return resolve(event);
|
|
18974
|
+
}
|
|
18975
|
+
} else {
|
|
18976
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
18977
|
+
await getConfig();
|
|
18978
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18979
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18980
|
+
}
|
|
18981
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
18982
|
+
return resolve(event);
|
|
18983
|
+
}
|
|
18984
|
+
} else {
|
|
18985
|
+
return resolve(event);
|
|
18986
|
+
}
|
|
18987
|
+
}
|
|
17889
18988
|
}
|
|
17890
18989
|
const config = await getConfig();
|
|
18990
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
18991
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18992
|
+
const filePath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
|
|
18993
|
+
try {
|
|
18994
|
+
const content = await fs9__default.default.readFile(filePath, "utf8");
|
|
18995
|
+
return new Response(content, {
|
|
18996
|
+
status: 200,
|
|
18997
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
18998
|
+
});
|
|
18999
|
+
} catch {
|
|
19000
|
+
return resolve(event);
|
|
19001
|
+
}
|
|
19002
|
+
}
|
|
19003
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
19004
|
+
let rawPath;
|
|
19005
|
+
try {
|
|
19006
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
19007
|
+
} catch {
|
|
19008
|
+
return resolve(event);
|
|
19009
|
+
}
|
|
19010
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
19011
|
+
try {
|
|
19012
|
+
const engine = await getEngine();
|
|
19013
|
+
const page = await engine.getPage(rawPath, scope);
|
|
19014
|
+
return new Response(page.markdown, {
|
|
19015
|
+
status: 200,
|
|
19016
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
19017
|
+
});
|
|
19018
|
+
} catch (error) {
|
|
19019
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
19020
|
+
return resolve(event);
|
|
19021
|
+
}
|
|
19022
|
+
throw error;
|
|
19023
|
+
}
|
|
19024
|
+
}
|
|
19025
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
19026
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
19027
|
+
}
|
|
17891
19028
|
const targetPath = apiPath ?? config.api.path;
|
|
17892
|
-
if (event.url.pathname
|
|
19029
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
17893
19030
|
return resolve(event);
|
|
17894
19031
|
}
|
|
17895
|
-
|
|
19032
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
19033
|
+
const method = event.request.method;
|
|
19034
|
+
if (method === "OPTIONS") {
|
|
17896
19035
|
return new Response(null, {
|
|
17897
19036
|
status: 204,
|
|
17898
19037
|
headers: buildCorsHeaders(event.request, config)
|
|
17899
19038
|
});
|
|
17900
19039
|
}
|
|
17901
|
-
if (event.request.method !== "POST") {
|
|
17902
|
-
return withCors(
|
|
17903
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
17904
|
-
status: 405,
|
|
17905
|
-
headers: {
|
|
17906
|
-
"content-type": "application/json"
|
|
17907
|
-
}
|
|
17908
|
-
}),
|
|
17909
|
-
event.request,
|
|
17910
|
-
config
|
|
17911
|
-
);
|
|
17912
|
-
}
|
|
17913
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
17914
|
-
if (contentLength > bodyLimit) {
|
|
17915
|
-
return withCors(
|
|
17916
|
-
new Response(
|
|
17917
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
17918
|
-
{
|
|
17919
|
-
status: 413,
|
|
17920
|
-
headers: {
|
|
17921
|
-
"content-type": "application/json"
|
|
17922
|
-
}
|
|
17923
|
-
}
|
|
17924
|
-
),
|
|
17925
|
-
event.request,
|
|
17926
|
-
config
|
|
17927
|
-
);
|
|
17928
|
-
}
|
|
17929
19040
|
if (rateLimiter) {
|
|
17930
19041
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
17931
19042
|
if (!rateLimiter.check(ip)) {
|
|
@@ -17945,39 +19056,32 @@ function searchsocketHandle(options = {}) {
|
|
|
17945
19056
|
}
|
|
17946
19057
|
}
|
|
17947
19058
|
try {
|
|
17948
|
-
|
|
17949
|
-
|
|
17950
|
-
|
|
17951
|
-
} else {
|
|
17952
|
-
let parsedFallback;
|
|
17953
|
-
try {
|
|
17954
|
-
parsedFallback = await event.request.json();
|
|
17955
|
-
} catch (error) {
|
|
17956
|
-
if (error instanceof SyntaxError) {
|
|
17957
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
17958
|
-
}
|
|
17959
|
-
throw error;
|
|
19059
|
+
if (method === "GET") {
|
|
19060
|
+
if (subPath === "" || subPath === "/") {
|
|
19061
|
+
return await handleGetSearch(event, config, getEngine);
|
|
17960
19062
|
}
|
|
17961
|
-
|
|
17962
|
-
|
|
17963
|
-
|
|
17964
|
-
|
|
19063
|
+
if (subPath === "/health") {
|
|
19064
|
+
return await handleGetHealth(event, config, getEngine);
|
|
19065
|
+
}
|
|
19066
|
+
if (subPath.startsWith("/pages/")) {
|
|
19067
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
19068
|
+
}
|
|
19069
|
+
return withCors(
|
|
19070
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
19071
|
+
status: 404,
|
|
19072
|
+
headers: { "content-type": "application/json" }
|
|
19073
|
+
}),
|
|
19074
|
+
event.request,
|
|
19075
|
+
config
|
|
19076
|
+
);
|
|
17965
19077
|
}
|
|
17966
|
-
|
|
17967
|
-
|
|
17968
|
-
body = JSON.parse(rawBody);
|
|
17969
|
-
} catch {
|
|
17970
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
19078
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
19079
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
17971
19080
|
}
|
|
17972
|
-
const engine = await getEngine();
|
|
17973
|
-
const searchRequest = body;
|
|
17974
|
-
const result = await engine.search(searchRequest);
|
|
17975
19081
|
return withCors(
|
|
17976
|
-
new Response(JSON.stringify(
|
|
17977
|
-
status:
|
|
17978
|
-
headers: {
|
|
17979
|
-
"content-type": "application/json"
|
|
17980
|
-
}
|
|
19082
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
19083
|
+
status: 405,
|
|
19084
|
+
headers: { "content-type": "application/json" }
|
|
17981
19085
|
}),
|
|
17982
19086
|
event.request,
|
|
17983
19087
|
config
|
|
@@ -17998,6 +19102,183 @@ function searchsocketHandle(options = {}) {
|
|
|
17998
19102
|
}
|
|
17999
19103
|
};
|
|
18000
19104
|
}
|
|
19105
|
+
/**
 * Reports whether a request pathname addresses the search API: either the
 * API root itself or any path nested directly beneath it.
 *
 * @param {string} pathname - Pathname of the incoming request.
 * @param {string} apiPath - Configured API root path.
 * @returns {boolean} True when `pathname` equals `apiPath` or starts with `apiPath` followed by "/".
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  // Require the "/" separator so "/api/searchx" is not mistaken for "/api/search".
  return pathname.startsWith(`${apiPath}/`);
}
|
|
19108
|
+
/**
 * Handles `GET <apiPath>?q=...` by translating query-string parameters into a
 * search request and returning the engine's result as JSON.
 *
 * Recognised parameters: `q` (required), `topK`, `scope`, `pathPrefix`,
 * `groupBy` ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`.
 *
 * @param {object} event - Request event exposing `url.searchParams` and `request`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {Function} getEngine - Async factory resolving to an object with `search(request)`.
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when a parameter is missing or malformed.
 */
async function handleGetSearch(event, config, getEngine) {
  // Number.parseInt alone accepts a numeric prefix ("5abc" -> 5, "1.5" -> 1),
  // which contradicts the "must be a positive integer" contract. Require the
  // whole (trimmed) value to be decimal digits and to fit a safe integer.
  const readInteger = (raw) => {
    const text = raw.trim();
    if (!/^\d+$/.test(text)) return Number.NaN;
    const value = Number.parseInt(text, 10);
    return Number.isSafeInteger(value) ? value : Number.NaN;
  };

  const params = event.url.searchParams;
  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = params.get("topK");
  if (topK !== null) {
    const parsed = readInteger(topK);
    if (Number.isNaN(parsed) || parsed < 1) {
      throw new SearchSocketError("INVALID_REQUEST", "topK must be a positive integer", 400);
    }
    searchRequest.topK = parsed;
  }

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  // An empty `groupBy=` is treated the same as an absent parameter.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = params.get("maxSubResults");
  if (maxSubResults !== null) {
    const parsed = readInteger(maxSubResults);
    if (Number.isNaN(parsed) || parsed < 1 || parsed > 20) {
      throw new SearchSocketError("INVALID_REQUEST", "maxSubResults must be a positive integer between 1 and 20", 400);
    }
    searchRequest.maxSubResults = parsed;
  }

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
19155
|
+
/**
 * Handles `GET <apiPath>/health`: asks the engine for its health report and
 * returns it as a JSON response.
 *
 * @param {object} event - Request event; only `event.request` is used (for CORS).
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {Function} getEngine - Async factory resolving to an object with `health()`.
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 */
async function handleGetHealth(event, config, getEngine) {
  const engine = await getEngine();
  const report = await engine.health();
  const response = new Response(JSON.stringify(report), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19167
|
+
/**
 * Handles `GET <apiPath>/pages/<path>`: looks up a single indexed page and
 * returns it as JSON.
 *
 * @param {object} event - Request event exposing `url.searchParams` and `request`.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {Function} getEngine - Async factory resolving to an object with `getPage(path, scope)`.
 * @param {string} subPath - Request path relative to the API root, beginning with "/pages".
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the page path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Drop the "/pages" prefix but keep the slash that follows it, so the
  // decoded page path stays rooted at "/".
  const encodedPath = subPath.slice("/pages".length);

  let pagePath;
  try {
    pagePath = decodeURIComponent(encodedPath);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
  }

  const scope = event.url.searchParams?.get("scope") ?? void 0;
  const engine = await getEngine();
  const page = await engine.getPage(pagePath, scope);

  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19187
|
+
/**
 * Handles `POST <apiPath>`: reads a JSON search request from the body, enforces
 * the body size limit, runs the search and returns the result as JSON.
 *
 * @param {object} event - Request event exposing `request` (headers plus `text()` or `json()`).
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {Function} getEngine - Async factory resolving to an object with `search(request)`.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response wrapped by `withCors`.
 * @throws {SearchSocketError} INVALID_REQUEST with status 413 (body too large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const tooLarge = () => new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  const malformed = () => new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);

  // Cheap early rejection based on the declared length; the real size is
  // re-checked below because the header may be absent or inaccurate.
  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw tooLarge();
  }

  let rawBody;
  if (typeof event.request.text !== "function") {
    // Request object without text(): parse via json() and re-serialise so the
    // byte-size check below still applies.
    let parsedFallback;
    try {
      parsedFallback = await event.request.json();
    } catch (error) {
      throw error instanceof SyntaxError ? malformed() : error;
    }
    rawBody = JSON.stringify(parsedFallback);
  } else {
    rawBody = await event.request.text();
  }

  if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
    throw tooLarge();
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(rawBody);
  } catch {
    throw malformed();
  }

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19228
|
+
/**
 * Serves one MCP request over the web-standard streamable HTTP transport.
 *
 * When `apiKey` is set, the request must carry `Authorization: Bearer <apiKey>`;
 * otherwise a JSON-RPC "Unauthorized" error is returned with HTTP 401. A fresh
 * transport and server are created for every request (no session id generator).
 *
 * @param {object} event - Request event exposing `request`.
 * @param {string|undefined} apiKey - Expected bearer token; falsy disables the check.
 * @param {boolean} enableJsonResponse - Passed to the transport; when truthy the
 *   transport and server are closed as soon as the response has been produced.
 * @param {Function} getEngine - Async factory resolving to the search engine.
 * @returns {Promise<Response>} The transport's response, or a JSON-RPC error response (401 / 500).
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  // Builds a JSON-RPC 2.0 error envelope with a null id.
  const rpcFailure = (status, code, message) =>
    new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: { code, message },
        id: null
      }),
      { status, headers: { "content-type": "application/json" } }
    );

  if (apiKey) {
    const header = event.request.headers.get("authorization") ?? "";
    const presented = Buffer.from(header.startsWith("Bearer ") ? header.slice(7) : "");
    const expected = Buffer.from(apiKey);
    // timingSafeEqual throws on unequal lengths, so the length test must come first.
    const authorized = presented.length === expected.length && crypto.timingSafeEqual(presented, expected);
    if (!authorized) {
      return rpcFailure(401, -32001, "Unauthorized");
    }
  }

  const transport = new webStandardStreamableHttp_js.WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    server = createServer(await getEngine());
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort teardown: a failure while closing must not mask the original error.
    const closers = [() => transport.close(), () => server?.close()];
    for (const close of closers) {
      try {
        await close();
      } catch {
      }
    }
    return rpcFailure(500, -32603, error instanceof Error ? error.message : "Internal server error");
  }
}
|
|
18001
19282
|
function buildCorsHeaders(request, config) {
|
|
18002
19283
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
18003
19284
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -18010,7 +19291,7 @@ function buildCorsHeaders(request, config) {
|
|
|
18010
19291
|
}
|
|
18011
19292
|
return {
|
|
18012
19293
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
18013
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
19294
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
18014
19295
|
"access-control-allow-headers": "content-type"
|
|
18015
19296
|
};
|
|
18016
19297
|
}
|
|
@@ -18057,6 +19338,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18057
19338
|
if (normalizeText(current.text)) {
|
|
18058
19339
|
sections.push({
|
|
18059
19340
|
sectionTitle: current.sectionTitle,
|
|
19341
|
+
headingLevel: current.headingLevel,
|
|
18060
19342
|
headingPath: current.headingPath,
|
|
18061
19343
|
text: current.text.trim()
|
|
18062
19344
|
});
|
|
@@ -18075,6 +19357,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18075
19357
|
headingStack.length = level;
|
|
18076
19358
|
current = {
|
|
18077
19359
|
sectionTitle: title,
|
|
19360
|
+
headingLevel: level,
|
|
18078
19361
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
18079
19362
|
text: `${line}
|
|
18080
19363
|
`
|
|
@@ -18210,6 +19493,7 @@ function splitSection(section, config) {
|
|
|
18210
19493
|
return [
|
|
18211
19494
|
{
|
|
18212
19495
|
sectionTitle: section.sectionTitle,
|
|
19496
|
+
headingLevel: section.headingLevel,
|
|
18213
19497
|
headingPath: section.headingPath,
|
|
18214
19498
|
chunkText: text
|
|
18215
19499
|
}
|
|
@@ -18260,6 +19544,7 @@ ${chunk}`;
|
|
|
18260
19544
|
}
|
|
18261
19545
|
return merged.map((chunkText) => ({
|
|
18262
19546
|
sectionTitle: section.sectionTitle,
|
|
19547
|
+
headingLevel: section.headingLevel,
|
|
18263
19548
|
headingPath: section.headingPath,
|
|
18264
19549
|
chunkText
|
|
18265
19550
|
}));
|
|
@@ -18275,6 +19560,18 @@ function buildSummaryChunkText(page) {
|
|
|
18275
19560
|
}
|
|
18276
19561
|
return parts.join("\n\n");
|
|
18277
19562
|
}
|
|
19563
|
+
/**
 * Builds the heading-aware title string for a chunk, of the form
 * "<page title> — <heading path>".
 *
 * @param {object} chunk - Chunk with `title`, `sectionTitle`, `headingLevel` and `headingPath`.
 * @returns {string|undefined} The composed title, or undefined when the chunk
 *   has no section title or no heading level.
 */
function buildEmbeddingTitle(chunk) {
  const { title, sectionTitle, headingLevel, headingPath } = chunk;
  if (!sectionTitle || headingLevel === void 0) {
    return void 0;
  }

  let trail;
  if (headingPath.length > 1) {
    trail = [...headingPath];
    // Append the section title only when the heading path does not already end with it.
    if (headingPath[headingPath.length - 1] !== sectionTitle) {
      trail.push(sectionTitle);
    }
  } else {
    trail = [sectionTitle];
  }
  return `${title} \u2014 ${trail.join(" > ")}`;
}
|
|
18278
19575
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
18279
19576
|
if (!prependTitle) return chunk.chunkText;
|
|
18280
19577
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -18305,10 +19602,14 @@ function chunkPage(page, config, scope) {
|
|
|
18305
19602
|
tags: page.tags,
|
|
18306
19603
|
contentHash: "",
|
|
18307
19604
|
description: page.description,
|
|
18308
|
-
keywords: page.keywords
|
|
19605
|
+
keywords: page.keywords,
|
|
19606
|
+
publishedAt: page.publishedAt,
|
|
19607
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19608
|
+
meta: page.meta
|
|
18309
19609
|
};
|
|
18310
19610
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18311
|
-
|
|
19611
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19612
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
18312
19613
|
chunks.push(summaryChunk);
|
|
18313
19614
|
}
|
|
18314
19615
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -18325,6 +19626,7 @@ function chunkPage(page, config, scope) {
|
|
|
18325
19626
|
path: page.url,
|
|
18326
19627
|
title: page.title,
|
|
18327
19628
|
sectionTitle: entry.sectionTitle,
|
|
19629
|
+
headingLevel: entry.headingLevel,
|
|
18328
19630
|
headingPath: entry.headingPath,
|
|
18329
19631
|
chunkText: entry.chunkText,
|
|
18330
19632
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -18334,10 +19636,16 @@ function chunkPage(page, config, scope) {
|
|
|
18334
19636
|
tags: page.tags,
|
|
18335
19637
|
contentHash: "",
|
|
18336
19638
|
description: page.description,
|
|
18337
|
-
keywords: page.keywords
|
|
19639
|
+
keywords: page.keywords,
|
|
19640
|
+
publishedAt: page.publishedAt,
|
|
19641
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19642
|
+
meta: page.meta
|
|
18338
19643
|
};
|
|
18339
19644
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18340
|
-
|
|
19645
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
19646
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19647
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
19648
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
18341
19649
|
chunks.push(chunk);
|
|
18342
19650
|
}
|
|
18343
19651
|
return chunks;
|
|
@@ -19170,6 +20478,69 @@ function gfm(turndownService) {
|
|
|
19170
20478
|
}
|
|
19171
20479
|
|
|
19172
20480
|
// src/indexing/extractor.ts
|
|
20481
|
+
/**
 * Converts a date-like value into a finite numeric timestamp.
 *
 * Accepts `Date` instances, strings understood by the `Date` constructor, and
 * finite numbers (returned unchanged). Anything else, including invalid dates,
 * NaN and infinities, yields undefined.
 *
 * @param {unknown} value - Candidate date value.
 * @returns {number|undefined} Timestamp as returned by `Date#getTime`, or the input number.
 */
function normalizeDateToMs(value) {
  let timestamp;
  if (value instanceof Date) {
    timestamp = value.getTime();
  } else if (typeof value === "string") {
    timestamp = new Date(value).getTime();
  } else if (typeof value === "number") {
    timestamp = value;
  } else {
    // null, undefined, booleans, objects, ...
    return void 0;
  }
  return Number.isFinite(timestamp) ? timestamp : void 0;
}
|
|
20496
|
+
// Frontmatter keys consulted for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];

/**
 * Returns the first usable timestamp found in a frontmatter object, checking
 * the keys of FRONTMATTER_DATE_FIELDS in order.
 *
 * @param {object} data - Parsed frontmatter.
 * @returns {number|undefined} Timestamp from `normalizeDateToMs`, or undefined when no key holds a valid date.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let index = 0; index < FRONTMATTER_DATE_FIELDS.length; index += 1) {
    const timestamp = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[index]]);
    if (timestamp !== void 0) {
      return timestamp;
    }
  }
  return void 0;
}
|
|
20504
|
+
/**
 * Extracts a publication timestamp from an HTML document.
 *
 * Sources are tried in this order, returning the first valid date:
 *   1. `datePublished` in JSON-LD scripts (top-level object, array entries, `@graph` entries)
 *   2. `<meta property="article:published_time">`
 *   3. `<meta itemprop="datePublished">` or `<time itemprop="datePublished">`
 *   4. the first `<time datetime>` element
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded document.
 * @returns {number|undefined} Timestamp from `normalizeDateToMs`, or undefined when none is found.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const candidates = [];
      if (Array.isArray(parsed)) {
        candidates.push(...parsed);
      } else if (parsed && typeof parsed === "object") {
        candidates.push(parsed);
        if (Array.isArray(parsed["@graph"])) {
          candidates.push(...parsed["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // Null-safe access: a null entry in an array or @graph must not throw,
        // otherwise the catch below would discard the remaining candidates.
        const val = normalizeDateToMs(candidate?.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Malformed JSON-LD: ignore this script and try the next source.
    }
  }

  const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
  if (ogTime) {
    const val = normalizeDateToMs(ogTime);
    if (val !== void 0) return val;
  }

  const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
  if (itempropDate) {
    const val = normalizeDateToMs(itempropDate);
    if (val !== void 0) return val;
  }

  const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
  if (timeEl) {
    const val = normalizeDateToMs(timeEl);
    if (val !== void 0) return val;
  }
  return void 0;
}
|
|
19173
20544
|
function hasTopLevelNoindexComment(markdown) {
|
|
19174
20545
|
const lines = markdown.split(/\r?\n/);
|
|
19175
20546
|
let inFence = false;
|
|
@@ -19185,6 +20556,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
19185
20556
|
}
|
|
19186
20557
|
return false;
|
|
19187
20558
|
}
|
|
20559
|
+
// Single-word alt texts that describe the medium rather than the content.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image", "photo", "picture", "icon",
  "logo", "banner", "screenshot", "thumbnail",
  "img", "graphic", "illustration", "spacer",
  "pixel", "placeholder", "avatar", "background"
]);

// Matches text ending in an image file extension, optionally followed by a query string.
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;

/**
 * Decides whether an image alt text carries real descriptive content.
 *
 * Rejected: text shorter than 5 characters after trimming, text that looks
 * like an image file name, and text that is exactly one of GARBAGE_ALT_WORDS
 * (case-insensitive).
 *
 * @param {string} alt - Raw alt attribute value.
 * @returns {boolean} True when the alt text is worth indexing.
 */
function isMeaningfulAlt(alt) {
  const candidate = alt.trim();
  const longEnough = candidate.length >= 5;
  return longEnough && !IMAGE_EXT_RE.test(candidate) && !GARBAGE_ALT_WORDS.has(candidate.toLowerCase());
}
|
|
20585
|
+
/**
 * Chooses the best textual stand-in for an image.
 *
 * Priority: the image's own description attribute, then the enclosing
 * figure's description attribute, then "alt — caption", then alt alone, then
 * the figure caption alone. Alt text counts only when `isMeaningfulAlt` accepts it.
 *
 * @param {object} img - Cheerio-style wrapper around the `<img>` element.
 * @param {Function} $ - Cheerio-style selector function (unused here).
 * @param {string} imageDescAttr - Name of the attribute carrying an explicit description.
 * @returns {string|null} Replacement text, or null when nothing usable exists.
 */
function resolveImageText(img, $, imageDescAttr) {
  const ownDescription = img.attr(imageDescAttr)?.trim();
  if (ownDescription) {
    return ownDescription;
  }

  const figure = img.closest("figure");
  const insideFigure = Boolean(figure.length);
  if (insideFigure) {
    const figureDescription = figure.attr(imageDescAttr)?.trim();
    if (figureDescription) {
      return figureDescription;
    }
  }

  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  const altIsUsable = isMeaningfulAlt(alt);

  if (altIsUsable) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption ? caption : null;
}
|
|
20606
|
+
// Generic link labels that say nothing about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set(["here", "click", "click here", "read more", "link", "this", "more"]);

/**
 * Normalizes link text for use as anchor-text signal: collapses whitespace,
 * trims, lowercases, and caps the length at 100 characters.
 *
 * @param {string} raw - Text content of the link.
 * @returns {string} Normalized text, or "" when it is shorter than 3 characters
 *   or is one of STOP_ANCHORS.
 */
function normalizeAnchorText(raw) {
  const text = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const unusable = text.length < 3 || STOP_ANCHORS.has(text);
  return unusable ? "" : text.slice(0, 100);
}
|
|
20622
|
+
/**
 * Escapes the characters that are significant in HTML text content so the
 * result can be embedded in markup without being parsed as tags or entities.
 *
 * "&" is replaced first so the ampersands introduced by the other
 * replacements are not escaped a second time.
 *
 * @param {string} text - Plain text.
 * @returns {string} Text with `&`, `<` and `>` replaced by `&amp;`, `&lt;` and `&gt;`.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
20625
|
+
/**
 * Replaces every image under `root2` with a text stand-in so image
 * descriptions survive the later HTML-to-text conversion.
 *
 * `<picture>` elements are handled first (using their first `<img>`), then any
 * remaining `<img>` elements. An element with usable text becomes
 * `<span>text</span>` and its enclosing figure's `<figcaption>` is removed;
 * an element without usable text is removed.
 *
 * @param {object} root2 - Cheerio-style selection to process in place.
 * @param {Function} $ - Cheerio-style selector function.
 * @param {string} imageDescAttr - Name of the attribute carrying an explicit image description.
 * @returns {void}
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Swaps `node` for its text stand-in, or drops it when there is none.
  const substitute = (node, text) => {
    if (!text) {
      node.remove();
      return;
    }
    const enclosingFigure = node.closest("figure");
    if (enclosingFigure.length) {
      enclosingFigure.find("figcaption").remove();
    }
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };

  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    const innerImg = picture.find("img").first();
    const text = innerImg.length ? resolveImageText(innerImg, $, imageDescAttr) : null;
    substitute(picture, text);
  });

  root2.find("img").each((_i, el) => {
    const img = $(el);
    substitute(img, resolveImageText(img, $, imageDescAttr));
  });
}
|
|
19188
20650
|
function extractFromHtml(url, html, config) {
|
|
19189
20651
|
const $ = cheerio.load(html);
|
|
19190
20652
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -19210,6 +20672,20 @@ function extractFromHtml(url, html, config) {
|
|
|
19210
20672
|
if (weight === 0) {
|
|
19211
20673
|
return null;
|
|
19212
20674
|
}
|
|
20675
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
20676
|
+
return null;
|
|
20677
|
+
}
|
|
20678
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
20679
|
+
const meta = {};
|
|
20680
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
20681
|
+
const name = $(el).attr("name") ?? "";
|
|
20682
|
+
const key = name.slice("searchsocket:".length);
|
|
20683
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
20684
|
+
const content = $(el).attr("content") ?? "";
|
|
20685
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
20686
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
20687
|
+
});
|
|
20688
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
19213
20689
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19214
20690
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19215
20691
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19221,7 +20697,9 @@ function extractFromHtml(url, html, config) {
|
|
|
19221
20697
|
root2.find(selector).remove();
|
|
19222
20698
|
}
|
|
19223
20699
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
20700
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
19224
20701
|
const outgoingLinks = [];
|
|
20702
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
19225
20703
|
root2.find("a[href]").each((_index, node) => {
|
|
19226
20704
|
const href = $(node).attr("href");
|
|
19227
20705
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -19232,7 +20710,19 @@ function extractFromHtml(url, html, config) {
|
|
|
19232
20710
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
19233
20711
|
return;
|
|
19234
20712
|
}
|
|
19235
|
-
|
|
20713
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
20714
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
20715
|
+
if (!anchorText) {
|
|
20716
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
20717
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
20718
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
20719
|
+
}
|
|
20720
|
+
}
|
|
20721
|
+
const key = `${url2}|${anchorText}`;
|
|
20722
|
+
if (!seenLinkKeys.has(key)) {
|
|
20723
|
+
seenLinkKeys.add(key);
|
|
20724
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
20725
|
+
}
|
|
19236
20726
|
} catch {
|
|
19237
20727
|
}
|
|
19238
20728
|
});
|
|
@@ -19257,16 +20747,25 @@ function extractFromHtml(url, html, config) {
|
|
|
19257
20747
|
return null;
|
|
19258
20748
|
}
|
|
19259
20749
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
20750
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
20751
|
+
if (componentTags) {
|
|
20752
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
20753
|
+
for (const t of extraTags) {
|
|
20754
|
+
if (!tags.includes(t)) tags.push(t);
|
|
20755
|
+
}
|
|
20756
|
+
}
|
|
19260
20757
|
return {
|
|
19261
20758
|
url: normalizeUrlPath(url),
|
|
19262
20759
|
title,
|
|
19263
20760
|
markdown,
|
|
19264
|
-
outgoingLinks
|
|
20761
|
+
outgoingLinks,
|
|
19265
20762
|
noindex: false,
|
|
19266
20763
|
tags,
|
|
19267
20764
|
description,
|
|
19268
20765
|
keywords,
|
|
19269
|
-
weight
|
|
20766
|
+
weight,
|
|
20767
|
+
publishedAt,
|
|
20768
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
19270
20769
|
};
|
|
19271
20770
|
}
|
|
19272
20771
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19287,6 +20786,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19287
20786
|
if (mdWeight === 0) {
|
|
19288
20787
|
return null;
|
|
19289
20788
|
}
|
|
20789
|
+
let mdMeta;
|
|
20790
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
20791
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
20792
|
+
const metaObj = {};
|
|
20793
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
20794
|
+
if (!validateMetaKey(key)) continue;
|
|
20795
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
20796
|
+
metaObj[key] = val;
|
|
20797
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
20798
|
+
metaObj[key] = val;
|
|
20799
|
+
} else if (val instanceof Date) {
|
|
20800
|
+
metaObj[key] = val.getTime();
|
|
20801
|
+
}
|
|
20802
|
+
}
|
|
20803
|
+
if (Object.keys(metaObj).length > 0) {
|
|
20804
|
+
mdMeta = metaObj;
|
|
20805
|
+
}
|
|
20806
|
+
}
|
|
19290
20807
|
const content = parsed.content;
|
|
19291
20808
|
const normalized = normalizeMarkdown(content);
|
|
19292
20809
|
if (!normalizeText(normalized)) {
|
|
@@ -19301,6 +20818,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19301
20818
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
19302
20819
|
}
|
|
19303
20820
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
20821
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
19304
20822
|
return {
|
|
19305
20823
|
url: normalizeUrlPath(url),
|
|
19306
20824
|
title: resolvedTitle,
|
|
@@ -19310,7 +20828,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19310
20828
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19311
20829
|
description: fmDescription,
|
|
19312
20830
|
keywords: fmKeywords,
|
|
19313
|
-
weight: mdWeight
|
|
20831
|
+
weight: mdWeight,
|
|
20832
|
+
publishedAt,
|
|
20833
|
+
meta: mdMeta
|
|
19314
20834
|
};
|
|
19315
20835
|
}
|
|
19316
20836
|
function segmentToRegex(segment) {
|
|
@@ -19473,7 +20993,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
19473
20993
|
const manifestPath = path__default.default.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
19474
20994
|
let content;
|
|
19475
20995
|
try {
|
|
19476
|
-
content = await
|
|
20996
|
+
content = await fs9__default.default.readFile(manifestPath, "utf8");
|
|
19477
20997
|
} catch {
|
|
19478
20998
|
throw new SearchSocketError(
|
|
19479
20999
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19784,6 +21304,125 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19784
21304
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19785
21305
|
return normalizeUrlPath(noExt || "/");
|
|
19786
21306
|
}
|
|
21307
|
+
// SvelteKit route files: +page / +layout / +error, optionally with an "@name" suffix.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;

/**
 * Tells whether a path points to a Svelte component that is not a route file.
 *
 * @param {string} filePath - File path or file name.
 * @returns {boolean} True for `.svelte` files that do not match ROUTE_FILE_RE.
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
|
|
21312
|
+
/**
 * Extracts lightweight metadata from Svelte component source: the text of the
 * `<!-- @component ... -->` comment and the props destructured from `$props()`.
 *
 * For each destructured entry (rest elements are skipped) the prop name, its
 * default value source text, and, when a type annotation can be resolved, its
 * type are recorded.
 *
 * @param {string} source - Full source text of the component.
 * @returns {{description: (string|undefined), props: Array<{name: string, type?: string, default?: string}>}}
 */
function extractSvelteComponentMeta(source) {
  const description = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/)?.[1]?.trim() || void 0;
  const props = [];

  const propsMatch = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsMatch) {
    return { description, props };
  }

  const [, destructureBlock, rawAnnotation] = propsMatch;
  const typeAnnotation = rawAnnotation?.trim();

  // A bare capitalised identifier is looked up as a named type in the same
  // source; an annotation starting with "{" is parsed as an inline object type.
  let resolvedTypeMap;
  if (typeAnnotation) {
    if (/^[A-Z]\w*$/.test(typeAnnotation)) {
      resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
    } else if (typeAnnotation.startsWith("{")) {
      resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
    }
  }

  for (const entry of splitDestructureBlock(destructureBlock)) {
    const trimmed = entry.trim();
    if (!trimmed || trimmed.startsWith("...")) continue;

    // Try "name: alias [= default]" first, then "name = default".
    const matched = trimmed.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/) ?? trimmed.match(/^(\w+)\s*=\s*([\s\S]+)$/);
    const propName = matched ? matched[1] : (trimmed.match(/^(\w+)/)?.[1] ?? trimmed);
    const defaultValue = matched?.[2]?.trim();
    const propType = resolvedTypeMap?.get(propName);

    const prop = { name: propName };
    if (propType) prop.type = propType;
    if (defaultValue) prop.default = defaultValue;
    props.push(prop);
  }
  return { description, props };
}
|
|
21357
|
+
/**
 * Splits the inside of a destructuring pattern into its top-level entries.
 *
 * Commas only separate entries when they sit outside every bracket pair
 * (`{}`, `[]`, `()`), outside string/template literals and outside comments.
 * String contents are copied verbatim; comments are dropped from the output
 * so they cannot be mistaken for part of a prop name.
 *
 * @param {string} block - Text between the outer `{` and `}` of the pattern.
 * @returns {string[]} Untrimmed entry strings, in source order. A trailing
 *   whitespace-only remainder is not emitted.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let current = "";
  // The quote character of the literal being scanned, or "" when outside one.
  let quote = "";
  for (let i = 0; i < block.length; i++) {
    const ch = block[i];
    const next = block[i + 1];
    if (quote) {
      current += ch;
      if (ch === "\\" && next !== undefined) {
        // Keep the escaped character so an escaped quote does not end the literal.
        current += next;
        i++;
      } else if (ch === quote) {
        quote = "";
      }
      continue;
    }
    if (ch === "/" && next === "/") {
      // Line comment: skip up to (not including) the newline.
      const eol = block.indexOf("\n", i);
      i = eol === -1 ? block.length : eol - 1;
      continue;
    }
    if (ch === "/" && next === "*") {
      // Block comment: skip through the closing "*/".
      const close = block.indexOf("*/", i + 2);
      i = close === -1 ? block.length : close + 1;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      depth--;
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  if (current.trim()) entries.push(current);
  return entries;
}
|
|
21378
|
+
/**
 * Finds the body of `interface <typeName> { ... }` or
 * `type <typeName> = { ... }` inside `source` and parses its members.
 *
 * The body is located by counting braces from the declaration's opening `{`
 * to its matching `}`.
 *
 * @param {string} source - Text to search for the declaration.
 * @param {string} typeName - Type identifier; interpolated into a RegExp as-is.
 * @returns {Map<string, string>|undefined} Result of `parseTypeMembers` on the
 *   body, or `undefined` when no declaration is found or its braces never
 *   balance.
 */
function resolveTypeReference(source, typeName) {
  const declarationRe = new RegExp(`(?:interface\\s+${typeName}\\s*|type\\s+${typeName}\\s*=\\s*)\\{`);
  const found = declarationRe.exec(source);
  if (found === null) return void 0;

  const bodyStart = found.index + found[0].length;
  let open = 1;
  for (let pos = bodyStart; pos < source.length; pos++) {
    const ch = source[pos];
    if (ch === "{") {
      open += 1;
    } else if (ch === "}") {
      open -= 1;
      if (open === 0) {
        // `pos` is the matching close brace; the body excludes it.
        return parseTypeMembers(source.slice(bodyStart, pos));
      }
    }
  }
  return void 0;
}
|
|
21394
|
+
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`.
 *
 * @param {string} annotation - Annotation text; one leading `{` and one
 *   trailing `}` are removed when present.
 * @returns {Map<string, string>} Whatever `parseTypeMembers` returns for the
 *   text between the braces.
 */
function parseInlineTypeAnnotation(annotation) {
  let inner = annotation;
  if (inner.startsWith("{")) {
    inner = inner.slice(1);
  }
  if (inner.endsWith("}")) {
    inner = inner.slice(0, -1);
  }
  return parseTypeMembers(inner);
}
|
|
21398
|
+
/**
 * Parses the body of an object type / interface into a name -> type map.
 *
 * Members are separated by `;`, `,` or a newline, but only when the separator
 * sits outside every `{}`, `[]`, `()` and `<>` pair, so nested object types,
 * tuples, function signatures and generic arguments stay in one piece.
 * Comments are removed before scanning so their punctuation cannot skew the
 * nesting depth or leak into the type text.
 *
 * @param {string} body - Text between the outer braces of the type.
 * @returns {Map<string, string>} Member name (without `?`) to type text.
 *   Multi-line types are collapsed onto one line. Entries that do not look
 *   like `name: type` (index signatures, methods, ...) are skipped.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();
  // Block comments become a space so adjacent tokens are not glued together.
  const code = body.replace(/\/\*[\s\S]*?\*\//g, " ").replace(/\/\/[^\n]*/g, "");
  const members = [];
  let depth = 0;
  let current = "";
  for (let i = 0; i < code.length; i++) {
    const ch = code[i];
    if (ch === "{" || ch === "[" || ch === "(" || ch === "<") {
      depth++;
    } else if (ch === "}" || ch === "]" || ch === ")" || (ch === ">" && code[i - 1] !== "=")) {
      // The ">" of an arrow ("=>") is not a closing bracket. Never go below
      // zero so a stray closer cannot disable splitting for later members.
      if (depth > 0) depth--;
    } else if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
      continue;
    }
    current += ch;
  }
  members.push(current);
  for (const rawMember of members) {
    const memberMatch = rawMember.trim().match(/^(\w+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      map.set(
        memberMatch[1],
        memberMatch[2].replace(/\s*\n\s*/g, " ").replace(/,\s*$/, "").trim()
      );
    }
  }
  return map;
}
|
|
21409
|
+
/**
 * Renders extracted component metadata as a single line of searchable text.
 *
 * Output shape: "<Name> component. <description> Props: a (type) default: x, b."
 * The description and the props sentence are each included only when present.
 *
 * @param {string} componentName - Component name used in the lead sentence.
 * @param {{ description?: string, props: Array<{name: string, type?: string, default?: string}> }} meta
 * @returns {string} The rendered text, or "" when there is neither a
 *   description nor any prop.
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  const hasProps = props.length > 0;
  if (!description && !hasProps) {
    return "";
  }

  const describeProp = ({ name, type, default: fallback }) => {
    const typePart = type ? ` (${type})` : "";
    const defaultPart = fallback ? ` default: ${fallback}` : "";
    return `${name}${typePart}${defaultPart}`;
  };

  const sentences = [`${componentName} component.`];
  if (description) {
    sentences.push(description);
  }
  if (hasProps) {
    sentences.push(`Props: ${props.map(describeProp).join(", ")}.`);
  }
  return sentences.join(" ");
}
|
|
19787
21426
|
/**
 * Reduces a Svelte source file to its plain text content.
 *
 * Steps, in order: drop `<script>` and `<style>` blocks, replace remaining
 * tags and `{...}` expressions with a space, collapse whitespace, trim.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {string} Single-line text with markup and expressions removed.
 */
function normalizeSvelteToMarkdown(source) {
  const withoutScripts = source.replace(/<script[\s\S]*?<\/script>/g, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/g, "");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const withoutExpressions = withoutTags.replace(/\{[^}]+\}/g, " ");
  return withoutExpressions.replace(/\s+/g, " ").trim();
}
|
|
@@ -19802,13 +21441,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19802
21441
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19803
21442
|
const pages = [];
|
|
19804
21443
|
for (const filePath of selected) {
|
|
19805
|
-
const raw = await
|
|
19806
|
-
|
|
21444
|
+
const raw = await fs9__default.default.readFile(filePath, "utf8");
|
|
21445
|
+
let markdown;
|
|
21446
|
+
let tags;
|
|
21447
|
+
if (filePath.endsWith(".md")) {
|
|
21448
|
+
markdown = raw;
|
|
21449
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
21450
|
+
const componentName = path__default.default.basename(filePath, ".svelte");
|
|
21451
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
21452
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
21453
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
21454
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
21455
|
+
tags = ["component"];
|
|
21456
|
+
} else {
|
|
21457
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
21458
|
+
}
|
|
19807
21459
|
pages.push({
|
|
19808
21460
|
url: filePathToUrl(filePath, baseDir),
|
|
19809
21461
|
markdown,
|
|
19810
21462
|
sourcePath: path__default.default.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19811
|
-
outgoingLinks: []
|
|
21463
|
+
outgoingLinks: [],
|
|
21464
|
+
...tags ? { tags } : {}
|
|
19812
21465
|
});
|
|
19813
21466
|
}
|
|
19814
21467
|
return pages;
|
|
@@ -19938,7 +21591,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19938
21591
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19939
21592
|
const pages = [];
|
|
19940
21593
|
for (const filePath of selected) {
|
|
19941
|
-
const html = await
|
|
21594
|
+
const html = await fs9__default.default.readFile(filePath, "utf8");
|
|
19942
21595
|
pages.push({
|
|
19943
21596
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19944
21597
|
html,
|
|
@@ -20001,7 +21654,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
20001
21654
|
}
|
|
20002
21655
|
async function loadRobotsTxtFromDir(dir) {
|
|
20003
21656
|
try {
|
|
20004
|
-
const content = await
|
|
21657
|
+
const content = await fs9__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
|
|
20005
21658
|
return parseRobotsTxt(content);
|
|
20006
21659
|
} catch {
|
|
20007
21660
|
return null;
|
|
@@ -20018,6 +21671,81 @@ async function fetchRobotsTxt(baseUrl) {
|
|
|
20018
21671
|
return null;
|
|
20019
21672
|
}
|
|
20020
21673
|
}
|
|
21674
|
+
/**
 * Turns a page URL into an absolute URL when a base URL is configured.
 *
 * @param {string} pageUrl - Page URL or path.
 * @param {string} [baseUrl] - Base to resolve against.
 * @returns {string} The resolved `href`, or `pageUrl` unchanged when
 *   `baseUrl` is falsy or the pair cannot be parsed by `URL`.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  if (baseUrl) {
    try {
      return new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable input: fall through and hand back the original value.
    }
  }
  return pageUrl;
}
|
|
21682
|
+
/**
 * Builds the text of the llms.txt index: a title heading, an optional
 * blockquote description and a "Pages" list of markdown links.
 *
 * The llms.txt / llms-full.txt pages themselves are left out. Remaining pages
 * are ordered by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<{url: string, title: string, description?: string, depth: number, incomingLinks: number}>} pages
 * @param {{ llmsTxt: { title?: string, description?: string }, project: { id: string, baseUrl?: string } }} config
 * @returns {string} The document text, ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const { description } = config.llmsTxt;
  const { baseUrl } = config.project;

  const out = [`# ${heading}`];
  if (description) {
    out.push("", `> ${description}`);
  }

  const listed = pages
    .filter((p) => !(p.url === "/llms.txt" || p.url === "/llms-full.txt"))
    .sort((a, b) => a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks);

  if (listed.length > 0) {
    out.push("", "## Pages", "");
    for (const page of listed) {
      const link = `- [${page.title}](${resolvePageUrl(page.url, baseUrl)})`;
      out.push(page.description ? `${link}: ${page.description}` : link);
    }
  }

  out.push("");
  return out.join("\n");
}
|
|
21711
|
+
/**
 * Builds the text of llms-full.txt: a title heading, an optional blockquote
 * description, then every page's trimmed markdown under a linked `##`
 * heading, with sections separated by `---`.
 *
 * The llms.txt / llms-full.txt pages themselves are left out. Remaining pages
 * are ordered by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<{url: string, title: string, markdown: string, depth: number, incomingLinks: number}>} pages
 * @param {{ llmsTxt: { title?: string, description?: string }, project: { id: string, baseUrl?: string } }} config
 * @returns {string} The document text, ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const { description } = config.llmsTxt;
  const { baseUrl } = config.project;

  const out = [`# ${heading}`];
  if (description) {
    out.push("", `> ${description}`);
  }

  const ordered = pages
    .filter((p) => !(p.url === "/llms.txt" || p.url === "/llms-full.txt"))
    .sort((a, b) => a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks);

  ordered.forEach((page) => {
    const href = resolvePageUrl(page.url, baseUrl);
    out.push("", "---", "", `## [${page.title}](${href})`, "", page.markdown.trim());
  });

  out.push("");
  return out.join("\n");
}
|
|
21734
|
+
/**
 * Writes the llms.txt index and, when `config.llmsTxt.generateFull` is set,
 * the companion "-full" document next to it.
 *
 * The companion path is derived by inserting "-full" before the extension of
 * the resolved output path (e.g. `static/llms.txt` -> `static/llms-full.txt`).
 * Deriving it from the parsed file name, instead of replacing a literal
 * ".txt" suffix, guarantees the two paths differ even when the configured
 * output path uses another extension or none, so the full document can never
 * overwrite the index.
 *
 * @param {Array<object>} pages - Indexed pages passed to the generators.
 * @param {{ llmsTxt: { outputPath: string, generateFull?: boolean } }} config
 * @param {string} cwd - Directory `config.llmsTxt.outputPath` is resolved against.
 * @param {{ info: (message: string) => void }} logger3 - Progress logger.
 * @returns {Promise<void>} Resolves once all files are written; rejects with
 *   whatever `mkdir` / `writeFile` reject with.
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path__default.default.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path__default.default.dirname(outputPath);
  await fs9__default.default.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs9__default.default.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (config.llmsTxt.generateFull) {
    const { dir, name, ext } = path__default.default.parse(outputPath);
    const fullPath = path__default.default.join(dir, `${name}-full${ext}`);
    const fullContent = generateLlmsFullTxt(pages, config);
    await fs9__default.default.writeFile(fullPath, fullContent, "utf8");
    const relativeFull = path__default.default.relative(cwd, fullPath);
    logger3.info(`Generated llms-full.txt at ${relativeFull}`);
  }
}
|
|
20021
21749
|
|
|
20022
21750
|
// src/indexing/pipeline.ts
|
|
20023
21751
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -20036,16 +21764,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
20036
21764
|
if (joined.length <= maxChars) return joined;
|
|
20037
21765
|
return joined.slice(0, maxChars).trim();
|
|
20038
21766
|
}
|
|
21767
|
+
/**
 * Computes a change-detection hash over the fields of an indexed page.
 *
 * List fields (`keywords`, `tags`, `outgoingLinkUrls`) are sorted copies and
 * `meta` is serialised with sorted keys, so ordering differences alone do not
 * change the hash.
 *
 * `meta` is serialised with a replacer function that re-keys every nested
 * object in sorted order. An array replacer is deliberately not used: it acts
 * as a property allowlist at every nesting level, which would drop nested
 * properties and let changes to them go undetected.
 *
 * @param {object} page - Indexed page; reads `title`, `description`,
 *   `keywords`, `tags`, `markdown`, `outgoingLinks`, `publishedAt`,
 *   `incomingAnchorText`, `outgoingLinkUrls` and `meta`.
 * @returns {string} Result of `sha256` over the "|"-joined field values.
 */
function buildPageContentHash(page) {
  const stableStringify = (value) => JSON.stringify(value, (_key, val) => {
    if (val && typeof val === "object" && !Array.isArray(val)) {
      const sorted = {};
      for (const key of Object.keys(val).sort()) {
        sorted[key] = val[key];
      }
      return sorted;
    }
    return val;
  });
  const parts = [
    page.title,
    page.description ?? "",
    (page.keywords ?? []).slice().sort().join(","),
    page.tags.slice().sort().join(","),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    (page.outgoingLinkUrls ?? []).slice().sort().join(","),
    page.meta ? stableStringify(page.meta) : ""
  ];
  return sha256(parts.join("|"));
}
|
|
20039
21782
|
var IndexPipeline = class _IndexPipeline {
|
|
20040
21783
|
cwd;
|
|
20041
21784
|
config;
|
|
20042
21785
|
store;
|
|
20043
21786
|
logger;
|
|
21787
|
+
hooks;
|
|
20044
21788
|
constructor(options) {
|
|
20045
21789
|
this.cwd = options.cwd;
|
|
20046
21790
|
this.config = options.config;
|
|
20047
21791
|
this.store = options.store;
|
|
20048
21792
|
this.logger = options.logger;
|
|
21793
|
+
this.hooks = options.hooks;
|
|
20049
21794
|
}
|
|
20050
21795
|
static async create(options = {}) {
|
|
20051
21796
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
@@ -20055,7 +21800,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20055
21800
|
cwd,
|
|
20056
21801
|
config,
|
|
20057
21802
|
store,
|
|
20058
|
-
logger: options.logger ?? new Logger()
|
|
21803
|
+
logger: options.logger ?? new Logger(),
|
|
21804
|
+
hooks: options.hooks ?? {}
|
|
20059
21805
|
});
|
|
20060
21806
|
}
|
|
20061
21807
|
getConfig() {
|
|
@@ -20076,7 +21822,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20076
21822
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20077
21823
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
20078
21824
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20079
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
21825
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
20080
21826
|
if (options.force) {
|
|
20081
21827
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20082
21828
|
}
|
|
@@ -20084,9 +21830,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20084
21830
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
20085
21831
|
}
|
|
20086
21832
|
const manifestStart = stageStart();
|
|
20087
|
-
const
|
|
21833
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
20088
21834
|
stageEnd("manifest", manifestStart);
|
|
20089
|
-
this.logger.debug(`Manifest: ${
|
|
21835
|
+
this.logger.debug(`Manifest: ${existingPageHashes.size} existing page hashes loaded`);
|
|
20090
21836
|
const sourceStart = stageStart();
|
|
20091
21837
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20092
21838
|
let sourcePages;
|
|
@@ -20163,11 +21909,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20163
21909
|
);
|
|
20164
21910
|
continue;
|
|
20165
21911
|
}
|
|
20166
|
-
|
|
21912
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
21913
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
21914
|
+
}
|
|
21915
|
+
let accepted;
|
|
21916
|
+
if (this.hooks.transformPage) {
|
|
21917
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21918
|
+
if (transformed === null) {
|
|
21919
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
21920
|
+
continue;
|
|
21921
|
+
}
|
|
21922
|
+
accepted = transformed;
|
|
21923
|
+
} else {
|
|
21924
|
+
accepted = extracted;
|
|
21925
|
+
}
|
|
21926
|
+
extractedPages.push(accepted);
|
|
20167
21927
|
this.logger.event("page_extracted", {
|
|
20168
|
-
url:
|
|
21928
|
+
url: accepted.url
|
|
20169
21929
|
});
|
|
20170
21930
|
}
|
|
21931
|
+
const customRecords = options.customRecords ?? [];
|
|
21932
|
+
if (customRecords.length > 0) {
|
|
21933
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
21934
|
+
for (const record of customRecords) {
|
|
21935
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
21936
|
+
const normalized = normalizeMarkdown(record.content);
|
|
21937
|
+
if (!normalized.trim()) {
|
|
21938
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
21939
|
+
continue;
|
|
21940
|
+
}
|
|
21941
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
21942
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
21943
|
+
const extracted = {
|
|
21944
|
+
url: normalizedUrl,
|
|
21945
|
+
title: record.title,
|
|
21946
|
+
markdown: normalized,
|
|
21947
|
+
outgoingLinks: [],
|
|
21948
|
+
noindex: false,
|
|
21949
|
+
tags,
|
|
21950
|
+
weight: record.weight
|
|
21951
|
+
};
|
|
21952
|
+
let accepted;
|
|
21953
|
+
if (this.hooks.transformPage) {
|
|
21954
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21955
|
+
if (transformed === null) {
|
|
21956
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
21957
|
+
continue;
|
|
21958
|
+
}
|
|
21959
|
+
accepted = transformed;
|
|
21960
|
+
} else {
|
|
21961
|
+
accepted = extracted;
|
|
21962
|
+
}
|
|
21963
|
+
extractedPages.push(accepted);
|
|
21964
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
21965
|
+
}
|
|
21966
|
+
}
|
|
20171
21967
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
20172
21968
|
const uniquePages = [];
|
|
20173
21969
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -20200,15 +21996,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20200
21996
|
const linkStart = stageStart();
|
|
20201
21997
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20202
21998
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
21999
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
20203
22000
|
for (const page of indexablePages) {
|
|
20204
22001
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20205
22002
|
}
|
|
20206
22003
|
for (const page of indexablePages) {
|
|
20207
|
-
|
|
22004
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
22005
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
22006
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
20208
22007
|
if (!pageSet.has(outgoing)) {
|
|
20209
22008
|
continue;
|
|
20210
22009
|
}
|
|
20211
|
-
|
|
22010
|
+
if (!seenForCount.has(outgoing)) {
|
|
22011
|
+
seenForCount.add(outgoing);
|
|
22012
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
22013
|
+
}
|
|
22014
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
22015
|
+
seenForAnchor.add(outgoing);
|
|
22016
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
22017
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
22018
|
+
}
|
|
22019
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
22020
|
+
}
|
|
20212
22021
|
}
|
|
20213
22022
|
}
|
|
20214
22023
|
stageEnd("links", linkStart);
|
|
@@ -20227,6 +22036,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20227
22036
|
});
|
|
20228
22037
|
}
|
|
20229
22038
|
}
|
|
22039
|
+
for (const record of customRecords) {
|
|
22040
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
22041
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
22042
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
22043
|
+
routeFile: "",
|
|
22044
|
+
routeResolution: "exact"
|
|
22045
|
+
});
|
|
22046
|
+
}
|
|
22047
|
+
}
|
|
20230
22048
|
for (const page of indexablePages) {
|
|
20231
22049
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20232
22050
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -20244,6 +22062,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20244
22062
|
} else {
|
|
20245
22063
|
routeExact += 1;
|
|
20246
22064
|
}
|
|
22065
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
22066
|
+
let incomingAnchorText;
|
|
22067
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
22068
|
+
let joined = "";
|
|
22069
|
+
for (const phrase of anchorSet) {
|
|
22070
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
22071
|
+
if (next2.length > 500) break;
|
|
22072
|
+
joined = next2;
|
|
22073
|
+
}
|
|
22074
|
+
incomingAnchorText = joined || void 0;
|
|
22075
|
+
}
|
|
20247
22076
|
const indexedPage = {
|
|
20248
22077
|
url: page.url,
|
|
20249
22078
|
title: page.title,
|
|
@@ -20253,40 +22082,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20253
22082
|
generatedAt: nowIso(),
|
|
20254
22083
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
20255
22084
|
outgoingLinks: page.outgoingLinks.length,
|
|
22085
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
20256
22086
|
depth: getUrlDepth(page.url),
|
|
20257
22087
|
tags: page.tags,
|
|
20258
22088
|
markdown: page.markdown,
|
|
20259
22089
|
description: page.description,
|
|
20260
|
-
keywords: page.keywords
|
|
22090
|
+
keywords: page.keywords,
|
|
22091
|
+
publishedAt: page.publishedAt,
|
|
22092
|
+
incomingAnchorText,
|
|
22093
|
+
meta: page.meta
|
|
20261
22094
|
};
|
|
20262
22095
|
pages.push(indexedPage);
|
|
20263
22096
|
this.logger.event("page_indexed", { url: page.url });
|
|
20264
22097
|
}
|
|
22098
|
+
const pageRecords = pages.map((p) => {
|
|
22099
|
+
const summary = buildPageSummary(p);
|
|
22100
|
+
return {
|
|
22101
|
+
url: p.url,
|
|
22102
|
+
title: p.title,
|
|
22103
|
+
markdown: p.markdown,
|
|
22104
|
+
projectId: scope.projectId,
|
|
22105
|
+
scopeName: scope.scopeName,
|
|
22106
|
+
routeFile: p.routeFile,
|
|
22107
|
+
routeResolution: p.routeResolution,
|
|
22108
|
+
incomingLinks: p.incomingLinks,
|
|
22109
|
+
outgoingLinks: p.outgoingLinks,
|
|
22110
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
22111
|
+
depth: p.depth,
|
|
22112
|
+
tags: p.tags,
|
|
22113
|
+
indexedAt: p.generatedAt,
|
|
22114
|
+
summary,
|
|
22115
|
+
description: p.description,
|
|
22116
|
+
keywords: p.keywords,
|
|
22117
|
+
contentHash: buildPageContentHash(p),
|
|
22118
|
+
publishedAt: p.publishedAt,
|
|
22119
|
+
meta: p.meta
|
|
22120
|
+
};
|
|
22121
|
+
});
|
|
22122
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
22123
|
+
const changedPages = pageRecords.filter(
|
|
22124
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
22125
|
+
);
|
|
22126
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
20265
22127
|
if (!options.dryRun) {
|
|
20266
|
-
|
|
20267
|
-
|
|
20268
|
-
|
|
20269
|
-
|
|
20270
|
-
|
|
20271
|
-
|
|
20272
|
-
|
|
20273
|
-
|
|
20274
|
-
|
|
20275
|
-
|
|
20276
|
-
|
|
20277
|
-
|
|
20278
|
-
|
|
20279
|
-
|
|
20280
|
-
|
|
20281
|
-
|
|
20282
|
-
|
|
20283
|
-
|
|
20284
|
-
|
|
20285
|
-
|
|
20286
|
-
|
|
20287
|
-
|
|
22128
|
+
if (options.force) {
|
|
22129
|
+
await this.store.deletePages(scope);
|
|
22130
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
22131
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
22132
|
+
id: r.url,
|
|
22133
|
+
data: r.summary ?? r.title,
|
|
22134
|
+
metadata: {
|
|
22135
|
+
title: r.title,
|
|
22136
|
+
url: r.url,
|
|
22137
|
+
description: r.description ?? "",
|
|
22138
|
+
keywords: r.keywords ?? [],
|
|
22139
|
+
summary: r.summary ?? "",
|
|
22140
|
+
tags: r.tags,
|
|
22141
|
+
markdown: r.markdown,
|
|
22142
|
+
routeFile: r.routeFile,
|
|
22143
|
+
routeResolution: r.routeResolution,
|
|
22144
|
+
incomingLinks: r.incomingLinks,
|
|
22145
|
+
outgoingLinks: r.outgoingLinks,
|
|
22146
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22147
|
+
depth: r.depth,
|
|
22148
|
+
indexedAt: r.indexedAt,
|
|
22149
|
+
contentHash: r.contentHash ?? "",
|
|
22150
|
+
publishedAt: r.publishedAt ?? null,
|
|
22151
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22152
|
+
}
|
|
22153
|
+
}));
|
|
22154
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22155
|
+
} else {
|
|
22156
|
+
if (changedPages.length > 0) {
|
|
22157
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
22158
|
+
const pageDocs = changedPages.map((r) => ({
|
|
22159
|
+
id: r.url,
|
|
22160
|
+
data: r.summary ?? r.title,
|
|
22161
|
+
metadata: {
|
|
22162
|
+
title: r.title,
|
|
22163
|
+
url: r.url,
|
|
22164
|
+
description: r.description ?? "",
|
|
22165
|
+
keywords: r.keywords ?? [],
|
|
22166
|
+
summary: r.summary ?? "",
|
|
22167
|
+
tags: r.tags,
|
|
22168
|
+
markdown: r.markdown,
|
|
22169
|
+
routeFile: r.routeFile,
|
|
22170
|
+
routeResolution: r.routeResolution,
|
|
22171
|
+
incomingLinks: r.incomingLinks,
|
|
22172
|
+
outgoingLinks: r.outgoingLinks,
|
|
22173
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22174
|
+
depth: r.depth,
|
|
22175
|
+
indexedAt: r.indexedAt,
|
|
22176
|
+
contentHash: r.contentHash ?? "",
|
|
22177
|
+
publishedAt: r.publishedAt ?? null,
|
|
22178
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22179
|
+
}
|
|
22180
|
+
}));
|
|
22181
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22182
|
+
}
|
|
22183
|
+
if (deletedPageUrls.length > 0) {
|
|
22184
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
22185
|
+
}
|
|
22186
|
+
}
|
|
20288
22187
|
}
|
|
22188
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
22189
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
20289
22190
|
stageEnd("pages", pagesStart);
|
|
22191
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
20290
22192
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
20291
22193
|
const chunkStart = stageStart();
|
|
20292
22194
|
this.logger.info("Chunking pages...");
|
|
@@ -20295,6 +22197,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20295
22197
|
if (typeof maxChunks === "number") {
|
|
20296
22198
|
chunks = chunks.slice(0, maxChunks);
|
|
20297
22199
|
}
|
|
22200
|
+
if (this.hooks.transformChunk) {
|
|
22201
|
+
const transformed = [];
|
|
22202
|
+
for (const chunk of chunks) {
|
|
22203
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
22204
|
+
if (result === null) {
|
|
22205
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
22206
|
+
continue;
|
|
22207
|
+
}
|
|
22208
|
+
transformed.push(result);
|
|
22209
|
+
}
|
|
22210
|
+
chunks = transformed;
|
|
22211
|
+
}
|
|
20298
22212
|
for (const chunk of chunks) {
|
|
20299
22213
|
this.logger.event("chunked", {
|
|
20300
22214
|
url: chunk.url,
|
|
@@ -20307,7 +22221,12 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20307
22221
|
for (const chunk of chunks) {
|
|
20308
22222
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
20309
22223
|
}
|
|
20310
|
-
const
|
|
22224
|
+
const chunkHashStart = stageStart();
|
|
22225
|
+
const currentChunkKeys = chunks.map((c) => c.chunkKey);
|
|
22226
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.fetchContentHashesForKeys(currentChunkKeys, scope);
|
|
22227
|
+
stageEnd("chunk_hashes", chunkHashStart);
|
|
22228
|
+
this.logger.debug(`Fetched ${existingHashes.size} existing chunk hashes for ${currentChunkKeys.length} current keys`);
|
|
22229
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
20311
22230
|
if (options.force) {
|
|
20312
22231
|
return true;
|
|
20313
22232
|
}
|
|
@@ -20320,37 +22239,45 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20320
22239
|
}
|
|
20321
22240
|
return existingHash !== chunk.contentHash;
|
|
20322
22241
|
});
|
|
20323
|
-
const
|
|
22242
|
+
const existingChunkIds = options.force ? /* @__PURE__ */ new Set() : await this.store.scanChunkIds(scope);
|
|
22243
|
+
const deletes = [...existingChunkIds].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
22244
|
+
if (this.hooks.beforeIndex) {
|
|
22245
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
22246
|
+
}
|
|
20324
22247
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20325
22248
|
const upsertStart = stageStart();
|
|
20326
22249
|
let documentsUpserted = 0;
|
|
20327
22250
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20328
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
20329
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
22251
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
20330
22252
|
const docs = changedChunks.map((chunk) => {
|
|
20331
|
-
const
|
|
20332
|
-
|
|
20333
|
-
|
|
20334
|
-
|
|
20335
|
-
|
|
20336
|
-
|
|
20337
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
20338
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
22253
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
22254
|
+
if (embeddingText.length > 2e3) {
|
|
22255
|
+
this.logger.warn(
|
|
22256
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
22257
|
+
);
|
|
22258
|
+
}
|
|
20339
22259
|
return {
|
|
20340
22260
|
id: chunk.chunkKey,
|
|
20341
|
-
|
|
22261
|
+
data: embeddingText,
|
|
20342
22262
|
metadata: {
|
|
20343
|
-
|
|
20344
|
-
scopeName: scope.scopeName,
|
|
22263
|
+
url: chunk.url,
|
|
20345
22264
|
path: chunk.path,
|
|
22265
|
+
title: chunk.title,
|
|
22266
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
22267
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
20346
22268
|
snippet: chunk.snippet,
|
|
22269
|
+
chunkText: embeddingText,
|
|
22270
|
+
tags: chunk.tags,
|
|
20347
22271
|
ordinal: chunk.ordinal,
|
|
20348
22272
|
contentHash: chunk.contentHash,
|
|
20349
22273
|
depth: chunk.depth,
|
|
20350
22274
|
incomingLinks: chunk.incomingLinks,
|
|
20351
22275
|
routeFile: chunk.routeFile,
|
|
20352
22276
|
description: chunk.description ?? "",
|
|
20353
|
-
keywords:
|
|
22277
|
+
keywords: chunk.keywords ?? [],
|
|
22278
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
22279
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
22280
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
20354
22281
|
}
|
|
20355
22282
|
};
|
|
20356
22283
|
});
|
|
@@ -20368,9 +22295,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20368
22295
|
} else {
|
|
20369
22296
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
20370
22297
|
}
|
|
22298
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
22299
|
+
const llmsStart = stageStart();
|
|
22300
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
22301
|
+
stageEnd("llms_txt", llmsStart);
|
|
22302
|
+
}
|
|
20371
22303
|
this.logger.info("Done.");
|
|
20372
|
-
|
|
22304
|
+
const stats = {
|
|
20373
22305
|
pagesProcessed: pages.length,
|
|
22306
|
+
pagesChanged,
|
|
22307
|
+
pagesDeleted,
|
|
20374
22308
|
chunksTotal: chunks.length,
|
|
20375
22309
|
chunksChanged: changedChunks.length,
|
|
20376
22310
|
documentsUpserted,
|
|
@@ -20379,6 +22313,10 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20379
22313
|
routeBestEffort,
|
|
20380
22314
|
stageTimingsMs
|
|
20381
22315
|
};
|
|
22316
|
+
if (this.hooks.afterIndex) {
|
|
22317
|
+
await this.hooks.afterIndex(stats);
|
|
22318
|
+
}
|
|
22319
|
+
return stats;
|
|
20382
22320
|
}
|
|
20383
22321
|
};
|
|
20384
22322
|
|
|
@@ -20400,9 +22338,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20400
22338
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20401
22339
|
return true;
|
|
20402
22340
|
}
|
|
20403
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20404
|
-
return true;
|
|
20405
|
-
}
|
|
20406
22341
|
return false;
|
|
20407
22342
|
}
|
|
20408
22343
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20427,7 +22362,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20427
22362
|
const pipeline = await IndexPipeline.create({
|
|
20428
22363
|
cwd,
|
|
20429
22364
|
configPath: options.configPath,
|
|
20430
|
-
logger: logger3
|
|
22365
|
+
logger: logger3,
|
|
22366
|
+
hooks: options.hooks
|
|
20431
22367
|
});
|
|
20432
22368
|
const stats = await pipeline.run({
|
|
20433
22369
|
changedOnly: options.changedOnly ?? true,
|