searchsocket 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3308 -524
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2310 -466
- package/dist/index.d.cts +101 -40
- package/dist/index.d.ts +101 -40
- package/dist/index.js +2310 -466
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2342 -465
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2344 -467
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +20 -2
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/sveltekit.js
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
1
|
-
import
|
|
1
|
+
import { timingSafeEqual, createHash } from 'crypto';
|
|
2
|
+
import fs9 from 'fs/promises';
|
|
2
3
|
import path from 'path';
|
|
4
|
+
import { WebStandardStreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js';
|
|
5
|
+
import fs from 'fs';
|
|
3
6
|
import { createJiti } from 'jiti';
|
|
4
7
|
import { z } from 'zod';
|
|
8
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
9
|
+
import '@modelcontextprotocol/sdk/server/stdio.js';
|
|
10
|
+
import '@modelcontextprotocol/sdk/server/streamableHttp.js';
|
|
11
|
+
import '@modelcontextprotocol/sdk/server/express.js';
|
|
5
12
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import {
|
|
13
|
+
import { FusionAlgorithm, QueryMode } from '@upstash/vector';
|
|
7
14
|
import { load } from 'cheerio';
|
|
8
15
|
import matter from 'gray-matter';
|
|
9
16
|
import fg from 'fast-glob';
|
|
10
17
|
import pLimit from 'p-limit';
|
|
11
|
-
import fs3 from 'fs/promises';
|
|
12
18
|
import net from 'net';
|
|
13
19
|
import { gunzipSync } from 'zlib';
|
|
14
20
|
|
|
@@ -5009,32 +5015,32 @@ var require_URL = __commonJS({
|
|
|
5009
5015
|
else
|
|
5010
5016
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5011
5017
|
}
|
|
5012
|
-
function remove_dot_segments(
|
|
5013
|
-
if (!
|
|
5018
|
+
function remove_dot_segments(path14) {
|
|
5019
|
+
if (!path14) return path14;
|
|
5014
5020
|
var output = "";
|
|
5015
|
-
while (
|
|
5016
|
-
if (
|
|
5017
|
-
|
|
5021
|
+
while (path14.length > 0) {
|
|
5022
|
+
if (path14 === "." || path14 === "..") {
|
|
5023
|
+
path14 = "";
|
|
5018
5024
|
break;
|
|
5019
5025
|
}
|
|
5020
|
-
var twochars =
|
|
5021
|
-
var threechars =
|
|
5022
|
-
var fourchars =
|
|
5026
|
+
var twochars = path14.substring(0, 2);
|
|
5027
|
+
var threechars = path14.substring(0, 3);
|
|
5028
|
+
var fourchars = path14.substring(0, 4);
|
|
5023
5029
|
if (threechars === "../") {
|
|
5024
|
-
|
|
5030
|
+
path14 = path14.substring(3);
|
|
5025
5031
|
} else if (twochars === "./") {
|
|
5026
|
-
|
|
5032
|
+
path14 = path14.substring(2);
|
|
5027
5033
|
} else if (threechars === "/./") {
|
|
5028
|
-
|
|
5029
|
-
} else if (twochars === "/." &&
|
|
5030
|
-
|
|
5031
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5032
|
-
|
|
5034
|
+
path14 = "/" + path14.substring(3);
|
|
5035
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5036
|
+
path14 = "/";
|
|
5037
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5038
|
+
path14 = "/" + path14.substring(4);
|
|
5033
5039
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5034
5040
|
} else {
|
|
5035
|
-
var segment =
|
|
5041
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5036
5042
|
output += segment;
|
|
5037
|
-
|
|
5043
|
+
path14 = path14.substring(segment.length);
|
|
5038
5044
|
}
|
|
5039
5045
|
}
|
|
5040
5046
|
return output;
|
|
@@ -16630,6 +16636,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16630
16636
|
dropSelectors: z.array(z.string()).optional(),
|
|
16631
16637
|
ignoreAttr: z.string().optional(),
|
|
16632
16638
|
noindexAttr: z.string().optional(),
|
|
16639
|
+
imageDescAttr: z.string().optional(),
|
|
16633
16640
|
respectRobotsNoindex: z.boolean().optional()
|
|
16634
16641
|
}).optional(),
|
|
16635
16642
|
transform: z.object({
|
|
@@ -16645,35 +16652,48 @@ var searchSocketConfigSchema = z.object({
|
|
|
16645
16652
|
headingPathDepth: z.number().int().positive().optional(),
|
|
16646
16653
|
dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
|
|
16647
16654
|
prependTitle: z.boolean().optional(),
|
|
16648
|
-
pageSummaryChunk: z.boolean().optional()
|
|
16655
|
+
pageSummaryChunk: z.boolean().optional(),
|
|
16656
|
+
weightHeadings: z.boolean().optional()
|
|
16649
16657
|
}).optional(),
|
|
16650
16658
|
upstash: z.object({
|
|
16651
16659
|
url: z.string().url().optional(),
|
|
16652
16660
|
token: z.string().min(1).optional(),
|
|
16653
16661
|
urlEnv: z.string().min(1).optional(),
|
|
16654
|
-
tokenEnv: z.string().min(1).optional()
|
|
16662
|
+
tokenEnv: z.string().min(1).optional(),
|
|
16663
|
+
namespaces: z.object({
|
|
16664
|
+
pages: z.string().min(1).optional(),
|
|
16665
|
+
chunks: z.string().min(1).optional()
|
|
16666
|
+
}).optional()
|
|
16667
|
+
}).optional(),
|
|
16668
|
+
embedding: z.object({
|
|
16669
|
+
model: z.string().optional(),
|
|
16670
|
+
dimensions: z.number().int().positive().optional(),
|
|
16671
|
+
taskType: z.string().optional(),
|
|
16672
|
+
batchSize: z.number().int().positive().optional()
|
|
16655
16673
|
}).optional(),
|
|
16656
16674
|
search: z.object({
|
|
16657
|
-
semanticWeight: z.number().min(0).max(1).optional(),
|
|
16658
|
-
inputEnrichment: z.boolean().optional(),
|
|
16659
|
-
reranking: z.boolean().optional(),
|
|
16660
16675
|
dualSearch: z.boolean().optional(),
|
|
16661
16676
|
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
16662
16677
|
}).optional(),
|
|
16663
16678
|
ranking: z.object({
|
|
16664
16679
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16665
16680
|
enableDepthBoost: z.boolean().optional(),
|
|
16681
|
+
enableFreshnessBoost: z.boolean().optional(),
|
|
16682
|
+
freshnessDecayRate: z.number().positive().optional(),
|
|
16683
|
+
enableAnchorTextBoost: z.boolean().optional(),
|
|
16666
16684
|
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16667
16685
|
aggregationCap: z.number().int().positive().optional(),
|
|
16668
16686
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16669
16687
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16670
|
-
|
|
16688
|
+
minScoreRatio: z.number().min(0).max(1).optional(),
|
|
16671
16689
|
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
16672
16690
|
weights: z.object({
|
|
16673
16691
|
incomingLinks: z.number().optional(),
|
|
16674
16692
|
depth: z.number().optional(),
|
|
16675
16693
|
aggregation: z.number().optional(),
|
|
16676
|
-
titleMatch: z.number().optional()
|
|
16694
|
+
titleMatch: z.number().optional(),
|
|
16695
|
+
freshness: z.number().optional(),
|
|
16696
|
+
anchorText: z.number().optional()
|
|
16677
16697
|
}).optional()
|
|
16678
16698
|
}).optional(),
|
|
16679
16699
|
api: z.object({
|
|
@@ -16688,12 +16708,28 @@ var searchSocketConfigSchema = z.object({
|
|
|
16688
16708
|
}).optional(),
|
|
16689
16709
|
mcp: z.object({
|
|
16690
16710
|
enable: z.boolean().optional(),
|
|
16711
|
+
access: z.enum(["public", "private"]).optional(),
|
|
16691
16712
|
transport: z.enum(["stdio", "http"]).optional(),
|
|
16692
16713
|
http: z.object({
|
|
16693
16714
|
port: z.number().int().positive().optional(),
|
|
16694
|
-
path: z.string().optional()
|
|
16715
|
+
path: z.string().optional(),
|
|
16716
|
+
apiKey: z.string().min(1).optional(),
|
|
16717
|
+
apiKeyEnv: z.string().min(1).optional()
|
|
16718
|
+
}).optional(),
|
|
16719
|
+
handle: z.object({
|
|
16720
|
+
path: z.string().optional(),
|
|
16721
|
+
apiKey: z.string().min(1).optional(),
|
|
16722
|
+
enableJsonResponse: z.boolean().optional()
|
|
16695
16723
|
}).optional()
|
|
16696
16724
|
}).optional(),
|
|
16725
|
+
llmsTxt: z.object({
|
|
16726
|
+
enable: z.boolean().optional(),
|
|
16727
|
+
outputPath: z.string().optional(),
|
|
16728
|
+
title: z.string().optional(),
|
|
16729
|
+
description: z.string().optional(),
|
|
16730
|
+
generateFull: z.boolean().optional(),
|
|
16731
|
+
serveMarkdownVariants: z.boolean().optional()
|
|
16732
|
+
}).optional(),
|
|
16697
16733
|
state: z.object({
|
|
16698
16734
|
dir: z.string().optional()
|
|
16699
16735
|
}).optional()
|
|
@@ -16732,6 +16768,7 @@ function createDefaultConfig(projectId) {
|
|
|
16732
16768
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16733
16769
|
ignoreAttr: "data-search-ignore",
|
|
16734
16770
|
noindexAttr: "data-search-noindex",
|
|
16771
|
+
imageDescAttr: "data-search-description",
|
|
16735
16772
|
respectRobotsNoindex: true
|
|
16736
16773
|
},
|
|
16737
16774
|
transform: {
|
|
@@ -16741,39 +16778,52 @@ function createDefaultConfig(projectId) {
|
|
|
16741
16778
|
},
|
|
16742
16779
|
chunking: {
|
|
16743
16780
|
strategy: "hybrid",
|
|
16744
|
-
maxChars:
|
|
16781
|
+
maxChars: 1500,
|
|
16745
16782
|
overlapChars: 200,
|
|
16746
16783
|
minChars: 250,
|
|
16747
16784
|
headingPathDepth: 3,
|
|
16748
16785
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16749
16786
|
prependTitle: true,
|
|
16750
|
-
pageSummaryChunk: true
|
|
16787
|
+
pageSummaryChunk: true,
|
|
16788
|
+
weightHeadings: true
|
|
16751
16789
|
},
|
|
16752
16790
|
upstash: {
|
|
16753
|
-
urlEnv: "
|
|
16754
|
-
tokenEnv: "
|
|
16791
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16792
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16793
|
+
namespaces: {
|
|
16794
|
+
pages: "pages",
|
|
16795
|
+
chunks: "chunks"
|
|
16796
|
+
}
|
|
16797
|
+
},
|
|
16798
|
+
embedding: {
|
|
16799
|
+
model: "bge-large-en-v1.5",
|
|
16800
|
+
dimensions: 1024,
|
|
16801
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16802
|
+
batchSize: 100
|
|
16755
16803
|
},
|
|
16756
16804
|
search: {
|
|
16757
|
-
semanticWeight: 0.75,
|
|
16758
|
-
inputEnrichment: true,
|
|
16759
|
-
reranking: true,
|
|
16760
16805
|
dualSearch: true,
|
|
16761
16806
|
pageSearchWeight: 0.3
|
|
16762
16807
|
},
|
|
16763
16808
|
ranking: {
|
|
16764
16809
|
enableIncomingLinkBoost: true,
|
|
16765
16810
|
enableDepthBoost: true,
|
|
16811
|
+
enableFreshnessBoost: false,
|
|
16812
|
+
freshnessDecayRate: 1e-3,
|
|
16813
|
+
enableAnchorTextBoost: false,
|
|
16766
16814
|
pageWeights: {},
|
|
16767
16815
|
aggregationCap: 5,
|
|
16768
16816
|
aggregationDecay: 0.5,
|
|
16769
16817
|
minChunkScoreRatio: 0.5,
|
|
16770
|
-
|
|
16818
|
+
minScoreRatio: 0.7,
|
|
16771
16819
|
scoreGapThreshold: 0.4,
|
|
16772
16820
|
weights: {
|
|
16773
16821
|
incomingLinks: 0.05,
|
|
16774
16822
|
depth: 0.03,
|
|
16775
16823
|
aggregation: 0.1,
|
|
16776
|
-
titleMatch: 0.15
|
|
16824
|
+
titleMatch: 0.15,
|
|
16825
|
+
freshness: 0.1,
|
|
16826
|
+
anchorText: 0.1
|
|
16777
16827
|
}
|
|
16778
16828
|
},
|
|
16779
16829
|
api: {
|
|
@@ -16784,12 +16834,23 @@ function createDefaultConfig(projectId) {
|
|
|
16784
16834
|
},
|
|
16785
16835
|
mcp: {
|
|
16786
16836
|
enable: process.env.NODE_ENV !== "production",
|
|
16837
|
+
access: "private",
|
|
16787
16838
|
transport: "stdio",
|
|
16788
16839
|
http: {
|
|
16789
16840
|
port: 3338,
|
|
16790
16841
|
path: "/mcp"
|
|
16842
|
+
},
|
|
16843
|
+
handle: {
|
|
16844
|
+
path: "/api/mcp",
|
|
16845
|
+
enableJsonResponse: true
|
|
16791
16846
|
}
|
|
16792
16847
|
},
|
|
16848
|
+
llmsTxt: {
|
|
16849
|
+
enable: false,
|
|
16850
|
+
outputPath: "static/llms.txt",
|
|
16851
|
+
generateFull: true,
|
|
16852
|
+
serveMarkdownVariants: false
|
|
16853
|
+
},
|
|
16793
16854
|
state: {
|
|
16794
16855
|
dir: ".searchsocket"
|
|
16795
16856
|
}
|
|
@@ -16917,7 +16978,15 @@ ${issues}`
|
|
|
16917
16978
|
},
|
|
16918
16979
|
upstash: {
|
|
16919
16980
|
...defaults.upstash,
|
|
16920
|
-
...parsed.upstash
|
|
16981
|
+
...parsed.upstash,
|
|
16982
|
+
namespaces: {
|
|
16983
|
+
...defaults.upstash.namespaces,
|
|
16984
|
+
...parsed.upstash?.namespaces
|
|
16985
|
+
}
|
|
16986
|
+
},
|
|
16987
|
+
embedding: {
|
|
16988
|
+
...defaults.embedding,
|
|
16989
|
+
...parsed.embedding
|
|
16921
16990
|
},
|
|
16922
16991
|
search: {
|
|
16923
16992
|
...defaults.search,
|
|
@@ -16954,8 +17023,16 @@ ${issues}`
|
|
|
16954
17023
|
http: {
|
|
16955
17024
|
...defaults.mcp.http,
|
|
16956
17025
|
...parsed.mcp?.http
|
|
17026
|
+
},
|
|
17027
|
+
handle: {
|
|
17028
|
+
...defaults.mcp.handle,
|
|
17029
|
+
...parsed.mcp?.handle
|
|
16957
17030
|
}
|
|
16958
17031
|
},
|
|
17032
|
+
llmsTxt: {
|
|
17033
|
+
...defaults.llmsTxt,
|
|
17034
|
+
...parsed.llmsTxt
|
|
17035
|
+
},
|
|
16959
17036
|
state: {
|
|
16960
17037
|
...defaults.state,
|
|
16961
17038
|
...parsed.state
|
|
@@ -16975,6 +17052,15 @@ ${issues}`
|
|
|
16975
17052
|
maxDepth: 10
|
|
16976
17053
|
};
|
|
16977
17054
|
}
|
|
17055
|
+
if (merged.mcp.access === "public") {
|
|
17056
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17057
|
+
if (!resolvedKey) {
|
|
17058
|
+
throw new SearchSocketError(
|
|
17059
|
+
"CONFIG_MISSING",
|
|
17060
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17061
|
+
);
|
|
17062
|
+
}
|
|
17063
|
+
}
|
|
16978
17064
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16979
17065
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16980
17066
|
}
|
|
@@ -17023,13 +17109,84 @@ function normalizeMarkdown(input) {
|
|
|
17023
17109
|
function sanitizeScopeName(scopeName) {
|
|
17024
17110
|
return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
|
|
17025
17111
|
}
|
|
17112
|
+
function markdownToPlain(markdown) {
|
|
17113
|
+
return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
|
|
17114
|
+
}
|
|
17026
17115
|
function toSnippet(markdown, maxLen = 220) {
|
|
17027
|
-
const plain = markdown
|
|
17116
|
+
const plain = markdownToPlain(markdown);
|
|
17028
17117
|
if (plain.length <= maxLen) {
|
|
17029
17118
|
return plain;
|
|
17030
17119
|
}
|
|
17031
17120
|
return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
|
|
17032
17121
|
}
|
|
17122
|
+
function queryAwareExcerpt(markdown, query, maxLen = 220) {
|
|
17123
|
+
const plain = markdownToPlain(markdown);
|
|
17124
|
+
if (plain.length <= maxLen) return plain;
|
|
17125
|
+
const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
|
|
17126
|
+
if (tokens.length === 0) return toSnippet(markdown, maxLen);
|
|
17127
|
+
const positions = [];
|
|
17128
|
+
for (let ti = 0; ti < tokens.length; ti++) {
|
|
17129
|
+
const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
17130
|
+
const re = new RegExp(escaped, "gi");
|
|
17131
|
+
let m;
|
|
17132
|
+
while ((m = re.exec(plain)) !== null) {
|
|
17133
|
+
positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
|
|
17134
|
+
}
|
|
17135
|
+
}
|
|
17136
|
+
if (positions.length === 0) return toSnippet(markdown, maxLen);
|
|
17137
|
+
positions.sort((a, b) => a.start - b.start);
|
|
17138
|
+
let bestUniqueCount = 0;
|
|
17139
|
+
let bestTotalCount = 0;
|
|
17140
|
+
let bestLeft = 0;
|
|
17141
|
+
let bestRight = 0;
|
|
17142
|
+
let left = 0;
|
|
17143
|
+
const tokenCounts = /* @__PURE__ */ new Map();
|
|
17144
|
+
for (let right = 0; right < positions.length; right++) {
|
|
17145
|
+
tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
|
|
17146
|
+
while (positions[right].end - positions[left].start > maxLen && left < right) {
|
|
17147
|
+
const leftToken = positions[left].tokenIdx;
|
|
17148
|
+
const cnt = tokenCounts.get(leftToken) - 1;
|
|
17149
|
+
if (cnt === 0) tokenCounts.delete(leftToken);
|
|
17150
|
+
else tokenCounts.set(leftToken, cnt);
|
|
17151
|
+
left++;
|
|
17152
|
+
}
|
|
17153
|
+
const uniqueCount = tokenCounts.size;
|
|
17154
|
+
const totalCount = right - left + 1;
|
|
17155
|
+
if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
|
|
17156
|
+
bestUniqueCount = uniqueCount;
|
|
17157
|
+
bestTotalCount = totalCount;
|
|
17158
|
+
bestLeft = left;
|
|
17159
|
+
bestRight = right;
|
|
17160
|
+
}
|
|
17161
|
+
}
|
|
17162
|
+
const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
|
|
17163
|
+
let start = Math.max(0, mid - Math.floor(maxLen / 2));
|
|
17164
|
+
let end = Math.min(plain.length, start + maxLen);
|
|
17165
|
+
start = Math.max(0, end - maxLen);
|
|
17166
|
+
if (start > 0) {
|
|
17167
|
+
const spaceIdx = plain.lastIndexOf(" ", start);
|
|
17168
|
+
if (spaceIdx > start - 30) {
|
|
17169
|
+
start = spaceIdx + 1;
|
|
17170
|
+
}
|
|
17171
|
+
}
|
|
17172
|
+
if (end < plain.length) {
|
|
17173
|
+
const spaceIdx = plain.indexOf(" ", end);
|
|
17174
|
+
if (spaceIdx !== -1 && spaceIdx < end + 30) {
|
|
17175
|
+
end = spaceIdx;
|
|
17176
|
+
}
|
|
17177
|
+
}
|
|
17178
|
+
let excerpt = plain.slice(start, end);
|
|
17179
|
+
if (excerpt.length > Math.ceil(maxLen * 1.2)) {
|
|
17180
|
+
excerpt = excerpt.slice(0, maxLen);
|
|
17181
|
+
const lastSpace = excerpt.lastIndexOf(" ");
|
|
17182
|
+
if (lastSpace > maxLen * 0.5) {
|
|
17183
|
+
excerpt = excerpt.slice(0, lastSpace);
|
|
17184
|
+
}
|
|
17185
|
+
}
|
|
17186
|
+
const prefix = start > 0 ? "\u2026" : "";
|
|
17187
|
+
const suffix = end < plain.length ? "\u2026" : "";
|
|
17188
|
+
return `${prefix}${excerpt}${suffix}`;
|
|
17189
|
+
}
|
|
17033
17190
|
function extractFirstParagraph(markdown) {
|
|
17034
17191
|
const lines = markdown.split("\n");
|
|
17035
17192
|
let inFence = false;
|
|
@@ -17136,162 +17293,288 @@ function joinUrl(baseUrl, route) {
|
|
|
17136
17293
|
const routePart = ensureLeadingSlash(route);
|
|
17137
17294
|
return `${base}${routePart}`;
|
|
17138
17295
|
}
|
|
17139
|
-
|
|
17140
|
-
// src/vector/upstash.ts
|
|
17141
|
-
function chunkIndexName(scope) {
|
|
17142
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17143
|
-
}
|
|
17144
|
-
function pageIndexName(scope) {
|
|
17145
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17146
|
-
}
|
|
17147
17296
|
var UpstashSearchStore = class {
|
|
17148
|
-
|
|
17297
|
+
index;
|
|
17298
|
+
pagesNs;
|
|
17299
|
+
chunksNs;
|
|
17149
17300
|
constructor(opts) {
|
|
17150
|
-
this.
|
|
17151
|
-
|
|
17152
|
-
|
|
17153
|
-
return this.client.index(chunkIndexName(scope));
|
|
17154
|
-
}
|
|
17155
|
-
pageIndex(scope) {
|
|
17156
|
-
return this.client.index(pageIndexName(scope));
|
|
17301
|
+
this.index = opts.index;
|
|
17302
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17303
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17157
17304
|
}
|
|
17158
17305
|
async upsertChunks(chunks, scope) {
|
|
17159
17306
|
if (chunks.length === 0) return;
|
|
17160
|
-
const index = this.chunkIndex(scope);
|
|
17161
17307
|
const BATCH_SIZE = 100;
|
|
17162
17308
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17163
17309
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17164
|
-
await
|
|
17165
|
-
|
|
17166
|
-
|
|
17167
|
-
|
|
17168
|
-
|
|
17169
|
-
|
|
17170
|
-
|
|
17171
|
-
|
|
17172
|
-
|
|
17173
|
-
|
|
17174
|
-
|
|
17175
|
-
|
|
17310
|
+
await this.chunksNs.upsert(
|
|
17311
|
+
batch.map((c) => ({
|
|
17312
|
+
id: c.id,
|
|
17313
|
+
data: c.data,
|
|
17314
|
+
metadata: {
|
|
17315
|
+
...c.metadata,
|
|
17316
|
+
projectId: scope.projectId,
|
|
17317
|
+
scopeName: scope.scopeName,
|
|
17318
|
+
type: c.metadata.type || "chunk"
|
|
17319
|
+
}
|
|
17320
|
+
}))
|
|
17321
|
+
);
|
|
17322
|
+
}
|
|
17323
|
+
}
|
|
17324
|
+
async search(data, opts, scope) {
|
|
17325
|
+
const filterParts = [
|
|
17326
|
+
`projectId = '${scope.projectId}'`,
|
|
17327
|
+
`scopeName = '${scope.scopeName}'`
|
|
17328
|
+
];
|
|
17329
|
+
if (opts.filter) {
|
|
17330
|
+
filterParts.push(opts.filter);
|
|
17331
|
+
}
|
|
17332
|
+
const results = await this.chunksNs.query({
|
|
17333
|
+
data,
|
|
17334
|
+
topK: opts.limit,
|
|
17335
|
+
includeMetadata: true,
|
|
17336
|
+
filter: filterParts.join(" AND "),
|
|
17337
|
+
queryMode: QueryMode.HYBRID,
|
|
17338
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17339
|
+
});
|
|
17340
|
+
return results.map((doc) => ({
|
|
17341
|
+
id: String(doc.id),
|
|
17342
|
+
score: doc.score,
|
|
17343
|
+
metadata: {
|
|
17344
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17345
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17346
|
+
url: doc.metadata?.url ?? "",
|
|
17347
|
+
path: doc.metadata?.path ?? "",
|
|
17348
|
+
title: doc.metadata?.title ?? "",
|
|
17349
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17350
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17351
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17352
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17353
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17354
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17355
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17356
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17357
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17358
|
+
tags: doc.metadata?.tags ?? [],
|
|
17359
|
+
description: doc.metadata?.description || void 0,
|
|
17360
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17361
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17362
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17363
|
+
}
|
|
17364
|
+
}));
|
|
17365
|
+
}
|
|
17366
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17367
|
+
const filterParts = [
|
|
17368
|
+
`projectId = '${scope.projectId}'`,
|
|
17369
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17370
|
+
`url = '${url}'`
|
|
17371
|
+
];
|
|
17372
|
+
if (opts.filter) {
|
|
17373
|
+
filterParts.push(opts.filter);
|
|
17374
|
+
}
|
|
17375
|
+
const results = await this.chunksNs.query({
|
|
17376
|
+
data,
|
|
17377
|
+
topK: opts.limit,
|
|
17378
|
+
includeMetadata: true,
|
|
17379
|
+
filter: filterParts.join(" AND "),
|
|
17380
|
+
queryMode: QueryMode.HYBRID,
|
|
17381
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17176
17382
|
});
|
|
17177
17383
|
return results.map((doc) => ({
|
|
17178
|
-
id: doc.id,
|
|
17384
|
+
id: String(doc.id),
|
|
17179
17385
|
score: doc.score,
|
|
17180
17386
|
metadata: {
|
|
17181
17387
|
projectId: doc.metadata?.projectId ?? "",
|
|
17182
17388
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17183
|
-
url: doc.
|
|
17389
|
+
url: doc.metadata?.url ?? "",
|
|
17184
17390
|
path: doc.metadata?.path ?? "",
|
|
17185
|
-
title: doc.
|
|
17186
|
-
sectionTitle: doc.
|
|
17187
|
-
headingPath: doc.
|
|
17391
|
+
title: doc.metadata?.title ?? "",
|
|
17392
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17393
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17188
17394
|
snippet: doc.metadata?.snippet ?? "",
|
|
17189
|
-
chunkText: doc.
|
|
17395
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17190
17396
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17191
17397
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17192
17398
|
depth: doc.metadata?.depth ?? 0,
|
|
17193
17399
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17194
17400
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17195
|
-
tags: doc.
|
|
17401
|
+
tags: doc.metadata?.tags ?? [],
|
|
17196
17402
|
description: doc.metadata?.description || void 0,
|
|
17197
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17403
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17404
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17405
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17198
17406
|
}
|
|
17199
17407
|
}));
|
|
17200
17408
|
}
|
|
17201
|
-
async
|
|
17202
|
-
|
|
17409
|
+
async searchPagesByText(data, opts, scope) {
|
|
17410
|
+
return this.queryPages({ data }, opts, scope);
|
|
17411
|
+
}
|
|
17412
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17413
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17414
|
+
}
|
|
17415
|
+
async queryPages(input, opts, scope) {
|
|
17416
|
+
const filterParts = [
|
|
17417
|
+
`projectId = '${scope.projectId}'`,
|
|
17418
|
+
`scopeName = '${scope.scopeName}'`
|
|
17419
|
+
];
|
|
17420
|
+
if (opts.filter) {
|
|
17421
|
+
filterParts.push(opts.filter);
|
|
17422
|
+
}
|
|
17203
17423
|
let results;
|
|
17204
17424
|
try {
|
|
17205
|
-
results = await
|
|
17206
|
-
|
|
17207
|
-
|
|
17208
|
-
|
|
17209
|
-
|
|
17210
|
-
|
|
17211
|
-
|
|
17425
|
+
results = await this.pagesNs.query({
|
|
17426
|
+
...input,
|
|
17427
|
+
topK: opts.limit,
|
|
17428
|
+
includeMetadata: true,
|
|
17429
|
+
filter: filterParts.join(" AND "),
|
|
17430
|
+
queryMode: QueryMode.HYBRID,
|
|
17431
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17212
17432
|
});
|
|
17213
17433
|
} catch {
|
|
17214
17434
|
return [];
|
|
17215
17435
|
}
|
|
17216
17436
|
return results.map((doc) => ({
|
|
17217
|
-
id: doc.id,
|
|
17437
|
+
id: String(doc.id),
|
|
17218
17438
|
score: doc.score,
|
|
17219
|
-
title: doc.
|
|
17220
|
-
url: doc.
|
|
17221
|
-
description: doc.
|
|
17222
|
-
tags: doc.
|
|
17439
|
+
title: doc.metadata?.title ?? "",
|
|
17440
|
+
url: doc.metadata?.url ?? "",
|
|
17441
|
+
description: doc.metadata?.description ?? "",
|
|
17442
|
+
tags: doc.metadata?.tags ?? [],
|
|
17223
17443
|
depth: doc.metadata?.depth ?? 0,
|
|
17224
17444
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17225
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17445
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17446
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17226
17447
|
}));
|
|
17227
17448
|
}
|
|
17228
|
-
async deleteByIds(ids,
|
|
17449
|
+
async deleteByIds(ids, _scope) {
|
|
17229
17450
|
if (ids.length === 0) return;
|
|
17230
|
-
const
|
|
17231
|
-
const BATCH_SIZE = 500;
|
|
17451
|
+
const BATCH_SIZE = 100;
|
|
17232
17452
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17233
17453
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17234
|
-
await
|
|
17454
|
+
await this.chunksNs.delete(batch);
|
|
17235
17455
|
}
|
|
17236
17456
|
}
|
|
17237
17457
|
async deleteScope(scope) {
|
|
17238
|
-
|
|
17239
|
-
const
|
|
17240
|
-
|
|
17241
|
-
|
|
17242
|
-
|
|
17243
|
-
|
|
17244
|
-
|
|
17245
|
-
|
|
17246
|
-
|
|
17458
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17459
|
+
const ids = [];
|
|
17460
|
+
let cursor = "0";
|
|
17461
|
+
try {
|
|
17462
|
+
for (; ; ) {
|
|
17463
|
+
const result = await ns.range({
|
|
17464
|
+
cursor,
|
|
17465
|
+
limit: 100,
|
|
17466
|
+
includeMetadata: true
|
|
17467
|
+
});
|
|
17468
|
+
for (const doc of result.vectors) {
|
|
17469
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17470
|
+
ids.push(String(doc.id));
|
|
17471
|
+
}
|
|
17472
|
+
}
|
|
17473
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17474
|
+
cursor = result.nextCursor;
|
|
17475
|
+
}
|
|
17476
|
+
} catch {
|
|
17477
|
+
}
|
|
17478
|
+
if (ids.length > 0) {
|
|
17479
|
+
const BATCH_SIZE = 100;
|
|
17480
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17481
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17482
|
+
await ns.delete(batch);
|
|
17483
|
+
}
|
|
17484
|
+
}
|
|
17247
17485
|
}
|
|
17248
17486
|
}
|
|
17249
17487
|
async listScopes(projectId) {
|
|
17250
|
-
const
|
|
17251
|
-
const
|
|
17252
|
-
|
|
17253
|
-
for (const name of allIndexes) {
|
|
17254
|
-
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
17255
|
-
const scopeName = name.slice(prefix.length);
|
|
17256
|
-
scopeNames.add(scopeName);
|
|
17257
|
-
}
|
|
17258
|
-
}
|
|
17259
|
-
const scopes = [];
|
|
17260
|
-
for (const scopeName of scopeNames) {
|
|
17261
|
-
const scope = {
|
|
17262
|
-
projectId,
|
|
17263
|
-
scopeName,
|
|
17264
|
-
scopeId: `${projectId}:${scopeName}`
|
|
17265
|
-
};
|
|
17488
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17489
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17490
|
+
let cursor = "0";
|
|
17266
17491
|
try {
|
|
17267
|
-
|
|
17268
|
-
|
|
17269
|
-
|
|
17270
|
-
|
|
17271
|
-
|
|
17272
|
-
|
|
17273
|
-
|
|
17492
|
+
for (; ; ) {
|
|
17493
|
+
const result = await ns.range({
|
|
17494
|
+
cursor,
|
|
17495
|
+
limit: 100,
|
|
17496
|
+
includeMetadata: true
|
|
17497
|
+
});
|
|
17498
|
+
for (const doc of result.vectors) {
|
|
17499
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17500
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17501
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17502
|
+
}
|
|
17503
|
+
}
|
|
17504
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17505
|
+
cursor = result.nextCursor;
|
|
17506
|
+
}
|
|
17274
17507
|
} catch {
|
|
17275
|
-
scopes.push({
|
|
17276
|
-
projectId,
|
|
17277
|
-
scopeName,
|
|
17278
|
-
lastIndexedAt: "unknown",
|
|
17279
|
-
documentCount: 0
|
|
17280
|
-
});
|
|
17281
17508
|
}
|
|
17282
17509
|
}
|
|
17283
|
-
return
|
|
17510
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17511
|
+
projectId,
|
|
17512
|
+
scopeName,
|
|
17513
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17514
|
+
documentCount: count
|
|
17515
|
+
}));
|
|
17284
17516
|
}
|
|
17285
17517
|
async getContentHashes(scope) {
|
|
17286
17518
|
const map = /* @__PURE__ */ new Map();
|
|
17287
|
-
const index = this.chunkIndex(scope);
|
|
17288
17519
|
let cursor = "0";
|
|
17289
17520
|
try {
|
|
17290
17521
|
for (; ; ) {
|
|
17291
|
-
const result = await
|
|
17292
|
-
|
|
17293
|
-
|
|
17294
|
-
|
|
17522
|
+
const result = await this.chunksNs.range({
|
|
17523
|
+
cursor,
|
|
17524
|
+
limit: 100,
|
|
17525
|
+
includeMetadata: true
|
|
17526
|
+
});
|
|
17527
|
+
for (const doc of result.vectors) {
|
|
17528
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17529
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17530
|
+
}
|
|
17531
|
+
}
|
|
17532
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17533
|
+
cursor = result.nextCursor;
|
|
17534
|
+
}
|
|
17535
|
+
} catch {
|
|
17536
|
+
}
|
|
17537
|
+
return map;
|
|
17538
|
+
}
|
|
17539
|
+
async listPages(scope, opts) {
|
|
17540
|
+
const cursor = opts?.cursor ?? "0";
|
|
17541
|
+
const limit = opts?.limit ?? 50;
|
|
17542
|
+
try {
|
|
17543
|
+
const result = await this.pagesNs.range({
|
|
17544
|
+
cursor,
|
|
17545
|
+
limit,
|
|
17546
|
+
includeMetadata: true
|
|
17547
|
+
});
|
|
17548
|
+
const pages = result.vectors.filter(
|
|
17549
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17550
|
+
).map((doc) => ({
|
|
17551
|
+
url: doc.metadata?.url ?? "",
|
|
17552
|
+
title: doc.metadata?.title ?? "",
|
|
17553
|
+
description: doc.metadata?.description ?? "",
|
|
17554
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17555
|
+
}));
|
|
17556
|
+
const response = { pages };
|
|
17557
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17558
|
+
response.nextCursor = result.nextCursor;
|
|
17559
|
+
}
|
|
17560
|
+
return response;
|
|
17561
|
+
} catch {
|
|
17562
|
+
return { pages: [] };
|
|
17563
|
+
}
|
|
17564
|
+
}
|
|
17565
|
+
async getPageHashes(scope) {
|
|
17566
|
+
const map = /* @__PURE__ */ new Map();
|
|
17567
|
+
let cursor = "0";
|
|
17568
|
+
try {
|
|
17569
|
+
for (; ; ) {
|
|
17570
|
+
const result = await this.pagesNs.range({
|
|
17571
|
+
cursor,
|
|
17572
|
+
limit: 100,
|
|
17573
|
+
includeMetadata: true
|
|
17574
|
+
});
|
|
17575
|
+
for (const doc of result.vectors) {
|
|
17576
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17577
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17295
17578
|
}
|
|
17296
17579
|
}
|
|
17297
17580
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17301,47 +17584,43 @@ var UpstashSearchStore = class {
|
|
|
17301
17584
|
}
|
|
17302
17585
|
return map;
|
|
17303
17586
|
}
|
|
17587
|
+
async deletePagesByIds(ids, _scope) {
|
|
17588
|
+
if (ids.length === 0) return;
|
|
17589
|
+
const BATCH_SIZE = 50;
|
|
17590
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17591
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17592
|
+
await this.pagesNs.delete(batch);
|
|
17593
|
+
}
|
|
17594
|
+
}
|
|
17304
17595
|
async upsertPages(pages, scope) {
|
|
17305
17596
|
if (pages.length === 0) return;
|
|
17306
|
-
const index = this.pageIndex(scope);
|
|
17307
17597
|
const BATCH_SIZE = 50;
|
|
17308
17598
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17309
17599
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17310
|
-
|
|
17311
|
-
|
|
17312
|
-
|
|
17313
|
-
|
|
17314
|
-
|
|
17315
|
-
|
|
17316
|
-
|
|
17317
|
-
|
|
17318
|
-
|
|
17319
|
-
|
|
17320
|
-
}
|
|
17321
|
-
|
|
17322
|
-
markdown: p.markdown,
|
|
17323
|
-
projectId: p.projectId,
|
|
17324
|
-
scopeName: p.scopeName,
|
|
17325
|
-
routeFile: p.routeFile,
|
|
17326
|
-
routeResolution: p.routeResolution,
|
|
17327
|
-
incomingLinks: p.incomingLinks,
|
|
17328
|
-
outgoingLinks: p.outgoingLinks,
|
|
17329
|
-
depth: p.depth,
|
|
17330
|
-
indexedAt: p.indexedAt
|
|
17331
|
-
}
|
|
17332
|
-
}));
|
|
17333
|
-
await index.upsert(docs);
|
|
17600
|
+
await this.pagesNs.upsert(
|
|
17601
|
+
batch.map((p) => ({
|
|
17602
|
+
id: p.id,
|
|
17603
|
+
data: p.data,
|
|
17604
|
+
metadata: {
|
|
17605
|
+
...p.metadata,
|
|
17606
|
+
projectId: scope.projectId,
|
|
17607
|
+
scopeName: scope.scopeName,
|
|
17608
|
+
type: "page"
|
|
17609
|
+
}
|
|
17610
|
+
}))
|
|
17611
|
+
);
|
|
17334
17612
|
}
|
|
17335
17613
|
}
|
|
17336
17614
|
async getPage(url, scope) {
|
|
17337
|
-
const index = this.pageIndex(scope);
|
|
17338
17615
|
try {
|
|
17339
|
-
const results = await
|
|
17616
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17617
|
+
includeMetadata: true
|
|
17618
|
+
});
|
|
17340
17619
|
const doc = results[0];
|
|
17341
|
-
if (!doc) return null;
|
|
17620
|
+
if (!doc || !doc.metadata) return null;
|
|
17342
17621
|
return {
|
|
17343
|
-
url: doc.
|
|
17344
|
-
title: doc.
|
|
17622
|
+
url: doc.metadata.url,
|
|
17623
|
+
title: doc.metadata.title,
|
|
17345
17624
|
markdown: doc.metadata.markdown,
|
|
17346
17625
|
projectId: doc.metadata.projectId,
|
|
17347
17626
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17349,27 +17628,86 @@ var UpstashSearchStore = class {
|
|
|
17349
17628
|
routeResolution: doc.metadata.routeResolution,
|
|
17350
17629
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17351
17630
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17631
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17352
17632
|
depth: doc.metadata.depth,
|
|
17353
|
-
tags: doc.
|
|
17633
|
+
tags: doc.metadata.tags ?? [],
|
|
17354
17634
|
indexedAt: doc.metadata.indexedAt,
|
|
17355
|
-
summary: doc.
|
|
17356
|
-
description: doc.
|
|
17357
|
-
keywords: doc.
|
|
17635
|
+
summary: doc.metadata.summary || void 0,
|
|
17636
|
+
description: doc.metadata.description || void 0,
|
|
17637
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17638
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17358
17639
|
};
|
|
17359
17640
|
} catch {
|
|
17360
17641
|
return null;
|
|
17361
17642
|
}
|
|
17362
17643
|
}
|
|
17644
|
+
async fetchPageWithVector(url, scope) {
|
|
17645
|
+
try {
|
|
17646
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17647
|
+
includeMetadata: true,
|
|
17648
|
+
includeVectors: true
|
|
17649
|
+
});
|
|
17650
|
+
const doc = results[0];
|
|
17651
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17652
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17653
|
+
return null;
|
|
17654
|
+
}
|
|
17655
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17656
|
+
} catch {
|
|
17657
|
+
return null;
|
|
17658
|
+
}
|
|
17659
|
+
}
|
|
17660
|
+
async fetchPagesBatch(urls, scope) {
|
|
17661
|
+
if (urls.length === 0) return [];
|
|
17662
|
+
try {
|
|
17663
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17664
|
+
includeMetadata: true
|
|
17665
|
+
});
|
|
17666
|
+
const out = [];
|
|
17667
|
+
for (const doc of results) {
|
|
17668
|
+
if (!doc || !doc.metadata) continue;
|
|
17669
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17670
|
+
continue;
|
|
17671
|
+
}
|
|
17672
|
+
out.push({
|
|
17673
|
+
url: doc.metadata.url,
|
|
17674
|
+
title: doc.metadata.title,
|
|
17675
|
+
routeFile: doc.metadata.routeFile,
|
|
17676
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17677
|
+
});
|
|
17678
|
+
}
|
|
17679
|
+
return out;
|
|
17680
|
+
} catch {
|
|
17681
|
+
return [];
|
|
17682
|
+
}
|
|
17683
|
+
}
|
|
17363
17684
|
async deletePages(scope) {
|
|
17685
|
+
const ids = [];
|
|
17686
|
+
let cursor = "0";
|
|
17364
17687
|
try {
|
|
17365
|
-
|
|
17366
|
-
|
|
17688
|
+
for (; ; ) {
|
|
17689
|
+
const result = await this.pagesNs.range({
|
|
17690
|
+
cursor,
|
|
17691
|
+
limit: 100,
|
|
17692
|
+
includeMetadata: true
|
|
17693
|
+
});
|
|
17694
|
+
for (const doc of result.vectors) {
|
|
17695
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17696
|
+
ids.push(String(doc.id));
|
|
17697
|
+
}
|
|
17698
|
+
}
|
|
17699
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17700
|
+
cursor = result.nextCursor;
|
|
17701
|
+
}
|
|
17367
17702
|
} catch {
|
|
17368
17703
|
}
|
|
17704
|
+
if (ids.length > 0) {
|
|
17705
|
+
await this.deletePagesByIds(ids, scope);
|
|
17706
|
+
}
|
|
17369
17707
|
}
|
|
17370
17708
|
async health() {
|
|
17371
17709
|
try {
|
|
17372
|
-
await this.
|
|
17710
|
+
await this.index.info();
|
|
17373
17711
|
return { ok: true };
|
|
17374
17712
|
} catch (error) {
|
|
17375
17713
|
return {
|
|
@@ -17379,14 +17717,31 @@ var UpstashSearchStore = class {
|
|
|
17379
17717
|
}
|
|
17380
17718
|
}
|
|
17381
17719
|
async dropAllIndexes(projectId) {
|
|
17382
|
-
const
|
|
17383
|
-
|
|
17384
|
-
|
|
17385
|
-
|
|
17386
|
-
|
|
17387
|
-
const
|
|
17388
|
-
|
|
17389
|
-
|
|
17720
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17721
|
+
const ids = [];
|
|
17722
|
+
let cursor = "0";
|
|
17723
|
+
try {
|
|
17724
|
+
for (; ; ) {
|
|
17725
|
+
const result = await ns.range({
|
|
17726
|
+
cursor,
|
|
17727
|
+
limit: 100,
|
|
17728
|
+
includeMetadata: true
|
|
17729
|
+
});
|
|
17730
|
+
for (const doc of result.vectors) {
|
|
17731
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17732
|
+
ids.push(String(doc.id));
|
|
17733
|
+
}
|
|
17734
|
+
}
|
|
17735
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17736
|
+
cursor = result.nextCursor;
|
|
17737
|
+
}
|
|
17738
|
+
} catch {
|
|
17739
|
+
}
|
|
17740
|
+
if (ids.length > 0) {
|
|
17741
|
+
const BATCH_SIZE = 100;
|
|
17742
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17743
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17744
|
+
await ns.delete(batch);
|
|
17390
17745
|
}
|
|
17391
17746
|
}
|
|
17392
17747
|
}
|
|
@@ -17400,12 +17755,16 @@ async function createUpstashStore(config) {
|
|
|
17400
17755
|
if (!url || !token) {
|
|
17401
17756
|
throw new SearchSocketError(
|
|
17402
17757
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17403
|
-
`Missing Upstash
|
|
17758
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17404
17759
|
);
|
|
17405
17760
|
}
|
|
17406
|
-
const {
|
|
17407
|
-
const
|
|
17408
|
-
return new UpstashSearchStore({
|
|
17761
|
+
const { Index } = await import('@upstash/vector');
|
|
17762
|
+
const index = new Index({ url, token });
|
|
17763
|
+
return new UpstashSearchStore({
|
|
17764
|
+
index,
|
|
17765
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17766
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17767
|
+
});
|
|
17409
17768
|
}
|
|
17410
17769
|
|
|
17411
17770
|
// src/utils/pattern.ts
|
|
@@ -17448,29 +17807,65 @@ function nonNegativeOrZero(value) {
|
|
|
17448
17807
|
function normalizeForTitleMatch(text) {
|
|
17449
17808
|
return text.toLowerCase().replace(/[^a-z0-9\s]/g, "").replace(/\s+/g, " ").trim();
|
|
17450
17809
|
}
|
|
17451
|
-
function rankHits(hits, config, query) {
|
|
17810
|
+
function rankHits(hits, config, query, debug) {
|
|
17452
17811
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
17453
17812
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
17454
17813
|
return hits.map((hit) => {
|
|
17455
|
-
|
|
17814
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
17815
|
+
let score = baseScore;
|
|
17816
|
+
let incomingLinkBoostValue = 0;
|
|
17456
17817
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
17457
17818
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
17458
|
-
|
|
17819
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
17820
|
+
score += incomingLinkBoostValue;
|
|
17459
17821
|
}
|
|
17822
|
+
let depthBoostValue = 0;
|
|
17460
17823
|
if (config.ranking.enableDepthBoost) {
|
|
17461
17824
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
17462
|
-
|
|
17825
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
17826
|
+
score += depthBoostValue;
|
|
17463
17827
|
}
|
|
17828
|
+
let titleMatchBoostValue = 0;
|
|
17464
17829
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
17465
17830
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
17466
17831
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
17467
|
-
|
|
17832
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
17833
|
+
score += titleMatchBoostValue;
|
|
17468
17834
|
}
|
|
17469
17835
|
}
|
|
17470
|
-
|
|
17836
|
+
let freshnessBoostValue = 0;
|
|
17837
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
17838
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
17839
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
17840
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
17841
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
17842
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
17843
|
+
score += freshnessBoostValue;
|
|
17844
|
+
}
|
|
17845
|
+
}
|
|
17846
|
+
let anchorTextMatchBoostValue = 0;
|
|
17847
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
17848
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
17849
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
17850
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
17851
|
+
score += anchorTextMatchBoostValue;
|
|
17852
|
+
}
|
|
17853
|
+
}
|
|
17854
|
+
const result = {
|
|
17471
17855
|
hit,
|
|
17472
17856
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
17473
17857
|
};
|
|
17858
|
+
if (debug) {
|
|
17859
|
+
result.breakdown = {
|
|
17860
|
+
baseScore,
|
|
17861
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
17862
|
+
depthBoost: depthBoostValue,
|
|
17863
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
17864
|
+
freshnessBoost: freshnessBoostValue,
|
|
17865
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
17866
|
+
};
|
|
17867
|
+
}
|
|
17868
|
+
return result;
|
|
17474
17869
|
}).sort((a, b) => {
|
|
17475
17870
|
const delta = b.finalScore - a.finalScore;
|
|
17476
17871
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -17479,12 +17874,13 @@ function rankHits(hits, config, query) {
|
|
|
17479
17874
|
function trimByScoreGap(results, config) {
|
|
17480
17875
|
if (results.length === 0) return results;
|
|
17481
17876
|
const threshold = config.ranking.scoreGapThreshold;
|
|
17482
|
-
const
|
|
17483
|
-
if (
|
|
17484
|
-
const
|
|
17485
|
-
|
|
17486
|
-
|
|
17487
|
-
|
|
17877
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
17878
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
17879
|
+
const topScore = results[0].pageScore;
|
|
17880
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
17881
|
+
const minThreshold = topScore * minScoreRatio;
|
|
17882
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
17883
|
+
}
|
|
17488
17884
|
}
|
|
17489
17885
|
if (threshold > 0 && results.length > 1) {
|
|
17490
17886
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -17554,79 +17950,280 @@ function aggregateByPage(ranked, config) {
|
|
|
17554
17950
|
return Number.isNaN(delta) ? 0 : delta;
|
|
17555
17951
|
});
|
|
17556
17952
|
}
|
|
17557
|
-
function
|
|
17558
|
-
|
|
17559
|
-
const
|
|
17560
|
-
|
|
17561
|
-
|
|
17562
|
-
|
|
17563
|
-
|
|
17564
|
-
|
|
17565
|
-
|
|
17566
|
-
|
|
17567
|
-
|
|
17568
|
-
if (pageHit) {
|
|
17569
|
-
pagesWithChunks.add(url);
|
|
17570
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
17571
|
-
return {
|
|
17572
|
-
hit: ranked.hit,
|
|
17573
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
17574
|
-
};
|
|
17953
|
+
/**
 * Score page-level search hits and return them ordered best-first.
 *
 * For each hit the store score is adjusted by the enabled boosts
 * (incoming links, depth, title match, freshness) and then multiplied by the
 * page weight resolved from `config.ranking.pageWeights`. Pages whose weight
 * is exactly 0 are excluded from the result.
 *
 * The page weight is resolved once per hit and zero-weight pages are skipped
 * before any scoring work; the current time is read once so every hit in a
 * call is aged against the same instant.
 *
 * @param {Array<object>} pageHits Hits with `url`, `title`, `score`, `depth`,
 *   `incomingLinks`, `tags`, `description`, `routeFile`, `publishedAt`.
 * @param {object} config Resolved config; only `config.ranking` is read.
 * @param {string} [query] Raw query text used for the title-match boost.
 * @param {boolean} [debug] When truthy, each result carries a `breakdown`.
 * @returns {Array<object>} Ranked pages sorted by descending `finalScore`.
 */
function rankPageHits(pageHits, config, query, debug) {
  const { ranking } = config;
  const { weights } = ranking;
  const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
  const titleMatchWeight = weights.titleMatch;
  const now = Date.now();
  const ranked = [];
  for (const hit of pageHits) {
    const pageWeight = findPageWeight(hit.url, ranking.pageWeights);
    // A weight of exactly 0 means "never show this page".
    if (pageWeight === 0) continue;

    const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    let score = baseScore;

    let incomingLinkBoostValue = 0;
    if (ranking.enableIncomingLinkBoost) {
      const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
      incomingLinkBoostValue = incomingBoost * weights.incomingLinks;
      score += incomingLinkBoostValue;
    }

    let depthBoostValue = 0;
    if (ranking.enableDepthBoost) {
      const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
      depthBoostValue = depthBoost * weights.depth;
      score += depthBoostValue;
    }

    let titleMatchBoostValue = 0;
    if (normalizedQuery && titleMatchWeight > 0) {
      const normalizedTitle = normalizeForTitleMatch(hit.title);
      // Containment in either direction counts as a title match.
      if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
        titleMatchBoostValue = titleMatchWeight;
        score += titleMatchBoostValue;
      }
    }

    let freshnessBoostValue = 0;
    if (ranking.enableFreshnessBoost) {
      const publishedAt = hit.publishedAt;
      if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
        // 864e5 ms = one day; future dates are clamped to "today".
        const daysSince = Math.max(0, (now - publishedAt) / 864e5);
        const decay = 1 / (1 + nonNegativeOrZero(daysSince) * ranking.freshnessDecayRate);
        freshnessBoostValue = decay * weights.freshness;
        score += freshnessBoostValue;
      }
    }

    if (pageWeight !== 1) {
      score *= pageWeight;
    }

    const result = {
      url: hit.url,
      title: hit.title,
      description: hit.description,
      routeFile: hit.routeFile,
      depth: hit.depth,
      incomingLinks: hit.incomingLinks,
      tags: hit.tags,
      baseScore,
      finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
      publishedAt: hit.publishedAt
    };
    if (debug) {
      result.breakdown = {
        baseScore,
        pageWeight,
        incomingLinkBoost: incomingLinkBoostValue,
        depthBoost: depthBoostValue,
        titleMatchBoost: titleMatchBoostValue,
        freshnessBoost: freshnessBoostValue
      };
    }
    ranked.push(result);
  }
  return ranked.sort((a, b) => {
    const delta = b.finalScore - a.finalScore;
    // -Infinity minus -Infinity is NaN; treat such pairs as equal.
    return Number.isNaN(delta) ? 0 : delta;
  });
}
|
|
18021
|
+
/**
 * Trim a best-first list of ranked pages using two relative cut-offs.
 *
 * 1. `ranking.minScoreRatio` (> 0): drop pages scoring below
 *    `topScore * minScoreRatio`, applied only when the top score is finite
 *    and positive.
 * 2. `ranking.scoreGapThreshold` (> 0): cut the list at the first page whose
 *    relative drop from its predecessor, `(prev - current) / prev`, reaches
 *    the threshold (only evaluated when `prev > 0`).
 *
 * @param {Array<{finalScore: number}>} results Pages sorted by descending score.
 * @param {{ranking: {scoreGapThreshold: number, minScoreRatio: number}}} config
 * @returns {Array<{finalScore: number}>} The input array itself when nothing
 *   is trimmed, otherwise a new shorter array.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const { scoreGapThreshold, minScoreRatio } = config.ranking;
  let kept = results;
  if (minScoreRatio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * minScoreRatio;
      kept = kept.filter((entry) => entry.finalScore >= floor);
    }
  }
  if (!(scoreGapThreshold > 0) || kept.length <= 1) return kept;
  const cutIndex = kept.findIndex((entry, idx) => {
    if (idx === 0) return false;
    const prev = kept[idx - 1].finalScore;
    return prev > 0 && (prev - entry.finalScore) / prev >= scoreGapThreshold;
  });
  return cutIndex === -1 ? kept : kept.slice(0, cutIndex);
}
|
|
18046
|
+
|
|
18047
|
+
// src/search/related-pages.ts
|
|
18048
|
+
/**
 * Path-prefix similarity of two URL paths in [0, 1].
 *
 * Both paths are split on "/" (empty segments dropped) and the number of
 * leading segments they share is plugged into a Dice-style ratio:
 * `2 * shared / (lenA + lenB)`. Two root paths score 1; a root path compared
 * with a non-root path scores 0.
 *
 * @param {string} urlA
 * @param {string} urlB
 * @returns {number}
 */
function diceScore(urlA, urlB) {
  const toSegments = (url) => url.split("/").filter(Boolean);
  const a = toSegments(urlA);
  const b = toSegments(urlB);
  if (a.length === 0 || b.length === 0) {
    return a.length === b.length ? 1 : 0;
  }
  let shared = 0;
  // Count matching segments from the front; stop at the first mismatch.
  while (shared < a.length && shared < b.length && a[shared] === b[shared]) {
    shared += 1;
  }
  return 2 * shared / (a.length + b.length);
}
|
|
18064
|
+
/**
 * Blend the three relatedness signals into one score.
 *
 * A direct link contributes a flat 0.5; path similarity is weighted 0.3 and
 * semantic similarity 0.2.
 *
 * @param {boolean} isLinked Whether the pages are directly linked.
 * @param {number} dice Path similarity.
 * @param {number} semantic Semantic similarity.
 * @returns {number}
 */
function compositeScore(isLinked, dice, semantic) {
  const linkComponent = isLinked ? 0.5 : 0;
  return linkComponent + 0.3 * dice + 0.2 * semantic;
}
|
|
18067
|
+
/**
 * Pick the single label that best describes how two pages relate.
 *
 * Precedence: outgoing link, then incoming link, then "sibling" when the path
 * similarity exceeds 0.4, otherwise "semantic".
 *
 * @param {boolean} isOutgoing
 * @param {boolean} isIncoming
 * @param {number} dice Path similarity.
 * @returns {"outgoing_link" | "incoming_link" | "sibling" | "semantic"}
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  if (isOutgoing) {
    return "outgoing_link";
  }
  if (isIncoming) {
    return "incoming_link";
  }
  return dice > 0.4 ? "sibling" : "semantic";
}
|
|
18073
|
+
|
|
18074
|
+
// src/utils/structured-meta.ts
|
|
18075
|
+
// Meta keys must look like identifiers: a letter or underscore followed by
// letters, digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;

/**
 * Tell whether `key` is an acceptable structured-meta key.
 *
 * @param {string} key
 * @returns {boolean}
 */
function validateMetaKey(key) {
  const isValid = VALID_KEY_RE.test(key);
  return isValid;
}
|
|
18079
|
+
/**
 * Convert the raw text of a structured-meta entry into a typed value.
 *
 * - "number" / "date": a finite number when `content` is numeric, otherwise
 *   `content` is returned unchanged. Blank strings are NOT treated as numeric:
 *   `Number("")` is 0, which would otherwise turn an empty value into 0 (or
 *   the Unix epoch for dates).
 * - "boolean": `true` only for the exact string "true".
 * - "string[]": comma-separated items, trimmed, with empty items dropped
 *   (so "a,,b" and a trailing comma do not produce "" entries).
 * - anything else: `content` unchanged.
 *
 * @param {string} content Raw attribute text.
 * @param {string} dataType One of "number", "boolean", "string[]", "date", or other.
 * @returns {number | boolean | string[] | string}
 */
function parseMetaValue(content, dataType) {
  const toFiniteNumber = (raw) => {
    if (typeof raw === "string" && raw.trim() === "") return raw;
    const n = Number(raw);
    return Number.isFinite(n) ? n : raw;
  };
  switch (dataType) {
    case "number":
    case "date":
      // Dates are carried as numeric timestamps; parsing is the same as for numbers.
      return toFiniteNumber(content);
    case "boolean":
      return content === "true";
    case "string[]":
      return content ? content.split(",").map((s) => s.trim()).filter((s) => s.length > 0) : [];
    default:
      return content;
  }
}
|
|
18097
|
+
/**
 * Escape a string for use inside a single-quoted filter literal by doubling
 * every single quote.
 *
 * @param {string} s
 * @returns {string}
 */
function escapeFilterValue(s) {
  return s.split("'").join("''");
}
|
|
18100
|
+
/**
 * Build a filter expression over `meta.*` fields from a key/value map.
 *
 * String values become `meta.<key> CONTAINS '<escaped>'`; every other value
 * (numbers, booleans) becomes `meta.<key> = <value>`. Entries whose key fails
 * `validateMetaKey` are skipped. Clauses are joined with " AND "; an empty
 * string is returned when no clause remains.
 *
 * @param {Record<string, string | number | boolean>} filters
 * @returns {string}
 */
function buildMetaFilterString(filters) {
  return Object.entries(filters)
    .filter(([key]) => validateMetaKey(key))
    .map(([key, value]) => {
      const field = `meta.${key}`;
      return typeof value === "string"
        ? `${field} CONTAINS '${escapeFilterValue(value)}'`
        : `${field} = ${value}`;
    })
    .join(" AND ");
}
|
|
17612
18115
|
|
|
17613
18116
|
// src/search/engine.ts
|
|
18117
|
+
// Optional per-request overrides for ranking/search tuning knobs. Every field
// is optional; the whole object may be omitted.
var rankingOverridesSchema = (() => {
  const flag = () => z.boolean().optional();
  // A number constrained to the closed interval [0, 1].
  const ratio = () => z.number().min(0).max(1).optional();
  const weight = () => z.number().optional();
  return z.object({
    ranking: z.object({
      enableIncomingLinkBoost: flag(),
      enableDepthBoost: flag(),
      aggregationCap: z.number().int().positive().optional(),
      aggregationDecay: ratio(),
      minChunkScoreRatio: ratio(),
      minScoreRatio: ratio(),
      scoreGapThreshold: ratio(),
      weights: z.object({
        incomingLinks: weight(),
        depth: weight(),
        aggregation: weight(),
        titleMatch: weight()
      }).optional()
    }).optional(),
    search: z.object({
      pageSearchWeight: ratio()
    }).optional()
  }).optional();
})();
|
|
17614
18137
|
// Shape of an incoming search request. Only `q` is required.
var requestSchema = (() => {
  const optionalText = () => z.string().optional();
  // Meta filter values are limited to primitives.
  const filterValue = z.union([z.string(), z.number(), z.boolean()]);
  return z.object({
    // Query text: trimmed, must be non-empty.
    q: z.string().trim().min(1),
    // Maximum number of results, 1..100.
    topK: z.number().int().positive().max(100).optional(),
    scope: optionalText(),
    pathPrefix: optionalText(),
    tags: z.array(z.string()).optional(),
    filters: z.record(z.string(), filterValue).optional(),
    groupBy: z.enum(["page", "chunk"]).optional(),
    // Maximum number of sub-results per page, 1..20.
    maxSubResults: z.number().int().positive().max(20).optional(),
    debug: z.boolean().optional(),
    rankingOverrides: rankingOverridesSchema
  });
})();
|
|
17622
|
-
var
|
|
17623
|
-
|
|
17624
|
-
|
|
17625
|
-
|
|
17626
|
-
|
|
17627
|
-
|
|
17628
|
-
|
|
17629
|
-
|
|
18149
|
+
var MAX_SITE_STRUCTURE_PAGES = 2e3;
|
|
18150
|
+
/**
 * Create an empty site-structure tree node.
 *
 * @param {string} url Path of the node.
 * @param {number} depth Number of path segments from the root.
 * @returns {{url: string, title: string, depth: number, routeFile: string, isIndexed: boolean, childCount: number, children: Array}}
 *   A node marked as not indexed, with no title, route file or children.
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
|
|
18153
|
+
/**
 * Build a URL-path tree from a flat list of indexed pages.
 *
 * Every page contributes a node for its own path and placeholder nodes
 * (`isIndexed: false`) for each ancestor path that is not itself a page.
 * Children are sorted by URL and `childCount` is set to the number of direct
 * children.
 *
 * Nodes are keyed by the canonical form `"/" + segments.join("/")`. The page's
 * own node is looked up under that same canonical key, so a normalized path
 * with a trailing or duplicate slash still resolves to the node just created
 * instead of yielding `undefined`.
 *
 * @param {Array<{url: string, title: string, routeFile: string}>} pages
 * @param {string} [pathPrefix] When given, the subtree rooted at this path is
 *   returned; if no such node exists, an empty placeholder node for the prefix
 *   is returned.
 * @returns {object} The root node ("/"), or the requested subtree root.
 */
function buildTree(pages, pathPrefix) {
  const splitPath = (url) => url.split("/").filter(Boolean);
  const toKey = (segments) => "/" + segments.join("/");
  const nodeMap = /* @__PURE__ */ new Map();
  const root2 = makeNode("/", 0);
  nodeMap.set("/", root2);
  for (const page of pages) {
    const segments = splitPath(normalizeUrlPath(page.url));
    if (segments.length === 0) {
      root2.title = page.title;
      root2.routeFile = page.routeFile;
      root2.isIndexed = true;
      continue;
    }
    // Ensure a node exists for every ancestor path and for the page itself.
    for (let i = 1; i <= segments.length; i++) {
      const partialUrl = toKey(segments.slice(0, i));
      if (!nodeMap.has(partialUrl)) {
        nodeMap.set(partialUrl, makeNode(partialUrl, i));
      }
    }
    const node = nodeMap.get(toKey(segments));
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  }
  // Attach each node to its parent. Ancestors are always inserted before their
  // descendants above, so the parent lookup normally succeeds.
  for (const [url, node] of nodeMap) {
    if (url === "/") continue;
    const segments = splitPath(url);
    const parentUrl = segments.length === 1 ? "/" : toKey(segments.slice(0, -1));
    const parent = nodeMap.get(parentUrl) ?? root2;
    parent.children.push(node);
  }
  const sortAndCount = (node) => {
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      sortAndCount(child);
    }
  };
  sortAndCount(root2);
  if (pathPrefix) {
    const normalizedPrefix = normalizeUrlPath(pathPrefix);
    const prefixSegments = splitPath(normalizedPrefix);
    const subtreeRoot = nodeMap.get(toKey(prefixSegments));
    if (subtreeRoot) {
      return subtreeRoot;
    }
    return makeNode(normalizedPrefix, prefixSegments.length);
  }
  return root2;
}
|
|
18202
|
+
/**
 * Return a copy of `base` with per-request ranking/search overrides applied.
 *
 * Only `search`, `ranking` and `ranking.weights` are merged; all other config
 * sections are carried over by reference. `base` is not mutated.
 *
 * Override entries whose value is `undefined` are ignored. A plain object
 * spread would copy such keys and replace a configured value with
 * `undefined` (e.g. a weight, which then produces NaN scores). A nullish
 * `overrides` argument is treated as "no overrides".
 *
 * @param {{search: object, ranking: {weights: object}}} base Resolved config.
 * @param {{search?: object, ranking?: {weights?: object}}} [overrides]
 * @returns {object} New config object with overrides applied.
 */
function mergeRankingOverrides(base, overrides) {
  const definedOnly = (obj) => Object.fromEntries(
    Object.entries(obj ?? {}).filter(([, value]) => value !== undefined)
  );
  // Weights are merged separately so a partial `weights` override does not
  // replace the whole weights object.
  const { weights: weightOverrides, ...rankingOverrides } = overrides?.ranking ?? {};
  return {
    ...base,
    search: {
      ...base.search,
      ...definedOnly(overrides?.search)
    },
    ranking: {
      ...base.ranking,
      ...definedOnly(rankingOverrides),
      weights: {
        ...base.ranking.weights,
        ...definedOnly(weightOverrides)
      }
    }
  };
}
|
|
18219
|
+
var SearchEngine = class _SearchEngine {
|
|
18220
|
+
cwd;
|
|
18221
|
+
config;
|
|
18222
|
+
store;
|
|
18223
|
+
constructor(options) {
|
|
18224
|
+
this.cwd = options.cwd;
|
|
18225
|
+
this.config = options.config;
|
|
18226
|
+
this.store = options.store;
|
|
17630
18227
|
}
|
|
17631
18228
|
static async create(options = {}) {
|
|
17632
18229
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
@@ -17648,125 +18245,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
17648
18245
|
}
|
|
17649
18246
|
const input = parsed.data;
|
|
17650
18247
|
const totalStart = process.hrtime.bigint();
|
|
18248
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
17651
18249
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
17652
18250
|
const topK = input.topK ?? 10;
|
|
18251
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
17653
18252
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17654
|
-
const
|
|
17655
|
-
const
|
|
17656
|
-
|
|
17657
|
-
|
|
17658
|
-
|
|
17659
|
-
|
|
17660
|
-
|
|
17661
|
-
|
|
17662
|
-
|
|
18253
|
+
const queryText = input.q;
|
|
18254
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
18255
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
18256
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
18257
|
+
const metaFilter = metaFilterStr || void 0;
|
|
18258
|
+
const applyPagePostFilters = (hits) => {
|
|
18259
|
+
let filtered = hits;
|
|
18260
|
+
if (pathPrefix) {
|
|
18261
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
18262
|
+
}
|
|
18263
|
+
if (filterTags) {
|
|
18264
|
+
filtered = filtered.filter(
|
|
18265
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
18266
|
+
);
|
|
17663
18267
|
}
|
|
17664
|
-
|
|
17665
|
-
|
|
17666
|
-
const
|
|
18268
|
+
return filtered;
|
|
18269
|
+
};
|
|
18270
|
+
const applyChunkPostFilters = (hits) => {
|
|
18271
|
+
let filtered = hits;
|
|
18272
|
+
if (filterTags) {
|
|
18273
|
+
filtered = filtered.filter(
|
|
18274
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18275
|
+
);
|
|
18276
|
+
}
|
|
18277
|
+
return filtered;
|
|
18278
|
+
};
|
|
17667
18279
|
const searchStart = process.hrtime.bigint();
|
|
17668
|
-
|
|
17669
|
-
|
|
17670
|
-
const
|
|
17671
|
-
const
|
|
17672
|
-
|
|
17673
|
-
|
|
17674
|
-
|
|
17675
|
-
|
|
17676
|
-
|
|
17677
|
-
|
|
17678
|
-
|
|
17679
|
-
|
|
17680
|
-
|
|
17681
|
-
|
|
17682
|
-
|
|
17683
|
-
|
|
17684
|
-
|
|
17685
|
-
{
|
|
17686
|
-
limit: chunkLimit,
|
|
17687
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17688
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17689
|
-
reranking: false,
|
|
17690
|
-
filter
|
|
17691
|
-
},
|
|
18280
|
+
if (groupByPage) {
|
|
18281
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
18282
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
18283
|
+
const pageHits = await this.store.searchPagesByText(
|
|
18284
|
+
queryText,
|
|
18285
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
18286
|
+
resolvedScope
|
|
18287
|
+
);
|
|
18288
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
18289
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
18290
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
18291
|
+
const topPages = rankedPages.slice(0, topK);
|
|
18292
|
+
const chunkPromises = topPages.map(
|
|
18293
|
+
(page) => this.store.searchChunksByUrl(
|
|
18294
|
+
queryText,
|
|
18295
|
+
page.url,
|
|
18296
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
17692
18297
|
resolvedScope
|
|
17693
|
-
)
|
|
17694
|
-
|
|
17695
|
-
const
|
|
17696
|
-
|
|
18298
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
18299
|
+
);
|
|
18300
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
18301
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18302
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
18303
|
+
return {
|
|
18304
|
+
q: input.q,
|
|
18305
|
+
scope: resolvedScope.scopeName,
|
|
18306
|
+
results,
|
|
18307
|
+
meta: {
|
|
18308
|
+
timingsMs: {
|
|
18309
|
+
search: Math.round(searchMs),
|
|
18310
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18311
|
+
}
|
|
18312
|
+
}
|
|
18313
|
+
};
|
|
17697
18314
|
} else {
|
|
18315
|
+
const candidateK = Math.max(50, topK);
|
|
18316
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
17698
18317
|
const hits = await this.store.search(
|
|
17699
|
-
|
|
17700
|
-
{
|
|
17701
|
-
limit: candidateK,
|
|
17702
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17703
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17704
|
-
reranking: this.config.search.reranking,
|
|
17705
|
-
filter
|
|
17706
|
-
},
|
|
18318
|
+
queryText,
|
|
18319
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
17707
18320
|
resolvedScope
|
|
17708
18321
|
);
|
|
17709
|
-
|
|
17710
|
-
|
|
17711
|
-
|
|
17712
|
-
|
|
17713
|
-
|
|
17714
|
-
|
|
17715
|
-
|
|
17716
|
-
|
|
17717
|
-
|
|
17718
|
-
|
|
17719
|
-
|
|
17720
|
-
|
|
18322
|
+
let filtered = hits;
|
|
18323
|
+
if (pathPrefix) {
|
|
18324
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
18325
|
+
}
|
|
18326
|
+
if (filterTags) {
|
|
18327
|
+
filtered = filtered.filter(
|
|
18328
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18329
|
+
);
|
|
18330
|
+
}
|
|
18331
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
18332
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18333
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
18334
|
+
return {
|
|
18335
|
+
q: input.q,
|
|
18336
|
+
scope: resolvedScope.scopeName,
|
|
18337
|
+
results,
|
|
18338
|
+
meta: {
|
|
18339
|
+
timingsMs: {
|
|
18340
|
+
search: Math.round(searchMs),
|
|
18341
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18342
|
+
}
|
|
17721
18343
|
}
|
|
18344
|
+
};
|
|
18345
|
+
}
|
|
18346
|
+
}
|
|
18347
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
18348
|
+
return rankedPages.map((page, i) => {
|
|
18349
|
+
const chunks = allChunks[i] ?? [];
|
|
18350
|
+
const bestChunk = chunks[0];
|
|
18351
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
18352
|
+
const result = {
|
|
18353
|
+
url: page.url,
|
|
18354
|
+
title: page.title,
|
|
18355
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
18356
|
+
snippet,
|
|
18357
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
18358
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
18359
|
+
routeFile: page.routeFile,
|
|
18360
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
18361
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
18362
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
18363
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
18364
|
+
headingPath: c.metadata.headingPath,
|
|
18365
|
+
score: Number(c.score.toFixed(6))
|
|
18366
|
+
})) : void 0
|
|
18367
|
+
};
|
|
18368
|
+
if (debug && page.breakdown) {
|
|
18369
|
+
result.breakdown = {
|
|
18370
|
+
baseScore: page.breakdown.baseScore,
|
|
18371
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
18372
|
+
depthBoost: page.breakdown.depthBoost,
|
|
18373
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
18374
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
18375
|
+
anchorTextMatchBoost: 0
|
|
18376
|
+
};
|
|
17722
18377
|
}
|
|
17723
|
-
|
|
18378
|
+
return result;
|
|
18379
|
+
});
|
|
17724
18380
|
}
|
|
17725
|
-
ensureSnippet(hit) {
|
|
18381
|
+
ensureSnippet(hit, query) {
|
|
18382
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
18383
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
17726
18384
|
const snippet = hit.hit.metadata.snippet;
|
|
17727
18385
|
if (snippet && snippet.length >= 30) return snippet;
|
|
17728
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
17729
18386
|
if (chunkText) return toSnippet(chunkText);
|
|
17730
18387
|
return snippet || "";
|
|
17731
18388
|
}
|
|
17732
|
-
buildResults(ordered, topK, groupByPage,
|
|
18389
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
18390
|
+
const cfg = config ?? this.config;
|
|
17733
18391
|
if (groupByPage) {
|
|
17734
|
-
let pages = aggregateByPage(ordered,
|
|
17735
|
-
pages = trimByScoreGap(pages,
|
|
17736
|
-
const minRatio =
|
|
18392
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
18393
|
+
pages = trimByScoreGap(pages, cfg);
|
|
18394
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
17737
18395
|
return pages.slice(0, topK).map((page) => {
|
|
17738
18396
|
const bestScore = page.bestChunk.finalScore;
|
|
17739
18397
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
17740
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
17741
|
-
|
|
18398
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
18399
|
+
const result = {
|
|
17742
18400
|
url: page.url,
|
|
17743
18401
|
title: page.title,
|
|
17744
18402
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
17745
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
18403
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
18404
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
17746
18405
|
score: Number(page.pageScore.toFixed(6)),
|
|
17747
18406
|
routeFile: page.routeFile,
|
|
17748
|
-
chunks: meaningful.length
|
|
18407
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
17749
18408
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
17750
|
-
snippet: this.ensureSnippet(c),
|
|
18409
|
+
snippet: this.ensureSnippet(c, query),
|
|
18410
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
17751
18411
|
headingPath: c.hit.metadata.headingPath,
|
|
17752
18412
|
score: Number(c.finalScore.toFixed(6))
|
|
17753
18413
|
})) : void 0
|
|
17754
18414
|
};
|
|
18415
|
+
if (debug && page.bestChunk.breakdown) {
|
|
18416
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
18417
|
+
}
|
|
18418
|
+
return result;
|
|
17755
18419
|
});
|
|
17756
18420
|
} else {
|
|
17757
18421
|
let filtered = ordered;
|
|
17758
|
-
const
|
|
17759
|
-
if (
|
|
17760
|
-
|
|
17761
|
-
|
|
17762
|
-
|
|
17763
|
-
|
|
17764
|
-
|
|
17765
|
-
|
|
17766
|
-
|
|
17767
|
-
|
|
17768
|
-
|
|
17769
|
-
|
|
18422
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
18423
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
18424
|
+
const topScore = ordered[0].finalScore;
|
|
18425
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
18426
|
+
const threshold = topScore * minScoreRatio;
|
|
18427
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
18428
|
+
}
|
|
18429
|
+
}
|
|
18430
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
18431
|
+
const result = {
|
|
18432
|
+
url: hit.metadata.url,
|
|
18433
|
+
title: hit.metadata.title,
|
|
18434
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
18435
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
18436
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
18437
|
+
score: Number(finalScore.toFixed(6)),
|
|
18438
|
+
routeFile: hit.metadata.routeFile
|
|
18439
|
+
};
|
|
18440
|
+
if (debug && breakdown) {
|
|
18441
|
+
result.breakdown = breakdown;
|
|
18442
|
+
}
|
|
18443
|
+
return result;
|
|
18444
|
+
});
|
|
17770
18445
|
}
|
|
17771
18446
|
}
|
|
17772
18447
|
async getPage(pathOrUrl, scope) {
|
|
@@ -17792,6 +18467,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
17792
18467
|
markdown: page.markdown
|
|
17793
18468
|
};
|
|
17794
18469
|
}
|
|
18470
|
+
async listPages(opts) {
|
|
18471
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18472
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
18473
|
+
return this.store.listPages(resolvedScope, {
|
|
18474
|
+
cursor: opts?.cursor,
|
|
18475
|
+
limit: opts?.limit,
|
|
18476
|
+
pathPrefix
|
|
18477
|
+
});
|
|
18478
|
+
}
|
|
18479
|
+
async getSiteStructure(opts) {
|
|
18480
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
18481
|
+
const allPages = [];
|
|
18482
|
+
let cursor;
|
|
18483
|
+
let truncated = false;
|
|
18484
|
+
do {
|
|
18485
|
+
const result = await this.listPages({
|
|
18486
|
+
pathPrefix: opts?.pathPrefix,
|
|
18487
|
+
scope: opts?.scope,
|
|
18488
|
+
cursor,
|
|
18489
|
+
limit: 200
|
|
18490
|
+
});
|
|
18491
|
+
allPages.push(...result.pages);
|
|
18492
|
+
cursor = result.nextCursor;
|
|
18493
|
+
if (allPages.length >= maxPages) {
|
|
18494
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
18495
|
+
allPages.length = maxPages;
|
|
18496
|
+
break;
|
|
18497
|
+
}
|
|
18498
|
+
} while (cursor);
|
|
18499
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
18500
|
+
return {
|
|
18501
|
+
root: root2,
|
|
18502
|
+
totalPages: allPages.length,
|
|
18503
|
+
truncated
|
|
18504
|
+
};
|
|
18505
|
+
}
|
|
18506
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
18507
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18508
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
18509
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
18510
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
18511
|
+
if (!source) {
|
|
18512
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
18513
|
+
}
|
|
18514
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
18515
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
18516
|
+
source.vector,
|
|
18517
|
+
{ limit: 50 },
|
|
18518
|
+
resolvedScope
|
|
18519
|
+
);
|
|
18520
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
18521
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
18522
|
+
for (const hit of filteredHits) {
|
|
18523
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
18524
|
+
}
|
|
18525
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
18526
|
+
for (const hit of filteredHits) {
|
|
18527
|
+
candidateUrls.add(hit.url);
|
|
18528
|
+
}
|
|
18529
|
+
for (const url of sourceOutgoing) {
|
|
18530
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
18531
|
+
}
|
|
18532
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
18533
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
18534
|
+
);
|
|
18535
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
18536
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
18537
|
+
for (const hit of filteredHits) {
|
|
18538
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
18539
|
+
}
|
|
18540
|
+
for (const p of fetchedPages) {
|
|
18541
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
18542
|
+
}
|
|
18543
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
18544
|
+
if (semanticUrls.length > 0) {
|
|
18545
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
18546
|
+
for (const p of semanticPageData) {
|
|
18547
|
+
const existing = metaMap.get(p.url);
|
|
18548
|
+
if (existing) {
|
|
18549
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
18550
|
+
}
|
|
18551
|
+
}
|
|
18552
|
+
}
|
|
18553
|
+
const candidates = [];
|
|
18554
|
+
for (const url of candidateUrls) {
|
|
18555
|
+
const meta = metaMap.get(url);
|
|
18556
|
+
if (!meta) continue;
|
|
18557
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
18558
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
18559
|
+
const isLinked = isOutgoing || isIncoming;
|
|
18560
|
+
const dice = diceScore(urlPath, url);
|
|
18561
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
18562
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
18563
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
18564
|
+
candidates.push({
|
|
18565
|
+
url,
|
|
18566
|
+
title: meta.title,
|
|
18567
|
+
score: Number(score.toFixed(6)),
|
|
18568
|
+
relationshipType,
|
|
18569
|
+
routeFile: meta.routeFile
|
|
18570
|
+
});
|
|
18571
|
+
}
|
|
18572
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
18573
|
+
const results = candidates.slice(0, topK);
|
|
18574
|
+
return {
|
|
18575
|
+
sourceUrl: urlPath,
|
|
18576
|
+
scope: resolvedScope.scopeName,
|
|
18577
|
+
relatedPages: results
|
|
18578
|
+
};
|
|
18579
|
+
}
|
|
17795
18580
|
async health() {
|
|
17796
18581
|
return this.store.health();
|
|
17797
18582
|
}
|
|
@@ -17807,6 +18592,215 @@ var SearchEngine = class _SearchEngine {
|
|
|
17807
18592
|
}
|
|
17808
18593
|
};
|
|
17809
18594
|
|
|
18595
|
+
// src/mcp/server.ts
|
|
18596
|
+
function createServer(engine) {
|
|
18597
|
+
const server = new McpServer({
|
|
18598
|
+
name: "searchsocket-mcp",
|
|
18599
|
+
version: "0.1.0"
|
|
18600
|
+
});
|
|
18601
|
+
server.registerTool(
|
|
18602
|
+
"search",
|
|
18603
|
+
{
|
|
18604
|
+
description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
|
|
18605
|
+
inputSchema: {
|
|
18606
|
+
query: z.string().min(1),
|
|
18607
|
+
scope: z.string().optional(),
|
|
18608
|
+
topK: z.number().int().positive().max(100).optional(),
|
|
18609
|
+
pathPrefix: z.string().optional(),
|
|
18610
|
+
tags: z.array(z.string()).optional(),
|
|
18611
|
+
filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
|
|
18612
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
18613
|
+
maxSubResults: z.number().int().positive().max(20).optional()
|
|
18614
|
+
},
|
|
18615
|
+
outputSchema: {
|
|
18616
|
+
q: z.string(),
|
|
18617
|
+
scope: z.string(),
|
|
18618
|
+
results: z.array(z.object({
|
|
18619
|
+
url: z.string(),
|
|
18620
|
+
title: z.string(),
|
|
18621
|
+
sectionTitle: z.string().optional(),
|
|
18622
|
+
snippet: z.string(),
|
|
18623
|
+
score: z.number(),
|
|
18624
|
+
routeFile: z.string(),
|
|
18625
|
+
chunks: z.array(z.object({
|
|
18626
|
+
sectionTitle: z.string().optional(),
|
|
18627
|
+
snippet: z.string(),
|
|
18628
|
+
headingPath: z.array(z.string()),
|
|
18629
|
+
score: z.number()
|
|
18630
|
+
})).optional()
|
|
18631
|
+
})),
|
|
18632
|
+
meta: z.object({
|
|
18633
|
+
timingsMs: z.object({
|
|
18634
|
+
search: z.number(),
|
|
18635
|
+
total: z.number()
|
|
18636
|
+
})
|
|
18637
|
+
})
|
|
18638
|
+
}
|
|
18639
|
+
},
|
|
18640
|
+
async (input) => {
|
|
18641
|
+
const result = await engine.search({
|
|
18642
|
+
q: input.query,
|
|
18643
|
+
topK: input.topK,
|
|
18644
|
+
scope: input.scope,
|
|
18645
|
+
pathPrefix: input.pathPrefix,
|
|
18646
|
+
tags: input.tags,
|
|
18647
|
+
filters: input.filters,
|
|
18648
|
+
groupBy: input.groupBy,
|
|
18649
|
+
maxSubResults: input.maxSubResults
|
|
18650
|
+
});
|
|
18651
|
+
return {
|
|
18652
|
+
content: [
|
|
18653
|
+
{
|
|
18654
|
+
type: "text",
|
|
18655
|
+
text: JSON.stringify(result, null, 2)
|
|
18656
|
+
}
|
|
18657
|
+
],
|
|
18658
|
+
structuredContent: result
|
|
18659
|
+
};
|
|
18660
|
+
}
|
|
18661
|
+
);
|
|
18662
|
+
server.registerTool(
|
|
18663
|
+
"get_page",
|
|
18664
|
+
{
|
|
18665
|
+
description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
|
|
18666
|
+
inputSchema: {
|
|
18667
|
+
pathOrUrl: z.string().min(1),
|
|
18668
|
+
scope: z.string().optional()
|
|
18669
|
+
}
|
|
18670
|
+
},
|
|
18671
|
+
async (input) => {
|
|
18672
|
+
const page = await engine.getPage(input.pathOrUrl, input.scope);
|
|
18673
|
+
return {
|
|
18674
|
+
content: [
|
|
18675
|
+
{
|
|
18676
|
+
type: "text",
|
|
18677
|
+
text: JSON.stringify(page, null, 2)
|
|
18678
|
+
}
|
|
18679
|
+
]
|
|
18680
|
+
};
|
|
18681
|
+
}
|
|
18682
|
+
);
|
|
18683
|
+
server.registerTool(
|
|
18684
|
+
"list_pages",
|
|
18685
|
+
{
|
|
18686
|
+
description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
|
|
18687
|
+
inputSchema: {
|
|
18688
|
+
pathPrefix: z.string().optional(),
|
|
18689
|
+
cursor: z.string().optional(),
|
|
18690
|
+
limit: z.number().int().positive().max(200).optional(),
|
|
18691
|
+
scope: z.string().optional()
|
|
18692
|
+
}
|
|
18693
|
+
},
|
|
18694
|
+
async (input) => {
|
|
18695
|
+
const result = await engine.listPages({
|
|
18696
|
+
pathPrefix: input.pathPrefix,
|
|
18697
|
+
cursor: input.cursor,
|
|
18698
|
+
limit: input.limit,
|
|
18699
|
+
scope: input.scope
|
|
18700
|
+
});
|
|
18701
|
+
return {
|
|
18702
|
+
content: [
|
|
18703
|
+
{
|
|
18704
|
+
type: "text",
|
|
18705
|
+
text: JSON.stringify(result, null, 2)
|
|
18706
|
+
}
|
|
18707
|
+
]
|
|
18708
|
+
};
|
|
18709
|
+
}
|
|
18710
|
+
);
|
|
18711
|
+
server.registerTool(
|
|
18712
|
+
"get_site_structure",
|
|
18713
|
+
{
|
|
18714
|
+
description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
|
|
18715
|
+
inputSchema: {
|
|
18716
|
+
pathPrefix: z.string().optional(),
|
|
18717
|
+
scope: z.string().optional(),
|
|
18718
|
+
maxPages: z.number().int().positive().max(2e3).optional()
|
|
18719
|
+
}
|
|
18720
|
+
},
|
|
18721
|
+
async (input) => {
|
|
18722
|
+
const result = await engine.getSiteStructure({
|
|
18723
|
+
pathPrefix: input.pathPrefix,
|
|
18724
|
+
scope: input.scope,
|
|
18725
|
+
maxPages: input.maxPages
|
|
18726
|
+
});
|
|
18727
|
+
return {
|
|
18728
|
+
content: [
|
|
18729
|
+
{
|
|
18730
|
+
type: "text",
|
|
18731
|
+
text: JSON.stringify(result, null, 2)
|
|
18732
|
+
}
|
|
18733
|
+
]
|
|
18734
|
+
};
|
|
18735
|
+
}
|
|
18736
|
+
);
|
|
18737
|
+
server.registerTool(
|
|
18738
|
+
"find_source_file",
|
|
18739
|
+
{
|
|
18740
|
+
description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
|
|
18741
|
+
inputSchema: {
|
|
18742
|
+
query: z.string().min(1),
|
|
18743
|
+
scope: z.string().optional()
|
|
18744
|
+
}
|
|
18745
|
+
},
|
|
18746
|
+
async (input) => {
|
|
18747
|
+
const result = await engine.search({
|
|
18748
|
+
q: input.query,
|
|
18749
|
+
topK: 1,
|
|
18750
|
+
scope: input.scope
|
|
18751
|
+
});
|
|
18752
|
+
if (result.results.length === 0) {
|
|
18753
|
+
return {
|
|
18754
|
+
content: [
|
|
18755
|
+
{
|
|
18756
|
+
type: "text",
|
|
18757
|
+
text: JSON.stringify({
|
|
18758
|
+
error: "No matching content found for the given query."
|
|
18759
|
+
})
|
|
18760
|
+
}
|
|
18761
|
+
]
|
|
18762
|
+
};
|
|
18763
|
+
}
|
|
18764
|
+
const match = result.results[0];
|
|
18765
|
+
const { url, routeFile, sectionTitle, snippet } = match;
|
|
18766
|
+
return {
|
|
18767
|
+
content: [
|
|
18768
|
+
{
|
|
18769
|
+
type: "text",
|
|
18770
|
+
text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
|
|
18771
|
+
}
|
|
18772
|
+
]
|
|
18773
|
+
};
|
|
18774
|
+
}
|
|
18775
|
+
);
|
|
18776
|
+
server.registerTool(
|
|
18777
|
+
"get_related_pages",
|
|
18778
|
+
{
|
|
18779
|
+
description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
|
|
18780
|
+
inputSchema: {
|
|
18781
|
+
pathOrUrl: z.string().min(1),
|
|
18782
|
+
scope: z.string().optional(),
|
|
18783
|
+
topK: z.number().int().positive().max(25).optional()
|
|
18784
|
+
}
|
|
18785
|
+
},
|
|
18786
|
+
async (input) => {
|
|
18787
|
+
const result = await engine.getRelatedPages(input.pathOrUrl, {
|
|
18788
|
+
topK: input.topK,
|
|
18789
|
+
scope: input.scope
|
|
18790
|
+
});
|
|
18791
|
+
return {
|
|
18792
|
+
content: [
|
|
18793
|
+
{
|
|
18794
|
+
type: "text",
|
|
18795
|
+
text: JSON.stringify(result, null, 2)
|
|
18796
|
+
}
|
|
18797
|
+
]
|
|
18798
|
+
};
|
|
18799
|
+
}
|
|
18800
|
+
);
|
|
18801
|
+
return server;
|
|
18802
|
+
}
|
|
18803
|
+
|
|
17810
18804
|
// src/sveltekit/handle.ts
|
|
17811
18805
|
var InMemoryRateLimiter = class {
|
|
17812
18806
|
constructor(windowMs, max) {
|
|
@@ -17835,7 +18829,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17835
18829
|
let enginePromise = null;
|
|
17836
18830
|
let configPromise = null;
|
|
17837
18831
|
let apiPath = options.path;
|
|
18832
|
+
let llmsServePath = null;
|
|
18833
|
+
let serveMarkdownVariants = false;
|
|
18834
|
+
let mcpPath;
|
|
18835
|
+
let mcpApiKey;
|
|
18836
|
+
let mcpEnableJsonResponse = true;
|
|
17838
18837
|
let rateLimiter = null;
|
|
18838
|
+
let notConfigured = false;
|
|
17839
18839
|
const getConfig = async () => {
|
|
17840
18840
|
if (!configPromise) {
|
|
17841
18841
|
let configP;
|
|
@@ -17852,6 +18852,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17852
18852
|
}
|
|
17853
18853
|
configPromise = configP.then((config) => {
|
|
17854
18854
|
apiPath = apiPath ?? config.api.path;
|
|
18855
|
+
mcpPath = config.mcp.handle.path;
|
|
18856
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
18857
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
18858
|
+
if (config.llmsTxt.enable) {
|
|
18859
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
18860
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
18861
|
+
}
|
|
17855
18862
|
if (config.api.rateLimit && !isServerless()) {
|
|
17856
18863
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
17857
18864
|
}
|
|
@@ -17861,59 +18868,109 @@ function searchsocketHandle(options = {}) {
|
|
|
17861
18868
|
return configPromise;
|
|
17862
18869
|
};
|
|
17863
18870
|
const getEngine = async () => {
|
|
18871
|
+
if (notConfigured) {
|
|
18872
|
+
throw new SearchSocketError(
|
|
18873
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18874
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18875
|
+
503
|
|
18876
|
+
);
|
|
18877
|
+
}
|
|
17864
18878
|
if (!enginePromise) {
|
|
17865
18879
|
const config = await getConfig();
|
|
17866
18880
|
enginePromise = SearchEngine.create({
|
|
17867
18881
|
cwd: options.cwd,
|
|
17868
18882
|
config
|
|
18883
|
+
}).catch((error) => {
|
|
18884
|
+
enginePromise = null;
|
|
18885
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
18886
|
+
notConfigured = true;
|
|
18887
|
+
throw new SearchSocketError(
|
|
18888
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18889
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18890
|
+
503
|
|
18891
|
+
);
|
|
18892
|
+
}
|
|
18893
|
+
throw error;
|
|
17869
18894
|
});
|
|
17870
18895
|
}
|
|
17871
18896
|
return enginePromise;
|
|
17872
18897
|
};
|
|
17873
18898
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
17874
18899
|
return async ({ event, resolve }) => {
|
|
17875
|
-
if (apiPath && event.url.pathname !==
|
|
17876
|
-
|
|
18900
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
18901
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
18902
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18903
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18904
|
+
}
|
|
18905
|
+
if (mcpPath) {
|
|
18906
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
18907
|
+
return resolve(event);
|
|
18908
|
+
}
|
|
18909
|
+
} else {
|
|
18910
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
18911
|
+
await getConfig();
|
|
18912
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18913
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18914
|
+
}
|
|
18915
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
18916
|
+
return resolve(event);
|
|
18917
|
+
}
|
|
18918
|
+
} else {
|
|
18919
|
+
return resolve(event);
|
|
18920
|
+
}
|
|
18921
|
+
}
|
|
17877
18922
|
}
|
|
17878
18923
|
const config = await getConfig();
|
|
18924
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
18925
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18926
|
+
const filePath = path.resolve(cwd, config.llmsTxt.outputPath);
|
|
18927
|
+
try {
|
|
18928
|
+
const content = await fs9.readFile(filePath, "utf8");
|
|
18929
|
+
return new Response(content, {
|
|
18930
|
+
status: 200,
|
|
18931
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
18932
|
+
});
|
|
18933
|
+
} catch {
|
|
18934
|
+
return resolve(event);
|
|
18935
|
+
}
|
|
18936
|
+
}
|
|
18937
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
18938
|
+
let rawPath;
|
|
18939
|
+
try {
|
|
18940
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
18941
|
+
} catch {
|
|
18942
|
+
return resolve(event);
|
|
18943
|
+
}
|
|
18944
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
18945
|
+
try {
|
|
18946
|
+
const engine = await getEngine();
|
|
18947
|
+
const page = await engine.getPage(rawPath, scope);
|
|
18948
|
+
return new Response(page.markdown, {
|
|
18949
|
+
status: 200,
|
|
18950
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
18951
|
+
});
|
|
18952
|
+
} catch (error) {
|
|
18953
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
18954
|
+
return resolve(event);
|
|
18955
|
+
}
|
|
18956
|
+
throw error;
|
|
18957
|
+
}
|
|
18958
|
+
}
|
|
18959
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18960
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18961
|
+
}
|
|
17879
18962
|
const targetPath = apiPath ?? config.api.path;
|
|
17880
|
-
if (event.url.pathname
|
|
18963
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
17881
18964
|
return resolve(event);
|
|
17882
18965
|
}
|
|
17883
|
-
|
|
18966
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
18967
|
+
const method = event.request.method;
|
|
18968
|
+
if (method === "OPTIONS") {
|
|
17884
18969
|
return new Response(null, {
|
|
17885
18970
|
status: 204,
|
|
17886
18971
|
headers: buildCorsHeaders(event.request, config)
|
|
17887
18972
|
});
|
|
17888
18973
|
}
|
|
17889
|
-
if (event.request.method !== "POST") {
|
|
17890
|
-
return withCors(
|
|
17891
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
17892
|
-
status: 405,
|
|
17893
|
-
headers: {
|
|
17894
|
-
"content-type": "application/json"
|
|
17895
|
-
}
|
|
17896
|
-
}),
|
|
17897
|
-
event.request,
|
|
17898
|
-
config
|
|
17899
|
-
);
|
|
17900
|
-
}
|
|
17901
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
17902
|
-
if (contentLength > bodyLimit) {
|
|
17903
|
-
return withCors(
|
|
17904
|
-
new Response(
|
|
17905
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
17906
|
-
{
|
|
17907
|
-
status: 413,
|
|
17908
|
-
headers: {
|
|
17909
|
-
"content-type": "application/json"
|
|
17910
|
-
}
|
|
17911
|
-
}
|
|
17912
|
-
),
|
|
17913
|
-
event.request,
|
|
17914
|
-
config
|
|
17915
|
-
);
|
|
17916
|
-
}
|
|
17917
18974
|
if (rateLimiter) {
|
|
17918
18975
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
17919
18976
|
if (!rateLimiter.check(ip)) {
|
|
@@ -17933,39 +18990,32 @@ function searchsocketHandle(options = {}) {
|
|
|
17933
18990
|
}
|
|
17934
18991
|
}
|
|
17935
18992
|
try {
|
|
17936
|
-
|
|
17937
|
-
|
|
17938
|
-
|
|
17939
|
-
} else {
|
|
17940
|
-
let parsedFallback;
|
|
17941
|
-
try {
|
|
17942
|
-
parsedFallback = await event.request.json();
|
|
17943
|
-
} catch (error) {
|
|
17944
|
-
if (error instanceof SyntaxError) {
|
|
17945
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
17946
|
-
}
|
|
17947
|
-
throw error;
|
|
18993
|
+
if (method === "GET") {
|
|
18994
|
+
if (subPath === "" || subPath === "/") {
|
|
18995
|
+
return await handleGetSearch(event, config, getEngine);
|
|
17948
18996
|
}
|
|
17949
|
-
|
|
18997
|
+
if (subPath === "/health") {
|
|
18998
|
+
return await handleGetHealth(event, config, getEngine);
|
|
18999
|
+
}
|
|
19000
|
+
if (subPath.startsWith("/pages/")) {
|
|
19001
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
19002
|
+
}
|
|
19003
|
+
return withCors(
|
|
19004
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
19005
|
+
status: 404,
|
|
19006
|
+
headers: { "content-type": "application/json" }
|
|
19007
|
+
}),
|
|
19008
|
+
event.request,
|
|
19009
|
+
config
|
|
19010
|
+
);
|
|
17950
19011
|
}
|
|
17951
|
-
if (
|
|
17952
|
-
|
|
19012
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
19013
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
17953
19014
|
}
|
|
17954
|
-
let body;
|
|
17955
|
-
try {
|
|
17956
|
-
body = JSON.parse(rawBody);
|
|
17957
|
-
} catch {
|
|
17958
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
17959
|
-
}
|
|
17960
|
-
const engine = await getEngine();
|
|
17961
|
-
const searchRequest = body;
|
|
17962
|
-
const result = await engine.search(searchRequest);
|
|
17963
19015
|
return withCors(
|
|
17964
|
-
new Response(JSON.stringify(
|
|
17965
|
-
status:
|
|
17966
|
-
headers: {
|
|
17967
|
-
"content-type": "application/json"
|
|
17968
|
-
}
|
|
19016
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
19017
|
+
status: 405,
|
|
19018
|
+
headers: { "content-type": "application/json" }
|
|
17969
19019
|
}),
|
|
17970
19020
|
event.request,
|
|
17971
19021
|
config
|
|
@@ -17986,6 +19036,183 @@ function searchsocketHandle(options = {}) {
|
|
|
17986
19036
|
}
|
|
17987
19037
|
};
|
|
17988
19038
|
}
|
|
19039
|
+
function isApiPath(pathname, apiPath) {
|
|
19040
|
+
return pathname === apiPath || pathname.startsWith(apiPath + "/");
|
|
19041
|
+
}
|
|
19042
|
+
async function handleGetSearch(event, config, getEngine) {
|
|
19043
|
+
const params = event.url.searchParams;
|
|
19044
|
+
const q = params.get("q");
|
|
19045
|
+
if (!q || q.trim() === "") {
|
|
19046
|
+
throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
|
|
19047
|
+
}
|
|
19048
|
+
const searchRequest = { q };
|
|
19049
|
+
const topK = params.get("topK");
|
|
19050
|
+
if (topK !== null) {
|
|
19051
|
+
const parsed = Number.parseInt(topK, 10);
|
|
19052
|
+
if (Number.isNaN(parsed) || parsed < 1) {
|
|
19053
|
+
throw new SearchSocketError("INVALID_REQUEST", "topK must be a positive integer", 400);
|
|
19054
|
+
}
|
|
19055
|
+
searchRequest.topK = parsed;
|
|
19056
|
+
}
|
|
19057
|
+
const scope = params.get("scope");
|
|
19058
|
+
if (scope !== null) searchRequest.scope = scope;
|
|
19059
|
+
const pathPrefix = params.get("pathPrefix");
|
|
19060
|
+
if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;
|
|
19061
|
+
const groupBy = params.get("groupBy");
|
|
19062
|
+
if (groupBy) {
|
|
19063
|
+
if (groupBy !== "page" && groupBy !== "chunk") {
|
|
19064
|
+
throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
|
|
19065
|
+
}
|
|
19066
|
+
searchRequest.groupBy = groupBy;
|
|
19067
|
+
}
|
|
19068
|
+
const maxSubResults = params.get("maxSubResults");
|
|
19069
|
+
if (maxSubResults !== null) {
|
|
19070
|
+
const parsed = Number.parseInt(maxSubResults, 10);
|
|
19071
|
+
if (Number.isNaN(parsed) || parsed < 1 || parsed > 20) {
|
|
19072
|
+
throw new SearchSocketError("INVALID_REQUEST", "maxSubResults must be a positive integer between 1 and 20", 400);
|
|
19073
|
+
}
|
|
19074
|
+
searchRequest.maxSubResults = parsed;
|
|
19075
|
+
}
|
|
19076
|
+
const tags = params.getAll("tags");
|
|
19077
|
+
if (tags.length > 0) searchRequest.tags = tags;
|
|
19078
|
+
const engine = await getEngine();
|
|
19079
|
+
const result = await engine.search(searchRequest);
|
|
19080
|
+
return withCors(
|
|
19081
|
+
new Response(JSON.stringify(result), {
|
|
19082
|
+
status: 200,
|
|
19083
|
+
headers: { "content-type": "application/json" }
|
|
19084
|
+
}),
|
|
19085
|
+
event.request,
|
|
19086
|
+
config
|
|
19087
|
+
);
|
|
19088
|
+
}
|
|
19089
|
+
async function handleGetHealth(event, config, getEngine) {
|
|
19090
|
+
const engine = await getEngine();
|
|
19091
|
+
const result = await engine.health();
|
|
19092
|
+
return withCors(
|
|
19093
|
+
new Response(JSON.stringify(result), {
|
|
19094
|
+
status: 200,
|
|
19095
|
+
headers: { "content-type": "application/json" }
|
|
19096
|
+
}),
|
|
19097
|
+
event.request,
|
|
19098
|
+
config
|
|
19099
|
+
);
|
|
19100
|
+
}
|
|
19101
|
+
async function handleGetPage(event, config, getEngine, subPath) {
|
|
19102
|
+
const rawPath = subPath.slice("/pages".length);
|
|
19103
|
+
let pagePath;
|
|
19104
|
+
try {
|
|
19105
|
+
pagePath = decodeURIComponent(rawPath);
|
|
19106
|
+
} catch {
|
|
19107
|
+
throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
|
|
19108
|
+
}
|
|
19109
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
19110
|
+
const engine = await getEngine();
|
|
19111
|
+
const result = await engine.getPage(pagePath, scope);
|
|
19112
|
+
return withCors(
|
|
19113
|
+
new Response(JSON.stringify(result), {
|
|
19114
|
+
status: 200,
|
|
19115
|
+
headers: { "content-type": "application/json" }
|
|
19116
|
+
}),
|
|
19117
|
+
event.request,
|
|
19118
|
+
config
|
|
19119
|
+
);
|
|
19120
|
+
}
|
|
19121
|
+
async function handlePostSearch(event, config, getEngine, bodyLimit) {
|
|
19122
|
+
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
19123
|
+
if (contentLength > bodyLimit) {
|
|
19124
|
+
throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
|
|
19125
|
+
}
|
|
19126
|
+
let rawBody;
|
|
19127
|
+
if (typeof event.request.text === "function") {
|
|
19128
|
+
rawBody = await event.request.text();
|
|
19129
|
+
} else {
|
|
19130
|
+
let parsedFallback;
|
|
19131
|
+
try {
|
|
19132
|
+
parsedFallback = await event.request.json();
|
|
19133
|
+
} catch (error) {
|
|
19134
|
+
if (error instanceof SyntaxError) {
|
|
19135
|
+
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
19136
|
+
}
|
|
19137
|
+
throw error;
|
|
19138
|
+
}
|
|
19139
|
+
rawBody = JSON.stringify(parsedFallback);
|
|
19140
|
+
}
|
|
19141
|
+
if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
|
|
19142
|
+
throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
|
|
19143
|
+
}
|
|
19144
|
+
let body;
|
|
19145
|
+
try {
|
|
19146
|
+
body = JSON.parse(rawBody);
|
|
19147
|
+
} catch {
|
|
19148
|
+
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
19149
|
+
}
|
|
19150
|
+
const engine = await getEngine();
|
|
19151
|
+
const searchRequest = body;
|
|
19152
|
+
const result = await engine.search(searchRequest);
|
|
19153
|
+
return withCors(
|
|
19154
|
+
new Response(JSON.stringify(result), {
|
|
19155
|
+
status: 200,
|
|
19156
|
+
headers: { "content-type": "application/json" }
|
|
19157
|
+
}),
|
|
19158
|
+
event.request,
|
|
19159
|
+
config
|
|
19160
|
+
);
|
|
19161
|
+
}
|
|
19162
|
+
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
|
|
19163
|
+
if (apiKey) {
|
|
19164
|
+
const authHeader = event.request.headers.get("authorization") ?? "";
|
|
19165
|
+
const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
|
|
19166
|
+
const tokenBuf = Buffer.from(token);
|
|
19167
|
+
const keyBuf = Buffer.from(apiKey);
|
|
19168
|
+
if (tokenBuf.length !== keyBuf.length || !timingSafeEqual(tokenBuf, keyBuf)) {
|
|
19169
|
+
return new Response(
|
|
19170
|
+
JSON.stringify({
|
|
19171
|
+
jsonrpc: "2.0",
|
|
19172
|
+
error: { code: -32001, message: "Unauthorized" },
|
|
19173
|
+
id: null
|
|
19174
|
+
}),
|
|
19175
|
+
{ status: 401, headers: { "content-type": "application/json" } }
|
|
19176
|
+
);
|
|
19177
|
+
}
|
|
19178
|
+
}
|
|
19179
|
+
const transport = new WebStandardStreamableHTTPServerTransport({
|
|
19180
|
+
sessionIdGenerator: void 0,
|
|
19181
|
+
enableJsonResponse
|
|
19182
|
+
});
|
|
19183
|
+
let server;
|
|
19184
|
+
try {
|
|
19185
|
+
const engine = await getEngine();
|
|
19186
|
+
server = createServer(engine);
|
|
19187
|
+
await server.connect(transport);
|
|
19188
|
+
const response = await transport.handleRequest(event.request);
|
|
19189
|
+
if (enableJsonResponse) {
|
|
19190
|
+
await transport.close();
|
|
19191
|
+
await server.close();
|
|
19192
|
+
}
|
|
19193
|
+
return response;
|
|
19194
|
+
} catch (error) {
|
|
19195
|
+
try {
|
|
19196
|
+
await transport.close();
|
|
19197
|
+
} catch {
|
|
19198
|
+
}
|
|
19199
|
+
try {
|
|
19200
|
+
await server?.close();
|
|
19201
|
+
} catch {
|
|
19202
|
+
}
|
|
19203
|
+
return new Response(
|
|
19204
|
+
JSON.stringify({
|
|
19205
|
+
jsonrpc: "2.0",
|
|
19206
|
+
error: {
|
|
19207
|
+
code: -32603,
|
|
19208
|
+
message: error instanceof Error ? error.message : "Internal server error"
|
|
19209
|
+
},
|
|
19210
|
+
id: null
|
|
19211
|
+
}),
|
|
19212
|
+
{ status: 500, headers: { "content-type": "application/json" } }
|
|
19213
|
+
);
|
|
19214
|
+
}
|
|
19215
|
+
}
|
|
17989
19216
|
function buildCorsHeaders(request, config) {
|
|
17990
19217
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
17991
19218
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -17998,7 +19225,7 @@ function buildCorsHeaders(request, config) {
|
|
|
17998
19225
|
}
|
|
17999
19226
|
return {
|
|
18000
19227
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
18001
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
19228
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
18002
19229
|
"access-control-allow-headers": "content-type"
|
|
18003
19230
|
};
|
|
18004
19231
|
}
|
|
@@ -18045,6 +19272,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18045
19272
|
if (normalizeText(current.text)) {
|
|
18046
19273
|
sections.push({
|
|
18047
19274
|
sectionTitle: current.sectionTitle,
|
|
19275
|
+
headingLevel: current.headingLevel,
|
|
18048
19276
|
headingPath: current.headingPath,
|
|
18049
19277
|
text: current.text.trim()
|
|
18050
19278
|
});
|
|
@@ -18063,6 +19291,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18063
19291
|
headingStack.length = level;
|
|
18064
19292
|
current = {
|
|
18065
19293
|
sectionTitle: title,
|
|
19294
|
+
headingLevel: level,
|
|
18066
19295
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
18067
19296
|
text: `${line}
|
|
18068
19297
|
`
|
|
@@ -18198,6 +19427,7 @@ function splitSection(section, config) {
|
|
|
18198
19427
|
return [
|
|
18199
19428
|
{
|
|
18200
19429
|
sectionTitle: section.sectionTitle,
|
|
19430
|
+
headingLevel: section.headingLevel,
|
|
18201
19431
|
headingPath: section.headingPath,
|
|
18202
19432
|
chunkText: text
|
|
18203
19433
|
}
|
|
@@ -18248,6 +19478,7 @@ ${chunk}`;
|
|
|
18248
19478
|
}
|
|
18249
19479
|
return merged.map((chunkText) => ({
|
|
18250
19480
|
sectionTitle: section.sectionTitle,
|
|
19481
|
+
headingLevel: section.headingLevel,
|
|
18251
19482
|
headingPath: section.headingPath,
|
|
18252
19483
|
chunkText
|
|
18253
19484
|
}));
|
|
@@ -18263,6 +19494,18 @@ function buildSummaryChunkText(page) {
|
|
|
18263
19494
|
}
|
|
18264
19495
|
return parts.join("\n\n");
|
|
18265
19496
|
}
|
|
19497
|
+
function buildEmbeddingTitle(chunk) {
|
|
19498
|
+
if (!chunk.sectionTitle || chunk.headingLevel === void 0) return void 0;
|
|
19499
|
+
if (chunk.headingPath.length > 1) {
|
|
19500
|
+
const path14 = chunk.headingPath.join(" > ");
|
|
19501
|
+
const lastInPath = chunk.headingPath[chunk.headingPath.length - 1];
|
|
19502
|
+
if (lastInPath !== chunk.sectionTitle) {
|
|
19503
|
+
return `${chunk.title} \u2014 ${path14} > ${chunk.sectionTitle}`;
|
|
19504
|
+
}
|
|
19505
|
+
return `${chunk.title} \u2014 ${path14}`;
|
|
19506
|
+
}
|
|
19507
|
+
return `${chunk.title} \u2014 ${chunk.sectionTitle}`;
|
|
19508
|
+
}
|
|
18266
19509
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
18267
19510
|
if (!prependTitle) return chunk.chunkText;
|
|
18268
19511
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -18293,10 +19536,14 @@ function chunkPage(page, config, scope) {
|
|
|
18293
19536
|
tags: page.tags,
|
|
18294
19537
|
contentHash: "",
|
|
18295
19538
|
description: page.description,
|
|
18296
|
-
keywords: page.keywords
|
|
19539
|
+
keywords: page.keywords,
|
|
19540
|
+
publishedAt: page.publishedAt,
|
|
19541
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19542
|
+
meta: page.meta
|
|
18297
19543
|
};
|
|
18298
19544
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18299
|
-
|
|
19545
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19546
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
18300
19547
|
chunks.push(summaryChunk);
|
|
18301
19548
|
}
|
|
18302
19549
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -18313,6 +19560,7 @@ function chunkPage(page, config, scope) {
|
|
|
18313
19560
|
path: page.url,
|
|
18314
19561
|
title: page.title,
|
|
18315
19562
|
sectionTitle: entry.sectionTitle,
|
|
19563
|
+
headingLevel: entry.headingLevel,
|
|
18316
19564
|
headingPath: entry.headingPath,
|
|
18317
19565
|
chunkText: entry.chunkText,
|
|
18318
19566
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -18322,10 +19570,16 @@ function chunkPage(page, config, scope) {
|
|
|
18322
19570
|
tags: page.tags,
|
|
18323
19571
|
contentHash: "",
|
|
18324
19572
|
description: page.description,
|
|
18325
|
-
keywords: page.keywords
|
|
19573
|
+
keywords: page.keywords,
|
|
19574
|
+
publishedAt: page.publishedAt,
|
|
19575
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19576
|
+
meta: page.meta
|
|
18326
19577
|
};
|
|
18327
19578
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18328
|
-
|
|
19579
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
19580
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19581
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
19582
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
18329
19583
|
chunks.push(chunk);
|
|
18330
19584
|
}
|
|
18331
19585
|
return chunks;
|
|
@@ -19158,6 +20412,69 @@ function gfm(turndownService) {
|
|
|
19158
20412
|
}
|
|
19159
20413
|
|
|
19160
20414
|
// src/indexing/extractor.ts
|
|
20415
|
+
function normalizeDateToMs(value) {
|
|
20416
|
+
if (value == null) return void 0;
|
|
20417
|
+
if (value instanceof Date) {
|
|
20418
|
+
const ts = value.getTime();
|
|
20419
|
+
return Number.isFinite(ts) ? ts : void 0;
|
|
20420
|
+
}
|
|
20421
|
+
if (typeof value === "string") {
|
|
20422
|
+
const ts = new Date(value).getTime();
|
|
20423
|
+
return Number.isFinite(ts) ? ts : void 0;
|
|
20424
|
+
}
|
|
20425
|
+
if (typeof value === "number") {
|
|
20426
|
+
return Number.isFinite(value) ? value : void 0;
|
|
20427
|
+
}
|
|
20428
|
+
return void 0;
|
|
20429
|
+
}
|
|
20430
|
+
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];
|
|
20431
|
+
function extractPublishedAtFromFrontmatter(data) {
|
|
20432
|
+
for (const field of FRONTMATTER_DATE_FIELDS) {
|
|
20433
|
+
const val = normalizeDateToMs(data[field]);
|
|
20434
|
+
if (val !== void 0) return val;
|
|
20435
|
+
}
|
|
20436
|
+
return void 0;
|
|
20437
|
+
}
|
|
20438
|
+
function extractPublishedAtFromHtml($) {
|
|
20439
|
+
const jsonLdScripts = $('script[type="application/ld+json"]');
|
|
20440
|
+
for (let i = 0; i < jsonLdScripts.length; i++) {
|
|
20441
|
+
try {
|
|
20442
|
+
const raw = $(jsonLdScripts[i]).html();
|
|
20443
|
+
if (!raw) continue;
|
|
20444
|
+
const parsed = JSON.parse(raw);
|
|
20445
|
+
const candidates = [];
|
|
20446
|
+
if (Array.isArray(parsed)) {
|
|
20447
|
+
candidates.push(...parsed);
|
|
20448
|
+
} else if (parsed && typeof parsed === "object") {
|
|
20449
|
+
candidates.push(parsed);
|
|
20450
|
+
if (Array.isArray(parsed["@graph"])) {
|
|
20451
|
+
candidates.push(...parsed["@graph"]);
|
|
20452
|
+
}
|
|
20453
|
+
}
|
|
20454
|
+
for (const candidate of candidates) {
|
|
20455
|
+
const val = normalizeDateToMs(candidate.datePublished);
|
|
20456
|
+
if (val !== void 0) return val;
|
|
20457
|
+
}
|
|
20458
|
+
} catch {
|
|
20459
|
+
}
|
|
20460
|
+
}
|
|
20461
|
+
const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
|
|
20462
|
+
if (ogTime) {
|
|
20463
|
+
const val = normalizeDateToMs(ogTime);
|
|
20464
|
+
if (val !== void 0) return val;
|
|
20465
|
+
}
|
|
20466
|
+
const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
|
|
20467
|
+
if (itempropDate) {
|
|
20468
|
+
const val = normalizeDateToMs(itempropDate);
|
|
20469
|
+
if (val !== void 0) return val;
|
|
20470
|
+
}
|
|
20471
|
+
const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
|
|
20472
|
+
if (timeEl) {
|
|
20473
|
+
const val = normalizeDateToMs(timeEl);
|
|
20474
|
+
if (val !== void 0) return val;
|
|
20475
|
+
}
|
|
20476
|
+
return void 0;
|
|
20477
|
+
}
|
|
19161
20478
|
function hasTopLevelNoindexComment(markdown) {
|
|
19162
20479
|
const lines = markdown.split(/\r?\n/);
|
|
19163
20480
|
let inFence = false;
|
|
@@ -19173,6 +20490,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
19173
20490
|
}
|
|
19174
20491
|
return false;
|
|
19175
20492
|
}
|
|
20493
|
+
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
|
|
20494
|
+
"image",
|
|
20495
|
+
"photo",
|
|
20496
|
+
"picture",
|
|
20497
|
+
"icon",
|
|
20498
|
+
"logo",
|
|
20499
|
+
"banner",
|
|
20500
|
+
"screenshot",
|
|
20501
|
+
"thumbnail",
|
|
20502
|
+
"img",
|
|
20503
|
+
"graphic",
|
|
20504
|
+
"illustration",
|
|
20505
|
+
"spacer",
|
|
20506
|
+
"pixel",
|
|
20507
|
+
"placeholder",
|
|
20508
|
+
"avatar",
|
|
20509
|
+
"background"
|
|
20510
|
+
]);
|
|
20511
|
+
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
|
|
20512
|
+
function isMeaningfulAlt(alt) {
|
|
20513
|
+
const trimmed = alt.trim();
|
|
20514
|
+
if (!trimmed || trimmed.length < 5) return false;
|
|
20515
|
+
if (IMAGE_EXT_RE.test(trimmed)) return false;
|
|
20516
|
+
if (GARBAGE_ALT_WORDS.has(trimmed.toLowerCase())) return false;
|
|
20517
|
+
return true;
|
|
20518
|
+
}
|
|
20519
|
+
function resolveImageText(img, $, imageDescAttr) {
|
|
20520
|
+
const imgDesc = img.attr(imageDescAttr)?.trim();
|
|
20521
|
+
if (imgDesc) return imgDesc;
|
|
20522
|
+
const figure = img.closest("figure");
|
|
20523
|
+
if (figure.length) {
|
|
20524
|
+
const figDesc = figure.attr(imageDescAttr)?.trim();
|
|
20525
|
+
if (figDesc) return figDesc;
|
|
20526
|
+
}
|
|
20527
|
+
const alt = img.attr("alt")?.trim() ?? "";
|
|
20528
|
+
const caption = figure.length ? figure.find("figcaption").first().text().trim() : "";
|
|
20529
|
+
if (isMeaningfulAlt(alt) && caption) {
|
|
20530
|
+
return `${alt} \u2014 ${caption}`;
|
|
20531
|
+
}
|
|
20532
|
+
if (isMeaningfulAlt(alt)) {
|
|
20533
|
+
return alt;
|
|
20534
|
+
}
|
|
20535
|
+
if (caption) {
|
|
20536
|
+
return caption;
|
|
20537
|
+
}
|
|
20538
|
+
return null;
|
|
20539
|
+
}
|
|
20540
|
+
var STOP_ANCHORS = /* @__PURE__ */ new Set([
|
|
20541
|
+
"here",
|
|
20542
|
+
"click",
|
|
20543
|
+
"click here",
|
|
20544
|
+
"read more",
|
|
20545
|
+
"link",
|
|
20546
|
+
"this",
|
|
20547
|
+
"more"
|
|
20548
|
+
]);
|
|
20549
|
+
function normalizeAnchorText(raw) {
|
|
20550
|
+
const normalized = raw.replace(/\s+/g, " ").trim().toLowerCase();
|
|
20551
|
+
if (normalized.length < 3) return "";
|
|
20552
|
+
if (STOP_ANCHORS.has(normalized)) return "";
|
|
20553
|
+
if (normalized.length > 100) return normalized.slice(0, 100);
|
|
20554
|
+
return normalized;
|
|
20555
|
+
}
|
|
20556
|
+
function escapeHtml(text) {
|
|
20557
|
+
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
|
|
20558
|
+
}
|
|
20559
|
+
function preprocessImages(root2, $, imageDescAttr) {
|
|
20560
|
+
root2.find("picture").each((_i, el) => {
|
|
20561
|
+
const picture = $(el);
|
|
20562
|
+
const img = picture.find("img").first();
|
|
20563
|
+
const parentFigure = picture.closest("figure");
|
|
20564
|
+
const text = img.length ? resolveImageText(img, $, imageDescAttr) : null;
|
|
20565
|
+
if (text) {
|
|
20566
|
+
if (parentFigure.length) parentFigure.find("figcaption").remove();
|
|
20567
|
+
picture.replaceWith(`<span>${escapeHtml(text)}</span>`);
|
|
20568
|
+
} else {
|
|
20569
|
+
picture.remove();
|
|
20570
|
+
}
|
|
20571
|
+
});
|
|
20572
|
+
root2.find("img").each((_i, el) => {
|
|
20573
|
+
const img = $(el);
|
|
20574
|
+
const parentFigure = img.closest("figure");
|
|
20575
|
+
const text = resolveImageText(img, $, imageDescAttr);
|
|
20576
|
+
if (text) {
|
|
20577
|
+
if (parentFigure.length) parentFigure.find("figcaption").remove();
|
|
20578
|
+
img.replaceWith(`<span>${escapeHtml(text)}</span>`);
|
|
20579
|
+
} else {
|
|
20580
|
+
img.remove();
|
|
20581
|
+
}
|
|
20582
|
+
});
|
|
20583
|
+
}
|
|
19176
20584
|
function extractFromHtml(url, html, config) {
|
|
19177
20585
|
const $ = load(html);
|
|
19178
20586
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -19198,6 +20606,20 @@ function extractFromHtml(url, html, config) {
|
|
|
19198
20606
|
if (weight === 0) {
|
|
19199
20607
|
return null;
|
|
19200
20608
|
}
|
|
20609
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
20610
|
+
return null;
|
|
20611
|
+
}
|
|
20612
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
20613
|
+
const meta = {};
|
|
20614
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
20615
|
+
const name = $(el).attr("name") ?? "";
|
|
20616
|
+
const key = name.slice("searchsocket:".length);
|
|
20617
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
20618
|
+
const content = $(el).attr("content") ?? "";
|
|
20619
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
20620
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
20621
|
+
});
|
|
20622
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
19201
20623
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19202
20624
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19203
20625
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19209,7 +20631,9 @@ function extractFromHtml(url, html, config) {
|
|
|
19209
20631
|
root2.find(selector).remove();
|
|
19210
20632
|
}
|
|
19211
20633
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
20634
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
19212
20635
|
const outgoingLinks = [];
|
|
20636
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
19213
20637
|
root2.find("a[href]").each((_index, node) => {
|
|
19214
20638
|
const href = $(node).attr("href");
|
|
19215
20639
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -19220,7 +20644,19 @@ function extractFromHtml(url, html, config) {
|
|
|
19220
20644
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
19221
20645
|
return;
|
|
19222
20646
|
}
|
|
19223
|
-
|
|
20647
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
20648
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
20649
|
+
if (!anchorText) {
|
|
20650
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
20651
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
20652
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
20653
|
+
}
|
|
20654
|
+
}
|
|
20655
|
+
const key = `${url2}|${anchorText}`;
|
|
20656
|
+
if (!seenLinkKeys.has(key)) {
|
|
20657
|
+
seenLinkKeys.add(key);
|
|
20658
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
20659
|
+
}
|
|
19224
20660
|
} catch {
|
|
19225
20661
|
}
|
|
19226
20662
|
});
|
|
@@ -19245,16 +20681,25 @@ function extractFromHtml(url, html, config) {
|
|
|
19245
20681
|
return null;
|
|
19246
20682
|
}
|
|
19247
20683
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
20684
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
20685
|
+
if (componentTags) {
|
|
20686
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
20687
|
+
for (const t of extraTags) {
|
|
20688
|
+
if (!tags.includes(t)) tags.push(t);
|
|
20689
|
+
}
|
|
20690
|
+
}
|
|
19248
20691
|
return {
|
|
19249
20692
|
url: normalizeUrlPath(url),
|
|
19250
20693
|
title,
|
|
19251
20694
|
markdown,
|
|
19252
|
-
outgoingLinks
|
|
20695
|
+
outgoingLinks,
|
|
19253
20696
|
noindex: false,
|
|
19254
20697
|
tags,
|
|
19255
20698
|
description,
|
|
19256
20699
|
keywords,
|
|
19257
|
-
weight
|
|
20700
|
+
weight,
|
|
20701
|
+
publishedAt,
|
|
20702
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
19258
20703
|
};
|
|
19259
20704
|
}
|
|
19260
20705
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19275,6 +20720,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19275
20720
|
if (mdWeight === 0) {
|
|
19276
20721
|
return null;
|
|
19277
20722
|
}
|
|
20723
|
+
let mdMeta;
|
|
20724
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
20725
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
20726
|
+
const metaObj = {};
|
|
20727
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
20728
|
+
if (!validateMetaKey(key)) continue;
|
|
20729
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
20730
|
+
metaObj[key] = val;
|
|
20731
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
20732
|
+
metaObj[key] = val;
|
|
20733
|
+
} else if (val instanceof Date) {
|
|
20734
|
+
metaObj[key] = val.getTime();
|
|
20735
|
+
}
|
|
20736
|
+
}
|
|
20737
|
+
if (Object.keys(metaObj).length > 0) {
|
|
20738
|
+
mdMeta = metaObj;
|
|
20739
|
+
}
|
|
20740
|
+
}
|
|
19278
20741
|
const content = parsed.content;
|
|
19279
20742
|
const normalized = normalizeMarkdown(content);
|
|
19280
20743
|
if (!normalizeText(normalized)) {
|
|
@@ -19289,6 +20752,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19289
20752
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
19290
20753
|
}
|
|
19291
20754
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
20755
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
19292
20756
|
return {
|
|
19293
20757
|
url: normalizeUrlPath(url),
|
|
19294
20758
|
title: resolvedTitle,
|
|
@@ -19298,7 +20762,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19298
20762
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19299
20763
|
description: fmDescription,
|
|
19300
20764
|
keywords: fmKeywords,
|
|
19301
|
-
weight: mdWeight
|
|
20765
|
+
weight: mdWeight,
|
|
20766
|
+
publishedAt,
|
|
20767
|
+
meta: mdMeta
|
|
19302
20768
|
};
|
|
19303
20769
|
}
|
|
19304
20770
|
function segmentToRegex(segment) {
|
|
@@ -19461,7 +20927,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
19461
20927
|
const manifestPath = path.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
19462
20928
|
let content;
|
|
19463
20929
|
try {
|
|
19464
|
-
content = await
|
|
20930
|
+
content = await fs9.readFile(manifestPath, "utf8");
|
|
19465
20931
|
} catch {
|
|
19466
20932
|
throw new SearchSocketError(
|
|
19467
20933
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19772,6 +21238,125 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19772
21238
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19773
21239
|
return normalizeUrlPath(noExt || "/");
|
|
19774
21240
|
}
|
|
21241
|
+
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;
|
|
21242
|
+
function isSvelteComponentFile(filePath) {
|
|
21243
|
+
if (!filePath.endsWith(".svelte")) return false;
|
|
21244
|
+
return !ROUTE_FILE_RE.test(filePath);
|
|
21245
|
+
}
|
|
21246
|
+
function extractSvelteComponentMeta(source) {
|
|
21247
|
+
const componentMatch = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
|
|
21248
|
+
const description = componentMatch?.[1]?.trim() || void 0;
|
|
21249
|
+
const propsMatch = source.match(
|
|
21250
|
+
/let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
|
|
21251
|
+
);
|
|
21252
|
+
const props = [];
|
|
21253
|
+
if (propsMatch) {
|
|
21254
|
+
const destructureBlock = propsMatch[1];
|
|
21255
|
+
const typeAnnotation = propsMatch[2]?.trim();
|
|
21256
|
+
let resolvedTypeMap;
|
|
21257
|
+
if (typeAnnotation && /^[A-Z]\w*$/.test(typeAnnotation)) {
|
|
21258
|
+
resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
|
|
21259
|
+
} else if (typeAnnotation && typeAnnotation.startsWith("{")) {
|
|
21260
|
+
resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
|
|
21261
|
+
}
|
|
21262
|
+
const propEntries = splitDestructureBlock(destructureBlock);
|
|
21263
|
+
for (const entry of propEntries) {
|
|
21264
|
+
const trimmed = entry.trim();
|
|
21265
|
+
if (!trimmed || trimmed.startsWith("...")) continue;
|
|
21266
|
+
let propName;
|
|
21267
|
+
let defaultValue;
|
|
21268
|
+
const renameMatch = trimmed.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/);
|
|
21269
|
+
if (renameMatch) {
|
|
21270
|
+
propName = renameMatch[1];
|
|
21271
|
+
defaultValue = renameMatch[2]?.trim();
|
|
21272
|
+
} else {
|
|
21273
|
+
const defaultMatch = trimmed.match(/^(\w+)\s*=\s*([\s\S]+)$/);
|
|
21274
|
+
if (defaultMatch) {
|
|
21275
|
+
propName = defaultMatch[1];
|
|
21276
|
+
defaultValue = defaultMatch[2]?.trim();
|
|
21277
|
+
} else {
|
|
21278
|
+
propName = trimmed.match(/^(\w+)/)?.[1] ?? trimmed;
|
|
21279
|
+
}
|
|
21280
|
+
}
|
|
21281
|
+
const propType = resolvedTypeMap?.get(propName);
|
|
21282
|
+
props.push({
|
|
21283
|
+
name: propName,
|
|
21284
|
+
...propType ? { type: propType } : {},
|
|
21285
|
+
...defaultValue ? { default: defaultValue } : {}
|
|
21286
|
+
});
|
|
21287
|
+
}
|
|
21288
|
+
}
|
|
21289
|
+
return { description, props };
|
|
21290
|
+
}
|
|
21291
|
+
function splitDestructureBlock(block) {
|
|
21292
|
+
const entries = [];
|
|
21293
|
+
let depth = 0;
|
|
21294
|
+
let current = "";
|
|
21295
|
+
for (const ch of block) {
|
|
21296
|
+
if (ch === "{" || ch === "[" || ch === "(") {
|
|
21297
|
+
depth++;
|
|
21298
|
+
current += ch;
|
|
21299
|
+
} else if (ch === "}" || ch === "]" || ch === ")") {
|
|
21300
|
+
depth--;
|
|
21301
|
+
current += ch;
|
|
21302
|
+
} else if (ch === "," && depth === 0) {
|
|
21303
|
+
entries.push(current);
|
|
21304
|
+
current = "";
|
|
21305
|
+
} else {
|
|
21306
|
+
current += ch;
|
|
21307
|
+
}
|
|
21308
|
+
}
|
|
21309
|
+
if (current.trim()) entries.push(current);
|
|
21310
|
+
return entries;
|
|
21311
|
+
}
|
|
21312
|
+
function resolveTypeReference(source, typeName) {
|
|
21313
|
+
const startRe = new RegExp(`(?:interface\\s+${typeName}\\s*|type\\s+${typeName}\\s*=\\s*)\\{`);
|
|
21314
|
+
const startMatch = source.match(startRe);
|
|
21315
|
+
if (!startMatch || startMatch.index === void 0) return void 0;
|
|
21316
|
+
const bodyStart = startMatch.index + startMatch[0].length;
|
|
21317
|
+
let depth = 1;
|
|
21318
|
+
let i = bodyStart;
|
|
21319
|
+
while (i < source.length && depth > 0) {
|
|
21320
|
+
if (source[i] === "{") depth++;
|
|
21321
|
+
else if (source[i] === "}") depth--;
|
|
21322
|
+
i++;
|
|
21323
|
+
}
|
|
21324
|
+
if (depth !== 0) return void 0;
|
|
21325
|
+
const body = source.slice(bodyStart, i - 1);
|
|
21326
|
+
return parseTypeMembers(body);
|
|
21327
|
+
}
|
|
21328
|
+
function parseInlineTypeAnnotation(annotation) {
|
|
21329
|
+
const inner = annotation.replace(/^\{/, "").replace(/\}$/, "");
|
|
21330
|
+
return parseTypeMembers(inner);
|
|
21331
|
+
}
|
|
21332
|
+
function parseTypeMembers(body) {
|
|
21333
|
+
const map = /* @__PURE__ */ new Map();
|
|
21334
|
+
const members = body.split(/[;\n]/).map((m) => m.trim()).filter(Boolean);
|
|
21335
|
+
for (const member of members) {
|
|
21336
|
+
const memberMatch = member.match(/^(\w+)\??\s*:\s*(.+)$/);
|
|
21337
|
+
if (memberMatch) {
|
|
21338
|
+
map.set(memberMatch[1], memberMatch[2].replace(/,\s*$/, "").trim());
|
|
21339
|
+
}
|
|
21340
|
+
}
|
|
21341
|
+
return map;
|
|
21342
|
+
}
|
|
21343
|
+
function buildComponentMarkdown(componentName, meta) {
|
|
21344
|
+
if (!meta.description && meta.props.length === 0) return "";
|
|
21345
|
+
const parts = [`${componentName} component.`];
|
|
21346
|
+
if (meta.description) {
|
|
21347
|
+
parts.push(meta.description);
|
|
21348
|
+
}
|
|
21349
|
+
if (meta.props.length > 0) {
|
|
21350
|
+
const propEntries = meta.props.map((p) => {
|
|
21351
|
+
let entry = p.name;
|
|
21352
|
+
if (p.type) entry += ` (${p.type})`;
|
|
21353
|
+
if (p.default) entry += ` default: ${p.default}`;
|
|
21354
|
+
return entry;
|
|
21355
|
+
});
|
|
21356
|
+
parts.push(`Props: ${propEntries.join(", ")}.`);
|
|
21357
|
+
}
|
|
21358
|
+
return parts.join(" ");
|
|
21359
|
+
}
|
|
19775
21360
|
/**
 * Reduces Svelte source to its plain visible text.
 *
 * Steps, in order: drop `<script>` blocks, drop `<style>` blocks, replace every
 * remaining tag with a space, replace every `{...}` expression/block marker
 * with a space, collapse whitespace runs to a single space, and trim.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {string} Single-line text with markup and expressions removed.
 */
function normalizeSvelteToMarkdown(source) {
  const withoutScripts = source.replace(/<script[\s\S]*?<\/script>/g, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/g, "");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const withoutExpressions = withoutTags.replace(/\{[^}]+\}/g, " ");
  return withoutExpressions.replace(/\s+/g, " ").trim();
}
|
|
@@ -19790,13 +21375,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19790
21375
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19791
21376
|
const pages = [];
|
|
19792
21377
|
for (const filePath of selected) {
|
|
19793
|
-
const raw = await
|
|
19794
|
-
|
|
21378
|
+
const raw = await fs9.readFile(filePath, "utf8");
|
|
21379
|
+
let markdown;
|
|
21380
|
+
let tags;
|
|
21381
|
+
if (filePath.endsWith(".md")) {
|
|
21382
|
+
markdown = raw;
|
|
21383
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
21384
|
+
const componentName = path.basename(filePath, ".svelte");
|
|
21385
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
21386
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
21387
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
21388
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
21389
|
+
tags = ["component"];
|
|
21390
|
+
} else {
|
|
21391
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
21392
|
+
}
|
|
19795
21393
|
pages.push({
|
|
19796
21394
|
url: filePathToUrl(filePath, baseDir),
|
|
19797
21395
|
markdown,
|
|
19798
21396
|
sourcePath: path.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19799
|
-
outgoingLinks: []
|
|
21397
|
+
outgoingLinks: [],
|
|
21398
|
+
...tags ? { tags } : {}
|
|
19800
21399
|
});
|
|
19801
21400
|
}
|
|
19802
21401
|
return pages;
|
|
@@ -19926,7 +21525,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19926
21525
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19927
21526
|
const pages = [];
|
|
19928
21527
|
for (const filePath of selected) {
|
|
19929
|
-
const html = await
|
|
21528
|
+
const html = await fs9.readFile(filePath, "utf8");
|
|
19930
21529
|
pages.push({
|
|
19931
21530
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19932
21531
|
html,
|
|
@@ -19989,7 +21588,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
19989
21588
|
}
|
|
19990
21589
|
async function loadRobotsTxtFromDir(dir) {
|
|
19991
21590
|
try {
|
|
19992
|
-
const content = await
|
|
21591
|
+
const content = await fs9.readFile(path.join(dir, "robots.txt"), "utf8");
|
|
19993
21592
|
return parseRobotsTxt(content);
|
|
19994
21593
|
} catch {
|
|
19995
21594
|
return null;
|
|
@@ -20006,6 +21605,81 @@ async function fetchRobotsTxt(baseUrl) {
|
|
|
20006
21605
|
return null;
|
|
20007
21606
|
}
|
|
20008
21607
|
}
|
|
21608
|
+
/**
 * Resolves a page URL against the project's base URL.
 *
 * @param {string} pageUrl - Page URL or path.
 * @param {string | undefined} baseUrl - Base URL to resolve against.
 * @returns {string} The absolute `href` when `baseUrl` is truthy and the pair
 *   parses as a URL; otherwise `pageUrl` is returned unchanged.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  if (baseUrl) {
    try {
      return new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable base/page combination: fall back to the raw page URL below.
    }
  }
  return pageUrl;
}
|
|
21616
|
+
/**
 * Renders the `llms.txt` index: a heading, an optional block-quoted
 * description, and a "Pages" list of markdown links.
 *
 * The heading uses `config.llmsTxt.title`, falling back to `config.project.id`.
 * Pages whose URL is `/llms.txt` or `/llms-full.txt` are left out. Remaining
 * pages are ordered by ascending `depth`, then by descending `incomingLinks`.
 * Each link target is passed through `resolvePageUrl` with
 * `config.project.baseUrl`. The "Pages" section is omitted when no page
 * remains. The output always ends with a newline.
 *
 * @param {Array<object>} pages - Indexed pages (reads `url`, `title`,
 *   `description`, `depth`, `incomingLinks`).
 * @param {object} config - Resolved configuration (reads `llmsTxt` and `project`).
 * @returns {string} The file contents.
 */
function generateLlmsTxt(pages, config) {
  const heading = `# ${config.llmsTxt.title ?? config.project.id}`;
  const summary = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;

  const header = summary ? [heading, "", `> ${summary}`] : [heading];

  const ordered = pages
    .filter((candidate) => candidate.url !== "/llms.txt" && candidate.url !== "/llms-full.txt")
    .sort((left, right) =>
      left.depth !== right.depth
        ? left.depth - right.depth
        : right.incomingLinks - left.incomingLinks
    );

  const entries = ordered.map((page) => {
    const link = `- [${page.title}](${resolvePageUrl(page.url, baseUrl)})`;
    return page.description ? `${link}: ${page.description}` : link;
  });

  const pageSection = entries.length > 0 ? ["", "## Pages", "", ...entries] : [];

  // The trailing empty element yields the final newline after joining.
  return [...header, ...pageSection, ""].join("\n");
}
|
|
21645
|
+
/**
 * Renders the `llms-full.txt` document: a heading, an optional block-quoted
 * description, and then the full trimmed markdown of every page, each
 * introduced by a `---` rule and a linked `##` heading.
 *
 * The heading uses `config.llmsTxt.title`, falling back to `config.project.id`.
 * Pages whose URL is `/llms.txt` or `/llms-full.txt` are left out. Remaining
 * pages are ordered by ascending `depth`, then by descending `incomingLinks`.
 * Link targets are passed through `resolvePageUrl` with
 * `config.project.baseUrl`. The output always ends with a newline.
 *
 * @param {Array<object>} pages - Indexed pages (reads `url`, `title`,
 *   `markdown`, `depth`, `incomingLinks`).
 * @param {object} config - Resolved configuration (reads `llmsTxt` and `project`).
 * @returns {string} The file contents.
 */
function generateLlmsFullTxt(pages, config) {
  const heading = `# ${config.llmsTxt.title ?? config.project.id}`;
  const summary = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;

  const header = summary ? [heading, "", `> ${summary}`] : [heading];

  const ordered = pages
    .filter((candidate) => candidate.url !== "/llms.txt" && candidate.url !== "/llms-full.txt")
    .sort((left, right) =>
      left.depth !== right.depth
        ? left.depth - right.depth
        : right.incomingLinks - left.incomingLinks
    );

  const sections = ordered.flatMap((page) => {
    const target = resolvePageUrl(page.url, baseUrl);
    return ["", "---", "", `## [${page.title}](${target})`, "", page.markdown.trim()];
  });

  // The trailing empty element yields the final newline after joining.
  return [...header, ...sections, ""].join("\n");
}
|
|
21668
|
+
/**
 * Writes the generated `llms.txt` file and, when `config.llmsTxt.generateFull`
 * is set, its `-full` companion next to it.
 *
 * The output directory is created recursively before anything is written.
 *
 * Companion path: for an output path ending in `.txt` the suffix becomes
 * `-full.txt` (e.g. `llms.txt` -> `llms-full.txt`). For any other path,
 * `-full` is inserted before the extension (e.g. `llms.md` -> `llms-full.md`,
 * `llms` -> `llms-full`). Without this, a non-`.txt` output path would yield a
 * companion path identical to the index path, and the full document would
 * overwrite the index that was just written.
 *
 * @param {Array<object>} pages - Indexed pages to render.
 * @param {object} config - Resolved configuration (reads `llmsTxt.outputPath`
 *   and `llmsTxt.generateFull`).
 * @param {string} cwd - Directory `outputPath` is resolved against.
 * @param {{ info(message: string): void }} logger3 - Logger for progress messages.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path.resolve(cwd, config.llmsTxt.outputPath);
  const outputDir = path.dirname(outputPath);
  await fs9.mkdir(outputDir, { recursive: true });
  const content = generateLlmsTxt(pages, config);
  await fs9.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (!config.llmsTxt.generateFull) return;

  let fullPath;
  if (/\.txt$/.test(outputPath)) {
    fullPath = outputPath.replace(/\.txt$/, "-full.txt");
  } else {
    // No `.txt` suffix to rewrite: build a distinct sibling path so the full
    // document never lands on top of the index file.
    const { dir, name, ext } = path.parse(outputPath);
    fullPath = path.join(dir, `${name}-full${ext}`);
  }
  const fullContent = generateLlmsFullTxt(pages, config);
  await fs9.writeFile(fullPath, fullContent, "utf8");
  const relativeFull = path.relative(cwd, fullPath);
  logger3.info(`Generated llms-full.txt at ${relativeFull}`);
}
|
|
20009
21683
|
|
|
20010
21684
|
// src/indexing/pipeline.ts
|
|
20011
21685
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -20024,16 +21698,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
20024
21698
|
if (joined.length <= maxChars) return joined;
|
|
20025
21699
|
return joined.slice(0, maxChars).trim();
|
|
20026
21700
|
}
|
|
21701
|
+
/**
 * Computes a hash over the fields of an indexed page, used to tell whether a
 * stored page record is still current.
 *
 * The following fields are stringified, joined with `|`, and passed to
 * `sha256`: title, description, keywords (sorted), tags (sorted), markdown,
 * outgoingLinks, publishedAt, incomingAnchorText, outgoingLinkUrls (sorted),
 * and meta (JSON with top-level keys in sorted order). Missing optional
 * fields contribute an empty string.
 *
 * NOTE(review): the sorted key list is passed as the `JSON.stringify` array
 * replacer, which also acts as a property allow-list for nested objects.
 * Confirm that `meta` values are flat if nested changes must affect the hash.
 *
 * @param {object} page - Indexed page to fingerprint.
 * @returns {string} Result of `sha256` over the joined fields.
 */
function buildPageContentHash(page) {
  // Copies before sorting so the page's own arrays are never reordered.
  const sortedCsv = (values) => values.slice().sort().join(",");

  const keywordsField = sortedCsv(page.keywords ?? []);
  const tagsField = sortedCsv(page.tags);
  const linkUrlsField = sortedCsv(page.outgoingLinkUrls ?? []);
  const metaField = page.meta
    ? JSON.stringify(page.meta, Object.keys(page.meta).sort())
    : "";

  const fields = [
    page.title,
    page.description ?? "",
    keywordsField,
    tagsField,
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    linkUrlsField,
    metaField
  ];
  return sha256(fields.join("|"));
}
|
|
20027
21716
|
var IndexPipeline = class _IndexPipeline {
|
|
20028
21717
|
cwd;
|
|
20029
21718
|
config;
|
|
20030
21719
|
store;
|
|
20031
21720
|
logger;
|
|
21721
|
+
hooks;
|
|
20032
21722
|
constructor(options) {
|
|
20033
21723
|
this.cwd = options.cwd;
|
|
20034
21724
|
this.config = options.config;
|
|
20035
21725
|
this.store = options.store;
|
|
20036
21726
|
this.logger = options.logger;
|
|
21727
|
+
this.hooks = options.hooks;
|
|
20037
21728
|
}
|
|
20038
21729
|
static async create(options = {}) {
|
|
20039
21730
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
@@ -20043,7 +21734,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20043
21734
|
cwd,
|
|
20044
21735
|
config,
|
|
20045
21736
|
store,
|
|
20046
|
-
logger: options.logger ?? new Logger()
|
|
21737
|
+
logger: options.logger ?? new Logger(),
|
|
21738
|
+
hooks: options.hooks ?? {}
|
|
20047
21739
|
});
|
|
20048
21740
|
}
|
|
20049
21741
|
getConfig() {
|
|
@@ -20064,7 +21756,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20064
21756
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20065
21757
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
20066
21758
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20067
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
21759
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
20068
21760
|
if (options.force) {
|
|
20069
21761
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20070
21762
|
}
|
|
@@ -20073,8 +21765,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20073
21765
|
}
|
|
20074
21766
|
const manifestStart = stageStart();
|
|
20075
21767
|
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
21768
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
20076
21769
|
stageEnd("manifest", manifestStart);
|
|
20077
|
-
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
21770
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes, ${existingPageHashes.size} existing page hashes loaded`);
|
|
20078
21771
|
const sourceStart = stageStart();
|
|
20079
21772
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20080
21773
|
let sourcePages;
|
|
@@ -20151,11 +21844,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20151
21844
|
);
|
|
20152
21845
|
continue;
|
|
20153
21846
|
}
|
|
20154
|
-
|
|
21847
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
21848
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
21849
|
+
}
|
|
21850
|
+
let accepted;
|
|
21851
|
+
if (this.hooks.transformPage) {
|
|
21852
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21853
|
+
if (transformed === null) {
|
|
21854
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
21855
|
+
continue;
|
|
21856
|
+
}
|
|
21857
|
+
accepted = transformed;
|
|
21858
|
+
} else {
|
|
21859
|
+
accepted = extracted;
|
|
21860
|
+
}
|
|
21861
|
+
extractedPages.push(accepted);
|
|
20155
21862
|
this.logger.event("page_extracted", {
|
|
20156
|
-
url:
|
|
21863
|
+
url: accepted.url
|
|
20157
21864
|
});
|
|
20158
21865
|
}
|
|
21866
|
+
const customRecords = options.customRecords ?? [];
|
|
21867
|
+
if (customRecords.length > 0) {
|
|
21868
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
21869
|
+
for (const record of customRecords) {
|
|
21870
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
21871
|
+
const normalized = normalizeMarkdown(record.content);
|
|
21872
|
+
if (!normalized.trim()) {
|
|
21873
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
21874
|
+
continue;
|
|
21875
|
+
}
|
|
21876
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
21877
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
21878
|
+
const extracted = {
|
|
21879
|
+
url: normalizedUrl,
|
|
21880
|
+
title: record.title,
|
|
21881
|
+
markdown: normalized,
|
|
21882
|
+
outgoingLinks: [],
|
|
21883
|
+
noindex: false,
|
|
21884
|
+
tags,
|
|
21885
|
+
weight: record.weight
|
|
21886
|
+
};
|
|
21887
|
+
let accepted;
|
|
21888
|
+
if (this.hooks.transformPage) {
|
|
21889
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21890
|
+
if (transformed === null) {
|
|
21891
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
21892
|
+
continue;
|
|
21893
|
+
}
|
|
21894
|
+
accepted = transformed;
|
|
21895
|
+
} else {
|
|
21896
|
+
accepted = extracted;
|
|
21897
|
+
}
|
|
21898
|
+
extractedPages.push(accepted);
|
|
21899
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
21900
|
+
}
|
|
21901
|
+
}
|
|
20159
21902
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
20160
21903
|
const uniquePages = [];
|
|
20161
21904
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -20188,15 +21931,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20188
21931
|
const linkStart = stageStart();
|
|
20189
21932
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20190
21933
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
21934
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
20191
21935
|
for (const page of indexablePages) {
|
|
20192
21936
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20193
21937
|
}
|
|
20194
21938
|
for (const page of indexablePages) {
|
|
20195
|
-
|
|
21939
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
21940
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
21941
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
20196
21942
|
if (!pageSet.has(outgoing)) {
|
|
20197
21943
|
continue;
|
|
20198
21944
|
}
|
|
20199
|
-
|
|
21945
|
+
if (!seenForCount.has(outgoing)) {
|
|
21946
|
+
seenForCount.add(outgoing);
|
|
21947
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
21948
|
+
}
|
|
21949
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
21950
|
+
seenForAnchor.add(outgoing);
|
|
21951
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
21952
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
21953
|
+
}
|
|
21954
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
21955
|
+
}
|
|
20200
21956
|
}
|
|
20201
21957
|
}
|
|
20202
21958
|
stageEnd("links", linkStart);
|
|
@@ -20215,6 +21971,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20215
21971
|
});
|
|
20216
21972
|
}
|
|
20217
21973
|
}
|
|
21974
|
+
for (const record of customRecords) {
|
|
21975
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
21976
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
21977
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
21978
|
+
routeFile: "",
|
|
21979
|
+
routeResolution: "exact"
|
|
21980
|
+
});
|
|
21981
|
+
}
|
|
21982
|
+
}
|
|
20218
21983
|
for (const page of indexablePages) {
|
|
20219
21984
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20220
21985
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -20232,6 +21997,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20232
21997
|
} else {
|
|
20233
21998
|
routeExact += 1;
|
|
20234
21999
|
}
|
|
22000
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
22001
|
+
let incomingAnchorText;
|
|
22002
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
22003
|
+
let joined = "";
|
|
22004
|
+
for (const phrase of anchorSet) {
|
|
22005
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
22006
|
+
if (next2.length > 500) break;
|
|
22007
|
+
joined = next2;
|
|
22008
|
+
}
|
|
22009
|
+
incomingAnchorText = joined || void 0;
|
|
22010
|
+
}
|
|
20235
22011
|
const indexedPage = {
|
|
20236
22012
|
url: page.url,
|
|
20237
22013
|
title: page.title,
|
|
@@ -20241,40 +22017,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20241
22017
|
generatedAt: nowIso(),
|
|
20242
22018
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
20243
22019
|
outgoingLinks: page.outgoingLinks.length,
|
|
22020
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
20244
22021
|
depth: getUrlDepth(page.url),
|
|
20245
22022
|
tags: page.tags,
|
|
20246
22023
|
markdown: page.markdown,
|
|
20247
22024
|
description: page.description,
|
|
20248
|
-
keywords: page.keywords
|
|
22025
|
+
keywords: page.keywords,
|
|
22026
|
+
publishedAt: page.publishedAt,
|
|
22027
|
+
incomingAnchorText,
|
|
22028
|
+
meta: page.meta
|
|
20249
22029
|
};
|
|
20250
22030
|
pages.push(indexedPage);
|
|
20251
22031
|
this.logger.event("page_indexed", { url: page.url });
|
|
20252
22032
|
}
|
|
22033
|
+
const pageRecords = pages.map((p) => {
|
|
22034
|
+
const summary = buildPageSummary(p);
|
|
22035
|
+
return {
|
|
22036
|
+
url: p.url,
|
|
22037
|
+
title: p.title,
|
|
22038
|
+
markdown: p.markdown,
|
|
22039
|
+
projectId: scope.projectId,
|
|
22040
|
+
scopeName: scope.scopeName,
|
|
22041
|
+
routeFile: p.routeFile,
|
|
22042
|
+
routeResolution: p.routeResolution,
|
|
22043
|
+
incomingLinks: p.incomingLinks,
|
|
22044
|
+
outgoingLinks: p.outgoingLinks,
|
|
22045
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
22046
|
+
depth: p.depth,
|
|
22047
|
+
tags: p.tags,
|
|
22048
|
+
indexedAt: p.generatedAt,
|
|
22049
|
+
summary,
|
|
22050
|
+
description: p.description,
|
|
22051
|
+
keywords: p.keywords,
|
|
22052
|
+
contentHash: buildPageContentHash(p),
|
|
22053
|
+
publishedAt: p.publishedAt,
|
|
22054
|
+
meta: p.meta
|
|
22055
|
+
};
|
|
22056
|
+
});
|
|
22057
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
22058
|
+
const changedPages = pageRecords.filter(
|
|
22059
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
22060
|
+
);
|
|
22061
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
20253
22062
|
if (!options.dryRun) {
|
|
20254
|
-
|
|
20255
|
-
|
|
20256
|
-
|
|
20257
|
-
|
|
20258
|
-
|
|
20259
|
-
|
|
20260
|
-
|
|
20261
|
-
|
|
20262
|
-
|
|
20263
|
-
|
|
20264
|
-
|
|
20265
|
-
|
|
20266
|
-
|
|
20267
|
-
|
|
20268
|
-
|
|
20269
|
-
|
|
20270
|
-
|
|
20271
|
-
|
|
20272
|
-
|
|
20273
|
-
|
|
20274
|
-
|
|
20275
|
-
|
|
22063
|
+
if (options.force) {
|
|
22064
|
+
await this.store.deletePages(scope);
|
|
22065
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
22066
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
22067
|
+
id: r.url,
|
|
22068
|
+
data: r.summary ?? r.title,
|
|
22069
|
+
metadata: {
|
|
22070
|
+
title: r.title,
|
|
22071
|
+
url: r.url,
|
|
22072
|
+
description: r.description ?? "",
|
|
22073
|
+
keywords: r.keywords ?? [],
|
|
22074
|
+
summary: r.summary ?? "",
|
|
22075
|
+
tags: r.tags,
|
|
22076
|
+
markdown: r.markdown,
|
|
22077
|
+
routeFile: r.routeFile,
|
|
22078
|
+
routeResolution: r.routeResolution,
|
|
22079
|
+
incomingLinks: r.incomingLinks,
|
|
22080
|
+
outgoingLinks: r.outgoingLinks,
|
|
22081
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22082
|
+
depth: r.depth,
|
|
22083
|
+
indexedAt: r.indexedAt,
|
|
22084
|
+
contentHash: r.contentHash ?? "",
|
|
22085
|
+
publishedAt: r.publishedAt ?? null,
|
|
22086
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22087
|
+
}
|
|
22088
|
+
}));
|
|
22089
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22090
|
+
} else {
|
|
22091
|
+
if (changedPages.length > 0) {
|
|
22092
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
22093
|
+
const pageDocs = changedPages.map((r) => ({
|
|
22094
|
+
id: r.url,
|
|
22095
|
+
data: r.summary ?? r.title,
|
|
22096
|
+
metadata: {
|
|
22097
|
+
title: r.title,
|
|
22098
|
+
url: r.url,
|
|
22099
|
+
description: r.description ?? "",
|
|
22100
|
+
keywords: r.keywords ?? [],
|
|
22101
|
+
summary: r.summary ?? "",
|
|
22102
|
+
tags: r.tags,
|
|
22103
|
+
markdown: r.markdown,
|
|
22104
|
+
routeFile: r.routeFile,
|
|
22105
|
+
routeResolution: r.routeResolution,
|
|
22106
|
+
incomingLinks: r.incomingLinks,
|
|
22107
|
+
outgoingLinks: r.outgoingLinks,
|
|
22108
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22109
|
+
depth: r.depth,
|
|
22110
|
+
indexedAt: r.indexedAt,
|
|
22111
|
+
contentHash: r.contentHash ?? "",
|
|
22112
|
+
publishedAt: r.publishedAt ?? null,
|
|
22113
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22114
|
+
}
|
|
22115
|
+
}));
|
|
22116
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22117
|
+
}
|
|
22118
|
+
if (deletedPageUrls.length > 0) {
|
|
22119
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
22120
|
+
}
|
|
22121
|
+
}
|
|
20276
22122
|
}
|
|
22123
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
22124
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
20277
22125
|
stageEnd("pages", pagesStart);
|
|
22126
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
20278
22127
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
20279
22128
|
const chunkStart = stageStart();
|
|
20280
22129
|
this.logger.info("Chunking pages...");
|
|
@@ -20283,6 +22132,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20283
22132
|
if (typeof maxChunks === "number") {
|
|
20284
22133
|
chunks = chunks.slice(0, maxChunks);
|
|
20285
22134
|
}
|
|
22135
|
+
if (this.hooks.transformChunk) {
|
|
22136
|
+
const transformed = [];
|
|
22137
|
+
for (const chunk of chunks) {
|
|
22138
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
22139
|
+
if (result === null) {
|
|
22140
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
22141
|
+
continue;
|
|
22142
|
+
}
|
|
22143
|
+
transformed.push(result);
|
|
22144
|
+
}
|
|
22145
|
+
chunks = transformed;
|
|
22146
|
+
}
|
|
20286
22147
|
for (const chunk of chunks) {
|
|
20287
22148
|
this.logger.event("chunked", {
|
|
20288
22149
|
url: chunk.url,
|
|
@@ -20295,7 +22156,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20295
22156
|
for (const chunk of chunks) {
|
|
20296
22157
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
20297
22158
|
}
|
|
20298
|
-
|
|
22159
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
20299
22160
|
if (options.force) {
|
|
20300
22161
|
return true;
|
|
20301
22162
|
}
|
|
@@ -20309,36 +22170,43 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20309
22170
|
return existingHash !== chunk.contentHash;
|
|
20310
22171
|
});
|
|
20311
22172
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
22173
|
+
if (this.hooks.beforeIndex) {
|
|
22174
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
22175
|
+
}
|
|
20312
22176
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20313
22177
|
const upsertStart = stageStart();
|
|
20314
22178
|
let documentsUpserted = 0;
|
|
20315
22179
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20316
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
20317
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
22180
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
20318
22181
|
const docs = changedChunks.map((chunk) => {
|
|
20319
|
-
const
|
|
20320
|
-
|
|
20321
|
-
|
|
20322
|
-
|
|
20323
|
-
|
|
20324
|
-
|
|
20325
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
20326
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
22182
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
22183
|
+
if (embeddingText.length > 2e3) {
|
|
22184
|
+
this.logger.warn(
|
|
22185
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
22186
|
+
);
|
|
22187
|
+
}
|
|
20327
22188
|
return {
|
|
20328
22189
|
id: chunk.chunkKey,
|
|
20329
|
-
|
|
22190
|
+
data: embeddingText,
|
|
20330
22191
|
metadata: {
|
|
20331
|
-
|
|
20332
|
-
scopeName: scope.scopeName,
|
|
22192
|
+
url: chunk.url,
|
|
20333
22193
|
path: chunk.path,
|
|
22194
|
+
title: chunk.title,
|
|
22195
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
22196
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
20334
22197
|
snippet: chunk.snippet,
|
|
22198
|
+
chunkText: embeddingText,
|
|
22199
|
+
tags: chunk.tags,
|
|
20335
22200
|
ordinal: chunk.ordinal,
|
|
20336
22201
|
contentHash: chunk.contentHash,
|
|
20337
22202
|
depth: chunk.depth,
|
|
20338
22203
|
incomingLinks: chunk.incomingLinks,
|
|
20339
22204
|
routeFile: chunk.routeFile,
|
|
20340
22205
|
description: chunk.description ?? "",
|
|
20341
|
-
keywords:
|
|
22206
|
+
keywords: chunk.keywords ?? [],
|
|
22207
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
22208
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
22209
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
20342
22210
|
}
|
|
20343
22211
|
};
|
|
20344
22212
|
});
|
|
@@ -20356,9 +22224,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20356
22224
|
} else {
|
|
20357
22225
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
20358
22226
|
}
|
|
22227
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
22228
|
+
const llmsStart = stageStart();
|
|
22229
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
22230
|
+
stageEnd("llms_txt", llmsStart);
|
|
22231
|
+
}
|
|
20359
22232
|
this.logger.info("Done.");
|
|
20360
|
-
|
|
22233
|
+
const stats = {
|
|
20361
22234
|
pagesProcessed: pages.length,
|
|
22235
|
+
pagesChanged,
|
|
22236
|
+
pagesDeleted,
|
|
20362
22237
|
chunksTotal: chunks.length,
|
|
20363
22238
|
chunksChanged: changedChunks.length,
|
|
20364
22239
|
documentsUpserted,
|
|
@@ -20367,6 +22242,10 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20367
22242
|
routeBestEffort,
|
|
20368
22243
|
stageTimingsMs
|
|
20369
22244
|
};
|
|
22245
|
+
if (this.hooks.afterIndex) {
|
|
22246
|
+
await this.hooks.afterIndex(stats);
|
|
22247
|
+
}
|
|
22248
|
+
return stats;
|
|
20370
22249
|
}
|
|
20371
22250
|
};
|
|
20372
22251
|
|
|
@@ -20388,9 +22267,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20388
22267
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20389
22268
|
return true;
|
|
20390
22269
|
}
|
|
20391
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20392
|
-
return true;
|
|
20393
|
-
}
|
|
20394
22270
|
return false;
|
|
20395
22271
|
}
|
|
20396
22272
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20415,7 +22291,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20415
22291
|
const pipeline = await IndexPipeline.create({
|
|
20416
22292
|
cwd,
|
|
20417
22293
|
configPath: options.configPath,
|
|
20418
|
-
logger: logger3
|
|
22294
|
+
logger: logger3,
|
|
22295
|
+
hooks: options.hooks
|
|
20419
22296
|
});
|
|
20420
22297
|
const stats = await pipeline.run({
|
|
20421
22298
|
changedOnly: options.changedOnly ?? true,
|