searchsocket 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3335 -492
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2378 -475
- package/dist/index.d.cts +113 -40
- package/dist/index.d.ts +113 -40
- package/dist/index.js +2378 -475
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2430 -494
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2416 -480
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +28 -11
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/index.js
CHANGED
|
@@ -3,18 +3,20 @@ import path from 'path';
|
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import {
|
|
6
|
+
import { FusionAlgorithm, QueryMode } from '@upstash/vector';
|
|
7
|
+
import { timingSafeEqual, createHash } from 'crypto';
|
|
7
8
|
import { load } from 'cheerio';
|
|
8
9
|
import matter from 'gray-matter';
|
|
9
10
|
import fg from 'fast-glob';
|
|
10
11
|
import pLimit from 'p-limit';
|
|
11
|
-
import
|
|
12
|
+
import fs8 from 'fs/promises';
|
|
12
13
|
import net from 'net';
|
|
13
14
|
import { gunzipSync } from 'zlib';
|
|
14
15
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
15
16
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
16
17
|
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
|
|
17
18
|
import { createMcpExpressApp } from '@modelcontextprotocol/sdk/server/express.js';
|
|
19
|
+
import { WebStandardStreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js';
|
|
18
20
|
|
|
19
21
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
20
22
|
var __commonJS = (cb, mod) => function __require() {
|
|
@@ -5013,32 +5015,32 @@ var require_URL = __commonJS({
|
|
|
5013
5015
|
else
|
|
5014
5016
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5015
5017
|
}
|
|
5016
|
-
function remove_dot_segments(
|
|
5017
|
-
if (!
|
|
5018
|
+
function remove_dot_segments(path14) {
|
|
5019
|
+
if (!path14) return path14;
|
|
5018
5020
|
var output = "";
|
|
5019
|
-
while (
|
|
5020
|
-
if (
|
|
5021
|
-
|
|
5021
|
+
while (path14.length > 0) {
|
|
5022
|
+
if (path14 === "." || path14 === "..") {
|
|
5023
|
+
path14 = "";
|
|
5022
5024
|
break;
|
|
5023
5025
|
}
|
|
5024
|
-
var twochars =
|
|
5025
|
-
var threechars =
|
|
5026
|
-
var fourchars =
|
|
5026
|
+
var twochars = path14.substring(0, 2);
|
|
5027
|
+
var threechars = path14.substring(0, 3);
|
|
5028
|
+
var fourchars = path14.substring(0, 4);
|
|
5027
5029
|
if (threechars === "../") {
|
|
5028
|
-
|
|
5030
|
+
path14 = path14.substring(3);
|
|
5029
5031
|
} else if (twochars === "./") {
|
|
5030
|
-
|
|
5032
|
+
path14 = path14.substring(2);
|
|
5031
5033
|
} else if (threechars === "/./") {
|
|
5032
|
-
|
|
5033
|
-
} else if (twochars === "/." &&
|
|
5034
|
-
|
|
5035
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5036
|
-
|
|
5034
|
+
path14 = "/" + path14.substring(3);
|
|
5035
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5036
|
+
path14 = "/";
|
|
5037
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5038
|
+
path14 = "/" + path14.substring(4);
|
|
5037
5039
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5038
5040
|
} else {
|
|
5039
|
-
var segment =
|
|
5041
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5040
5042
|
output += segment;
|
|
5041
|
-
|
|
5043
|
+
path14 = path14.substring(segment.length);
|
|
5042
5044
|
}
|
|
5043
5045
|
}
|
|
5044
5046
|
return output;
|
|
@@ -16634,6 +16636,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16634
16636
|
dropSelectors: z.array(z.string()).optional(),
|
|
16635
16637
|
ignoreAttr: z.string().optional(),
|
|
16636
16638
|
noindexAttr: z.string().optional(),
|
|
16639
|
+
imageDescAttr: z.string().optional(),
|
|
16637
16640
|
respectRobotsNoindex: z.boolean().optional()
|
|
16638
16641
|
}).optional(),
|
|
16639
16642
|
transform: z.object({
|
|
@@ -16649,35 +16652,48 @@ var searchSocketConfigSchema = z.object({
|
|
|
16649
16652
|
headingPathDepth: z.number().int().positive().optional(),
|
|
16650
16653
|
dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
|
|
16651
16654
|
prependTitle: z.boolean().optional(),
|
|
16652
|
-
pageSummaryChunk: z.boolean().optional()
|
|
16655
|
+
pageSummaryChunk: z.boolean().optional(),
|
|
16656
|
+
weightHeadings: z.boolean().optional()
|
|
16653
16657
|
}).optional(),
|
|
16654
16658
|
upstash: z.object({
|
|
16655
16659
|
url: z.string().url().optional(),
|
|
16656
16660
|
token: z.string().min(1).optional(),
|
|
16657
16661
|
urlEnv: z.string().min(1).optional(),
|
|
16658
|
-
tokenEnv: z.string().min(1).optional()
|
|
16662
|
+
tokenEnv: z.string().min(1).optional(),
|
|
16663
|
+
namespaces: z.object({
|
|
16664
|
+
pages: z.string().min(1).optional(),
|
|
16665
|
+
chunks: z.string().min(1).optional()
|
|
16666
|
+
}).optional()
|
|
16667
|
+
}).optional(),
|
|
16668
|
+
embedding: z.object({
|
|
16669
|
+
model: z.string().optional(),
|
|
16670
|
+
dimensions: z.number().int().positive().optional(),
|
|
16671
|
+
taskType: z.string().optional(),
|
|
16672
|
+
batchSize: z.number().int().positive().optional()
|
|
16659
16673
|
}).optional(),
|
|
16660
16674
|
search: z.object({
|
|
16661
|
-
semanticWeight: z.number().min(0).max(1).optional(),
|
|
16662
|
-
inputEnrichment: z.boolean().optional(),
|
|
16663
|
-
reranking: z.boolean().optional(),
|
|
16664
16675
|
dualSearch: z.boolean().optional(),
|
|
16665
16676
|
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
16666
16677
|
}).optional(),
|
|
16667
16678
|
ranking: z.object({
|
|
16668
16679
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16669
16680
|
enableDepthBoost: z.boolean().optional(),
|
|
16681
|
+
enableFreshnessBoost: z.boolean().optional(),
|
|
16682
|
+
freshnessDecayRate: z.number().positive().optional(),
|
|
16683
|
+
enableAnchorTextBoost: z.boolean().optional(),
|
|
16670
16684
|
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16671
16685
|
aggregationCap: z.number().int().positive().optional(),
|
|
16672
16686
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16673
16687
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16674
|
-
|
|
16688
|
+
minScoreRatio: z.number().min(0).max(1).optional(),
|
|
16675
16689
|
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
16676
16690
|
weights: z.object({
|
|
16677
16691
|
incomingLinks: z.number().optional(),
|
|
16678
16692
|
depth: z.number().optional(),
|
|
16679
16693
|
aggregation: z.number().optional(),
|
|
16680
|
-
titleMatch: z.number().optional()
|
|
16694
|
+
titleMatch: z.number().optional(),
|
|
16695
|
+
freshness: z.number().optional(),
|
|
16696
|
+
anchorText: z.number().optional()
|
|
16681
16697
|
}).optional()
|
|
16682
16698
|
}).optional(),
|
|
16683
16699
|
api: z.object({
|
|
@@ -16692,12 +16708,28 @@ var searchSocketConfigSchema = z.object({
|
|
|
16692
16708
|
}).optional(),
|
|
16693
16709
|
mcp: z.object({
|
|
16694
16710
|
enable: z.boolean().optional(),
|
|
16711
|
+
access: z.enum(["public", "private"]).optional(),
|
|
16695
16712
|
transport: z.enum(["stdio", "http"]).optional(),
|
|
16696
16713
|
http: z.object({
|
|
16697
16714
|
port: z.number().int().positive().optional(),
|
|
16698
|
-
path: z.string().optional()
|
|
16715
|
+
path: z.string().optional(),
|
|
16716
|
+
apiKey: z.string().min(1).optional(),
|
|
16717
|
+
apiKeyEnv: z.string().min(1).optional()
|
|
16718
|
+
}).optional(),
|
|
16719
|
+
handle: z.object({
|
|
16720
|
+
path: z.string().optional(),
|
|
16721
|
+
apiKey: z.string().min(1).optional(),
|
|
16722
|
+
enableJsonResponse: z.boolean().optional()
|
|
16699
16723
|
}).optional()
|
|
16700
16724
|
}).optional(),
|
|
16725
|
+
llmsTxt: z.object({
|
|
16726
|
+
enable: z.boolean().optional(),
|
|
16727
|
+
outputPath: z.string().optional(),
|
|
16728
|
+
title: z.string().optional(),
|
|
16729
|
+
description: z.string().optional(),
|
|
16730
|
+
generateFull: z.boolean().optional(),
|
|
16731
|
+
serveMarkdownVariants: z.boolean().optional()
|
|
16732
|
+
}).optional(),
|
|
16701
16733
|
state: z.object({
|
|
16702
16734
|
dir: z.string().optional()
|
|
16703
16735
|
}).optional()
|
|
@@ -16736,6 +16768,7 @@ function createDefaultConfig(projectId) {
|
|
|
16736
16768
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16737
16769
|
ignoreAttr: "data-search-ignore",
|
|
16738
16770
|
noindexAttr: "data-search-noindex",
|
|
16771
|
+
imageDescAttr: "data-search-description",
|
|
16739
16772
|
respectRobotsNoindex: true
|
|
16740
16773
|
},
|
|
16741
16774
|
transform: {
|
|
@@ -16745,39 +16778,52 @@ function createDefaultConfig(projectId) {
|
|
|
16745
16778
|
},
|
|
16746
16779
|
chunking: {
|
|
16747
16780
|
strategy: "hybrid",
|
|
16748
|
-
maxChars:
|
|
16781
|
+
maxChars: 1500,
|
|
16749
16782
|
overlapChars: 200,
|
|
16750
16783
|
minChars: 250,
|
|
16751
16784
|
headingPathDepth: 3,
|
|
16752
16785
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16753
16786
|
prependTitle: true,
|
|
16754
|
-
pageSummaryChunk: true
|
|
16787
|
+
pageSummaryChunk: true,
|
|
16788
|
+
weightHeadings: true
|
|
16755
16789
|
},
|
|
16756
16790
|
upstash: {
|
|
16757
|
-
urlEnv: "
|
|
16758
|
-
tokenEnv: "
|
|
16791
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16792
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16793
|
+
namespaces: {
|
|
16794
|
+
pages: "pages",
|
|
16795
|
+
chunks: "chunks"
|
|
16796
|
+
}
|
|
16797
|
+
},
|
|
16798
|
+
embedding: {
|
|
16799
|
+
model: "bge-large-en-v1.5",
|
|
16800
|
+
dimensions: 1024,
|
|
16801
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16802
|
+
batchSize: 100
|
|
16759
16803
|
},
|
|
16760
16804
|
search: {
|
|
16761
|
-
semanticWeight: 0.75,
|
|
16762
|
-
inputEnrichment: true,
|
|
16763
|
-
reranking: true,
|
|
16764
16805
|
dualSearch: true,
|
|
16765
16806
|
pageSearchWeight: 0.3
|
|
16766
16807
|
},
|
|
16767
16808
|
ranking: {
|
|
16768
16809
|
enableIncomingLinkBoost: true,
|
|
16769
16810
|
enableDepthBoost: true,
|
|
16811
|
+
enableFreshnessBoost: false,
|
|
16812
|
+
freshnessDecayRate: 1e-3,
|
|
16813
|
+
enableAnchorTextBoost: false,
|
|
16770
16814
|
pageWeights: {},
|
|
16771
16815
|
aggregationCap: 5,
|
|
16772
16816
|
aggregationDecay: 0.5,
|
|
16773
16817
|
minChunkScoreRatio: 0.5,
|
|
16774
|
-
|
|
16818
|
+
minScoreRatio: 0.7,
|
|
16775
16819
|
scoreGapThreshold: 0.4,
|
|
16776
16820
|
weights: {
|
|
16777
16821
|
incomingLinks: 0.05,
|
|
16778
16822
|
depth: 0.03,
|
|
16779
16823
|
aggregation: 0.1,
|
|
16780
|
-
titleMatch: 0.15
|
|
16824
|
+
titleMatch: 0.15,
|
|
16825
|
+
freshness: 0.1,
|
|
16826
|
+
anchorText: 0.1
|
|
16781
16827
|
}
|
|
16782
16828
|
},
|
|
16783
16829
|
api: {
|
|
@@ -16788,12 +16834,23 @@ function createDefaultConfig(projectId) {
|
|
|
16788
16834
|
},
|
|
16789
16835
|
mcp: {
|
|
16790
16836
|
enable: process.env.NODE_ENV !== "production",
|
|
16837
|
+
access: "private",
|
|
16791
16838
|
transport: "stdio",
|
|
16792
16839
|
http: {
|
|
16793
16840
|
port: 3338,
|
|
16794
16841
|
path: "/mcp"
|
|
16842
|
+
},
|
|
16843
|
+
handle: {
|
|
16844
|
+
path: "/api/mcp",
|
|
16845
|
+
enableJsonResponse: true
|
|
16795
16846
|
}
|
|
16796
16847
|
},
|
|
16848
|
+
llmsTxt: {
|
|
16849
|
+
enable: false,
|
|
16850
|
+
outputPath: "static/llms.txt",
|
|
16851
|
+
generateFull: true,
|
|
16852
|
+
serveMarkdownVariants: false
|
|
16853
|
+
},
|
|
16797
16854
|
state: {
|
|
16798
16855
|
dir: ".searchsocket"
|
|
16799
16856
|
}
|
|
@@ -16921,7 +16978,15 @@ ${issues}`
|
|
|
16921
16978
|
},
|
|
16922
16979
|
upstash: {
|
|
16923
16980
|
...defaults.upstash,
|
|
16924
|
-
...parsed.upstash
|
|
16981
|
+
...parsed.upstash,
|
|
16982
|
+
namespaces: {
|
|
16983
|
+
...defaults.upstash.namespaces,
|
|
16984
|
+
...parsed.upstash?.namespaces
|
|
16985
|
+
}
|
|
16986
|
+
},
|
|
16987
|
+
embedding: {
|
|
16988
|
+
...defaults.embedding,
|
|
16989
|
+
...parsed.embedding
|
|
16925
16990
|
},
|
|
16926
16991
|
search: {
|
|
16927
16992
|
...defaults.search,
|
|
@@ -16958,8 +17023,16 @@ ${issues}`
|
|
|
16958
17023
|
http: {
|
|
16959
17024
|
...defaults.mcp.http,
|
|
16960
17025
|
...parsed.mcp?.http
|
|
17026
|
+
},
|
|
17027
|
+
handle: {
|
|
17028
|
+
...defaults.mcp.handle,
|
|
17029
|
+
...parsed.mcp?.handle
|
|
16961
17030
|
}
|
|
16962
17031
|
},
|
|
17032
|
+
llmsTxt: {
|
|
17033
|
+
...defaults.llmsTxt,
|
|
17034
|
+
...parsed.llmsTxt
|
|
17035
|
+
},
|
|
16963
17036
|
state: {
|
|
16964
17037
|
...defaults.state,
|
|
16965
17038
|
...parsed.state
|
|
@@ -16979,6 +17052,15 @@ ${issues}`
|
|
|
16979
17052
|
maxDepth: 10
|
|
16980
17053
|
};
|
|
16981
17054
|
}
|
|
17055
|
+
if (merged.mcp.access === "public") {
|
|
17056
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17057
|
+
if (!resolvedKey) {
|
|
17058
|
+
throw new SearchSocketError(
|
|
17059
|
+
"CONFIG_MISSING",
|
|
17060
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17061
|
+
);
|
|
17062
|
+
}
|
|
17063
|
+
}
|
|
16982
17064
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16983
17065
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16984
17066
|
}
|
|
@@ -17042,13 +17124,84 @@ function normalizeMarkdown(input) {
|
|
|
17042
17124
|
/**
 * Turn an arbitrary scope name into a lowercase slug.
 *
 * Runs of characters outside `a-z`, `0-9`, `.`, `_` and `-` collapse to a
 * single dash, leading/trailing dashes are stripped, and the result is cut
 * to at most 80 characters.
 *
 * @param {string} scopeName - Raw scope name.
 * @returns {string} The sanitized slug (possibly empty).
 */
function sanitizeScopeName(scopeName) {
  const lowered = scopeName.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9._-]+/g, "-");
  const stripped = dashed.replace(/^-+|-+$/g, "");
  // The length cap is applied last, after the dashes have been stripped.
  return stripped.slice(0, 80);
}
|
|
17127
|
+
/**
 * Reduce markdown to a single line of plain text.
 *
 * Fenced code blocks are replaced by a space, inline code keeps its content
 * without the backticks, the characters `# > * _ | -` become spaces, and all
 * whitespace runs collapse to one space before trimming.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string} Whitespace-normalised plain text.
 */
function markdownToPlain(markdown) {
  const withoutFences = markdown.replace(/```[\s\S]*?```/g, " ");
  const withoutBackticks = withoutFences.replace(/`([^`]+)`/g, "$1");
  const withoutMarkers = withoutBackticks.replace(/[#>*_|\-]/g, " ");
  const collapsed = withoutMarkers.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
17045
17130
|
/**
 * Produce a plain-text snippet of at most `maxLen` characters.
 *
 * The markdown is flattened with `markdownToPlain`; text that already fits
 * is returned unchanged, otherwise it is cut to `maxLen - 1` characters,
 * trimmed, and suffixed with an ellipsis character.
 *
 * @param {string} markdown - Markdown source.
 * @param {number} [maxLen=220] - Maximum snippet length.
 * @returns {string} The snippet.
 */
function toSnippet(markdown, maxLen = 220) {
  const text = markdownToPlain(markdown);
  const fits = text.length <= maxLen;
  if (fits) {
    return text;
  }
  // Reserve one character for the ellipsis; never slice with a negative end.
  const keep = Math.max(0, maxLen - 1);
  const head = text.slice(0, keep).trim();
  return `${head}\u2026`;
}
|
|
17137
|
+
/**
 * Build a plain-text excerpt of `markdown` positioned around the region that
 * matches the most query terms.
 *
 * The markdown is flattened with `markdownToPlain`. Text that already fits in
 * `maxLen` is returned whole. Otherwise every case-insensitive occurrence of
 * each query term (terms shorter than 2 characters are ignored) is located,
 * and a sliding window picks the span of at most `maxLen` characters that
 * covers the most distinct terms (ties broken by total match count). The
 * excerpt is centred on that span and nudged outward to word boundaries.
 * When no usable term or no match exists, `toSnippet` is used instead.
 *
 * @param {string} markdown - Markdown source.
 * @param {string} query - Whitespace-separated search terms.
 * @param {number} [maxLen=220] - Target excerpt length.
 * @returns {string} Excerpt, with an ellipsis on each side that was cut.
 */
function queryAwareExcerpt(markdown, query, maxLen = 220) {
  const plain = markdownToPlain(markdown);
  if (plain.length <= maxLen) return plain;
  // De-duplicate terms: a repeated query word must not count as several
  // distinct terms when windows are ranked by distinct-term coverage.
  const tokens = [...new Set(query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2))];
  if (tokens.length === 0) return toSnippet(markdown, maxLen);
  const positions = [];
  for (let ti = 0; ti < tokens.length; ti++) {
    // Escape regex metacharacters so the term is matched literally.
    const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const re = new RegExp(escaped, "gi");
    let m;
    while ((m = re.exec(plain)) !== null) {
      positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
    }
  }
  if (positions.length === 0) return toSnippet(markdown, maxLen);
  positions.sort((a, b) => a.start - b.start);
  let bestUniqueCount = 0;
  let bestTotalCount = 0;
  let bestLeft = 0;
  let bestRight = 0;
  let left = 0;
  const tokenCounts = /* @__PURE__ */ new Map();
  for (let right = 0; right < positions.length; right++) {
    tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
    // Shrink from the left until the window fits within maxLen.
    while (positions[right].end - positions[left].start > maxLen && left < right) {
      const leftToken = positions[left].tokenIdx;
      const cnt = tokenCounts.get(leftToken) - 1;
      if (cnt === 0) tokenCounts.delete(leftToken);
      else tokenCounts.set(leftToken, cnt);
      left++;
    }
    const uniqueCount = tokenCounts.size;
    const totalCount = right - left + 1;
    if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
      bestUniqueCount = uniqueCount;
      bestTotalCount = totalCount;
      bestLeft = left;
      bestRight = right;
    }
  }
  const coverStart = positions[bestLeft].start;
  // Matches are sorted by start, so an earlier match may end later than the
  // last one; take the furthest end so no match in the window gets cut.
  let coverEnd = positions[bestRight].end;
  for (let i = bestLeft; i < bestRight; i++) {
    if (positions[i].end > coverEnd) coverEnd = positions[i].end;
  }
  const mid = Math.floor((coverStart + positions[bestRight].end) / 2);
  let start = Math.max(0, mid - Math.floor(maxLen / 2));
  let end = Math.min(plain.length, start + maxLen);
  start = Math.max(0, end - maxLen);
  const hardStart = start;
  const hardEnd = end;
  // Preferred: widen to the nearest word boundary (within 30 characters).
  if (start > 0) {
    const spaceIdx = plain.lastIndexOf(" ", start);
    if (spaceIdx > start - 30) {
      start = spaceIdx + 1;
    }
  }
  if (end < plain.length) {
    const spaceIdx = plain.indexOf(" ", end);
    if (spaceIdx !== -1 && spaceIdx < end + 30) {
      end = spaceIdx;
    }
  }
  if (end - start > Math.ceil(maxLen * 1.2)) {
    // Widening overshot the length cap. Go back to the hard window and
    // narrow to word boundaries instead, never cutting into the matched
    // span. Trimming only from the right (as before) could drop the match
    // itself and left `end` stale, so the trailing ellipsis went missing.
    start = hardStart;
    end = hardEnd;
    if (start > 0) {
      const nextSpace = plain.indexOf(" ", start);
      if (nextSpace !== -1 && nextSpace < start + 30 && nextSpace < coverStart) {
        start = nextSpace + 1;
      }
    }
    if (end < plain.length) {
      const prevSpace = plain.lastIndexOf(" ", end);
      if (prevSpace > end - 30 && prevSpace >= coverEnd) {
        end = prevSpace;
      }
    }
  }
  const excerpt = plain.slice(start, end);
  // Ellipses reflect the final bounds, so any cut side is always marked.
  const prefix = start > 0 ? "\u2026" : "";
  const suffix = end < plain.length ? "\u2026" : "";
  return `${prefix}${excerpt}${suffix}`;
}
|
|
17052
17205
|
function extractFirstParagraph(markdown) {
|
|
17053
17206
|
const lines = markdown.split("\n");
|
|
17054
17207
|
let inFence = false;
|
|
@@ -17109,162 +17262,342 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
17109
17262
|
fs.mkdirSync(statePath, { recursive: true });
|
|
17110
17263
|
return { statePath };
|
|
17111
17264
|
}
|
|
17112
|
-
|
|
17113
|
-
// src/vector/upstash.ts
|
|
17114
|
-
function chunkIndexName(scope) {
|
|
17115
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17116
|
-
}
|
|
17117
|
-
function pageIndexName(scope) {
|
|
17118
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17119
|
-
}
|
|
17120
17265
|
var UpstashSearchStore = class {
|
|
17121
|
-
|
|
17266
|
+
index;
|
|
17267
|
+
pagesNs;
|
|
17268
|
+
chunksNs;
|
|
17122
17269
|
constructor(opts) {
|
|
17123
|
-
this.
|
|
17124
|
-
|
|
17125
|
-
|
|
17126
|
-
return this.client.index(chunkIndexName(scope));
|
|
17127
|
-
}
|
|
17128
|
-
pageIndex(scope) {
|
|
17129
|
-
return this.client.index(pageIndexName(scope));
|
|
17270
|
+
this.index = opts.index;
|
|
17271
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17272
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17130
17273
|
}
|
|
17131
17274
|
async upsertChunks(chunks, scope) {
|
|
17132
17275
|
if (chunks.length === 0) return;
|
|
17133
|
-
const
|
|
17134
|
-
const BATCH_SIZE = 100;
|
|
17276
|
+
const BATCH_SIZE = 90;
|
|
17135
17277
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17136
17278
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17137
|
-
await
|
|
17138
|
-
|
|
17139
|
-
|
|
17140
|
-
|
|
17141
|
-
|
|
17142
|
-
|
|
17143
|
-
|
|
17144
|
-
|
|
17145
|
-
|
|
17146
|
-
|
|
17147
|
-
|
|
17148
|
-
|
|
17279
|
+
await this.chunksNs.upsert(
|
|
17280
|
+
batch.map((c) => ({
|
|
17281
|
+
id: c.id,
|
|
17282
|
+
data: c.data,
|
|
17283
|
+
metadata: {
|
|
17284
|
+
...c.metadata,
|
|
17285
|
+
projectId: scope.projectId,
|
|
17286
|
+
scopeName: scope.scopeName,
|
|
17287
|
+
type: c.metadata.type || "chunk"
|
|
17288
|
+
}
|
|
17289
|
+
}))
|
|
17290
|
+
);
|
|
17291
|
+
}
|
|
17292
|
+
}
|
|
17293
|
+
async search(data, opts, scope) {
|
|
17294
|
+
const filterParts = [
|
|
17295
|
+
`projectId = '${scope.projectId}'`,
|
|
17296
|
+
`scopeName = '${scope.scopeName}'`
|
|
17297
|
+
];
|
|
17298
|
+
if (opts.filter) {
|
|
17299
|
+
filterParts.push(opts.filter);
|
|
17300
|
+
}
|
|
17301
|
+
const results = await this.chunksNs.query({
|
|
17302
|
+
data,
|
|
17303
|
+
topK: opts.limit,
|
|
17304
|
+
includeMetadata: true,
|
|
17305
|
+
filter: filterParts.join(" AND "),
|
|
17306
|
+
queryMode: QueryMode.HYBRID,
|
|
17307
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17308
|
+
});
|
|
17309
|
+
return results.map((doc) => ({
|
|
17310
|
+
id: String(doc.id),
|
|
17311
|
+
score: doc.score,
|
|
17312
|
+
metadata: {
|
|
17313
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17314
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17315
|
+
url: doc.metadata?.url ?? "",
|
|
17316
|
+
path: doc.metadata?.path ?? "",
|
|
17317
|
+
title: doc.metadata?.title ?? "",
|
|
17318
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17319
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17320
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17321
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17322
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17323
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17324
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17325
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17326
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17327
|
+
tags: doc.metadata?.tags ?? [],
|
|
17328
|
+
description: doc.metadata?.description || void 0,
|
|
17329
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17330
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17331
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17332
|
+
}
|
|
17333
|
+
}));
|
|
17334
|
+
}
|
|
17335
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17336
|
+
const filterParts = [
|
|
17337
|
+
`projectId = '${scope.projectId}'`,
|
|
17338
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17339
|
+
`url = '${url}'`
|
|
17340
|
+
];
|
|
17341
|
+
if (opts.filter) {
|
|
17342
|
+
filterParts.push(opts.filter);
|
|
17343
|
+
}
|
|
17344
|
+
const results = await this.chunksNs.query({
|
|
17345
|
+
data,
|
|
17346
|
+
topK: opts.limit,
|
|
17347
|
+
includeMetadata: true,
|
|
17348
|
+
filter: filterParts.join(" AND "),
|
|
17349
|
+
queryMode: QueryMode.HYBRID,
|
|
17350
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17149
17351
|
});
|
|
17150
17352
|
return results.map((doc) => ({
|
|
17151
|
-
id: doc.id,
|
|
17353
|
+
id: String(doc.id),
|
|
17152
17354
|
score: doc.score,
|
|
17153
17355
|
metadata: {
|
|
17154
17356
|
projectId: doc.metadata?.projectId ?? "",
|
|
17155
17357
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17156
|
-
url: doc.
|
|
17358
|
+
url: doc.metadata?.url ?? "",
|
|
17157
17359
|
path: doc.metadata?.path ?? "",
|
|
17158
|
-
title: doc.
|
|
17159
|
-
sectionTitle: doc.
|
|
17160
|
-
headingPath: doc.
|
|
17360
|
+
title: doc.metadata?.title ?? "",
|
|
17361
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17362
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17161
17363
|
snippet: doc.metadata?.snippet ?? "",
|
|
17162
|
-
chunkText: doc.
|
|
17364
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17163
17365
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17164
17366
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17165
17367
|
depth: doc.metadata?.depth ?? 0,
|
|
17166
17368
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17167
17369
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17168
|
-
tags: doc.
|
|
17370
|
+
tags: doc.metadata?.tags ?? [],
|
|
17169
17371
|
description: doc.metadata?.description || void 0,
|
|
17170
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17372
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17373
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17374
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17171
17375
|
}
|
|
17172
17376
|
}));
|
|
17173
17377
|
}
|
|
17174
|
-
async
|
|
17175
|
-
|
|
17378
|
+
async searchPagesByText(data, opts, scope) {
|
|
17379
|
+
return this.queryPages({ data }, opts, scope);
|
|
17380
|
+
}
|
|
17381
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17382
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17383
|
+
}
|
|
17384
|
+
async queryPages(input, opts, scope) {
|
|
17385
|
+
const filterParts = [
|
|
17386
|
+
`projectId = '${scope.projectId}'`,
|
|
17387
|
+
`scopeName = '${scope.scopeName}'`
|
|
17388
|
+
];
|
|
17389
|
+
if (opts.filter) {
|
|
17390
|
+
filterParts.push(opts.filter);
|
|
17391
|
+
}
|
|
17176
17392
|
let results;
|
|
17177
17393
|
try {
|
|
17178
|
-
results = await
|
|
17179
|
-
|
|
17180
|
-
|
|
17181
|
-
|
|
17182
|
-
|
|
17183
|
-
|
|
17184
|
-
|
|
17394
|
+
results = await this.pagesNs.query({
|
|
17395
|
+
...input,
|
|
17396
|
+
topK: opts.limit,
|
|
17397
|
+
includeMetadata: true,
|
|
17398
|
+
filter: filterParts.join(" AND "),
|
|
17399
|
+
queryMode: QueryMode.HYBRID,
|
|
17400
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17185
17401
|
});
|
|
17186
17402
|
} catch {
|
|
17187
17403
|
return [];
|
|
17188
17404
|
}
|
|
17189
17405
|
return results.map((doc) => ({
|
|
17190
|
-
id: doc.id,
|
|
17406
|
+
id: String(doc.id),
|
|
17191
17407
|
score: doc.score,
|
|
17192
|
-
title: doc.
|
|
17193
|
-
url: doc.
|
|
17194
|
-
description: doc.
|
|
17195
|
-
tags: doc.
|
|
17408
|
+
title: doc.metadata?.title ?? "",
|
|
17409
|
+
url: doc.metadata?.url ?? "",
|
|
17410
|
+
description: doc.metadata?.description ?? "",
|
|
17411
|
+
tags: doc.metadata?.tags ?? [],
|
|
17196
17412
|
depth: doc.metadata?.depth ?? 0,
|
|
17197
17413
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17198
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17414
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17415
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17199
17416
|
}));
|
|
17200
17417
|
}
|
|
17201
|
-
async deleteByIds(ids,
|
|
17418
|
+
async deleteByIds(ids, _scope) {
|
|
17202
17419
|
if (ids.length === 0) return;
|
|
17203
|
-
const
|
|
17204
|
-
const BATCH_SIZE = 500;
|
|
17420
|
+
const BATCH_SIZE = 90;
|
|
17205
17421
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17206
17422
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17207
|
-
await
|
|
17423
|
+
await this.chunksNs.delete(batch);
|
|
17208
17424
|
}
|
|
17209
17425
|
}
|
|
17210
17426
|
async deleteScope(scope) {
|
|
17211
|
-
|
|
17212
|
-
const
|
|
17213
|
-
|
|
17214
|
-
|
|
17215
|
-
|
|
17216
|
-
|
|
17217
|
-
|
|
17218
|
-
|
|
17219
|
-
|
|
17427
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17428
|
+
const ids = [];
|
|
17429
|
+
let cursor = "0";
|
|
17430
|
+
try {
|
|
17431
|
+
for (; ; ) {
|
|
17432
|
+
const result = await ns.range({
|
|
17433
|
+
cursor,
|
|
17434
|
+
limit: 100,
|
|
17435
|
+
includeMetadata: true
|
|
17436
|
+
});
|
|
17437
|
+
for (const doc of result.vectors) {
|
|
17438
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17439
|
+
ids.push(String(doc.id));
|
|
17440
|
+
}
|
|
17441
|
+
}
|
|
17442
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17443
|
+
cursor = result.nextCursor;
|
|
17444
|
+
}
|
|
17445
|
+
} catch {
|
|
17446
|
+
}
|
|
17447
|
+
if (ids.length > 0) {
|
|
17448
|
+
const BATCH_SIZE = 90;
|
|
17449
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17450
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17451
|
+
await ns.delete(batch);
|
|
17452
|
+
}
|
|
17453
|
+
}
|
|
17220
17454
|
}
|
|
17221
17455
|
}
|
|
17222
17456
|
async listScopes(projectId) {
|
|
17223
|
-
const
|
|
17224
|
-
const
|
|
17225
|
-
|
|
17226
|
-
|
|
17227
|
-
|
|
17228
|
-
|
|
17229
|
-
|
|
17230
|
-
|
|
17231
|
-
|
|
17232
|
-
|
|
17233
|
-
|
|
17234
|
-
|
|
17235
|
-
|
|
17236
|
-
|
|
17237
|
-
|
|
17238
|
-
|
|
17457
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17458
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17459
|
+
let cursor = "0";
|
|
17460
|
+
try {
|
|
17461
|
+
for (; ; ) {
|
|
17462
|
+
const result = await ns.range({
|
|
17463
|
+
cursor,
|
|
17464
|
+
limit: 100,
|
|
17465
|
+
includeMetadata: true
|
|
17466
|
+
});
|
|
17467
|
+
for (const doc of result.vectors) {
|
|
17468
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17469
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17470
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17471
|
+
}
|
|
17472
|
+
}
|
|
17473
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17474
|
+
cursor = result.nextCursor;
|
|
17475
|
+
}
|
|
17476
|
+
} catch {
|
|
17477
|
+
}
|
|
17478
|
+
}
|
|
17479
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17480
|
+
projectId,
|
|
17481
|
+
scopeName,
|
|
17482
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17483
|
+
documentCount: count
|
|
17484
|
+
}));
|
|
17485
|
+
}
|
|
17486
|
+
async getContentHashes(scope) {
|
|
17487
|
+
return this.scanHashes(this.chunksNs, scope);
|
|
17488
|
+
}
|
|
17489
|
+
/**
|
|
17490
|
+
* Fetch content hashes for a specific set of chunk keys using direct fetch()
|
|
17491
|
+
* instead of range(). This avoids potential issues with range() returning
|
|
17492
|
+
* vectors from the wrong namespace on hybrid indexes.
|
|
17493
|
+
*/
|
|
17494
|
+
async fetchContentHashesForKeys(keys, scope) {
|
|
17495
|
+
const map = /* @__PURE__ */ new Map();
|
|
17496
|
+
if (keys.length === 0) return map;
|
|
17497
|
+
const BATCH_SIZE = 90;
|
|
17498
|
+
for (let i = 0; i < keys.length; i += BATCH_SIZE) {
|
|
17499
|
+
const batch = keys.slice(i, i + BATCH_SIZE);
|
|
17239
17500
|
try {
|
|
17240
|
-
const
|
|
17241
|
-
|
|
17242
|
-
projectId,
|
|
17243
|
-
scopeName,
|
|
17244
|
-
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17245
|
-
documentCount: info.documentCount
|
|
17501
|
+
const results = await this.chunksNs.fetch(batch, {
|
|
17502
|
+
includeMetadata: true
|
|
17246
17503
|
});
|
|
17504
|
+
for (const doc of results) {
|
|
17505
|
+
if (doc && doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17506
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17507
|
+
}
|
|
17508
|
+
}
|
|
17247
17509
|
} catch {
|
|
17248
|
-
|
|
17249
|
-
|
|
17250
|
-
|
|
17251
|
-
|
|
17252
|
-
|
|
17510
|
+
}
|
|
17511
|
+
}
|
|
17512
|
+
return map;
|
|
17513
|
+
}
|
|
17514
|
+
/**
|
|
17515
|
+
* Scan all IDs in the chunks namespace for this scope.
|
|
17516
|
+
* Used for deletion detection (finding stale chunk keys).
|
|
17517
|
+
*/
|
|
17518
|
+
async scanChunkIds(scope) {
|
|
17519
|
+
const ids = /* @__PURE__ */ new Set();
|
|
17520
|
+
let cursor = "0";
|
|
17521
|
+
try {
|
|
17522
|
+
for (; ; ) {
|
|
17523
|
+
const result = await this.chunksNs.range({
|
|
17524
|
+
cursor,
|
|
17525
|
+
limit: 100,
|
|
17526
|
+
includeMetadata: true
|
|
17253
17527
|
});
|
|
17528
|
+
for (const doc of result.vectors) {
|
|
17529
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17530
|
+
ids.add(String(doc.id));
|
|
17531
|
+
}
|
|
17532
|
+
}
|
|
17533
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17534
|
+
cursor = result.nextCursor;
|
|
17254
17535
|
}
|
|
17536
|
+
} catch {
|
|
17255
17537
|
}
|
|
17256
|
-
return
|
|
17538
|
+
return ids;
|
|
17257
17539
|
}
|
|
17258
|
-
async
|
|
17540
|
+
async scanHashes(ns, scope) {
|
|
17541
|
+
const map = /* @__PURE__ */ new Map();
|
|
17542
|
+
let cursor = "0";
|
|
17543
|
+
try {
|
|
17544
|
+
for (; ; ) {
|
|
17545
|
+
const result = await ns.range({
|
|
17546
|
+
cursor,
|
|
17547
|
+
limit: 100,
|
|
17548
|
+
includeMetadata: true
|
|
17549
|
+
});
|
|
17550
|
+
for (const doc of result.vectors) {
|
|
17551
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17552
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17553
|
+
}
|
|
17554
|
+
}
|
|
17555
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17556
|
+
cursor = result.nextCursor;
|
|
17557
|
+
}
|
|
17558
|
+
} catch {
|
|
17559
|
+
}
|
|
17560
|
+
return map;
|
|
17561
|
+
}
|
|
17562
|
+
async listPages(scope, opts) {
|
|
17563
|
+
const cursor = opts?.cursor ?? "0";
|
|
17564
|
+
const limit = opts?.limit ?? 50;
|
|
17565
|
+
try {
|
|
17566
|
+
const result = await this.pagesNs.range({
|
|
17567
|
+
cursor,
|
|
17568
|
+
limit,
|
|
17569
|
+
includeMetadata: true
|
|
17570
|
+
});
|
|
17571
|
+
const pages = result.vectors.filter(
|
|
17572
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17573
|
+
).map((doc) => ({
|
|
17574
|
+
url: doc.metadata?.url ?? "",
|
|
17575
|
+
title: doc.metadata?.title ?? "",
|
|
17576
|
+
description: doc.metadata?.description ?? "",
|
|
17577
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17578
|
+
}));
|
|
17579
|
+
const response = { pages };
|
|
17580
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17581
|
+
response.nextCursor = result.nextCursor;
|
|
17582
|
+
}
|
|
17583
|
+
return response;
|
|
17584
|
+
} catch {
|
|
17585
|
+
return { pages: [] };
|
|
17586
|
+
}
|
|
17587
|
+
}
|
|
17588
|
+
async getPageHashes(scope) {
|
|
17259
17589
|
const map = /* @__PURE__ */ new Map();
|
|
17260
|
-
const index = this.chunkIndex(scope);
|
|
17261
17590
|
let cursor = "0";
|
|
17262
17591
|
try {
|
|
17263
17592
|
for (; ; ) {
|
|
17264
|
-
const result = await
|
|
17265
|
-
|
|
17266
|
-
|
|
17267
|
-
|
|
17593
|
+
const result = await this.pagesNs.range({
|
|
17594
|
+
cursor,
|
|
17595
|
+
limit: 100,
|
|
17596
|
+
includeMetadata: true
|
|
17597
|
+
});
|
|
17598
|
+
for (const doc of result.vectors) {
|
|
17599
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17600
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17268
17601
|
}
|
|
17269
17602
|
}
|
|
17270
17603
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17274,47 +17607,43 @@ var UpstashSearchStore = class {
|
|
|
17274
17607
|
}
|
|
17275
17608
|
return map;
|
|
17276
17609
|
}
|
|
17610
|
+
async deletePagesByIds(ids, _scope) {
|
|
17611
|
+
if (ids.length === 0) return;
|
|
17612
|
+
const BATCH_SIZE = 90;
|
|
17613
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17614
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17615
|
+
await this.pagesNs.delete(batch);
|
|
17616
|
+
}
|
|
17617
|
+
}
|
|
17277
17618
|
async upsertPages(pages, scope) {
|
|
17278
17619
|
if (pages.length === 0) return;
|
|
17279
|
-
const
|
|
17280
|
-
const BATCH_SIZE = 50;
|
|
17620
|
+
const BATCH_SIZE = 90;
|
|
17281
17621
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17282
17622
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17283
|
-
|
|
17284
|
-
|
|
17285
|
-
|
|
17286
|
-
|
|
17287
|
-
|
|
17288
|
-
|
|
17289
|
-
|
|
17290
|
-
|
|
17291
|
-
|
|
17292
|
-
|
|
17293
|
-
}
|
|
17294
|
-
|
|
17295
|
-
markdown: p.markdown,
|
|
17296
|
-
projectId: p.projectId,
|
|
17297
|
-
scopeName: p.scopeName,
|
|
17298
|
-
routeFile: p.routeFile,
|
|
17299
|
-
routeResolution: p.routeResolution,
|
|
17300
|
-
incomingLinks: p.incomingLinks,
|
|
17301
|
-
outgoingLinks: p.outgoingLinks,
|
|
17302
|
-
depth: p.depth,
|
|
17303
|
-
indexedAt: p.indexedAt
|
|
17304
|
-
}
|
|
17305
|
-
}));
|
|
17306
|
-
await index.upsert(docs);
|
|
17623
|
+
await this.pagesNs.upsert(
|
|
17624
|
+
batch.map((p) => ({
|
|
17625
|
+
id: p.id,
|
|
17626
|
+
data: p.data,
|
|
17627
|
+
metadata: {
|
|
17628
|
+
...p.metadata,
|
|
17629
|
+
projectId: scope.projectId,
|
|
17630
|
+
scopeName: scope.scopeName,
|
|
17631
|
+
type: "page"
|
|
17632
|
+
}
|
|
17633
|
+
}))
|
|
17634
|
+
);
|
|
17307
17635
|
}
|
|
17308
17636
|
}
|
|
17309
17637
|
async getPage(url, scope) {
|
|
17310
|
-
const index = this.pageIndex(scope);
|
|
17311
17638
|
try {
|
|
17312
|
-
const results = await
|
|
17639
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17640
|
+
includeMetadata: true
|
|
17641
|
+
});
|
|
17313
17642
|
const doc = results[0];
|
|
17314
|
-
if (!doc) return null;
|
|
17643
|
+
if (!doc || !doc.metadata) return null;
|
|
17315
17644
|
return {
|
|
17316
|
-
url: doc.
|
|
17317
|
-
title: doc.
|
|
17645
|
+
url: doc.metadata.url,
|
|
17646
|
+
title: doc.metadata.title,
|
|
17318
17647
|
markdown: doc.metadata.markdown,
|
|
17319
17648
|
projectId: doc.metadata.projectId,
|
|
17320
17649
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17322,27 +17651,86 @@ var UpstashSearchStore = class {
|
|
|
17322
17651
|
routeResolution: doc.metadata.routeResolution,
|
|
17323
17652
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17324
17653
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17654
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17325
17655
|
depth: doc.metadata.depth,
|
|
17326
|
-
tags: doc.
|
|
17656
|
+
tags: doc.metadata.tags ?? [],
|
|
17327
17657
|
indexedAt: doc.metadata.indexedAt,
|
|
17328
|
-
summary: doc.
|
|
17329
|
-
description: doc.
|
|
17330
|
-
keywords: doc.
|
|
17658
|
+
summary: doc.metadata.summary || void 0,
|
|
17659
|
+
description: doc.metadata.description || void 0,
|
|
17660
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17661
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17331
17662
|
};
|
|
17332
17663
|
} catch {
|
|
17333
17664
|
return null;
|
|
17334
17665
|
}
|
|
17335
17666
|
}
|
|
17667
|
+
async fetchPageWithVector(url, scope) {
|
|
17668
|
+
try {
|
|
17669
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17670
|
+
includeMetadata: true,
|
|
17671
|
+
includeVectors: true
|
|
17672
|
+
});
|
|
17673
|
+
const doc = results[0];
|
|
17674
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17675
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17676
|
+
return null;
|
|
17677
|
+
}
|
|
17678
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17679
|
+
} catch {
|
|
17680
|
+
return null;
|
|
17681
|
+
}
|
|
17682
|
+
}
|
|
17683
|
+
async fetchPagesBatch(urls, scope) {
|
|
17684
|
+
if (urls.length === 0) return [];
|
|
17685
|
+
try {
|
|
17686
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17687
|
+
includeMetadata: true
|
|
17688
|
+
});
|
|
17689
|
+
const out = [];
|
|
17690
|
+
for (const doc of results) {
|
|
17691
|
+
if (!doc || !doc.metadata) continue;
|
|
17692
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17693
|
+
continue;
|
|
17694
|
+
}
|
|
17695
|
+
out.push({
|
|
17696
|
+
url: doc.metadata.url,
|
|
17697
|
+
title: doc.metadata.title,
|
|
17698
|
+
routeFile: doc.metadata.routeFile,
|
|
17699
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17700
|
+
});
|
|
17701
|
+
}
|
|
17702
|
+
return out;
|
|
17703
|
+
} catch {
|
|
17704
|
+
return [];
|
|
17705
|
+
}
|
|
17706
|
+
}
|
|
17336
17707
|
async deletePages(scope) {
|
|
17708
|
+
const ids = [];
|
|
17709
|
+
let cursor = "0";
|
|
17337
17710
|
try {
|
|
17338
|
-
|
|
17339
|
-
|
|
17711
|
+
for (; ; ) {
|
|
17712
|
+
const result = await this.pagesNs.range({
|
|
17713
|
+
cursor,
|
|
17714
|
+
limit: 100,
|
|
17715
|
+
includeMetadata: true
|
|
17716
|
+
});
|
|
17717
|
+
for (const doc of result.vectors) {
|
|
17718
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17719
|
+
ids.push(String(doc.id));
|
|
17720
|
+
}
|
|
17721
|
+
}
|
|
17722
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17723
|
+
cursor = result.nextCursor;
|
|
17724
|
+
}
|
|
17340
17725
|
} catch {
|
|
17341
17726
|
}
|
|
17727
|
+
if (ids.length > 0) {
|
|
17728
|
+
await this.deletePagesByIds(ids, scope);
|
|
17729
|
+
}
|
|
17342
17730
|
}
|
|
17343
17731
|
async health() {
|
|
17344
17732
|
try {
|
|
17345
|
-
await this.
|
|
17733
|
+
await this.index.info();
|
|
17346
17734
|
return { ok: true };
|
|
17347
17735
|
} catch (error) {
|
|
17348
17736
|
return {
|
|
@@ -17352,14 +17740,31 @@ var UpstashSearchStore = class {
|
|
|
17352
17740
|
}
|
|
17353
17741
|
}
|
|
17354
17742
|
async dropAllIndexes(projectId) {
|
|
17355
|
-
const
|
|
17356
|
-
|
|
17357
|
-
|
|
17358
|
-
|
|
17359
|
-
|
|
17360
|
-
const
|
|
17361
|
-
|
|
17362
|
-
|
|
17743
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17744
|
+
const ids = [];
|
|
17745
|
+
let cursor = "0";
|
|
17746
|
+
try {
|
|
17747
|
+
for (; ; ) {
|
|
17748
|
+
const result = await ns.range({
|
|
17749
|
+
cursor,
|
|
17750
|
+
limit: 100,
|
|
17751
|
+
includeMetadata: true
|
|
17752
|
+
});
|
|
17753
|
+
for (const doc of result.vectors) {
|
|
17754
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17755
|
+
ids.push(String(doc.id));
|
|
17756
|
+
}
|
|
17757
|
+
}
|
|
17758
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17759
|
+
cursor = result.nextCursor;
|
|
17760
|
+
}
|
|
17761
|
+
} catch {
|
|
17762
|
+
}
|
|
17763
|
+
if (ids.length > 0) {
|
|
17764
|
+
const BATCH_SIZE = 90;
|
|
17765
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17766
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17767
|
+
await ns.delete(batch);
|
|
17363
17768
|
}
|
|
17364
17769
|
}
|
|
17365
17770
|
}
|
|
@@ -17373,12 +17778,16 @@ async function createUpstashStore(config) {
|
|
|
17373
17778
|
if (!url || !token) {
|
|
17374
17779
|
throw new SearchSocketError(
|
|
17375
17780
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17376
|
-
`Missing Upstash
|
|
17781
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17377
17782
|
);
|
|
17378
17783
|
}
|
|
17379
|
-
const {
|
|
17380
|
-
const
|
|
17381
|
-
return new UpstashSearchStore({
|
|
17784
|
+
const { Index } = await import('@upstash/vector');
|
|
17785
|
+
const index = new Index({ url, token });
|
|
17786
|
+
return new UpstashSearchStore({
|
|
17787
|
+
index,
|
|
17788
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17789
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17790
|
+
});
|
|
17382
17791
|
}
|
|
17383
17792
|
function sha1(input) {
|
|
17384
17793
|
return createHash("sha1").update(input).digest("hex");
|
|
@@ -17446,6 +17855,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17446
17855
|
if (normalizeText(current.text)) {
|
|
17447
17856
|
sections.push({
|
|
17448
17857
|
sectionTitle: current.sectionTitle,
|
|
17858
|
+
headingLevel: current.headingLevel,
|
|
17449
17859
|
headingPath: current.headingPath,
|
|
17450
17860
|
text: current.text.trim()
|
|
17451
17861
|
});
|
|
@@ -17464,6 +17874,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17464
17874
|
headingStack.length = level;
|
|
17465
17875
|
current = {
|
|
17466
17876
|
sectionTitle: title,
|
|
17877
|
+
headingLevel: level,
|
|
17467
17878
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
17468
17879
|
text: `${line}
|
|
17469
17880
|
`
|
|
@@ -17599,6 +18010,7 @@ function splitSection(section, config) {
|
|
|
17599
18010
|
return [
|
|
17600
18011
|
{
|
|
17601
18012
|
sectionTitle: section.sectionTitle,
|
|
18013
|
+
headingLevel: section.headingLevel,
|
|
17602
18014
|
headingPath: section.headingPath,
|
|
17603
18015
|
chunkText: text
|
|
17604
18016
|
}
|
|
@@ -17649,6 +18061,7 @@ ${chunk}`;
|
|
|
17649
18061
|
}
|
|
17650
18062
|
return merged.map((chunkText) => ({
|
|
17651
18063
|
sectionTitle: section.sectionTitle,
|
|
18064
|
+
headingLevel: section.headingLevel,
|
|
17652
18065
|
headingPath: section.headingPath,
|
|
17653
18066
|
chunkText
|
|
17654
18067
|
}));
|
|
@@ -17664,6 +18077,18 @@ function buildSummaryChunkText(page) {
|
|
|
17664
18077
|
}
|
|
17665
18078
|
return parts.join("\n\n");
|
|
17666
18079
|
}
|
|
18080
|
+
/**
 * Build the title string for a heading-backed chunk.
 *
 * Produces "<page title> — <heading trail>", where the trail is the heading
 * path joined with " > ". The section title is appended when the path has
 * more than one entry and does not already end with it; with a path of at
 * most one entry the section title alone is used.
 *
 * @param chunk - Object with `title`, `sectionTitle`, `headingLevel` and `headingPath`.
 * @returns The title string, or `undefined` when the chunk has no section
 *   title or no heading level.
 */
function buildEmbeddingTitle(chunk) {
  const { title, sectionTitle, headingLevel, headingPath } = chunk;
  if (!sectionTitle || headingLevel === void 0) return void 0;
  if (headingPath.length <= 1) {
    return `${title} \u2014 ${sectionTitle}`;
  }
  const trail = headingPath.join(" > ");
  const endsWithSection = headingPath[headingPath.length - 1] === sectionTitle;
  const suffix = endsWithSection ? "" : ` > ${sectionTitle}`;
  return `${title} \u2014 ${trail}${suffix}`;
}
|
|
17667
18092
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
17668
18093
|
if (!prependTitle) return chunk.chunkText;
|
|
17669
18094
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -17694,10 +18119,14 @@ function chunkPage(page, config, scope) {
|
|
|
17694
18119
|
tags: page.tags,
|
|
17695
18120
|
contentHash: "",
|
|
17696
18121
|
description: page.description,
|
|
17697
|
-
keywords: page.keywords
|
|
18122
|
+
keywords: page.keywords,
|
|
18123
|
+
publishedAt: page.publishedAt,
|
|
18124
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18125
|
+
meta: page.meta
|
|
17698
18126
|
};
|
|
17699
18127
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
17700
|
-
|
|
18128
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18129
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
17701
18130
|
chunks.push(summaryChunk);
|
|
17702
18131
|
}
|
|
17703
18132
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -17714,6 +18143,7 @@ function chunkPage(page, config, scope) {
|
|
|
17714
18143
|
path: page.url,
|
|
17715
18144
|
title: page.title,
|
|
17716
18145
|
sectionTitle: entry.sectionTitle,
|
|
18146
|
+
headingLevel: entry.headingLevel,
|
|
17717
18147
|
headingPath: entry.headingPath,
|
|
17718
18148
|
chunkText: entry.chunkText,
|
|
17719
18149
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -17723,10 +18153,16 @@ function chunkPage(page, config, scope) {
|
|
|
17723
18153
|
tags: page.tags,
|
|
17724
18154
|
contentHash: "",
|
|
17725
18155
|
description: page.description,
|
|
17726
|
-
keywords: page.keywords
|
|
18156
|
+
keywords: page.keywords,
|
|
18157
|
+
publishedAt: page.publishedAt,
|
|
18158
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18159
|
+
meta: page.meta
|
|
17727
18160
|
};
|
|
17728
18161
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
17729
|
-
|
|
18162
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
18163
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18164
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
18165
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
17730
18166
|
chunks.push(chunk);
|
|
17731
18167
|
}
|
|
17732
18168
|
return chunks;
|
|
@@ -18558,7 +18994,112 @@ function gfm(turndownService) {
|
|
|
18558
18994
|
]);
|
|
18559
18995
|
}
|
|
18560
18996
|
|
|
18997
|
+
// src/utils/structured-meta.ts
|
|
18998
|
+
// A meta key must start with a letter or underscore, followed only by
// letters, digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
/**
 * Tell whether `key` is an acceptable structured-meta key.
 *
 * @param key - Candidate key.
 * @returns `true` when the key matches `VALID_KEY_RE`, otherwise `false`.
 */
function validateMetaKey(key) {
  const isValid = VALID_KEY_RE.test(key);
  return isValid;
}
|
|
19002
|
+
/**
 * Convert the raw `content` of a structured-meta tag into a typed value.
 *
 * - "number" / "date": the numeric value when `content` is a finite number,
 *   otherwise `content` unchanged. Blank content is NOT treated as a number:
 *   `Number("")` and `Number("   ")` evaluate to 0, which would otherwise
 *   invent a value of 0 for an empty attribute.
 * - "boolean": `true` only for the exact string "true".
 * - "string[]": comma-separated list, each entry trimmed and empty entries
 *   dropped (same convention as the keyword/tag parsing elsewhere in this file).
 * - anything else: `content` unchanged.
 *
 * @param content - Raw attribute text.
 * @param dataType - Declared type of the value.
 * @returns The parsed value as described above.
 */
function parseMetaValue(content, dataType) {
  switch (dataType) {
    case "number":
    case "date": {
      if (String(content).trim() === "") return content;
      const parsed = Number(content);
      return Number.isFinite(parsed) ? parsed : content;
    }
    case "boolean":
      return content === "true";
    case "string[]":
      return content ? content.split(",").map((s) => s.trim()).filter(Boolean) : [];
    default:
      return content;
  }
}
|
|
19020
|
+
/**
 * Double every single quote in `s` so the text can be placed between single
 * quotes inside a filter expression.
 *
 * @param s - Raw string value.
 * @returns `s` with each `'` replaced by `''`.
 */
function escapeFilterValue(s) {
  const pieces = s.split("'");
  return pieces.join("''");
}
|
|
19023
|
+
/**
 * Turn a `{ key: value }` filter object into a filter expression over the
 * `meta.*` fields, joining the individual clauses with " AND ".
 *
 * - Keys rejected by `validateMetaKey` are skipped.
 * - String values become `meta.<key> CONTAINS '<escaped value>'`.
 * - Booleans and finite numbers become `meta.<key> = <value>`.
 * - Every other value (null, arrays, objects, NaN, Infinity) is skipped:
 *   interpolating it would stringify arbitrary content, unescaped, into the
 *   filter expression.
 *
 * @param filters - Filter object; a nullish value yields an empty string.
 * @returns The combined filter expression, or "" when no clause was produced.
 */
function buildMetaFilterString(filters) {
  const clauses = [];
  for (const [key, value] of Object.entries(filters ?? {})) {
    if (!validateMetaKey(key)) continue;
    const field = `meta.${key}`;
    if (typeof value === "string") {
      clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
    } else if (typeof value === "boolean" || (typeof value === "number" && Number.isFinite(value))) {
      clauses.push(`${field} = ${value}`);
    }
  }
  return clauses.join(" AND ");
}
|
|
19038
|
+
|
|
18561
19039
|
// src/indexing/extractor.ts
|
|
19040
|
+
/**
 * Reduce a date-like value to a millisecond timestamp.
 *
 * - `Date` instances contribute `getTime()`.
 * - Strings are parsed with `new Date(value)`.
 * - Numbers are taken as they are.
 *
 * @param value - Date, string, number, or anything else.
 * @returns The timestamp when it is a finite number; `undefined` for
 *   null/undefined, unsupported types, invalid dates and non-finite numbers.
 */
function normalizeDateToMs(value) {
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else if (typeof value === "string") {
    ms = new Date(value).getTime();
  } else if (typeof value === "number") {
    ms = value;
  } else {
    return void 0;
  }
  return Number.isFinite(ms) ? ms : void 0;
}
|
|
19055
|
+
// Frontmatter keys consulted for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];
/**
 * Pick a publication timestamp out of parsed frontmatter.
 *
 * The keys in `FRONTMATTER_DATE_FIELDS` are tried in order; the first one
 * that `normalizeDateToMs` can convert wins.
 *
 * @param data - Parsed frontmatter object.
 * @returns Millisecond timestamp, or `undefined` when no key holds a usable date.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let i = 0; i < FRONTMATTER_DATE_FIELDS.length; i += 1) {
    const ms = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[i]]);
    if (ms !== void 0) return ms;
  }
  return void 0;
}
|
|
19063
|
+
/**
 * Find a publication timestamp in an HTML document.
 *
 * Sources are consulted in this order and the first usable date wins:
 *   1. `datePublished` in JSON-LD scripts (top-level object, top-level array
 *      entries, and entries of a top-level `@graph` array),
 *   2. `<meta property="article:published_time">`,
 *   3. `<meta itemprop="datePublished">` content, else
 *      `<time itemprop="datePublished">` datetime,
 *   4. the first `<time datetime>` element.
 *
 * @param $ - Document handle used to run selector queries (as returned by `load`).
 * @returns Millisecond timestamp, or `undefined` when nothing usable is found.
 */
function extractPublishedAtFromHtml($) {
  const ldNodes = $('script[type="application/ld+json"]');
  let idx = 0;
  while (idx < ldNodes.length) {
    const node = ldNodes[idx];
    idx += 1;
    try {
      const text = $(node).html();
      if (!text) continue;
      const data = JSON.parse(text);
      let entries = [];
      if (Array.isArray(data)) {
        entries = [...data];
      } else if (data && typeof data === "object") {
        entries = Array.isArray(data["@graph"]) ? [data, ...data["@graph"]] : [data];
      }
      for (const entry of entries) {
        const ms = normalizeDateToMs(entry.datePublished);
        if (ms !== void 0) return ms;
      }
    } catch {
      // Malformed JSON-LD (or an unexpected entry) just disqualifies this script.
    }
  }
  // Remaining sources are read lazily so a later selector is only queried
  // when every earlier source failed to produce a date.
  const readers = [
    () => $('meta[property="article:published_time"]').attr("content")?.trim(),
    () => $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim(),
    () => $("time[datetime]").first().attr("datetime")?.trim()
  ];
  for (const read of readers) {
    const raw = read();
    if (!raw) continue;
    const ms = normalizeDateToMs(raw);
    if (ms !== void 0) return ms;
  }
  return void 0;
}
|
|
18562
19103
|
function hasTopLevelNoindexComment(markdown) {
|
|
18563
19104
|
const lines = markdown.split(/\r?\n/);
|
|
18564
19105
|
let inFence = false;
|
|
@@ -18574,6 +19115,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
18574
19115
|
}
|
|
18575
19116
|
return false;
|
|
18576
19117
|
}
|
|
19118
|
+
// Single-word alt texts that describe nothing about the picture.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);
// Alt text that is really an image file name (optionally with a query string).
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
/**
 * Decide whether an image's alt text is worth keeping.
 *
 * @param alt - Raw alt attribute value.
 * @returns `false` when the trimmed text is shorter than 5 characters, looks
 *   like an image file name, or is one of `GARBAGE_ALT_WORDS`
 *   (case-insensitive); `true` otherwise.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  const longEnough = text.length >= 5;
  return longEnough && !IMAGE_EXT_RE.test(text) && !GARBAGE_ALT_WORDS.has(text.toLowerCase());
}
|
|
19144
|
+
/**
 * Work out the text that should stand in for an image.
 *
 * Priority:
 *   1. the `imageDescAttr` attribute on the image itself,
 *   2. the same attribute on the closest enclosing `<figure>`,
 *   3. meaningful alt text combined with the figure's first `<figcaption>`
 *      ("alt — caption"),
 *   4. meaningful alt text alone,
 *   5. the caption alone.
 *
 * @param img - Wrapped `<img>` element.
 * @param $ - Document handle (not used by this function).
 * @param imageDescAttr - Name of the attribute carrying an explicit description.
 * @returns The replacement text, or `null` when none of the sources yields any.
 */
function resolveImageText(img, $, imageDescAttr) {
  const explicit = img.attr(imageDescAttr)?.trim();
  if (explicit) return explicit;
  const figure = img.closest("figure");
  const insideFigure = Boolean(figure.length);
  if (insideFigure) {
    const figureExplicit = figure.attr(imageDescAttr)?.trim();
    if (figureExplicit) return figureExplicit;
  }
  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  if (isMeaningfulAlt(alt)) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption ? caption : null;
}
|
|
19165
|
+
// Link texts that say nothing about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);
/**
 * Normalize the visible text of a link.
 *
 * Whitespace runs collapse to one space, the text is trimmed and lower-cased.
 *
 * @param raw - Raw link text.
 * @returns "" when the normalized text is shorter than 3 characters or is one
 *   of `STOP_ANCHORS`; otherwise the normalized text, cut to at most 100 characters.
 */
function normalizeAnchorText(raw) {
  const text = raw.replace(/\s+/g, " ").trim().toLowerCase();
  if (text.length < 3 || STOP_ANCHORS.has(text)) return "";
  return text.length > 100 ? text.slice(0, 100) : text;
}
|
|
19181
|
+
/**
 * Escape text so it can be placed inside an HTML element as plain content.
 *
 * `&`, `<` and `>` are replaced by their character entities. The ampersand is
 * handled first so the entities produced for `<` and `>` are not escaped a
 * second time.
 *
 * @param text - Raw text.
 * @returns The text with `&`, `<` and `>` replaced by `&amp;`, `&lt;` and `&gt;`.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
19184
|
+
/**
 * Replace the images under `root2` with their textual description.
 *
 * `<picture>` elements are handled first (using their first `<img>`), then
 * the remaining `<img>` elements. When `resolveImageText` yields text, the
 * element is swapped for a `<span>` holding the escaped text and the
 * figcaptions of the enclosing `<figure>` (if any) are removed; when it
 * yields nothing, the element is removed.
 *
 * @param root2 - Wrapped root element to process in place.
 * @param $ - Document handle used to wrap raw elements.
 * @param imageDescAttr - Name of the attribute carrying an explicit description.
 */
function preprocessImages(root2, $, imageDescAttr) {
  // `target` is the element being replaced, `img` the image whose text is used.
  const swapForText = (target, img) => {
    const figure = target.closest("figure");
    const text = img.length ? resolveImageText(img, $, imageDescAttr) : null;
    if (!text) {
      target.remove();
      return;
    }
    if (figure.length) figure.find("figcaption").remove();
    target.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };
  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    swapForText(picture, picture.find("img").first());
  });
  root2.find("img").each((_i, el) => {
    const img = $(el);
    swapForText(img, img);
  });
}
|
|
18577
19209
|
function extractFromHtml(url, html, config) {
|
|
18578
19210
|
const $ = load(html);
|
|
18579
19211
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -18599,6 +19231,20 @@ function extractFromHtml(url, html, config) {
|
|
|
18599
19231
|
if (weight === 0) {
|
|
18600
19232
|
return null;
|
|
18601
19233
|
}
|
|
19234
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
19235
|
+
return null;
|
|
19236
|
+
}
|
|
19237
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
19238
|
+
const meta = {};
|
|
19239
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
19240
|
+
const name = $(el).attr("name") ?? "";
|
|
19241
|
+
const key = name.slice("searchsocket:".length);
|
|
19242
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
19243
|
+
const content = $(el).attr("content") ?? "";
|
|
19244
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
19245
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
19246
|
+
});
|
|
19247
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
18602
19248
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
18603
19249
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
18604
19250
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -18610,7 +19256,9 @@ function extractFromHtml(url, html, config) {
|
|
|
18610
19256
|
root2.find(selector).remove();
|
|
18611
19257
|
}
|
|
18612
19258
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
19259
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
18613
19260
|
const outgoingLinks = [];
|
|
19261
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
18614
19262
|
root2.find("a[href]").each((_index, node) => {
|
|
18615
19263
|
const href = $(node).attr("href");
|
|
18616
19264
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -18621,7 +19269,19 @@ function extractFromHtml(url, html, config) {
|
|
|
18621
19269
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
18622
19270
|
return;
|
|
18623
19271
|
}
|
|
18624
|
-
|
|
19272
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
19273
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
19274
|
+
if (!anchorText) {
|
|
19275
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
19276
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
19277
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
19278
|
+
}
|
|
19279
|
+
}
|
|
19280
|
+
const key = `${url2}|${anchorText}`;
|
|
19281
|
+
if (!seenLinkKeys.has(key)) {
|
|
19282
|
+
seenLinkKeys.add(key);
|
|
19283
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
19284
|
+
}
|
|
18625
19285
|
} catch {
|
|
18626
19286
|
}
|
|
18627
19287
|
});
|
|
@@ -18646,16 +19306,25 @@ function extractFromHtml(url, html, config) {
|
|
|
18646
19306
|
return null;
|
|
18647
19307
|
}
|
|
18648
19308
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
19309
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
19310
|
+
if (componentTags) {
|
|
19311
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
19312
|
+
for (const t of extraTags) {
|
|
19313
|
+
if (!tags.includes(t)) tags.push(t);
|
|
19314
|
+
}
|
|
19315
|
+
}
|
|
18649
19316
|
return {
|
|
18650
19317
|
url: normalizeUrlPath(url),
|
|
18651
19318
|
title,
|
|
18652
19319
|
markdown,
|
|
18653
|
-
outgoingLinks
|
|
19320
|
+
outgoingLinks,
|
|
18654
19321
|
noindex: false,
|
|
18655
19322
|
tags,
|
|
18656
19323
|
description,
|
|
18657
19324
|
keywords,
|
|
18658
|
-
weight
|
|
19325
|
+
weight,
|
|
19326
|
+
publishedAt,
|
|
19327
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
18659
19328
|
};
|
|
18660
19329
|
}
|
|
18661
19330
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -18676,6 +19345,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18676
19345
|
if (mdWeight === 0) {
|
|
18677
19346
|
return null;
|
|
18678
19347
|
}
|
|
19348
|
+
let mdMeta;
|
|
19349
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
19350
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
19351
|
+
const metaObj = {};
|
|
19352
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
19353
|
+
if (!validateMetaKey(key)) continue;
|
|
19354
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
19355
|
+
metaObj[key] = val;
|
|
19356
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
19357
|
+
metaObj[key] = val;
|
|
19358
|
+
} else if (val instanceof Date) {
|
|
19359
|
+
metaObj[key] = val.getTime();
|
|
19360
|
+
}
|
|
19361
|
+
}
|
|
19362
|
+
if (Object.keys(metaObj).length > 0) {
|
|
19363
|
+
mdMeta = metaObj;
|
|
19364
|
+
}
|
|
19365
|
+
}
|
|
18679
19366
|
const content = parsed.content;
|
|
18680
19367
|
const normalized = normalizeMarkdown(content);
|
|
18681
19368
|
if (!normalizeText(normalized)) {
|
|
@@ -18690,6 +19377,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18690
19377
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
18691
19378
|
}
|
|
18692
19379
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
19380
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
18693
19381
|
return {
|
|
18694
19382
|
url: normalizeUrlPath(url),
|
|
18695
19383
|
title: resolvedTitle,
|
|
@@ -18699,7 +19387,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18699
19387
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
18700
19388
|
description: fmDescription,
|
|
18701
19389
|
keywords: fmKeywords,
|
|
18702
|
-
weight: mdWeight
|
|
19390
|
+
weight: mdWeight,
|
|
19391
|
+
publishedAt,
|
|
19392
|
+
meta: mdMeta
|
|
18703
19393
|
};
|
|
18704
19394
|
}
|
|
18705
19395
|
function segmentToRegex(segment) {
|
|
@@ -18894,7 +19584,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
18894
19584
|
const manifestPath = path.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
18895
19585
|
let content;
|
|
18896
19586
|
try {
|
|
18897
|
-
content = await
|
|
19587
|
+
content = await fs8.readFile(manifestPath, "utf8");
|
|
18898
19588
|
} catch {
|
|
18899
19589
|
throw new SearchSocketError(
|
|
18900
19590
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19205,13 +19895,132 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19205
19895
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19206
19896
|
return normalizeUrlPath(noExt || "/");
|
|
19207
19897
|
}
|
|
19208
|
-
|
|
19209
|
-
|
|
19898
|
+
// Recognises file names ending in "+page", "+layout" or "+error" (optionally
// followed by an "@name" segment) and the ".svelte" extension.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;

/**
 * Tells whether a path points at a Svelte component rather than a route file.
 *
 * @param {string} filePath - Path (or bare file name) to classify.
 * @returns {boolean} `true` when the path ends in ".svelte" and does not match
 *   ROUTE_FILE_RE; `false` otherwise.
 */
function isSvelteComponentFile(filePath) {
  const hasSvelteExtension = filePath.endsWith(".svelte");
  const isRouteFile = ROUTE_FILE_RE.test(filePath);
  return hasSvelteExtension && !isRouteFile;
}
|
|
19211
|
-
|
|
19212
|
-
const
|
|
19213
|
-
|
|
19214
|
-
|
|
19903
|
+
/**
 * Pulls lightweight documentation metadata out of a Svelte component source.
 *
 * Two things are looked for with regular expressions (no real parsing):
 *  - the text of the first `<!-- @component ... -->` HTML comment, used as the
 *    description;
 *  - the first `let { ... } = $props()` declaration, whose destructured entries
 *    become the prop list. A type annotation on that declaration is resolved
 *    either by name (via resolveTypeReference) or inline (via
 *    parseInlineTypeAnnotation) to attach a type string to each prop.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {{ description: (string|undefined), props: Array<{name: string, type?: string, default?: string}> }}
 */
function extractSvelteComponentMeta(source) {
  const docComment = /<!--\s*@component\s*([\s\S]*?)\s*-->/.exec(source);
  const description = docComment?.[1]?.trim() || void 0;

  const declaration = /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/.exec(source);
  if (!declaration) {
    return { description, props: [] };
  }

  // Resolve the optional type annotation into a prop-name -> type-text map.
  const annotation = declaration[2]?.trim();
  let typeByProp;
  if (annotation) {
    if (/^[A-Z]\w*$/.test(annotation)) {
      typeByProp = resolveTypeReference(source, annotation);
    } else if (annotation.startsWith("{")) {
      typeByProp = parseInlineTypeAnnotation(annotation);
    }
  }

  const props = [];
  for (const rawEntry of splitDestructureBlock(declaration[1])) {
    const entry = rawEntry.trim();
    // Rest elements (`...rest`) are not individual props.
    if (entry === "" || entry.startsWith("...")) continue;

    // Try `name: alias [= default]` first, then `name = default`.
    const parsed =
      /^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/.exec(entry) ??
      /^(\w+)\s*=\s*([\s\S]+)$/.exec(entry);
    const name = parsed ? parsed[1] : (/^(\w+)/.exec(entry)?.[1] ?? entry);
    const fallback = parsed?.[2]?.trim();
    const type = typeByProp?.get(name);

    const prop = { name };
    if (type) prop.type = type;
    if (fallback) prop.default = fallback;
    props.push(prop);
  }
  return { description, props };
}
|
|
19948
|
+
/**
 * Splits the inside of a destructuring pattern into its top-level entries.
 *
 * Commas only act as separators when they sit outside every bracket pair
 * (`{}`, `[]`, `()`) and outside string literals, so defaults such as
 * `items = [1, 2]`, `cb = (a, b) => a` or `sep = ","` stay in one entry.
 * Entries are returned untrimmed; a trailing whitespace-only entry is dropped.
 *
 * Limitation: comments inside the block are not recognised, so a quote
 * character inside a comment is treated as the start of a string.
 *
 * @param {string} block - Text between the braces of the destructuring pattern.
 * @returns {string[]} The raw entry strings in source order.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let current = "";
  let quote = null; // delimiter of the string literal we are inside, if any
  let escaped = false; // true when the previous string character was a backslash

  for (const ch of block) {
    if (quote !== null) {
      // Inside a string literal nothing is structural; just look for its end.
      current += ch;
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === quote) {
        quote = null;
      }
      continue;
    }

    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      if (ch === "{" || ch === "[" || ch === "(") {
        depth++;
      } else if (ch === "}" || ch === "]" || ch === ")") {
        depth--;
      }
      current += ch;
    }
  }

  if (current.trim()) entries.push(current);
  return entries;
}
|
|
19969
|
+
/**
 * Finds `interface <typeName> {` or `type <typeName> = {` in the source and
 * returns the members of that declaration's body.
 *
 * The body is delimited by counting `{` / `}` characters from the opening
 * brace; the text between the braces is handed to parseTypeMembers.
 *
 * @param {string} source - Full component source to search.
 * @param {string} typeName - Name of the interface or type alias; it is
 *   interpolated into a regular expression as-is.
 * @returns {Map<string, string>|undefined} Member map, or `undefined` when no
 *   declaration is found or its braces never balance.
 */
function resolveTypeReference(source, typeName) {
  const declarationRe = new RegExp(`(?:interface\\s+${typeName}\\s*|type\\s+${typeName}\\s*=\\s*)\\{`);
  const found = declarationRe.exec(source);
  if (found === null) return void 0;

  const bodyStart = found.index + found[0].length;
  let open = 1; // the declaration's own opening brace is already consumed
  let cursor = bodyStart;
  for (; cursor < source.length && open > 0; cursor++) {
    const ch = source[cursor];
    if (ch === "{") {
      open += 1;
    } else if (ch === "}") {
      open -= 1;
    }
  }
  if (open !== 0) return void 0;

  // `cursor` sits one past the closing brace, so exclude that brace itself.
  return parseTypeMembers(source.slice(bodyStart, cursor - 1));
}
|
|
19985
|
+
/**
 * Parses an inline object type annotation such as `{ a: string; b?: number }`.
 *
 * One leading `{` and one trailing `}` are removed (when present) and the
 * remainder is passed to parseTypeMembers.
 *
 * @param {string} annotation - The annotation text, normally wrapped in braces.
 * @returns {Map<string, string>} Map of member name to type text.
 */
function parseInlineTypeAnnotation(annotation) {
  let body = annotation;
  if (body.startsWith("{")) {
    body = body.slice(1);
  }
  if (body.endsWith("}")) {
    body = body.slice(0, -1);
  }
  return parseTypeMembers(body);
}
|
|
19989
|
+
/**
 * Parses the body of an interface / object type literal into a map of
 * member name -> type text.
 *
 * Members may be separated by `;`, `,` or a newline, but only separators that
 * sit outside every `{}`, `()`, `[]` and `<>` pair count. This keeps nested
 * object types, function signatures, tuples and generics (for example
 * `Record<string, number>`) in one piece instead of leaking their inner
 * members as top-level entries. Block comments and `//` comments are removed
 * first so that they neither end up in the type text nor unbalance the
 * bracket tracking. A type that spans several lines is collapsed onto one.
 *
 * Limitation: string-literal types are not tokenised, so a literal that
 * contains a bracket or separator character can still confuse the split.
 *
 * @param {string} body - Text between the braces of the type declaration.
 * @returns {Map<string, string>} Members that look like `name: type` or
 *   `name?: type`; anything else (index signatures, methods, modifiers) is
 *   skipped.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();

  // Drop comments. A `//` only starts a comment at line start or after
  // whitespace, so literals such as 'http://x' are left alone.
  const withoutComments = body
    .replace(/\/\*[\s\S]*?\*\//g, "")
    .replace(/(^|\s)\/\/[^\n]*/g, "$1");

  const members = [];
  let depth = 0;
  let current = "";
  let previous = "";
  for (const ch of withoutComments) {
    if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
    } else {
      if (ch === "{" || ch === "(" || ch === "[" || ch === "<") {
        depth++;
      } else if (ch === "}" || ch === ")" || ch === "]" || (ch === ">" && previous !== "=")) {
        // The `>` of an arrow (`=>`) is not a closing bracket. Clamp at zero so
        // a stray closer cannot hide every later separator.
        depth = Math.max(0, depth - 1);
      }
      current += ch;
    }
    previous = ch;
  }
  members.push(current);

  for (const raw of members) {
    const member = raw.trim();
    if (!member) continue;
    const memberMatch = member.match(/^(\w+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      const type = memberMatch[2].replace(/\s*\n\s*/g, " ").replace(/,\s*$/, "").trim();
      map.set(memberMatch[1], type);
    }
  }
  return map;
}
|
|
20000
|
+
/**
 * Renders component metadata as a single line of searchable prose.
 *
 * Output shape: "<Name> component. <description> Props: a (type) default: x, b."
 * The description and the props sentence are each omitted when absent.
 *
 * @param {string} componentName - Display name of the component.
 * @param {{ description?: string, props: Array<{name: string, type?: string, default?: string}> }} meta
 * @returns {string} The rendered text, or "" when there is neither a
 *   description nor any props.
 */
function buildComponentMarkdown(componentName, meta) {
  const { description, props } = meta;
  const hasProps = props.length > 0;
  if (!description && !hasProps) return "";

  const sentences = [`${componentName} component.`];
  if (description) {
    sentences.push(description);
  }
  if (hasProps) {
    const rendered = [];
    for (const { name, type, default: fallback } of props) {
      const typePart = type ? ` (${type})` : "";
      const defaultPart = fallback ? ` default: ${fallback}` : "";
      rendered.push(`${name}${typePart}${defaultPart}`);
    }
    sentences.push(`Props: ${rendered.join(", ")}.`);
  }
  return sentences.join(" ");
}
|
|
20017
|
+
/**
 * Reduces a Svelte source file to its plain text content.
 *
 * Applied in order: remove `<script>` blocks, remove `<style>` blocks, replace
 * every remaining tag with a space, replace every `{...}` expression with a
 * space, collapse whitespace runs to single spaces, then trim.
 *
 * @param {string} source - Raw `.svelte` file contents.
 * @returns {string} Whitespace-normalised text left after stripping.
 */
function normalizeSvelteToMarkdown(source) {
  const passes = [
    [/<script[\s\S]*?<\/script>/g, ""],
    [/<style[\s\S]*?<\/style>/g, ""],
    [/<[^>]+>/g, " "],
    [/\{[^}]+\}/g, " "],
    [/\s+/g, " "],
  ];
  let text = source;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
20020
|
+
async function loadContentFilesPages(cwd, config, maxPages) {
|
|
20021
|
+
const contentConfig = config.source.contentFiles;
|
|
20022
|
+
if (!contentConfig) {
|
|
20023
|
+
throw new Error("content-files config is missing");
|
|
19215
20024
|
}
|
|
19216
20025
|
const baseDir = path.resolve(cwd, contentConfig.baseDir);
|
|
19217
20026
|
const files = await fg(contentConfig.globs, {
|
|
@@ -19223,13 +20032,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19223
20032
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19224
20033
|
const pages = [];
|
|
19225
20034
|
for (const filePath of selected) {
|
|
19226
|
-
const raw = await
|
|
19227
|
-
|
|
20035
|
+
const raw = await fs8.readFile(filePath, "utf8");
|
|
20036
|
+
let markdown;
|
|
20037
|
+
let tags;
|
|
20038
|
+
if (filePath.endsWith(".md")) {
|
|
20039
|
+
markdown = raw;
|
|
20040
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
20041
|
+
const componentName = path.basename(filePath, ".svelte");
|
|
20042
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
20043
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
20044
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
20045
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
20046
|
+
tags = ["component"];
|
|
20047
|
+
} else {
|
|
20048
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
20049
|
+
}
|
|
19228
20050
|
pages.push({
|
|
19229
20051
|
url: filePathToUrl(filePath, baseDir),
|
|
19230
20052
|
markdown,
|
|
19231
20053
|
sourcePath: path.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19232
|
-
outgoingLinks: []
|
|
20054
|
+
outgoingLinks: [],
|
|
20055
|
+
...tags ? { tags } : {}
|
|
19233
20056
|
});
|
|
19234
20057
|
}
|
|
19235
20058
|
return pages;
|
|
@@ -19359,7 +20182,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19359
20182
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19360
20183
|
const pages = [];
|
|
19361
20184
|
for (const filePath of selected) {
|
|
19362
|
-
const html = await
|
|
20185
|
+
const html = await fs8.readFile(filePath, "utf8");
|
|
19363
20186
|
pages.push({
|
|
19364
20187
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19365
20188
|
html,
|
|
@@ -19422,7 +20245,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
19422
20245
|
}
|
|
19423
20246
|
async function loadRobotsTxtFromDir(dir) {
|
|
19424
20247
|
try {
|
|
19425
|
-
const content = await
|
|
20248
|
+
const content = await fs8.readFile(path.join(dir, "robots.txt"), "utf8");
|
|
19426
20249
|
return parseRobotsTxt(content);
|
|
19427
20250
|
} catch {
|
|
19428
20251
|
return null;
|
|
@@ -19450,29 +20273,65 @@ function nonNegativeOrZero(value) {
|
|
|
19450
20273
|
/**
 * Canonicalises text for title / anchor-text comparison.
 *
 * Lower-cases the input, removes every character that is not `a-z`, `0-9` or
 * whitespace, collapses whitespace runs to one space and trims the ends.
 *
 * @param {string} text - Raw title, anchor text or query.
 * @returns {string} The normalised form (may be empty).
 */
function normalizeForTitleMatch(text) {
  const lowered = text.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, "");
  const singleSpaced = alphanumericOnly.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
19453
|
-
function rankHits(hits, config, query) {
|
|
20276
|
+
function rankHits(hits, config, query, debug) {
|
|
19454
20277
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
19455
20278
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
19456
20279
|
return hits.map((hit) => {
|
|
19457
|
-
|
|
20280
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20281
|
+
let score = baseScore;
|
|
20282
|
+
let incomingLinkBoostValue = 0;
|
|
19458
20283
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
19459
20284
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
19460
|
-
|
|
20285
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
20286
|
+
score += incomingLinkBoostValue;
|
|
19461
20287
|
}
|
|
20288
|
+
let depthBoostValue = 0;
|
|
19462
20289
|
if (config.ranking.enableDepthBoost) {
|
|
19463
20290
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
19464
|
-
|
|
20291
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
20292
|
+
score += depthBoostValue;
|
|
19465
20293
|
}
|
|
20294
|
+
let titleMatchBoostValue = 0;
|
|
19466
20295
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
19467
20296
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
19468
20297
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
19469
|
-
|
|
20298
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
20299
|
+
score += titleMatchBoostValue;
|
|
19470
20300
|
}
|
|
19471
20301
|
}
|
|
19472
|
-
|
|
20302
|
+
let freshnessBoostValue = 0;
|
|
20303
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
20304
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
20305
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
20306
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
20307
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
20308
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
20309
|
+
score += freshnessBoostValue;
|
|
20310
|
+
}
|
|
20311
|
+
}
|
|
20312
|
+
let anchorTextMatchBoostValue = 0;
|
|
20313
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
20314
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
20315
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
20316
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
20317
|
+
score += anchorTextMatchBoostValue;
|
|
20318
|
+
}
|
|
20319
|
+
}
|
|
20320
|
+
const result = {
|
|
19473
20321
|
hit,
|
|
19474
20322
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
19475
20323
|
};
|
|
20324
|
+
if (debug) {
|
|
20325
|
+
result.breakdown = {
|
|
20326
|
+
baseScore,
|
|
20327
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
20328
|
+
depthBoost: depthBoostValue,
|
|
20329
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
20330
|
+
freshnessBoost: freshnessBoostValue,
|
|
20331
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
20332
|
+
};
|
|
20333
|
+
}
|
|
20334
|
+
return result;
|
|
19476
20335
|
}).sort((a, b) => {
|
|
19477
20336
|
const delta = b.finalScore - a.finalScore;
|
|
19478
20337
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -19481,12 +20340,13 @@ function rankHits(hits, config, query) {
|
|
|
19481
20340
|
function trimByScoreGap(results, config) {
|
|
19482
20341
|
if (results.length === 0) return results;
|
|
19483
20342
|
const threshold = config.ranking.scoreGapThreshold;
|
|
19484
|
-
const
|
|
19485
|
-
if (
|
|
19486
|
-
const
|
|
19487
|
-
|
|
19488
|
-
|
|
19489
|
-
|
|
20343
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
20344
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
20345
|
+
const topScore = results[0].pageScore;
|
|
20346
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
20347
|
+
const minThreshold = topScore * minScoreRatio;
|
|
20348
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
20349
|
+
}
|
|
19490
20350
|
}
|
|
19491
20351
|
if (threshold > 0 && results.length > 1) {
|
|
19492
20352
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -19556,61 +20416,99 @@ function aggregateByPage(ranked, config) {
|
|
|
19556
20416
|
return Number.isNaN(delta) ? 0 : delta;
|
|
19557
20417
|
});
|
|
19558
20418
|
}
|
|
19559
|
-
function
|
|
19560
|
-
|
|
19561
|
-
const
|
|
19562
|
-
|
|
19563
|
-
|
|
19564
|
-
|
|
19565
|
-
|
|
19566
|
-
|
|
19567
|
-
|
|
19568
|
-
|
|
19569
|
-
|
|
19570
|
-
if (pageHit) {
|
|
19571
|
-
pagesWithChunks.add(url);
|
|
19572
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
19573
|
-
return {
|
|
19574
|
-
hit: ranked.hit,
|
|
19575
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
19576
|
-
};
|
|
20419
|
+
function rankPageHits(pageHits, config, query, debug) {
|
|
20420
|
+
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
20421
|
+
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
20422
|
+
return pageHits.map((hit) => {
|
|
20423
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20424
|
+
let score = baseScore;
|
|
20425
|
+
let incomingLinkBoostValue = 0;
|
|
20426
|
+
if (config.ranking.enableIncomingLinkBoost) {
|
|
20427
|
+
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
|
|
20428
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
20429
|
+
score += incomingLinkBoostValue;
|
|
19577
20430
|
}
|
|
19578
|
-
|
|
19579
|
-
|
|
19580
|
-
|
|
19581
|
-
|
|
19582
|
-
|
|
19583
|
-
|
|
19584
|
-
|
|
19585
|
-
|
|
19586
|
-
|
|
19587
|
-
|
|
19588
|
-
|
|
19589
|
-
|
|
19590
|
-
|
|
19591
|
-
|
|
19592
|
-
|
|
19593
|
-
|
|
19594
|
-
|
|
19595
|
-
|
|
19596
|
-
|
|
19597
|
-
|
|
19598
|
-
|
|
19599
|
-
|
|
19600
|
-
routeFile: pageHit.routeFile,
|
|
19601
|
-
tags: pageHit.tags
|
|
20431
|
+
let depthBoostValue = 0;
|
|
20432
|
+
if (config.ranking.enableDepthBoost) {
|
|
20433
|
+
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
|
|
20434
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
20435
|
+
score += depthBoostValue;
|
|
20436
|
+
}
|
|
20437
|
+
let titleMatchBoostValue = 0;
|
|
20438
|
+
if (normalizedQuery && titleMatchWeight > 0) {
|
|
20439
|
+
const normalizedTitle = normalizeForTitleMatch(hit.title);
|
|
20440
|
+
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
20441
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
20442
|
+
score += titleMatchBoostValue;
|
|
20443
|
+
}
|
|
20444
|
+
}
|
|
20445
|
+
let freshnessBoostValue = 0;
|
|
20446
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
20447
|
+
const publishedAt = hit.publishedAt;
|
|
20448
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
20449
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
20450
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
20451
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
20452
|
+
score += freshnessBoostValue;
|
|
19602
20453
|
}
|
|
20454
|
+
}
|
|
20455
|
+
const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
|
|
20456
|
+
if (pageWeight !== 1) {
|
|
20457
|
+
score *= pageWeight;
|
|
20458
|
+
}
|
|
20459
|
+
const result = {
|
|
20460
|
+
url: hit.url,
|
|
20461
|
+
title: hit.title,
|
|
20462
|
+
description: hit.description,
|
|
20463
|
+
routeFile: hit.routeFile,
|
|
20464
|
+
depth: hit.depth,
|
|
20465
|
+
incomingLinks: hit.incomingLinks,
|
|
20466
|
+
tags: hit.tags,
|
|
20467
|
+
baseScore,
|
|
20468
|
+
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
|
|
20469
|
+
publishedAt: hit.publishedAt
|
|
19603
20470
|
};
|
|
19604
|
-
|
|
19605
|
-
|
|
19606
|
-
|
|
19607
|
-
|
|
19608
|
-
|
|
19609
|
-
|
|
20471
|
+
if (debug) {
|
|
20472
|
+
result.breakdown = {
|
|
20473
|
+
baseScore,
|
|
20474
|
+
pageWeight,
|
|
20475
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
20476
|
+
depthBoost: depthBoostValue,
|
|
20477
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
20478
|
+
freshnessBoost: freshnessBoostValue
|
|
20479
|
+
};
|
|
20480
|
+
}
|
|
20481
|
+
return result;
|
|
20482
|
+
}).filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0).sort((a, b) => {
|
|
19610
20483
|
const delta = b.finalScore - a.finalScore;
|
|
19611
20484
|
return Number.isNaN(delta) ? 0 : delta;
|
|
19612
20485
|
});
|
|
19613
20486
|
}
|
|
20487
|
+
/**
 * Trims the tail of a ranked page list (expected best-first by `finalScore`).
 *
 * Two independent cut-offs are applied, in this order:
 *  1. `minScoreRatio` - when positive and the top score is a finite positive
 *     number, pages scoring below `topScore * minScoreRatio` are dropped.
 *  2. `scoreGapThreshold` - when positive, the list is cut at the first page
 *     whose relative drop from its predecessor, `(prev - cur) / prev`, reaches
 *     the threshold. Predecessors with a non-positive score are skipped.
 *
 * @param {Array<{finalScore: number}>} results - Ranked pages.
 * @param {{ranking: {scoreGapThreshold: number, minScoreRatio: number}}} config
 * @returns {Array<{finalScore: number}>} The input array itself when nothing is
 *   removed by either rule, otherwise a new, shorter array.
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;

  const { scoreGapThreshold, minScoreRatio } = config.ranking;
  let kept = results;

  const best = results[0].finalScore;
  if (minScoreRatio > 0 && Number.isFinite(best) && best > 0) {
    const floor = best * minScoreRatio;
    kept = results.filter((page) => page.finalScore >= floor);
  }

  if (!(scoreGapThreshold > 0) || kept.length <= 1) return kept;

  const cut = kept.findIndex((page, idx) => {
    if (idx === 0) return false;
    const previous = kept[idx - 1].finalScore;
    return previous > 0 && (previous - page.finalScore) / previous >= scoreGapThreshold;
  });
  return cut === -1 ? kept : kept.slice(0, cut);
}
|
|
19614
20512
|
|
|
19615
20513
|
// src/utils/time.ts
|
|
19616
20514
|
function nowIso() {
|
|
@@ -19619,6 +20517,81 @@ function nowIso() {
|
|
|
19619
20517
|
/**
 * Milliseconds elapsed since a `process.hrtime.bigint()` reading.
 *
 * @param {bigint} start - Earlier value returned by `process.hrtime.bigint()`.
 * @returns {number} Elapsed time in (fractional) milliseconds.
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
|
|
20520
|
+
/**
 * Resolves a page URL against the project's base URL.
 *
 * @param {string} pageUrl - Page path or URL.
 * @param {string|undefined} baseUrl - Base to resolve against; when falsy the
 *   page URL is returned untouched.
 * @returns {string} The absolute `href`, or `pageUrl` unchanged when there is
 *   no base or the `URL` constructor rejects the pair.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  let resolved = pageUrl;
  if (baseUrl) {
    try {
      resolved = new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable base or path: keep the page URL as given.
    }
  }
  return resolved;
}
|
|
20528
|
+
/**
 * Builds the contents of `llms.txt`: a markdown index of the site's pages.
 *
 * Layout: an `# <title>` heading (falling back to the project id), an optional
 * `> <description>` block quote, then a "## Pages" bullet list with one
 * `- [title](url)` line per page, suffixed with `: description` when the page
 * has one. The llms.txt / llms-full.txt pages themselves are excluded. Pages
 * are ordered by ascending depth, then by descending incoming-link count.
 *
 * @param {Array<{url: string, title: string, description?: string, depth: number, incomingLinks: number}>} pages
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` and `project.baseUrl`.
 * @returns {string} File contents, ending with a newline.
 */
function generateLlmsTxt(pages, config) {
  const { llmsTxt, project } = config;
  const heading = llmsTxt.title ?? project.id;

  const out = [`# ${heading}`];
  if (llmsTxt.description) {
    out.push("", `> ${llmsTxt.description}`);
  }

  // `filter` yields a fresh array, so sorting it in place leaves `pages` intact.
  const listed = pages
    .filter((page) => !(page.url === "/llms.txt" || page.url === "/llms-full.txt"))
    .sort((left, right) =>
      left.depth !== right.depth
        ? left.depth - right.depth
        : right.incomingLinks - left.incomingLinks
    );

  if (listed.length > 0) {
    out.push("", "## Pages", "");
    for (const page of listed) {
      const link = resolvePageUrl(page.url, project.baseUrl);
      const suffix = page.description ? `: ${page.description}` : "";
      out.push(`- [${page.title}](${link})${suffix}`);
    }
  }

  out.push("");
  return out.join("\n");
}
|
|
20557
|
+
/**
 * Builds the contents of `llms-full.txt`: every page's markdown in one file.
 *
 * Layout: an `# <title>` heading (falling back to the project id), an optional
 * `> <description>` block quote, then for each page a `---` rule, a
 * `## [title](url)` heading and the page's trimmed markdown. The llms.txt /
 * llms-full.txt pages themselves are excluded. Pages are ordered by ascending
 * depth, then by descending incoming-link count.
 *
 * @param {Array<{url: string, title: string, markdown: string, depth: number, incomingLinks: number}>} pages
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` and `project.baseUrl`.
 * @returns {string} File contents, ending with a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const { llmsTxt, project } = config;
  const heading = llmsTxt.title ?? project.id;

  const out = [`# ${heading}`];
  if (llmsTxt.description) {
    out.push("", `> ${llmsTxt.description}`);
  }

  // `filter` yields a fresh array, so sorting it in place leaves `pages` intact.
  const ordered = pages
    .filter((page) => !(page.url === "/llms.txt" || page.url === "/llms-full.txt"))
    .sort((left, right) =>
      left.depth !== right.depth
        ? left.depth - right.depth
        : right.incomingLinks - left.incomingLinks
    );

  for (const page of ordered) {
    const link = resolvePageUrl(page.url, project.baseUrl);
    out.push("", "---", "", `## [${page.title}](${link})`, "", page.markdown.trim());
  }

  out.push("");
  return out.join("\n");
}
|
|
20580
|
+
/**
 * Writes `llms.txt` (and optionally the "-full" companion) to disk.
 *
 * The output directory is created if needed. When `config.llmsTxt.generateFull`
 * is set, the full variant is written next to the index file under a name
 * derived from it: `llms.txt` -> `llms-full.txt`. For an output path without a
 * `.txt` extension, "-full" is inserted before whatever extension it has
 * (`llms.md` -> `llms-full.md`), so the full file can never overwrite the
 * index file that was just written.
 *
 * @param {Array<object>} pages - Indexed pages passed to the generators.
 * @param {object} config - Reads `llmsTxt.outputPath` and `llmsTxt.generateFull`.
 * @param {string} cwd - Directory that `outputPath` is resolved against.
 * @param {{info: (message: string) => void}} logger3 - Receives one info line per file written.
 * @returns {Promise<void>} Rejects with the underlying fs error if a write fails.
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path.resolve(cwd, config.llmsTxt.outputPath);
  await fs8.mkdir(path.dirname(outputPath), { recursive: true });

  const content = generateLlmsTxt(pages, config);
  await fs8.writeFile(outputPath, content, "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);

  if (!config.llmsTxt.generateFull) return;

  let fullPath;
  if (/\.txt$/.test(outputPath)) {
    fullPath = outputPath.replace(/\.txt$/, "-full.txt");
  } else {
    // Without a ".txt" suffix the replace above would be a no-op and the full
    // file would clobber the index file, so derive a distinct sibling name.
    const { dir, name, ext } = path.parse(outputPath);
    fullPath = path.join(dir, `${name}-full${ext}`);
  }

  const fullContent = generateLlmsFullTxt(pages, config);
  await fs8.writeFile(fullPath, fullContent, "utf8");
  logger3.info(`Generated llms-full.txt at ${path.relative(cwd, fullPath)}`);
}
|
|
19622
20595
|
|
|
19623
20596
|
// src/indexing/pipeline.ts
|
|
19624
20597
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -19637,16 +20610,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
19637
20610
|
if (joined.length <= maxChars) return joined;
|
|
19638
20611
|
return joined.slice(0, maxChars).trim();
|
|
19639
20612
|
}
|
|
20613
|
+
/**
 * Computes a fingerprint of everything about a page that should trigger
 * re-indexing when it changes.
 *
 * The fields below are joined with "|" and hashed with `sha256`. List-valued
 * fields are sorted first so that ordering differences do not change the
 * hash; `meta` is serialised with its top-level keys in sorted order.
 *
 * @param {object} page - Reads `title`, `description`, `keywords`, `tags`,
 *   `markdown`, `outgoingLinks`, `publishedAt`, `incomingAnchorText`,
 *   `outgoingLinkUrls` and `meta`.
 * @returns {string} Whatever `sha256` returns for the joined fingerprint.
 */
function buildPageContentHash(page) {
  const sortedCsv = (values) => values.slice().sort().join(",");
  const metaJson = page.meta
    ? JSON.stringify(page.meta, Object.keys(page.meta).sort())
    : "";

  const fingerprint = [
    page.title,
    page.description ?? "",
    sortedCsv(page.keywords ?? []),
    sortedCsv(page.tags),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    sortedCsv(page.outgoingLinkUrls ?? []),
    metaJson,
  ].join("|");

  return sha256(fingerprint);
}
|
|
19640
20628
|
var IndexPipeline = class _IndexPipeline {
|
|
19641
20629
|
cwd;
|
|
19642
20630
|
config;
|
|
19643
20631
|
store;
|
|
19644
20632
|
logger;
|
|
20633
|
+
hooks;
|
|
19645
20634
|
constructor(options) {
|
|
19646
20635
|
this.cwd = options.cwd;
|
|
19647
20636
|
this.config = options.config;
|
|
19648
20637
|
this.store = options.store;
|
|
19649
20638
|
this.logger = options.logger;
|
|
20639
|
+
this.hooks = options.hooks;
|
|
19650
20640
|
}
|
|
19651
20641
|
static async create(options = {}) {
|
|
19652
20642
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
@@ -19656,7 +20646,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19656
20646
|
cwd,
|
|
19657
20647
|
config,
|
|
19658
20648
|
store,
|
|
19659
|
-
logger: options.logger ?? new Logger()
|
|
20649
|
+
logger: options.logger ?? new Logger(),
|
|
20650
|
+
hooks: options.hooks ?? {}
|
|
19660
20651
|
});
|
|
19661
20652
|
}
|
|
19662
20653
|
getConfig() {
|
|
@@ -19677,7 +20668,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19677
20668
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19678
20669
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
19679
20670
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19680
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
20671
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
19681
20672
|
if (options.force) {
|
|
19682
20673
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19683
20674
|
}
|
|
@@ -19685,9 +20676,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19685
20676
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19686
20677
|
}
|
|
19687
20678
|
const manifestStart = stageStart();
|
|
19688
|
-
const
|
|
20679
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
19689
20680
|
stageEnd("manifest", manifestStart);
|
|
19690
|
-
this.logger.debug(`Manifest: ${
|
|
20681
|
+
this.logger.debug(`Manifest: ${existingPageHashes.size} existing page hashes loaded`);
|
|
19691
20682
|
const sourceStart = stageStart();
|
|
19692
20683
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19693
20684
|
let sourcePages;
|
|
@@ -19764,11 +20755,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19764
20755
|
);
|
|
19765
20756
|
continue;
|
|
19766
20757
|
}
|
|
19767
|
-
|
|
20758
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
20759
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
20760
|
+
}
|
|
20761
|
+
let accepted;
|
|
20762
|
+
if (this.hooks.transformPage) {
|
|
20763
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20764
|
+
if (transformed === null) {
|
|
20765
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
20766
|
+
continue;
|
|
20767
|
+
}
|
|
20768
|
+
accepted = transformed;
|
|
20769
|
+
} else {
|
|
20770
|
+
accepted = extracted;
|
|
20771
|
+
}
|
|
20772
|
+
extractedPages.push(accepted);
|
|
19768
20773
|
this.logger.event("page_extracted", {
|
|
19769
|
-
url:
|
|
20774
|
+
url: accepted.url
|
|
19770
20775
|
});
|
|
19771
20776
|
}
|
|
20777
|
+
const customRecords = options.customRecords ?? [];
|
|
20778
|
+
if (customRecords.length > 0) {
|
|
20779
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
20780
|
+
for (const record of customRecords) {
|
|
20781
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20782
|
+
const normalized = normalizeMarkdown(record.content);
|
|
20783
|
+
if (!normalized.trim()) {
|
|
20784
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
20785
|
+
continue;
|
|
20786
|
+
}
|
|
20787
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
20788
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
20789
|
+
const extracted = {
|
|
20790
|
+
url: normalizedUrl,
|
|
20791
|
+
title: record.title,
|
|
20792
|
+
markdown: normalized,
|
|
20793
|
+
outgoingLinks: [],
|
|
20794
|
+
noindex: false,
|
|
20795
|
+
tags,
|
|
20796
|
+
weight: record.weight
|
|
20797
|
+
};
|
|
20798
|
+
let accepted;
|
|
20799
|
+
if (this.hooks.transformPage) {
|
|
20800
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20801
|
+
if (transformed === null) {
|
|
20802
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
20803
|
+
continue;
|
|
20804
|
+
}
|
|
20805
|
+
accepted = transformed;
|
|
20806
|
+
} else {
|
|
20807
|
+
accepted = extracted;
|
|
20808
|
+
}
|
|
20809
|
+
extractedPages.push(accepted);
|
|
20810
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
20811
|
+
}
|
|
20812
|
+
}
|
|
19772
20813
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
19773
20814
|
const uniquePages = [];
|
|
19774
20815
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -19801,15 +20842,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19801
20842
|
const linkStart = stageStart();
|
|
19802
20843
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19803
20844
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20845
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
19804
20846
|
for (const page of indexablePages) {
|
|
19805
20847
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19806
20848
|
}
|
|
19807
20849
|
for (const page of indexablePages) {
|
|
19808
|
-
|
|
20850
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
20851
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
20852
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
19809
20853
|
if (!pageSet.has(outgoing)) {
|
|
19810
20854
|
continue;
|
|
19811
20855
|
}
|
|
19812
|
-
|
|
20856
|
+
if (!seenForCount.has(outgoing)) {
|
|
20857
|
+
seenForCount.add(outgoing);
|
|
20858
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
20859
|
+
}
|
|
20860
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
20861
|
+
seenForAnchor.add(outgoing);
|
|
20862
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
20863
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
20864
|
+
}
|
|
20865
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
20866
|
+
}
|
|
19813
20867
|
}
|
|
19814
20868
|
}
|
|
19815
20869
|
stageEnd("links", linkStart);
|
|
@@ -19828,6 +20882,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19828
20882
|
});
|
|
19829
20883
|
}
|
|
19830
20884
|
}
|
|
20885
|
+
for (const record of customRecords) {
|
|
20886
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20887
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
20888
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
20889
|
+
routeFile: "",
|
|
20890
|
+
routeResolution: "exact"
|
|
20891
|
+
});
|
|
20892
|
+
}
|
|
20893
|
+
}
|
|
19831
20894
|
for (const page of indexablePages) {
|
|
19832
20895
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19833
20896
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -19845,6 +20908,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19845
20908
|
} else {
|
|
19846
20909
|
routeExact += 1;
|
|
19847
20910
|
}
|
|
20911
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
20912
|
+
let incomingAnchorText;
|
|
20913
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
20914
|
+
let joined = "";
|
|
20915
|
+
for (const phrase of anchorSet) {
|
|
20916
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
20917
|
+
if (next2.length > 500) break;
|
|
20918
|
+
joined = next2;
|
|
20919
|
+
}
|
|
20920
|
+
incomingAnchorText = joined || void 0;
|
|
20921
|
+
}
|
|
19848
20922
|
const indexedPage = {
|
|
19849
20923
|
url: page.url,
|
|
19850
20924
|
title: page.title,
|
|
@@ -19854,40 +20928,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19854
20928
|
generatedAt: nowIso(),
|
|
19855
20929
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
19856
20930
|
outgoingLinks: page.outgoingLinks.length,
|
|
20931
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
19857
20932
|
depth: getUrlDepth(page.url),
|
|
19858
20933
|
tags: page.tags,
|
|
19859
20934
|
markdown: page.markdown,
|
|
19860
20935
|
description: page.description,
|
|
19861
|
-
keywords: page.keywords
|
|
20936
|
+
keywords: page.keywords,
|
|
20937
|
+
publishedAt: page.publishedAt,
|
|
20938
|
+
incomingAnchorText,
|
|
20939
|
+
meta: page.meta
|
|
19862
20940
|
};
|
|
19863
20941
|
pages.push(indexedPage);
|
|
19864
20942
|
this.logger.event("page_indexed", { url: page.url });
|
|
19865
20943
|
}
|
|
20944
|
+
const pageRecords = pages.map((p) => {
|
|
20945
|
+
const summary = buildPageSummary(p);
|
|
20946
|
+
return {
|
|
20947
|
+
url: p.url,
|
|
20948
|
+
title: p.title,
|
|
20949
|
+
markdown: p.markdown,
|
|
20950
|
+
projectId: scope.projectId,
|
|
20951
|
+
scopeName: scope.scopeName,
|
|
20952
|
+
routeFile: p.routeFile,
|
|
20953
|
+
routeResolution: p.routeResolution,
|
|
20954
|
+
incomingLinks: p.incomingLinks,
|
|
20955
|
+
outgoingLinks: p.outgoingLinks,
|
|
20956
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
20957
|
+
depth: p.depth,
|
|
20958
|
+
tags: p.tags,
|
|
20959
|
+
indexedAt: p.generatedAt,
|
|
20960
|
+
summary,
|
|
20961
|
+
description: p.description,
|
|
20962
|
+
keywords: p.keywords,
|
|
20963
|
+
contentHash: buildPageContentHash(p),
|
|
20964
|
+
publishedAt: p.publishedAt,
|
|
20965
|
+
meta: p.meta
|
|
20966
|
+
};
|
|
20967
|
+
});
|
|
20968
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
20969
|
+
const changedPages = pageRecords.filter(
|
|
20970
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
20971
|
+
);
|
|
20972
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
19866
20973
|
if (!options.dryRun) {
|
|
19867
|
-
|
|
19868
|
-
|
|
19869
|
-
|
|
19870
|
-
|
|
19871
|
-
|
|
19872
|
-
|
|
19873
|
-
|
|
19874
|
-
|
|
19875
|
-
|
|
19876
|
-
|
|
19877
|
-
|
|
19878
|
-
|
|
19879
|
-
|
|
19880
|
-
|
|
19881
|
-
|
|
19882
|
-
|
|
19883
|
-
|
|
19884
|
-
|
|
19885
|
-
|
|
19886
|
-
|
|
19887
|
-
|
|
19888
|
-
|
|
20974
|
+
if (options.force) {
|
|
20975
|
+
await this.store.deletePages(scope);
|
|
20976
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
20977
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
20978
|
+
id: r.url,
|
|
20979
|
+
data: r.summary ?? r.title,
|
|
20980
|
+
metadata: {
|
|
20981
|
+
title: r.title,
|
|
20982
|
+
url: r.url,
|
|
20983
|
+
description: r.description ?? "",
|
|
20984
|
+
keywords: r.keywords ?? [],
|
|
20985
|
+
summary: r.summary ?? "",
|
|
20986
|
+
tags: r.tags,
|
|
20987
|
+
markdown: r.markdown,
|
|
20988
|
+
routeFile: r.routeFile,
|
|
20989
|
+
routeResolution: r.routeResolution,
|
|
20990
|
+
incomingLinks: r.incomingLinks,
|
|
20991
|
+
outgoingLinks: r.outgoingLinks,
|
|
20992
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
20993
|
+
depth: r.depth,
|
|
20994
|
+
indexedAt: r.indexedAt,
|
|
20995
|
+
contentHash: r.contentHash ?? "",
|
|
20996
|
+
publishedAt: r.publishedAt ?? null,
|
|
20997
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
20998
|
+
}
|
|
20999
|
+
}));
|
|
21000
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
21001
|
+
} else {
|
|
21002
|
+
if (changedPages.length > 0) {
|
|
21003
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
21004
|
+
const pageDocs = changedPages.map((r) => ({
|
|
21005
|
+
id: r.url,
|
|
21006
|
+
data: r.summary ?? r.title,
|
|
21007
|
+
metadata: {
|
|
21008
|
+
title: r.title,
|
|
21009
|
+
url: r.url,
|
|
21010
|
+
description: r.description ?? "",
|
|
21011
|
+
keywords: r.keywords ?? [],
|
|
21012
|
+
summary: r.summary ?? "",
|
|
21013
|
+
tags: r.tags,
|
|
21014
|
+
markdown: r.markdown,
|
|
21015
|
+
routeFile: r.routeFile,
|
|
21016
|
+
routeResolution: r.routeResolution,
|
|
21017
|
+
incomingLinks: r.incomingLinks,
|
|
21018
|
+
outgoingLinks: r.outgoingLinks,
|
|
21019
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
21020
|
+
depth: r.depth,
|
|
21021
|
+
indexedAt: r.indexedAt,
|
|
21022
|
+
contentHash: r.contentHash ?? "",
|
|
21023
|
+
publishedAt: r.publishedAt ?? null,
|
|
21024
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
21025
|
+
}
|
|
21026
|
+
}));
|
|
21027
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
21028
|
+
}
|
|
21029
|
+
if (deletedPageUrls.length > 0) {
|
|
21030
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
21031
|
+
}
|
|
21032
|
+
}
|
|
19889
21033
|
}
|
|
21034
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
21035
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
19890
21036
|
stageEnd("pages", pagesStart);
|
|
21037
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
19891
21038
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
19892
21039
|
const chunkStart = stageStart();
|
|
19893
21040
|
this.logger.info("Chunking pages...");
|
|
@@ -19896,6 +21043,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19896
21043
|
if (typeof maxChunks === "number") {
|
|
19897
21044
|
chunks = chunks.slice(0, maxChunks);
|
|
19898
21045
|
}
|
|
21046
|
+
if (this.hooks.transformChunk) {
|
|
21047
|
+
const transformed = [];
|
|
21048
|
+
for (const chunk of chunks) {
|
|
21049
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
21050
|
+
if (result === null) {
|
|
21051
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
21052
|
+
continue;
|
|
21053
|
+
}
|
|
21054
|
+
transformed.push(result);
|
|
21055
|
+
}
|
|
21056
|
+
chunks = transformed;
|
|
21057
|
+
}
|
|
19899
21058
|
for (const chunk of chunks) {
|
|
19900
21059
|
this.logger.event("chunked", {
|
|
19901
21060
|
url: chunk.url,
|
|
@@ -19908,7 +21067,12 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19908
21067
|
for (const chunk of chunks) {
|
|
19909
21068
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
19910
21069
|
}
|
|
19911
|
-
const
|
|
21070
|
+
const chunkHashStart = stageStart();
|
|
21071
|
+
const currentChunkKeys = chunks.map((c) => c.chunkKey);
|
|
21072
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.fetchContentHashesForKeys(currentChunkKeys, scope);
|
|
21073
|
+
stageEnd("chunk_hashes", chunkHashStart);
|
|
21074
|
+
this.logger.debug(`Fetched ${existingHashes.size} existing chunk hashes for ${currentChunkKeys.length} current keys`);
|
|
21075
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
19912
21076
|
if (options.force) {
|
|
19913
21077
|
return true;
|
|
19914
21078
|
}
|
|
@@ -19921,37 +21085,45 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19921
21085
|
}
|
|
19922
21086
|
return existingHash !== chunk.contentHash;
|
|
19923
21087
|
});
|
|
19924
|
-
const
|
|
21088
|
+
const existingChunkIds = options.force ? /* @__PURE__ */ new Set() : await this.store.scanChunkIds(scope);
|
|
21089
|
+
const deletes = [...existingChunkIds].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
21090
|
+
if (this.hooks.beforeIndex) {
|
|
21091
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
21092
|
+
}
|
|
19925
21093
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19926
21094
|
const upsertStart = stageStart();
|
|
19927
21095
|
let documentsUpserted = 0;
|
|
19928
21096
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
19929
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
19930
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
21097
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
19931
21098
|
const docs = changedChunks.map((chunk) => {
|
|
19932
|
-
const
|
|
19933
|
-
|
|
19934
|
-
|
|
19935
|
-
|
|
19936
|
-
|
|
19937
|
-
|
|
19938
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
19939
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
21099
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
21100
|
+
if (embeddingText.length > 2e3) {
|
|
21101
|
+
this.logger.warn(
|
|
21102
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
21103
|
+
);
|
|
21104
|
+
}
|
|
19940
21105
|
return {
|
|
19941
21106
|
id: chunk.chunkKey,
|
|
19942
|
-
|
|
21107
|
+
data: embeddingText,
|
|
19943
21108
|
metadata: {
|
|
19944
|
-
|
|
19945
|
-
scopeName: scope.scopeName,
|
|
21109
|
+
url: chunk.url,
|
|
19946
21110
|
path: chunk.path,
|
|
21111
|
+
title: chunk.title,
|
|
21112
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
21113
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
19947
21114
|
snippet: chunk.snippet,
|
|
21115
|
+
chunkText: embeddingText,
|
|
21116
|
+
tags: chunk.tags,
|
|
19948
21117
|
ordinal: chunk.ordinal,
|
|
19949
21118
|
contentHash: chunk.contentHash,
|
|
19950
21119
|
depth: chunk.depth,
|
|
19951
21120
|
incomingLinks: chunk.incomingLinks,
|
|
19952
21121
|
routeFile: chunk.routeFile,
|
|
19953
21122
|
description: chunk.description ?? "",
|
|
19954
|
-
keywords:
|
|
21123
|
+
keywords: chunk.keywords ?? [],
|
|
21124
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
21125
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
21126
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
19955
21127
|
}
|
|
19956
21128
|
};
|
|
19957
21129
|
});
|
|
@@ -19969,9 +21141,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19969
21141
|
} else {
|
|
19970
21142
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
19971
21143
|
}
|
|
21144
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
21145
|
+
const llmsStart = stageStart();
|
|
21146
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
21147
|
+
stageEnd("llms_txt", llmsStart);
|
|
21148
|
+
}
|
|
19972
21149
|
this.logger.info("Done.");
|
|
19973
|
-
|
|
21150
|
+
const stats = {
|
|
19974
21151
|
pagesProcessed: pages.length,
|
|
21152
|
+
pagesChanged,
|
|
21153
|
+
pagesDeleted,
|
|
19975
21154
|
chunksTotal: chunks.length,
|
|
19976
21155
|
chunksChanged: changedChunks.length,
|
|
19977
21156
|
documentsUpserted,
|
|
@@ -19980,16 +21159,143 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19980
21159
|
routeBestEffort,
|
|
19981
21160
|
stageTimingsMs
|
|
19982
21161
|
};
|
|
21162
|
+
if (this.hooks.afterIndex) {
|
|
21163
|
+
await this.hooks.afterIndex(stats);
|
|
21164
|
+
}
|
|
21165
|
+
return stats;
|
|
19983
21166
|
}
|
|
19984
21167
|
};
|
|
21168
|
+
|
|
21169
|
+
// src/search/related-pages.ts
|
|
21170
|
+
/**
 * Scores how much of a leading path two URL paths share.
 *
 * Both inputs are split on "/" with empty segments discarded. The score is
 * `2 * sharedLeadingSegments / (segmentsInA + segmentsInB)`, where only the
 * unbroken run of equal segments at the start of both paths is counted.
 *
 * @param {string} urlA - First URL path.
 * @param {string} urlB - Second URL path.
 * @returns {number} 1 when both paths have no segments, 0 when exactly one
 *   has none, otherwise a value in [0, 1].
 */
function diceScore(urlA, urlB) {
  const partsA = urlA.split("/").filter(Boolean);
  const partsB = urlB.split("/").filter(Boolean);
  const totalParts = partsA.length + partsB.length;
  // Two segment-less paths are treated as identical.
  if (totalParts === 0) return 1;
  // Exactly one side has no segments, so nothing can be shared.
  if (partsA.length === 0 || partsB.length === 0) return 0;
  const limit = Math.min(partsA.length, partsB.length);
  let prefixLength = 0;
  // Stop at the first position where the paths diverge.
  while (prefixLength < limit && partsA[prefixLength] === partsB[prefixLength]) {
    prefixLength += 1;
  }
  return 2 * prefixLength / totalParts;
}
|
|
21186
|
+
/**
 * Combines the three relatedness signals into one score.
 *
 * A direct link contributes a flat 0.5, the path score is weighted 0.3 and
 * the semantic score is weighted 0.2.
 *
 * @param {boolean} isLinked - Whether the two pages link to each other in either direction.
 * @param {number} dice - Path similarity score.
 * @param {number} semantic - Semantic similarity score.
 * @returns {number} The weighted sum of the three terms.
 */
function compositeScore(isLinked, dice, semantic) {
  const linkTerm = isLinked ? 0.5 : 0;
  const pathTerm = 0.3 * dice;
  const semanticTerm = 0.2 * semantic;
  // Summed left to right so floating-point results stay stable.
  return linkTerm + pathTerm + semanticTerm;
}
|
|
21189
|
+
/**
 * Picks the single label that best describes why a page is related.
 *
 * Precedence: an outgoing link wins over an incoming link, which wins over
 * path proximity (`dice` strictly above 0.4); anything else is "semantic".
 *
 * @param {boolean} isOutgoing - The source page links to the candidate.
 * @param {boolean} isIncoming - The candidate links to the source page.
 * @param {number} dice - Path similarity score.
 * @returns {"outgoing_link"|"incoming_link"|"sibling"|"semantic"} Relationship label.
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  const linkKind = isOutgoing ? "outgoing_link" : isIncoming ? "incoming_link" : null;
  if (linkKind !== null) {
    return linkKind;
  }
  return dice > 0.4 ? "sibling" : "semantic";
}
|
|
21195
|
+
|
|
21196
|
+
// src/search/engine.ts
|
|
21197
|
+
// Optional override object accepted on a search request. Every field is
// optional, and the whole object may be omitted.
var rankingOverridesSchema = (() => {
  // Fresh schema instances per field, so no two keys share a schema object.
  const optionalFlag = () => z.boolean().optional();
  const optionalNumber = () => z.number().optional();
  const optionalRatio = () => z.number().min(0).max(1).optional();

  const weights = z.object({
    incomingLinks: optionalNumber(),
    depth: optionalNumber(),
    aggregation: optionalNumber(),
    titleMatch: optionalNumber()
  }).optional();

  const ranking = z.object({
    enableIncomingLinkBoost: optionalFlag(),
    enableDepthBoost: optionalFlag(),
    aggregationCap: z.number().int().positive().optional(),
    aggregationDecay: optionalRatio(),
    minChunkScoreRatio: optionalRatio(),
    minScoreRatio: optionalRatio(),
    scoreGapThreshold: optionalRatio(),
    weights
  }).optional();

  const search = z.object({
    pageSearchWeight: optionalRatio()
  }).optional();

  return z.object({ ranking, search }).optional();
})();
|
|
19985
21217
|
// Validation schema for a search request. Only `q` is required; it is
// trimmed and must be non-empty after trimming.
var requestSchema = (() => {
  // A metadata filter value may be a string, a number or a boolean.
  const filterValue = z.union([z.string(), z.number(), z.boolean()]);
  const optionalString = () => z.string().optional();

  return z.object({
    q: z.string().trim().min(1),
    topK: z.number().int().positive().max(100).optional(),
    scope: optionalString(),
    pathPrefix: optionalString(),
    tags: z.array(z.string()).optional(),
    filters: z.record(z.string(), filterValue).optional(),
    groupBy: z.enum(["page", "chunk"]).optional(),
    maxSubResults: z.number().int().positive().max(20).optional(),
    debug: z.boolean().optional(),
    rankingOverrides: rankingOverridesSchema
  });
})();
|
|
21229
|
+
// Upper bound on the number of pages collected when building a site structure.
var MAX_SITE_STRUCTURE_PAGES = 2000;

/**
 * Creates an empty site-structure tree node.
 *
 * The node starts unindexed, with blank title/route file and no children;
 * callers fill those in once the matching page is known.
 *
 * @param {string} url - URL path the node represents.
 * @param {number} depth - Number of path segments in `url`.
 * @returns {{url: string, title: string, depth: number, routeFile: string, isIndexed: boolean, childCount: number, children: Array}} New node.
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
|
|
21233
|
+
/**
 * Arranges a flat list of indexed pages into a tree keyed by URL path.
 *
 * A node is created for every path prefix of every page, so intermediate
 * paths that have no page of their own still appear (with `isIndexed`
 * false). Children are sorted by URL and `childCount` is the number of
 * direct children.
 *
 * @param {Array<{url: string, title: string, routeFile: string}>} pages - Pages to place in the tree.
 * @param {string} [pathPrefix] - When given, the node at this (normalized)
 *   path is returned instead of the site root; if no such node exists an
 *   empty placeholder node is returned.
 * @returns {object} Root node of the requested tree or subtree.
 */
function buildTree(pages, pathPrefix) {
  const splitPath = (value) => value.split("/").filter(Boolean);
  const root2 = makeNode("/", 0);
  const nodeMap = new Map([["/", root2]]);

  for (const page of pages) {
    const normalized = normalizeUrlPath(page.url);
    const segments = splitPath(normalized);
    let target = root2;
    if (segments.length > 0) {
      // Make sure every ancestor path (and the page's own path) has a node.
      segments.forEach((_segment, index) => {
        const partialUrl = "/" + segments.slice(0, index + 1).join("/");
        if (!nodeMap.has(partialUrl)) {
          nodeMap.set(partialUrl, makeNode(partialUrl, index + 1));
        }
      });
      target = nodeMap.get(normalized);
    }
    target.title = page.title;
    target.routeFile = page.routeFile;
    target.isIndexed = true;
  }

  // Attach every non-root node to its parent. Dropping the last segment of
  // a one-segment path leaves "/", so the root needs no special case.
  for (const [url, node] of nodeMap) {
    if (url === "/") continue;
    const parentUrl = "/" + splitPath(url).slice(0, -1).join("/");
    const parent = nodeMap.get(parentUrl) ?? root2;
    parent.children.push(node);
  }

  // Sort children and record counts for every node reachable from the root.
  const pending = [root2];
  while (pending.length > 0) {
    const current = pending.pop();
    current.children.sort((a, b) => a.url.localeCompare(b.url));
    current.childCount = current.children.length;
    for (const child of current.children) {
      pending.push(child);
    }
  }

  if (!pathPrefix) {
    return root2;
  }
  const normalizedPrefix = normalizeUrlPath(pathPrefix);
  return nodeMap.get(normalizedPrefix) ?? makeNode(normalizedPrefix, splitPath(normalizedPrefix).length);
}
|
|
21282
|
+
/**
 * Returns a copy of `base` with search and ranking overrides layered on top.
 *
 * `search`, `ranking` and `ranking.weights` are each merged one level deep,
 * with override keys winning. Neither argument is mutated.
 *
 * @param {object} base - Resolved configuration; must have a `ranking` object.
 * @param {{search?: object, ranking?: object}} overrides - Override values.
 * @returns {object} New configuration object.
 */
function mergeRankingOverrides(base, overrides) {
  const search = { ...base.search, ...overrides.search };
  // Weights are merged separately so a partial override keeps the other base weights.
  const weights = { ...base.ranking.weights, ...overrides.ranking?.weights };
  const ranking = { ...base.ranking, ...overrides.ranking, weights };
  return { ...base, search, ranking };
}
|
|
19993
21299
|
var SearchEngine = class _SearchEngine {
|
|
19994
21300
|
cwd;
|
|
19995
21301
|
config;
|
|
@@ -20019,125 +21325,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
20019
21325
|
}
|
|
20020
21326
|
const input = parsed.data;
|
|
20021
21327
|
const totalStart = process.hrtime.bigint();
|
|
21328
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
20022
21329
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20023
21330
|
const topK = input.topK ?? 10;
|
|
21331
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
20024
21332
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20025
|
-
const
|
|
20026
|
-
const
|
|
20027
|
-
|
|
20028
|
-
|
|
20029
|
-
|
|
20030
|
-
|
|
20031
|
-
|
|
20032
|
-
|
|
20033
|
-
|
|
21333
|
+
const queryText = input.q;
|
|
21334
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
21335
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
21336
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
21337
|
+
const metaFilter = metaFilterStr || void 0;
|
|
21338
|
+
const applyPagePostFilters = (hits) => {
|
|
21339
|
+
let filtered = hits;
|
|
21340
|
+
if (pathPrefix) {
|
|
21341
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
21342
|
+
}
|
|
21343
|
+
if (filterTags) {
|
|
21344
|
+
filtered = filtered.filter(
|
|
21345
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
21346
|
+
);
|
|
20034
21347
|
}
|
|
20035
|
-
|
|
20036
|
-
|
|
20037
|
-
const
|
|
21348
|
+
return filtered;
|
|
21349
|
+
};
|
|
21350
|
+
const applyChunkPostFilters = (hits) => {
|
|
21351
|
+
let filtered = hits;
|
|
21352
|
+
if (filterTags) {
|
|
21353
|
+
filtered = filtered.filter(
|
|
21354
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21355
|
+
);
|
|
21356
|
+
}
|
|
21357
|
+
return filtered;
|
|
21358
|
+
};
|
|
20038
21359
|
const searchStart = process.hrtime.bigint();
|
|
20039
|
-
|
|
20040
|
-
|
|
20041
|
-
const
|
|
20042
|
-
const
|
|
20043
|
-
|
|
20044
|
-
|
|
20045
|
-
|
|
20046
|
-
|
|
20047
|
-
|
|
20048
|
-
|
|
20049
|
-
|
|
20050
|
-
|
|
20051
|
-
|
|
20052
|
-
|
|
20053
|
-
|
|
20054
|
-
|
|
20055
|
-
|
|
20056
|
-
{
|
|
20057
|
-
limit: chunkLimit,
|
|
20058
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20059
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20060
|
-
reranking: false,
|
|
20061
|
-
filter
|
|
20062
|
-
},
|
|
21360
|
+
if (groupByPage) {
|
|
21361
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
21362
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
21363
|
+
const pageHits = await this.store.searchPagesByText(
|
|
21364
|
+
queryText,
|
|
21365
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
21366
|
+
resolvedScope
|
|
21367
|
+
);
|
|
21368
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
21369
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
21370
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
21371
|
+
const topPages = rankedPages.slice(0, topK);
|
|
21372
|
+
const chunkPromises = topPages.map(
|
|
21373
|
+
(page) => this.store.searchChunksByUrl(
|
|
21374
|
+
queryText,
|
|
21375
|
+
page.url,
|
|
21376
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
20063
21377
|
resolvedScope
|
|
20064
|
-
)
|
|
20065
|
-
|
|
20066
|
-
const
|
|
20067
|
-
|
|
21378
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
21379
|
+
);
|
|
21380
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
21381
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21382
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
21383
|
+
return {
|
|
21384
|
+
q: input.q,
|
|
21385
|
+
scope: resolvedScope.scopeName,
|
|
21386
|
+
results,
|
|
21387
|
+
meta: {
|
|
21388
|
+
timingsMs: {
|
|
21389
|
+
search: Math.round(searchMs),
|
|
21390
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21391
|
+
}
|
|
21392
|
+
}
|
|
21393
|
+
};
|
|
20068
21394
|
} else {
|
|
21395
|
+
const candidateK = Math.max(50, topK);
|
|
21396
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
20069
21397
|
const hits = await this.store.search(
|
|
20070
|
-
|
|
20071
|
-
{
|
|
20072
|
-
limit: candidateK,
|
|
20073
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20074
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20075
|
-
reranking: this.config.search.reranking,
|
|
20076
|
-
filter
|
|
20077
|
-
},
|
|
21398
|
+
queryText,
|
|
21399
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
20078
21400
|
resolvedScope
|
|
20079
21401
|
);
|
|
20080
|
-
|
|
20081
|
-
|
|
20082
|
-
|
|
20083
|
-
|
|
20084
|
-
|
|
20085
|
-
|
|
20086
|
-
|
|
20087
|
-
|
|
20088
|
-
|
|
20089
|
-
|
|
20090
|
-
|
|
20091
|
-
|
|
21402
|
+
let filtered = hits;
|
|
21403
|
+
if (pathPrefix) {
|
|
21404
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
21405
|
+
}
|
|
21406
|
+
if (filterTags) {
|
|
21407
|
+
filtered = filtered.filter(
|
|
21408
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21409
|
+
);
|
|
21410
|
+
}
|
|
21411
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
21412
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21413
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
21414
|
+
return {
|
|
21415
|
+
q: input.q,
|
|
21416
|
+
scope: resolvedScope.scopeName,
|
|
21417
|
+
results,
|
|
21418
|
+
meta: {
|
|
21419
|
+
timingsMs: {
|
|
21420
|
+
search: Math.round(searchMs),
|
|
21421
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21422
|
+
}
|
|
20092
21423
|
}
|
|
21424
|
+
};
|
|
21425
|
+
}
|
|
21426
|
+
}
|
|
21427
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
21428
|
+
return rankedPages.map((page, i) => {
|
|
21429
|
+
const chunks = allChunks[i] ?? [];
|
|
21430
|
+
const bestChunk = chunks[0];
|
|
21431
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
21432
|
+
const result = {
|
|
21433
|
+
url: page.url,
|
|
21434
|
+
title: page.title,
|
|
21435
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
21436
|
+
snippet,
|
|
21437
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
21438
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
21439
|
+
routeFile: page.routeFile,
|
|
21440
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
21441
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
21442
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
21443
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
21444
|
+
headingPath: c.metadata.headingPath,
|
|
21445
|
+
score: Number(c.score.toFixed(6))
|
|
21446
|
+
})) : void 0
|
|
21447
|
+
};
|
|
21448
|
+
if (debug && page.breakdown) {
|
|
21449
|
+
result.breakdown = {
|
|
21450
|
+
baseScore: page.breakdown.baseScore,
|
|
21451
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
21452
|
+
depthBoost: page.breakdown.depthBoost,
|
|
21453
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
21454
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
21455
|
+
anchorTextMatchBoost: 0
|
|
21456
|
+
};
|
|
20093
21457
|
}
|
|
20094
|
-
|
|
21458
|
+
return result;
|
|
21459
|
+
});
|
|
20095
21460
|
}
|
|
20096
|
-
ensureSnippet(hit) {
|
|
21461
|
+
ensureSnippet(hit, query) {
|
|
21462
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
21463
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
20097
21464
|
const snippet = hit.hit.metadata.snippet;
|
|
20098
21465
|
if (snippet && snippet.length >= 30) return snippet;
|
|
20099
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
20100
21466
|
if (chunkText) return toSnippet(chunkText);
|
|
20101
21467
|
return snippet || "";
|
|
20102
21468
|
}
|
|
20103
|
-
buildResults(ordered, topK, groupByPage,
|
|
21469
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
21470
|
+
const cfg = config ?? this.config;
|
|
20104
21471
|
if (groupByPage) {
|
|
20105
|
-
let pages = aggregateByPage(ordered,
|
|
20106
|
-
pages = trimByScoreGap(pages,
|
|
20107
|
-
const minRatio =
|
|
21472
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
21473
|
+
pages = trimByScoreGap(pages, cfg);
|
|
21474
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
20108
21475
|
return pages.slice(0, topK).map((page) => {
|
|
20109
21476
|
const bestScore = page.bestChunk.finalScore;
|
|
20110
21477
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20111
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
20112
|
-
|
|
21478
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
21479
|
+
const result = {
|
|
20113
21480
|
url: page.url,
|
|
20114
21481
|
title: page.title,
|
|
20115
21482
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
20116
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
21483
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
21484
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
20117
21485
|
score: Number(page.pageScore.toFixed(6)),
|
|
20118
21486
|
routeFile: page.routeFile,
|
|
20119
|
-
chunks: meaningful.length
|
|
21487
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
20120
21488
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
20121
|
-
snippet: this.ensureSnippet(c),
|
|
21489
|
+
snippet: this.ensureSnippet(c, query),
|
|
21490
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
20122
21491
|
headingPath: c.hit.metadata.headingPath,
|
|
20123
21492
|
score: Number(c.finalScore.toFixed(6))
|
|
20124
21493
|
})) : void 0
|
|
20125
21494
|
};
|
|
21495
|
+
if (debug && page.bestChunk.breakdown) {
|
|
21496
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
21497
|
+
}
|
|
21498
|
+
return result;
|
|
20126
21499
|
});
|
|
20127
21500
|
} else {
|
|
20128
21501
|
let filtered = ordered;
|
|
20129
|
-
const
|
|
20130
|
-
if (
|
|
20131
|
-
|
|
20132
|
-
|
|
20133
|
-
|
|
20134
|
-
|
|
20135
|
-
|
|
20136
|
-
|
|
20137
|
-
|
|
20138
|
-
|
|
20139
|
-
|
|
20140
|
-
|
|
21502
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
21503
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
21504
|
+
const topScore = ordered[0].finalScore;
|
|
21505
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
21506
|
+
const threshold = topScore * minScoreRatio;
|
|
21507
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
21508
|
+
}
|
|
21509
|
+
}
|
|
21510
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
21511
|
+
const result = {
|
|
21512
|
+
url: hit.metadata.url,
|
|
21513
|
+
title: hit.metadata.title,
|
|
21514
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
21515
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
21516
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
21517
|
+
score: Number(finalScore.toFixed(6)),
|
|
21518
|
+
routeFile: hit.metadata.routeFile
|
|
21519
|
+
};
|
|
21520
|
+
if (debug && breakdown) {
|
|
21521
|
+
result.breakdown = breakdown;
|
|
21522
|
+
}
|
|
21523
|
+
return result;
|
|
21524
|
+
});
|
|
20141
21525
|
}
|
|
20142
21526
|
}
|
|
20143
21527
|
async getPage(pathOrUrl, scope) {
|
|
@@ -20163,6 +21547,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
20163
21547
|
markdown: page.markdown
|
|
20164
21548
|
};
|
|
20165
21549
|
}
|
|
21550
|
+
async listPages(opts) {
|
|
21551
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21552
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
21553
|
+
return this.store.listPages(resolvedScope, {
|
|
21554
|
+
cursor: opts?.cursor,
|
|
21555
|
+
limit: opts?.limit,
|
|
21556
|
+
pathPrefix
|
|
21557
|
+
});
|
|
21558
|
+
}
|
|
21559
|
+
async getSiteStructure(opts) {
|
|
21560
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
21561
|
+
const allPages = [];
|
|
21562
|
+
let cursor;
|
|
21563
|
+
let truncated = false;
|
|
21564
|
+
do {
|
|
21565
|
+
const result = await this.listPages({
|
|
21566
|
+
pathPrefix: opts?.pathPrefix,
|
|
21567
|
+
scope: opts?.scope,
|
|
21568
|
+
cursor,
|
|
21569
|
+
limit: 200
|
|
21570
|
+
});
|
|
21571
|
+
allPages.push(...result.pages);
|
|
21572
|
+
cursor = result.nextCursor;
|
|
21573
|
+
if (allPages.length >= maxPages) {
|
|
21574
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
21575
|
+
allPages.length = maxPages;
|
|
21576
|
+
break;
|
|
21577
|
+
}
|
|
21578
|
+
} while (cursor);
|
|
21579
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
21580
|
+
return {
|
|
21581
|
+
root: root2,
|
|
21582
|
+
totalPages: allPages.length,
|
|
21583
|
+
truncated
|
|
21584
|
+
};
|
|
21585
|
+
}
|
|
21586
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
21587
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21588
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
21589
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
21590
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
21591
|
+
if (!source) {
|
|
21592
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
21593
|
+
}
|
|
21594
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
21595
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
21596
|
+
source.vector,
|
|
21597
|
+
{ limit: 50 },
|
|
21598
|
+
resolvedScope
|
|
21599
|
+
);
|
|
21600
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
21601
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
21602
|
+
for (const hit of filteredHits) {
|
|
21603
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
21604
|
+
}
|
|
21605
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
21606
|
+
for (const hit of filteredHits) {
|
|
21607
|
+
candidateUrls.add(hit.url);
|
|
21608
|
+
}
|
|
21609
|
+
for (const url of sourceOutgoing) {
|
|
21610
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
21611
|
+
}
|
|
21612
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
21613
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
21614
|
+
);
|
|
21615
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
21616
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
21617
|
+
for (const hit of filteredHits) {
|
|
21618
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
21619
|
+
}
|
|
21620
|
+
for (const p of fetchedPages) {
|
|
21621
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
21622
|
+
}
|
|
21623
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
21624
|
+
if (semanticUrls.length > 0) {
|
|
21625
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
21626
|
+
for (const p of semanticPageData) {
|
|
21627
|
+
const existing = metaMap.get(p.url);
|
|
21628
|
+
if (existing) {
|
|
21629
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
21630
|
+
}
|
|
21631
|
+
}
|
|
21632
|
+
}
|
|
21633
|
+
const candidates = [];
|
|
21634
|
+
for (const url of candidateUrls) {
|
|
21635
|
+
const meta = metaMap.get(url);
|
|
21636
|
+
if (!meta) continue;
|
|
21637
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
21638
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
21639
|
+
const isLinked = isOutgoing || isIncoming;
|
|
21640
|
+
const dice = diceScore(urlPath, url);
|
|
21641
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
21642
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
21643
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
21644
|
+
candidates.push({
|
|
21645
|
+
url,
|
|
21646
|
+
title: meta.title,
|
|
21647
|
+
score: Number(score.toFixed(6)),
|
|
21648
|
+
relationshipType,
|
|
21649
|
+
routeFile: meta.routeFile
|
|
21650
|
+
});
|
|
21651
|
+
}
|
|
21652
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
21653
|
+
const results = candidates.slice(0, topK);
|
|
21654
|
+
return {
|
|
21655
|
+
sourceUrl: urlPath,
|
|
21656
|
+
scope: resolvedScope.scopeName,
|
|
21657
|
+
relatedPages: results
|
|
21658
|
+
};
|
|
21659
|
+
}
|
|
20166
21660
|
async health() {
|
|
20167
21661
|
return this.store.health();
|
|
20168
21662
|
}
|
|
@@ -20185,14 +21679,40 @@ function createServer(engine) {
|
|
|
20185
21679
|
server.registerTool(
|
|
20186
21680
|
"search",
|
|
20187
21681
|
{
|
|
20188
|
-
description:
|
|
21682
|
+
description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
|
|
20189
21683
|
inputSchema: {
|
|
20190
21684
|
query: z.string().min(1),
|
|
20191
21685
|
scope: z.string().optional(),
|
|
20192
21686
|
topK: z.number().int().positive().max(100).optional(),
|
|
20193
21687
|
pathPrefix: z.string().optional(),
|
|
20194
21688
|
tags: z.array(z.string()).optional(),
|
|
20195
|
-
|
|
21689
|
+
filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
|
|
21690
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
21691
|
+
maxSubResults: z.number().int().positive().max(20).optional()
|
|
21692
|
+
},
|
|
21693
|
+
outputSchema: {
|
|
21694
|
+
q: z.string(),
|
|
21695
|
+
scope: z.string(),
|
|
21696
|
+
results: z.array(z.object({
|
|
21697
|
+
url: z.string(),
|
|
21698
|
+
title: z.string(),
|
|
21699
|
+
sectionTitle: z.string().optional(),
|
|
21700
|
+
snippet: z.string(),
|
|
21701
|
+
score: z.number(),
|
|
21702
|
+
routeFile: z.string(),
|
|
21703
|
+
chunks: z.array(z.object({
|
|
21704
|
+
sectionTitle: z.string().optional(),
|
|
21705
|
+
snippet: z.string(),
|
|
21706
|
+
headingPath: z.array(z.string()),
|
|
21707
|
+
score: z.number()
|
|
21708
|
+
})).optional()
|
|
21709
|
+
})),
|
|
21710
|
+
meta: z.object({
|
|
21711
|
+
timingsMs: z.object({
|
|
21712
|
+
search: z.number(),
|
|
21713
|
+
total: z.number()
|
|
21714
|
+
})
|
|
21715
|
+
})
|
|
20196
21716
|
}
|
|
20197
21717
|
},
|
|
20198
21718
|
async (input) => {
|
|
@@ -20202,7 +21722,9 @@ function createServer(engine) {
|
|
|
20202
21722
|
scope: input.scope,
|
|
20203
21723
|
pathPrefix: input.pathPrefix,
|
|
20204
21724
|
tags: input.tags,
|
|
20205
|
-
|
|
21725
|
+
filters: input.filters,
|
|
21726
|
+
groupBy: input.groupBy,
|
|
21727
|
+
maxSubResults: input.maxSubResults
|
|
20206
21728
|
});
|
|
20207
21729
|
return {
|
|
20208
21730
|
content: [
|
|
@@ -20210,7 +21732,8 @@ function createServer(engine) {
|
|
|
20210
21732
|
type: "text",
|
|
20211
21733
|
text: JSON.stringify(result, null, 2)
|
|
20212
21734
|
}
|
|
20213
|
-
]
|
|
21735
|
+
],
|
|
21736
|
+
structuredContent: result
|
|
20214
21737
|
};
|
|
20215
21738
|
}
|
|
20216
21739
|
);
|
|
@@ -20235,8 +21758,134 @@ function createServer(engine) {
|
|
|
20235
21758
|
};
|
|
20236
21759
|
}
|
|
20237
21760
|
);
|
|
21761
|
+
server.registerTool(
|
|
21762
|
+
"list_pages",
|
|
21763
|
+
{
|
|
21764
|
+
description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
|
|
21765
|
+
inputSchema: {
|
|
21766
|
+
pathPrefix: z.string().optional(),
|
|
21767
|
+
cursor: z.string().optional(),
|
|
21768
|
+
limit: z.number().int().positive().max(200).optional(),
|
|
21769
|
+
scope: z.string().optional()
|
|
21770
|
+
}
|
|
21771
|
+
},
|
|
21772
|
+
async (input) => {
|
|
21773
|
+
const result = await engine.listPages({
|
|
21774
|
+
pathPrefix: input.pathPrefix,
|
|
21775
|
+
cursor: input.cursor,
|
|
21776
|
+
limit: input.limit,
|
|
21777
|
+
scope: input.scope
|
|
21778
|
+
});
|
|
21779
|
+
return {
|
|
21780
|
+
content: [
|
|
21781
|
+
{
|
|
21782
|
+
type: "text",
|
|
21783
|
+
text: JSON.stringify(result, null, 2)
|
|
21784
|
+
}
|
|
21785
|
+
]
|
|
21786
|
+
};
|
|
21787
|
+
}
|
|
21788
|
+
);
|
|
21789
|
+
server.registerTool(
|
|
21790
|
+
"get_site_structure",
|
|
21791
|
+
{
|
|
21792
|
+
description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
|
|
21793
|
+
inputSchema: {
|
|
21794
|
+
pathPrefix: z.string().optional(),
|
|
21795
|
+
scope: z.string().optional(),
|
|
21796
|
+
maxPages: z.number().int().positive().max(2e3).optional()
|
|
21797
|
+
}
|
|
21798
|
+
},
|
|
21799
|
+
async (input) => {
|
|
21800
|
+
const result = await engine.getSiteStructure({
|
|
21801
|
+
pathPrefix: input.pathPrefix,
|
|
21802
|
+
scope: input.scope,
|
|
21803
|
+
maxPages: input.maxPages
|
|
21804
|
+
});
|
|
21805
|
+
return {
|
|
21806
|
+
content: [
|
|
21807
|
+
{
|
|
21808
|
+
type: "text",
|
|
21809
|
+
text: JSON.stringify(result, null, 2)
|
|
21810
|
+
}
|
|
21811
|
+
]
|
|
21812
|
+
};
|
|
21813
|
+
}
|
|
21814
|
+
);
|
|
21815
|
+
server.registerTool(
|
|
21816
|
+
"find_source_file",
|
|
21817
|
+
{
|
|
21818
|
+
description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
|
|
21819
|
+
inputSchema: {
|
|
21820
|
+
query: z.string().min(1),
|
|
21821
|
+
scope: z.string().optional()
|
|
21822
|
+
}
|
|
21823
|
+
},
|
|
21824
|
+
async (input) => {
|
|
21825
|
+
const result = await engine.search({
|
|
21826
|
+
q: input.query,
|
|
21827
|
+
topK: 1,
|
|
21828
|
+
scope: input.scope
|
|
21829
|
+
});
|
|
21830
|
+
if (result.results.length === 0) {
|
|
21831
|
+
return {
|
|
21832
|
+
content: [
|
|
21833
|
+
{
|
|
21834
|
+
type: "text",
|
|
21835
|
+
text: JSON.stringify({
|
|
21836
|
+
error: "No matching content found for the given query."
|
|
21837
|
+
})
|
|
21838
|
+
}
|
|
21839
|
+
]
|
|
21840
|
+
};
|
|
21841
|
+
}
|
|
21842
|
+
const match = result.results[0];
|
|
21843
|
+
const { url, routeFile, sectionTitle, snippet } = match;
|
|
21844
|
+
return {
|
|
21845
|
+
content: [
|
|
21846
|
+
{
|
|
21847
|
+
type: "text",
|
|
21848
|
+
text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
|
|
21849
|
+
}
|
|
21850
|
+
]
|
|
21851
|
+
};
|
|
21852
|
+
}
|
|
21853
|
+
);
|
|
21854
|
+
server.registerTool(
|
|
21855
|
+
"get_related_pages",
|
|
21856
|
+
{
|
|
21857
|
+
description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
|
|
21858
|
+
inputSchema: {
|
|
21859
|
+
pathOrUrl: z.string().min(1),
|
|
21860
|
+
scope: z.string().optional(),
|
|
21861
|
+
topK: z.number().int().positive().max(25).optional()
|
|
21862
|
+
}
|
|
21863
|
+
},
|
|
21864
|
+
async (input) => {
|
|
21865
|
+
const result = await engine.getRelatedPages(input.pathOrUrl, {
|
|
21866
|
+
topK: input.topK,
|
|
21867
|
+
scope: input.scope
|
|
21868
|
+
});
|
|
21869
|
+
return {
|
|
21870
|
+
content: [
|
|
21871
|
+
{
|
|
21872
|
+
type: "text",
|
|
21873
|
+
text: JSON.stringify(result, null, 2)
|
|
21874
|
+
}
|
|
21875
|
+
]
|
|
21876
|
+
};
|
|
21877
|
+
}
|
|
21878
|
+
);
|
|
20238
21879
|
return server;
|
|
20239
21880
|
}
|
|
21881
|
+
/**
 * Resolves the API key for the MCP HTTP server.
 *
 * An explicit `config.mcp.http.apiKey` wins; otherwise the environment
 * variable named by `config.mcp.http.apiKeyEnv` is read, if one is configured.
 *
 * @param {object} config - Resolved config; only `config.mcp.http` is read.
 * @returns {string|undefined} The key, or undefined when none is available.
 */
function resolveApiKey(config) {
  const { apiKey, apiKeyEnv } = config.mcp.http;
  if (apiKey !== null && apiKey !== undefined) {
    return apiKey;
  }
  if (!apiKeyEnv) {
    return undefined;
  }
  return process.env[apiKeyEnv];
}
|
|
21884
|
+
/**
 * Compares a caller-supplied API key with the expected one.
 *
 * Both values are reduced to SHA-256 digests first, so `timingSafeEqual`
 * always receives two buffers of equal length regardless of the inputs.
 *
 * @param {string} provided - Key presented by the caller.
 * @param {string} expected - Configured key.
 * @returns {boolean} True when the two keys hash to the same digest.
 */
function verifyApiKey(provided, expected) {
  const [providedDigest, expectedDigest] = [provided, expected].map(
    (value) => createHash("sha256").update(value).digest()
  );
  return timingSafeEqual(providedDigest, expectedDigest);
}
|
|
20240
21889
|
function redirectConsoleToStderr() {
|
|
20241
21890
|
console.log = (...args) => {
|
|
20242
21891
|
process.stderr.write(`[LOG] ${args.map(String).join(" ")}
|
|
@@ -20251,7 +21900,22 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20251
21900
|
const app = createMcpExpressApp();
|
|
20252
21901
|
const port = opts.httpPort ?? config.mcp.http.port;
|
|
20253
21902
|
const endpointPath = opts.httpPath ?? config.mcp.http.path;
|
|
21903
|
+
const isPublic = config.mcp.access === "public";
|
|
21904
|
+
const host = isPublic ? "0.0.0.0" : "127.0.0.1";
|
|
21905
|
+
const apiKey = isPublic ? resolveApiKey(config) : void 0;
|
|
20254
21906
|
app.post(endpointPath, async (req, res) => {
|
|
21907
|
+
if (isPublic && apiKey) {
|
|
21908
|
+
const authHeader = req.headers["authorization"];
|
|
21909
|
+
const provided = (authHeader?.startsWith("Bearer ") ? authHeader.slice(7) : void 0) ?? req.headers["x-api-key"] ?? "";
|
|
21910
|
+
if (!provided || !verifyApiKey(provided, apiKey)) {
|
|
21911
|
+
res.status(401).json({
|
|
21912
|
+
jsonrpc: "2.0",
|
|
21913
|
+
error: { code: -32001, message: "Unauthorized" },
|
|
21914
|
+
id: null
|
|
21915
|
+
});
|
|
21916
|
+
return;
|
|
21917
|
+
}
|
|
21918
|
+
}
|
|
20255
21919
|
const server = serverFactory();
|
|
20256
21920
|
const transport = new StreamableHTTPServerTransport({
|
|
20257
21921
|
sessionIdGenerator: void 0
|
|
@@ -20301,9 +21965,12 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20301
21965
|
);
|
|
20302
21966
|
});
|
|
20303
21967
|
await new Promise((resolve, reject) => {
|
|
20304
|
-
const instance = app.listen(port,
|
|
20305
|
-
process.stderr.write(`SearchSocket MCP HTTP server listening on http
|
|
21968
|
+
const instance = app.listen(port, host, () => {
|
|
21969
|
+
process.stderr.write(`SearchSocket MCP HTTP server listening on http://${host}:${port}${endpointPath}
|
|
20306
21970
|
`);
|
|
21971
|
+
if (isPublic) {
|
|
21972
|
+
process.stderr.write("WARNING: Server is in public mode. Ensure HTTPS is configured via a reverse proxy for production use.\n");
|
|
21973
|
+
}
|
|
20307
21974
|
resolve();
|
|
20308
21975
|
});
|
|
20309
21976
|
instance.once("error", reject);
|
|
@@ -20318,6 +21985,13 @@ async function runMcpServer(options = {}) {
|
|
|
20318
21985
|
cwd: options.cwd,
|
|
20319
21986
|
configPath: options.configPath
|
|
20320
21987
|
});
|
|
21988
|
+
if (options.access) config.mcp.access = options.access;
|
|
21989
|
+
if (options.apiKey) config.mcp.http.apiKey = options.apiKey;
|
|
21990
|
+
if (config.mcp.access === "public" && !resolveApiKey(config)) {
|
|
21991
|
+
throw new Error(
|
|
21992
|
+
'MCP access is "public" but no API key is configured. Pass --api-key or set mcp.http.apiKey / mcp.http.apiKeyEnv in config.'
|
|
21993
|
+
);
|
|
21994
|
+
}
|
|
20321
21995
|
const resolvedTransport = options.transport ?? config.mcp.transport;
|
|
20322
21996
|
if (resolvedTransport === "stdio") {
|
|
20323
21997
|
redirectConsoleToStderr();
|
|
@@ -20335,8 +22009,6 @@ async function runMcpServer(options = {}) {
|
|
|
20335
22009
|
const stdioTransport = new StdioServerTransport();
|
|
20336
22010
|
await server.connect(stdioTransport);
|
|
20337
22011
|
}
|
|
20338
|
-
|
|
20339
|
-
// src/sveltekit/handle.ts
|
|
20340
22012
|
var InMemoryRateLimiter = class {
|
|
20341
22013
|
constructor(windowMs, max) {
|
|
20342
22014
|
this.windowMs = windowMs;
|
|
@@ -20364,7 +22036,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20364
22036
|
let enginePromise = null;
|
|
20365
22037
|
let configPromise = null;
|
|
20366
22038
|
let apiPath = options.path;
|
|
22039
|
+
let llmsServePath = null;
|
|
22040
|
+
let serveMarkdownVariants = false;
|
|
22041
|
+
let mcpPath;
|
|
22042
|
+
let mcpApiKey;
|
|
22043
|
+
let mcpEnableJsonResponse = true;
|
|
20367
22044
|
let rateLimiter = null;
|
|
22045
|
+
let notConfigured = false;
|
|
20368
22046
|
const getConfig = async () => {
|
|
20369
22047
|
if (!configPromise) {
|
|
20370
22048
|
let configP;
|
|
@@ -20381,6 +22059,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20381
22059
|
}
|
|
20382
22060
|
configPromise = configP.then((config) => {
|
|
20383
22061
|
apiPath = apiPath ?? config.api.path;
|
|
22062
|
+
mcpPath = config.mcp.handle.path;
|
|
22063
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
22064
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
22065
|
+
if (config.llmsTxt.enable) {
|
|
22066
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
22067
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
22068
|
+
}
|
|
20384
22069
|
if (config.api.rateLimit && !isServerless()) {
|
|
20385
22070
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20386
22071
|
}
|
|
@@ -20390,59 +22075,109 @@ function searchsocketHandle(options = {}) {
|
|
|
20390
22075
|
return configPromise;
|
|
20391
22076
|
};
|
|
20392
22077
|
const getEngine = async () => {
|
|
22078
|
+
if (notConfigured) {
|
|
22079
|
+
throw new SearchSocketError(
|
|
22080
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22081
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22082
|
+
503
|
|
22083
|
+
);
|
|
22084
|
+
}
|
|
20393
22085
|
if (!enginePromise) {
|
|
20394
22086
|
const config = await getConfig();
|
|
20395
22087
|
enginePromise = SearchEngine.create({
|
|
20396
22088
|
cwd: options.cwd,
|
|
20397
22089
|
config
|
|
22090
|
+
}).catch((error) => {
|
|
22091
|
+
enginePromise = null;
|
|
22092
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
22093
|
+
notConfigured = true;
|
|
22094
|
+
throw new SearchSocketError(
|
|
22095
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22096
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22097
|
+
503
|
|
22098
|
+
);
|
|
22099
|
+
}
|
|
22100
|
+
throw error;
|
|
20398
22101
|
});
|
|
20399
22102
|
}
|
|
20400
22103
|
return enginePromise;
|
|
20401
22104
|
};
|
|
20402
22105
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
20403
22106
|
return async ({ event, resolve }) => {
|
|
20404
|
-
if (apiPath && event.url.pathname !==
|
|
20405
|
-
|
|
22107
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
22108
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
22109
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22110
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22111
|
+
}
|
|
22112
|
+
if (mcpPath) {
|
|
22113
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
22114
|
+
return resolve(event);
|
|
22115
|
+
}
|
|
22116
|
+
} else {
|
|
22117
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
22118
|
+
await getConfig();
|
|
22119
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22120
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22121
|
+
}
|
|
22122
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
22123
|
+
return resolve(event);
|
|
22124
|
+
}
|
|
22125
|
+
} else {
|
|
22126
|
+
return resolve(event);
|
|
22127
|
+
}
|
|
22128
|
+
}
|
|
20406
22129
|
}
|
|
20407
22130
|
const config = await getConfig();
|
|
22131
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
22132
|
+
const cwd = options.cwd ?? process.cwd();
|
|
22133
|
+
const filePath = path.resolve(cwd, config.llmsTxt.outputPath);
|
|
22134
|
+
try {
|
|
22135
|
+
const content = await fs8.readFile(filePath, "utf8");
|
|
22136
|
+
return new Response(content, {
|
|
22137
|
+
status: 200,
|
|
22138
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
22139
|
+
});
|
|
22140
|
+
} catch {
|
|
22141
|
+
return resolve(event);
|
|
22142
|
+
}
|
|
22143
|
+
}
|
|
22144
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
22145
|
+
let rawPath;
|
|
22146
|
+
try {
|
|
22147
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
22148
|
+
} catch {
|
|
22149
|
+
return resolve(event);
|
|
22150
|
+
}
|
|
22151
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
22152
|
+
try {
|
|
22153
|
+
const engine = await getEngine();
|
|
22154
|
+
const page = await engine.getPage(rawPath, scope);
|
|
22155
|
+
return new Response(page.markdown, {
|
|
22156
|
+
status: 200,
|
|
22157
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
22158
|
+
});
|
|
22159
|
+
} catch (error) {
|
|
22160
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
22161
|
+
return resolve(event);
|
|
22162
|
+
}
|
|
22163
|
+
throw error;
|
|
22164
|
+
}
|
|
22165
|
+
}
|
|
22166
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22167
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22168
|
+
}
|
|
20408
22169
|
const targetPath = apiPath ?? config.api.path;
|
|
20409
|
-
if (event.url.pathname
|
|
22170
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
20410
22171
|
return resolve(event);
|
|
20411
22172
|
}
|
|
20412
|
-
|
|
22173
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
22174
|
+
const method = event.request.method;
|
|
22175
|
+
if (method === "OPTIONS") {
|
|
20413
22176
|
return new Response(null, {
|
|
20414
22177
|
status: 204,
|
|
20415
22178
|
headers: buildCorsHeaders(event.request, config)
|
|
20416
22179
|
});
|
|
20417
22180
|
}
|
|
20418
|
-
if (event.request.method !== "POST") {
|
|
20419
|
-
return withCors(
|
|
20420
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
20421
|
-
status: 405,
|
|
20422
|
-
headers: {
|
|
20423
|
-
"content-type": "application/json"
|
|
20424
|
-
}
|
|
20425
|
-
}),
|
|
20426
|
-
event.request,
|
|
20427
|
-
config
|
|
20428
|
-
);
|
|
20429
|
-
}
|
|
20430
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
20431
|
-
if (contentLength > bodyLimit) {
|
|
20432
|
-
return withCors(
|
|
20433
|
-
new Response(
|
|
20434
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
20435
|
-
{
|
|
20436
|
-
status: 413,
|
|
20437
|
-
headers: {
|
|
20438
|
-
"content-type": "application/json"
|
|
20439
|
-
}
|
|
20440
|
-
}
|
|
20441
|
-
),
|
|
20442
|
-
event.request,
|
|
20443
|
-
config
|
|
20444
|
-
);
|
|
20445
|
-
}
|
|
20446
22181
|
if (rateLimiter) {
|
|
20447
22182
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
20448
22183
|
if (!rateLimiter.check(ip)) {
|
|
@@ -20462,39 +22197,32 @@ function searchsocketHandle(options = {}) {
|
|
|
20462
22197
|
}
|
|
20463
22198
|
}
|
|
20464
22199
|
try {
|
|
20465
|
-
|
|
20466
|
-
|
|
20467
|
-
|
|
20468
|
-
} else {
|
|
20469
|
-
let parsedFallback;
|
|
20470
|
-
try {
|
|
20471
|
-
parsedFallback = await event.request.json();
|
|
20472
|
-
} catch (error) {
|
|
20473
|
-
if (error instanceof SyntaxError) {
|
|
20474
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20475
|
-
}
|
|
20476
|
-
throw error;
|
|
22200
|
+
if (method === "GET") {
|
|
22201
|
+
if (subPath === "" || subPath === "/") {
|
|
22202
|
+
return await handleGetSearch(event, config, getEngine);
|
|
20477
22203
|
}
|
|
20478
|
-
|
|
22204
|
+
if (subPath === "/health") {
|
|
22205
|
+
return await handleGetHealth(event, config, getEngine);
|
|
22206
|
+
}
|
|
22207
|
+
if (subPath.startsWith("/pages/")) {
|
|
22208
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
22209
|
+
}
|
|
22210
|
+
return withCors(
|
|
22211
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
22212
|
+
status: 404,
|
|
22213
|
+
headers: { "content-type": "application/json" }
|
|
22214
|
+
}),
|
|
22215
|
+
event.request,
|
|
22216
|
+
config
|
|
22217
|
+
);
|
|
20479
22218
|
}
|
|
20480
|
-
if (
|
|
20481
|
-
|
|
22219
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
22220
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
20482
22221
|
}
|
|
20483
|
-
let body;
|
|
20484
|
-
try {
|
|
20485
|
-
body = JSON.parse(rawBody);
|
|
20486
|
-
} catch {
|
|
20487
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20488
|
-
}
|
|
20489
|
-
const engine = await getEngine();
|
|
20490
|
-
const searchRequest = body;
|
|
20491
|
-
const result = await engine.search(searchRequest);
|
|
20492
22222
|
return withCors(
|
|
20493
|
-
new Response(JSON.stringify(
|
|
20494
|
-
status:
|
|
20495
|
-
headers: {
|
|
20496
|
-
"content-type": "application/json"
|
|
20497
|
-
}
|
|
22223
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
22224
|
+
status: 405,
|
|
22225
|
+
headers: { "content-type": "application/json" }
|
|
20498
22226
|
}),
|
|
20499
22227
|
event.request,
|
|
20500
22228
|
config
|
|
@@ -20515,6 +22243,183 @@ function searchsocketHandle(options = {}) {
|
|
|
20515
22243
|
}
|
|
20516
22244
|
};
|
|
20517
22245
|
}
|
|
22246
|
+
/**
 * Tells whether a request path targets the search API.
 *
 * @param {string} pathname - Request pathname.
 * @param {string} apiPath - Configured API base path.
 * @returns {boolean} True for the base path itself or anything below `apiPath + "/"`.
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  // Require the slash so "/api/searching" does not match "/api/search".
  return pathname.startsWith(`${apiPath}/`);
}
|
|
22249
|
+
/**
 * Handles `GET <apiPath>?q=...` by translating query parameters into a search
 * request and returning the engine's result as JSON (with CORS headers).
 *
 * Recognised parameters: `q` (required), `topK`, `scope`, `pathPrefix`,
 * `groupBy` ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`.
 *
 * @param {object} event - Request event; `event.url.searchParams` and `event.request` are read.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} 200 JSON response carrying the search result.
 * @throws {SearchSocketError} INVALID_REQUEST (400) for a missing `q` or an invalid parameter.
 */
async function handleGetSearch(event, config, getEngine) {
  const params = event.url.searchParams;

  // Strict integer parsing: a bare parseInt would accept "5abc" as 5 and
  // "1.5" as 1, contradicting the "positive integer" contract in the errors.
  const readPositiveInt = (name, message, max = Number.POSITIVE_INFINITY) => {
    const raw = params.get(name);
    if (raw === null) return undefined;
    const text = raw.trim();
    const value = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (Number.isNaN(value) || value < 1 || value > max) {
      throw new SearchSocketError("INVALID_REQUEST", message, 400);
    }
    return value;
  };

  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = readPositiveInt("topK", "topK must be a positive integer");
  if (topK !== undefined) searchRequest.topK = topK;

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  // An empty `groupBy=` is treated the same as an absent one.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = readPositiveInt(
    "maxSubResults",
    "maxSubResults must be a positive integer between 1 and 20",
    20
  );
  if (maxSubResults !== undefined) searchRequest.maxSubResults = maxSubResults;

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
22296
|
+
/**
 * Handles `GET <apiPath>/health`: runs the engine's health check and returns
 * its result as a 200 JSON response wrapped with CORS headers.
 *
 * @param {object} event - Request event; only `event.request` is used (for CORS).
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>}
 */
async function handleGetHealth(event, config, getEngine) {
  const searchEngine = await getEngine();
  const payload = JSON.stringify(await searchEngine.health());
  const response = new Response(payload, {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22308
|
+
/**
 * Handles `GET <apiPath>/pages/<path>`: looks up one indexed page and returns
 * it as a 200 JSON response wrapped with CORS headers.
 *
 * @param {object} event - Request event; `event.url.searchParams` supplies the optional `scope`.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @param {string} subPath - Request path relative to the API base; starts with "/pages/".
 * @returns {Promise<Response>}
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  const decodePagePath = (encoded) => {
    try {
      return decodeURIComponent(encoded);
    } catch {
      throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
    }
  };
  // Drop the "/pages" segment but keep the leading slash of the page path.
  const pagePath = decodePagePath(subPath.slice("/pages".length));
  const scope = event.url.searchParams?.get("scope") ?? undefined;
  const searchEngine = await getEngine();
  const page = await searchEngine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22328
|
+
/**
 * Handles `POST <apiPath>`: reads a JSON search request from the body, runs it
 * through the engine and returns the result as a 200 JSON response with CORS
 * headers.
 *
 * The size limit is enforced twice: once against the declared
 * `content-length` header and once against the byte length of the body that
 * was actually read.
 *
 * @param {object} event - Request event; `event.request` supplies headers and body.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>}
 * @throws {SearchSocketError} INVALID_REQUEST with status 413 (too large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const { request } = event;
  const rejectOversized = () => {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  };
  const rejectMalformed = () => {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
  };

  const declaredLength = Number(request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) rejectOversized();

  let rawBody;
  if (typeof request.text !== "function") {
    // Request objects without text(): parse via json() and re-serialise so the
    // size check and parsing below work on a string either way.
    let parsedFallback;
    try {
      parsedFallback = await request.json();
    } catch (error) {
      if (error instanceof SyntaxError) rejectMalformed();
      throw error;
    }
    rawBody = JSON.stringify(parsedFallback);
  } else {
    rawBody = await request.text();
  }
  if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) rejectOversized();

  let searchRequest;
  try {
    searchRequest = JSON.parse(rawBody);
  } catch {
    rejectMalformed();
  }

  const searchEngine = await getEngine();
  const outcome = await searchEngine.search(searchRequest);
  const response = new Response(JSON.stringify(outcome), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, request, config);
}
|
|
22369
|
+
/**
 * Serves one MCP request on the SvelteKit handle path.
 *
 * When an API key is configured the request must carry it as
 * `Authorization: Bearer <key>`; otherwise a JSON-RPC "Unauthorized" error is
 * returned with HTTP 401 before the engine is touched. Authorised requests get
 * a fresh MCP server and transport, created per request.
 *
 * @param {object} event - Request event; `event.request` is the incoming Request.
 * @param {string|undefined} apiKey - Expected bearer token; falsy disables the check.
 * @param {boolean} enableJsonResponse - Passed to the transport; also decides
 *   whether transport and server are closed right after the request is handled.
 * @param {() => Promise<object>} getEngine - Lazily resolves the search engine.
 * @returns {Promise<Response>} The transport's response, a 401 on failed
 *   authentication, or a 500 JSON-RPC error if anything throws while serving.
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Use the shared digest-based comparison: comparing raw buffers requires a
    // length pre-check, which reveals the key length through response timing.
    if (!token || !verifyApiKey(token, apiKey)) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }
  const transport = new WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      // With JSON responses the reply is complete here, so release both ends now.
      // NOTE(review): in the other mode both stay open, presumably so a streamed
      // response can keep flowing; confirm against the MCP SDK.
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure to close must not mask the original error.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status: 500, headers: { "content-type": "application/json" } }
    );
  }
}
|
|
20518
22423
|
function buildCorsHeaders(request, config) {
|
|
20519
22424
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
20520
22425
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -20527,7 +22432,7 @@ function buildCorsHeaders(request, config) {
|
|
|
20527
22432
|
}
|
|
20528
22433
|
return {
|
|
20529
22434
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
20530
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
22435
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
20531
22436
|
"access-control-allow-headers": "content-type"
|
|
20532
22437
|
};
|
|
20533
22438
|
}
|
|
@@ -20563,9 +22468,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20563
22468
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20564
22469
|
return true;
|
|
20565
22470
|
}
|
|
20566
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20567
|
-
return true;
|
|
20568
|
-
}
|
|
20569
22471
|
return false;
|
|
20570
22472
|
}
|
|
20571
22473
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20590,7 +22492,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20590
22492
|
const pipeline = await IndexPipeline.create({
|
|
20591
22493
|
cwd,
|
|
20592
22494
|
configPath: options.configPath,
|
|
20593
|
-
logger: logger3
|
|
22495
|
+
logger: logger3,
|
|
22496
|
+
hooks: options.hooks
|
|
20594
22497
|
});
|
|
20595
22498
|
const stats = await pipeline.run({
|
|
20596
22499
|
changedOnly: options.changedOnly ?? true,
|