searchsocket 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3308 -524
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2310 -466
- package/dist/index.d.cts +101 -40
- package/dist/index.d.ts +101 -40
- package/dist/index.js +2310 -466
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2342 -465
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2344 -467
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +20 -2
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/index.js
CHANGED
|
@@ -3,18 +3,20 @@ import path from 'path';
|
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import {
|
|
6
|
+
import { FusionAlgorithm, QueryMode } from '@upstash/vector';
|
|
7
|
+
import { timingSafeEqual, createHash } from 'crypto';
|
|
7
8
|
import { load } from 'cheerio';
|
|
8
9
|
import matter from 'gray-matter';
|
|
9
10
|
import fg from 'fast-glob';
|
|
10
11
|
import pLimit from 'p-limit';
|
|
11
|
-
import
|
|
12
|
+
import fs8 from 'fs/promises';
|
|
12
13
|
import net from 'net';
|
|
13
14
|
import { gunzipSync } from 'zlib';
|
|
14
15
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
15
16
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
16
17
|
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
|
|
17
18
|
import { createMcpExpressApp } from '@modelcontextprotocol/sdk/server/express.js';
|
|
19
|
+
import { WebStandardStreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js';
|
|
18
20
|
|
|
19
21
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
20
22
|
var __commonJS = (cb, mod) => function __require() {
|
|
@@ -5013,32 +5015,32 @@ var require_URL = __commonJS({
|
|
|
5013
5015
|
else
|
|
5014
5016
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5015
5017
|
}
|
|
5016
|
-
function remove_dot_segments(
|
|
5017
|
-
if (!
|
|
5018
|
+
function remove_dot_segments(path14) {
|
|
5019
|
+
if (!path14) return path14;
|
|
5018
5020
|
var output = "";
|
|
5019
|
-
while (
|
|
5020
|
-
if (
|
|
5021
|
-
|
|
5021
|
+
while (path14.length > 0) {
|
|
5022
|
+
if (path14 === "." || path14 === "..") {
|
|
5023
|
+
path14 = "";
|
|
5022
5024
|
break;
|
|
5023
5025
|
}
|
|
5024
|
-
var twochars =
|
|
5025
|
-
var threechars =
|
|
5026
|
-
var fourchars =
|
|
5026
|
+
var twochars = path14.substring(0, 2);
|
|
5027
|
+
var threechars = path14.substring(0, 3);
|
|
5028
|
+
var fourchars = path14.substring(0, 4);
|
|
5027
5029
|
if (threechars === "../") {
|
|
5028
|
-
|
|
5030
|
+
path14 = path14.substring(3);
|
|
5029
5031
|
} else if (twochars === "./") {
|
|
5030
|
-
|
|
5032
|
+
path14 = path14.substring(2);
|
|
5031
5033
|
} else if (threechars === "/./") {
|
|
5032
|
-
|
|
5033
|
-
} else if (twochars === "/." &&
|
|
5034
|
-
|
|
5035
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5036
|
-
|
|
5034
|
+
path14 = "/" + path14.substring(3);
|
|
5035
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5036
|
+
path14 = "/";
|
|
5037
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5038
|
+
path14 = "/" + path14.substring(4);
|
|
5037
5039
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5038
5040
|
} else {
|
|
5039
|
-
var segment =
|
|
5041
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5040
5042
|
output += segment;
|
|
5041
|
-
|
|
5043
|
+
path14 = path14.substring(segment.length);
|
|
5042
5044
|
}
|
|
5043
5045
|
}
|
|
5044
5046
|
return output;
|
|
@@ -16634,6 +16636,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16634
16636
|
dropSelectors: z.array(z.string()).optional(),
|
|
16635
16637
|
ignoreAttr: z.string().optional(),
|
|
16636
16638
|
noindexAttr: z.string().optional(),
|
|
16639
|
+
imageDescAttr: z.string().optional(),
|
|
16637
16640
|
respectRobotsNoindex: z.boolean().optional()
|
|
16638
16641
|
}).optional(),
|
|
16639
16642
|
transform: z.object({
|
|
@@ -16649,35 +16652,48 @@ var searchSocketConfigSchema = z.object({
|
|
|
16649
16652
|
headingPathDepth: z.number().int().positive().optional(),
|
|
16650
16653
|
dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
|
|
16651
16654
|
prependTitle: z.boolean().optional(),
|
|
16652
|
-
pageSummaryChunk: z.boolean().optional()
|
|
16655
|
+
pageSummaryChunk: z.boolean().optional(),
|
|
16656
|
+
weightHeadings: z.boolean().optional()
|
|
16653
16657
|
}).optional(),
|
|
16654
16658
|
upstash: z.object({
|
|
16655
16659
|
url: z.string().url().optional(),
|
|
16656
16660
|
token: z.string().min(1).optional(),
|
|
16657
16661
|
urlEnv: z.string().min(1).optional(),
|
|
16658
|
-
tokenEnv: z.string().min(1).optional()
|
|
16662
|
+
tokenEnv: z.string().min(1).optional(),
|
|
16663
|
+
namespaces: z.object({
|
|
16664
|
+
pages: z.string().min(1).optional(),
|
|
16665
|
+
chunks: z.string().min(1).optional()
|
|
16666
|
+
}).optional()
|
|
16667
|
+
}).optional(),
|
|
16668
|
+
embedding: z.object({
|
|
16669
|
+
model: z.string().optional(),
|
|
16670
|
+
dimensions: z.number().int().positive().optional(),
|
|
16671
|
+
taskType: z.string().optional(),
|
|
16672
|
+
batchSize: z.number().int().positive().optional()
|
|
16659
16673
|
}).optional(),
|
|
16660
16674
|
search: z.object({
|
|
16661
|
-
semanticWeight: z.number().min(0).max(1).optional(),
|
|
16662
|
-
inputEnrichment: z.boolean().optional(),
|
|
16663
|
-
reranking: z.boolean().optional(),
|
|
16664
16675
|
dualSearch: z.boolean().optional(),
|
|
16665
16676
|
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
16666
16677
|
}).optional(),
|
|
16667
16678
|
ranking: z.object({
|
|
16668
16679
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16669
16680
|
enableDepthBoost: z.boolean().optional(),
|
|
16681
|
+
enableFreshnessBoost: z.boolean().optional(),
|
|
16682
|
+
freshnessDecayRate: z.number().positive().optional(),
|
|
16683
|
+
enableAnchorTextBoost: z.boolean().optional(),
|
|
16670
16684
|
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16671
16685
|
aggregationCap: z.number().int().positive().optional(),
|
|
16672
16686
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16673
16687
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16674
|
-
|
|
16688
|
+
minScoreRatio: z.number().min(0).max(1).optional(),
|
|
16675
16689
|
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
16676
16690
|
weights: z.object({
|
|
16677
16691
|
incomingLinks: z.number().optional(),
|
|
16678
16692
|
depth: z.number().optional(),
|
|
16679
16693
|
aggregation: z.number().optional(),
|
|
16680
|
-
titleMatch: z.number().optional()
|
|
16694
|
+
titleMatch: z.number().optional(),
|
|
16695
|
+
freshness: z.number().optional(),
|
|
16696
|
+
anchorText: z.number().optional()
|
|
16681
16697
|
}).optional()
|
|
16682
16698
|
}).optional(),
|
|
16683
16699
|
api: z.object({
|
|
@@ -16692,12 +16708,28 @@ var searchSocketConfigSchema = z.object({
|
|
|
16692
16708
|
}).optional(),
|
|
16693
16709
|
mcp: z.object({
|
|
16694
16710
|
enable: z.boolean().optional(),
|
|
16711
|
+
access: z.enum(["public", "private"]).optional(),
|
|
16695
16712
|
transport: z.enum(["stdio", "http"]).optional(),
|
|
16696
16713
|
http: z.object({
|
|
16697
16714
|
port: z.number().int().positive().optional(),
|
|
16698
|
-
path: z.string().optional()
|
|
16715
|
+
path: z.string().optional(),
|
|
16716
|
+
apiKey: z.string().min(1).optional(),
|
|
16717
|
+
apiKeyEnv: z.string().min(1).optional()
|
|
16718
|
+
}).optional(),
|
|
16719
|
+
handle: z.object({
|
|
16720
|
+
path: z.string().optional(),
|
|
16721
|
+
apiKey: z.string().min(1).optional(),
|
|
16722
|
+
enableJsonResponse: z.boolean().optional()
|
|
16699
16723
|
}).optional()
|
|
16700
16724
|
}).optional(),
|
|
16725
|
+
llmsTxt: z.object({
|
|
16726
|
+
enable: z.boolean().optional(),
|
|
16727
|
+
outputPath: z.string().optional(),
|
|
16728
|
+
title: z.string().optional(),
|
|
16729
|
+
description: z.string().optional(),
|
|
16730
|
+
generateFull: z.boolean().optional(),
|
|
16731
|
+
serveMarkdownVariants: z.boolean().optional()
|
|
16732
|
+
}).optional(),
|
|
16701
16733
|
state: z.object({
|
|
16702
16734
|
dir: z.string().optional()
|
|
16703
16735
|
}).optional()
|
|
@@ -16736,6 +16768,7 @@ function createDefaultConfig(projectId) {
|
|
|
16736
16768
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16737
16769
|
ignoreAttr: "data-search-ignore",
|
|
16738
16770
|
noindexAttr: "data-search-noindex",
|
|
16771
|
+
imageDescAttr: "data-search-description",
|
|
16739
16772
|
respectRobotsNoindex: true
|
|
16740
16773
|
},
|
|
16741
16774
|
transform: {
|
|
@@ -16745,39 +16778,52 @@ function createDefaultConfig(projectId) {
|
|
|
16745
16778
|
},
|
|
16746
16779
|
chunking: {
|
|
16747
16780
|
strategy: "hybrid",
|
|
16748
|
-
maxChars:
|
|
16781
|
+
maxChars: 1500,
|
|
16749
16782
|
overlapChars: 200,
|
|
16750
16783
|
minChars: 250,
|
|
16751
16784
|
headingPathDepth: 3,
|
|
16752
16785
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16753
16786
|
prependTitle: true,
|
|
16754
|
-
pageSummaryChunk: true
|
|
16787
|
+
pageSummaryChunk: true,
|
|
16788
|
+
weightHeadings: true
|
|
16755
16789
|
},
|
|
16756
16790
|
upstash: {
|
|
16757
|
-
urlEnv: "
|
|
16758
|
-
tokenEnv: "
|
|
16791
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16792
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16793
|
+
namespaces: {
|
|
16794
|
+
pages: "pages",
|
|
16795
|
+
chunks: "chunks"
|
|
16796
|
+
}
|
|
16797
|
+
},
|
|
16798
|
+
embedding: {
|
|
16799
|
+
model: "bge-large-en-v1.5",
|
|
16800
|
+
dimensions: 1024,
|
|
16801
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16802
|
+
batchSize: 100
|
|
16759
16803
|
},
|
|
16760
16804
|
search: {
|
|
16761
|
-
semanticWeight: 0.75,
|
|
16762
|
-
inputEnrichment: true,
|
|
16763
|
-
reranking: true,
|
|
16764
16805
|
dualSearch: true,
|
|
16765
16806
|
pageSearchWeight: 0.3
|
|
16766
16807
|
},
|
|
16767
16808
|
ranking: {
|
|
16768
16809
|
enableIncomingLinkBoost: true,
|
|
16769
16810
|
enableDepthBoost: true,
|
|
16811
|
+
enableFreshnessBoost: false,
|
|
16812
|
+
freshnessDecayRate: 1e-3,
|
|
16813
|
+
enableAnchorTextBoost: false,
|
|
16770
16814
|
pageWeights: {},
|
|
16771
16815
|
aggregationCap: 5,
|
|
16772
16816
|
aggregationDecay: 0.5,
|
|
16773
16817
|
minChunkScoreRatio: 0.5,
|
|
16774
|
-
|
|
16818
|
+
minScoreRatio: 0.7,
|
|
16775
16819
|
scoreGapThreshold: 0.4,
|
|
16776
16820
|
weights: {
|
|
16777
16821
|
incomingLinks: 0.05,
|
|
16778
16822
|
depth: 0.03,
|
|
16779
16823
|
aggregation: 0.1,
|
|
16780
|
-
titleMatch: 0.15
|
|
16824
|
+
titleMatch: 0.15,
|
|
16825
|
+
freshness: 0.1,
|
|
16826
|
+
anchorText: 0.1
|
|
16781
16827
|
}
|
|
16782
16828
|
},
|
|
16783
16829
|
api: {
|
|
@@ -16788,12 +16834,23 @@ function createDefaultConfig(projectId) {
|
|
|
16788
16834
|
},
|
|
16789
16835
|
mcp: {
|
|
16790
16836
|
enable: process.env.NODE_ENV !== "production",
|
|
16837
|
+
access: "private",
|
|
16791
16838
|
transport: "stdio",
|
|
16792
16839
|
http: {
|
|
16793
16840
|
port: 3338,
|
|
16794
16841
|
path: "/mcp"
|
|
16842
|
+
},
|
|
16843
|
+
handle: {
|
|
16844
|
+
path: "/api/mcp",
|
|
16845
|
+
enableJsonResponse: true
|
|
16795
16846
|
}
|
|
16796
16847
|
},
|
|
16848
|
+
llmsTxt: {
|
|
16849
|
+
enable: false,
|
|
16850
|
+
outputPath: "static/llms.txt",
|
|
16851
|
+
generateFull: true,
|
|
16852
|
+
serveMarkdownVariants: false
|
|
16853
|
+
},
|
|
16797
16854
|
state: {
|
|
16798
16855
|
dir: ".searchsocket"
|
|
16799
16856
|
}
|
|
@@ -16921,7 +16978,15 @@ ${issues}`
|
|
|
16921
16978
|
},
|
|
16922
16979
|
upstash: {
|
|
16923
16980
|
...defaults.upstash,
|
|
16924
|
-
...parsed.upstash
|
|
16981
|
+
...parsed.upstash,
|
|
16982
|
+
namespaces: {
|
|
16983
|
+
...defaults.upstash.namespaces,
|
|
16984
|
+
...parsed.upstash?.namespaces
|
|
16985
|
+
}
|
|
16986
|
+
},
|
|
16987
|
+
embedding: {
|
|
16988
|
+
...defaults.embedding,
|
|
16989
|
+
...parsed.embedding
|
|
16925
16990
|
},
|
|
16926
16991
|
search: {
|
|
16927
16992
|
...defaults.search,
|
|
@@ -16958,8 +17023,16 @@ ${issues}`
|
|
|
16958
17023
|
http: {
|
|
16959
17024
|
...defaults.mcp.http,
|
|
16960
17025
|
...parsed.mcp?.http
|
|
17026
|
+
},
|
|
17027
|
+
handle: {
|
|
17028
|
+
...defaults.mcp.handle,
|
|
17029
|
+
...parsed.mcp?.handle
|
|
16961
17030
|
}
|
|
16962
17031
|
},
|
|
17032
|
+
llmsTxt: {
|
|
17033
|
+
...defaults.llmsTxt,
|
|
17034
|
+
...parsed.llmsTxt
|
|
17035
|
+
},
|
|
16963
17036
|
state: {
|
|
16964
17037
|
...defaults.state,
|
|
16965
17038
|
...parsed.state
|
|
@@ -16979,6 +17052,15 @@ ${issues}`
|
|
|
16979
17052
|
maxDepth: 10
|
|
16980
17053
|
};
|
|
16981
17054
|
}
|
|
17055
|
+
if (merged.mcp.access === "public") {
|
|
17056
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17057
|
+
if (!resolvedKey) {
|
|
17058
|
+
throw new SearchSocketError(
|
|
17059
|
+
"CONFIG_MISSING",
|
|
17060
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17061
|
+
);
|
|
17062
|
+
}
|
|
17063
|
+
}
|
|
16982
17064
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16983
17065
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16984
17066
|
}
|
|
@@ -17042,13 +17124,84 @@ function normalizeMarkdown(input) {
|
|
|
17042
17124
|
function sanitizeScopeName(scopeName) {
|
|
17043
17125
|
return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
|
|
17044
17126
|
}
|
|
17127
|
+
function markdownToPlain(markdown) {
|
|
17128
|
+
return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
|
|
17129
|
+
}
|
|
17045
17130
|
function toSnippet(markdown, maxLen = 220) {
|
|
17046
|
-
const plain = markdown
|
|
17131
|
+
const plain = markdownToPlain(markdown);
|
|
17047
17132
|
if (plain.length <= maxLen) {
|
|
17048
17133
|
return plain;
|
|
17049
17134
|
}
|
|
17050
17135
|
return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
|
|
17051
17136
|
}
|
|
17137
|
+
function queryAwareExcerpt(markdown, query, maxLen = 220) {
|
|
17138
|
+
const plain = markdownToPlain(markdown);
|
|
17139
|
+
if (plain.length <= maxLen) return plain;
|
|
17140
|
+
const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
|
|
17141
|
+
if (tokens.length === 0) return toSnippet(markdown, maxLen);
|
|
17142
|
+
const positions = [];
|
|
17143
|
+
for (let ti = 0; ti < tokens.length; ti++) {
|
|
17144
|
+
const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
17145
|
+
const re = new RegExp(escaped, "gi");
|
|
17146
|
+
let m;
|
|
17147
|
+
while ((m = re.exec(plain)) !== null) {
|
|
17148
|
+
positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
|
|
17149
|
+
}
|
|
17150
|
+
}
|
|
17151
|
+
if (positions.length === 0) return toSnippet(markdown, maxLen);
|
|
17152
|
+
positions.sort((a, b) => a.start - b.start);
|
|
17153
|
+
let bestUniqueCount = 0;
|
|
17154
|
+
let bestTotalCount = 0;
|
|
17155
|
+
let bestLeft = 0;
|
|
17156
|
+
let bestRight = 0;
|
|
17157
|
+
let left = 0;
|
|
17158
|
+
const tokenCounts = /* @__PURE__ */ new Map();
|
|
17159
|
+
for (let right = 0; right < positions.length; right++) {
|
|
17160
|
+
tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
|
|
17161
|
+
while (positions[right].end - positions[left].start > maxLen && left < right) {
|
|
17162
|
+
const leftToken = positions[left].tokenIdx;
|
|
17163
|
+
const cnt = tokenCounts.get(leftToken) - 1;
|
|
17164
|
+
if (cnt === 0) tokenCounts.delete(leftToken);
|
|
17165
|
+
else tokenCounts.set(leftToken, cnt);
|
|
17166
|
+
left++;
|
|
17167
|
+
}
|
|
17168
|
+
const uniqueCount = tokenCounts.size;
|
|
17169
|
+
const totalCount = right - left + 1;
|
|
17170
|
+
if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
|
|
17171
|
+
bestUniqueCount = uniqueCount;
|
|
17172
|
+
bestTotalCount = totalCount;
|
|
17173
|
+
bestLeft = left;
|
|
17174
|
+
bestRight = right;
|
|
17175
|
+
}
|
|
17176
|
+
}
|
|
17177
|
+
const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
|
|
17178
|
+
let start = Math.max(0, mid - Math.floor(maxLen / 2));
|
|
17179
|
+
let end = Math.min(plain.length, start + maxLen);
|
|
17180
|
+
start = Math.max(0, end - maxLen);
|
|
17181
|
+
if (start > 0) {
|
|
17182
|
+
const spaceIdx = plain.lastIndexOf(" ", start);
|
|
17183
|
+
if (spaceIdx > start - 30) {
|
|
17184
|
+
start = spaceIdx + 1;
|
|
17185
|
+
}
|
|
17186
|
+
}
|
|
17187
|
+
if (end < plain.length) {
|
|
17188
|
+
const spaceIdx = plain.indexOf(" ", end);
|
|
17189
|
+
if (spaceIdx !== -1 && spaceIdx < end + 30) {
|
|
17190
|
+
end = spaceIdx;
|
|
17191
|
+
}
|
|
17192
|
+
}
|
|
17193
|
+
let excerpt = plain.slice(start, end);
|
|
17194
|
+
if (excerpt.length > Math.ceil(maxLen * 1.2)) {
|
|
17195
|
+
excerpt = excerpt.slice(0, maxLen);
|
|
17196
|
+
const lastSpace = excerpt.lastIndexOf(" ");
|
|
17197
|
+
if (lastSpace > maxLen * 0.5) {
|
|
17198
|
+
excerpt = excerpt.slice(0, lastSpace);
|
|
17199
|
+
}
|
|
17200
|
+
}
|
|
17201
|
+
const prefix = start > 0 ? "\u2026" : "";
|
|
17202
|
+
const suffix = end < plain.length ? "\u2026" : "";
|
|
17203
|
+
return `${prefix}${excerpt}${suffix}`;
|
|
17204
|
+
}
|
|
17052
17205
|
function extractFirstParagraph(markdown) {
|
|
17053
17206
|
const lines = markdown.split("\n");
|
|
17054
17207
|
let inFence = false;
|
|
@@ -17109,162 +17262,288 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
17109
17262
|
fs.mkdirSync(statePath, { recursive: true });
|
|
17110
17263
|
return { statePath };
|
|
17111
17264
|
}
|
|
17112
|
-
|
|
17113
|
-
// src/vector/upstash.ts
|
|
17114
|
-
function chunkIndexName(scope) {
|
|
17115
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17116
|
-
}
|
|
17117
|
-
function pageIndexName(scope) {
|
|
17118
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17119
|
-
}
|
|
17120
17265
|
var UpstashSearchStore = class {
|
|
17121
|
-
|
|
17266
|
+
index;
|
|
17267
|
+
pagesNs;
|
|
17268
|
+
chunksNs;
|
|
17122
17269
|
constructor(opts) {
|
|
17123
|
-
this.
|
|
17124
|
-
|
|
17125
|
-
|
|
17126
|
-
return this.client.index(chunkIndexName(scope));
|
|
17127
|
-
}
|
|
17128
|
-
pageIndex(scope) {
|
|
17129
|
-
return this.client.index(pageIndexName(scope));
|
|
17270
|
+
this.index = opts.index;
|
|
17271
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17272
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17130
17273
|
}
|
|
17131
17274
|
async upsertChunks(chunks, scope) {
|
|
17132
17275
|
if (chunks.length === 0) return;
|
|
17133
|
-
const index = this.chunkIndex(scope);
|
|
17134
17276
|
const BATCH_SIZE = 100;
|
|
17135
17277
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17136
17278
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17137
|
-
await
|
|
17138
|
-
|
|
17139
|
-
|
|
17140
|
-
|
|
17141
|
-
|
|
17142
|
-
|
|
17143
|
-
|
|
17144
|
-
|
|
17145
|
-
|
|
17146
|
-
|
|
17147
|
-
|
|
17148
|
-
|
|
17279
|
+
await this.chunksNs.upsert(
|
|
17280
|
+
batch.map((c) => ({
|
|
17281
|
+
id: c.id,
|
|
17282
|
+
data: c.data,
|
|
17283
|
+
metadata: {
|
|
17284
|
+
...c.metadata,
|
|
17285
|
+
projectId: scope.projectId,
|
|
17286
|
+
scopeName: scope.scopeName,
|
|
17287
|
+
type: c.metadata.type || "chunk"
|
|
17288
|
+
}
|
|
17289
|
+
}))
|
|
17290
|
+
);
|
|
17291
|
+
}
|
|
17292
|
+
}
|
|
17293
|
+
async search(data, opts, scope) {
|
|
17294
|
+
const filterParts = [
|
|
17295
|
+
`projectId = '${scope.projectId}'`,
|
|
17296
|
+
`scopeName = '${scope.scopeName}'`
|
|
17297
|
+
];
|
|
17298
|
+
if (opts.filter) {
|
|
17299
|
+
filterParts.push(opts.filter);
|
|
17300
|
+
}
|
|
17301
|
+
const results = await this.chunksNs.query({
|
|
17302
|
+
data,
|
|
17303
|
+
topK: opts.limit,
|
|
17304
|
+
includeMetadata: true,
|
|
17305
|
+
filter: filterParts.join(" AND "),
|
|
17306
|
+
queryMode: QueryMode.HYBRID,
|
|
17307
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17308
|
+
});
|
|
17309
|
+
return results.map((doc) => ({
|
|
17310
|
+
id: String(doc.id),
|
|
17311
|
+
score: doc.score,
|
|
17312
|
+
metadata: {
|
|
17313
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17314
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17315
|
+
url: doc.metadata?.url ?? "",
|
|
17316
|
+
path: doc.metadata?.path ?? "",
|
|
17317
|
+
title: doc.metadata?.title ?? "",
|
|
17318
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17319
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17320
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17321
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17322
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17323
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17324
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17325
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17326
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17327
|
+
tags: doc.metadata?.tags ?? [],
|
|
17328
|
+
description: doc.metadata?.description || void 0,
|
|
17329
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17330
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17331
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17332
|
+
}
|
|
17333
|
+
}));
|
|
17334
|
+
}
|
|
17335
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17336
|
+
const filterParts = [
|
|
17337
|
+
`projectId = '${scope.projectId}'`,
|
|
17338
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17339
|
+
`url = '${url}'`
|
|
17340
|
+
];
|
|
17341
|
+
if (opts.filter) {
|
|
17342
|
+
filterParts.push(opts.filter);
|
|
17343
|
+
}
|
|
17344
|
+
const results = await this.chunksNs.query({
|
|
17345
|
+
data,
|
|
17346
|
+
topK: opts.limit,
|
|
17347
|
+
includeMetadata: true,
|
|
17348
|
+
filter: filterParts.join(" AND "),
|
|
17349
|
+
queryMode: QueryMode.HYBRID,
|
|
17350
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17149
17351
|
});
|
|
17150
17352
|
return results.map((doc) => ({
|
|
17151
|
-
id: doc.id,
|
|
17353
|
+
id: String(doc.id),
|
|
17152
17354
|
score: doc.score,
|
|
17153
17355
|
metadata: {
|
|
17154
17356
|
projectId: doc.metadata?.projectId ?? "",
|
|
17155
17357
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17156
|
-
url: doc.
|
|
17358
|
+
url: doc.metadata?.url ?? "",
|
|
17157
17359
|
path: doc.metadata?.path ?? "",
|
|
17158
|
-
title: doc.
|
|
17159
|
-
sectionTitle: doc.
|
|
17160
|
-
headingPath: doc.
|
|
17360
|
+
title: doc.metadata?.title ?? "",
|
|
17361
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17362
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17161
17363
|
snippet: doc.metadata?.snippet ?? "",
|
|
17162
|
-
chunkText: doc.
|
|
17364
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17163
17365
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17164
17366
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17165
17367
|
depth: doc.metadata?.depth ?? 0,
|
|
17166
17368
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17167
17369
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17168
|
-
tags: doc.
|
|
17370
|
+
tags: doc.metadata?.tags ?? [],
|
|
17169
17371
|
description: doc.metadata?.description || void 0,
|
|
17170
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17372
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17373
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17374
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17171
17375
|
}
|
|
17172
17376
|
}));
|
|
17173
17377
|
}
|
|
17174
|
-
async
|
|
17175
|
-
|
|
17378
|
+
async searchPagesByText(data, opts, scope) {
|
|
17379
|
+
return this.queryPages({ data }, opts, scope);
|
|
17380
|
+
}
|
|
17381
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17382
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17383
|
+
}
|
|
17384
|
+
async queryPages(input, opts, scope) {
|
|
17385
|
+
const filterParts = [
|
|
17386
|
+
`projectId = '${scope.projectId}'`,
|
|
17387
|
+
`scopeName = '${scope.scopeName}'`
|
|
17388
|
+
];
|
|
17389
|
+
if (opts.filter) {
|
|
17390
|
+
filterParts.push(opts.filter);
|
|
17391
|
+
}
|
|
17176
17392
|
let results;
|
|
17177
17393
|
try {
|
|
17178
|
-
results = await
|
|
17179
|
-
|
|
17180
|
-
|
|
17181
|
-
|
|
17182
|
-
|
|
17183
|
-
|
|
17184
|
-
|
|
17394
|
+
results = await this.pagesNs.query({
|
|
17395
|
+
...input,
|
|
17396
|
+
topK: opts.limit,
|
|
17397
|
+
includeMetadata: true,
|
|
17398
|
+
filter: filterParts.join(" AND "),
|
|
17399
|
+
queryMode: QueryMode.HYBRID,
|
|
17400
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17185
17401
|
});
|
|
17186
17402
|
} catch {
|
|
17187
17403
|
return [];
|
|
17188
17404
|
}
|
|
17189
17405
|
return results.map((doc) => ({
|
|
17190
|
-
id: doc.id,
|
|
17406
|
+
id: String(doc.id),
|
|
17191
17407
|
score: doc.score,
|
|
17192
|
-
title: doc.
|
|
17193
|
-
url: doc.
|
|
17194
|
-
description: doc.
|
|
17195
|
-
tags: doc.
|
|
17408
|
+
title: doc.metadata?.title ?? "",
|
|
17409
|
+
url: doc.metadata?.url ?? "",
|
|
17410
|
+
description: doc.metadata?.description ?? "",
|
|
17411
|
+
tags: doc.metadata?.tags ?? [],
|
|
17196
17412
|
depth: doc.metadata?.depth ?? 0,
|
|
17197
17413
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17198
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17414
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17415
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17199
17416
|
}));
|
|
17200
17417
|
}
|
|
17201
|
-
async deleteByIds(ids,
|
|
17418
|
+
async deleteByIds(ids, _scope) {
|
|
17202
17419
|
if (ids.length === 0) return;
|
|
17203
|
-
const
|
|
17204
|
-
const BATCH_SIZE = 500;
|
|
17420
|
+
const BATCH_SIZE = 100;
|
|
17205
17421
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17206
17422
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17207
|
-
await
|
|
17423
|
+
await this.chunksNs.delete(batch);
|
|
17208
17424
|
}
|
|
17209
17425
|
}
|
|
17210
17426
|
async deleteScope(scope) {
|
|
17211
|
-
|
|
17212
|
-
const
|
|
17213
|
-
|
|
17214
|
-
|
|
17215
|
-
|
|
17216
|
-
|
|
17217
|
-
|
|
17218
|
-
|
|
17219
|
-
|
|
17427
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17428
|
+
const ids = [];
|
|
17429
|
+
let cursor = "0";
|
|
17430
|
+
try {
|
|
17431
|
+
for (; ; ) {
|
|
17432
|
+
const result = await ns.range({
|
|
17433
|
+
cursor,
|
|
17434
|
+
limit: 100,
|
|
17435
|
+
includeMetadata: true
|
|
17436
|
+
});
|
|
17437
|
+
for (const doc of result.vectors) {
|
|
17438
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17439
|
+
ids.push(String(doc.id));
|
|
17440
|
+
}
|
|
17441
|
+
}
|
|
17442
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17443
|
+
cursor = result.nextCursor;
|
|
17444
|
+
}
|
|
17445
|
+
} catch {
|
|
17446
|
+
}
|
|
17447
|
+
if (ids.length > 0) {
|
|
17448
|
+
const BATCH_SIZE = 100;
|
|
17449
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17450
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17451
|
+
await ns.delete(batch);
|
|
17452
|
+
}
|
|
17453
|
+
}
|
|
17220
17454
|
}
|
|
17221
17455
|
}
|
|
17222
17456
|
async listScopes(projectId) {
|
|
17223
|
-
const
|
|
17224
|
-
const
|
|
17225
|
-
|
|
17226
|
-
for (const name of allIndexes) {
|
|
17227
|
-
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
17228
|
-
const scopeName = name.slice(prefix.length);
|
|
17229
|
-
scopeNames.add(scopeName);
|
|
17230
|
-
}
|
|
17231
|
-
}
|
|
17232
|
-
const scopes = [];
|
|
17233
|
-
for (const scopeName of scopeNames) {
|
|
17234
|
-
const scope = {
|
|
17235
|
-
projectId,
|
|
17236
|
-
scopeName,
|
|
17237
|
-
scopeId: `${projectId}:${scopeName}`
|
|
17238
|
-
};
|
|
17457
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17458
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17459
|
+
let cursor = "0";
|
|
17239
17460
|
try {
|
|
17240
|
-
|
|
17241
|
-
|
|
17242
|
-
|
|
17243
|
-
|
|
17244
|
-
|
|
17245
|
-
|
|
17246
|
-
|
|
17461
|
+
for (; ; ) {
|
|
17462
|
+
const result = await ns.range({
|
|
17463
|
+
cursor,
|
|
17464
|
+
limit: 100,
|
|
17465
|
+
includeMetadata: true
|
|
17466
|
+
});
|
|
17467
|
+
for (const doc of result.vectors) {
|
|
17468
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17469
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17470
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17471
|
+
}
|
|
17472
|
+
}
|
|
17473
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17474
|
+
cursor = result.nextCursor;
|
|
17475
|
+
}
|
|
17247
17476
|
} catch {
|
|
17248
|
-
scopes.push({
|
|
17249
|
-
projectId,
|
|
17250
|
-
scopeName,
|
|
17251
|
-
lastIndexedAt: "unknown",
|
|
17252
|
-
documentCount: 0
|
|
17253
|
-
});
|
|
17254
17477
|
}
|
|
17255
17478
|
}
|
|
17256
|
-
return
|
|
17479
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17480
|
+
projectId,
|
|
17481
|
+
scopeName,
|
|
17482
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17483
|
+
documentCount: count
|
|
17484
|
+
}));
|
|
17257
17485
|
}
|
|
17258
17486
|
async getContentHashes(scope) {
|
|
17259
17487
|
const map = /* @__PURE__ */ new Map();
|
|
17260
|
-
const index = this.chunkIndex(scope);
|
|
17261
17488
|
let cursor = "0";
|
|
17262
17489
|
try {
|
|
17263
17490
|
for (; ; ) {
|
|
17264
|
-
const result = await
|
|
17265
|
-
|
|
17266
|
-
|
|
17267
|
-
|
|
17491
|
+
const result = await this.chunksNs.range({
|
|
17492
|
+
cursor,
|
|
17493
|
+
limit: 100,
|
|
17494
|
+
includeMetadata: true
|
|
17495
|
+
});
|
|
17496
|
+
for (const doc of result.vectors) {
|
|
17497
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17498
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17499
|
+
}
|
|
17500
|
+
}
|
|
17501
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17502
|
+
cursor = result.nextCursor;
|
|
17503
|
+
}
|
|
17504
|
+
} catch {
|
|
17505
|
+
}
|
|
17506
|
+
return map;
|
|
17507
|
+
}
|
|
17508
|
+
async listPages(scope, opts) {
|
|
17509
|
+
const cursor = opts?.cursor ?? "0";
|
|
17510
|
+
const limit = opts?.limit ?? 50;
|
|
17511
|
+
try {
|
|
17512
|
+
const result = await this.pagesNs.range({
|
|
17513
|
+
cursor,
|
|
17514
|
+
limit,
|
|
17515
|
+
includeMetadata: true
|
|
17516
|
+
});
|
|
17517
|
+
const pages = result.vectors.filter(
|
|
17518
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17519
|
+
).map((doc) => ({
|
|
17520
|
+
url: doc.metadata?.url ?? "",
|
|
17521
|
+
title: doc.metadata?.title ?? "",
|
|
17522
|
+
description: doc.metadata?.description ?? "",
|
|
17523
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17524
|
+
}));
|
|
17525
|
+
const response = { pages };
|
|
17526
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17527
|
+
response.nextCursor = result.nextCursor;
|
|
17528
|
+
}
|
|
17529
|
+
return response;
|
|
17530
|
+
} catch {
|
|
17531
|
+
return { pages: [] };
|
|
17532
|
+
}
|
|
17533
|
+
}
|
|
17534
|
+
async getPageHashes(scope) {
|
|
17535
|
+
const map = /* @__PURE__ */ new Map();
|
|
17536
|
+
let cursor = "0";
|
|
17537
|
+
try {
|
|
17538
|
+
for (; ; ) {
|
|
17539
|
+
const result = await this.pagesNs.range({
|
|
17540
|
+
cursor,
|
|
17541
|
+
limit: 100,
|
|
17542
|
+
includeMetadata: true
|
|
17543
|
+
});
|
|
17544
|
+
for (const doc of result.vectors) {
|
|
17545
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17546
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17268
17547
|
}
|
|
17269
17548
|
}
|
|
17270
17549
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17274,47 +17553,43 @@ var UpstashSearchStore = class {
|
|
|
17274
17553
|
}
|
|
17275
17554
|
return map;
|
|
17276
17555
|
}
|
|
17556
|
+
async deletePagesByIds(ids, _scope) {
|
|
17557
|
+
if (ids.length === 0) return;
|
|
17558
|
+
const BATCH_SIZE = 50;
|
|
17559
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17560
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17561
|
+
await this.pagesNs.delete(batch);
|
|
17562
|
+
}
|
|
17563
|
+
}
|
|
17277
17564
|
async upsertPages(pages, scope) {
|
|
17278
17565
|
if (pages.length === 0) return;
|
|
17279
|
-
const index = this.pageIndex(scope);
|
|
17280
17566
|
const BATCH_SIZE = 50;
|
|
17281
17567
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17282
17568
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17283
|
-
|
|
17284
|
-
|
|
17285
|
-
|
|
17286
|
-
|
|
17287
|
-
|
|
17288
|
-
|
|
17289
|
-
|
|
17290
|
-
|
|
17291
|
-
|
|
17292
|
-
|
|
17293
|
-
}
|
|
17294
|
-
|
|
17295
|
-
markdown: p.markdown,
|
|
17296
|
-
projectId: p.projectId,
|
|
17297
|
-
scopeName: p.scopeName,
|
|
17298
|
-
routeFile: p.routeFile,
|
|
17299
|
-
routeResolution: p.routeResolution,
|
|
17300
|
-
incomingLinks: p.incomingLinks,
|
|
17301
|
-
outgoingLinks: p.outgoingLinks,
|
|
17302
|
-
depth: p.depth,
|
|
17303
|
-
indexedAt: p.indexedAt
|
|
17304
|
-
}
|
|
17305
|
-
}));
|
|
17306
|
-
await index.upsert(docs);
|
|
17569
|
+
await this.pagesNs.upsert(
|
|
17570
|
+
batch.map((p) => ({
|
|
17571
|
+
id: p.id,
|
|
17572
|
+
data: p.data,
|
|
17573
|
+
metadata: {
|
|
17574
|
+
...p.metadata,
|
|
17575
|
+
projectId: scope.projectId,
|
|
17576
|
+
scopeName: scope.scopeName,
|
|
17577
|
+
type: "page"
|
|
17578
|
+
}
|
|
17579
|
+
}))
|
|
17580
|
+
);
|
|
17307
17581
|
}
|
|
17308
17582
|
}
|
|
17309
17583
|
async getPage(url, scope) {
|
|
17310
|
-
const index = this.pageIndex(scope);
|
|
17311
17584
|
try {
|
|
17312
|
-
const results = await
|
|
17585
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17586
|
+
includeMetadata: true
|
|
17587
|
+
});
|
|
17313
17588
|
const doc = results[0];
|
|
17314
|
-
if (!doc) return null;
|
|
17589
|
+
if (!doc || !doc.metadata) return null;
|
|
17315
17590
|
return {
|
|
17316
|
-
url: doc.
|
|
17317
|
-
title: doc.
|
|
17591
|
+
url: doc.metadata.url,
|
|
17592
|
+
title: doc.metadata.title,
|
|
17318
17593
|
markdown: doc.metadata.markdown,
|
|
17319
17594
|
projectId: doc.metadata.projectId,
|
|
17320
17595
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17322,27 +17597,86 @@ var UpstashSearchStore = class {
|
|
|
17322
17597
|
routeResolution: doc.metadata.routeResolution,
|
|
17323
17598
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17324
17599
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17600
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17325
17601
|
depth: doc.metadata.depth,
|
|
17326
|
-
tags: doc.
|
|
17602
|
+
tags: doc.metadata.tags ?? [],
|
|
17327
17603
|
indexedAt: doc.metadata.indexedAt,
|
|
17328
|
-
summary: doc.
|
|
17329
|
-
description: doc.
|
|
17330
|
-
keywords: doc.
|
|
17604
|
+
summary: doc.metadata.summary || void 0,
|
|
17605
|
+
description: doc.metadata.description || void 0,
|
|
17606
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17607
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17331
17608
|
};
|
|
17332
17609
|
} catch {
|
|
17333
17610
|
return null;
|
|
17334
17611
|
}
|
|
17335
17612
|
}
|
|
17613
|
+
async fetchPageWithVector(url, scope) {
|
|
17614
|
+
try {
|
|
17615
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17616
|
+
includeMetadata: true,
|
|
17617
|
+
includeVectors: true
|
|
17618
|
+
});
|
|
17619
|
+
const doc = results[0];
|
|
17620
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17621
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17622
|
+
return null;
|
|
17623
|
+
}
|
|
17624
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17625
|
+
} catch {
|
|
17626
|
+
return null;
|
|
17627
|
+
}
|
|
17628
|
+
}
|
|
17629
|
+
async fetchPagesBatch(urls, scope) {
|
|
17630
|
+
if (urls.length === 0) return [];
|
|
17631
|
+
try {
|
|
17632
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17633
|
+
includeMetadata: true
|
|
17634
|
+
});
|
|
17635
|
+
const out = [];
|
|
17636
|
+
for (const doc of results) {
|
|
17637
|
+
if (!doc || !doc.metadata) continue;
|
|
17638
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17639
|
+
continue;
|
|
17640
|
+
}
|
|
17641
|
+
out.push({
|
|
17642
|
+
url: doc.metadata.url,
|
|
17643
|
+
title: doc.metadata.title,
|
|
17644
|
+
routeFile: doc.metadata.routeFile,
|
|
17645
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17646
|
+
});
|
|
17647
|
+
}
|
|
17648
|
+
return out;
|
|
17649
|
+
} catch {
|
|
17650
|
+
return [];
|
|
17651
|
+
}
|
|
17652
|
+
}
|
|
17336
17653
|
async deletePages(scope) {
|
|
17654
|
+
const ids = [];
|
|
17655
|
+
let cursor = "0";
|
|
17337
17656
|
try {
|
|
17338
|
-
|
|
17339
|
-
|
|
17657
|
+
for (; ; ) {
|
|
17658
|
+
const result = await this.pagesNs.range({
|
|
17659
|
+
cursor,
|
|
17660
|
+
limit: 100,
|
|
17661
|
+
includeMetadata: true
|
|
17662
|
+
});
|
|
17663
|
+
for (const doc of result.vectors) {
|
|
17664
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17665
|
+
ids.push(String(doc.id));
|
|
17666
|
+
}
|
|
17667
|
+
}
|
|
17668
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17669
|
+
cursor = result.nextCursor;
|
|
17670
|
+
}
|
|
17340
17671
|
} catch {
|
|
17341
17672
|
}
|
|
17673
|
+
if (ids.length > 0) {
|
|
17674
|
+
await this.deletePagesByIds(ids, scope);
|
|
17675
|
+
}
|
|
17342
17676
|
}
|
|
17343
17677
|
async health() {
|
|
17344
17678
|
try {
|
|
17345
|
-
await this.
|
|
17679
|
+
await this.index.info();
|
|
17346
17680
|
return { ok: true };
|
|
17347
17681
|
} catch (error) {
|
|
17348
17682
|
return {
|
|
@@ -17352,14 +17686,31 @@ var UpstashSearchStore = class {
|
|
|
17352
17686
|
}
|
|
17353
17687
|
}
|
|
17354
17688
|
async dropAllIndexes(projectId) {
|
|
17355
|
-
const
|
|
17356
|
-
|
|
17357
|
-
|
|
17358
|
-
|
|
17359
|
-
|
|
17360
|
-
const
|
|
17361
|
-
|
|
17362
|
-
|
|
17689
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17690
|
+
const ids = [];
|
|
17691
|
+
let cursor = "0";
|
|
17692
|
+
try {
|
|
17693
|
+
for (; ; ) {
|
|
17694
|
+
const result = await ns.range({
|
|
17695
|
+
cursor,
|
|
17696
|
+
limit: 100,
|
|
17697
|
+
includeMetadata: true
|
|
17698
|
+
});
|
|
17699
|
+
for (const doc of result.vectors) {
|
|
17700
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17701
|
+
ids.push(String(doc.id));
|
|
17702
|
+
}
|
|
17703
|
+
}
|
|
17704
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17705
|
+
cursor = result.nextCursor;
|
|
17706
|
+
}
|
|
17707
|
+
} catch {
|
|
17708
|
+
}
|
|
17709
|
+
if (ids.length > 0) {
|
|
17710
|
+
const BATCH_SIZE = 100;
|
|
17711
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17712
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17713
|
+
await ns.delete(batch);
|
|
17363
17714
|
}
|
|
17364
17715
|
}
|
|
17365
17716
|
}
|
|
@@ -17373,12 +17724,16 @@ async function createUpstashStore(config) {
|
|
|
17373
17724
|
if (!url || !token) {
|
|
17374
17725
|
throw new SearchSocketError(
|
|
17375
17726
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17376
|
-
`Missing Upstash
|
|
17727
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17377
17728
|
);
|
|
17378
17729
|
}
|
|
17379
|
-
const {
|
|
17380
|
-
const
|
|
17381
|
-
return new UpstashSearchStore({
|
|
17730
|
+
const { Index } = await import('@upstash/vector');
|
|
17731
|
+
const index = new Index({ url, token });
|
|
17732
|
+
return new UpstashSearchStore({
|
|
17733
|
+
index,
|
|
17734
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17735
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17736
|
+
});
|
|
17382
17737
|
}
|
|
17383
17738
|
function sha1(input) {
|
|
17384
17739
|
return createHash("sha1").update(input).digest("hex");
|
|
@@ -17446,6 +17801,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17446
17801
|
if (normalizeText(current.text)) {
|
|
17447
17802
|
sections.push({
|
|
17448
17803
|
sectionTitle: current.sectionTitle,
|
|
17804
|
+
headingLevel: current.headingLevel,
|
|
17449
17805
|
headingPath: current.headingPath,
|
|
17450
17806
|
text: current.text.trim()
|
|
17451
17807
|
});
|
|
@@ -17464,6 +17820,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
17464
17820
|
headingStack.length = level;
|
|
17465
17821
|
current = {
|
|
17466
17822
|
sectionTitle: title,
|
|
17823
|
+
headingLevel: level,
|
|
17467
17824
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
17468
17825
|
text: `${line}
|
|
17469
17826
|
`
|
|
@@ -17599,6 +17956,7 @@ function splitSection(section, config) {
|
|
|
17599
17956
|
return [
|
|
17600
17957
|
{
|
|
17601
17958
|
sectionTitle: section.sectionTitle,
|
|
17959
|
+
headingLevel: section.headingLevel,
|
|
17602
17960
|
headingPath: section.headingPath,
|
|
17603
17961
|
chunkText: text
|
|
17604
17962
|
}
|
|
@@ -17649,6 +18007,7 @@ ${chunk}`;
|
|
|
17649
18007
|
}
|
|
17650
18008
|
return merged.map((chunkText) => ({
|
|
17651
18009
|
sectionTitle: section.sectionTitle,
|
|
18010
|
+
headingLevel: section.headingLevel,
|
|
17652
18011
|
headingPath: section.headingPath,
|
|
17653
18012
|
chunkText
|
|
17654
18013
|
}));
|
|
@@ -17664,6 +18023,18 @@ function buildSummaryChunkText(page) {
|
|
|
17664
18023
|
}
|
|
17665
18024
|
return parts.join("\n\n");
|
|
17666
18025
|
}
|
|
18026
|
+
/**
 * Build the heading-aware title string for a chunk.
 *
 * @param chunk Object with `title`, `sectionTitle`, `headingLevel`, `headingPath`.
 * @returns undefined when the chunk has no section title or no heading level.
 *   Otherwise "<title> — <section>" for a heading path of at most one entry,
 *   or "<title> — <a > b > ...>" for deeper paths, with the section title
 *   appended when it is not already the last path entry.
 */
function buildEmbeddingTitle(chunk) {
  const hasHeading = Boolean(chunk.sectionTitle) && chunk.headingLevel !== void 0;
  if (!hasHeading) return void 0;
  const depth = chunk.headingPath.length;
  if (depth <= 1) {
    return `${chunk.title} \u2014 ${chunk.sectionTitle}`;
  }
  const trail = chunk.headingPath.join(" > ");
  const tail = chunk.headingPath[depth - 1];
  return tail === chunk.sectionTitle ? `${chunk.title} \u2014 ${trail}` : `${chunk.title} \u2014 ${trail} > ${chunk.sectionTitle}`;
}
|
|
17667
18038
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
17668
18039
|
if (!prependTitle) return chunk.chunkText;
|
|
17669
18040
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -17694,10 +18065,14 @@ function chunkPage(page, config, scope) {
|
|
|
17694
18065
|
tags: page.tags,
|
|
17695
18066
|
contentHash: "",
|
|
17696
18067
|
description: page.description,
|
|
17697
|
-
keywords: page.keywords
|
|
18068
|
+
keywords: page.keywords,
|
|
18069
|
+
publishedAt: page.publishedAt,
|
|
18070
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18071
|
+
meta: page.meta
|
|
17698
18072
|
};
|
|
17699
18073
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
17700
|
-
|
|
18074
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18075
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
17701
18076
|
chunks.push(summaryChunk);
|
|
17702
18077
|
}
|
|
17703
18078
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -17714,6 +18089,7 @@ function chunkPage(page, config, scope) {
|
|
|
17714
18089
|
path: page.url,
|
|
17715
18090
|
title: page.title,
|
|
17716
18091
|
sectionTitle: entry.sectionTitle,
|
|
18092
|
+
headingLevel: entry.headingLevel,
|
|
17717
18093
|
headingPath: entry.headingPath,
|
|
17718
18094
|
chunkText: entry.chunkText,
|
|
17719
18095
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -17723,10 +18099,16 @@ function chunkPage(page, config, scope) {
|
|
|
17723
18099
|
tags: page.tags,
|
|
17724
18100
|
contentHash: "",
|
|
17725
18101
|
description: page.description,
|
|
17726
|
-
keywords: page.keywords
|
|
18102
|
+
keywords: page.keywords,
|
|
18103
|
+
publishedAt: page.publishedAt,
|
|
18104
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
18105
|
+
meta: page.meta
|
|
17727
18106
|
};
|
|
17728
18107
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
17729
|
-
|
|
18108
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
18109
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
18110
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
18111
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
17730
18112
|
chunks.push(chunk);
|
|
17731
18113
|
}
|
|
17732
18114
|
return chunks;
|
|
@@ -18558,7 +18940,112 @@ function gfm(turndownService) {
|
|
|
18558
18940
|
]);
|
|
18559
18941
|
}
|
|
18560
18942
|
|
|
18943
|
+
// src/utils/structured-meta.ts
|
|
18944
|
+
// A meta key must look like an identifier: a letter or underscore first,
// then letters, digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
/**
 * Tell whether `key` is an acceptable structured-meta key.
 * @returns true when the whole key matches VALID_KEY_RE, false otherwise.
 */
function validateMetaKey(key) {
  const matched = VALID_KEY_RE.exec(key);
  return matched !== null;
}
|
|
18948
|
+
/**
 * Convert the raw `content` string of a meta tag into a typed value.
 *
 * @param content Raw attribute text.
 * @param dataType One of "number", "date", "boolean", "string[]"; anything
 *   else is treated as a plain string.
 * @returns
 *   - "number" / "date": the finite number the text parses to; the original
 *     text when it is blank or does not parse to a finite number.
 *   - "boolean": true only for the exact text "true".
 *   - "string[]": comma-separated items, trimmed, with empty items dropped.
 *   - otherwise: `content` unchanged.
 */
function parseMetaValue(content, dataType) {
  switch (dataType) {
    case "number":
    case "date": {
      // Number("") and Number("   ") are 0, which would invent a value for a
      // blank attribute; treat blank text as "not a number" instead.
      if (typeof content === "string" && content.trim() === "") return content;
      const parsed = Number(content);
      return Number.isFinite(parsed) ? parsed : content;
    }
    case "boolean":
      return content === "true";
    case "string[]":
      // Drop empty items so "a,,b" or a trailing comma does not yield "" entries.
      return content ? content.split(",").map((s) => s.trim()).filter(Boolean) : [];
    default:
      return content;
  }
}
|
|
18966
|
+
/**
 * Prepare a string for use inside a single-quoted filter literal by doubling
 * every single quote in it.
 */
function escapeFilterValue(s) {
  const pieces = s.split("'");
  return pieces.join("''");
}
|
|
18969
|
+
/**
 * Turn a `{ key: value }` map of meta filters into a filter expression.
 *
 * Each usable entry becomes one clause on the field `meta.<key>`; clauses are
 * joined with " AND ".
 *   - string values  -> `meta.<key> CONTAINS '<escaped value>'`
 *   - booleans, finite numbers and bigints -> `meta.<key> = <value>`
 *
 * Entries are skipped when the key fails validateMetaKey or when the value is
 * anything else (null, NaN/Infinity, arrays, objects, ...). Such values would
 * otherwise be stringified straight into the expression, producing a
 * malformed filter or letting the value's text act as filter syntax.
 *
 * @param filters Map of filter values; null/undefined is treated as empty.
 * @returns The joined expression, or "" when no clause was produced.
 */
function buildMetaFilterString(filters) {
  const clauses = [];
  for (const [key, value] of Object.entries(filters ?? {})) {
    if (!validateMetaKey(key)) continue;
    const field = `meta.${key}`;
    if (typeof value === "string") {
      clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
      continue;
    }
    const isPlainLiteral = typeof value === "boolean" || typeof value === "bigint" || typeof value === "number" && Number.isFinite(value);
    if (isPlainLiteral) {
      clauses.push(`${field} = ${value}`);
    }
  }
  return clauses.join(" AND ");
}
|
|
18984
|
+
|
|
18561
18985
|
// src/indexing/extractor.ts
|
|
18986
|
+
/**
 * Reduce a date-like value to a finite millisecond timestamp.
 *
 * @param value A Date, a string handed to `new Date(...)`, or a number taken as-is.
 * @returns The timestamp, or undefined for null/undefined, for any other
 *   type, and for inputs whose timestamp is not finite (invalid Date,
 *   unparseable string, NaN, Infinity).
 */
function normalizeDateToMs(value) {
  let ms;
  if (value instanceof Date) {
    ms = value.getTime();
  } else if (typeof value === "string") {
    ms = new Date(value).getTime();
  } else if (typeof value === "number") {
    ms = value;
  } else {
    return void 0;
  }
  return Number.isFinite(ms) ? ms : void 0;
}
|
|
19001
|
+
// Frontmatter keys consulted for a publication date, in priority order.
var FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];
/**
 * Find a publication timestamp in parsed frontmatter.
 *
 * @param data Frontmatter object.
 * @returns The first value among FRONTMATTER_DATE_FIELDS that
 *   normalizeDateToMs accepts, or undefined when none does.
 */
function extractPublishedAtFromFrontmatter(data) {
  for (let idx = 0; idx < FRONTMATTER_DATE_FIELDS.length; idx += 1) {
    const timestamp = normalizeDateToMs(data[FRONTMATTER_DATE_FIELDS[idx]]);
    if (timestamp === void 0) continue;
    return timestamp;
  }
  return void 0;
}
|
|
19009
|
+
/**
 * Best-effort lookup of a page's publication time from its HTML.
 *
 * Sources are tried in this order; the first value that normalizeDateToMs
 * accepts is returned:
 *   1. `datePublished` in JSON-LD scripts: a top-level object, the entries of
 *      a top-level array, or the entries of a top-level object's `@graph`.
 *   2. `<meta property="article:published_time">`
 *   3. `<meta itemprop="datePublished">`, else `<time itemprop="datePublished">`
 *   4. the first `<time datetime="...">` in the document
 *
 * @param $ Document query function; it is called both with selectors and with
 *   matched elements.
 * @returns The normalized timestamp, or undefined when no source yields one.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const candidates = [];
      if (Array.isArray(parsed)) {
        candidates.push(...parsed);
      } else if (parsed && typeof parsed === "object") {
        candidates.push(parsed);
        if (Array.isArray(parsed["@graph"])) {
          candidates.push(...parsed["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // Arrays may contain null or primitive entries. Reading a property of
        // null throws, which would jump to the catch below and discard the
        // remaining entries of this script, so skip non-objects explicitly.
        if (!candidate || typeof candidate !== "object") continue;
        const val = normalizeDateToMs(candidate.datePublished);
        if (val !== void 0) return val;
      }
    } catch {
      // Malformed JSON-LD is ignored; later scripts and the markup fallbacks still run.
    }
  }
  const ogTime = $('meta[property="article:published_time"]').attr("content")?.trim();
  if (ogTime) {
    const val = normalizeDateToMs(ogTime);
    if (val !== void 0) return val;
  }
  const itempropDate = $('meta[itemprop="datePublished"]').attr("content")?.trim() || $('time[itemprop="datePublished"]').attr("datetime")?.trim();
  if (itempropDate) {
    const val = normalizeDateToMs(itempropDate);
    if (val !== void 0) return val;
  }
  const timeEl = $("time[datetime]").first().attr("datetime")?.trim();
  if (timeEl) {
    const val = normalizeDateToMs(timeEl);
    if (val !== void 0) return val;
  }
  return void 0;
}
|
|
18562
19049
|
function hasTopLevelNoindexComment(markdown) {
|
|
18563
19050
|
const lines = markdown.split(/\r?\n/);
|
|
18564
19051
|
let inFence = false;
|
|
@@ -18574,6 +19061,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
18574
19061
|
}
|
|
18575
19062
|
return false;
|
|
18576
19063
|
}
|
|
19064
|
+
// Alt texts that consist of just one of these words carry no information.
var GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);
// Text ending in an image file extension (optionally followed by a query string).
var IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;
/**
 * Decide whether an image's alt text is worth indexing.
 *
 * @param alt Raw alt attribute text.
 * @returns false when the trimmed text is shorter than 5 characters, ends in
 *   an image file extension, or is (case-insensitively) one of
 *   GARBAGE_ALT_WORDS; true otherwise.
 */
function isMeaningfulAlt(alt) {
  const text = alt.trim();
  const longEnough = text.length >= 5;
  return longEnough && !IMAGE_EXT_RE.test(text) && !GARBAGE_ALT_WORDS.has(text.toLowerCase());
}
|
|
19090
|
+
/**
 * Choose the text that should stand in for an image.
 *
 * Priority:
 *   1. the image's own `imageDescAttr` attribute,
 *   2. the same attribute on the closest enclosing <figure>,
 *   3. a meaningful alt text, joined with the figure's first <figcaption>
 *      as "<alt> — <caption>" when both exist,
 *   4. the caption alone.
 *
 * @param img Wrapped <img> element.
 * @param $ Document query function (not used here).
 * @param imageDescAttr Name of the attribute that holds an explicit description.
 * @returns The chosen text, or null when none of the sources has any.
 */
function resolveImageText(img, $, imageDescAttr) {
  const ownDescription = img.attr(imageDescAttr)?.trim();
  if (ownDescription) return ownDescription;
  const figure = img.closest("figure");
  if (figure.length) {
    const figureDescription = figure.attr(imageDescAttr)?.trim();
    if (figureDescription) return figureDescription;
  }
  const alt = img.attr("alt")?.trim() ?? "";
  let caption = "";
  if (figure.length) {
    caption = figure.find("figcaption").first().text().trim();
  }
  if (isMeaningfulAlt(alt)) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption ? caption : null;
}
|
|
19111
|
+
// Link texts that say nothing about the link target.
var STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);
/**
 * Normalize link text for use as anchor text.
 *
 * Whitespace runs are collapsed to single spaces and the text is trimmed and
 * lower-cased.
 *
 * @param raw Raw link text.
 * @returns "" when the normalized text is shorter than 3 characters or is one
 *   of STOP_ANCHORS; otherwise the normalized text cut to at most 100 characters.
 */
function normalizeAnchorText(raw) {
  const cleaned = raw.replace(/\s+/g, " ").trim().toLowerCase();
  const rejected = cleaned.length < 3 || STOP_ANCHORS.has(cleaned);
  // slice() leaves texts of 100 characters or fewer untouched.
  return rejected ? "" : cleaned.slice(0, 100);
}
|
|
19127
|
+
/**
 * Escape text so it can be placed inside an HTML element as plain content.
 *
 * `&`, `<` and `>` are replaced by their entities. `&` is handled first so
 * that the entities produced for `<` and `>` are not escaped a second time.
 *
 * @param text Text to escape.
 * @returns The escaped string.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
19130
|
+
/**
 * Replace every image under `root2` with its textual stand-in.
 *
 * <picture> elements are handled first, using their first <img>; the
 * remaining <img> elements are handled afterwards. When resolveImageText
 * yields text, the element is replaced by a <span> containing the escaped
 * text and any <figcaption> inside the closest enclosing <figure> is removed.
 * When it yields nothing, the element is removed.
 *
 * @param root2 Wrapped root element to search in (modified in place).
 * @param $ Document query function.
 * @param imageDescAttr Attribute name passed through to resolveImageText.
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Swap `node` for a text span, or drop it when there is no text to show.
  const substitute = (node, text) => {
    if (!text) {
      node.remove();
      return;
    }
    const enclosingFigure = node.closest("figure");
    if (enclosingFigure.length) enclosingFigure.find("figcaption").remove();
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };
  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    const innerImg = picture.find("img").first();
    substitute(picture, innerImg.length ? resolveImageText(innerImg, $, imageDescAttr) : null);
  });
  root2.find("img").each((_i, el) => {
    const img = $(el);
    substitute(img, resolveImageText(img, $, imageDescAttr));
  });
}
|
|
18577
19155
|
function extractFromHtml(url, html, config) {
|
|
18578
19156
|
const $ = load(html);
|
|
18579
19157
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -18599,6 +19177,20 @@ function extractFromHtml(url, html, config) {
|
|
|
18599
19177
|
if (weight === 0) {
|
|
18600
19178
|
return null;
|
|
18601
19179
|
}
|
|
19180
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
19181
|
+
return null;
|
|
19182
|
+
}
|
|
19183
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
19184
|
+
const meta = {};
|
|
19185
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
19186
|
+
const name = $(el).attr("name") ?? "";
|
|
19187
|
+
const key = name.slice("searchsocket:".length);
|
|
19188
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
19189
|
+
const content = $(el).attr("content") ?? "";
|
|
19190
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
19191
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
19192
|
+
});
|
|
19193
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
18602
19194
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
18603
19195
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
18604
19196
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -18610,7 +19202,9 @@ function extractFromHtml(url, html, config) {
|
|
|
18610
19202
|
root2.find(selector).remove();
|
|
18611
19203
|
}
|
|
18612
19204
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
19205
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
18613
19206
|
const outgoingLinks = [];
|
|
19207
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
18614
19208
|
root2.find("a[href]").each((_index, node) => {
|
|
18615
19209
|
const href = $(node).attr("href");
|
|
18616
19210
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -18621,7 +19215,19 @@ function extractFromHtml(url, html, config) {
|
|
|
18621
19215
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
18622
19216
|
return;
|
|
18623
19217
|
}
|
|
18624
|
-
|
|
19218
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
19219
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
19220
|
+
if (!anchorText) {
|
|
19221
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
19222
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
19223
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
19224
|
+
}
|
|
19225
|
+
}
|
|
19226
|
+
const key = `${url2}|${anchorText}`;
|
|
19227
|
+
if (!seenLinkKeys.has(key)) {
|
|
19228
|
+
seenLinkKeys.add(key);
|
|
19229
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
19230
|
+
}
|
|
18625
19231
|
} catch {
|
|
18626
19232
|
}
|
|
18627
19233
|
});
|
|
@@ -18646,16 +19252,25 @@ function extractFromHtml(url, html, config) {
|
|
|
18646
19252
|
return null;
|
|
18647
19253
|
}
|
|
18648
19254
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
19255
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
19256
|
+
if (componentTags) {
|
|
19257
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
19258
|
+
for (const t of extraTags) {
|
|
19259
|
+
if (!tags.includes(t)) tags.push(t);
|
|
19260
|
+
}
|
|
19261
|
+
}
|
|
18649
19262
|
return {
|
|
18650
19263
|
url: normalizeUrlPath(url),
|
|
18651
19264
|
title,
|
|
18652
19265
|
markdown,
|
|
18653
|
-
outgoingLinks
|
|
19266
|
+
outgoingLinks,
|
|
18654
19267
|
noindex: false,
|
|
18655
19268
|
tags,
|
|
18656
19269
|
description,
|
|
18657
19270
|
keywords,
|
|
18658
|
-
weight
|
|
19271
|
+
weight,
|
|
19272
|
+
publishedAt,
|
|
19273
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
18659
19274
|
};
|
|
18660
19275
|
}
|
|
18661
19276
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -18676,6 +19291,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18676
19291
|
if (mdWeight === 0) {
|
|
18677
19292
|
return null;
|
|
18678
19293
|
}
|
|
19294
|
+
let mdMeta;
|
|
19295
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
19296
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
19297
|
+
const metaObj = {};
|
|
19298
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
19299
|
+
if (!validateMetaKey(key)) continue;
|
|
19300
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
19301
|
+
metaObj[key] = val;
|
|
19302
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
19303
|
+
metaObj[key] = val;
|
|
19304
|
+
} else if (val instanceof Date) {
|
|
19305
|
+
metaObj[key] = val.getTime();
|
|
19306
|
+
}
|
|
19307
|
+
}
|
|
19308
|
+
if (Object.keys(metaObj).length > 0) {
|
|
19309
|
+
mdMeta = metaObj;
|
|
19310
|
+
}
|
|
19311
|
+
}
|
|
18679
19312
|
const content = parsed.content;
|
|
18680
19313
|
const normalized = normalizeMarkdown(content);
|
|
18681
19314
|
if (!normalizeText(normalized)) {
|
|
@@ -18690,6 +19323,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18690
19323
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
18691
19324
|
}
|
|
18692
19325
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
19326
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
18693
19327
|
return {
|
|
18694
19328
|
url: normalizeUrlPath(url),
|
|
18695
19329
|
title: resolvedTitle,
|
|
@@ -18699,7 +19333,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
18699
19333
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
18700
19334
|
description: fmDescription,
|
|
18701
19335
|
keywords: fmKeywords,
|
|
18702
|
-
weight: mdWeight
|
|
19336
|
+
weight: mdWeight,
|
|
19337
|
+
publishedAt,
|
|
19338
|
+
meta: mdMeta
|
|
18703
19339
|
};
|
|
18704
19340
|
}
|
|
18705
19341
|
function segmentToRegex(segment) {
|
|
@@ -18894,7 +19530,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
18894
19530
|
const manifestPath = path.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
18895
19531
|
let content;
|
|
18896
19532
|
try {
|
|
18897
|
-
content = await
|
|
19533
|
+
content = await fs8.readFile(manifestPath, "utf8");
|
|
18898
19534
|
} catch {
|
|
18899
19535
|
throw new SearchSocketError(
|
|
18900
19536
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19205,6 +19841,125 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19205
19841
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19206
19842
|
return normalizeUrlPath(noExt || "/");
|
|
19207
19843
|
}
|
|
19844
|
+
// Matches route files: "+page", "+layout" or "+error", optionally followed by
// an "@name" part, directly before the ".svelte" extension.
var ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;
/**
 * Tell whether a path points at a plain Svelte component.
 * @returns true for paths that end in ".svelte" and do not match ROUTE_FILE_RE.
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
|
|
19849
|
+
/**
 * Pull a description and the declared props out of a Svelte component's source text.
 *
 * - description: trimmed body of the first `<!-- @component ... -->` comment,
 *   or undefined when there is none or it is empty.
 * - props: one entry per name destructured in the first
 *   `let { ... } = $props()` statement. Rest elements (`...rest`) are skipped.
 *   Each entry has `name`, plus `type` when the type map has an entry for the
 *   name, plus `default` (source text of the default expression) when present.
 *
 * The type map comes from resolveTypeReference when the annotation is a
 * capitalized identifier, or from parseInlineTypeAnnotation when it starts
 * with "{".
 *
 * @param source Component source text.
 * @returns `{ description, props }`.
 */
function extractSvelteComponentMeta(source) {
  const description = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/)?.[1]?.trim() || void 0;
  const props = [];
  const propsMatch = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsMatch) return { description, props };
  const destructureBlock = propsMatch[1];
  const typeAnnotation = propsMatch[2]?.trim();
  let resolvedTypeMap;
  if (typeAnnotation) {
    if (/^[A-Z]\w*$/.test(typeAnnotation)) {
      resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
    } else if (typeAnnotation.startsWith("{")) {
      resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
    }
  }
  for (const rawEntry of splitDestructureBlock(destructureBlock)) {
    const entry = rawEntry.trim();
    if (entry === "" || entry.startsWith("...")) continue;
    // Try `name: alias [= default]` first, then `name = default`; both patterns
    // put the prop name in group 1 and the default expression in group 2.
    const structured = entry.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/) ?? entry.match(/^(\w+)\s*=\s*([\s\S]+)$/);
    const propName = structured ? structured[1] : (entry.match(/^(\w+)/)?.[1] ?? entry);
    const defaultValue = structured?.[2]?.trim();
    const prop = { name: propName };
    const propType = resolvedTypeMap?.get(propName);
    if (propType) prop.type = propType;
    if (defaultValue) prop.default = defaultValue;
    props.push(prop);
  }
  return { description, props };
}
|
|
19894
|
+
function splitDestructureBlock(block) {
|
|
19895
|
+
const entries = [];
|
|
19896
|
+
let depth = 0;
|
|
19897
|
+
let current = "";
|
|
19898
|
+
for (const ch of block) {
|
|
19899
|
+
if (ch === "{" || ch === "[" || ch === "(") {
|
|
19900
|
+
depth++;
|
|
19901
|
+
current += ch;
|
|
19902
|
+
} else if (ch === "}" || ch === "]" || ch === ")") {
|
|
19903
|
+
depth--;
|
|
19904
|
+
current += ch;
|
|
19905
|
+
} else if (ch === "," && depth === 0) {
|
|
19906
|
+
entries.push(current);
|
|
19907
|
+
current = "";
|
|
19908
|
+
} else {
|
|
19909
|
+
current += ch;
|
|
19910
|
+
}
|
|
19911
|
+
}
|
|
19912
|
+
if (current.trim()) entries.push(current);
|
|
19913
|
+
return entries;
|
|
19914
|
+
}
|
|
19915
|
+
function resolveTypeReference(source, typeName) {
|
|
19916
|
+
const startRe = new RegExp(`(?:interface\\s+${typeName}\\s*|type\\s+${typeName}\\s*=\\s*)\\{`);
|
|
19917
|
+
const startMatch = source.match(startRe);
|
|
19918
|
+
if (!startMatch || startMatch.index === void 0) return void 0;
|
|
19919
|
+
const bodyStart = startMatch.index + startMatch[0].length;
|
|
19920
|
+
let depth = 1;
|
|
19921
|
+
let i = bodyStart;
|
|
19922
|
+
while (i < source.length && depth > 0) {
|
|
19923
|
+
if (source[i] === "{") depth++;
|
|
19924
|
+
else if (source[i] === "}") depth--;
|
|
19925
|
+
i++;
|
|
19926
|
+
}
|
|
19927
|
+
if (depth !== 0) return void 0;
|
|
19928
|
+
const body = source.slice(bodyStart, i - 1);
|
|
19929
|
+
return parseTypeMembers(body);
|
|
19930
|
+
}
|
|
19931
|
+
function parseInlineTypeAnnotation(annotation) {
|
|
19932
|
+
const inner = annotation.replace(/^\{/, "").replace(/\}$/, "");
|
|
19933
|
+
return parseTypeMembers(inner);
|
|
19934
|
+
}
|
|
19935
|
+
function parseTypeMembers(body) {
|
|
19936
|
+
const map = /* @__PURE__ */ new Map();
|
|
19937
|
+
const members = body.split(/[;\n]/).map((m) => m.trim()).filter(Boolean);
|
|
19938
|
+
for (const member of members) {
|
|
19939
|
+
const memberMatch = member.match(/^(\w+)\??\s*:\s*(.+)$/);
|
|
19940
|
+
if (memberMatch) {
|
|
19941
|
+
map.set(memberMatch[1], memberMatch[2].replace(/,\s*$/, "").trim());
|
|
19942
|
+
}
|
|
19943
|
+
}
|
|
19944
|
+
return map;
|
|
19945
|
+
}
|
|
19946
|
+
function buildComponentMarkdown(componentName, meta) {
|
|
19947
|
+
if (!meta.description && meta.props.length === 0) return "";
|
|
19948
|
+
const parts = [`${componentName} component.`];
|
|
19949
|
+
if (meta.description) {
|
|
19950
|
+
parts.push(meta.description);
|
|
19951
|
+
}
|
|
19952
|
+
if (meta.props.length > 0) {
|
|
19953
|
+
const propEntries = meta.props.map((p) => {
|
|
19954
|
+
let entry = p.name;
|
|
19955
|
+
if (p.type) entry += ` (${p.type})`;
|
|
19956
|
+
if (p.default) entry += ` default: ${p.default}`;
|
|
19957
|
+
return entry;
|
|
19958
|
+
});
|
|
19959
|
+
parts.push(`Props: ${propEntries.join(", ")}.`);
|
|
19960
|
+
}
|
|
19961
|
+
return parts.join(" ");
|
|
19962
|
+
}
|
|
19208
19963
|
function normalizeSvelteToMarkdown(source) {
|
|
19209
19964
|
return source.replace(/<script[\s\S]*?<\/script>/g, "").replace(/<style[\s\S]*?<\/style>/g, "").replace(/<[^>]+>/g, " ").replace(/\{[^}]+\}/g, " ").replace(/\s+/g, " ").trim();
|
|
19210
19965
|
}
|
|
@@ -19223,13 +19978,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19223
19978
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19224
19979
|
const pages = [];
|
|
19225
19980
|
for (const filePath of selected) {
|
|
19226
|
-
const raw = await
|
|
19227
|
-
|
|
19981
|
+
const raw = await fs8.readFile(filePath, "utf8");
|
|
19982
|
+
let markdown;
|
|
19983
|
+
let tags;
|
|
19984
|
+
if (filePath.endsWith(".md")) {
|
|
19985
|
+
markdown = raw;
|
|
19986
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
19987
|
+
const componentName = path.basename(filePath, ".svelte");
|
|
19988
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
19989
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
19990
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
19991
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
19992
|
+
tags = ["component"];
|
|
19993
|
+
} else {
|
|
19994
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
19995
|
+
}
|
|
19228
19996
|
pages.push({
|
|
19229
19997
|
url: filePathToUrl(filePath, baseDir),
|
|
19230
19998
|
markdown,
|
|
19231
19999
|
sourcePath: path.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19232
|
-
outgoingLinks: []
|
|
20000
|
+
outgoingLinks: [],
|
|
20001
|
+
...tags ? { tags } : {}
|
|
19233
20002
|
});
|
|
19234
20003
|
}
|
|
19235
20004
|
return pages;
|
|
@@ -19359,7 +20128,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19359
20128
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19360
20129
|
const pages = [];
|
|
19361
20130
|
for (const filePath of selected) {
|
|
19362
|
-
const html = await
|
|
20131
|
+
const html = await fs8.readFile(filePath, "utf8");
|
|
19363
20132
|
pages.push({
|
|
19364
20133
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19365
20134
|
html,
|
|
@@ -19422,7 +20191,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
19422
20191
|
}
|
|
19423
20192
|
async function loadRobotsTxtFromDir(dir) {
|
|
19424
20193
|
try {
|
|
19425
|
-
const content = await
|
|
20194
|
+
const content = await fs8.readFile(path.join(dir, "robots.txt"), "utf8");
|
|
19426
20195
|
return parseRobotsTxt(content);
|
|
19427
20196
|
} catch {
|
|
19428
20197
|
return null;
|
|
@@ -19450,29 +20219,65 @@ function nonNegativeOrZero(value) {
|
|
|
19450
20219
|
function normalizeForTitleMatch(text) {
|
|
19451
20220
|
return text.toLowerCase().replace(/[^a-z0-9\s]/g, "").replace(/\s+/g, " ").trim();
|
|
19452
20221
|
}
|
|
19453
|
-
function rankHits(hits, config, query) {
|
|
20222
|
+
function rankHits(hits, config, query, debug) {
|
|
19454
20223
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
19455
20224
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
19456
20225
|
return hits.map((hit) => {
|
|
19457
|
-
|
|
20226
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20227
|
+
let score = baseScore;
|
|
20228
|
+
let incomingLinkBoostValue = 0;
|
|
19458
20229
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
19459
20230
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
19460
|
-
|
|
20231
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
20232
|
+
score += incomingLinkBoostValue;
|
|
19461
20233
|
}
|
|
20234
|
+
let depthBoostValue = 0;
|
|
19462
20235
|
if (config.ranking.enableDepthBoost) {
|
|
19463
20236
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
19464
|
-
|
|
20237
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
20238
|
+
score += depthBoostValue;
|
|
19465
20239
|
}
|
|
20240
|
+
let titleMatchBoostValue = 0;
|
|
19466
20241
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
19467
20242
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
19468
20243
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
19469
|
-
|
|
20244
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
20245
|
+
score += titleMatchBoostValue;
|
|
19470
20246
|
}
|
|
19471
20247
|
}
|
|
19472
|
-
|
|
20248
|
+
let freshnessBoostValue = 0;
|
|
20249
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
20250
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
20251
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
20252
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
20253
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
20254
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
20255
|
+
score += freshnessBoostValue;
|
|
20256
|
+
}
|
|
20257
|
+
}
|
|
20258
|
+
let anchorTextMatchBoostValue = 0;
|
|
20259
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
20260
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
20261
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
20262
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
20263
|
+
score += anchorTextMatchBoostValue;
|
|
20264
|
+
}
|
|
20265
|
+
}
|
|
20266
|
+
const result = {
|
|
19473
20267
|
hit,
|
|
19474
20268
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
19475
20269
|
};
|
|
20270
|
+
if (debug) {
|
|
20271
|
+
result.breakdown = {
|
|
20272
|
+
baseScore,
|
|
20273
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
20274
|
+
depthBoost: depthBoostValue,
|
|
20275
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
20276
|
+
freshnessBoost: freshnessBoostValue,
|
|
20277
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
20278
|
+
};
|
|
20279
|
+
}
|
|
20280
|
+
return result;
|
|
19476
20281
|
}).sort((a, b) => {
|
|
19477
20282
|
const delta = b.finalScore - a.finalScore;
|
|
19478
20283
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -19481,12 +20286,13 @@ function rankHits(hits, config, query) {
|
|
|
19481
20286
|
function trimByScoreGap(results, config) {
|
|
19482
20287
|
if (results.length === 0) return results;
|
|
19483
20288
|
const threshold = config.ranking.scoreGapThreshold;
|
|
19484
|
-
const
|
|
19485
|
-
if (
|
|
19486
|
-
const
|
|
19487
|
-
|
|
19488
|
-
|
|
19489
|
-
|
|
20289
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
20290
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
20291
|
+
const topScore = results[0].pageScore;
|
|
20292
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
20293
|
+
const minThreshold = topScore * minScoreRatio;
|
|
20294
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
20295
|
+
}
|
|
19490
20296
|
}
|
|
19491
20297
|
if (threshold > 0 && results.length > 1) {
|
|
19492
20298
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -19556,61 +20362,99 @@ function aggregateByPage(ranked, config) {
|
|
|
19556
20362
|
return Number.isNaN(delta) ? 0 : delta;
|
|
19557
20363
|
});
|
|
19558
20364
|
}
|
|
19559
|
-
function
|
|
19560
|
-
|
|
19561
|
-
const
|
|
19562
|
-
|
|
19563
|
-
|
|
19564
|
-
|
|
19565
|
-
|
|
19566
|
-
|
|
19567
|
-
|
|
19568
|
-
|
|
19569
|
-
|
|
19570
|
-
if (pageHit) {
|
|
19571
|
-
pagesWithChunks.add(url);
|
|
19572
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
19573
|
-
return {
|
|
19574
|
-
hit: ranked.hit,
|
|
19575
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
19576
|
-
};
|
|
20365
|
+
function rankPageHits(pageHits, config, query, debug) {
|
|
20366
|
+
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
20367
|
+
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
20368
|
+
return pageHits.map((hit) => {
|
|
20369
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20370
|
+
let score = baseScore;
|
|
20371
|
+
let incomingLinkBoostValue = 0;
|
|
20372
|
+
if (config.ranking.enableIncomingLinkBoost) {
|
|
20373
|
+
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
|
|
20374
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
20375
|
+
score += incomingLinkBoostValue;
|
|
19577
20376
|
}
|
|
19578
|
-
|
|
19579
|
-
|
|
19580
|
-
|
|
19581
|
-
|
|
19582
|
-
|
|
19583
|
-
|
|
19584
|
-
|
|
19585
|
-
|
|
19586
|
-
|
|
19587
|
-
|
|
19588
|
-
|
|
19589
|
-
|
|
19590
|
-
path: pageHit.url,
|
|
19591
|
-
title: pageHit.title,
|
|
19592
|
-
sectionTitle: "",
|
|
19593
|
-
headingPath: [],
|
|
19594
|
-
snippet: pageHit.description || pageHit.title,
|
|
19595
|
-
chunkText: pageHit.description || pageHit.title,
|
|
19596
|
-
ordinal: 0,
|
|
19597
|
-
contentHash: "",
|
|
19598
|
-
depth: pageHit.depth,
|
|
19599
|
-
incomingLinks: pageHit.incomingLinks,
|
|
19600
|
-
routeFile: pageHit.routeFile,
|
|
19601
|
-
tags: pageHit.tags
|
|
20377
|
+
let depthBoostValue = 0;
|
|
20378
|
+
if (config.ranking.enableDepthBoost) {
|
|
20379
|
+
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
|
|
20380
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
20381
|
+
score += depthBoostValue;
|
|
20382
|
+
}
|
|
20383
|
+
let titleMatchBoostValue = 0;
|
|
20384
|
+
if (normalizedQuery && titleMatchWeight > 0) {
|
|
20385
|
+
const normalizedTitle = normalizeForTitleMatch(hit.title);
|
|
20386
|
+
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
20387
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
20388
|
+
score += titleMatchBoostValue;
|
|
19602
20389
|
}
|
|
20390
|
+
}
|
|
20391
|
+
let freshnessBoostValue = 0;
|
|
20392
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
20393
|
+
const publishedAt = hit.publishedAt;
|
|
20394
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
20395
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
20396
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
20397
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
20398
|
+
score += freshnessBoostValue;
|
|
20399
|
+
}
|
|
20400
|
+
}
|
|
20401
|
+
const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
|
|
20402
|
+
if (pageWeight !== 1) {
|
|
20403
|
+
score *= pageWeight;
|
|
20404
|
+
}
|
|
20405
|
+
const result = {
|
|
20406
|
+
url: hit.url,
|
|
20407
|
+
title: hit.title,
|
|
20408
|
+
description: hit.description,
|
|
20409
|
+
routeFile: hit.routeFile,
|
|
20410
|
+
depth: hit.depth,
|
|
20411
|
+
incomingLinks: hit.incomingLinks,
|
|
20412
|
+
tags: hit.tags,
|
|
20413
|
+
baseScore,
|
|
20414
|
+
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
|
|
20415
|
+
publishedAt: hit.publishedAt
|
|
19603
20416
|
};
|
|
19604
|
-
|
|
19605
|
-
|
|
19606
|
-
|
|
19607
|
-
|
|
19608
|
-
|
|
19609
|
-
|
|
20417
|
+
if (debug) {
|
|
20418
|
+
result.breakdown = {
|
|
20419
|
+
baseScore,
|
|
20420
|
+
pageWeight,
|
|
20421
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
20422
|
+
depthBoost: depthBoostValue,
|
|
20423
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
20424
|
+
freshnessBoost: freshnessBoostValue
|
|
20425
|
+
};
|
|
20426
|
+
}
|
|
20427
|
+
return result;
|
|
20428
|
+
}).filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0).sort((a, b) => {
|
|
19610
20429
|
const delta = b.finalScore - a.finalScore;
|
|
19611
20430
|
return Number.isNaN(delta) ? 0 : delta;
|
|
19612
20431
|
});
|
|
19613
20432
|
}
|
|
20433
|
+
function trimPagesByScoreGap(results, config) {
|
|
20434
|
+
if (results.length === 0) return results;
|
|
20435
|
+
const threshold = config.ranking.scoreGapThreshold;
|
|
20436
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
20437
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
20438
|
+
const topScore = results[0].finalScore;
|
|
20439
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
20440
|
+
const minThreshold = topScore * minScoreRatio;
|
|
20441
|
+
results = results.filter((r) => r.finalScore >= minThreshold);
|
|
20442
|
+
}
|
|
20443
|
+
}
|
|
20444
|
+
if (threshold > 0 && results.length > 1) {
|
|
20445
|
+
for (let i = 1; i < results.length; i++) {
|
|
20446
|
+
const prev = results[i - 1].finalScore;
|
|
20447
|
+
const current = results[i].finalScore;
|
|
20448
|
+
if (prev > 0) {
|
|
20449
|
+
const gap = (prev - current) / prev;
|
|
20450
|
+
if (gap >= threshold) {
|
|
20451
|
+
return results.slice(0, i);
|
|
20452
|
+
}
|
|
20453
|
+
}
|
|
20454
|
+
}
|
|
20455
|
+
}
|
|
20456
|
+
return results;
|
|
20457
|
+
}
|
|
19614
20458
|
|
|
19615
20459
|
// src/utils/time.ts
|
|
19616
20460
|
function nowIso() {
|
|
@@ -19619,6 +20463,81 @@ function nowIso() {
|
|
|
19619
20463
|
function hrTimeMs(start) {
|
|
19620
20464
|
return Number(process.hrtime.bigint() - start) / 1e6;
|
|
19621
20465
|
}
|
|
20466
|
+
function resolvePageUrl(pageUrl, baseUrl) {
|
|
20467
|
+
if (!baseUrl) return pageUrl;
|
|
20468
|
+
try {
|
|
20469
|
+
return new URL(pageUrl, baseUrl).href;
|
|
20470
|
+
} catch {
|
|
20471
|
+
return pageUrl;
|
|
20472
|
+
}
|
|
20473
|
+
}
|
|
20474
|
+
function generateLlmsTxt(pages, config) {
|
|
20475
|
+
const title = config.llmsTxt.title ?? config.project.id;
|
|
20476
|
+
const description = config.llmsTxt.description;
|
|
20477
|
+
const baseUrl = config.project.baseUrl;
|
|
20478
|
+
const lines = [`# ${title}`];
|
|
20479
|
+
if (description) {
|
|
20480
|
+
lines.push("", `> ${description}`);
|
|
20481
|
+
}
|
|
20482
|
+
const filtered = pages.filter(
|
|
20483
|
+
(p) => p.url !== "/llms.txt" && p.url !== "/llms-full.txt"
|
|
20484
|
+
);
|
|
20485
|
+
const sorted = [...filtered].sort((a, b) => {
|
|
20486
|
+
if (a.depth !== b.depth) return a.depth - b.depth;
|
|
20487
|
+
return b.incomingLinks - a.incomingLinks;
|
|
20488
|
+
});
|
|
20489
|
+
if (sorted.length > 0) {
|
|
20490
|
+
lines.push("", "## Pages", "");
|
|
20491
|
+
for (const page of sorted) {
|
|
20492
|
+
const url = resolvePageUrl(page.url, baseUrl);
|
|
20493
|
+
if (page.description) {
|
|
20494
|
+
lines.push(`- [${page.title}](${url}): ${page.description}`);
|
|
20495
|
+
} else {
|
|
20496
|
+
lines.push(`- [${page.title}](${url})`);
|
|
20497
|
+
}
|
|
20498
|
+
}
|
|
20499
|
+
}
|
|
20500
|
+
lines.push("");
|
|
20501
|
+
return lines.join("\n");
|
|
20502
|
+
}
|
|
20503
|
+
function generateLlmsFullTxt(pages, config) {
|
|
20504
|
+
const title = config.llmsTxt.title ?? config.project.id;
|
|
20505
|
+
const description = config.llmsTxt.description;
|
|
20506
|
+
const baseUrl = config.project.baseUrl;
|
|
20507
|
+
const lines = [`# ${title}`];
|
|
20508
|
+
if (description) {
|
|
20509
|
+
lines.push("", `> ${description}`);
|
|
20510
|
+
}
|
|
20511
|
+
const filtered = pages.filter(
|
|
20512
|
+
(p) => p.url !== "/llms.txt" && p.url !== "/llms-full.txt"
|
|
20513
|
+
);
|
|
20514
|
+
const sorted = [...filtered].sort((a, b) => {
|
|
20515
|
+
if (a.depth !== b.depth) return a.depth - b.depth;
|
|
20516
|
+
return b.incomingLinks - a.incomingLinks;
|
|
20517
|
+
});
|
|
20518
|
+
for (const page of sorted) {
|
|
20519
|
+
const url = resolvePageUrl(page.url, baseUrl);
|
|
20520
|
+
lines.push("", "---", "", `## [${page.title}](${url})`, "");
|
|
20521
|
+
lines.push(page.markdown.trim());
|
|
20522
|
+
}
|
|
20523
|
+
lines.push("");
|
|
20524
|
+
return lines.join("\n");
|
|
20525
|
+
}
|
|
20526
|
+
async function writeLlmsTxt(pages, config, cwd, logger3) {
|
|
20527
|
+
const outputPath = path.resolve(cwd, config.llmsTxt.outputPath);
|
|
20528
|
+
const outputDir = path.dirname(outputPath);
|
|
20529
|
+
await fs8.mkdir(outputDir, { recursive: true });
|
|
20530
|
+
const content = generateLlmsTxt(pages, config);
|
|
20531
|
+
await fs8.writeFile(outputPath, content, "utf8");
|
|
20532
|
+
logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
|
|
20533
|
+
if (config.llmsTxt.generateFull) {
|
|
20534
|
+
const fullPath = outputPath.replace(/\.txt$/, "-full.txt");
|
|
20535
|
+
const fullContent = generateLlmsFullTxt(pages, config);
|
|
20536
|
+
await fs8.writeFile(fullPath, fullContent, "utf8");
|
|
20537
|
+
const relativeFull = path.relative(cwd, fullPath);
|
|
20538
|
+
logger3.info(`Generated llms-full.txt at ${relativeFull}`);
|
|
20539
|
+
}
|
|
20540
|
+
}
|
|
19622
20541
|
|
|
19623
20542
|
// src/indexing/pipeline.ts
|
|
19624
20543
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -19637,16 +20556,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
19637
20556
|
if (joined.length <= maxChars) return joined;
|
|
19638
20557
|
return joined.slice(0, maxChars).trim();
|
|
19639
20558
|
}
|
|
20559
|
+
function buildPageContentHash(page) {
|
|
20560
|
+
const parts = [
|
|
20561
|
+
page.title,
|
|
20562
|
+
page.description ?? "",
|
|
20563
|
+
(page.keywords ?? []).slice().sort().join(","),
|
|
20564
|
+
page.tags.slice().sort().join(","),
|
|
20565
|
+
page.markdown,
|
|
20566
|
+
String(page.outgoingLinks),
|
|
20567
|
+
String(page.publishedAt ?? ""),
|
|
20568
|
+
page.incomingAnchorText ?? "",
|
|
20569
|
+
(page.outgoingLinkUrls ?? []).slice().sort().join(","),
|
|
20570
|
+
page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : ""
|
|
20571
|
+
];
|
|
20572
|
+
return sha256(parts.join("|"));
|
|
20573
|
+
}
|
|
19640
20574
|
var IndexPipeline = class _IndexPipeline {
|
|
19641
20575
|
cwd;
|
|
19642
20576
|
config;
|
|
19643
20577
|
store;
|
|
19644
20578
|
logger;
|
|
20579
|
+
hooks;
|
|
19645
20580
|
constructor(options) {
|
|
19646
20581
|
this.cwd = options.cwd;
|
|
19647
20582
|
this.config = options.config;
|
|
19648
20583
|
this.store = options.store;
|
|
19649
20584
|
this.logger = options.logger;
|
|
20585
|
+
this.hooks = options.hooks;
|
|
19650
20586
|
}
|
|
19651
20587
|
static async create(options = {}) {
|
|
19652
20588
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
@@ -19656,7 +20592,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19656
20592
|
cwd,
|
|
19657
20593
|
config,
|
|
19658
20594
|
store,
|
|
19659
|
-
logger: options.logger ?? new Logger()
|
|
20595
|
+
logger: options.logger ?? new Logger(),
|
|
20596
|
+
hooks: options.hooks ?? {}
|
|
19660
20597
|
});
|
|
19661
20598
|
}
|
|
19662
20599
|
getConfig() {
|
|
@@ -19677,7 +20614,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19677
20614
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19678
20615
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
19679
20616
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19680
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
20617
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
19681
20618
|
if (options.force) {
|
|
19682
20619
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19683
20620
|
}
|
|
@@ -19686,8 +20623,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19686
20623
|
}
|
|
19687
20624
|
const manifestStart = stageStart();
|
|
19688
20625
|
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
20626
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
19689
20627
|
stageEnd("manifest", manifestStart);
|
|
19690
|
-
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
20628
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes, ${existingPageHashes.size} existing page hashes loaded`);
|
|
19691
20629
|
const sourceStart = stageStart();
|
|
19692
20630
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19693
20631
|
let sourcePages;
|
|
@@ -19764,11 +20702,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19764
20702
|
);
|
|
19765
20703
|
continue;
|
|
19766
20704
|
}
|
|
19767
|
-
|
|
20705
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
20706
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
20707
|
+
}
|
|
20708
|
+
let accepted;
|
|
20709
|
+
if (this.hooks.transformPage) {
|
|
20710
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20711
|
+
if (transformed === null) {
|
|
20712
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
20713
|
+
continue;
|
|
20714
|
+
}
|
|
20715
|
+
accepted = transformed;
|
|
20716
|
+
} else {
|
|
20717
|
+
accepted = extracted;
|
|
20718
|
+
}
|
|
20719
|
+
extractedPages.push(accepted);
|
|
19768
20720
|
this.logger.event("page_extracted", {
|
|
19769
|
-
url:
|
|
20721
|
+
url: accepted.url
|
|
19770
20722
|
});
|
|
19771
20723
|
}
|
|
20724
|
+
const customRecords = options.customRecords ?? [];
|
|
20725
|
+
if (customRecords.length > 0) {
|
|
20726
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
20727
|
+
for (const record of customRecords) {
|
|
20728
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20729
|
+
const normalized = normalizeMarkdown(record.content);
|
|
20730
|
+
if (!normalized.trim()) {
|
|
20731
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
20732
|
+
continue;
|
|
20733
|
+
}
|
|
20734
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
20735
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
20736
|
+
const extracted = {
|
|
20737
|
+
url: normalizedUrl,
|
|
20738
|
+
title: record.title,
|
|
20739
|
+
markdown: normalized,
|
|
20740
|
+
outgoingLinks: [],
|
|
20741
|
+
noindex: false,
|
|
20742
|
+
tags,
|
|
20743
|
+
weight: record.weight
|
|
20744
|
+
};
|
|
20745
|
+
let accepted;
|
|
20746
|
+
if (this.hooks.transformPage) {
|
|
20747
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
20748
|
+
if (transformed === null) {
|
|
20749
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
20750
|
+
continue;
|
|
20751
|
+
}
|
|
20752
|
+
accepted = transformed;
|
|
20753
|
+
} else {
|
|
20754
|
+
accepted = extracted;
|
|
20755
|
+
}
|
|
20756
|
+
extractedPages.push(accepted);
|
|
20757
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
20758
|
+
}
|
|
20759
|
+
}
|
|
19772
20760
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
19773
20761
|
const uniquePages = [];
|
|
19774
20762
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -19801,15 +20789,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19801
20789
|
const linkStart = stageStart();
|
|
19802
20790
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19803
20791
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20792
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
19804
20793
|
for (const page of indexablePages) {
|
|
19805
20794
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19806
20795
|
}
|
|
19807
20796
|
for (const page of indexablePages) {
|
|
19808
|
-
|
|
20797
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
20798
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
20799
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
19809
20800
|
if (!pageSet.has(outgoing)) {
|
|
19810
20801
|
continue;
|
|
19811
20802
|
}
|
|
19812
|
-
|
|
20803
|
+
if (!seenForCount.has(outgoing)) {
|
|
20804
|
+
seenForCount.add(outgoing);
|
|
20805
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
20806
|
+
}
|
|
20807
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
20808
|
+
seenForAnchor.add(outgoing);
|
|
20809
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
20810
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
20811
|
+
}
|
|
20812
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
20813
|
+
}
|
|
19813
20814
|
}
|
|
19814
20815
|
}
|
|
19815
20816
|
stageEnd("links", linkStart);
|
|
@@ -19828,6 +20829,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19828
20829
|
});
|
|
19829
20830
|
}
|
|
19830
20831
|
}
|
|
20832
|
+
for (const record of customRecords) {
|
|
20833
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
20834
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
20835
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
20836
|
+
routeFile: "",
|
|
20837
|
+
routeResolution: "exact"
|
|
20838
|
+
});
|
|
20839
|
+
}
|
|
20840
|
+
}
|
|
19831
20841
|
for (const page of indexablePages) {
|
|
19832
20842
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19833
20843
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -19845,6 +20855,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19845
20855
|
} else {
|
|
19846
20856
|
routeExact += 1;
|
|
19847
20857
|
}
|
|
20858
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
20859
|
+
let incomingAnchorText;
|
|
20860
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
20861
|
+
let joined = "";
|
|
20862
|
+
for (const phrase of anchorSet) {
|
|
20863
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
20864
|
+
if (next2.length > 500) break;
|
|
20865
|
+
joined = next2;
|
|
20866
|
+
}
|
|
20867
|
+
incomingAnchorText = joined || void 0;
|
|
20868
|
+
}
|
|
19848
20869
|
const indexedPage = {
|
|
19849
20870
|
url: page.url,
|
|
19850
20871
|
title: page.title,
|
|
@@ -19854,40 +20875,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19854
20875
|
generatedAt: nowIso(),
|
|
19855
20876
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
19856
20877
|
outgoingLinks: page.outgoingLinks.length,
|
|
20878
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
19857
20879
|
depth: getUrlDepth(page.url),
|
|
19858
20880
|
tags: page.tags,
|
|
19859
20881
|
markdown: page.markdown,
|
|
19860
20882
|
description: page.description,
|
|
19861
|
-
keywords: page.keywords
|
|
20883
|
+
keywords: page.keywords,
|
|
20884
|
+
publishedAt: page.publishedAt,
|
|
20885
|
+
incomingAnchorText,
|
|
20886
|
+
meta: page.meta
|
|
19862
20887
|
};
|
|
19863
20888
|
pages.push(indexedPage);
|
|
19864
20889
|
this.logger.event("page_indexed", { url: page.url });
|
|
19865
20890
|
}
|
|
20891
|
+
const pageRecords = pages.map((p) => {
|
|
20892
|
+
const summary = buildPageSummary(p);
|
|
20893
|
+
return {
|
|
20894
|
+
url: p.url,
|
|
20895
|
+
title: p.title,
|
|
20896
|
+
markdown: p.markdown,
|
|
20897
|
+
projectId: scope.projectId,
|
|
20898
|
+
scopeName: scope.scopeName,
|
|
20899
|
+
routeFile: p.routeFile,
|
|
20900
|
+
routeResolution: p.routeResolution,
|
|
20901
|
+
incomingLinks: p.incomingLinks,
|
|
20902
|
+
outgoingLinks: p.outgoingLinks,
|
|
20903
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
20904
|
+
depth: p.depth,
|
|
20905
|
+
tags: p.tags,
|
|
20906
|
+
indexedAt: p.generatedAt,
|
|
20907
|
+
summary,
|
|
20908
|
+
description: p.description,
|
|
20909
|
+
keywords: p.keywords,
|
|
20910
|
+
contentHash: buildPageContentHash(p),
|
|
20911
|
+
publishedAt: p.publishedAt,
|
|
20912
|
+
meta: p.meta
|
|
20913
|
+
};
|
|
20914
|
+
});
|
|
20915
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
20916
|
+
const changedPages = pageRecords.filter(
|
|
20917
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
20918
|
+
);
|
|
20919
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
19866
20920
|
if (!options.dryRun) {
|
|
19867
|
-
|
|
19868
|
-
|
|
19869
|
-
|
|
19870
|
-
|
|
19871
|
-
|
|
19872
|
-
|
|
19873
|
-
|
|
19874
|
-
|
|
19875
|
-
|
|
19876
|
-
|
|
19877
|
-
|
|
19878
|
-
|
|
19879
|
-
|
|
19880
|
-
|
|
19881
|
-
|
|
19882
|
-
|
|
19883
|
-
|
|
19884
|
-
|
|
19885
|
-
|
|
19886
|
-
|
|
19887
|
-
|
|
19888
|
-
|
|
20921
|
+
if (options.force) {
|
|
20922
|
+
await this.store.deletePages(scope);
|
|
20923
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
20924
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
20925
|
+
id: r.url,
|
|
20926
|
+
data: r.summary ?? r.title,
|
|
20927
|
+
metadata: {
|
|
20928
|
+
title: r.title,
|
|
20929
|
+
url: r.url,
|
|
20930
|
+
description: r.description ?? "",
|
|
20931
|
+
keywords: r.keywords ?? [],
|
|
20932
|
+
summary: r.summary ?? "",
|
|
20933
|
+
tags: r.tags,
|
|
20934
|
+
markdown: r.markdown,
|
|
20935
|
+
routeFile: r.routeFile,
|
|
20936
|
+
routeResolution: r.routeResolution,
|
|
20937
|
+
incomingLinks: r.incomingLinks,
|
|
20938
|
+
outgoingLinks: r.outgoingLinks,
|
|
20939
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
20940
|
+
depth: r.depth,
|
|
20941
|
+
indexedAt: r.indexedAt,
|
|
20942
|
+
contentHash: r.contentHash ?? "",
|
|
20943
|
+
publishedAt: r.publishedAt ?? null,
|
|
20944
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
20945
|
+
}
|
|
20946
|
+
}));
|
|
20947
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
20948
|
+
} else {
|
|
20949
|
+
if (changedPages.length > 0) {
|
|
20950
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
20951
|
+
const pageDocs = changedPages.map((r) => ({
|
|
20952
|
+
id: r.url,
|
|
20953
|
+
data: r.summary ?? r.title,
|
|
20954
|
+
metadata: {
|
|
20955
|
+
title: r.title,
|
|
20956
|
+
url: r.url,
|
|
20957
|
+
description: r.description ?? "",
|
|
20958
|
+
keywords: r.keywords ?? [],
|
|
20959
|
+
summary: r.summary ?? "",
|
|
20960
|
+
tags: r.tags,
|
|
20961
|
+
markdown: r.markdown,
|
|
20962
|
+
routeFile: r.routeFile,
|
|
20963
|
+
routeResolution: r.routeResolution,
|
|
20964
|
+
incomingLinks: r.incomingLinks,
|
|
20965
|
+
outgoingLinks: r.outgoingLinks,
|
|
20966
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
20967
|
+
depth: r.depth,
|
|
20968
|
+
indexedAt: r.indexedAt,
|
|
20969
|
+
contentHash: r.contentHash ?? "",
|
|
20970
|
+
publishedAt: r.publishedAt ?? null,
|
|
20971
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
20972
|
+
}
|
|
20973
|
+
}));
|
|
20974
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
20975
|
+
}
|
|
20976
|
+
if (deletedPageUrls.length > 0) {
|
|
20977
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
20978
|
+
}
|
|
20979
|
+
}
|
|
19889
20980
|
}
|
|
20981
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
20982
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
19890
20983
|
stageEnd("pages", pagesStart);
|
|
20984
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
19891
20985
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
19892
20986
|
const chunkStart = stageStart();
|
|
19893
20987
|
this.logger.info("Chunking pages...");
|
|
@@ -19896,6 +20990,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19896
20990
|
if (typeof maxChunks === "number") {
|
|
19897
20991
|
chunks = chunks.slice(0, maxChunks);
|
|
19898
20992
|
}
|
|
20993
|
+
if (this.hooks.transformChunk) {
|
|
20994
|
+
const transformed = [];
|
|
20995
|
+
for (const chunk of chunks) {
|
|
20996
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
20997
|
+
if (result === null) {
|
|
20998
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
20999
|
+
continue;
|
|
21000
|
+
}
|
|
21001
|
+
transformed.push(result);
|
|
21002
|
+
}
|
|
21003
|
+
chunks = transformed;
|
|
21004
|
+
}
|
|
19899
21005
|
for (const chunk of chunks) {
|
|
19900
21006
|
this.logger.event("chunked", {
|
|
19901
21007
|
url: chunk.url,
|
|
@@ -19908,7 +21014,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19908
21014
|
for (const chunk of chunks) {
|
|
19909
21015
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
19910
21016
|
}
|
|
19911
|
-
|
|
21017
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
19912
21018
|
if (options.force) {
|
|
19913
21019
|
return true;
|
|
19914
21020
|
}
|
|
@@ -19922,36 +21028,43 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19922
21028
|
return existingHash !== chunk.contentHash;
|
|
19923
21029
|
});
|
|
19924
21030
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
21031
|
+
if (this.hooks.beforeIndex) {
|
|
21032
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
21033
|
+
}
|
|
19925
21034
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19926
21035
|
const upsertStart = stageStart();
|
|
19927
21036
|
let documentsUpserted = 0;
|
|
19928
21037
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
19929
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
19930
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
21038
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
19931
21039
|
const docs = changedChunks.map((chunk) => {
|
|
19932
|
-
const
|
|
19933
|
-
|
|
19934
|
-
|
|
19935
|
-
|
|
19936
|
-
|
|
19937
|
-
|
|
19938
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
19939
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
21040
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
21041
|
+
if (embeddingText.length > 2e3) {
|
|
21042
|
+
this.logger.warn(
|
|
21043
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
21044
|
+
);
|
|
21045
|
+
}
|
|
19940
21046
|
return {
|
|
19941
21047
|
id: chunk.chunkKey,
|
|
19942
|
-
|
|
21048
|
+
data: embeddingText,
|
|
19943
21049
|
metadata: {
|
|
19944
|
-
|
|
19945
|
-
scopeName: scope.scopeName,
|
|
21050
|
+
url: chunk.url,
|
|
19946
21051
|
path: chunk.path,
|
|
21052
|
+
title: chunk.title,
|
|
21053
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
21054
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
19947
21055
|
snippet: chunk.snippet,
|
|
21056
|
+
chunkText: embeddingText,
|
|
21057
|
+
tags: chunk.tags,
|
|
19948
21058
|
ordinal: chunk.ordinal,
|
|
19949
21059
|
contentHash: chunk.contentHash,
|
|
19950
21060
|
depth: chunk.depth,
|
|
19951
21061
|
incomingLinks: chunk.incomingLinks,
|
|
19952
21062
|
routeFile: chunk.routeFile,
|
|
19953
21063
|
description: chunk.description ?? "",
|
|
19954
|
-
keywords:
|
|
21064
|
+
keywords: chunk.keywords ?? [],
|
|
21065
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
21066
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
21067
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
19955
21068
|
}
|
|
19956
21069
|
};
|
|
19957
21070
|
});
|
|
@@ -19969,9 +21082,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19969
21082
|
} else {
|
|
19970
21083
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
19971
21084
|
}
|
|
21085
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
21086
|
+
const llmsStart = stageStart();
|
|
21087
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
21088
|
+
stageEnd("llms_txt", llmsStart);
|
|
21089
|
+
}
|
|
19972
21090
|
this.logger.info("Done.");
|
|
19973
|
-
|
|
21091
|
+
const stats = {
|
|
19974
21092
|
pagesProcessed: pages.length,
|
|
21093
|
+
pagesChanged,
|
|
21094
|
+
pagesDeleted,
|
|
19975
21095
|
chunksTotal: chunks.length,
|
|
19976
21096
|
chunksChanged: changedChunks.length,
|
|
19977
21097
|
documentsUpserted,
|
|
@@ -19980,16 +21100,143 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19980
21100
|
routeBestEffort,
|
|
19981
21101
|
stageTimingsMs
|
|
19982
21102
|
};
|
|
21103
|
+
if (this.hooks.afterIndex) {
|
|
21104
|
+
await this.hooks.afterIndex(stats);
|
|
21105
|
+
}
|
|
21106
|
+
return stats;
|
|
19983
21107
|
}
|
|
19984
21108
|
};
|
|
21109
|
+
|
|
21110
|
+
// src/search/related-pages.ts
|
|
21111
|
+
/**
 * Dice-style similarity between two URL paths, based on how many leading
 * path segments they share.
 *
 * Empty segments (leading/trailing/duplicate slashes) are ignored. Only the
 * common *prefix* counts: comparison stops at the first differing segment.
 *
 * @param {string} urlA - First URL path.
 * @param {string} urlB - Second URL path.
 * @returns {number} `2 * sharedPrefix / (segmentsA + segmentsB)`; 1 when both
 *   paths have no segments, 0 when exactly one of them has none.
 */
function diceScore(urlA, urlB) {
  const toSegments = (url) => url.split("/").filter((part) => part.length > 0);
  const partsA = toSegments(urlA);
  const partsB = toSegments(urlB);
  const totalSegments = partsA.length + partsB.length;
  if (totalSegments === 0) return 1;
  if (partsA.length === 0 || partsB.length === 0) return 0;
  // Index of the first segment of A that B does not match (or lacks entirely);
  // -1 means all of A is a prefix of B.
  const firstMismatch = partsA.findIndex((part, idx) => part !== partsB[idx]);
  const sharedPrefix = firstMismatch === -1 ? partsA.length : firstMismatch;
  return 2 * sharedPrefix / totalSegments;
}
|
|
21127
|
+
/**
 * Combines the three relatedness signals into one score.
 *
 * @param {boolean} isLinked - Whether the pages link to each other in either direction.
 * @param {number} dice - URL-path similarity (see `diceScore`).
 * @param {number} semantic - Semantic similarity score of the candidate.
 * @returns {number} 0.5 link bonus (if linked) + 0.3 * dice + 0.2 * semantic.
 */
function compositeScore(isLinked, dice, semantic) {
  const linkBonus = isLinked ? 0.5 : 0;
  const pathPart = 0.3 * dice;
  const semanticPart = 0.2 * semantic;
  // Summed left to right, in the same order as the weights are listed above.
  return linkBonus + pathPart + semanticPart;
}
|
|
21130
|
+
/**
 * Picks the single label that best describes why a page is related.
 *
 * Precedence: outgoing link, then incoming link, then URL-path sibling
 * (dice strictly above 0.4), otherwise purely semantic.
 *
 * @param {boolean} isOutgoing - Source page links to the candidate.
 * @param {boolean} isIncoming - Candidate links back to the source page.
 * @param {number} dice - URL-path similarity between the two pages.
 * @returns {"outgoing_link"|"incoming_link"|"sibling"|"semantic"}
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  if (isOutgoing || isIncoming) {
    return isOutgoing ? "outgoing_link" : "incoming_link";
  }
  return dice > 0.4 ? "sibling" : "semantic";
}
|
|
21136
|
+
|
|
21137
|
+
// src/search/engine.ts
|
|
21138
|
+
// Validation schema for the optional `rankingOverrides` field of a search
// request. Every field is optional; ratio-like values are limited to [0, 1].
var rankingOverridesSchema = (() => {
  // Small builders so each repeated field shape is spelled out once.
  const optionalFlag = () => z.boolean().optional();
  const optionalRatio = () => z.number().min(0).max(1).optional();
  const optionalWeight = () => z.number().optional();

  const weights = z.object({
    incomingLinks: optionalWeight(),
    depth: optionalWeight(),
    aggregation: optionalWeight(),
    titleMatch: optionalWeight()
  }).optional();

  const ranking = z.object({
    enableIncomingLinkBoost: optionalFlag(),
    enableDepthBoost: optionalFlag(),
    aggregationCap: z.number().int().positive().optional(),
    aggregationDecay: optionalRatio(),
    minChunkScoreRatio: optionalRatio(),
    minScoreRatio: optionalRatio(),
    scoreGapThreshold: optionalRatio(),
    weights
  }).optional();

  const search = z.object({
    pageSearchWeight: optionalRatio()
  }).optional();

  return z.object({ ranking, search }).optional();
})();
|
|
19985
21158
|
var requestSchema = z.object({
|
|
19986
21159
|
q: z.string().trim().min(1),
|
|
19987
21160
|
topK: z.number().int().positive().max(100).optional(),
|
|
19988
21161
|
scope: z.string().optional(),
|
|
19989
21162
|
pathPrefix: z.string().optional(),
|
|
19990
21163
|
tags: z.array(z.string()).optional(),
|
|
19991
|
-
|
|
21164
|
+
filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
|
|
21165
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
21166
|
+
maxSubResults: z.number().int().positive().max(20).optional(),
|
|
21167
|
+
debug: z.boolean().optional(),
|
|
21168
|
+
rankingOverrides: rankingOverridesSchema
|
|
19992
21169
|
});
|
|
21170
|
+
// Upper bound on how many pages are loaded when building a site-structure tree.
var MAX_SITE_STRUCTURE_PAGES = 2000;

/**
 * Creates an empty site-structure tree node.
 *
 * @param {string} url - URL path the node represents.
 * @param {number} depth - Number of path segments below the root.
 * @returns {{url: string, title: string, depth: number, routeFile: string,
 *   isIndexed: boolean, childCount: number, children: Array}} A fresh node
 *   with blank title/routeFile, not marked as indexed, and no children.
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
|
|
21174
|
+
/**
 * Builds a URL-path hierarchy from a flat list of indexed pages.
 *
 * Every path segment becomes a node; intermediate nodes that have no page of
 * their own are created as placeholders (`isIndexed: false`). Children are
 * sorted by URL and `childCount` is filled in for every node.
 *
 * Map keys are always re-derived from the split segments ("/" + segments
 * joined by "/"), so a trailing or duplicated slash in the normalized URL can
 * no longer cause a failed lookup and a TypeError on the missing node.
 * (`normalizeUrlPath` is defined elsewhere; its exact output is not relied on.)
 *
 * @param {Array<{url: string, title: string, routeFile: string}>} pages - Indexed pages.
 * @param {string} [pathPrefix] - When given, the subtree rooted at this path is
 *   returned instead of the whole tree.
 * @returns {object} The root node, the subtree root for `pathPrefix`, or an
 *   empty placeholder node when no page lives under `pathPrefix`.
 */
function buildTree(pages, pathPrefix) {
  const splitPath = (urlPath) => urlPath.split("/").filter(Boolean);
  // Canonical key for a segment list; an empty list maps to the root "/".
  const keyFor = (segments) => `/${segments.join("/")}`;

  const nodeMap = new Map();
  const root2 = makeNode("/", 0);
  nodeMap.set("/", root2);

  for (const page of pages) {
    const segments = splitPath(normalizeUrlPath(page.url));
    // Make sure every ancestor (and the page's own node) exists.
    for (let i = 1; i <= segments.length; i++) {
      const partialUrl = keyFor(segments.slice(0, i));
      if (!nodeMap.has(partialUrl)) {
        nodeMap.set(partialUrl, makeNode(partialUrl, i));
      }
    }
    // With zero segments this resolves to the root node.
    const node = nodeMap.get(keyFor(segments));
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  }

  // Attach each non-root node to its parent.
  for (const [url, node] of nodeMap) {
    if (url === "/") continue;
    const parentUrl = keyFor(splitPath(url).slice(0, -1));
    const parent = nodeMap.get(parentUrl) ?? root2;
    parent.children.push(node);
  }

  const sortAndCount = (node) => {
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      sortAndCount(child);
    }
  };
  sortAndCount(root2);

  if (pathPrefix) {
    const normalizedPrefix = normalizeUrlPath(pathPrefix);
    const prefixSegments = splitPath(normalizedPrefix);
    return nodeMap.get(keyFor(prefixSegments)) ?? makeNode(normalizedPrefix, prefixSegments.length);
  }
  return root2;
}
|
|
21223
|
+
/**
 * Returns a copy of `base` with per-request ranking/search overrides applied.
 *
 * `search`, `ranking` and `ranking.weights` are merged one level each; all
 * other top-level keys of `base` are carried over unchanged. `base` itself is
 * not mutated.
 *
 * Override keys whose value is `undefined` are ignored. A plain spread would
 * copy them and replace a configured number with `undefined`, which then
 * poisons downstream score arithmetic.
 *
 * @param {object} base - Resolved config; must contain `search`, `ranking`
 *   and `ranking.weights` objects.
 * @param {{ranking?: object, search?: object}} overrides - Validated overrides.
 * @returns {object} New config object with the overrides applied.
 */
function mergeRankingOverrides(base, overrides) {
  // Keep only entries that actually carry a value.
  const definedOnly = (obj) => Object.fromEntries(
    Object.entries(obj ?? {}).filter(([, value]) => value !== void 0)
  );
  // `weights` is merged separately so a partial override keeps the other weights.
  const { weights: weightOverrides, ...rankingOverrides } = overrides.ranking ?? {};
  return {
    ...base,
    search: {
      ...base.search,
      ...definedOnly(overrides.search)
    },
    ranking: {
      ...base.ranking,
      ...definedOnly(rankingOverrides),
      weights: {
        ...base.ranking.weights,
        ...definedOnly(weightOverrides)
      }
    }
  };
}
|
|
19993
21240
|
var SearchEngine = class _SearchEngine {
|
|
19994
21241
|
cwd;
|
|
19995
21242
|
config;
|
|
@@ -20019,125 +21266,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
20019
21266
|
}
|
|
20020
21267
|
const input = parsed.data;
|
|
20021
21268
|
const totalStart = process.hrtime.bigint();
|
|
21269
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
20022
21270
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20023
21271
|
const topK = input.topK ?? 10;
|
|
21272
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
20024
21273
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20025
|
-
const
|
|
20026
|
-
const
|
|
20027
|
-
|
|
20028
|
-
|
|
20029
|
-
|
|
20030
|
-
|
|
20031
|
-
|
|
20032
|
-
|
|
20033
|
-
|
|
21274
|
+
const queryText = input.q;
|
|
21275
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
21276
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
21277
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
21278
|
+
const metaFilter = metaFilterStr || void 0;
|
|
21279
|
+
const applyPagePostFilters = (hits) => {
|
|
21280
|
+
let filtered = hits;
|
|
21281
|
+
if (pathPrefix) {
|
|
21282
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
21283
|
+
}
|
|
21284
|
+
if (filterTags) {
|
|
21285
|
+
filtered = filtered.filter(
|
|
21286
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
21287
|
+
);
|
|
20034
21288
|
}
|
|
20035
|
-
|
|
20036
|
-
|
|
20037
|
-
const
|
|
21289
|
+
return filtered;
|
|
21290
|
+
};
|
|
21291
|
+
const applyChunkPostFilters = (hits) => {
|
|
21292
|
+
let filtered = hits;
|
|
21293
|
+
if (filterTags) {
|
|
21294
|
+
filtered = filtered.filter(
|
|
21295
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21296
|
+
);
|
|
21297
|
+
}
|
|
21298
|
+
return filtered;
|
|
21299
|
+
};
|
|
20038
21300
|
const searchStart = process.hrtime.bigint();
|
|
20039
|
-
|
|
20040
|
-
|
|
20041
|
-
const
|
|
20042
|
-
const
|
|
20043
|
-
|
|
20044
|
-
|
|
20045
|
-
|
|
20046
|
-
|
|
20047
|
-
|
|
20048
|
-
|
|
20049
|
-
|
|
20050
|
-
|
|
20051
|
-
|
|
20052
|
-
|
|
20053
|
-
|
|
20054
|
-
|
|
20055
|
-
|
|
20056
|
-
{
|
|
20057
|
-
limit: chunkLimit,
|
|
20058
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20059
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20060
|
-
reranking: false,
|
|
20061
|
-
filter
|
|
20062
|
-
},
|
|
21301
|
+
if (groupByPage) {
|
|
21302
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
21303
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
21304
|
+
const pageHits = await this.store.searchPagesByText(
|
|
21305
|
+
queryText,
|
|
21306
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
21307
|
+
resolvedScope
|
|
21308
|
+
);
|
|
21309
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
21310
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
21311
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
21312
|
+
const topPages = rankedPages.slice(0, topK);
|
|
21313
|
+
const chunkPromises = topPages.map(
|
|
21314
|
+
(page) => this.store.searchChunksByUrl(
|
|
21315
|
+
queryText,
|
|
21316
|
+
page.url,
|
|
21317
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
20063
21318
|
resolvedScope
|
|
20064
|
-
)
|
|
20065
|
-
|
|
20066
|
-
const
|
|
20067
|
-
|
|
21319
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
21320
|
+
);
|
|
21321
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
21322
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21323
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
21324
|
+
return {
|
|
21325
|
+
q: input.q,
|
|
21326
|
+
scope: resolvedScope.scopeName,
|
|
21327
|
+
results,
|
|
21328
|
+
meta: {
|
|
21329
|
+
timingsMs: {
|
|
21330
|
+
search: Math.round(searchMs),
|
|
21331
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21332
|
+
}
|
|
21333
|
+
}
|
|
21334
|
+
};
|
|
20068
21335
|
} else {
|
|
21336
|
+
const candidateK = Math.max(50, topK);
|
|
21337
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
20069
21338
|
const hits = await this.store.search(
|
|
20070
|
-
|
|
20071
|
-
{
|
|
20072
|
-
limit: candidateK,
|
|
20073
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
20074
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
20075
|
-
reranking: this.config.search.reranking,
|
|
20076
|
-
filter
|
|
20077
|
-
},
|
|
21339
|
+
queryText,
|
|
21340
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
20078
21341
|
resolvedScope
|
|
20079
21342
|
);
|
|
20080
|
-
|
|
20081
|
-
|
|
20082
|
-
|
|
20083
|
-
|
|
20084
|
-
|
|
20085
|
-
|
|
20086
|
-
|
|
20087
|
-
|
|
20088
|
-
|
|
20089
|
-
|
|
20090
|
-
|
|
20091
|
-
|
|
21343
|
+
let filtered = hits;
|
|
21344
|
+
if (pathPrefix) {
|
|
21345
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
21346
|
+
}
|
|
21347
|
+
if (filterTags) {
|
|
21348
|
+
filtered = filtered.filter(
|
|
21349
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
21350
|
+
);
|
|
21351
|
+
}
|
|
21352
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
21353
|
+
const searchMs = hrTimeMs(searchStart);
|
|
21354
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
21355
|
+
return {
|
|
21356
|
+
q: input.q,
|
|
21357
|
+
scope: resolvedScope.scopeName,
|
|
21358
|
+
results,
|
|
21359
|
+
meta: {
|
|
21360
|
+
timingsMs: {
|
|
21361
|
+
search: Math.round(searchMs),
|
|
21362
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
21363
|
+
}
|
|
20092
21364
|
}
|
|
21365
|
+
};
|
|
21366
|
+
}
|
|
21367
|
+
}
|
|
21368
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
21369
|
+
return rankedPages.map((page, i) => {
|
|
21370
|
+
const chunks = allChunks[i] ?? [];
|
|
21371
|
+
const bestChunk = chunks[0];
|
|
21372
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
21373
|
+
const result = {
|
|
21374
|
+
url: page.url,
|
|
21375
|
+
title: page.title,
|
|
21376
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
21377
|
+
snippet,
|
|
21378
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
21379
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
21380
|
+
routeFile: page.routeFile,
|
|
21381
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
21382
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
21383
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
21384
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
21385
|
+
headingPath: c.metadata.headingPath,
|
|
21386
|
+
score: Number(c.score.toFixed(6))
|
|
21387
|
+
})) : void 0
|
|
21388
|
+
};
|
|
21389
|
+
if (debug && page.breakdown) {
|
|
21390
|
+
result.breakdown = {
|
|
21391
|
+
baseScore: page.breakdown.baseScore,
|
|
21392
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
21393
|
+
depthBoost: page.breakdown.depthBoost,
|
|
21394
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
21395
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
21396
|
+
anchorTextMatchBoost: 0
|
|
21397
|
+
};
|
|
20093
21398
|
}
|
|
20094
|
-
|
|
21399
|
+
return result;
|
|
21400
|
+
});
|
|
20095
21401
|
}
|
|
20096
|
-
ensureSnippet(hit) {
|
|
21402
|
+
ensureSnippet(hit, query) {
|
|
21403
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
21404
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
20097
21405
|
const snippet = hit.hit.metadata.snippet;
|
|
20098
21406
|
if (snippet && snippet.length >= 30) return snippet;
|
|
20099
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
20100
21407
|
if (chunkText) return toSnippet(chunkText);
|
|
20101
21408
|
return snippet || "";
|
|
20102
21409
|
}
|
|
20103
|
-
buildResults(ordered, topK, groupByPage,
|
|
21410
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
21411
|
+
const cfg = config ?? this.config;
|
|
20104
21412
|
if (groupByPage) {
|
|
20105
|
-
let pages = aggregateByPage(ordered,
|
|
20106
|
-
pages = trimByScoreGap(pages,
|
|
20107
|
-
const minRatio =
|
|
21413
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
21414
|
+
pages = trimByScoreGap(pages, cfg);
|
|
21415
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
20108
21416
|
return pages.slice(0, topK).map((page) => {
|
|
20109
21417
|
const bestScore = page.bestChunk.finalScore;
|
|
20110
21418
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20111
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
20112
|
-
|
|
21419
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
21420
|
+
const result = {
|
|
20113
21421
|
url: page.url,
|
|
20114
21422
|
title: page.title,
|
|
20115
21423
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
20116
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
21424
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
21425
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
20117
21426
|
score: Number(page.pageScore.toFixed(6)),
|
|
20118
21427
|
routeFile: page.routeFile,
|
|
20119
|
-
chunks: meaningful.length
|
|
21428
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
20120
21429
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
20121
|
-
snippet: this.ensureSnippet(c),
|
|
21430
|
+
snippet: this.ensureSnippet(c, query),
|
|
21431
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
20122
21432
|
headingPath: c.hit.metadata.headingPath,
|
|
20123
21433
|
score: Number(c.finalScore.toFixed(6))
|
|
20124
21434
|
})) : void 0
|
|
20125
21435
|
};
|
|
21436
|
+
if (debug && page.bestChunk.breakdown) {
|
|
21437
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
21438
|
+
}
|
|
21439
|
+
return result;
|
|
20126
21440
|
});
|
|
20127
21441
|
} else {
|
|
20128
21442
|
let filtered = ordered;
|
|
20129
|
-
const
|
|
20130
|
-
if (
|
|
20131
|
-
|
|
20132
|
-
|
|
20133
|
-
|
|
20134
|
-
|
|
20135
|
-
|
|
20136
|
-
|
|
20137
|
-
|
|
20138
|
-
|
|
20139
|
-
|
|
20140
|
-
|
|
21443
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
21444
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
21445
|
+
const topScore = ordered[0].finalScore;
|
|
21446
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
21447
|
+
const threshold = topScore * minScoreRatio;
|
|
21448
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
21449
|
+
}
|
|
21450
|
+
}
|
|
21451
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
21452
|
+
const result = {
|
|
21453
|
+
url: hit.metadata.url,
|
|
21454
|
+
title: hit.metadata.title,
|
|
21455
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
21456
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
21457
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
21458
|
+
score: Number(finalScore.toFixed(6)),
|
|
21459
|
+
routeFile: hit.metadata.routeFile
|
|
21460
|
+
};
|
|
21461
|
+
if (debug && breakdown) {
|
|
21462
|
+
result.breakdown = breakdown;
|
|
21463
|
+
}
|
|
21464
|
+
return result;
|
|
21465
|
+
});
|
|
20141
21466
|
}
|
|
20142
21467
|
}
|
|
20143
21468
|
async getPage(pathOrUrl, scope) {
|
|
@@ -20163,6 +21488,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
20163
21488
|
markdown: page.markdown
|
|
20164
21489
|
};
|
|
20165
21490
|
}
|
|
21491
|
+
async listPages(opts) {
|
|
21492
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21493
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
21494
|
+
return this.store.listPages(resolvedScope, {
|
|
21495
|
+
cursor: opts?.cursor,
|
|
21496
|
+
limit: opts?.limit,
|
|
21497
|
+
pathPrefix
|
|
21498
|
+
});
|
|
21499
|
+
}
|
|
21500
|
+
async getSiteStructure(opts) {
|
|
21501
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
21502
|
+
const allPages = [];
|
|
21503
|
+
let cursor;
|
|
21504
|
+
let truncated = false;
|
|
21505
|
+
do {
|
|
21506
|
+
const result = await this.listPages({
|
|
21507
|
+
pathPrefix: opts?.pathPrefix,
|
|
21508
|
+
scope: opts?.scope,
|
|
21509
|
+
cursor,
|
|
21510
|
+
limit: 200
|
|
21511
|
+
});
|
|
21512
|
+
allPages.push(...result.pages);
|
|
21513
|
+
cursor = result.nextCursor;
|
|
21514
|
+
if (allPages.length >= maxPages) {
|
|
21515
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
21516
|
+
allPages.length = maxPages;
|
|
21517
|
+
break;
|
|
21518
|
+
}
|
|
21519
|
+
} while (cursor);
|
|
21520
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
21521
|
+
return {
|
|
21522
|
+
root: root2,
|
|
21523
|
+
totalPages: allPages.length,
|
|
21524
|
+
truncated
|
|
21525
|
+
};
|
|
21526
|
+
}
|
|
21527
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
21528
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
21529
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
21530
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
21531
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
21532
|
+
if (!source) {
|
|
21533
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
21534
|
+
}
|
|
21535
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
21536
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
21537
|
+
source.vector,
|
|
21538
|
+
{ limit: 50 },
|
|
21539
|
+
resolvedScope
|
|
21540
|
+
);
|
|
21541
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
21542
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
21543
|
+
for (const hit of filteredHits) {
|
|
21544
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
21545
|
+
}
|
|
21546
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
21547
|
+
for (const hit of filteredHits) {
|
|
21548
|
+
candidateUrls.add(hit.url);
|
|
21549
|
+
}
|
|
21550
|
+
for (const url of sourceOutgoing) {
|
|
21551
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
21552
|
+
}
|
|
21553
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
21554
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
21555
|
+
);
|
|
21556
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
21557
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
21558
|
+
for (const hit of filteredHits) {
|
|
21559
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
21560
|
+
}
|
|
21561
|
+
for (const p of fetchedPages) {
|
|
21562
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
21563
|
+
}
|
|
21564
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
21565
|
+
if (semanticUrls.length > 0) {
|
|
21566
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
21567
|
+
for (const p of semanticPageData) {
|
|
21568
|
+
const existing = metaMap.get(p.url);
|
|
21569
|
+
if (existing) {
|
|
21570
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
21571
|
+
}
|
|
21572
|
+
}
|
|
21573
|
+
}
|
|
21574
|
+
const candidates = [];
|
|
21575
|
+
for (const url of candidateUrls) {
|
|
21576
|
+
const meta = metaMap.get(url);
|
|
21577
|
+
if (!meta) continue;
|
|
21578
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
21579
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
21580
|
+
const isLinked = isOutgoing || isIncoming;
|
|
21581
|
+
const dice = diceScore(urlPath, url);
|
|
21582
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
21583
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
21584
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
21585
|
+
candidates.push({
|
|
21586
|
+
url,
|
|
21587
|
+
title: meta.title,
|
|
21588
|
+
score: Number(score.toFixed(6)),
|
|
21589
|
+
relationshipType,
|
|
21590
|
+
routeFile: meta.routeFile
|
|
21591
|
+
});
|
|
21592
|
+
}
|
|
21593
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
21594
|
+
const results = candidates.slice(0, topK);
|
|
21595
|
+
return {
|
|
21596
|
+
sourceUrl: urlPath,
|
|
21597
|
+
scope: resolvedScope.scopeName,
|
|
21598
|
+
relatedPages: results
|
|
21599
|
+
};
|
|
21600
|
+
}
|
|
20166
21601
|
async health() {
|
|
20167
21602
|
return this.store.health();
|
|
20168
21603
|
}
|
|
@@ -20185,14 +21620,40 @@ function createServer(engine) {
|
|
|
20185
21620
|
server.registerTool(
|
|
20186
21621
|
"search",
|
|
20187
21622
|
{
|
|
20188
|
-
description:
|
|
21623
|
+
description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
|
|
20189
21624
|
inputSchema: {
|
|
20190
21625
|
query: z.string().min(1),
|
|
20191
21626
|
scope: z.string().optional(),
|
|
20192
21627
|
topK: z.number().int().positive().max(100).optional(),
|
|
20193
21628
|
pathPrefix: z.string().optional(),
|
|
20194
21629
|
tags: z.array(z.string()).optional(),
|
|
20195
|
-
|
|
21630
|
+
filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
|
|
21631
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
21632
|
+
maxSubResults: z.number().int().positive().max(20).optional()
|
|
21633
|
+
},
|
|
21634
|
+
outputSchema: {
|
|
21635
|
+
q: z.string(),
|
|
21636
|
+
scope: z.string(),
|
|
21637
|
+
results: z.array(z.object({
|
|
21638
|
+
url: z.string(),
|
|
21639
|
+
title: z.string(),
|
|
21640
|
+
sectionTitle: z.string().optional(),
|
|
21641
|
+
snippet: z.string(),
|
|
21642
|
+
score: z.number(),
|
|
21643
|
+
routeFile: z.string(),
|
|
21644
|
+
chunks: z.array(z.object({
|
|
21645
|
+
sectionTitle: z.string().optional(),
|
|
21646
|
+
snippet: z.string(),
|
|
21647
|
+
headingPath: z.array(z.string()),
|
|
21648
|
+
score: z.number()
|
|
21649
|
+
})).optional()
|
|
21650
|
+
})),
|
|
21651
|
+
meta: z.object({
|
|
21652
|
+
timingsMs: z.object({
|
|
21653
|
+
search: z.number(),
|
|
21654
|
+
total: z.number()
|
|
21655
|
+
})
|
|
21656
|
+
})
|
|
20196
21657
|
}
|
|
20197
21658
|
},
|
|
20198
21659
|
async (input) => {
|
|
@@ -20202,7 +21663,9 @@ function createServer(engine) {
|
|
|
20202
21663
|
scope: input.scope,
|
|
20203
21664
|
pathPrefix: input.pathPrefix,
|
|
20204
21665
|
tags: input.tags,
|
|
20205
|
-
|
|
21666
|
+
filters: input.filters,
|
|
21667
|
+
groupBy: input.groupBy,
|
|
21668
|
+
maxSubResults: input.maxSubResults
|
|
20206
21669
|
});
|
|
20207
21670
|
return {
|
|
20208
21671
|
content: [
|
|
@@ -20210,7 +21673,8 @@ function createServer(engine) {
|
|
|
20210
21673
|
type: "text",
|
|
20211
21674
|
text: JSON.stringify(result, null, 2)
|
|
20212
21675
|
}
|
|
20213
|
-
]
|
|
21676
|
+
],
|
|
21677
|
+
structuredContent: result
|
|
20214
21678
|
};
|
|
20215
21679
|
}
|
|
20216
21680
|
);
|
|
@@ -20235,8 +21699,134 @@ function createServer(engine) {
|
|
|
20235
21699
|
};
|
|
20236
21700
|
}
|
|
20237
21701
|
);
|
|
21702
|
+
server.registerTool(
|
|
21703
|
+
"list_pages",
|
|
21704
|
+
{
|
|
21705
|
+
description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
|
|
21706
|
+
inputSchema: {
|
|
21707
|
+
pathPrefix: z.string().optional(),
|
|
21708
|
+
cursor: z.string().optional(),
|
|
21709
|
+
limit: z.number().int().positive().max(200).optional(),
|
|
21710
|
+
scope: z.string().optional()
|
|
21711
|
+
}
|
|
21712
|
+
},
|
|
21713
|
+
async (input) => {
|
|
21714
|
+
const result = await engine.listPages({
|
|
21715
|
+
pathPrefix: input.pathPrefix,
|
|
21716
|
+
cursor: input.cursor,
|
|
21717
|
+
limit: input.limit,
|
|
21718
|
+
scope: input.scope
|
|
21719
|
+
});
|
|
21720
|
+
return {
|
|
21721
|
+
content: [
|
|
21722
|
+
{
|
|
21723
|
+
type: "text",
|
|
21724
|
+
text: JSON.stringify(result, null, 2)
|
|
21725
|
+
}
|
|
21726
|
+
]
|
|
21727
|
+
};
|
|
21728
|
+
}
|
|
21729
|
+
);
|
|
21730
|
+
server.registerTool(
|
|
21731
|
+
"get_site_structure",
|
|
21732
|
+
{
|
|
21733
|
+
description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
|
|
21734
|
+
inputSchema: {
|
|
21735
|
+
pathPrefix: z.string().optional(),
|
|
21736
|
+
scope: z.string().optional(),
|
|
21737
|
+
maxPages: z.number().int().positive().max(2e3).optional()
|
|
21738
|
+
}
|
|
21739
|
+
},
|
|
21740
|
+
async (input) => {
|
|
21741
|
+
const result = await engine.getSiteStructure({
|
|
21742
|
+
pathPrefix: input.pathPrefix,
|
|
21743
|
+
scope: input.scope,
|
|
21744
|
+
maxPages: input.maxPages
|
|
21745
|
+
});
|
|
21746
|
+
return {
|
|
21747
|
+
content: [
|
|
21748
|
+
{
|
|
21749
|
+
type: "text",
|
|
21750
|
+
text: JSON.stringify(result, null, 2)
|
|
21751
|
+
}
|
|
21752
|
+
]
|
|
21753
|
+
};
|
|
21754
|
+
}
|
|
21755
|
+
);
|
|
21756
|
+
server.registerTool(
|
|
21757
|
+
"find_source_file",
|
|
21758
|
+
{
|
|
21759
|
+
description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
|
|
21760
|
+
inputSchema: {
|
|
21761
|
+
query: z.string().min(1),
|
|
21762
|
+
scope: z.string().optional()
|
|
21763
|
+
}
|
|
21764
|
+
},
|
|
21765
|
+
async (input) => {
|
|
21766
|
+
const result = await engine.search({
|
|
21767
|
+
q: input.query,
|
|
21768
|
+
topK: 1,
|
|
21769
|
+
scope: input.scope
|
|
21770
|
+
});
|
|
21771
|
+
if (result.results.length === 0) {
|
|
21772
|
+
return {
|
|
21773
|
+
content: [
|
|
21774
|
+
{
|
|
21775
|
+
type: "text",
|
|
21776
|
+
text: JSON.stringify({
|
|
21777
|
+
error: "No matching content found for the given query."
|
|
21778
|
+
})
|
|
21779
|
+
}
|
|
21780
|
+
]
|
|
21781
|
+
};
|
|
21782
|
+
}
|
|
21783
|
+
const match = result.results[0];
|
|
21784
|
+
const { url, routeFile, sectionTitle, snippet } = match;
|
|
21785
|
+
return {
|
|
21786
|
+
content: [
|
|
21787
|
+
{
|
|
21788
|
+
type: "text",
|
|
21789
|
+
text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
|
|
21790
|
+
}
|
|
21791
|
+
]
|
|
21792
|
+
};
|
|
21793
|
+
}
|
|
21794
|
+
);
|
|
21795
|
+
server.registerTool(
|
|
21796
|
+
"get_related_pages",
|
|
21797
|
+
{
|
|
21798
|
+
description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
|
|
21799
|
+
inputSchema: {
|
|
21800
|
+
pathOrUrl: z.string().min(1),
|
|
21801
|
+
scope: z.string().optional(),
|
|
21802
|
+
topK: z.number().int().positive().max(25).optional()
|
|
21803
|
+
}
|
|
21804
|
+
},
|
|
21805
|
+
async (input) => {
|
|
21806
|
+
const result = await engine.getRelatedPages(input.pathOrUrl, {
|
|
21807
|
+
topK: input.topK,
|
|
21808
|
+
scope: input.scope
|
|
21809
|
+
});
|
|
21810
|
+
return {
|
|
21811
|
+
content: [
|
|
21812
|
+
{
|
|
21813
|
+
type: "text",
|
|
21814
|
+
text: JSON.stringify(result, null, 2)
|
|
21815
|
+
}
|
|
21816
|
+
]
|
|
21817
|
+
};
|
|
21818
|
+
}
|
|
21819
|
+
);
|
|
20238
21820
|
return server;
|
|
20239
21821
|
}
|
|
21822
|
+
/**
 * Resolves the API key for the standalone MCP HTTP server.
 *
 * An inline `config.mcp.http.apiKey` wins whenever it is neither `null` nor
 * `undefined`; otherwise the environment variable named by
 * `config.mcp.http.apiKeyEnv` is consulted.
 *
 * @param {object} config - Resolved config; only `config.mcp.http` is read.
 * @returns {string|undefined} The key, or `undefined` when none is configured.
 */
function resolveApiKey(config) {
  const { apiKey, apiKeyEnv } = config.mcp.http;
  if (apiKey !== undefined && apiKey !== null) {
    return apiKey;
  }
  if (!apiKeyEnv) {
    return undefined;
  }
  return process.env[apiKeyEnv];
}
|
|
21825
|
+
/**
 * Checks a caller-supplied API key against the expected one.
 *
 * Both values are reduced to SHA-256 digests first, so `timingSafeEqual`
 * always receives equal-length buffers regardless of the input lengths.
 *
 * @param {string} provided - Key presented by the client.
 * @param {string} expected - Key the server was configured with.
 * @returns {boolean} `true` when the two keys match.
 */
function verifyApiKey(provided, expected) {
  const [providedDigest, expectedDigest] = [provided, expected].map(
    (value) => createHash("sha256").update(value).digest()
  );
  return timingSafeEqual(providedDigest, expectedDigest);
}
|
|
20240
21830
|
function redirectConsoleToStderr() {
|
|
20241
21831
|
console.log = (...args) => {
|
|
20242
21832
|
process.stderr.write(`[LOG] ${args.map(String).join(" ")}
|
|
@@ -20251,7 +21841,22 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20251
21841
|
const app = createMcpExpressApp();
|
|
20252
21842
|
const port = opts.httpPort ?? config.mcp.http.port;
|
|
20253
21843
|
const endpointPath = opts.httpPath ?? config.mcp.http.path;
|
|
21844
|
+
const isPublic = config.mcp.access === "public";
|
|
21845
|
+
const host = isPublic ? "0.0.0.0" : "127.0.0.1";
|
|
21846
|
+
const apiKey = isPublic ? resolveApiKey(config) : void 0;
|
|
20254
21847
|
app.post(endpointPath, async (req, res) => {
|
|
21848
|
+
if (isPublic && apiKey) {
|
|
21849
|
+
const authHeader = req.headers["authorization"];
|
|
21850
|
+
const provided = (authHeader?.startsWith("Bearer ") ? authHeader.slice(7) : void 0) ?? req.headers["x-api-key"] ?? "";
|
|
21851
|
+
if (!provided || !verifyApiKey(provided, apiKey)) {
|
|
21852
|
+
res.status(401).json({
|
|
21853
|
+
jsonrpc: "2.0",
|
|
21854
|
+
error: { code: -32001, message: "Unauthorized" },
|
|
21855
|
+
id: null
|
|
21856
|
+
});
|
|
21857
|
+
return;
|
|
21858
|
+
}
|
|
21859
|
+
}
|
|
20255
21860
|
const server = serverFactory();
|
|
20256
21861
|
const transport = new StreamableHTTPServerTransport({
|
|
20257
21862
|
sessionIdGenerator: void 0
|
|
@@ -20301,9 +21906,12 @@ async function startHttpServer(serverFactory, config, opts) {
|
|
|
20301
21906
|
);
|
|
20302
21907
|
});
|
|
20303
21908
|
await new Promise((resolve, reject) => {
|
|
20304
|
-
const instance = app.listen(port,
|
|
20305
|
-
process.stderr.write(`SearchSocket MCP HTTP server listening on http
|
|
21909
|
+
const instance = app.listen(port, host, () => {
|
|
21910
|
+
process.stderr.write(`SearchSocket MCP HTTP server listening on http://${host}:${port}${endpointPath}
|
|
20306
21911
|
`);
|
|
21912
|
+
if (isPublic) {
|
|
21913
|
+
process.stderr.write("WARNING: Server is in public mode. Ensure HTTPS is configured via a reverse proxy for production use.\n");
|
|
21914
|
+
}
|
|
20307
21915
|
resolve();
|
|
20308
21916
|
});
|
|
20309
21917
|
instance.once("error", reject);
|
|
@@ -20318,6 +21926,13 @@ async function runMcpServer(options = {}) {
|
|
|
20318
21926
|
cwd: options.cwd,
|
|
20319
21927
|
configPath: options.configPath
|
|
20320
21928
|
});
|
|
21929
|
+
if (options.access) config.mcp.access = options.access;
|
|
21930
|
+
if (options.apiKey) config.mcp.http.apiKey = options.apiKey;
|
|
21931
|
+
if (config.mcp.access === "public" && !resolveApiKey(config)) {
|
|
21932
|
+
throw new Error(
|
|
21933
|
+
'MCP access is "public" but no API key is configured. Pass --api-key or set mcp.http.apiKey / mcp.http.apiKeyEnv in config.'
|
|
21934
|
+
);
|
|
21935
|
+
}
|
|
20321
21936
|
const resolvedTransport = options.transport ?? config.mcp.transport;
|
|
20322
21937
|
if (resolvedTransport === "stdio") {
|
|
20323
21938
|
redirectConsoleToStderr();
|
|
@@ -20335,8 +21950,6 @@ async function runMcpServer(options = {}) {
|
|
|
20335
21950
|
const stdioTransport = new StdioServerTransport();
|
|
20336
21951
|
await server.connect(stdioTransport);
|
|
20337
21952
|
}
|
|
20338
|
-
|
|
20339
|
-
// src/sveltekit/handle.ts
|
|
20340
21953
|
var InMemoryRateLimiter = class {
|
|
20341
21954
|
constructor(windowMs, max) {
|
|
20342
21955
|
this.windowMs = windowMs;
|
|
@@ -20364,7 +21977,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20364
21977
|
let enginePromise = null;
|
|
20365
21978
|
let configPromise = null;
|
|
20366
21979
|
let apiPath = options.path;
|
|
21980
|
+
let llmsServePath = null;
|
|
21981
|
+
let serveMarkdownVariants = false;
|
|
21982
|
+
let mcpPath;
|
|
21983
|
+
let mcpApiKey;
|
|
21984
|
+
let mcpEnableJsonResponse = true;
|
|
20367
21985
|
let rateLimiter = null;
|
|
21986
|
+
let notConfigured = false;
|
|
20368
21987
|
const getConfig = async () => {
|
|
20369
21988
|
if (!configPromise) {
|
|
20370
21989
|
let configP;
|
|
@@ -20381,6 +22000,13 @@ function searchsocketHandle(options = {}) {
|
|
|
20381
22000
|
}
|
|
20382
22001
|
configPromise = configP.then((config) => {
|
|
20383
22002
|
apiPath = apiPath ?? config.api.path;
|
|
22003
|
+
mcpPath = config.mcp.handle.path;
|
|
22004
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
22005
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
22006
|
+
if (config.llmsTxt.enable) {
|
|
22007
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
22008
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
22009
|
+
}
|
|
20384
22010
|
if (config.api.rateLimit && !isServerless()) {
|
|
20385
22011
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20386
22012
|
}
|
|
@@ -20390,59 +22016,109 @@ function searchsocketHandle(options = {}) {
|
|
|
20390
22016
|
return configPromise;
|
|
20391
22017
|
};
|
|
20392
22018
|
const getEngine = async () => {
|
|
22019
|
+
if (notConfigured) {
|
|
22020
|
+
throw new SearchSocketError(
|
|
22021
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22022
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22023
|
+
503
|
|
22024
|
+
);
|
|
22025
|
+
}
|
|
20393
22026
|
if (!enginePromise) {
|
|
20394
22027
|
const config = await getConfig();
|
|
20395
22028
|
enginePromise = SearchEngine.create({
|
|
20396
22029
|
cwd: options.cwd,
|
|
20397
22030
|
config
|
|
22031
|
+
}).catch((error) => {
|
|
22032
|
+
enginePromise = null;
|
|
22033
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
22034
|
+
notConfigured = true;
|
|
22035
|
+
throw new SearchSocketError(
|
|
22036
|
+
"SEARCH_NOT_CONFIGURED",
|
|
22037
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
22038
|
+
503
|
|
22039
|
+
);
|
|
22040
|
+
}
|
|
22041
|
+
throw error;
|
|
20398
22042
|
});
|
|
20399
22043
|
}
|
|
20400
22044
|
return enginePromise;
|
|
20401
22045
|
};
|
|
20402
22046
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
20403
22047
|
return async ({ event, resolve }) => {
|
|
20404
|
-
if (apiPath && event.url.pathname !==
|
|
20405
|
-
|
|
22048
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
22049
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
22050
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22051
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22052
|
+
}
|
|
22053
|
+
if (mcpPath) {
|
|
22054
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
22055
|
+
return resolve(event);
|
|
22056
|
+
}
|
|
22057
|
+
} else {
|
|
22058
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
22059
|
+
await getConfig();
|
|
22060
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22061
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22062
|
+
}
|
|
22063
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
22064
|
+
return resolve(event);
|
|
22065
|
+
}
|
|
22066
|
+
} else {
|
|
22067
|
+
return resolve(event);
|
|
22068
|
+
}
|
|
22069
|
+
}
|
|
20406
22070
|
}
|
|
20407
22071
|
const config = await getConfig();
|
|
22072
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
22073
|
+
const cwd = options.cwd ?? process.cwd();
|
|
22074
|
+
const filePath = path.resolve(cwd, config.llmsTxt.outputPath);
|
|
22075
|
+
try {
|
|
22076
|
+
const content = await fs8.readFile(filePath, "utf8");
|
|
22077
|
+
return new Response(content, {
|
|
22078
|
+
status: 200,
|
|
22079
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
22080
|
+
});
|
|
22081
|
+
} catch {
|
|
22082
|
+
return resolve(event);
|
|
22083
|
+
}
|
|
22084
|
+
}
|
|
22085
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
22086
|
+
let rawPath;
|
|
22087
|
+
try {
|
|
22088
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
22089
|
+
} catch {
|
|
22090
|
+
return resolve(event);
|
|
22091
|
+
}
|
|
22092
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
22093
|
+
try {
|
|
22094
|
+
const engine = await getEngine();
|
|
22095
|
+
const page = await engine.getPage(rawPath, scope);
|
|
22096
|
+
return new Response(page.markdown, {
|
|
22097
|
+
status: 200,
|
|
22098
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
22099
|
+
});
|
|
22100
|
+
} catch (error) {
|
|
22101
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
22102
|
+
return resolve(event);
|
|
22103
|
+
}
|
|
22104
|
+
throw error;
|
|
22105
|
+
}
|
|
22106
|
+
}
|
|
22107
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
22108
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
22109
|
+
}
|
|
20408
22110
|
const targetPath = apiPath ?? config.api.path;
|
|
20409
|
-
if (event.url.pathname
|
|
22111
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
20410
22112
|
return resolve(event);
|
|
20411
22113
|
}
|
|
20412
|
-
|
|
22114
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
22115
|
+
const method = event.request.method;
|
|
22116
|
+
if (method === "OPTIONS") {
|
|
20413
22117
|
return new Response(null, {
|
|
20414
22118
|
status: 204,
|
|
20415
22119
|
headers: buildCorsHeaders(event.request, config)
|
|
20416
22120
|
});
|
|
20417
22121
|
}
|
|
20418
|
-
if (event.request.method !== "POST") {
|
|
20419
|
-
return withCors(
|
|
20420
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
20421
|
-
status: 405,
|
|
20422
|
-
headers: {
|
|
20423
|
-
"content-type": "application/json"
|
|
20424
|
-
}
|
|
20425
|
-
}),
|
|
20426
|
-
event.request,
|
|
20427
|
-
config
|
|
20428
|
-
);
|
|
20429
|
-
}
|
|
20430
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
20431
|
-
if (contentLength > bodyLimit) {
|
|
20432
|
-
return withCors(
|
|
20433
|
-
new Response(
|
|
20434
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
20435
|
-
{
|
|
20436
|
-
status: 413,
|
|
20437
|
-
headers: {
|
|
20438
|
-
"content-type": "application/json"
|
|
20439
|
-
}
|
|
20440
|
-
}
|
|
20441
|
-
),
|
|
20442
|
-
event.request,
|
|
20443
|
-
config
|
|
20444
|
-
);
|
|
20445
|
-
}
|
|
20446
22122
|
if (rateLimiter) {
|
|
20447
22123
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
20448
22124
|
if (!rateLimiter.check(ip)) {
|
|
@@ -20462,39 +22138,32 @@ function searchsocketHandle(options = {}) {
|
|
|
20462
22138
|
}
|
|
20463
22139
|
}
|
|
20464
22140
|
try {
|
|
20465
|
-
|
|
20466
|
-
|
|
20467
|
-
|
|
20468
|
-
} else {
|
|
20469
|
-
let parsedFallback;
|
|
20470
|
-
try {
|
|
20471
|
-
parsedFallback = await event.request.json();
|
|
20472
|
-
} catch (error) {
|
|
20473
|
-
if (error instanceof SyntaxError) {
|
|
20474
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20475
|
-
}
|
|
20476
|
-
throw error;
|
|
22141
|
+
if (method === "GET") {
|
|
22142
|
+
if (subPath === "" || subPath === "/") {
|
|
22143
|
+
return await handleGetSearch(event, config, getEngine);
|
|
20477
22144
|
}
|
|
20478
|
-
|
|
20479
|
-
|
|
20480
|
-
|
|
20481
|
-
|
|
22145
|
+
if (subPath === "/health") {
|
|
22146
|
+
return await handleGetHealth(event, config, getEngine);
|
|
22147
|
+
}
|
|
22148
|
+
if (subPath.startsWith("/pages/")) {
|
|
22149
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
22150
|
+
}
|
|
22151
|
+
return withCors(
|
|
22152
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
22153
|
+
status: 404,
|
|
22154
|
+
headers: { "content-type": "application/json" }
|
|
22155
|
+
}),
|
|
22156
|
+
event.request,
|
|
22157
|
+
config
|
|
22158
|
+
);
|
|
20482
22159
|
}
|
|
20483
|
-
|
|
20484
|
-
|
|
20485
|
-
body = JSON.parse(rawBody);
|
|
20486
|
-
} catch {
|
|
20487
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
22160
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
22161
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
20488
22162
|
}
|
|
20489
|
-
const engine = await getEngine();
|
|
20490
|
-
const searchRequest = body;
|
|
20491
|
-
const result = await engine.search(searchRequest);
|
|
20492
22163
|
return withCors(
|
|
20493
|
-
new Response(JSON.stringify(
|
|
20494
|
-
status:
|
|
20495
|
-
headers: {
|
|
20496
|
-
"content-type": "application/json"
|
|
20497
|
-
}
|
|
22164
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
22165
|
+
status: 405,
|
|
22166
|
+
headers: { "content-type": "application/json" }
|
|
20498
22167
|
}),
|
|
20499
22168
|
event.request,
|
|
20500
22169
|
config
|
|
@@ -20515,6 +22184,183 @@ function searchsocketHandle(options = {}) {
|
|
|
20515
22184
|
}
|
|
20516
22185
|
};
|
|
20517
22186
|
}
|
|
22187
|
+
/**
 * Tells whether a request pathname addresses the search API.
 *
 * Matches the API root itself and anything nested below it; a sibling path
 * that merely shares the prefix (e.g. `/api/searching`) does not match.
 *
 * @param {string} pathname - Pathname of the incoming request.
 * @param {string} apiPath - Configured API root path.
 * @returns {boolean} `true` for the root or one of its sub-paths.
 */
function isApiPath(pathname, apiPath) {
  if (pathname === apiPath) {
    return true;
  }
  return pathname.startsWith(`${apiPath}/`);
}
|
|
22190
|
+
/**
 * Handles `GET <apiPath>` search requests driven by URL query parameters.
 *
 * Recognized parameters: `q` (required), `topK`, `scope`, `pathPrefix`,
 * `groupBy` ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`.
 *
 * @param {object} event - Request event; `event.url.searchParams` and `event.request` are read.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} 200 JSON response carrying the engine's search result.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when a parameter is missing or malformed.
 */
async function handleGetSearch(event, config, getEngine) {
  // Number.parseInt() tolerates trailing garbage ("10abc" -> 10) and truncates
  // decimals ("1.5" -> 1), which contradicts the "positive integer" contract in
  // the error messages. Numeric parameters must therefore be digits only.
  const parseStrictPositiveInt = (raw, max, message) => {
    const value = /^\d+$/.test(raw) ? Number(raw) : Number.NaN;
    if (!Number.isSafeInteger(value) || value < 1 || value > max) {
      throw new SearchSocketError("INVALID_REQUEST", message, 400);
    }
    return value;
  };

  const params = event.url.searchParams;
  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = params.get("topK");
  if (topK !== null) {
    // No upper bound is applied here; only well-formedness is checked.
    searchRequest.topK = parseStrictPositiveInt(
      topK,
      Number.MAX_SAFE_INTEGER,
      "topK must be a positive integer"
    );
  }

  const scope = params.get("scope");
  if (scope !== null) searchRequest.scope = scope;

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) searchRequest.pathPrefix = pathPrefix;

  // An empty `groupBy=` is treated the same as an absent parameter.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = params.get("maxSubResults");
  if (maxSubResults !== null) {
    searchRequest.maxSubResults = parseStrictPositiveInt(
      maxSubResults,
      20,
      "maxSubResults must be a positive integer between 1 and 20"
    );
  }

  const tags = params.getAll("tags");
  if (tags.length > 0) searchRequest.tags = tags;

  // Validation happens before the engine is resolved so bad requests fail cheaply.
  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
22237
|
+
/**
 * Handles `GET <apiPath>/health`.
 *
 * @param {object} event - Request event; `event.request` is forwarded to `withCors`.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} 200 JSON response with the engine's health report.
 */
async function handleGetHealth(event, config, getEngine) {
  const searchEngine = await getEngine();
  const report = await searchEngine.health();
  const jsonHeaders = { "content-type": "application/json" };
  const response = new Response(JSON.stringify(report), { status: 200, headers: jsonHeaders });
  return withCors(response, event.request, config);
}
|
|
22249
|
+
/**
 * Handles `GET <apiPath>/pages/<path>` by returning one indexed page as JSON.
 *
 * @param {object} event - Request event; the optional `scope` query parameter is read from `event.url`.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @param {string} subPath - Request path relative to the API root, beginning with `/pages`.
 * @returns {Promise<Response>} 200 JSON response with the value from `engine.getPage`.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the page path is not valid percent-encoding.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Strip the "/pages" segment; the remainder (leading slash included) is the page path.
  const encodedPagePath = subPath.slice("/pages".length);
  const decodePagePath = () => {
    try {
      return decodeURIComponent(encodedPagePath);
    } catch {
      throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
    }
  };
  const pagePath = decodePagePath();
  const scope = event.url.searchParams?.get("scope") ?? void 0;
  const searchEngine = await getEngine();
  const page = await searchEngine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(page), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22269
|
+
/**
 * Handles `POST <apiPath>` search requests carrying a JSON body.
 *
 * The size limit is enforced twice: first against the declared
 * `content-length` header, then against the byte length of the body text
 * actually read, so a missing or understated header cannot bypass it.
 *
 * @param {object} event - Request event; `event.request` supplies the headers and body.
 * @param {object} config - Resolved config, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response carrying the engine's search result.
 * @throws {SearchSocketError} INVALID_REQUEST with status 413 (too large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const bodyTooLarge = () => new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  const malformedJson = () => new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);

  const declaredLength = Number(event.request.headers.get("content-length") ?? 0);
  if (declaredLength > bodyLimit) {
    throw bodyTooLarge();
  }

  let bodyText;
  if (typeof event.request.text !== "function") {
    // Request objects without text(): parse via json() and re-serialize so the
    // byte-length check and JSON.parse below apply to this path as well.
    let fallbackPayload;
    try {
      fallbackPayload = await event.request.json();
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw malformedJson();
      }
      throw error;
    }
    bodyText = JSON.stringify(fallbackPayload);
  } else {
    bodyText = await event.request.text();
  }

  if (Buffer.byteLength(bodyText, "utf8") > bodyLimit) {
    throw bodyTooLarge();
  }

  let searchRequest;
  try {
    searchRequest = JSON.parse(bodyText);
  } catch {
    throw malformedJson();
  }

  const searchEngine = await getEngine();
  const searchResult = await searchEngine.search(searchRequest);
  const response = new Response(JSON.stringify(searchResult), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
22310
|
+
/**
 * Serves one MCP request arriving through the SvelteKit handle.
 *
 * When `apiKey` is set, the request must carry `Authorization: Bearer <key>`.
 * A fresh transport and MCP server are created for every request.
 *
 * @param {object} event - Request event; `event.request` is authenticated and handed to the transport.
 * @param {string|undefined} apiKey - Expected bearer token; a falsy value disables authentication.
 * @param {boolean} enableJsonResponse - Passed to the transport; when true the transport and
 *   server are closed as soon as the response has been produced.
 * @param {() => Promise<object>} getEngine - Lazy accessor for the search engine.
 * @returns {Promise<Response>} The transport's response, a 401 JSON-RPC error when the token
 *   is wrong or missing, or a 500 JSON-RPC error when handling fails.
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Compare fixed-length SHA-256 digests rather than the raw values:
    // timingSafeEqual needs equal-length inputs, and short-circuiting on a
    // length mismatch would reveal the key length through response timing.
    // This matches how verifyApiKey() checks keys for the standalone server.
    const tokenDigest = createHash("sha256").update(token).digest();
    const keyDigest = createHash("sha256").update(apiKey).digest();
    if (!timingSafeEqual(tokenDigest, keyDigest)) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }
  const transport = new WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: void 0,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    if (enableJsonResponse) {
      // With JSON responses the exchange is complete once the response exists.
      // Otherwise the transport is left open, presumably because the response
      // is still streaming (NOTE(review): confirm against the MCP SDK).
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the original
    // error, which is what gets reported to the client below.
    try {
      await transport.close();
    } catch {
    }
    try {
      await server?.close();
    } catch {
    }
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status: 500, headers: { "content-type": "application/json" } }
    );
  }
}
|
|
20518
22364
|
function buildCorsHeaders(request, config) {
|
|
20519
22365
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
20520
22366
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -20527,7 +22373,7 @@ function buildCorsHeaders(request, config) {
|
|
|
20527
22373
|
}
|
|
20528
22374
|
return {
|
|
20529
22375
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
20530
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
22376
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
20531
22377
|
"access-control-allow-headers": "content-type"
|
|
20532
22378
|
};
|
|
20533
22379
|
}
|
|
@@ -20563,9 +22409,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20563
22409
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20564
22410
|
return true;
|
|
20565
22411
|
}
|
|
20566
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20567
|
-
return true;
|
|
20568
|
-
}
|
|
20569
22412
|
return false;
|
|
20570
22413
|
}
|
|
20571
22414
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20590,7 +22433,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20590
22433
|
const pipeline = await IndexPipeline.create({
|
|
20591
22434
|
cwd,
|
|
20592
22435
|
configPath: options.configPath,
|
|
20593
|
-
logger: logger3
|
|
22436
|
+
logger: logger3,
|
|
22437
|
+
hooks: options.hooks
|
|
20594
22438
|
});
|
|
20595
22439
|
const stats = await pipeline.run({
|
|
20596
22440
|
changedOnly: options.changedOnly ?? true,
|