searchsocket 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +731 -514
- package/dist/cli.js +3335 -492
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +2378 -475
- package/dist/index.d.cts +113 -40
- package/dist/index.d.ts +113 -40
- package/dist/index.js +2378 -475
- package/dist/{plugin-B_npJSux.d.cts → plugin-C61L-ykY.d.ts} +2 -1
- package/dist/{plugin-M-aW0ev6.d.ts → plugin-DoBW1gkK.d.cts} +2 -1
- package/dist/sveltekit.cjs +2430 -494
- package/dist/sveltekit.d.cts +2 -2
- package/dist/sveltekit.d.ts +2 -2
- package/dist/sveltekit.js +2416 -480
- package/dist/templates/search-dialog/SearchDialog.svelte +175 -0
- package/dist/templates/search-input/SearchInput.svelte +151 -0
- package/dist/templates/search-results/SearchResults.svelte +75 -0
- package/dist/{types-Dk43uz25.d.cts → types-029hl6P2.d.cts} +180 -9
- package/dist/{types-Dk43uz25.d.ts → types-029hl6P2.d.ts} +180 -9
- package/package.json +28 -11
- package/src/svelte/SearchSocket.svelte +35 -0
- package/src/svelte/index.svelte.ts +181 -0
package/dist/sveltekit.js
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
1
|
-
import
|
|
1
|
+
import { timingSafeEqual, createHash } from 'crypto';
|
|
2
|
+
import fs9 from 'fs/promises';
|
|
2
3
|
import path from 'path';
|
|
4
|
+
import { WebStandardStreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js';
|
|
5
|
+
import fs from 'fs';
|
|
3
6
|
import { createJiti } from 'jiti';
|
|
4
7
|
import { z } from 'zod';
|
|
8
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
9
|
+
import '@modelcontextprotocol/sdk/server/stdio.js';
|
|
10
|
+
import '@modelcontextprotocol/sdk/server/streamableHttp.js';
|
|
11
|
+
import '@modelcontextprotocol/sdk/server/express.js';
|
|
5
12
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import {
|
|
13
|
+
import { FusionAlgorithm, QueryMode } from '@upstash/vector';
|
|
7
14
|
import { load } from 'cheerio';
|
|
8
15
|
import matter from 'gray-matter';
|
|
9
16
|
import fg from 'fast-glob';
|
|
10
17
|
import pLimit from 'p-limit';
|
|
11
|
-
import fs3 from 'fs/promises';
|
|
12
18
|
import net from 'net';
|
|
13
19
|
import { gunzipSync } from 'zlib';
|
|
14
20
|
|
|
@@ -5009,32 +5015,32 @@ var require_URL = __commonJS({
|
|
|
5009
5015
|
else
|
|
5010
5016
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5011
5017
|
}
|
|
5012
|
-
function remove_dot_segments(
|
|
5013
|
-
if (!
|
|
5018
|
+
function remove_dot_segments(path14) {
|
|
5019
|
+
if (!path14) return path14;
|
|
5014
5020
|
var output = "";
|
|
5015
|
-
while (
|
|
5016
|
-
if (
|
|
5017
|
-
|
|
5021
|
+
while (path14.length > 0) {
|
|
5022
|
+
if (path14 === "." || path14 === "..") {
|
|
5023
|
+
path14 = "";
|
|
5018
5024
|
break;
|
|
5019
5025
|
}
|
|
5020
|
-
var twochars =
|
|
5021
|
-
var threechars =
|
|
5022
|
-
var fourchars =
|
|
5026
|
+
var twochars = path14.substring(0, 2);
|
|
5027
|
+
var threechars = path14.substring(0, 3);
|
|
5028
|
+
var fourchars = path14.substring(0, 4);
|
|
5023
5029
|
if (threechars === "../") {
|
|
5024
|
-
|
|
5030
|
+
path14 = path14.substring(3);
|
|
5025
5031
|
} else if (twochars === "./") {
|
|
5026
|
-
|
|
5032
|
+
path14 = path14.substring(2);
|
|
5027
5033
|
} else if (threechars === "/./") {
|
|
5028
|
-
|
|
5029
|
-
} else if (twochars === "/." &&
|
|
5030
|
-
|
|
5031
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5032
|
-
|
|
5034
|
+
path14 = "/" + path14.substring(3);
|
|
5035
|
+
} else if (twochars === "/." && path14.length === 2) {
|
|
5036
|
+
path14 = "/";
|
|
5037
|
+
} else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
|
|
5038
|
+
path14 = "/" + path14.substring(4);
|
|
5033
5039
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5034
5040
|
} else {
|
|
5035
|
-
var segment =
|
|
5041
|
+
var segment = path14.match(/(\/?([^\/]*))/)[0];
|
|
5036
5042
|
output += segment;
|
|
5037
|
-
|
|
5043
|
+
path14 = path14.substring(segment.length);
|
|
5038
5044
|
}
|
|
5039
5045
|
}
|
|
5040
5046
|
return output;
|
|
@@ -16630,6 +16636,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16630
16636
|
dropSelectors: z.array(z.string()).optional(),
|
|
16631
16637
|
ignoreAttr: z.string().optional(),
|
|
16632
16638
|
noindexAttr: z.string().optional(),
|
|
16639
|
+
imageDescAttr: z.string().optional(),
|
|
16633
16640
|
respectRobotsNoindex: z.boolean().optional()
|
|
16634
16641
|
}).optional(),
|
|
16635
16642
|
transform: z.object({
|
|
@@ -16645,35 +16652,48 @@ var searchSocketConfigSchema = z.object({
|
|
|
16645
16652
|
headingPathDepth: z.number().int().positive().optional(),
|
|
16646
16653
|
dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
|
|
16647
16654
|
prependTitle: z.boolean().optional(),
|
|
16648
|
-
pageSummaryChunk: z.boolean().optional()
|
|
16655
|
+
pageSummaryChunk: z.boolean().optional(),
|
|
16656
|
+
weightHeadings: z.boolean().optional()
|
|
16649
16657
|
}).optional(),
|
|
16650
16658
|
upstash: z.object({
|
|
16651
16659
|
url: z.string().url().optional(),
|
|
16652
16660
|
token: z.string().min(1).optional(),
|
|
16653
16661
|
urlEnv: z.string().min(1).optional(),
|
|
16654
|
-
tokenEnv: z.string().min(1).optional()
|
|
16662
|
+
tokenEnv: z.string().min(1).optional(),
|
|
16663
|
+
namespaces: z.object({
|
|
16664
|
+
pages: z.string().min(1).optional(),
|
|
16665
|
+
chunks: z.string().min(1).optional()
|
|
16666
|
+
}).optional()
|
|
16667
|
+
}).optional(),
|
|
16668
|
+
embedding: z.object({
|
|
16669
|
+
model: z.string().optional(),
|
|
16670
|
+
dimensions: z.number().int().positive().optional(),
|
|
16671
|
+
taskType: z.string().optional(),
|
|
16672
|
+
batchSize: z.number().int().positive().optional()
|
|
16655
16673
|
}).optional(),
|
|
16656
16674
|
search: z.object({
|
|
16657
|
-
semanticWeight: z.number().min(0).max(1).optional(),
|
|
16658
|
-
inputEnrichment: z.boolean().optional(),
|
|
16659
|
-
reranking: z.boolean().optional(),
|
|
16660
16675
|
dualSearch: z.boolean().optional(),
|
|
16661
16676
|
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
16662
16677
|
}).optional(),
|
|
16663
16678
|
ranking: z.object({
|
|
16664
16679
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16665
16680
|
enableDepthBoost: z.boolean().optional(),
|
|
16681
|
+
enableFreshnessBoost: z.boolean().optional(),
|
|
16682
|
+
freshnessDecayRate: z.number().positive().optional(),
|
|
16683
|
+
enableAnchorTextBoost: z.boolean().optional(),
|
|
16666
16684
|
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16667
16685
|
aggregationCap: z.number().int().positive().optional(),
|
|
16668
16686
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16669
16687
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16670
|
-
|
|
16688
|
+
minScoreRatio: z.number().min(0).max(1).optional(),
|
|
16671
16689
|
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
16672
16690
|
weights: z.object({
|
|
16673
16691
|
incomingLinks: z.number().optional(),
|
|
16674
16692
|
depth: z.number().optional(),
|
|
16675
16693
|
aggregation: z.number().optional(),
|
|
16676
|
-
titleMatch: z.number().optional()
|
|
16694
|
+
titleMatch: z.number().optional(),
|
|
16695
|
+
freshness: z.number().optional(),
|
|
16696
|
+
anchorText: z.number().optional()
|
|
16677
16697
|
}).optional()
|
|
16678
16698
|
}).optional(),
|
|
16679
16699
|
api: z.object({
|
|
@@ -16688,12 +16708,28 @@ var searchSocketConfigSchema = z.object({
|
|
|
16688
16708
|
}).optional(),
|
|
16689
16709
|
mcp: z.object({
|
|
16690
16710
|
enable: z.boolean().optional(),
|
|
16711
|
+
access: z.enum(["public", "private"]).optional(),
|
|
16691
16712
|
transport: z.enum(["stdio", "http"]).optional(),
|
|
16692
16713
|
http: z.object({
|
|
16693
16714
|
port: z.number().int().positive().optional(),
|
|
16694
|
-
path: z.string().optional()
|
|
16715
|
+
path: z.string().optional(),
|
|
16716
|
+
apiKey: z.string().min(1).optional(),
|
|
16717
|
+
apiKeyEnv: z.string().min(1).optional()
|
|
16718
|
+
}).optional(),
|
|
16719
|
+
handle: z.object({
|
|
16720
|
+
path: z.string().optional(),
|
|
16721
|
+
apiKey: z.string().min(1).optional(),
|
|
16722
|
+
enableJsonResponse: z.boolean().optional()
|
|
16695
16723
|
}).optional()
|
|
16696
16724
|
}).optional(),
|
|
16725
|
+
llmsTxt: z.object({
|
|
16726
|
+
enable: z.boolean().optional(),
|
|
16727
|
+
outputPath: z.string().optional(),
|
|
16728
|
+
title: z.string().optional(),
|
|
16729
|
+
description: z.string().optional(),
|
|
16730
|
+
generateFull: z.boolean().optional(),
|
|
16731
|
+
serveMarkdownVariants: z.boolean().optional()
|
|
16732
|
+
}).optional(),
|
|
16697
16733
|
state: z.object({
|
|
16698
16734
|
dir: z.string().optional()
|
|
16699
16735
|
}).optional()
|
|
@@ -16732,6 +16768,7 @@ function createDefaultConfig(projectId) {
|
|
|
16732
16768
|
dropSelectors: DEFAULT_DROP_SELECTORS,
|
|
16733
16769
|
ignoreAttr: "data-search-ignore",
|
|
16734
16770
|
noindexAttr: "data-search-noindex",
|
|
16771
|
+
imageDescAttr: "data-search-description",
|
|
16735
16772
|
respectRobotsNoindex: true
|
|
16736
16773
|
},
|
|
16737
16774
|
transform: {
|
|
@@ -16741,39 +16778,52 @@ function createDefaultConfig(projectId) {
|
|
|
16741
16778
|
},
|
|
16742
16779
|
chunking: {
|
|
16743
16780
|
strategy: "hybrid",
|
|
16744
|
-
maxChars:
|
|
16781
|
+
maxChars: 1500,
|
|
16745
16782
|
overlapChars: 200,
|
|
16746
16783
|
minChars: 250,
|
|
16747
16784
|
headingPathDepth: 3,
|
|
16748
16785
|
dontSplitInside: ["code", "table", "blockquote"],
|
|
16749
16786
|
prependTitle: true,
|
|
16750
|
-
pageSummaryChunk: true
|
|
16787
|
+
pageSummaryChunk: true,
|
|
16788
|
+
weightHeadings: true
|
|
16751
16789
|
},
|
|
16752
16790
|
upstash: {
|
|
16753
|
-
urlEnv: "
|
|
16754
|
-
tokenEnv: "
|
|
16791
|
+
urlEnv: "UPSTASH_VECTOR_REST_URL",
|
|
16792
|
+
tokenEnv: "UPSTASH_VECTOR_REST_TOKEN",
|
|
16793
|
+
namespaces: {
|
|
16794
|
+
pages: "pages",
|
|
16795
|
+
chunks: "chunks"
|
|
16796
|
+
}
|
|
16797
|
+
},
|
|
16798
|
+
embedding: {
|
|
16799
|
+
model: "bge-large-en-v1.5",
|
|
16800
|
+
dimensions: 1024,
|
|
16801
|
+
taskType: "RETRIEVAL_DOCUMENT",
|
|
16802
|
+
batchSize: 100
|
|
16755
16803
|
},
|
|
16756
16804
|
search: {
|
|
16757
|
-
semanticWeight: 0.75,
|
|
16758
|
-
inputEnrichment: true,
|
|
16759
|
-
reranking: true,
|
|
16760
16805
|
dualSearch: true,
|
|
16761
16806
|
pageSearchWeight: 0.3
|
|
16762
16807
|
},
|
|
16763
16808
|
ranking: {
|
|
16764
16809
|
enableIncomingLinkBoost: true,
|
|
16765
16810
|
enableDepthBoost: true,
|
|
16811
|
+
enableFreshnessBoost: false,
|
|
16812
|
+
freshnessDecayRate: 1e-3,
|
|
16813
|
+
enableAnchorTextBoost: false,
|
|
16766
16814
|
pageWeights: {},
|
|
16767
16815
|
aggregationCap: 5,
|
|
16768
16816
|
aggregationDecay: 0.5,
|
|
16769
16817
|
minChunkScoreRatio: 0.5,
|
|
16770
|
-
|
|
16818
|
+
minScoreRatio: 0.7,
|
|
16771
16819
|
scoreGapThreshold: 0.4,
|
|
16772
16820
|
weights: {
|
|
16773
16821
|
incomingLinks: 0.05,
|
|
16774
16822
|
depth: 0.03,
|
|
16775
16823
|
aggregation: 0.1,
|
|
16776
|
-
titleMatch: 0.15
|
|
16824
|
+
titleMatch: 0.15,
|
|
16825
|
+
freshness: 0.1,
|
|
16826
|
+
anchorText: 0.1
|
|
16777
16827
|
}
|
|
16778
16828
|
},
|
|
16779
16829
|
api: {
|
|
@@ -16784,12 +16834,23 @@ function createDefaultConfig(projectId) {
|
|
|
16784
16834
|
},
|
|
16785
16835
|
mcp: {
|
|
16786
16836
|
enable: process.env.NODE_ENV !== "production",
|
|
16837
|
+
access: "private",
|
|
16787
16838
|
transport: "stdio",
|
|
16788
16839
|
http: {
|
|
16789
16840
|
port: 3338,
|
|
16790
16841
|
path: "/mcp"
|
|
16842
|
+
},
|
|
16843
|
+
handle: {
|
|
16844
|
+
path: "/api/mcp",
|
|
16845
|
+
enableJsonResponse: true
|
|
16791
16846
|
}
|
|
16792
16847
|
},
|
|
16848
|
+
llmsTxt: {
|
|
16849
|
+
enable: false,
|
|
16850
|
+
outputPath: "static/llms.txt",
|
|
16851
|
+
generateFull: true,
|
|
16852
|
+
serveMarkdownVariants: false
|
|
16853
|
+
},
|
|
16793
16854
|
state: {
|
|
16794
16855
|
dir: ".searchsocket"
|
|
16795
16856
|
}
|
|
@@ -16917,7 +16978,15 @@ ${issues}`
|
|
|
16917
16978
|
},
|
|
16918
16979
|
upstash: {
|
|
16919
16980
|
...defaults.upstash,
|
|
16920
|
-
...parsed.upstash
|
|
16981
|
+
...parsed.upstash,
|
|
16982
|
+
namespaces: {
|
|
16983
|
+
...defaults.upstash.namespaces,
|
|
16984
|
+
...parsed.upstash?.namespaces
|
|
16985
|
+
}
|
|
16986
|
+
},
|
|
16987
|
+
embedding: {
|
|
16988
|
+
...defaults.embedding,
|
|
16989
|
+
...parsed.embedding
|
|
16921
16990
|
},
|
|
16922
16991
|
search: {
|
|
16923
16992
|
...defaults.search,
|
|
@@ -16954,8 +17023,16 @@ ${issues}`
|
|
|
16954
17023
|
http: {
|
|
16955
17024
|
...defaults.mcp.http,
|
|
16956
17025
|
...parsed.mcp?.http
|
|
17026
|
+
},
|
|
17027
|
+
handle: {
|
|
17028
|
+
...defaults.mcp.handle,
|
|
17029
|
+
...parsed.mcp?.handle
|
|
16957
17030
|
}
|
|
16958
17031
|
},
|
|
17032
|
+
llmsTxt: {
|
|
17033
|
+
...defaults.llmsTxt,
|
|
17034
|
+
...parsed.llmsTxt
|
|
17035
|
+
},
|
|
16959
17036
|
state: {
|
|
16960
17037
|
...defaults.state,
|
|
16961
17038
|
...parsed.state
|
|
@@ -16975,6 +17052,15 @@ ${issues}`
|
|
|
16975
17052
|
maxDepth: 10
|
|
16976
17053
|
};
|
|
16977
17054
|
}
|
|
17055
|
+
if (merged.mcp.access === "public") {
|
|
17056
|
+
const resolvedKey = merged.mcp.http.apiKey ?? (merged.mcp.http.apiKeyEnv ? process.env[merged.mcp.http.apiKeyEnv] : void 0);
|
|
17057
|
+
if (!resolvedKey) {
|
|
17058
|
+
throw new SearchSocketError(
|
|
17059
|
+
"CONFIG_MISSING",
|
|
17060
|
+
'`mcp.access` is "public" but no API key is configured. Set `mcp.http.apiKey` or `mcp.http.apiKeyEnv`.'
|
|
17061
|
+
);
|
|
17062
|
+
}
|
|
17063
|
+
}
|
|
16978
17064
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
16979
17065
|
throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
|
|
16980
17066
|
}
|
|
@@ -17023,13 +17109,84 @@ function normalizeMarkdown(input) {
|
|
|
17023
17109
|
function sanitizeScopeName(scopeName) {
|
|
17024
17110
|
return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
|
|
17025
17111
|
}
|
|
17112
|
+
function markdownToPlain(markdown) {
|
|
17113
|
+
return markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
|
|
17114
|
+
}
|
|
17026
17115
|
function toSnippet(markdown, maxLen = 220) {
|
|
17027
|
-
const plain = markdown
|
|
17116
|
+
const plain = markdownToPlain(markdown);
|
|
17028
17117
|
if (plain.length <= maxLen) {
|
|
17029
17118
|
return plain;
|
|
17030
17119
|
}
|
|
17031
17120
|
return `${plain.slice(0, Math.max(0, maxLen - 1)).trim()}\u2026`;
|
|
17032
17121
|
}
|
|
17122
|
+
function queryAwareExcerpt(markdown, query, maxLen = 220) {
|
|
17123
|
+
const plain = markdownToPlain(markdown);
|
|
17124
|
+
if (plain.length <= maxLen) return plain;
|
|
17125
|
+
const tokens = query.toLowerCase().split(/\s+/).filter((t) => t.length >= 2);
|
|
17126
|
+
if (tokens.length === 0) return toSnippet(markdown, maxLen);
|
|
17127
|
+
const positions = [];
|
|
17128
|
+
for (let ti = 0; ti < tokens.length; ti++) {
|
|
17129
|
+
const escaped = tokens[ti].replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
17130
|
+
const re = new RegExp(escaped, "gi");
|
|
17131
|
+
let m;
|
|
17132
|
+
while ((m = re.exec(plain)) !== null) {
|
|
17133
|
+
positions.push({ start: m.index, end: m.index + m[0].length, tokenIdx: ti });
|
|
17134
|
+
}
|
|
17135
|
+
}
|
|
17136
|
+
if (positions.length === 0) return toSnippet(markdown, maxLen);
|
|
17137
|
+
positions.sort((a, b) => a.start - b.start);
|
|
17138
|
+
let bestUniqueCount = 0;
|
|
17139
|
+
let bestTotalCount = 0;
|
|
17140
|
+
let bestLeft = 0;
|
|
17141
|
+
let bestRight = 0;
|
|
17142
|
+
let left = 0;
|
|
17143
|
+
const tokenCounts = /* @__PURE__ */ new Map();
|
|
17144
|
+
for (let right = 0; right < positions.length; right++) {
|
|
17145
|
+
tokenCounts.set(positions[right].tokenIdx, (tokenCounts.get(positions[right].tokenIdx) ?? 0) + 1);
|
|
17146
|
+
while (positions[right].end - positions[left].start > maxLen && left < right) {
|
|
17147
|
+
const leftToken = positions[left].tokenIdx;
|
|
17148
|
+
const cnt = tokenCounts.get(leftToken) - 1;
|
|
17149
|
+
if (cnt === 0) tokenCounts.delete(leftToken);
|
|
17150
|
+
else tokenCounts.set(leftToken, cnt);
|
|
17151
|
+
left++;
|
|
17152
|
+
}
|
|
17153
|
+
const uniqueCount = tokenCounts.size;
|
|
17154
|
+
const totalCount = right - left + 1;
|
|
17155
|
+
if (uniqueCount > bestUniqueCount || uniqueCount === bestUniqueCount && totalCount > bestTotalCount) {
|
|
17156
|
+
bestUniqueCount = uniqueCount;
|
|
17157
|
+
bestTotalCount = totalCount;
|
|
17158
|
+
bestLeft = left;
|
|
17159
|
+
bestRight = right;
|
|
17160
|
+
}
|
|
17161
|
+
}
|
|
17162
|
+
const mid = Math.floor((positions[bestLeft].start + positions[bestRight].end) / 2);
|
|
17163
|
+
let start = Math.max(0, mid - Math.floor(maxLen / 2));
|
|
17164
|
+
let end = Math.min(plain.length, start + maxLen);
|
|
17165
|
+
start = Math.max(0, end - maxLen);
|
|
17166
|
+
if (start > 0) {
|
|
17167
|
+
const spaceIdx = plain.lastIndexOf(" ", start);
|
|
17168
|
+
if (spaceIdx > start - 30) {
|
|
17169
|
+
start = spaceIdx + 1;
|
|
17170
|
+
}
|
|
17171
|
+
}
|
|
17172
|
+
if (end < plain.length) {
|
|
17173
|
+
const spaceIdx = plain.indexOf(" ", end);
|
|
17174
|
+
if (spaceIdx !== -1 && spaceIdx < end + 30) {
|
|
17175
|
+
end = spaceIdx;
|
|
17176
|
+
}
|
|
17177
|
+
}
|
|
17178
|
+
let excerpt = plain.slice(start, end);
|
|
17179
|
+
if (excerpt.length > Math.ceil(maxLen * 1.2)) {
|
|
17180
|
+
excerpt = excerpt.slice(0, maxLen);
|
|
17181
|
+
const lastSpace = excerpt.lastIndexOf(" ");
|
|
17182
|
+
if (lastSpace > maxLen * 0.5) {
|
|
17183
|
+
excerpt = excerpt.slice(0, lastSpace);
|
|
17184
|
+
}
|
|
17185
|
+
}
|
|
17186
|
+
const prefix = start > 0 ? "\u2026" : "";
|
|
17187
|
+
const suffix = end < plain.length ? "\u2026" : "";
|
|
17188
|
+
return `${prefix}${excerpt}${suffix}`;
|
|
17189
|
+
}
|
|
17033
17190
|
function extractFirstParagraph(markdown) {
|
|
17034
17191
|
const lines = markdown.split("\n");
|
|
17035
17192
|
let inFence = false;
|
|
@@ -17136,162 +17293,342 @@ function joinUrl(baseUrl, route) {
|
|
|
17136
17293
|
const routePart = ensureLeadingSlash(route);
|
|
17137
17294
|
return `${base}${routePart}`;
|
|
17138
17295
|
}
|
|
17139
|
-
|
|
17140
|
-
// src/vector/upstash.ts
|
|
17141
|
-
function chunkIndexName(scope) {
|
|
17142
|
-
return `${scope.projectId}--${scope.scopeName}`;
|
|
17143
|
-
}
|
|
17144
|
-
function pageIndexName(scope) {
|
|
17145
|
-
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
17146
|
-
}
|
|
17147
17296
|
var UpstashSearchStore = class {
|
|
17148
|
-
|
|
17297
|
+
index;
|
|
17298
|
+
pagesNs;
|
|
17299
|
+
chunksNs;
|
|
17149
17300
|
constructor(opts) {
|
|
17150
|
-
this.
|
|
17151
|
-
|
|
17152
|
-
|
|
17153
|
-
return this.client.index(chunkIndexName(scope));
|
|
17154
|
-
}
|
|
17155
|
-
pageIndex(scope) {
|
|
17156
|
-
return this.client.index(pageIndexName(scope));
|
|
17301
|
+
this.index = opts.index;
|
|
17302
|
+
this.pagesNs = opts.index.namespace(opts.pagesNamespace);
|
|
17303
|
+
this.chunksNs = opts.index.namespace(opts.chunksNamespace);
|
|
17157
17304
|
}
|
|
17158
17305
|
async upsertChunks(chunks, scope) {
|
|
17159
17306
|
if (chunks.length === 0) return;
|
|
17160
|
-
const
|
|
17161
|
-
const BATCH_SIZE = 100;
|
|
17307
|
+
const BATCH_SIZE = 90;
|
|
17162
17308
|
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
17163
17309
|
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
17164
|
-
await
|
|
17165
|
-
|
|
17166
|
-
|
|
17167
|
-
|
|
17168
|
-
|
|
17169
|
-
|
|
17170
|
-
|
|
17171
|
-
|
|
17172
|
-
|
|
17173
|
-
|
|
17174
|
-
|
|
17175
|
-
|
|
17310
|
+
await this.chunksNs.upsert(
|
|
17311
|
+
batch.map((c) => ({
|
|
17312
|
+
id: c.id,
|
|
17313
|
+
data: c.data,
|
|
17314
|
+
metadata: {
|
|
17315
|
+
...c.metadata,
|
|
17316
|
+
projectId: scope.projectId,
|
|
17317
|
+
scopeName: scope.scopeName,
|
|
17318
|
+
type: c.metadata.type || "chunk"
|
|
17319
|
+
}
|
|
17320
|
+
}))
|
|
17321
|
+
);
|
|
17322
|
+
}
|
|
17323
|
+
}
|
|
17324
|
+
async search(data, opts, scope) {
|
|
17325
|
+
const filterParts = [
|
|
17326
|
+
`projectId = '${scope.projectId}'`,
|
|
17327
|
+
`scopeName = '${scope.scopeName}'`
|
|
17328
|
+
];
|
|
17329
|
+
if (opts.filter) {
|
|
17330
|
+
filterParts.push(opts.filter);
|
|
17331
|
+
}
|
|
17332
|
+
const results = await this.chunksNs.query({
|
|
17333
|
+
data,
|
|
17334
|
+
topK: opts.limit,
|
|
17335
|
+
includeMetadata: true,
|
|
17336
|
+
filter: filterParts.join(" AND "),
|
|
17337
|
+
queryMode: QueryMode.HYBRID,
|
|
17338
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17339
|
+
});
|
|
17340
|
+
return results.map((doc) => ({
|
|
17341
|
+
id: String(doc.id),
|
|
17342
|
+
score: doc.score,
|
|
17343
|
+
metadata: {
|
|
17344
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
17345
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
17346
|
+
url: doc.metadata?.url ?? "",
|
|
17347
|
+
path: doc.metadata?.path ?? "",
|
|
17348
|
+
title: doc.metadata?.title ?? "",
|
|
17349
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17350
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17351
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
17352
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17353
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17354
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
17355
|
+
depth: doc.metadata?.depth ?? 0,
|
|
17356
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17357
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17358
|
+
tags: doc.metadata?.tags ?? [],
|
|
17359
|
+
description: doc.metadata?.description || void 0,
|
|
17360
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17361
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17362
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17363
|
+
}
|
|
17364
|
+
}));
|
|
17365
|
+
}
|
|
17366
|
+
async searchChunksByUrl(data, url, opts, scope) {
|
|
17367
|
+
const filterParts = [
|
|
17368
|
+
`projectId = '${scope.projectId}'`,
|
|
17369
|
+
`scopeName = '${scope.scopeName}'`,
|
|
17370
|
+
`url = '${url}'`
|
|
17371
|
+
];
|
|
17372
|
+
if (opts.filter) {
|
|
17373
|
+
filterParts.push(opts.filter);
|
|
17374
|
+
}
|
|
17375
|
+
const results = await this.chunksNs.query({
|
|
17376
|
+
data,
|
|
17377
|
+
topK: opts.limit,
|
|
17378
|
+
includeMetadata: true,
|
|
17379
|
+
filter: filterParts.join(" AND "),
|
|
17380
|
+
queryMode: QueryMode.HYBRID,
|
|
17381
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17176
17382
|
});
|
|
17177
17383
|
return results.map((doc) => ({
|
|
17178
|
-
id: doc.id,
|
|
17384
|
+
id: String(doc.id),
|
|
17179
17385
|
score: doc.score,
|
|
17180
17386
|
metadata: {
|
|
17181
17387
|
projectId: doc.metadata?.projectId ?? "",
|
|
17182
17388
|
scopeName: doc.metadata?.scopeName ?? "",
|
|
17183
|
-
url: doc.
|
|
17389
|
+
url: doc.metadata?.url ?? "",
|
|
17184
17390
|
path: doc.metadata?.path ?? "",
|
|
17185
|
-
title: doc.
|
|
17186
|
-
sectionTitle: doc.
|
|
17187
|
-
headingPath: doc.
|
|
17391
|
+
title: doc.metadata?.title ?? "",
|
|
17392
|
+
sectionTitle: doc.metadata?.sectionTitle ?? "",
|
|
17393
|
+
headingPath: doc.metadata?.headingPath ? String(doc.metadata.headingPath).split(" > ").filter(Boolean) : [],
|
|
17188
17394
|
snippet: doc.metadata?.snippet ?? "",
|
|
17189
|
-
chunkText: doc.
|
|
17395
|
+
chunkText: doc.metadata?.chunkText ?? "",
|
|
17190
17396
|
ordinal: doc.metadata?.ordinal ?? 0,
|
|
17191
17397
|
contentHash: doc.metadata?.contentHash ?? "",
|
|
17192
17398
|
depth: doc.metadata?.depth ?? 0,
|
|
17193
17399
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17194
17400
|
routeFile: doc.metadata?.routeFile ?? "",
|
|
17195
|
-
tags: doc.
|
|
17401
|
+
tags: doc.metadata?.tags ?? [],
|
|
17196
17402
|
description: doc.metadata?.description || void 0,
|
|
17197
|
-
keywords: doc.metadata?.keywords ? doc.metadata.keywords
|
|
17403
|
+
keywords: doc.metadata?.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17404
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0,
|
|
17405
|
+
incomingAnchorText: doc.metadata?.incomingAnchorText || void 0
|
|
17198
17406
|
}
|
|
17199
17407
|
}));
|
|
17200
17408
|
}
|
|
17201
|
-
async
|
|
17202
|
-
|
|
17409
|
+
async searchPagesByText(data, opts, scope) {
|
|
17410
|
+
return this.queryPages({ data }, opts, scope);
|
|
17411
|
+
}
|
|
17412
|
+
async searchPagesByVector(vector, opts, scope) {
|
|
17413
|
+
return this.queryPages({ vector }, opts, scope);
|
|
17414
|
+
}
|
|
17415
|
+
async queryPages(input, opts, scope) {
|
|
17416
|
+
const filterParts = [
|
|
17417
|
+
`projectId = '${scope.projectId}'`,
|
|
17418
|
+
`scopeName = '${scope.scopeName}'`
|
|
17419
|
+
];
|
|
17420
|
+
if (opts.filter) {
|
|
17421
|
+
filterParts.push(opts.filter);
|
|
17422
|
+
}
|
|
17203
17423
|
let results;
|
|
17204
17424
|
try {
|
|
17205
|
-
results = await
|
|
17206
|
-
|
|
17207
|
-
|
|
17208
|
-
|
|
17209
|
-
|
|
17210
|
-
|
|
17211
|
-
|
|
17425
|
+
results = await this.pagesNs.query({
|
|
17426
|
+
...input,
|
|
17427
|
+
topK: opts.limit,
|
|
17428
|
+
includeMetadata: true,
|
|
17429
|
+
filter: filterParts.join(" AND "),
|
|
17430
|
+
queryMode: QueryMode.HYBRID,
|
|
17431
|
+
fusionAlgorithm: FusionAlgorithm.DBSF
|
|
17212
17432
|
});
|
|
17213
17433
|
} catch {
|
|
17214
17434
|
return [];
|
|
17215
17435
|
}
|
|
17216
17436
|
return results.map((doc) => ({
|
|
17217
|
-
id: doc.id,
|
|
17437
|
+
id: String(doc.id),
|
|
17218
17438
|
score: doc.score,
|
|
17219
|
-
title: doc.
|
|
17220
|
-
url: doc.
|
|
17221
|
-
description: doc.
|
|
17222
|
-
tags: doc.
|
|
17439
|
+
title: doc.metadata?.title ?? "",
|
|
17440
|
+
url: doc.metadata?.url ?? "",
|
|
17441
|
+
description: doc.metadata?.description ?? "",
|
|
17442
|
+
tags: doc.metadata?.tags ?? [],
|
|
17223
17443
|
depth: doc.metadata?.depth ?? 0,
|
|
17224
17444
|
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
17225
|
-
routeFile: doc.metadata?.routeFile ?? ""
|
|
17445
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
17446
|
+
publishedAt: typeof doc.metadata?.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17226
17447
|
}));
|
|
17227
17448
|
}
|
|
17228
|
-
async deleteByIds(ids,
|
|
17449
|
+
async deleteByIds(ids, _scope) {
|
|
17229
17450
|
if (ids.length === 0) return;
|
|
17230
|
-
const
|
|
17231
|
-
const BATCH_SIZE = 500;
|
|
17451
|
+
const BATCH_SIZE = 90;
|
|
17232
17452
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17233
17453
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17234
|
-
await
|
|
17454
|
+
await this.chunksNs.delete(batch);
|
|
17235
17455
|
}
|
|
17236
17456
|
}
|
|
17237
17457
|
async deleteScope(scope) {
|
|
17238
|
-
|
|
17239
|
-
const
|
|
17240
|
-
|
|
17241
|
-
|
|
17242
|
-
|
|
17243
|
-
|
|
17244
|
-
|
|
17245
|
-
|
|
17246
|
-
|
|
17458
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17459
|
+
const ids = [];
|
|
17460
|
+
let cursor = "0";
|
|
17461
|
+
try {
|
|
17462
|
+
for (; ; ) {
|
|
17463
|
+
const result = await ns.range({
|
|
17464
|
+
cursor,
|
|
17465
|
+
limit: 100,
|
|
17466
|
+
includeMetadata: true
|
|
17467
|
+
});
|
|
17468
|
+
for (const doc of result.vectors) {
|
|
17469
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17470
|
+
ids.push(String(doc.id));
|
|
17471
|
+
}
|
|
17472
|
+
}
|
|
17473
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17474
|
+
cursor = result.nextCursor;
|
|
17475
|
+
}
|
|
17476
|
+
} catch {
|
|
17477
|
+
}
|
|
17478
|
+
if (ids.length > 0) {
|
|
17479
|
+
const BATCH_SIZE = 90;
|
|
17480
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17481
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17482
|
+
await ns.delete(batch);
|
|
17483
|
+
}
|
|
17484
|
+
}
|
|
17247
17485
|
}
|
|
17248
17486
|
}
|
|
17249
17487
|
async listScopes(projectId) {
|
|
17250
|
-
const
|
|
17251
|
-
const
|
|
17252
|
-
|
|
17253
|
-
|
|
17254
|
-
|
|
17255
|
-
|
|
17256
|
-
|
|
17257
|
-
|
|
17258
|
-
|
|
17259
|
-
|
|
17260
|
-
|
|
17261
|
-
|
|
17262
|
-
|
|
17263
|
-
|
|
17264
|
-
|
|
17265
|
-
|
|
17488
|
+
const scopeMap = /* @__PURE__ */ new Map();
|
|
17489
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17490
|
+
let cursor = "0";
|
|
17491
|
+
try {
|
|
17492
|
+
for (; ; ) {
|
|
17493
|
+
const result = await ns.range({
|
|
17494
|
+
cursor,
|
|
17495
|
+
limit: 100,
|
|
17496
|
+
includeMetadata: true
|
|
17497
|
+
});
|
|
17498
|
+
for (const doc of result.vectors) {
|
|
17499
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17500
|
+
const scopeName = doc.metadata.scopeName ?? "";
|
|
17501
|
+
scopeMap.set(scopeName, (scopeMap.get(scopeName) ?? 0) + 1);
|
|
17502
|
+
}
|
|
17503
|
+
}
|
|
17504
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17505
|
+
cursor = result.nextCursor;
|
|
17506
|
+
}
|
|
17507
|
+
} catch {
|
|
17508
|
+
}
|
|
17509
|
+
}
|
|
17510
|
+
return [...scopeMap.entries()].map(([scopeName, count]) => ({
|
|
17511
|
+
projectId,
|
|
17512
|
+
scopeName,
|
|
17513
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17514
|
+
documentCount: count
|
|
17515
|
+
}));
|
|
17516
|
+
}
|
|
17517
|
+
async getContentHashes(scope) {
|
|
17518
|
+
return this.scanHashes(this.chunksNs, scope);
|
|
17519
|
+
}
|
|
17520
|
+
/**
|
|
17521
|
+
* Fetch content hashes for a specific set of chunk keys using direct fetch()
|
|
17522
|
+
* instead of range(). This avoids potential issues with range() returning
|
|
17523
|
+
* vectors from the wrong namespace on hybrid indexes.
|
|
17524
|
+
*/
|
|
17525
|
+
async fetchContentHashesForKeys(keys, scope) {
|
|
17526
|
+
const map = /* @__PURE__ */ new Map();
|
|
17527
|
+
if (keys.length === 0) return map;
|
|
17528
|
+
const BATCH_SIZE = 90;
|
|
17529
|
+
for (let i = 0; i < keys.length; i += BATCH_SIZE) {
|
|
17530
|
+
const batch = keys.slice(i, i + BATCH_SIZE);
|
|
17266
17531
|
try {
|
|
17267
|
-
const
|
|
17268
|
-
|
|
17269
|
-
projectId,
|
|
17270
|
-
scopeName,
|
|
17271
|
-
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
17272
|
-
documentCount: info.documentCount
|
|
17532
|
+
const results = await this.chunksNs.fetch(batch, {
|
|
17533
|
+
includeMetadata: true
|
|
17273
17534
|
});
|
|
17535
|
+
for (const doc of results) {
|
|
17536
|
+
if (doc && doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17537
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17538
|
+
}
|
|
17539
|
+
}
|
|
17274
17540
|
} catch {
|
|
17275
|
-
|
|
17276
|
-
|
|
17277
|
-
|
|
17278
|
-
|
|
17279
|
-
|
|
17541
|
+
}
|
|
17542
|
+
}
|
|
17543
|
+
return map;
|
|
17544
|
+
}
|
|
17545
|
+
/**
|
|
17546
|
+
* Scan all IDs in the chunks namespace for this scope.
|
|
17547
|
+
* Used for deletion detection (finding stale chunk keys).
|
|
17548
|
+
*/
|
|
17549
|
+
async scanChunkIds(scope) {
|
|
17550
|
+
const ids = /* @__PURE__ */ new Set();
|
|
17551
|
+
let cursor = "0";
|
|
17552
|
+
try {
|
|
17553
|
+
for (; ; ) {
|
|
17554
|
+
const result = await this.chunksNs.range({
|
|
17555
|
+
cursor,
|
|
17556
|
+
limit: 100,
|
|
17557
|
+
includeMetadata: true
|
|
17280
17558
|
});
|
|
17559
|
+
for (const doc of result.vectors) {
|
|
17560
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17561
|
+
ids.add(String(doc.id));
|
|
17562
|
+
}
|
|
17563
|
+
}
|
|
17564
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17565
|
+
cursor = result.nextCursor;
|
|
17281
17566
|
}
|
|
17567
|
+
} catch {
|
|
17282
17568
|
}
|
|
17283
|
-
return
|
|
17569
|
+
return ids;
|
|
17284
17570
|
}
|
|
17285
|
-
async
|
|
17571
|
+
async scanHashes(ns, scope) {
|
|
17572
|
+
const map = /* @__PURE__ */ new Map();
|
|
17573
|
+
let cursor = "0";
|
|
17574
|
+
try {
|
|
17575
|
+
for (; ; ) {
|
|
17576
|
+
const result = await ns.range({
|
|
17577
|
+
cursor,
|
|
17578
|
+
limit: 100,
|
|
17579
|
+
includeMetadata: true
|
|
17580
|
+
});
|
|
17581
|
+
for (const doc of result.vectors) {
|
|
17582
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17583
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17584
|
+
}
|
|
17585
|
+
}
|
|
17586
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17587
|
+
cursor = result.nextCursor;
|
|
17588
|
+
}
|
|
17589
|
+
} catch {
|
|
17590
|
+
}
|
|
17591
|
+
return map;
|
|
17592
|
+
}
|
|
17593
|
+
async listPages(scope, opts) {
|
|
17594
|
+
const cursor = opts?.cursor ?? "0";
|
|
17595
|
+
const limit = opts?.limit ?? 50;
|
|
17596
|
+
try {
|
|
17597
|
+
const result = await this.pagesNs.range({
|
|
17598
|
+
cursor,
|
|
17599
|
+
limit,
|
|
17600
|
+
includeMetadata: true
|
|
17601
|
+
});
|
|
17602
|
+
const pages = result.vectors.filter(
|
|
17603
|
+
(doc) => doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && (!opts?.pathPrefix || (doc.metadata?.url ?? "").startsWith(opts.pathPrefix))
|
|
17604
|
+
).map((doc) => ({
|
|
17605
|
+
url: doc.metadata?.url ?? "",
|
|
17606
|
+
title: doc.metadata?.title ?? "",
|
|
17607
|
+
description: doc.metadata?.description ?? "",
|
|
17608
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
17609
|
+
}));
|
|
17610
|
+
const response = { pages };
|
|
17611
|
+
if (result.nextCursor && result.nextCursor !== "0") {
|
|
17612
|
+
response.nextCursor = result.nextCursor;
|
|
17613
|
+
}
|
|
17614
|
+
return response;
|
|
17615
|
+
} catch {
|
|
17616
|
+
return { pages: [] };
|
|
17617
|
+
}
|
|
17618
|
+
}
|
|
17619
|
+
async getPageHashes(scope) {
|
|
17286
17620
|
const map = /* @__PURE__ */ new Map();
|
|
17287
|
-
const index = this.chunkIndex(scope);
|
|
17288
17621
|
let cursor = "0";
|
|
17289
17622
|
try {
|
|
17290
17623
|
for (; ; ) {
|
|
17291
|
-
const result = await
|
|
17292
|
-
|
|
17293
|
-
|
|
17294
|
-
|
|
17624
|
+
const result = await this.pagesNs.range({
|
|
17625
|
+
cursor,
|
|
17626
|
+
limit: 100,
|
|
17627
|
+
includeMetadata: true
|
|
17628
|
+
});
|
|
17629
|
+
for (const doc of result.vectors) {
|
|
17630
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName && doc.metadata?.contentHash) {
|
|
17631
|
+
map.set(String(doc.id), doc.metadata.contentHash);
|
|
17295
17632
|
}
|
|
17296
17633
|
}
|
|
17297
17634
|
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
@@ -17301,47 +17638,43 @@ var UpstashSearchStore = class {
|
|
|
17301
17638
|
}
|
|
17302
17639
|
return map;
|
|
17303
17640
|
}
|
|
17641
|
+
async deletePagesByIds(ids, _scope) {
|
|
17642
|
+
if (ids.length === 0) return;
|
|
17643
|
+
const BATCH_SIZE = 90;
|
|
17644
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17645
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17646
|
+
await this.pagesNs.delete(batch);
|
|
17647
|
+
}
|
|
17648
|
+
}
|
|
17304
17649
|
async upsertPages(pages, scope) {
|
|
17305
17650
|
if (pages.length === 0) return;
|
|
17306
|
-
const
|
|
17307
|
-
const BATCH_SIZE = 50;
|
|
17651
|
+
const BATCH_SIZE = 90;
|
|
17308
17652
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
17309
17653
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
17310
|
-
|
|
17311
|
-
|
|
17312
|
-
|
|
17313
|
-
|
|
17314
|
-
|
|
17315
|
-
|
|
17316
|
-
|
|
17317
|
-
|
|
17318
|
-
|
|
17319
|
-
|
|
17320
|
-
}
|
|
17321
|
-
|
|
17322
|
-
markdown: p.markdown,
|
|
17323
|
-
projectId: p.projectId,
|
|
17324
|
-
scopeName: p.scopeName,
|
|
17325
|
-
routeFile: p.routeFile,
|
|
17326
|
-
routeResolution: p.routeResolution,
|
|
17327
|
-
incomingLinks: p.incomingLinks,
|
|
17328
|
-
outgoingLinks: p.outgoingLinks,
|
|
17329
|
-
depth: p.depth,
|
|
17330
|
-
indexedAt: p.indexedAt
|
|
17331
|
-
}
|
|
17332
|
-
}));
|
|
17333
|
-
await index.upsert(docs);
|
|
17654
|
+
await this.pagesNs.upsert(
|
|
17655
|
+
batch.map((p) => ({
|
|
17656
|
+
id: p.id,
|
|
17657
|
+
data: p.data,
|
|
17658
|
+
metadata: {
|
|
17659
|
+
...p.metadata,
|
|
17660
|
+
projectId: scope.projectId,
|
|
17661
|
+
scopeName: scope.scopeName,
|
|
17662
|
+
type: "page"
|
|
17663
|
+
}
|
|
17664
|
+
}))
|
|
17665
|
+
);
|
|
17334
17666
|
}
|
|
17335
17667
|
}
|
|
17336
17668
|
async getPage(url, scope) {
|
|
17337
|
-
const index = this.pageIndex(scope);
|
|
17338
17669
|
try {
|
|
17339
|
-
const results = await
|
|
17670
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17671
|
+
includeMetadata: true
|
|
17672
|
+
});
|
|
17340
17673
|
const doc = results[0];
|
|
17341
|
-
if (!doc) return null;
|
|
17674
|
+
if (!doc || !doc.metadata) return null;
|
|
17342
17675
|
return {
|
|
17343
|
-
url: doc.
|
|
17344
|
-
title: doc.
|
|
17676
|
+
url: doc.metadata.url,
|
|
17677
|
+
title: doc.metadata.title,
|
|
17345
17678
|
markdown: doc.metadata.markdown,
|
|
17346
17679
|
projectId: doc.metadata.projectId,
|
|
17347
17680
|
scopeName: doc.metadata.scopeName,
|
|
@@ -17349,27 +17682,86 @@ var UpstashSearchStore = class {
|
|
|
17349
17682
|
routeResolution: doc.metadata.routeResolution,
|
|
17350
17683
|
incomingLinks: doc.metadata.incomingLinks,
|
|
17351
17684
|
outgoingLinks: doc.metadata.outgoingLinks,
|
|
17685
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? void 0,
|
|
17352
17686
|
depth: doc.metadata.depth,
|
|
17353
|
-
tags: doc.
|
|
17687
|
+
tags: doc.metadata.tags ?? [],
|
|
17354
17688
|
indexedAt: doc.metadata.indexedAt,
|
|
17355
|
-
summary: doc.
|
|
17356
|
-
description: doc.
|
|
17357
|
-
keywords: doc.
|
|
17689
|
+
summary: doc.metadata.summary || void 0,
|
|
17690
|
+
description: doc.metadata.description || void 0,
|
|
17691
|
+
keywords: doc.metadata.keywords?.length ? doc.metadata.keywords : void 0,
|
|
17692
|
+
publishedAt: typeof doc.metadata.publishedAt === "number" ? doc.metadata.publishedAt : void 0
|
|
17358
17693
|
};
|
|
17359
17694
|
} catch {
|
|
17360
17695
|
return null;
|
|
17361
17696
|
}
|
|
17362
17697
|
}
|
|
17698
|
+
async fetchPageWithVector(url, scope) {
|
|
17699
|
+
try {
|
|
17700
|
+
const results = await this.pagesNs.fetch([url], {
|
|
17701
|
+
includeMetadata: true,
|
|
17702
|
+
includeVectors: true
|
|
17703
|
+
});
|
|
17704
|
+
const doc = results[0];
|
|
17705
|
+
if (!doc || !doc.metadata || !doc.vector) return null;
|
|
17706
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17707
|
+
return null;
|
|
17708
|
+
}
|
|
17709
|
+
return { metadata: doc.metadata, vector: doc.vector };
|
|
17710
|
+
} catch {
|
|
17711
|
+
return null;
|
|
17712
|
+
}
|
|
17713
|
+
}
|
|
17714
|
+
async fetchPagesBatch(urls, scope) {
|
|
17715
|
+
if (urls.length === 0) return [];
|
|
17716
|
+
try {
|
|
17717
|
+
const results = await this.pagesNs.fetch(urls, {
|
|
17718
|
+
includeMetadata: true
|
|
17719
|
+
});
|
|
17720
|
+
const out = [];
|
|
17721
|
+
for (const doc of results) {
|
|
17722
|
+
if (!doc || !doc.metadata) continue;
|
|
17723
|
+
if (doc.metadata.projectId !== scope.projectId || doc.metadata.scopeName !== scope.scopeName) {
|
|
17724
|
+
continue;
|
|
17725
|
+
}
|
|
17726
|
+
out.push({
|
|
17727
|
+
url: doc.metadata.url,
|
|
17728
|
+
title: doc.metadata.title,
|
|
17729
|
+
routeFile: doc.metadata.routeFile,
|
|
17730
|
+
outgoingLinkUrls: doc.metadata.outgoingLinkUrls ?? []
|
|
17731
|
+
});
|
|
17732
|
+
}
|
|
17733
|
+
return out;
|
|
17734
|
+
} catch {
|
|
17735
|
+
return [];
|
|
17736
|
+
}
|
|
17737
|
+
}
|
|
17363
17738
|
async deletePages(scope) {
|
|
17739
|
+
const ids = [];
|
|
17740
|
+
let cursor = "0";
|
|
17364
17741
|
try {
|
|
17365
|
-
|
|
17366
|
-
|
|
17742
|
+
for (; ; ) {
|
|
17743
|
+
const result = await this.pagesNs.range({
|
|
17744
|
+
cursor,
|
|
17745
|
+
limit: 100,
|
|
17746
|
+
includeMetadata: true
|
|
17747
|
+
});
|
|
17748
|
+
for (const doc of result.vectors) {
|
|
17749
|
+
if (doc.metadata?.projectId === scope.projectId && doc.metadata?.scopeName === scope.scopeName) {
|
|
17750
|
+
ids.push(String(doc.id));
|
|
17751
|
+
}
|
|
17752
|
+
}
|
|
17753
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17754
|
+
cursor = result.nextCursor;
|
|
17755
|
+
}
|
|
17367
17756
|
} catch {
|
|
17368
17757
|
}
|
|
17758
|
+
if (ids.length > 0) {
|
|
17759
|
+
await this.deletePagesByIds(ids, scope);
|
|
17760
|
+
}
|
|
17369
17761
|
}
|
|
17370
17762
|
async health() {
|
|
17371
17763
|
try {
|
|
17372
|
-
await this.
|
|
17764
|
+
await this.index.info();
|
|
17373
17765
|
return { ok: true };
|
|
17374
17766
|
} catch (error) {
|
|
17375
17767
|
return {
|
|
@@ -17379,14 +17771,31 @@ var UpstashSearchStore = class {
|
|
|
17379
17771
|
}
|
|
17380
17772
|
}
|
|
17381
17773
|
async dropAllIndexes(projectId) {
|
|
17382
|
-
const
|
|
17383
|
-
|
|
17384
|
-
|
|
17385
|
-
|
|
17386
|
-
|
|
17387
|
-
const
|
|
17388
|
-
|
|
17389
|
-
|
|
17774
|
+
for (const ns of [this.chunksNs, this.pagesNs]) {
|
|
17775
|
+
const ids = [];
|
|
17776
|
+
let cursor = "0";
|
|
17777
|
+
try {
|
|
17778
|
+
for (; ; ) {
|
|
17779
|
+
const result = await ns.range({
|
|
17780
|
+
cursor,
|
|
17781
|
+
limit: 100,
|
|
17782
|
+
includeMetadata: true
|
|
17783
|
+
});
|
|
17784
|
+
for (const doc of result.vectors) {
|
|
17785
|
+
if (doc.metadata?.projectId === projectId) {
|
|
17786
|
+
ids.push(String(doc.id));
|
|
17787
|
+
}
|
|
17788
|
+
}
|
|
17789
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
17790
|
+
cursor = result.nextCursor;
|
|
17791
|
+
}
|
|
17792
|
+
} catch {
|
|
17793
|
+
}
|
|
17794
|
+
if (ids.length > 0) {
|
|
17795
|
+
const BATCH_SIZE = 90;
|
|
17796
|
+
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
17797
|
+
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
17798
|
+
await ns.delete(batch);
|
|
17390
17799
|
}
|
|
17391
17800
|
}
|
|
17392
17801
|
}
|
|
@@ -17400,12 +17809,16 @@ async function createUpstashStore(config) {
|
|
|
17400
17809
|
if (!url || !token) {
|
|
17401
17810
|
throw new SearchSocketError(
|
|
17402
17811
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17403
|
-
`Missing Upstash
|
|
17812
|
+
`Missing Upstash Vector credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
17404
17813
|
);
|
|
17405
17814
|
}
|
|
17406
|
-
const {
|
|
17407
|
-
const
|
|
17408
|
-
return new UpstashSearchStore({
|
|
17815
|
+
const { Index } = await import('@upstash/vector');
|
|
17816
|
+
const index = new Index({ url, token });
|
|
17817
|
+
return new UpstashSearchStore({
|
|
17818
|
+
index,
|
|
17819
|
+
pagesNamespace: config.upstash.namespaces.pages,
|
|
17820
|
+
chunksNamespace: config.upstash.namespaces.chunks
|
|
17821
|
+
});
|
|
17409
17822
|
}
|
|
17410
17823
|
|
|
17411
17824
|
// src/utils/pattern.ts
|
|
@@ -17448,29 +17861,65 @@ function nonNegativeOrZero(value) {
|
|
|
17448
17861
|
function normalizeForTitleMatch(text) {
|
|
17449
17862
|
return text.toLowerCase().replace(/[^a-z0-9\s]/g, "").replace(/\s+/g, " ").trim();
|
|
17450
17863
|
}
|
|
17451
|
-
function rankHits(hits, config, query) {
|
|
17864
|
+
function rankHits(hits, config, query, debug) {
|
|
17452
17865
|
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
17453
17866
|
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
17454
17867
|
return hits.map((hit) => {
|
|
17455
|
-
|
|
17868
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
17869
|
+
let score = baseScore;
|
|
17870
|
+
let incomingLinkBoostValue = 0;
|
|
17456
17871
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
17457
17872
|
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
17458
|
-
|
|
17873
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
17874
|
+
score += incomingLinkBoostValue;
|
|
17459
17875
|
}
|
|
17876
|
+
let depthBoostValue = 0;
|
|
17460
17877
|
if (config.ranking.enableDepthBoost) {
|
|
17461
17878
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
17462
|
-
|
|
17879
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
17880
|
+
score += depthBoostValue;
|
|
17463
17881
|
}
|
|
17882
|
+
let titleMatchBoostValue = 0;
|
|
17464
17883
|
if (normalizedQuery && titleMatchWeight > 0) {
|
|
17465
17884
|
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
17466
17885
|
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
17467
|
-
|
|
17886
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
17887
|
+
score += titleMatchBoostValue;
|
|
17468
17888
|
}
|
|
17469
17889
|
}
|
|
17470
|
-
|
|
17890
|
+
let freshnessBoostValue = 0;
|
|
17891
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
17892
|
+
const publishedAt = hit.metadata.publishedAt;
|
|
17893
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
17894
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
17895
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
17896
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
17897
|
+
score += freshnessBoostValue;
|
|
17898
|
+
}
|
|
17899
|
+
}
|
|
17900
|
+
let anchorTextMatchBoostValue = 0;
|
|
17901
|
+
if (config.ranking.enableAnchorTextBoost && normalizedQuery && config.ranking.weights.anchorText > 0) {
|
|
17902
|
+
const normalizedAnchorText = normalizeForTitleMatch(hit.metadata.incomingAnchorText ?? "");
|
|
17903
|
+
if (normalizedAnchorText.length > 0 && normalizedQuery.length > 0 && (normalizedAnchorText.includes(normalizedQuery) || normalizedQuery.includes(normalizedAnchorText))) {
|
|
17904
|
+
anchorTextMatchBoostValue = config.ranking.weights.anchorText;
|
|
17905
|
+
score += anchorTextMatchBoostValue;
|
|
17906
|
+
}
|
|
17907
|
+
}
|
|
17908
|
+
const result = {
|
|
17471
17909
|
hit,
|
|
17472
17910
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
17473
17911
|
};
|
|
17912
|
+
if (debug) {
|
|
17913
|
+
result.breakdown = {
|
|
17914
|
+
baseScore,
|
|
17915
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
17916
|
+
depthBoost: depthBoostValue,
|
|
17917
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
17918
|
+
freshnessBoost: freshnessBoostValue,
|
|
17919
|
+
anchorTextMatchBoost: anchorTextMatchBoostValue
|
|
17920
|
+
};
|
|
17921
|
+
}
|
|
17922
|
+
return result;
|
|
17474
17923
|
}).sort((a, b) => {
|
|
17475
17924
|
const delta = b.finalScore - a.finalScore;
|
|
17476
17925
|
return Number.isNaN(delta) ? 0 : delta;
|
|
@@ -17479,12 +17928,13 @@ function rankHits(hits, config, query) {
|
|
|
17479
17928
|
function trimByScoreGap(results, config) {
|
|
17480
17929
|
if (results.length === 0) return results;
|
|
17481
17930
|
const threshold = config.ranking.scoreGapThreshold;
|
|
17482
|
-
const
|
|
17483
|
-
if (
|
|
17484
|
-
const
|
|
17485
|
-
|
|
17486
|
-
|
|
17487
|
-
|
|
17931
|
+
const minScoreRatio = config.ranking.minScoreRatio;
|
|
17932
|
+
if (minScoreRatio > 0 && results.length > 0) {
|
|
17933
|
+
const topScore = results[0].pageScore;
|
|
17934
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
17935
|
+
const minThreshold = topScore * minScoreRatio;
|
|
17936
|
+
results = results.filter((r) => r.pageScore >= minThreshold);
|
|
17937
|
+
}
|
|
17488
17938
|
}
|
|
17489
17939
|
if (threshold > 0 && results.length > 1) {
|
|
17490
17940
|
for (let i = 1; i < results.length; i++) {
|
|
@@ -17554,82 +18004,283 @@ function aggregateByPage(ranked, config) {
|
|
|
17554
18004
|
return Number.isNaN(delta) ? 0 : delta;
|
|
17555
18005
|
});
|
|
17556
18006
|
}
|
|
17557
|
-
function
|
|
17558
|
-
|
|
17559
|
-
const
|
|
17560
|
-
|
|
17561
|
-
|
|
17562
|
-
|
|
17563
|
-
|
|
17564
|
-
|
|
17565
|
-
|
|
17566
|
-
|
|
17567
|
-
|
|
17568
|
-
if (pageHit) {
|
|
17569
|
-
pagesWithChunks.add(url);
|
|
17570
|
-
const blended = (1 - w) * ranked.finalScore + w * pageHit.score;
|
|
17571
|
-
return {
|
|
17572
|
-
hit: ranked.hit,
|
|
17573
|
-
finalScore: Number.isFinite(blended) ? blended : ranked.finalScore
|
|
17574
|
-
};
|
|
18007
|
+
function rankPageHits(pageHits, config, query, debug) {
|
|
18008
|
+
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
18009
|
+
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
18010
|
+
return pageHits.map((hit) => {
|
|
18011
|
+
const baseScore = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
18012
|
+
let score = baseScore;
|
|
18013
|
+
let incomingLinkBoostValue = 0;
|
|
18014
|
+
if (config.ranking.enableIncomingLinkBoost) {
|
|
18015
|
+
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.incomingLinks));
|
|
18016
|
+
incomingLinkBoostValue = incomingBoost * config.ranking.weights.incomingLinks;
|
|
18017
|
+
score += incomingLinkBoostValue;
|
|
17575
18018
|
}
|
|
17576
|
-
|
|
17577
|
-
|
|
17578
|
-
|
|
17579
|
-
|
|
17580
|
-
|
|
17581
|
-
|
|
17582
|
-
|
|
17583
|
-
|
|
17584
|
-
|
|
17585
|
-
|
|
17586
|
-
|
|
17587
|
-
|
|
17588
|
-
|
|
17589
|
-
|
|
17590
|
-
|
|
17591
|
-
|
|
17592
|
-
|
|
17593
|
-
|
|
17594
|
-
|
|
17595
|
-
|
|
17596
|
-
|
|
17597
|
-
|
|
17598
|
-
routeFile: pageHit.routeFile,
|
|
17599
|
-
tags: pageHit.tags
|
|
18019
|
+
let depthBoostValue = 0;
|
|
18020
|
+
if (config.ranking.enableDepthBoost) {
|
|
18021
|
+
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.depth));
|
|
18022
|
+
depthBoostValue = depthBoost * config.ranking.weights.depth;
|
|
18023
|
+
score += depthBoostValue;
|
|
18024
|
+
}
|
|
18025
|
+
let titleMatchBoostValue = 0;
|
|
18026
|
+
if (normalizedQuery && titleMatchWeight > 0) {
|
|
18027
|
+
const normalizedTitle = normalizeForTitleMatch(hit.title);
|
|
18028
|
+
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
18029
|
+
titleMatchBoostValue = titleMatchWeight;
|
|
18030
|
+
score += titleMatchBoostValue;
|
|
18031
|
+
}
|
|
18032
|
+
}
|
|
18033
|
+
let freshnessBoostValue = 0;
|
|
18034
|
+
if (config.ranking.enableFreshnessBoost) {
|
|
18035
|
+
const publishedAt = hit.publishedAt;
|
|
18036
|
+
if (typeof publishedAt === "number" && Number.isFinite(publishedAt)) {
|
|
18037
|
+
const daysSince = Math.max(0, (Date.now() - publishedAt) / 864e5);
|
|
18038
|
+
const decay = 1 / (1 + nonNegativeOrZero(daysSince) * config.ranking.freshnessDecayRate);
|
|
18039
|
+
freshnessBoostValue = decay * config.ranking.weights.freshness;
|
|
18040
|
+
score += freshnessBoostValue;
|
|
17600
18041
|
}
|
|
18042
|
+
}
|
|
18043
|
+
const pageWeight = findPageWeight(hit.url, config.ranking.pageWeights);
|
|
18044
|
+
if (pageWeight !== 1) {
|
|
18045
|
+
score *= pageWeight;
|
|
18046
|
+
}
|
|
18047
|
+
const result = {
|
|
18048
|
+
url: hit.url,
|
|
18049
|
+
title: hit.title,
|
|
18050
|
+
description: hit.description,
|
|
18051
|
+
routeFile: hit.routeFile,
|
|
18052
|
+
depth: hit.depth,
|
|
18053
|
+
incomingLinks: hit.incomingLinks,
|
|
18054
|
+
tags: hit.tags,
|
|
18055
|
+
baseScore,
|
|
18056
|
+
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY,
|
|
18057
|
+
publishedAt: hit.publishedAt
|
|
17601
18058
|
};
|
|
17602
|
-
|
|
17603
|
-
|
|
17604
|
-
|
|
17605
|
-
|
|
17606
|
-
|
|
17607
|
-
|
|
18059
|
+
if (debug) {
|
|
18060
|
+
result.breakdown = {
|
|
18061
|
+
baseScore,
|
|
18062
|
+
pageWeight,
|
|
18063
|
+
incomingLinkBoost: incomingLinkBoostValue,
|
|
18064
|
+
depthBoost: depthBoostValue,
|
|
18065
|
+
titleMatchBoost: titleMatchBoostValue,
|
|
18066
|
+
freshnessBoost: freshnessBoostValue
|
|
18067
|
+
};
|
|
18068
|
+
}
|
|
18069
|
+
return result;
|
|
18070
|
+
}).filter((p) => findPageWeight(p.url, config.ranking.pageWeights) !== 0).sort((a, b) => {
|
|
17608
18071
|
const delta = b.finalScore - a.finalScore;
|
|
17609
18072
|
return Number.isNaN(delta) ? 0 : delta;
|
|
17610
18073
|
});
|
|
17611
18074
|
}
|
|
18075
|
+
/**
 * Trim a score-sorted list of page results.
 *
 * Two independent cuts, both read from `config.ranking`:
 *  1. minScoreRatio: when > 0 and the top finalScore is finite and
 *     positive, drop entries scoring below top * minScoreRatio.
 *  2. scoreGapThreshold: when > 0, cut the list at the first entry whose
 *     relative drop from its predecessor, (prev - current) / prev, reaches
 *     the threshold (only evaluated when prev > 0).
 *
 * @param results array of objects with a numeric `finalScore`, best first
 * @param config  object exposing ranking.scoreGapThreshold / minScoreRatio
 * @returns the input array itself when nothing is trimmed, otherwise a new
 *          shorter array
 */
function trimPagesByScoreGap(results, config) {
  if (results.length === 0) return results;
  const gapLimit = config.ranking.scoreGapThreshold;
  const ratio = config.ranking.minScoreRatio;
  let kept = results;
  if (ratio > 0) {
    const best = kept[0].finalScore;
    if (Number.isFinite(best) && best > 0) {
      const floor = best * ratio;
      kept = kept.filter((entry) => entry.finalScore >= floor);
    }
  }
  if (!(gapLimit > 0) || kept.length <= 1) return kept;
  const cutAt = kept.findIndex((entry, idx) => {
    if (idx === 0) return false;
    const before = kept[idx - 1].finalScore;
    return before > 0 && (before - entry.finalScore) / before >= gapLimit;
  });
  return cutAt === -1 ? kept : kept.slice(0, cutAt);
}
|
|
17612
18100
|
|
|
17613
|
-
// src/search/
|
|
17614
|
-
|
|
17615
|
-
|
|
17616
|
-
|
|
17617
|
-
|
|
17618
|
-
|
|
17619
|
-
|
|
17620
|
-
|
|
17621
|
-
|
|
17622
|
-
|
|
17623
|
-
|
|
17624
|
-
|
|
17625
|
-
|
|
17626
|
-
|
|
17627
|
-
this.cwd = options.cwd;
|
|
17628
|
-
this.config = options.config;
|
|
17629
|
-
this.store = options.store;
|
|
18101
|
+
// src/search/related-pages.ts
|
|
18102
|
+
/**
 * Path-similarity score of two URLs based on their shared leading segments.
 *
 * Both URLs are split on "/" (empty segments dropped). The score is
 * 2 * sharedPrefixLength / (segmentsA + segmentsB): 1 when both paths are
 * empty or identical, 0 when exactly one is empty or the first segments
 * differ.
 *
 * @param urlA first URL path
 * @param urlB second URL path
 * @returns number in [0, 1]
 */
function diceScore(urlA, urlB) {
  const partsA = urlA.split("/").filter(Boolean);
  const partsB = urlB.split("/").filter(Boolean);
  const total = partsA.length + partsB.length;
  if (total === 0) return 1;
  if (partsA.length === 0 || partsB.length === 0) return 0;
  let prefix = 0;
  // Count matching segments from the left; stop at the first mismatch.
  while (prefix < partsA.length && prefix < partsB.length && partsA[prefix] === partsB[prefix]) {
    prefix += 1;
  }
  return 2 * prefix / total;
}
|
|
18118
|
+
/**
 * Blend the three relatedness signals into one score:
 * 0.5 if the pages are linked, plus 0.3 * dice, plus 0.2 * semantic.
 *
 * @param isLinked truthy when a link relation exists
 * @param dice     path-similarity value
 * @param semantic semantic-similarity value
 * @returns the weighted sum
 */
function compositeScore(isLinked, dice, semantic) {
  const linkTerm = isLinked ? 0.5 : 0;
  const pathTerm = 0.3 * dice;
  const semanticTerm = 0.2 * semantic;
  return linkTerm + pathTerm + semanticTerm;
}
|
|
18121
|
+
/**
 * Pick the single label that best describes how two pages relate.
 *
 * Priority: outgoing link, then incoming link, then "sibling" when the
 * path similarity exceeds 0.4, else "semantic".
 *
 * @param isOutgoing truthy when the relation is an outgoing link
 * @param isIncoming truthy when the relation is an incoming link
 * @param dice       path-similarity value
 * @returns "outgoing_link" | "incoming_link" | "sibling" | "semantic"
 */
function dominantRelationshipType(isOutgoing, isIncoming, dice) {
  if (isOutgoing || isIncoming) {
    return isOutgoing ? "outgoing_link" : "incoming_link";
  }
  return dice > 0.4 ? "sibling" : "semantic";
}
|
|
18127
|
+
|
|
18128
|
+
// src/utils/structured-meta.ts
|
|
18129
|
+
// Identifier-shaped keys only: a letter or underscore first, then letters,
// digits or underscores.
var VALID_KEY_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
/**
 * Report whether `key` is an acceptable structured-meta key.
 *
 * @param key candidate key
 * @returns true when the key matches VALID_KEY_RE, false otherwise
 */
function validateMetaKey(key) {
  const isIdentifier = VALID_KEY_RE.test(key);
  return isIdentifier;
}
|
|
18133
|
+
/**
 * Convert the raw string content of a structured-meta entry into a typed
 * value.
 *
 * - "number" / "date": Number(content) when that is finite, otherwise the
 *   raw content is returned unchanged. Blank (empty or whitespace-only)
 *   content is treated as non-numeric: Number("") is 0, which would
 *   otherwise turn a missing value into 0 (or the epoch, for dates).
 * - "boolean": true only for the exact string "true".
 * - "string[]": comma-separated list with each entry trimmed; [] for
 *   falsy content.
 * - anything else: the content as-is.
 *
 * @param content  raw content value
 * @param dataType one of "number", "boolean", "string[]", "date", or other
 * @returns the parsed value, or `content` when it cannot be parsed
 */
function parseMetaValue(content, dataType) {
  switch (dataType) {
    case "number":
    case "date": {
      // Guard first: Number() coerces blank strings to 0.
      if (typeof content === "string" && content.trim() === "") return content;
      const parsed = Number(content);
      return Number.isFinite(parsed) ? parsed : content;
    }
    case "boolean":
      return content === "true";
    case "string[]":
      return content ? content.split(",").map((s) => s.trim()) : [];
    default:
      return content;
  }
}
|
|
18151
|
+
/**
 * Escape a string for use inside a single-quoted filter literal by
 * doubling every single quote.
 *
 * @param s raw string value
 * @returns the string with each ' replaced by ''
 */
function escapeFilterValue(s) {
  const pieces = s.split("'");
  return pieces.join("''");
}
|
|
18154
|
+
/**
 * Turn a { key: value } filter map into a filter expression over `meta.*`
 * fields, joining the clauses with " AND ".
 *
 * - Keys rejected by validateMetaKey() are skipped.
 * - String values become `meta.key CONTAINS '<escaped value>'`.
 * - Booleans and finite numbers become `meta.key = <value>`.
 * - Any other value (NaN, ±Infinity, objects, null, ...) is skipped rather
 *   than interpolated, since it would yield a malformed clause such as
 *   `meta.key = NaN` or `meta.key = [object Object]`.
 *
 * @param filters plain object mapping meta keys to string/number/boolean
 * @returns the filter string; "" when no clause was produced
 */
function buildMetaFilterString(filters) {
  const clauses = [];
  for (const [key, value] of Object.entries(filters)) {
    if (!validateMetaKey(key)) continue;
    const field = `meta.${key}`;
    if (typeof value === "string") {
      clauses.push(`${field} CONTAINS '${escapeFilterValue(value)}'`);
    } else if (typeof value === "boolean" || (typeof value === "number" && Number.isFinite(value))) {
      clauses.push(`${field} = ${value}`);
    }
  }
  return clauses.join(" AND ");
}
|
|
18169
|
+
|
|
18170
|
+
// src/search/engine.ts
|
|
18171
|
+
// Optional per-request ranking overrides. z.object() strips keys it does
// not declare, so every knob the ranking functions read from
// config.ranking must be listed here to be overridable. The freshness and
// anchor-text settings are included for that reason; all of them are
// optional, so existing requests validate exactly as before.
var rankingOverridesSchema = z.object({
  ranking: z.object({
    enableIncomingLinkBoost: z.boolean().optional(),
    enableDepthBoost: z.boolean().optional(),
    enableFreshnessBoost: z.boolean().optional(),
    enableAnchorTextBoost: z.boolean().optional(),
    // Used as a multiplier inside 1 / (1 + days * rate); negative values are rejected.
    freshnessDecayRate: z.number().min(0).optional(),
    aggregationCap: z.number().int().positive().optional(),
    aggregationDecay: z.number().min(0).max(1).optional(),
    minChunkScoreRatio: z.number().min(0).max(1).optional(),
    minScoreRatio: z.number().min(0).max(1).optional(),
    scoreGapThreshold: z.number().min(0).max(1).optional(),
    weights: z.object({
      incomingLinks: z.number().optional(),
      depth: z.number().optional(),
      aggregation: z.number().optional(),
      titleMatch: z.number().optional(),
      freshness: z.number().optional(),
      anchorText: z.number().optional()
    }).optional()
  }).optional(),
  search: z.object({
    pageSearchWeight: z.number().min(0).max(1).optional()
  }).optional()
}).optional();
// Shape of a search request: `q` is required (trimmed, non-empty); every
// other field is optional. topK is capped at 100 and maxSubResults at 20.
var requestSchema = z.object({
  q: z.string().trim().min(1),
  topK: z.number().int().positive().max(100).optional(),
  scope: z.string().optional(),
  pathPrefix: z.string().optional(),
  tags: z.array(z.string()).optional(),
  filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
  groupBy: z.enum(["page", "chunk"]).optional(),
  maxSubResults: z.number().int().positive().max(20).optional(),
  debug: z.boolean().optional(),
  rankingOverrides: rankingOverridesSchema
});
// 2000; presumably an upper bound on pages loaded when building the site
// structure tree — its use is outside this section.
var MAX_SITE_STRUCTURE_PAGES = 2e3;
|
|
18204
|
+
/**
 * Create an empty site-structure tree node.
 *
 * @param url   path the node represents
 * @param depth depth value to store on the node
 * @returns a node with blank title/routeFile, isIndexed false and no children
 */
function makeNode(url, depth) {
  const node = {
    url,
    title: "",
    depth,
    routeFile: "",
    isIndexed: false,
    childCount: 0,
    children: []
  };
  return node;
}
|
|
18207
|
+
/**
 * Build a URL-path tree from a flat list of pages.
 *
 * Every page contributes one node per path prefix ("/a", "/a/b", ...);
 * intermediate nodes that have no page of their own stay `isIndexed: false`.
 * Children are sorted by URL and each node's childCount is its number of
 * direct children.
 *
 * Nodes are stored and looked up under one canonical key,
 * "/" + segments.join("/"). Looking a page up under the raw
 * normalizeUrlPath() result instead would return undefined, and then throw,
 * whenever that result is not already in canonical form (for example a
 * trailing or doubled slash).
 *
 * @param pages      array of { url, title, routeFile }
 * @param pathPrefix optional path; when given, the subtree rooted at that
 *                   path is returned, or an empty node if none exists
 * @returns the root node, or the requested subtree root
 */
function buildTree(pages, pathPrefix) {
  const splitPath = (url) => url.split("/").filter(Boolean);
  const keyFor = (segments) => segments.length === 0 ? "/" : "/" + segments.join("/");
  const nodeMap = /* @__PURE__ */ new Map();
  const root2 = makeNode("/", 0);
  nodeMap.set("/", root2);
  for (const page of pages) {
    const segments = splitPath(normalizeUrlPath(page.url));
    // Make sure every ancestor path has a node.
    for (let i = 1; i <= segments.length; i++) {
      const partialUrl = keyFor(segments.slice(0, i));
      if (!nodeMap.has(partialUrl)) {
        nodeMap.set(partialUrl, makeNode(partialUrl, i));
      }
    }
    // With no segments the key is "/", i.e. the root node.
    const node = nodeMap.get(keyFor(segments));
    node.title = page.title;
    node.routeFile = page.routeFile;
    node.isIndexed = true;
  }
  // Attach each non-root node to its parent; fall back to the root if the
  // parent path is somehow missing.
  for (const [url, node] of nodeMap) {
    if (url === "/") continue;
    const parentUrl = keyFor(splitPath(url).slice(0, -1));
    const parent = nodeMap.get(parentUrl) ?? root2;
    parent.children.push(node);
  }
  const sortAndCount = (node) => {
    node.children.sort((a, b) => a.url.localeCompare(b.url));
    node.childCount = node.children.length;
    for (const child of node.children) {
      sortAndCount(child);
    }
  };
  sortAndCount(root2);
  if (pathPrefix) {
    const normalizedPrefix = normalizeUrlPath(pathPrefix);
    const prefixSegments = splitPath(normalizedPrefix);
    const subtreeRoot = nodeMap.get(keyFor(prefixSegments));
    if (subtreeRoot) {
      return subtreeRoot;
    }
    return makeNode(normalizedPrefix, prefixSegments.length);
  }
  return root2;
}
|
|
18256
|
+
/**
 * Produce a config copy with per-request overrides layered on top.
 *
 * `search`, `ranking` and `ranking.weights` are each shallow-merged
 * (override keys win); everything else is copied from `base`. Neither
 * argument is mutated.
 *
 * @param base      full config object with `search` and `ranking.weights`
 * @param overrides object with optional `search` / `ranking` sections
 * @returns a new config object
 */
function mergeRankingOverrides(base, overrides) {
  const search = { ...base.search, ...overrides.search };
  const weights = { ...base.ranking.weights, ...overrides.ranking?.weights };
  const ranking = { ...base.ranking, ...overrides.ranking, weights };
  return { ...base, search, ranking };
}
|
|
18273
|
+
var SearchEngine = class _SearchEngine {
|
|
18274
|
+
cwd;
|
|
18275
|
+
config;
|
|
18276
|
+
store;
|
|
18277
|
+
constructor(options) {
|
|
18278
|
+
this.cwd = options.cwd;
|
|
18279
|
+
this.config = options.config;
|
|
18280
|
+
this.store = options.store;
|
|
18281
|
+
}
|
|
18282
|
+
static async create(options = {}) {
|
|
18283
|
+
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
17633
18284
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
17634
18285
|
const store = options.store ?? await createUpstashStore(config);
|
|
17635
18286
|
return new _SearchEngine({
|
|
@@ -17648,125 +18299,203 @@ var SearchEngine = class _SearchEngine {
|
|
|
17648
18299
|
}
|
|
17649
18300
|
const input = parsed.data;
|
|
17650
18301
|
const totalStart = process.hrtime.bigint();
|
|
18302
|
+
const effectiveConfig = input.debug && input.rankingOverrides ? mergeRankingOverrides(this.config, input.rankingOverrides) : this.config;
|
|
17651
18303
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
17652
18304
|
const topK = input.topK ?? 10;
|
|
18305
|
+
const maxSubResults = input.maxSubResults ?? 5;
|
|
17653
18306
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17654
|
-
const
|
|
17655
|
-
const
|
|
17656
|
-
|
|
17657
|
-
|
|
17658
|
-
|
|
17659
|
-
|
|
17660
|
-
|
|
17661
|
-
|
|
17662
|
-
|
|
18307
|
+
const queryText = input.q;
|
|
18308
|
+
const pathPrefix = input.pathPrefix ? input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}` : void 0;
|
|
18309
|
+
const filterTags = input.tags && input.tags.length > 0 ? input.tags : void 0;
|
|
18310
|
+
const metaFilterStr = input.filters && Object.keys(input.filters).length > 0 ? buildMetaFilterString(input.filters) : "";
|
|
18311
|
+
const metaFilter = metaFilterStr || void 0;
|
|
18312
|
+
const applyPagePostFilters = (hits) => {
|
|
18313
|
+
let filtered = hits;
|
|
18314
|
+
if (pathPrefix) {
|
|
18315
|
+
filtered = filtered.filter((h) => h.url.startsWith(pathPrefix));
|
|
18316
|
+
}
|
|
18317
|
+
if (filterTags) {
|
|
18318
|
+
filtered = filtered.filter(
|
|
18319
|
+
(h) => filterTags.every((tag) => h.tags.includes(tag))
|
|
18320
|
+
);
|
|
17663
18321
|
}
|
|
17664
|
-
|
|
17665
|
-
|
|
17666
|
-
const
|
|
18322
|
+
return filtered;
|
|
18323
|
+
};
|
|
18324
|
+
const applyChunkPostFilters = (hits) => {
|
|
18325
|
+
let filtered = hits;
|
|
18326
|
+
if (filterTags) {
|
|
18327
|
+
filtered = filtered.filter(
|
|
18328
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18329
|
+
);
|
|
18330
|
+
}
|
|
18331
|
+
return filtered;
|
|
18332
|
+
};
|
|
17667
18333
|
const searchStart = process.hrtime.bigint();
|
|
17668
|
-
|
|
17669
|
-
|
|
17670
|
-
const
|
|
17671
|
-
const
|
|
17672
|
-
|
|
17673
|
-
|
|
17674
|
-
|
|
17675
|
-
|
|
17676
|
-
|
|
17677
|
-
|
|
17678
|
-
|
|
17679
|
-
|
|
17680
|
-
|
|
17681
|
-
|
|
17682
|
-
|
|
17683
|
-
|
|
17684
|
-
|
|
17685
|
-
{
|
|
17686
|
-
limit: chunkLimit,
|
|
17687
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17688
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17689
|
-
reranking: false,
|
|
17690
|
-
filter
|
|
17691
|
-
},
|
|
18334
|
+
if (groupByPage) {
|
|
18335
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
18336
|
+
const pageLimit = Math.max(topK * 2, 20);
|
|
18337
|
+
const pageHits = await this.store.searchPagesByText(
|
|
18338
|
+
queryText,
|
|
18339
|
+
{ limit: pageLimit * fetchMultiplier, filter: metaFilter },
|
|
18340
|
+
resolvedScope
|
|
18341
|
+
);
|
|
18342
|
+
const filteredPages = applyPagePostFilters(pageHits);
|
|
18343
|
+
let rankedPages = rankPageHits(filteredPages, effectiveConfig, input.q, input.debug);
|
|
18344
|
+
rankedPages = trimPagesByScoreGap(rankedPages, effectiveConfig);
|
|
18345
|
+
const topPages = rankedPages.slice(0, topK);
|
|
18346
|
+
const chunkPromises = topPages.map(
|
|
18347
|
+
(page) => this.store.searchChunksByUrl(
|
|
18348
|
+
queryText,
|
|
18349
|
+
page.url,
|
|
18350
|
+
{ limit: maxSubResults, filter: metaFilter },
|
|
17692
18351
|
resolvedScope
|
|
17693
|
-
)
|
|
17694
|
-
|
|
17695
|
-
const
|
|
17696
|
-
|
|
18352
|
+
).then((chunks) => applyChunkPostFilters(chunks))
|
|
18353
|
+
);
|
|
18354
|
+
const allChunks = await Promise.all(chunkPromises);
|
|
18355
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18356
|
+
const results = this.buildPageFirstResults(topPages, allChunks, input.q, input.debug, maxSubResults);
|
|
18357
|
+
return {
|
|
18358
|
+
q: input.q,
|
|
18359
|
+
scope: resolvedScope.scopeName,
|
|
18360
|
+
results,
|
|
18361
|
+
meta: {
|
|
18362
|
+
timingsMs: {
|
|
18363
|
+
search: Math.round(searchMs),
|
|
18364
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18365
|
+
}
|
|
18366
|
+
}
|
|
18367
|
+
};
|
|
17697
18368
|
} else {
|
|
18369
|
+
const candidateK = Math.max(50, topK);
|
|
18370
|
+
const fetchMultiplier = pathPrefix || filterTags ? 2 : 1;
|
|
17698
18371
|
const hits = await this.store.search(
|
|
17699
|
-
|
|
17700
|
-
{
|
|
17701
|
-
limit: candidateK,
|
|
17702
|
-
semanticWeight: this.config.search.semanticWeight,
|
|
17703
|
-
inputEnrichment: this.config.search.inputEnrichment,
|
|
17704
|
-
reranking: this.config.search.reranking,
|
|
17705
|
-
filter
|
|
17706
|
-
},
|
|
18372
|
+
queryText,
|
|
18373
|
+
{ limit: candidateK * fetchMultiplier, filter: metaFilter },
|
|
17707
18374
|
resolvedScope
|
|
17708
18375
|
);
|
|
17709
|
-
|
|
17710
|
-
|
|
17711
|
-
|
|
17712
|
-
|
|
17713
|
-
|
|
17714
|
-
|
|
17715
|
-
|
|
17716
|
-
|
|
17717
|
-
|
|
17718
|
-
|
|
17719
|
-
|
|
17720
|
-
|
|
18376
|
+
let filtered = hits;
|
|
18377
|
+
if (pathPrefix) {
|
|
18378
|
+
filtered = filtered.filter((h) => h.metadata.url.startsWith(pathPrefix));
|
|
18379
|
+
}
|
|
18380
|
+
if (filterTags) {
|
|
18381
|
+
filtered = filtered.filter(
|
|
18382
|
+
(h) => filterTags.every((tag) => h.metadata.tags.includes(tag))
|
|
18383
|
+
);
|
|
18384
|
+
}
|
|
18385
|
+
const ranked = rankHits(filtered, effectiveConfig, input.q, input.debug);
|
|
18386
|
+
const searchMs = hrTimeMs(searchStart);
|
|
18387
|
+
const results = this.buildResults(ranked, topK, false, maxSubResults, input.q, input.debug, effectiveConfig);
|
|
18388
|
+
return {
|
|
18389
|
+
q: input.q,
|
|
18390
|
+
scope: resolvedScope.scopeName,
|
|
18391
|
+
results,
|
|
18392
|
+
meta: {
|
|
18393
|
+
timingsMs: {
|
|
18394
|
+
search: Math.round(searchMs),
|
|
18395
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18396
|
+
}
|
|
17721
18397
|
}
|
|
18398
|
+
};
|
|
18399
|
+
}
|
|
18400
|
+
}
|
|
18401
|
+
buildPageFirstResults(rankedPages, allChunks, query, debug, maxSubResults = 5) {
|
|
18402
|
+
return rankedPages.map((page, i) => {
|
|
18403
|
+
const chunks = allChunks[i] ?? [];
|
|
18404
|
+
const bestChunk = chunks[0];
|
|
18405
|
+
const snippet = bestChunk ? query ? queryAwareExcerpt(bestChunk.metadata.chunkText, query) : toSnippet(bestChunk.metadata.chunkText) : page.description || page.title;
|
|
18406
|
+
const result = {
|
|
18407
|
+
url: page.url,
|
|
18408
|
+
title: page.title,
|
|
18409
|
+
sectionTitle: bestChunk?.metadata.sectionTitle || void 0,
|
|
18410
|
+
snippet,
|
|
18411
|
+
chunkText: bestChunk?.metadata.chunkText || void 0,
|
|
18412
|
+
score: Number(page.finalScore.toFixed(6)),
|
|
18413
|
+
routeFile: page.routeFile,
|
|
18414
|
+
chunks: chunks.length > 0 ? chunks.slice(0, maxSubResults).map((c) => ({
|
|
18415
|
+
sectionTitle: c.metadata.sectionTitle || void 0,
|
|
18416
|
+
snippet: query ? queryAwareExcerpt(c.metadata.chunkText, query) : toSnippet(c.metadata.chunkText),
|
|
18417
|
+
chunkText: c.metadata.chunkText || void 0,
|
|
18418
|
+
headingPath: c.metadata.headingPath,
|
|
18419
|
+
score: Number(c.score.toFixed(6))
|
|
18420
|
+
})) : void 0
|
|
18421
|
+
};
|
|
18422
|
+
if (debug && page.breakdown) {
|
|
18423
|
+
result.breakdown = {
|
|
18424
|
+
baseScore: page.breakdown.baseScore,
|
|
18425
|
+
incomingLinkBoost: page.breakdown.incomingLinkBoost,
|
|
18426
|
+
depthBoost: page.breakdown.depthBoost,
|
|
18427
|
+
titleMatchBoost: page.breakdown.titleMatchBoost,
|
|
18428
|
+
freshnessBoost: page.breakdown.freshnessBoost,
|
|
18429
|
+
anchorTextMatchBoost: 0
|
|
18430
|
+
};
|
|
17722
18431
|
}
|
|
17723
|
-
|
|
18432
|
+
return result;
|
|
18433
|
+
});
|
|
17724
18434
|
}
|
|
17725
|
-
ensureSnippet(hit) {
|
|
18435
|
+
ensureSnippet(hit, query) {
|
|
18436
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
18437
|
+
if (query && chunkText) return queryAwareExcerpt(chunkText, query);
|
|
17726
18438
|
const snippet = hit.hit.metadata.snippet;
|
|
17727
18439
|
if (snippet && snippet.length >= 30) return snippet;
|
|
17728
|
-
const chunkText = hit.hit.metadata.chunkText;
|
|
17729
18440
|
if (chunkText) return toSnippet(chunkText);
|
|
17730
18441
|
return snippet || "";
|
|
17731
18442
|
}
|
|
17732
|
-
buildResults(ordered, topK, groupByPage,
|
|
18443
|
+
buildResults(ordered, topK, groupByPage, maxSubResults, query, debug, config) {
|
|
18444
|
+
const cfg = config ?? this.config;
|
|
17733
18445
|
if (groupByPage) {
|
|
17734
|
-
let pages = aggregateByPage(ordered,
|
|
17735
|
-
pages = trimByScoreGap(pages,
|
|
17736
|
-
const minRatio =
|
|
18446
|
+
let pages = aggregateByPage(ordered, cfg);
|
|
18447
|
+
pages = trimByScoreGap(pages, cfg);
|
|
18448
|
+
const minRatio = cfg.ranking.minChunkScoreRatio;
|
|
17737
18449
|
return pages.slice(0, topK).map((page) => {
|
|
17738
18450
|
const bestScore = page.bestChunk.finalScore;
|
|
17739
18451
|
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
17740
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0,
|
|
17741
|
-
|
|
18452
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, maxSubResults);
|
|
18453
|
+
const result = {
|
|
17742
18454
|
url: page.url,
|
|
17743
18455
|
title: page.title,
|
|
17744
18456
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
17745
|
-
snippet: this.ensureSnippet(page.bestChunk),
|
|
18457
|
+
snippet: this.ensureSnippet(page.bestChunk, query),
|
|
18458
|
+
chunkText: page.bestChunk.hit.metadata.chunkText || void 0,
|
|
17746
18459
|
score: Number(page.pageScore.toFixed(6)),
|
|
17747
18460
|
routeFile: page.routeFile,
|
|
17748
|
-
chunks: meaningful.length
|
|
18461
|
+
chunks: meaningful.length >= 1 ? meaningful.map((c) => ({
|
|
17749
18462
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
17750
|
-
snippet: this.ensureSnippet(c),
|
|
18463
|
+
snippet: this.ensureSnippet(c, query),
|
|
18464
|
+
chunkText: c.hit.metadata.chunkText || void 0,
|
|
17751
18465
|
headingPath: c.hit.metadata.headingPath,
|
|
17752
18466
|
score: Number(c.finalScore.toFixed(6))
|
|
17753
18467
|
})) : void 0
|
|
17754
18468
|
};
|
|
18469
|
+
if (debug && page.bestChunk.breakdown) {
|
|
18470
|
+
result.breakdown = page.bestChunk.breakdown;
|
|
18471
|
+
}
|
|
18472
|
+
return result;
|
|
17755
18473
|
});
|
|
17756
18474
|
} else {
|
|
17757
18475
|
let filtered = ordered;
|
|
17758
|
-
const
|
|
17759
|
-
if (
|
|
17760
|
-
|
|
17761
|
-
|
|
17762
|
-
|
|
17763
|
-
|
|
17764
|
-
|
|
17765
|
-
|
|
17766
|
-
|
|
17767
|
-
|
|
17768
|
-
|
|
17769
|
-
|
|
18476
|
+
const minScoreRatio = cfg.ranking.minScoreRatio;
|
|
18477
|
+
if (minScoreRatio > 0 && ordered.length > 0) {
|
|
18478
|
+
const topScore = ordered[0].finalScore;
|
|
18479
|
+
if (Number.isFinite(topScore) && topScore > 0) {
|
|
18480
|
+
const threshold = topScore * minScoreRatio;
|
|
18481
|
+
filtered = ordered.filter((entry) => entry.finalScore >= threshold);
|
|
18482
|
+
}
|
|
18483
|
+
}
|
|
18484
|
+
return filtered.slice(0, topK).map(({ hit, finalScore, breakdown }) => {
|
|
18485
|
+
const result = {
|
|
18486
|
+
url: hit.metadata.url,
|
|
18487
|
+
title: hit.metadata.title,
|
|
18488
|
+
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
18489
|
+
snippet: this.ensureSnippet({ hit, finalScore }, query),
|
|
18490
|
+
chunkText: hit.metadata.chunkText || void 0,
|
|
18491
|
+
score: Number(finalScore.toFixed(6)),
|
|
18492
|
+
routeFile: hit.metadata.routeFile
|
|
18493
|
+
};
|
|
18494
|
+
if (debug && breakdown) {
|
|
18495
|
+
result.breakdown = breakdown;
|
|
18496
|
+
}
|
|
18497
|
+
return result;
|
|
18498
|
+
});
|
|
17770
18499
|
}
|
|
17771
18500
|
}
|
|
17772
18501
|
async getPage(pathOrUrl, scope) {
|
|
@@ -17792,6 +18521,116 @@ var SearchEngine = class _SearchEngine {
|
|
|
17792
18521
|
markdown: page.markdown
|
|
17793
18522
|
};
|
|
17794
18523
|
}
|
|
18524
|
+
async listPages(opts) {
|
|
18525
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18526
|
+
const pathPrefix = opts?.pathPrefix ? opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}` : void 0;
|
|
18527
|
+
return this.store.listPages(resolvedScope, {
|
|
18528
|
+
cursor: opts?.cursor,
|
|
18529
|
+
limit: opts?.limit,
|
|
18530
|
+
pathPrefix
|
|
18531
|
+
});
|
|
18532
|
+
}
|
|
18533
|
+
async getSiteStructure(opts) {
|
|
18534
|
+
const maxPages = Math.min(opts?.maxPages ?? MAX_SITE_STRUCTURE_PAGES, MAX_SITE_STRUCTURE_PAGES);
|
|
18535
|
+
const allPages = [];
|
|
18536
|
+
let cursor;
|
|
18537
|
+
let truncated = false;
|
|
18538
|
+
do {
|
|
18539
|
+
const result = await this.listPages({
|
|
18540
|
+
pathPrefix: opts?.pathPrefix,
|
|
18541
|
+
scope: opts?.scope,
|
|
18542
|
+
cursor,
|
|
18543
|
+
limit: 200
|
|
18544
|
+
});
|
|
18545
|
+
allPages.push(...result.pages);
|
|
18546
|
+
cursor = result.nextCursor;
|
|
18547
|
+
if (allPages.length >= maxPages) {
|
|
18548
|
+
truncated = allPages.length > maxPages || !!cursor;
|
|
18549
|
+
allPages.length = maxPages;
|
|
18550
|
+
break;
|
|
18551
|
+
}
|
|
18552
|
+
} while (cursor);
|
|
18553
|
+
const root2 = buildTree(allPages, opts?.pathPrefix);
|
|
18554
|
+
return {
|
|
18555
|
+
root: root2,
|
|
18556
|
+
totalPages: allPages.length,
|
|
18557
|
+
truncated
|
|
18558
|
+
};
|
|
18559
|
+
}
|
|
18560
|
+
async getRelatedPages(pathOrUrl, opts) {
|
|
18561
|
+
const resolvedScope = resolveScope(this.config, opts?.scope);
|
|
18562
|
+
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
18563
|
+
const topK = Math.min(opts?.topK ?? 10, 25);
|
|
18564
|
+
const source = await this.store.fetchPageWithVector(urlPath, resolvedScope);
|
|
18565
|
+
if (!source) {
|
|
18566
|
+
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
18567
|
+
}
|
|
18568
|
+
const sourceOutgoing = new Set(source.metadata.outgoingLinkUrls ?? []);
|
|
18569
|
+
const semanticHits = await this.store.searchPagesByVector(
|
|
18570
|
+
source.vector,
|
|
18571
|
+
{ limit: 50 },
|
|
18572
|
+
resolvedScope
|
|
18573
|
+
);
|
|
18574
|
+
const filteredHits = semanticHits.filter((h) => h.url !== urlPath);
|
|
18575
|
+
const semanticScoreMap = /* @__PURE__ */ new Map();
|
|
18576
|
+
for (const hit of filteredHits) {
|
|
18577
|
+
semanticScoreMap.set(hit.url, hit.score);
|
|
18578
|
+
}
|
|
18579
|
+
const candidateUrls = /* @__PURE__ */ new Set();
|
|
18580
|
+
for (const hit of filteredHits) {
|
|
18581
|
+
candidateUrls.add(hit.url);
|
|
18582
|
+
}
|
|
18583
|
+
for (const url of sourceOutgoing) {
|
|
18584
|
+
if (url !== urlPath) candidateUrls.add(url);
|
|
18585
|
+
}
|
|
18586
|
+
const missingUrls = [...sourceOutgoing].filter(
|
|
18587
|
+
(u) => u !== urlPath && !semanticScoreMap.has(u)
|
|
18588
|
+
);
|
|
18589
|
+
const fetchedPages = missingUrls.length > 0 ? await this.store.fetchPagesBatch(missingUrls, resolvedScope) : [];
|
|
18590
|
+
const metaMap = /* @__PURE__ */ new Map();
|
|
18591
|
+
for (const hit of filteredHits) {
|
|
18592
|
+
metaMap.set(hit.url, { title: hit.title, routeFile: hit.routeFile, outgoingLinkUrls: [] });
|
|
18593
|
+
}
|
|
18594
|
+
for (const p of fetchedPages) {
|
|
18595
|
+
metaMap.set(p.url, { title: p.title, routeFile: p.routeFile, outgoingLinkUrls: p.outgoingLinkUrls });
|
|
18596
|
+
}
|
|
18597
|
+
const semanticUrls = filteredHits.map((h) => h.url);
|
|
18598
|
+
if (semanticUrls.length > 0) {
|
|
18599
|
+
const semanticPageData = await this.store.fetchPagesBatch(semanticUrls, resolvedScope);
|
|
18600
|
+
for (const p of semanticPageData) {
|
|
18601
|
+
const existing = metaMap.get(p.url);
|
|
18602
|
+
if (existing) {
|
|
18603
|
+
existing.outgoingLinkUrls = p.outgoingLinkUrls;
|
|
18604
|
+
}
|
|
18605
|
+
}
|
|
18606
|
+
}
|
|
18607
|
+
const candidates = [];
|
|
18608
|
+
for (const url of candidateUrls) {
|
|
18609
|
+
const meta = metaMap.get(url);
|
|
18610
|
+
if (!meta) continue;
|
|
18611
|
+
const isOutgoing = sourceOutgoing.has(url);
|
|
18612
|
+
const isIncoming = meta.outgoingLinkUrls.includes(urlPath);
|
|
18613
|
+
const isLinked = isOutgoing || isIncoming;
|
|
18614
|
+
const dice = diceScore(urlPath, url);
|
|
18615
|
+
const semantic = semanticScoreMap.get(url) ?? 0;
|
|
18616
|
+
const score = compositeScore(isLinked, dice, semantic);
|
|
18617
|
+
const relationshipType = dominantRelationshipType(isOutgoing, isIncoming, dice);
|
|
18618
|
+
candidates.push({
|
|
18619
|
+
url,
|
|
18620
|
+
title: meta.title,
|
|
18621
|
+
score: Number(score.toFixed(6)),
|
|
18622
|
+
relationshipType,
|
|
18623
|
+
routeFile: meta.routeFile
|
|
18624
|
+
});
|
|
18625
|
+
}
|
|
18626
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
18627
|
+
const results = candidates.slice(0, topK);
|
|
18628
|
+
return {
|
|
18629
|
+
sourceUrl: urlPath,
|
|
18630
|
+
scope: resolvedScope.scopeName,
|
|
18631
|
+
relatedPages: results
|
|
18632
|
+
};
|
|
18633
|
+
}
|
|
17795
18634
|
async health() {
|
|
17796
18635
|
return this.store.health();
|
|
17797
18636
|
}
|
|
@@ -17807,6 +18646,215 @@ var SearchEngine = class _SearchEngine {
|
|
|
17807
18646
|
}
|
|
17808
18647
|
};
|
|
17809
18648
|
|
|
18649
|
+
// src/mcp/server.ts
|
|
18650
|
+
function createServer(engine) {
|
|
18651
|
+
const server = new McpServer({
|
|
18652
|
+
name: "searchsocket-mcp",
|
|
18653
|
+
version: "0.1.0"
|
|
18654
|
+
});
|
|
18655
|
+
server.registerTool(
|
|
18656
|
+
"search",
|
|
18657
|
+
{
|
|
18658
|
+
description: `Semantic site search powered by Upstash Search. Returns url, title, snippet, chunkText, score, and routeFile per result. chunkText contains the full raw chunk markdown. When groupBy is 'page' (default), each result includes a chunks array with section-level sub-results containing sectionTitle, headingPath, snippet, and score. Supports optional filters for structured metadata (e.g. {"version": 2, "deprecated": false}).`,
|
|
18659
|
+
inputSchema: {
|
|
18660
|
+
query: z.string().min(1),
|
|
18661
|
+
scope: z.string().optional(),
|
|
18662
|
+
topK: z.number().int().positive().max(100).optional(),
|
|
18663
|
+
pathPrefix: z.string().optional(),
|
|
18664
|
+
tags: z.array(z.string()).optional(),
|
|
18665
|
+
filters: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional(),
|
|
18666
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
18667
|
+
maxSubResults: z.number().int().positive().max(20).optional()
|
|
18668
|
+
},
|
|
18669
|
+
outputSchema: {
|
|
18670
|
+
q: z.string(),
|
|
18671
|
+
scope: z.string(),
|
|
18672
|
+
results: z.array(z.object({
|
|
18673
|
+
url: z.string(),
|
|
18674
|
+
title: z.string(),
|
|
18675
|
+
sectionTitle: z.string().optional(),
|
|
18676
|
+
snippet: z.string(),
|
|
18677
|
+
score: z.number(),
|
|
18678
|
+
routeFile: z.string(),
|
|
18679
|
+
chunks: z.array(z.object({
|
|
18680
|
+
sectionTitle: z.string().optional(),
|
|
18681
|
+
snippet: z.string(),
|
|
18682
|
+
headingPath: z.array(z.string()),
|
|
18683
|
+
score: z.number()
|
|
18684
|
+
})).optional()
|
|
18685
|
+
})),
|
|
18686
|
+
meta: z.object({
|
|
18687
|
+
timingsMs: z.object({
|
|
18688
|
+
search: z.number(),
|
|
18689
|
+
total: z.number()
|
|
18690
|
+
})
|
|
18691
|
+
})
|
|
18692
|
+
}
|
|
18693
|
+
},
|
|
18694
|
+
async (input) => {
|
|
18695
|
+
const result = await engine.search({
|
|
18696
|
+
q: input.query,
|
|
18697
|
+
topK: input.topK,
|
|
18698
|
+
scope: input.scope,
|
|
18699
|
+
pathPrefix: input.pathPrefix,
|
|
18700
|
+
tags: input.tags,
|
|
18701
|
+
filters: input.filters,
|
|
18702
|
+
groupBy: input.groupBy,
|
|
18703
|
+
maxSubResults: input.maxSubResults
|
|
18704
|
+
});
|
|
18705
|
+
return {
|
|
18706
|
+
content: [
|
|
18707
|
+
{
|
|
18708
|
+
type: "text",
|
|
18709
|
+
text: JSON.stringify(result, null, 2)
|
|
18710
|
+
}
|
|
18711
|
+
],
|
|
18712
|
+
structuredContent: result
|
|
18713
|
+
};
|
|
18714
|
+
}
|
|
18715
|
+
);
|
|
18716
|
+
server.registerTool(
|
|
18717
|
+
"get_page",
|
|
18718
|
+
{
|
|
18719
|
+
description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
|
|
18720
|
+
inputSchema: {
|
|
18721
|
+
pathOrUrl: z.string().min(1),
|
|
18722
|
+
scope: z.string().optional()
|
|
18723
|
+
}
|
|
18724
|
+
},
|
|
18725
|
+
async (input) => {
|
|
18726
|
+
const page = await engine.getPage(input.pathOrUrl, input.scope);
|
|
18727
|
+
return {
|
|
18728
|
+
content: [
|
|
18729
|
+
{
|
|
18730
|
+
type: "text",
|
|
18731
|
+
text: JSON.stringify(page, null, 2)
|
|
18732
|
+
}
|
|
18733
|
+
]
|
|
18734
|
+
};
|
|
18735
|
+
}
|
|
18736
|
+
);
|
|
18737
|
+
server.registerTool(
|
|
18738
|
+
"list_pages",
|
|
18739
|
+
{
|
|
18740
|
+
description: "List indexed pages with optional path prefix filtering and cursor-based pagination. Returns url, title, description, and routeFile for each page. Use nextCursor to fetch subsequent pages.",
|
|
18741
|
+
inputSchema: {
|
|
18742
|
+
pathPrefix: z.string().optional(),
|
|
18743
|
+
cursor: z.string().optional(),
|
|
18744
|
+
limit: z.number().int().positive().max(200).optional(),
|
|
18745
|
+
scope: z.string().optional()
|
|
18746
|
+
}
|
|
18747
|
+
},
|
|
18748
|
+
async (input) => {
|
|
18749
|
+
const result = await engine.listPages({
|
|
18750
|
+
pathPrefix: input.pathPrefix,
|
|
18751
|
+
cursor: input.cursor,
|
|
18752
|
+
limit: input.limit,
|
|
18753
|
+
scope: input.scope
|
|
18754
|
+
});
|
|
18755
|
+
return {
|
|
18756
|
+
content: [
|
|
18757
|
+
{
|
|
18758
|
+
type: "text",
|
|
18759
|
+
text: JSON.stringify(result, null, 2)
|
|
18760
|
+
}
|
|
18761
|
+
]
|
|
18762
|
+
};
|
|
18763
|
+
}
|
|
18764
|
+
);
|
|
18765
|
+
server.registerTool(
|
|
18766
|
+
"get_site_structure",
|
|
18767
|
+
{
|
|
18768
|
+
description: "Returns the hierarchical page tree derived from URL paths. Use this to understand site navigation structure, find where pages belong, or scope further operations to a section. Nodes with isIndexed: false are implicit structural parents not directly in the index. Large sites (>2000 pages) return truncated: true.",
|
|
18769
|
+
inputSchema: {
|
|
18770
|
+
pathPrefix: z.string().optional(),
|
|
18771
|
+
scope: z.string().optional(),
|
|
18772
|
+
maxPages: z.number().int().positive().max(2e3).optional()
|
|
18773
|
+
}
|
|
18774
|
+
},
|
|
18775
|
+
async (input) => {
|
|
18776
|
+
const result = await engine.getSiteStructure({
|
|
18777
|
+
pathPrefix: input.pathPrefix,
|
|
18778
|
+
scope: input.scope,
|
|
18779
|
+
maxPages: input.maxPages
|
|
18780
|
+
});
|
|
18781
|
+
return {
|
|
18782
|
+
content: [
|
|
18783
|
+
{
|
|
18784
|
+
type: "text",
|
|
18785
|
+
text: JSON.stringify(result, null, 2)
|
|
18786
|
+
}
|
|
18787
|
+
]
|
|
18788
|
+
};
|
|
18789
|
+
}
|
|
18790
|
+
);
|
|
18791
|
+
server.registerTool(
|
|
18792
|
+
"find_source_file",
|
|
18793
|
+
{
|
|
18794
|
+
description: "Find the SvelteKit source file for a piece of site content. Use this when you need to locate and edit content on the site. Returns the URL, route file path, section title, and a content snippet.",
|
|
18795
|
+
inputSchema: {
|
|
18796
|
+
query: z.string().min(1),
|
|
18797
|
+
scope: z.string().optional()
|
|
18798
|
+
}
|
|
18799
|
+
},
|
|
18800
|
+
async (input) => {
|
|
18801
|
+
const result = await engine.search({
|
|
18802
|
+
q: input.query,
|
|
18803
|
+
topK: 1,
|
|
18804
|
+
scope: input.scope
|
|
18805
|
+
});
|
|
18806
|
+
if (result.results.length === 0) {
|
|
18807
|
+
return {
|
|
18808
|
+
content: [
|
|
18809
|
+
{
|
|
18810
|
+
type: "text",
|
|
18811
|
+
text: JSON.stringify({
|
|
18812
|
+
error: "No matching content found for the given query."
|
|
18813
|
+
})
|
|
18814
|
+
}
|
|
18815
|
+
]
|
|
18816
|
+
};
|
|
18817
|
+
}
|
|
18818
|
+
const match = result.results[0];
|
|
18819
|
+
const { url, routeFile, sectionTitle, snippet } = match;
|
|
18820
|
+
return {
|
|
18821
|
+
content: [
|
|
18822
|
+
{
|
|
18823
|
+
type: "text",
|
|
18824
|
+
text: JSON.stringify({ url, routeFile, sectionTitle, snippet })
|
|
18825
|
+
}
|
|
18826
|
+
]
|
|
18827
|
+
};
|
|
18828
|
+
}
|
|
18829
|
+
);
|
|
18830
|
+
server.registerTool(
|
|
18831
|
+
"get_related_pages",
|
|
18832
|
+
{
|
|
18833
|
+
description: "Find pages related to a given URL using link graph, semantic similarity, and structural proximity. Returns related pages ranked by a composite relatedness score. Use this to discover content connected to a known page.",
|
|
18834
|
+
inputSchema: {
|
|
18835
|
+
pathOrUrl: z.string().min(1),
|
|
18836
|
+
scope: z.string().optional(),
|
|
18837
|
+
topK: z.number().int().positive().max(25).optional()
|
|
18838
|
+
}
|
|
18839
|
+
},
|
|
18840
|
+
async (input) => {
|
|
18841
|
+
const result = await engine.getRelatedPages(input.pathOrUrl, {
|
|
18842
|
+
topK: input.topK,
|
|
18843
|
+
scope: input.scope
|
|
18844
|
+
});
|
|
18845
|
+
return {
|
|
18846
|
+
content: [
|
|
18847
|
+
{
|
|
18848
|
+
type: "text",
|
|
18849
|
+
text: JSON.stringify(result, null, 2)
|
|
18850
|
+
}
|
|
18851
|
+
]
|
|
18852
|
+
};
|
|
18853
|
+
}
|
|
18854
|
+
);
|
|
18855
|
+
return server;
|
|
18856
|
+
}
|
|
18857
|
+
|
|
17810
18858
|
// src/sveltekit/handle.ts
|
|
17811
18859
|
var InMemoryRateLimiter = class {
|
|
17812
18860
|
constructor(windowMs, max) {
|
|
@@ -17835,7 +18883,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17835
18883
|
let enginePromise = null;
|
|
17836
18884
|
let configPromise = null;
|
|
17837
18885
|
let apiPath = options.path;
|
|
18886
|
+
let llmsServePath = null;
|
|
18887
|
+
let serveMarkdownVariants = false;
|
|
18888
|
+
let mcpPath;
|
|
18889
|
+
let mcpApiKey;
|
|
18890
|
+
let mcpEnableJsonResponse = true;
|
|
17838
18891
|
let rateLimiter = null;
|
|
18892
|
+
let notConfigured = false;
|
|
17839
18893
|
const getConfig = async () => {
|
|
17840
18894
|
if (!configPromise) {
|
|
17841
18895
|
let configP;
|
|
@@ -17852,6 +18906,13 @@ function searchsocketHandle(options = {}) {
|
|
|
17852
18906
|
}
|
|
17853
18907
|
configPromise = configP.then((config) => {
|
|
17854
18908
|
apiPath = apiPath ?? config.api.path;
|
|
18909
|
+
mcpPath = config.mcp.handle.path;
|
|
18910
|
+
mcpApiKey = config.mcp.handle.apiKey;
|
|
18911
|
+
mcpEnableJsonResponse = config.mcp.handle.enableJsonResponse;
|
|
18912
|
+
if (config.llmsTxt.enable) {
|
|
18913
|
+
llmsServePath = "/" + config.llmsTxt.outputPath.replace(/^static\//, "");
|
|
18914
|
+
serveMarkdownVariants = config.llmsTxt.serveMarkdownVariants;
|
|
18915
|
+
}
|
|
17855
18916
|
if (config.api.rateLimit && !isServerless()) {
|
|
17856
18917
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
17857
18918
|
}
|
|
@@ -17861,59 +18922,109 @@ function searchsocketHandle(options = {}) {
|
|
|
17861
18922
|
return configPromise;
|
|
17862
18923
|
};
|
|
17863
18924
|
const getEngine = async () => {
|
|
18925
|
+
if (notConfigured) {
|
|
18926
|
+
throw new SearchSocketError(
|
|
18927
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18928
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18929
|
+
503
|
|
18930
|
+
);
|
|
18931
|
+
}
|
|
17864
18932
|
if (!enginePromise) {
|
|
17865
18933
|
const config = await getConfig();
|
|
17866
18934
|
enginePromise = SearchEngine.create({
|
|
17867
18935
|
cwd: options.cwd,
|
|
17868
18936
|
config
|
|
18937
|
+
}).catch((error) => {
|
|
18938
|
+
enginePromise = null;
|
|
18939
|
+
if (error instanceof SearchSocketError && error.code === "VECTOR_BACKEND_UNAVAILABLE") {
|
|
18940
|
+
notConfigured = true;
|
|
18941
|
+
throw new SearchSocketError(
|
|
18942
|
+
"SEARCH_NOT_CONFIGURED",
|
|
18943
|
+
"Search is not configured. Set the required Upstash environment variables to enable search.",
|
|
18944
|
+
503
|
|
18945
|
+
);
|
|
18946
|
+
}
|
|
18947
|
+
throw error;
|
|
17869
18948
|
});
|
|
17870
18949
|
}
|
|
17871
18950
|
return enginePromise;
|
|
17872
18951
|
};
|
|
17873
18952
|
const bodyLimit = options.maxBodyBytes ?? 64 * 1024;
|
|
17874
18953
|
return async ({ event, resolve }) => {
|
|
17875
|
-
if (apiPath && event.url.pathname !==
|
|
17876
|
-
|
|
18954
|
+
if (apiPath && !isApiPath(event.url.pathname, apiPath) && event.url.pathname !== llmsServePath) {
|
|
18955
|
+
const isMarkdownVariant = event.request.method === "GET" && event.url.pathname.endsWith(".md");
|
|
18956
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18957
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18958
|
+
}
|
|
18959
|
+
if (mcpPath) {
|
|
18960
|
+
if (serveMarkdownVariants && isMarkdownVariant) ; else {
|
|
18961
|
+
return resolve(event);
|
|
18962
|
+
}
|
|
18963
|
+
} else {
|
|
18964
|
+
if (configPromise || options.config || options.rawConfig) {
|
|
18965
|
+
await getConfig();
|
|
18966
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
18967
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
18968
|
+
}
|
|
18969
|
+
if (!(serveMarkdownVariants && isMarkdownVariant)) {
|
|
18970
|
+
return resolve(event);
|
|
18971
|
+
}
|
|
18972
|
+
} else {
|
|
18973
|
+
return resolve(event);
|
|
18974
|
+
}
|
|
18975
|
+
}
|
|
17877
18976
|
}
|
|
17878
18977
|
const config = await getConfig();
|
|
18978
|
+
if (llmsServePath && event.request.method === "GET" && event.url.pathname === llmsServePath) {
|
|
18979
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18980
|
+
const filePath = path.resolve(cwd, config.llmsTxt.outputPath);
|
|
18981
|
+
try {
|
|
18982
|
+
const content = await fs9.readFile(filePath, "utf8");
|
|
18983
|
+
return new Response(content, {
|
|
18984
|
+
status: 200,
|
|
18985
|
+
headers: { "content-type": "text/plain; charset=utf-8" }
|
|
18986
|
+
});
|
|
18987
|
+
} catch {
|
|
18988
|
+
return resolve(event);
|
|
18989
|
+
}
|
|
18990
|
+
}
|
|
18991
|
+
if (serveMarkdownVariants && event.request.method === "GET" && event.url.pathname.endsWith(".md")) {
|
|
18992
|
+
let rawPath;
|
|
18993
|
+
try {
|
|
18994
|
+
rawPath = decodeURIComponent(event.url.pathname.slice(0, -3));
|
|
18995
|
+
} catch {
|
|
18996
|
+
return resolve(event);
|
|
18997
|
+
}
|
|
18998
|
+
const scope = event.url.searchParams?.get("scope") ?? void 0;
|
|
18999
|
+
try {
|
|
19000
|
+
const engine = await getEngine();
|
|
19001
|
+
const page = await engine.getPage(rawPath, scope);
|
|
19002
|
+
return new Response(page.markdown, {
|
|
19003
|
+
status: 200,
|
|
19004
|
+
headers: { "content-type": "text/markdown; charset=utf-8" }
|
|
19005
|
+
});
|
|
19006
|
+
} catch (error) {
|
|
19007
|
+
if (error instanceof SearchSocketError && error.status === 404) {
|
|
19008
|
+
return resolve(event);
|
|
19009
|
+
}
|
|
19010
|
+
throw error;
|
|
19011
|
+
}
|
|
19012
|
+
}
|
|
19013
|
+
if (mcpPath && event.url.pathname === mcpPath) {
|
|
19014
|
+
return handleMcpRequest(event, mcpApiKey, mcpEnableJsonResponse, getEngine);
|
|
19015
|
+
}
|
|
17879
19016
|
const targetPath = apiPath ?? config.api.path;
|
|
17880
|
-
if (event.url.pathname
|
|
19017
|
+
if (!isApiPath(event.url.pathname, targetPath)) {
|
|
17881
19018
|
return resolve(event);
|
|
17882
19019
|
}
|
|
17883
|
-
|
|
19020
|
+
const subPath = event.url.pathname.slice(targetPath.length);
|
|
19021
|
+
const method = event.request.method;
|
|
19022
|
+
if (method === "OPTIONS") {
|
|
17884
19023
|
return new Response(null, {
|
|
17885
19024
|
status: 204,
|
|
17886
19025
|
headers: buildCorsHeaders(event.request, config)
|
|
17887
19026
|
});
|
|
17888
19027
|
}
|
|
17889
|
-
if (event.request.method !== "POST") {
|
|
17890
|
-
return withCors(
|
|
17891
|
-
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
17892
|
-
status: 405,
|
|
17893
|
-
headers: {
|
|
17894
|
-
"content-type": "application/json"
|
|
17895
|
-
}
|
|
17896
|
-
}),
|
|
17897
|
-
event.request,
|
|
17898
|
-
config
|
|
17899
|
-
);
|
|
17900
|
-
}
|
|
17901
|
-
const contentLength = Number(event.request.headers.get("content-length") ?? 0);
|
|
17902
|
-
if (contentLength > bodyLimit) {
|
|
17903
|
-
return withCors(
|
|
17904
|
-
new Response(
|
|
17905
|
-
JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Request body too large", 413))),
|
|
17906
|
-
{
|
|
17907
|
-
status: 413,
|
|
17908
|
-
headers: {
|
|
17909
|
-
"content-type": "application/json"
|
|
17910
|
-
}
|
|
17911
|
-
}
|
|
17912
|
-
),
|
|
17913
|
-
event.request,
|
|
17914
|
-
config
|
|
17915
|
-
);
|
|
17916
|
-
}
|
|
17917
19028
|
if (rateLimiter) {
|
|
17918
19029
|
const ip = event.getClientAddress?.() ?? event.request.headers.get("x-forwarded-for")?.split(",")[0]?.trim() ?? "unknown";
|
|
17919
19030
|
if (!rateLimiter.check(ip)) {
|
|
@@ -17933,39 +19044,32 @@ function searchsocketHandle(options = {}) {
|
|
|
17933
19044
|
}
|
|
17934
19045
|
}
|
|
17935
19046
|
try {
|
|
17936
|
-
|
|
17937
|
-
|
|
17938
|
-
|
|
17939
|
-
} else {
|
|
17940
|
-
let parsedFallback;
|
|
17941
|
-
try {
|
|
17942
|
-
parsedFallback = await event.request.json();
|
|
17943
|
-
} catch (error) {
|
|
17944
|
-
if (error instanceof SyntaxError) {
|
|
17945
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
17946
|
-
}
|
|
17947
|
-
throw error;
|
|
19047
|
+
if (method === "GET") {
|
|
19048
|
+
if (subPath === "" || subPath === "/") {
|
|
19049
|
+
return await handleGetSearch(event, config, getEngine);
|
|
17948
19050
|
}
|
|
17949
|
-
|
|
17950
|
-
|
|
17951
|
-
|
|
17952
|
-
|
|
19051
|
+
if (subPath === "/health") {
|
|
19052
|
+
return await handleGetHealth(event, config, getEngine);
|
|
19053
|
+
}
|
|
19054
|
+
if (subPath.startsWith("/pages/")) {
|
|
19055
|
+
return await handleGetPage(event, config, getEngine, subPath);
|
|
19056
|
+
}
|
|
19057
|
+
return withCors(
|
|
19058
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Not found", 404))), {
|
|
19059
|
+
status: 404,
|
|
19060
|
+
headers: { "content-type": "application/json" }
|
|
19061
|
+
}),
|
|
19062
|
+
event.request,
|
|
19063
|
+
config
|
|
19064
|
+
);
|
|
17953
19065
|
}
|
|
17954
|
-
|
|
17955
|
-
|
|
17956
|
-
body = JSON.parse(rawBody);
|
|
17957
|
-
} catch {
|
|
17958
|
-
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
19066
|
+
if (method === "POST" && (subPath === "" || subPath === "/")) {
|
|
19067
|
+
return await handlePostSearch(event, config, getEngine, bodyLimit);
|
|
17959
19068
|
}
|
|
17960
|
-
const engine = await getEngine();
|
|
17961
|
-
const searchRequest = body;
|
|
17962
|
-
const result = await engine.search(searchRequest);
|
|
17963
19069
|
return withCors(
|
|
17964
|
-
new Response(JSON.stringify(
|
|
17965
|
-
status:
|
|
17966
|
-
headers: {
|
|
17967
|
-
"content-type": "application/json"
|
|
17968
|
-
}
|
|
19070
|
+
new Response(JSON.stringify(toErrorPayload(new SearchSocketError("INVALID_REQUEST", "Method not allowed", 405))), {
|
|
19071
|
+
status: 405,
|
|
19072
|
+
headers: { "content-type": "application/json" }
|
|
17969
19073
|
}),
|
|
17970
19074
|
event.request,
|
|
17971
19075
|
config
|
|
@@ -17986,6 +19090,183 @@ function searchsocketHandle(options = {}) {
|
|
|
17986
19090
|
}
|
|
17987
19091
|
};
|
|
17988
19092
|
}
|
|
19093
|
+
/**
 * Reports whether a request pathname addresses the API mount point.
 *
 * A pathname matches when it is exactly `apiPath` or when it continues with a
 * `/` right after `apiPath`, so `/api/searchx` does not match `/api/search`.
 *
 * @param {string} pathname - Pathname of the incoming request URL.
 * @param {string} apiPath - Configured API mount path.
 * @returns {boolean} `true` when the request belongs to the API.
 */
function isApiPath(pathname, apiPath) {
  return pathname === apiPath || pathname.startsWith(`${apiPath}/`);
}
|
|
19096
|
+
/**
 * Handles `GET <apiPath>?q=...` by translating query-string parameters into a
 * search request and returning the engine's result as JSON.
 *
 * Recognised parameters: `q` (required), `topK`, `scope`, `pathPrefix`,
 * `groupBy` ("page" | "chunk"), `maxSubResults` (1-20) and repeated `tags`.
 *
 * @param {object} event - Request event; `event.url.searchParams` and
 *   `event.request` are read.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Resolves the search engine.
 * @returns {Promise<Response>} 200 JSON response passed through `withCors`.
 * @throws {SearchSocketError} `INVALID_REQUEST` (400) on a missing or invalid
 *   parameter.
 */
async function handleGetSearch(event, config, getEngine) {
  // Number.parseInt alone accepts "5abc" and "1.5"; the error messages promise
  // an integer, so only plain digit strings are let through.
  const parseStrictInt = (raw) => {
    const text = raw.trim();
    return /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  };

  const params = event.url.searchParams;
  const q = params.get("q");
  if (!q || q.trim() === "") {
    throw new SearchSocketError("INVALID_REQUEST", "Missing required query parameter: q", 400);
  }
  const searchRequest = { q };

  const topK = params.get("topK");
  if (topK !== null) {
    const parsed = parseStrictInt(topK);
    if (!Number.isSafeInteger(parsed) || parsed < 1) {
      throw new SearchSocketError("INVALID_REQUEST", "topK must be a positive integer", 400);
    }
    searchRequest.topK = parsed;
  }

  const scope = params.get("scope");
  if (scope !== null) {
    searchRequest.scope = scope;
  }

  const pathPrefix = params.get("pathPrefix");
  if (pathPrefix !== null) {
    searchRequest.pathPrefix = pathPrefix;
  }

  // An empty `groupBy=` is treated the same as an absent parameter.
  const groupBy = params.get("groupBy");
  if (groupBy) {
    if (groupBy !== "page" && groupBy !== "chunk") {
      throw new SearchSocketError("INVALID_REQUEST", 'groupBy must be "page" or "chunk"', 400);
    }
    searchRequest.groupBy = groupBy;
  }

  const maxSubResults = params.get("maxSubResults");
  if (maxSubResults !== null) {
    const parsed = parseStrictInt(maxSubResults);
    if (!Number.isSafeInteger(parsed) || parsed < 1 || parsed > 20) {
      throw new SearchSocketError("INVALID_REQUEST", "maxSubResults must be a positive integer between 1 and 20", 400);
    }
    searchRequest.maxSubResults = parsed;
  }

  const tags = params.getAll("tags");
  if (tags.length > 0) {
    searchRequest.tags = tags;
  }

  const engine = await getEngine();
  const result = await engine.search(searchRequest);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
19143
|
+
/**
 * Handles `GET <apiPath>/health` by returning `engine.health()` as JSON.
 *
 * @param {object} event - Request event; only `event.request` is read.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Resolves the search engine.
 * @returns {Promise<Response>} 200 JSON response passed through `withCors`.
 */
async function handleGetHealth(event, config, getEngine) {
  const engine = await getEngine();
  const result = await engine.health();
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19155
|
+
/**
 * Handles `GET <apiPath>/pages/<path>` by looking up one indexed page.
 *
 * The part of `subPath` after the `/pages` prefix is URI-decoded and passed to
 * `engine.getPage` together with the optional `scope` query parameter.
 *
 * @param {object} event - Request event; `event.url.searchParams` and
 *   `event.request` are read.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Resolves the search engine.
 * @param {string} subPath - Request path relative to the API mount point; the
 *   caller only routes here when it starts with `/pages/`.
 * @returns {Promise<Response>} 200 JSON response passed through `withCors`.
 * @throws {SearchSocketError} `INVALID_REQUEST` (400) when the path contains a
 *   malformed percent-escape.
 */
async function handleGetPage(event, config, getEngine, subPath) {
  // Keeps the leading "/" of the page path: "/pages/docs/a" -> "/docs/a".
  const rawPath = subPath.slice("/pages".length);
  let pagePath;
  try {
    pagePath = decodeURIComponent(rawPath);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed page path", 400);
  }
  const scope = event.url.searchParams?.get("scope") ?? undefined;
  const engine = await getEngine();
  const result = await engine.getPage(pagePath, scope);
  const response = new Response(JSON.stringify(result), {
    status: 200,
    headers: { "content-type": "application/json" }
  });
  return withCors(response, event.request, config);
}
|
|
19175
|
+
/**
 * Handles `POST <apiPath>` by parsing a JSON body and running it as a search.
 *
 * The size limit is enforced twice: first against the declared
 * `content-length` header, then against the actual UTF-8 byte length of the
 * body that was read, because the header may be absent or inaccurate.
 *
 * @param {object} event - Request event; `event.request` is read.
 * @param {object} config - Resolved configuration, forwarded to `withCors`.
 * @param {() => Promise<object>} getEngine - Resolves the search engine.
 * @param {number} bodyLimit - Maximum accepted body size in bytes.
 * @returns {Promise<Response>} 200 JSON response passed through `withCors`.
 * @throws {SearchSocketError} `INVALID_REQUEST` with status 413 (body too
 *   large) or 400 (malformed JSON).
 */
async function handlePostSearch(event, config, getEngine, bodyLimit) {
  const contentLength = Number(event.request.headers.get("content-length") ?? 0);
  if (contentLength > bodyLimit) {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  }

  let rawBody;
  if (typeof event.request.text === "function") {
    rawBody = await event.request.text();
  } else {
    // Request objects without text(): parse via json() and re-serialise so the
    // byte-length check below still applies.
    let parsedFallback;
    try {
      parsedFallback = await event.request.json();
    } catch (error) {
      if (error instanceof SyntaxError) {
        throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
      }
      throw error;
    }
    rawBody = JSON.stringify(parsedFallback);
  }

  if (Buffer.byteLength(rawBody, "utf8") > bodyLimit) {
    throw new SearchSocketError("INVALID_REQUEST", "Request body too large", 413);
  }

  let body;
  try {
    body = JSON.parse(rawBody);
  } catch {
    throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
  }

  const engine = await getEngine();
  const result = await engine.search(body);
  return withCors(
    new Response(JSON.stringify(result), {
      status: 200,
      headers: { "content-type": "application/json" }
    }),
    event.request,
    config
  );
}
|
|
19216
|
+
/**
 * Serves one MCP request over the web-standard streamable HTTP transport.
 *
 * When `apiKey` is set, the request must carry `Authorization: Bearer <key>`;
 * otherwise a JSON-RPC error with HTTP 401 is returned. A fresh transport and
 * server are created per request (no session id generator is configured).
 *
 * @param {object} event - Request event; `event.request` is read.
 * @param {string | undefined} apiKey - Expected bearer token; falsy disables auth.
 * @param {boolean} enableJsonResponse - Forwarded to the transport; when true
 *   the transport and server are closed as soon as the response is produced.
 * @param {() => Promise<object>} getEngine - Resolves the search engine.
 * @returns {Promise<Response>} The transport's response, or a JSON-RPC error
 *   response with status 401 or 500.
 */
async function handleMcpRequest(event, apiKey, enableJsonResponse, getEngine) {
  if (apiKey) {
    const authHeader = event.request.headers.get("authorization") ?? "";
    const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
    // Compare fixed-size SHA-256 digests: timingSafeEqual needs equal-length
    // inputs, and a separate length pre-check would reveal the key length.
    const tokenDigest = createHash("sha256").update(token).digest();
    const keyDigest = createHash("sha256").update(apiKey).digest();
    if (!timingSafeEqual(tokenDigest, keyDigest)) {
      return new Response(
        JSON.stringify({
          jsonrpc: "2.0",
          error: { code: -32001, message: "Unauthorized" },
          id: null
        }),
        { status: 401, headers: { "content-type": "application/json" } }
      );
    }
  }

  const transport = new WebStandardStreamableHTTPServerTransport({
    sessionIdGenerator: undefined,
    enableJsonResponse
  });
  let server;
  try {
    const engine = await getEngine();
    server = createServer(engine);
    await server.connect(transport);
    const response = await transport.handleRequest(event.request);
    // Cleanup happens here only in JSON-response mode; otherwise the response
    // is returned with transport and server still open.
    if (enableJsonResponse) {
      await transport.close();
      await server.close();
    }
    return response;
  } catch (error) {
    // Best-effort cleanup: a failure while closing must not mask the original
    // error that is reported to the client below.
    try {
      await transport.close();
    } catch {
      // ignored on purpose
    }
    try {
      await server?.close();
    } catch {
      // ignored on purpose
    }
    return new Response(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : "Internal server error"
        },
        id: null
      }),
      { status: 500, headers: { "content-type": "application/json" } }
    );
  }
}
|
|
17989
19270
|
function buildCorsHeaders(request, config) {
|
|
17990
19271
|
const allowOrigins = config.api.cors.allowOrigins;
|
|
17991
19272
|
if (!allowOrigins || allowOrigins.length === 0) {
|
|
@@ -17998,7 +19279,7 @@ function buildCorsHeaders(request, config) {
|
|
|
17998
19279
|
}
|
|
17999
19280
|
return {
|
|
18000
19281
|
"access-control-allow-origin": allowOrigins.includes("*") ? "*" : origin,
|
|
18001
|
-
"access-control-allow-methods": "POST, OPTIONS",
|
|
19282
|
+
"access-control-allow-methods": "GET, POST, OPTIONS",
|
|
18002
19283
|
"access-control-allow-headers": "content-type"
|
|
18003
19284
|
};
|
|
18004
19285
|
}
|
|
@@ -18045,6 +19326,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18045
19326
|
if (normalizeText(current.text)) {
|
|
18046
19327
|
sections.push({
|
|
18047
19328
|
sectionTitle: current.sectionTitle,
|
|
19329
|
+
headingLevel: current.headingLevel,
|
|
18048
19330
|
headingPath: current.headingPath,
|
|
18049
19331
|
text: current.text.trim()
|
|
18050
19332
|
});
|
|
@@ -18063,6 +19345,7 @@ function parseHeadingSections(markdown, headingPathDepth) {
|
|
|
18063
19345
|
headingStack.length = level;
|
|
18064
19346
|
current = {
|
|
18065
19347
|
sectionTitle: title,
|
|
19348
|
+
headingLevel: level,
|
|
18066
19349
|
headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
|
|
18067
19350
|
text: `${line}
|
|
18068
19351
|
`
|
|
@@ -18198,6 +19481,7 @@ function splitSection(section, config) {
|
|
|
18198
19481
|
return [
|
|
18199
19482
|
{
|
|
18200
19483
|
sectionTitle: section.sectionTitle,
|
|
19484
|
+
headingLevel: section.headingLevel,
|
|
18201
19485
|
headingPath: section.headingPath,
|
|
18202
19486
|
chunkText: text
|
|
18203
19487
|
}
|
|
@@ -18248,6 +19532,7 @@ ${chunk}`;
|
|
|
18248
19532
|
}
|
|
18249
19533
|
return merged.map((chunkText) => ({
|
|
18250
19534
|
sectionTitle: section.sectionTitle,
|
|
19535
|
+
headingLevel: section.headingLevel,
|
|
18251
19536
|
headingPath: section.headingPath,
|
|
18252
19537
|
chunkText
|
|
18253
19538
|
}));
|
|
@@ -18263,6 +19548,18 @@ function buildSummaryChunkText(page) {
|
|
|
18263
19548
|
}
|
|
18264
19549
|
return parts.join("\n\n");
|
|
18265
19550
|
}
|
|
19551
|
+
/**
 * Builds the title string associated with a heading-scoped chunk.
 *
 * @param {object} chunk - Chunk with `title`, `sectionTitle`, `headingLevel`
 *   and `headingPath` (array of heading titles).
 * @returns {string | undefined} `undefined` when the chunk has no section
 *   title or no heading level; otherwise `"<title> — <breadcrumb>"`, where the
 *   breadcrumb is the joined heading path (with the section title appended if
 *   it is not already the path's last entry), or just the section title when
 *   the path has at most one entry.
 */
function buildEmbeddingTitle(chunk) {
  if (!chunk.sectionTitle || chunk.headingLevel === undefined) {
    return undefined;
  }
  if (chunk.headingPath.length <= 1) {
    return `${chunk.title} \u2014 ${chunk.sectionTitle}`;
  }
  const breadcrumb = chunk.headingPath.join(" > ");
  const lastInPath = chunk.headingPath[chunk.headingPath.length - 1];
  if (lastInPath !== chunk.sectionTitle) {
    return `${chunk.title} \u2014 ${breadcrumb} > ${chunk.sectionTitle}`;
  }
  return `${chunk.title} \u2014 ${breadcrumb}`;
}
|
|
18266
19563
|
function buildEmbeddingText(chunk, prependTitle) {
|
|
18267
19564
|
if (!prependTitle) return chunk.chunkText;
|
|
18268
19565
|
const prefix = chunk.sectionTitle ? `${chunk.title} \u2014 ${chunk.sectionTitle}` : chunk.title;
|
|
@@ -18293,10 +19590,14 @@ function chunkPage(page, config, scope) {
|
|
|
18293
19590
|
tags: page.tags,
|
|
18294
19591
|
contentHash: "",
|
|
18295
19592
|
description: page.description,
|
|
18296
|
-
keywords: page.keywords
|
|
19593
|
+
keywords: page.keywords,
|
|
19594
|
+
publishedAt: page.publishedAt,
|
|
19595
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19596
|
+
meta: page.meta
|
|
18297
19597
|
};
|
|
18298
19598
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18299
|
-
|
|
19599
|
+
const metaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19600
|
+
summaryChunk.contentHash = sha256(normalizeText(embeddingText) + metaSuffix);
|
|
18300
19601
|
chunks.push(summaryChunk);
|
|
18301
19602
|
}
|
|
18302
19603
|
const ordinalOffset = config.chunking.pageSummaryChunk ? 1 : 0;
|
|
@@ -18313,6 +19614,7 @@ function chunkPage(page, config, scope) {
|
|
|
18313
19614
|
path: page.url,
|
|
18314
19615
|
title: page.title,
|
|
18315
19616
|
sectionTitle: entry.sectionTitle,
|
|
19617
|
+
headingLevel: entry.headingLevel,
|
|
18316
19618
|
headingPath: entry.headingPath,
|
|
18317
19619
|
chunkText: entry.chunkText,
|
|
18318
19620
|
snippet: toSnippet(entry.chunkText),
|
|
@@ -18322,10 +19624,16 @@ function chunkPage(page, config, scope) {
|
|
|
18322
19624
|
tags: page.tags,
|
|
18323
19625
|
contentHash: "",
|
|
18324
19626
|
description: page.description,
|
|
18325
|
-
keywords: page.keywords
|
|
19627
|
+
keywords: page.keywords,
|
|
19628
|
+
publishedAt: page.publishedAt,
|
|
19629
|
+
incomingAnchorText: page.incomingAnchorText,
|
|
19630
|
+
meta: page.meta
|
|
18326
19631
|
};
|
|
18327
19632
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18328
|
-
|
|
19633
|
+
const embeddingTitle = config.chunking.weightHeadings ? buildEmbeddingTitle(chunk) : void 0;
|
|
19634
|
+
const chunkMetaSuffix = page.meta ? JSON.stringify(page.meta, Object.keys(page.meta).sort()) : "";
|
|
19635
|
+
const hashInput = embeddingTitle ? `${normalizeText(embeddingText)}|title:${embeddingTitle}` : normalizeText(embeddingText);
|
|
19636
|
+
chunk.contentHash = sha256(hashInput + chunkMetaSuffix);
|
|
18329
19637
|
chunks.push(chunk);
|
|
18330
19638
|
}
|
|
18331
19639
|
return chunks;
|
|
@@ -19158,6 +20466,69 @@ function gfm(turndownService) {
|
|
|
19158
20466
|
}
|
|
19159
20467
|
|
|
19160
20468
|
// src/indexing/extractor.ts
|
|
20469
|
+
/**
 * Normalises a date-like value to a finite millisecond timestamp.
 *
 * @param {unknown} value - A `Date`, a string accepted by the `Date`
 *   constructor, or a number (returned unchanged when finite).
 * @returns {number | undefined} The timestamp, or `undefined` for nullish
 *   input, unsupported types, invalid dates and non-finite numbers.
 */
function normalizeDateToMs(value) {
  if (value == null) return undefined;

  let timestamp;
  if (value instanceof Date) {
    timestamp = value.getTime();
  } else if (typeof value === "string") {
    timestamp = new Date(value).getTime();
  } else if (typeof value === "number") {
    timestamp = value;
  } else {
    return undefined;
  }
  // Invalid dates yield NaN; treat those (and +/-Infinity) as "no date".
  return Number.isFinite(timestamp) ? timestamp : undefined;
}
|
|
20484
|
+
// Frontmatter keys probed for a publication date, in priority order.
const FRONTMATTER_DATE_FIELDS = ["date", "publishedAt", "updatedAt", "published_at", "updated_at"];

/**
 * Picks the first usable date from a frontmatter object.
 *
 * @param {object | null | undefined} data - Parsed frontmatter.
 * @returns {number | undefined} Millisecond timestamp of the first field in
 *   `FRONTMATTER_DATE_FIELDS` that `normalizeDateToMs` accepts, or `undefined`
 *   when none qualifies (including when `data` is nullish).
 */
function extractPublishedAtFromFrontmatter(data) {
  for (const field of FRONTMATTER_DATE_FIELDS) {
    const timestamp = normalizeDateToMs(data?.[field]);
    if (timestamp !== undefined) return timestamp;
  }
  return undefined;
}
|
|
20492
|
+
/**
 * Extracts a publication timestamp from an HTML document.
 *
 * Sources are tried in this order and the first valid date wins:
 *   1. `datePublished` in JSON-LD scripts (top-level object, top-level array,
 *      or entries of a top-level `@graph` array);
 *   2. `<meta property="article:published_time">`;
 *   3. `<meta itemprop="datePublished">` or `<time itemprop="datePublished">`;
 *   4. the first `<time datetime>` element.
 *
 * @param {Function} $ - Loaded document query function (the value returned by
 *   `load(html)` elsewhere in this file).
 * @returns {number | undefined} Millisecond timestamp, or `undefined`.
 */
function extractPublishedAtFromHtml($) {
  const jsonLdScripts = $('script[type="application/ld+json"]');
  for (let i = 0; i < jsonLdScripts.length; i++) {
    try {
      const raw = $(jsonLdScripts[i]).html();
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const candidates = [];
      if (Array.isArray(parsed)) {
        candidates.push(...parsed);
      } else if (parsed && typeof parsed === "object") {
        candidates.push(parsed);
        if (Array.isArray(parsed["@graph"])) {
          candidates.push(...parsed["@graph"]);
        }
      }
      for (const candidate of candidates) {
        // Optional access: a null entry must not throw and thereby discard the
        // remaining candidates of this script.
        const timestamp = normalizeDateToMs(candidate?.datePublished);
        if (timestamp !== undefined) return timestamp;
      }
    } catch {
      // Unparseable JSON-LD is skipped on purpose; later sources may still
      // provide a date.
    }
  }

  const fallbackSources = [
    () => $('meta[property="article:published_time"]').attr("content")?.trim(),
    () =>
      $('meta[itemprop="datePublished"]').attr("content")?.trim() ||
      $('time[itemprop="datePublished"]').attr("datetime")?.trim(),
    () => $("time[datetime]").first().attr("datetime")?.trim()
  ];
  for (const readSource of fallbackSources) {
    const text = readSource();
    if (!text) continue;
    const timestamp = normalizeDateToMs(text);
    if (timestamp !== undefined) return timestamp;
  }
  return undefined;
}
|
|
19161
20532
|
function hasTopLevelNoindexComment(markdown) {
|
|
19162
20533
|
const lines = markdown.split(/\r?\n/);
|
|
19163
20534
|
let inFence = false;
|
|
@@ -19173,6 +20544,97 @@ function hasTopLevelNoindexComment(markdown) {
|
|
|
19173
20544
|
}
|
|
19174
20545
|
return false;
|
|
19175
20546
|
}
|
|
20547
|
+
// Single-word alt texts that describe the medium rather than the content.
const GARBAGE_ALT_WORDS = /* @__PURE__ */ new Set([
  "image",
  "photo",
  "picture",
  "icon",
  "logo",
  "banner",
  "screenshot",
  "thumbnail",
  "img",
  "graphic",
  "illustration",
  "spacer",
  "pixel",
  "placeholder",
  "avatar",
  "background"
]);

// Matches alt text that ends in an image file extension (optionally followed
// by a query string).
const IMAGE_EXT_RE = /\.(jpg|jpeg|png|gif|svg|webp|avif|bmp|ico)(\?.*)?$/i;

/**
 * Decides whether an image `alt` text carries real descriptive content.
 *
 * @param {string} alt - Raw alt attribute value.
 * @returns {boolean} `false` when the trimmed text is shorter than five
 *   characters, ends in an image file extension, or is one of the generic
 *   words in `GARBAGE_ALT_WORDS` (case-insensitive); `true` otherwise.
 */
function isMeaningfulAlt(alt) {
  const trimmed = alt.trim();
  if (trimmed.length < 5) return false;
  if (IMAGE_EXT_RE.test(trimmed)) return false;
  return !GARBAGE_ALT_WORDS.has(trimmed.toLowerCase());
}
|
|
20573
|
+
/**
 * Chooses the text that should stand in for an image.
 *
 * Priority: the description attribute on the image, then the same attribute
 * on the enclosing `<figure>`, then meaningful alt text combined with the
 * figure caption, then either of the two alone.
 *
 * @param {object} img - Wrapped `<img>` element.
 * @param {Function} $ - Document query function (not used by this function;
 *   kept for the existing call signature).
 * @param {string} imageDescAttr - Name of the description attribute.
 * @returns {string | null} Replacement text, or `null` when nothing usable
 *   was found.
 */
function resolveImageText(img, $, imageDescAttr) {
  const imgDesc = img.attr(imageDescAttr)?.trim();
  if (imgDesc) return imgDesc;

  const figure = img.closest("figure");
  const insideFigure = figure.length > 0;
  if (insideFigure) {
    const figDesc = figure.attr(imageDescAttr)?.trim();
    if (figDesc) return figDesc;
  }

  const alt = img.attr("alt")?.trim() ?? "";
  const caption = insideFigure ? figure.find("figcaption").first().text().trim() : "";
  if (isMeaningfulAlt(alt)) {
    return caption ? `${alt} \u2014 ${caption}` : alt;
  }
  return caption || null;
}
|
|
20594
|
+
// Link texts that say nothing about the link target.
const STOP_ANCHORS = /* @__PURE__ */ new Set([
  "here",
  "click",
  "click here",
  "read more",
  "link",
  "this",
  "more"
]);

/**
 * Normalises link anchor text.
 *
 * Whitespace runs are collapsed to single spaces, the text is trimmed and
 * lower-cased, and the result is capped at 100 characters.
 *
 * @param {string} raw - Raw anchor text.
 * @returns {string} The normalised text, or `""` when it is shorter than three
 *   characters or is one of `STOP_ANCHORS`.
 */
function normalizeAnchorText(raw) {
  const normalized = raw.replace(/\s+/g, " ").trim().toLowerCase();
  if (normalized.length < 3 || STOP_ANCHORS.has(normalized)) {
    return "";
  }
  return normalized.length > 100 ? normalized.slice(0, 100) : normalized;
}
|
|
20610
|
+
/**
 * Escapes text for safe insertion as HTML element content.
 *
 * `&` is replaced first so the entities produced for `<` and `>` are not
 * escaped a second time.
 *
 * @param {string} text - Plain text.
 * @returns {string} Text with `&`, `<` and `>` replaced by HTML entities.
 */
function escapeHtml(text) {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
20613
|
+
/**
 * Replaces images under `root2` with their textual description.
 *
 * `<picture>` elements are handled first (using their first `<img>`), then
 * every remaining `<img>`. An element with usable text becomes
 * `<span>text</span>` and the `<figcaption>` of its enclosing figure is
 * removed; an element without usable text is removed.
 *
 * @param {object} root2 - Wrapped root element to process in place.
 * @param {Function} $ - Document query function used to wrap elements.
 * @param {string} imageDescAttr - Name of the image description attribute.
 * @returns {void}
 */
function preprocessImages(root2, $, imageDescAttr) {
  // Shared by both passes: `node` is the element that gets replaced or
  // removed, `img` is the image the text is resolved from.
  const replaceWithText = (node, img) => {
    const parentFigure = node.closest("figure");
    const text = img.length ? resolveImageText(img, $, imageDescAttr) : null;
    if (!text) {
      node.remove();
      return;
    }
    if (parentFigure.length) parentFigure.find("figcaption").remove();
    node.replaceWith(`<span>${escapeHtml(text)}</span>`);
  };

  root2.find("picture").each((_i, el) => {
    const picture = $(el);
    replaceWithText(picture, picture.find("img").first());
  });
  root2.find("img").each((_i, el) => {
    const img = $(el);
    replaceWithText(img, img);
  });
}
|
|
19176
20638
|
function extractFromHtml(url, html, config) {
|
|
19177
20639
|
const $ = load(html);
|
|
19178
20640
|
const normalizedUrl = normalizeUrlPath(url);
|
|
@@ -19198,6 +20660,20 @@ function extractFromHtml(url, html, config) {
|
|
|
19198
20660
|
if (weight === 0) {
|
|
19199
20661
|
return null;
|
|
19200
20662
|
}
|
|
20663
|
+
if ($('meta[name="searchsocket:noindex"]').attr("content") === "true") {
|
|
20664
|
+
return null;
|
|
20665
|
+
}
|
|
20666
|
+
const RESERVED_META_KEYS = /* @__PURE__ */ new Set(["noindex", "tags"]);
|
|
20667
|
+
const meta = {};
|
|
20668
|
+
$('meta[name^="searchsocket:"]').each((_i, el) => {
|
|
20669
|
+
const name = $(el).attr("name") ?? "";
|
|
20670
|
+
const key = name.slice("searchsocket:".length);
|
|
20671
|
+
if (!key || RESERVED_META_KEYS.has(key) || !validateMetaKey(key)) return;
|
|
20672
|
+
const content = $(el).attr("content") ?? "";
|
|
20673
|
+
const dataType = $(el).attr("data-type") ?? "string";
|
|
20674
|
+
meta[key] = parseMetaValue(content, dataType);
|
|
20675
|
+
});
|
|
20676
|
+
const componentTags = $('meta[name="searchsocket:tags"]').attr("content")?.trim();
|
|
19201
20677
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19202
20678
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19203
20679
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19209,7 +20685,9 @@ function extractFromHtml(url, html, config) {
|
|
|
19209
20685
|
root2.find(selector).remove();
|
|
19210
20686
|
}
|
|
19211
20687
|
root2.find(`[${config.extract.ignoreAttr}]`).remove();
|
|
20688
|
+
preprocessImages(root2, $, config.extract.imageDescAttr);
|
|
19212
20689
|
const outgoingLinks = [];
|
|
20690
|
+
const seenLinkKeys = /* @__PURE__ */ new Set();
|
|
19213
20691
|
root2.find("a[href]").each((_index, node) => {
|
|
19214
20692
|
const href = $(node).attr("href");
|
|
19215
20693
|
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:")) {
|
|
@@ -19220,7 +20698,19 @@ function extractFromHtml(url, html, config) {
|
|
|
19220
20698
|
if (!["http:", "https:"].includes(parsed.protocol)) {
|
|
19221
20699
|
return;
|
|
19222
20700
|
}
|
|
19223
|
-
|
|
20701
|
+
const url2 = normalizeUrlPath(parsed.pathname);
|
|
20702
|
+
let anchorText = normalizeAnchorText($(node).text());
|
|
20703
|
+
if (!anchorText) {
|
|
20704
|
+
const imgAlt = $(node).find("img").first().attr("alt") ?? "";
|
|
20705
|
+
if (isMeaningfulAlt(imgAlt)) {
|
|
20706
|
+
anchorText = normalizeAnchorText(imgAlt);
|
|
20707
|
+
}
|
|
20708
|
+
}
|
|
20709
|
+
const key = `${url2}|${anchorText}`;
|
|
20710
|
+
if (!seenLinkKeys.has(key)) {
|
|
20711
|
+
seenLinkKeys.add(key);
|
|
20712
|
+
outgoingLinks.push({ url: url2, anchorText });
|
|
20713
|
+
}
|
|
19224
20714
|
} catch {
|
|
19225
20715
|
}
|
|
19226
20716
|
});
|
|
@@ -19245,16 +20735,25 @@ function extractFromHtml(url, html, config) {
|
|
|
19245
20735
|
return null;
|
|
19246
20736
|
}
|
|
19247
20737
|
const tags = normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1);
|
|
20738
|
+
const publishedAt = extractPublishedAtFromHtml($);
|
|
20739
|
+
if (componentTags) {
|
|
20740
|
+
const extraTags = componentTags.split(",").map((t) => t.trim()).filter(Boolean);
|
|
20741
|
+
for (const t of extraTags) {
|
|
20742
|
+
if (!tags.includes(t)) tags.push(t);
|
|
20743
|
+
}
|
|
20744
|
+
}
|
|
19248
20745
|
return {
|
|
19249
20746
|
url: normalizeUrlPath(url),
|
|
19250
20747
|
title,
|
|
19251
20748
|
markdown,
|
|
19252
|
-
outgoingLinks
|
|
20749
|
+
outgoingLinks,
|
|
19253
20750
|
noindex: false,
|
|
19254
20751
|
tags,
|
|
19255
20752
|
description,
|
|
19256
20753
|
keywords,
|
|
19257
|
-
weight
|
|
20754
|
+
weight,
|
|
20755
|
+
publishedAt,
|
|
20756
|
+
meta: Object.keys(meta).length > 0 ? meta : void 0
|
|
19258
20757
|
};
|
|
19259
20758
|
}
|
|
19260
20759
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19275,6 +20774,24 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19275
20774
|
if (mdWeight === 0) {
|
|
19276
20775
|
return null;
|
|
19277
20776
|
}
|
|
20777
|
+
let mdMeta;
|
|
20778
|
+
const rawMeta = searchsocketMeta?.meta;
|
|
20779
|
+
if (rawMeta && typeof rawMeta === "object" && !Array.isArray(rawMeta)) {
|
|
20780
|
+
const metaObj = {};
|
|
20781
|
+
for (const [key, val] of Object.entries(rawMeta)) {
|
|
20782
|
+
if (!validateMetaKey(key)) continue;
|
|
20783
|
+
if (typeof val === "string" || typeof val === "number" || typeof val === "boolean") {
|
|
20784
|
+
metaObj[key] = val;
|
|
20785
|
+
} else if (Array.isArray(val) && val.every((v) => typeof v === "string")) {
|
|
20786
|
+
metaObj[key] = val;
|
|
20787
|
+
} else if (val instanceof Date) {
|
|
20788
|
+
metaObj[key] = val.getTime();
|
|
20789
|
+
}
|
|
20790
|
+
}
|
|
20791
|
+
if (Object.keys(metaObj).length > 0) {
|
|
20792
|
+
mdMeta = metaObj;
|
|
20793
|
+
}
|
|
20794
|
+
}
|
|
19278
20795
|
const content = parsed.content;
|
|
19279
20796
|
const normalized = normalizeMarkdown(content);
|
|
19280
20797
|
if (!normalizeText(normalized)) {
|
|
@@ -19289,6 +20806,7 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19289
20806
|
fmKeywords = frontmatter.keywords.split(",").map((k) => k.trim()).filter(Boolean);
|
|
19290
20807
|
}
|
|
19291
20808
|
if (fmKeywords && fmKeywords.length === 0) fmKeywords = void 0;
|
|
20809
|
+
const publishedAt = extractPublishedAtFromFrontmatter(frontmatter);
|
|
19292
20810
|
return {
|
|
19293
20811
|
url: normalizeUrlPath(url),
|
|
19294
20812
|
title: resolvedTitle,
|
|
@@ -19298,7 +20816,9 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19298
20816
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19299
20817
|
description: fmDescription,
|
|
19300
20818
|
keywords: fmKeywords,
|
|
19301
|
-
weight: mdWeight
|
|
20819
|
+
weight: mdWeight,
|
|
20820
|
+
publishedAt,
|
|
20821
|
+
meta: mdMeta
|
|
19302
20822
|
};
|
|
19303
20823
|
}
|
|
19304
20824
|
function segmentToRegex(segment) {
|
|
@@ -19461,7 +20981,7 @@ async function parseManifest(cwd, outputDir) {
|
|
|
19461
20981
|
const manifestPath = path.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
19462
20982
|
let content;
|
|
19463
20983
|
try {
|
|
19464
|
-
content = await
|
|
20984
|
+
content = await fs9.readFile(manifestPath, "utf8");
|
|
19465
20985
|
} catch {
|
|
19466
20986
|
throw new SearchSocketError(
|
|
19467
20987
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -19772,6 +21292,125 @@ function filePathToUrl(filePath, baseDir) {
|
|
|
19772
21292
|
const noExt = relative.replace(/\.md$/i, "").replace(/\/index$/i, "");
|
|
19773
21293
|
return normalizeUrlPath(noExt || "/");
|
|
19774
21294
|
}
|
|
21295
|
+
// Matches SvelteKit route files: +page / +layout / +error, optionally with an
// "@target" suffix, ending in ".svelte".
const ROUTE_FILE_RE = /\+(page|layout|error)(@[^.]+)?\.svelte$/;

/**
 * Tells whether a path names a Svelte component that is not a route file.
 *
 * @param {string} filePath - File path or file name.
 * @returns {boolean} `true` for `.svelte` files that do not match
 *   `ROUTE_FILE_RE`.
 */
function isSvelteComponentFile(filePath) {
  return filePath.endsWith(".svelte") && !ROUTE_FILE_RE.test(filePath);
}
|
|
21300
|
+
/**
 * Extracts documentation metadata from Svelte component source text.
 *
 * The description comes from the first `<!-- @component ... -->` comment.
 * Props come from the first `let { ... } = $props()` destructuring; their
 * types are looked up either in a named interface/type alias declared in the
 * same source or in an inline `{ ... }` annotation.
 *
 * @param {string} source - Full component source.
 * @returns {{ description: string | undefined,
 *   props: Array<{ name: string, type?: string, default?: string }> }}
 *   Rest elements (`...rest`) are not listed as props.
 */
function extractSvelteComponentMeta(source) {
  const componentMatch = source.match(/<!--\s*@component\s*([\s\S]*?)\s*-->/);
  const description = componentMatch?.[1]?.trim() || undefined;
  const props = [];

  const propsMatch = source.match(
    /let\s+\{([\s\S]*?)\}\s*(?::\s*([^=;{][\s\S]*?))?\s*=\s*\$props\(\)/
  );
  if (!propsMatch) {
    return { description, props };
  }

  const destructureBlock = propsMatch[1];
  const typeAnnotation = propsMatch[2]?.trim();
  let resolvedTypeMap;
  if (typeAnnotation && /^[A-Z]\w*$/.test(typeAnnotation)) {
    // Bare capitalised identifier: resolve its declaration within this source.
    resolvedTypeMap = resolveTypeReference(source, typeAnnotation);
  } else if (typeAnnotation && typeAnnotation.startsWith("{")) {
    resolvedTypeMap = parseInlineTypeAnnotation(typeAnnotation);
  }

  for (const entry of splitDestructureBlock(destructureBlock)) {
    const trimmed = entry.trim();
    if (!trimmed || trimmed.startsWith("...")) continue;

    let propName;
    let defaultValue;
    // "name: alias" or "name: alias = default" — the public prop is `name`.
    const renameMatch = trimmed.match(/^(\w+)\s*:\s*\w+\s*(?:=\s*([\s\S]+))?$/);
    if (renameMatch) {
      propName = renameMatch[1];
      defaultValue = renameMatch[2]?.trim();
    } else {
      const defaultMatch = trimmed.match(/^(\w+)\s*=\s*([\s\S]+)$/);
      if (defaultMatch) {
        propName = defaultMatch[1];
        defaultValue = defaultMatch[2]?.trim();
      } else {
        propName = trimmed.match(/^(\w+)/)?.[1] ?? trimmed;
      }
    }

    const propType = resolvedTypeMap?.get(propName);
    props.push({
      name: propName,
      ...(propType ? { type: propType } : {}),
      ...(defaultValue ? { default: defaultValue } : {})
    });
  }
  return { description, props };
}
|
|
21345
|
+
/**
 * Splits the inside of a destructuring pattern at its top-level commas.
 *
 * Commas nested in `{}`, `[]` or `()` do not split, and neither do commas or
 * brackets inside string literals (single-, double- or backtick-quoted), so a
 * default such as `sep = ","` or `face = ":)"` stays within one entry.
 *
 * @param {string} block - Text between the braces of `let { ... }`.
 * @returns {string[]} Entries in source order with their original whitespace;
 *   a whitespace-only trailing entry is dropped.
 */
function splitDestructureBlock(block) {
  const entries = [];
  let depth = 0;
  let current = "";
  let quote = null; // the quote character of the string literal being scanned
  let escaped = false; // true right after a backslash inside a string literal

  for (const ch of block) {
    if (quote !== null) {
      current += ch;
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === quote) {
        quote = null;
      }
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "{" || ch === "[" || ch === "(") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")") {
      depth--;
      current += ch;
    } else if (ch === "," && depth === 0) {
      entries.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  if (current.trim()) entries.push(current);
  return entries;
}
|
|
21366
|
+
/**
 * Locate `interface <typeName> {` or `type <typeName> = {` in `source` and
 * parse the members of its brace-balanced body.
 *
 * @param {string} source - Script text to search.
 * @param {string} typeName - Name of the interface or type alias to resolve.
 * @returns {Map<string, string> | undefined} Result of `parseTypeMembers` on
 *   the declaration body, or `undefined` when the declaration is not found or
 *   its braces never balance.
 */
function resolveTypeReference(source, typeName) {
  // Escape the name: identifiers may contain `$` (e.g. `$$Props`), and an
  // unescaped metacharacter would either never match or make the pattern invalid.
  const escapedName = typeName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const startRe = new RegExp(
    `\\b(?:interface\\s+${escapedName}\\s*|type\\s+${escapedName}\\s*=\\s*)\\{`
  );
  const startMatch = startRe.exec(source);
  if (startMatch === null) return void 0;
  const bodyStart = startMatch.index + startMatch[0].length;
  // The opening brace was consumed by the match, so nesting starts at 1.
  let depth = 1;
  let cursor = bodyStart;
  for (; cursor < source.length && depth > 0; cursor++) {
    const ch = source[cursor];
    if (ch === "{") depth++;
    else if (ch === "}") depth--;
  }
  if (depth !== 0) return void 0;
  // `cursor` is one past the closing brace; exclude that brace from the body.
  return parseTypeMembers(source.slice(bodyStart, cursor - 1));
}
|
|
21382
|
+
/**
 * Parse an inline object-type annotation such as `{ a: string; b?: number }`.
 *
 * Removes one leading `{` and one trailing `}` when present, then hands the
 * remaining text to `parseTypeMembers`.
 *
 * @param {string} annotation - Annotation text.
 * @returns {Map<string, string>} Whatever `parseTypeMembers` returns for the inner text.
 */
function parseInlineTypeAnnotation(annotation) {
  const withoutOpen = annotation.startsWith("{") ? annotation.slice(1) : annotation;
  const inner = withoutOpen.endsWith("}") ? withoutOpen.slice(0, -1) : withoutOpen;
  return parseTypeMembers(inner);
}
|
|
21386
|
+
/**
 * Parse the body of an object type / interface into a name -> type-text map.
 *
 * Members are separated by `;`, `,` or a newline, but only at the top level:
 * separators nested inside `{}`, `[]`, `()`, `<>`, string/template literals
 * are part of the member's type. Line and block comments are ignored.
 * Only `name: Type` / `name?: Type` members (optionally prefixed with
 * `readonly`) are recorded; method and index signatures are skipped.
 *
 * @param {string} body - Text between the braces of the type declaration.
 * @returns {Map<string, string>} Member name mapped to its type text, with
 *   line breaks inside a multi-line type collapsed to single spaces.
 */
function parseTypeMembers(body) {
  const map = /* @__PURE__ */ new Map();
  const members = [];
  let current = "";
  let depth = 0;
  let quote = null;
  for (let i = 0; i < body.length; i++) {
    const ch = body[i];
    if (quote !== null) {
      if (ch === "\n" && quote !== "`") {
        // Only template literals may span lines; drop the stray quote state
        // and let the newline act as a normal separator below.
        quote = null;
      } else {
        current += ch;
        if (ch === "\\" && i + 1 < body.length) current += body[++i];
        else if (ch === quote) quote = null;
        continue;
      }
    }
    const next = body[i + 1];
    if (ch === "/" && next === "/") {
      // Skip the line comment but leave its newline to terminate the member.
      const eol = body.indexOf("\n", i);
      i = (eol === -1 ? body.length : eol) - 1;
      continue;
    }
    if (ch === "/" && next === "*") {
      const close = body.indexOf("*/", i + 2);
      i = close === -1 ? body.length : close + 1;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      current += ch;
    } else if (ch === "{" || ch === "[" || ch === "(" || ch === "<") {
      depth++;
      current += ch;
    } else if (ch === "}" || ch === "]" || ch === ")" || (ch === ">" && body[i - 1] !== "=")) {
      // The `>` of an arrow (`=>`) is not a closing bracket.
      if (depth > 0) depth--;
      current += ch;
    } else if (depth === 0 && (ch === ";" || ch === "," || ch === "\n")) {
      members.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  members.push(current);
  for (const raw of members) {
    const member = raw.trim();
    if (!member) continue;
    const memberMatch = member.match(/^(?:readonly\s+)?([\w$]+)\??\s*:\s*([\s\S]+)$/);
    if (memberMatch) {
      const typeText = memberMatch[2].replace(/,\s*$/, "").replace(/\s*\n\s*/g, " ").trim();
      map.set(memberMatch[1], typeText);
    }
  }
  return map;
}
|
|
21397
|
+
/**
 * Render extracted component metadata as a single line of descriptive text.
 *
 * Output shape: `<Name> component. <description> Props: a (type) default: x, b.`
 * The description and the props sentence are each included only when present.
 *
 * @param {string} componentName - Component name used in the leading sentence.
 * @param {{ description?: string, props: Array<{ name: string, type?: string, default?: string }> }} meta
 *   Metadata to render.
 * @returns {string} The rendered text, or `""` when there is neither a
 *   description nor any prop.
 */
function buildComponentMarkdown(componentName, meta) {
  const hasProps = meta.props.length > 0;
  if (!meta.description && !hasProps) return "";

  const sentences = [`${componentName} component.`];
  if (meta.description) sentences.push(meta.description);

  if (hasProps) {
    const rendered = [];
    for (const prop of meta.props) {
      const typePart = prop.type ? ` (${prop.type})` : "";
      const defaultPart = prop.default ? ` default: ${prop.default}` : "";
      const suffix = typePart + defaultPart;
      rendered.push(suffix ? prop.name + suffix : prop.name);
    }
    sentences.push(`Props: ${rendered.join(", ")}.`);
  }
  return sentences.join(" ");
}
|
|
19775
21414
|
/**
 * Reduce Svelte source to its plain text content.
 *
 * Applies, in order: removal of `<script>` and `<style>` blocks, replacement
 * of remaining tags and `{...}` expressions with a space, and collapsing of
 * whitespace runs; the result is trimmed.
 *
 * @param {string} source - Svelte component or page source.
 * @returns {string} Single-line text with markup and expressions removed.
 */
function normalizeSvelteToMarkdown(source) {
  // Order matters: script/style blocks must go before the generic tag pass.
  const passes = [
    [/<script[\s\S]*?<\/script>/g, ""],
    [/<style[\s\S]*?<\/style>/g, ""],
    [/<[^>]+>/g, " "],
    [/\{[^}]+\}/g, " "],
    [/\s+/g, " "]
  ];
  let text = source;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
@@ -19790,13 +21429,27 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
19790
21429
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
19791
21430
|
const pages = [];
|
|
19792
21431
|
for (const filePath of selected) {
|
|
19793
|
-
const raw = await
|
|
19794
|
-
|
|
21432
|
+
const raw = await fs9.readFile(filePath, "utf8");
|
|
21433
|
+
let markdown;
|
|
21434
|
+
let tags;
|
|
21435
|
+
if (filePath.endsWith(".md")) {
|
|
21436
|
+
markdown = raw;
|
|
21437
|
+
} else if (isSvelteComponentFile(filePath)) {
|
|
21438
|
+
const componentName = path.basename(filePath, ".svelte");
|
|
21439
|
+
const meta = extractSvelteComponentMeta(raw);
|
|
21440
|
+
const componentMarkdown = buildComponentMarkdown(componentName, meta);
|
|
21441
|
+
const templateContent = normalizeSvelteToMarkdown(raw);
|
|
21442
|
+
markdown = componentMarkdown ? [componentMarkdown, templateContent].filter(Boolean).join("\n\n") : templateContent;
|
|
21443
|
+
tags = ["component"];
|
|
21444
|
+
} else {
|
|
21445
|
+
markdown = normalizeSvelteToMarkdown(raw);
|
|
21446
|
+
}
|
|
19795
21447
|
pages.push({
|
|
19796
21448
|
url: filePathToUrl(filePath, baseDir),
|
|
19797
21449
|
markdown,
|
|
19798
21450
|
sourcePath: path.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
19799
|
-
outgoingLinks: []
|
|
21451
|
+
outgoingLinks: [],
|
|
21452
|
+
...tags ? { tags } : {}
|
|
19800
21453
|
});
|
|
19801
21454
|
}
|
|
19802
21455
|
return pages;
|
|
@@ -19926,7 +21579,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19926
21579
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
19927
21580
|
const pages = [];
|
|
19928
21581
|
for (const filePath of selected) {
|
|
19929
|
-
const html = await
|
|
21582
|
+
const html = await fs9.readFile(filePath, "utf8");
|
|
19930
21583
|
pages.push({
|
|
19931
21584
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
19932
21585
|
html,
|
|
@@ -19989,7 +21642,7 @@ function isBlockedByRobots(urlPath, rules3) {
|
|
|
19989
21642
|
}
|
|
19990
21643
|
async function loadRobotsTxtFromDir(dir) {
|
|
19991
21644
|
try {
|
|
19992
|
-
const content = await
|
|
21645
|
+
const content = await fs9.readFile(path.join(dir, "robots.txt"), "utf8");
|
|
19993
21646
|
return parseRobotsTxt(content);
|
|
19994
21647
|
} catch {
|
|
19995
21648
|
return null;
|
|
@@ -20006,6 +21659,81 @@ async function fetchRobotsTxt(baseUrl) {
|
|
|
20006
21659
|
return null;
|
|
20007
21660
|
}
|
|
20008
21661
|
}
|
|
21662
|
+
/**
 * Resolve a page URL against an optional base URL.
 *
 * @param {string} pageUrl - Page URL or path.
 * @param {string | undefined} baseUrl - Base to resolve against; when falsy
 *   the page URL is returned untouched.
 * @returns {string} The absolute `href`, or `pageUrl` unchanged when there is
 *   no base or the `URL` constructor rejects the inputs.
 */
function resolvePageUrl(pageUrl, baseUrl) {
  let resolved = pageUrl;
  if (baseUrl) {
    try {
      resolved = new URL(pageUrl, baseUrl).href;
    } catch {
      // Unparseable base or path: fall back to the page URL as given.
      resolved = pageUrl;
    }
  }
  return resolved;
}
|
|
21670
|
+
/**
 * Build the text of the llms.txt index: a `# title` heading, an optional
 * `> description` quote, and a `## Pages` list of markdown links.
 *
 * Pages whose URL is `/llms.txt` or `/llms-full.txt` are left out. The rest
 * are ordered by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Pages; reads `url`, `title`, `description`,
 *   `depth` and `incomingLinks`.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` (title fallback) and `project.baseUrl`.
 * @returns {string} File content, terminated by a newline.
 */
function generateLlmsTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const summary = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;

  const out = [`# ${heading}`];
  if (summary) {
    out.push("", `> ${summary}`);
  }

  const ordered = pages
    .filter((page) => !(page.url === "/llms.txt" || page.url === "/llms-full.txt"))
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));

  if (ordered.length > 0) {
    out.push("", "## Pages", "");
    for (const page of ordered) {
      const link = `- [${page.title}](${resolvePageUrl(page.url, baseUrl)})`;
      out.push(page.description ? `${link}: ${page.description}` : link);
    }
  }

  out.push("");
  return out.join("\n");
}
|
|
21699
|
+
/**
 * Build the text of the full llms export: a `# title` heading, an optional
 * `> description` quote, then one `---`-separated section per page holding a
 * linked `##` heading followed by the page's trimmed markdown.
 *
 * Pages whose URL is `/llms.txt` or `/llms-full.txt` are left out. The rest
 * are ordered by ascending `depth`, then by descending `incomingLinks`.
 *
 * @param {Array<object>} pages - Pages; reads `url`, `title`, `markdown`,
 *   `depth` and `incomingLinks`.
 * @param {object} config - Reads `llmsTxt.title`, `llmsTxt.description`,
 *   `project.id` (title fallback) and `project.baseUrl`.
 * @returns {string} File content, terminated by a newline.
 */
function generateLlmsFullTxt(pages, config) {
  const heading = config.llmsTxt.title ?? config.project.id;
  const summary = config.llmsTxt.description;
  const baseUrl = config.project.baseUrl;

  const out = [`# ${heading}`];
  if (summary) {
    out.push("", `> ${summary}`);
  }

  const ordered = pages
    .filter((page) => !(page.url === "/llms.txt" || page.url === "/llms-full.txt"))
    .sort((a, b) => (a.depth !== b.depth ? a.depth - b.depth : b.incomingLinks - a.incomingLinks));

  for (const page of ordered) {
    const href = resolvePageUrl(page.url, baseUrl);
    out.push("", "---", "", `## [${page.title}](${href})`, "", page.markdown.trim());
  }

  out.push("");
  return out.join("\n");
}
|
|
21722
|
+
/**
 * Write the llms.txt index and, when `config.llmsTxt.generateFull` is set,
 * the companion "-full" file next to it.
 *
 * The companion path is derived by inserting `-full` before the extension of
 * the resolved output path (`llms.txt` -> `llms-full.txt`). Deriving it from
 * the parsed path, rather than by replacing a literal `.txt` suffix, keeps
 * the two paths distinct even when the configured output has another
 * extension or none, so the full export can never overwrite the index.
 *
 * @param {Array<object>} pages - Pages passed through to the generators.
 * @param {object} config - Reads `llmsTxt.outputPath` and `llmsTxt.generateFull`.
 * @param {string} cwd - Directory that `llmsTxt.outputPath` is resolved against.
 * @param {{ info(message: string): void }} logger3 - Receives one message per file written.
 * @returns {Promise<void>}
 */
async function writeLlmsTxt(pages, config, cwd, logger3) {
  const outputPath = path.resolve(cwd, config.llmsTxt.outputPath);
  await fs9.mkdir(path.dirname(outputPath), { recursive: true });
  await fs9.writeFile(outputPath, generateLlmsTxt(pages, config), "utf8");
  logger3.info(`Generated llms.txt at ${config.llmsTxt.outputPath}`);
  if (!config.llmsTxt.generateFull) return;

  const { dir, name, ext } = path.parse(outputPath);
  const fullPath = path.join(dir, `${name}-full${ext}`);
  await fs9.writeFile(fullPath, generateLlmsFullTxt(pages, config), "utf8");
  logger3.info(`Generated llms-full.txt at ${path.relative(cwd, fullPath)}`);
}
|
|
20009
21737
|
|
|
20010
21738
|
// src/indexing/pipeline.ts
|
|
20011
21739
|
function buildPageSummary(page, maxChars = 3500) {
|
|
@@ -20024,16 +21752,33 @@ function buildPageSummary(page, maxChars = 3500) {
|
|
|
20024
21752
|
if (joined.length <= maxChars) return joined;
|
|
20025
21753
|
return joined.slice(0, maxChars).trim();
|
|
20026
21754
|
}
|
|
21755
|
+
/**
 * Compute a content hash over the page fields joined with `|`.
 *
 * Keywords, tags and outgoing link URLs are sorted before joining so their
 * order does not affect the hash. `meta` is serialized with object keys
 * sorted at every nesting level: passing the top-level key list as a
 * `JSON.stringify` replacer would also filter nested objects by that list,
 * silently dropping nested properties so edits to them would not change the
 * hash. For metadata without nested objects the serialized text is the same
 * as with that replacer.
 *
 * @param {object} page - Reads `title`, `description`, `keywords`, `tags`,
 *   `markdown`, `outgoingLinks`, `publishedAt`, `incomingAnchorText`,
 *   `outgoingLinkUrls` and `meta`.
 * @returns {string} Result of `sha256` over the joined parts.
 */
function buildPageContentHash(page) {
  // JSON text with sorted object keys, following JSON.stringify's rules for
  // primitives, `toJSON` values, and undefined members/array slots.
  const stableStringify = (value) => {
    if (value === null || typeof value !== "object" || typeof value.toJSON === "function") {
      return JSON.stringify(value);
    }
    if (Array.isArray(value)) {
      return `[${value.map((item) => stableStringify(item) ?? "null").join(",")}]`;
    }
    const members = [];
    for (const key of Object.keys(value).sort()) {
      const serialized = stableStringify(value[key]);
      if (serialized !== void 0) members.push(`${JSON.stringify(key)}:${serialized}`);
    }
    return `{${members.join(",")}}`;
  };
  const sortedList = (items) => (items ?? []).slice().sort().join(",");
  const parts = [
    page.title,
    page.description ?? "",
    sortedList(page.keywords),
    page.tags.slice().sort().join(","),
    page.markdown,
    String(page.outgoingLinks),
    String(page.publishedAt ?? ""),
    page.incomingAnchorText ?? "",
    sortedList(page.outgoingLinkUrls),
    page.meta ? stableStringify(page.meta) : ""
  ];
  return sha256(parts.join("|"));
}
|
|
20027
21770
|
var IndexPipeline = class _IndexPipeline {
|
|
20028
21771
|
cwd;
|
|
20029
21772
|
config;
|
|
20030
21773
|
store;
|
|
20031
21774
|
logger;
|
|
21775
|
+
hooks;
|
|
20032
21776
|
constructor(options) {
|
|
20033
21777
|
this.cwd = options.cwd;
|
|
20034
21778
|
this.config = options.config;
|
|
20035
21779
|
this.store = options.store;
|
|
20036
21780
|
this.logger = options.logger;
|
|
21781
|
+
this.hooks = options.hooks;
|
|
20037
21782
|
}
|
|
20038
21783
|
static async create(options = {}) {
|
|
20039
21784
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
@@ -20043,7 +21788,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20043
21788
|
cwd,
|
|
20044
21789
|
config,
|
|
20045
21790
|
store,
|
|
20046
|
-
logger: options.logger ?? new Logger()
|
|
21791
|
+
logger: options.logger ?? new Logger(),
|
|
21792
|
+
hooks: options.hooks ?? {}
|
|
20047
21793
|
});
|
|
20048
21794
|
}
|
|
20049
21795
|
getConfig() {
|
|
@@ -20064,7 +21810,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20064
21810
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20065
21811
|
ensureStateDirs(this.cwd, this.config.state.dir);
|
|
20066
21812
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20067
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-
|
|
21813
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-vector)`);
|
|
20068
21814
|
if (options.force) {
|
|
20069
21815
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20070
21816
|
}
|
|
@@ -20072,9 +21818,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20072
21818
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
20073
21819
|
}
|
|
20074
21820
|
const manifestStart = stageStart();
|
|
20075
|
-
const
|
|
21821
|
+
const existingPageHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getPageHashes(scope);
|
|
20076
21822
|
stageEnd("manifest", manifestStart);
|
|
20077
|
-
this.logger.debug(`Manifest: ${
|
|
21823
|
+
this.logger.debug(`Manifest: ${existingPageHashes.size} existing page hashes loaded`);
|
|
20078
21824
|
const sourceStart = stageStart();
|
|
20079
21825
|
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20080
21826
|
let sourcePages;
|
|
@@ -20151,11 +21897,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20151
21897
|
);
|
|
20152
21898
|
continue;
|
|
20153
21899
|
}
|
|
20154
|
-
|
|
21900
|
+
if (sourcePage.tags && sourcePage.tags.length > 0) {
|
|
21901
|
+
extracted.tags = [.../* @__PURE__ */ new Set([...extracted.tags, ...sourcePage.tags])];
|
|
21902
|
+
}
|
|
21903
|
+
let accepted;
|
|
21904
|
+
if (this.hooks.transformPage) {
|
|
21905
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21906
|
+
if (transformed === null) {
|
|
21907
|
+
this.logger.debug(`Page ${sourcePage.url} skipped by transformPage hook`);
|
|
21908
|
+
continue;
|
|
21909
|
+
}
|
|
21910
|
+
accepted = transformed;
|
|
21911
|
+
} else {
|
|
21912
|
+
accepted = extracted;
|
|
21913
|
+
}
|
|
21914
|
+
extractedPages.push(accepted);
|
|
20155
21915
|
this.logger.event("page_extracted", {
|
|
20156
|
-
url:
|
|
21916
|
+
url: accepted.url
|
|
20157
21917
|
});
|
|
20158
21918
|
}
|
|
21919
|
+
const customRecords = options.customRecords ?? [];
|
|
21920
|
+
if (customRecords.length > 0) {
|
|
21921
|
+
this.logger.info(`Processing ${customRecords.length} custom record${customRecords.length === 1 ? "" : "s"}...`);
|
|
21922
|
+
for (const record of customRecords) {
|
|
21923
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
21924
|
+
const normalized = normalizeMarkdown(record.content);
|
|
21925
|
+
if (!normalized.trim()) {
|
|
21926
|
+
this.logger.warn(`Custom record ${normalizedUrl} has empty content and was skipped.`);
|
|
21927
|
+
continue;
|
|
21928
|
+
}
|
|
21929
|
+
const urlTags = normalizedUrl.split("/").filter(Boolean).slice(0, 1);
|
|
21930
|
+
const tags = record.tags ? [.../* @__PURE__ */ new Set([...urlTags, ...record.tags])] : urlTags;
|
|
21931
|
+
const extracted = {
|
|
21932
|
+
url: normalizedUrl,
|
|
21933
|
+
title: record.title,
|
|
21934
|
+
markdown: normalized,
|
|
21935
|
+
outgoingLinks: [],
|
|
21936
|
+
noindex: false,
|
|
21937
|
+
tags,
|
|
21938
|
+
weight: record.weight
|
|
21939
|
+
};
|
|
21940
|
+
let accepted;
|
|
21941
|
+
if (this.hooks.transformPage) {
|
|
21942
|
+
const transformed = await this.hooks.transformPage(extracted);
|
|
21943
|
+
if (transformed === null) {
|
|
21944
|
+
this.logger.debug(`Custom record ${normalizedUrl} skipped by transformPage hook`);
|
|
21945
|
+
continue;
|
|
21946
|
+
}
|
|
21947
|
+
accepted = transformed;
|
|
21948
|
+
} else {
|
|
21949
|
+
accepted = extracted;
|
|
21950
|
+
}
|
|
21951
|
+
extractedPages.push(accepted);
|
|
21952
|
+
this.logger.event("page_extracted", { url: accepted.url, custom: true });
|
|
21953
|
+
}
|
|
21954
|
+
}
|
|
20159
21955
|
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
20160
21956
|
const uniquePages = [];
|
|
20161
21957
|
const seenUrls = /* @__PURE__ */ new Set();
|
|
@@ -20188,15 +21984,28 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20188
21984
|
const linkStart = stageStart();
|
|
20189
21985
|
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20190
21986
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
21987
|
+
const incomingAnchorTexts = /* @__PURE__ */ new Map();
|
|
20191
21988
|
for (const page of indexablePages) {
|
|
20192
21989
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20193
21990
|
}
|
|
20194
21991
|
for (const page of indexablePages) {
|
|
20195
|
-
|
|
21992
|
+
const seenForCount = /* @__PURE__ */ new Set();
|
|
21993
|
+
const seenForAnchor = /* @__PURE__ */ new Set();
|
|
21994
|
+
for (const { url: outgoing, anchorText } of page.outgoingLinks) {
|
|
20196
21995
|
if (!pageSet.has(outgoing)) {
|
|
20197
21996
|
continue;
|
|
20198
21997
|
}
|
|
20199
|
-
|
|
21998
|
+
if (!seenForCount.has(outgoing)) {
|
|
21999
|
+
seenForCount.add(outgoing);
|
|
22000
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
22001
|
+
}
|
|
22002
|
+
if (anchorText && !seenForAnchor.has(outgoing)) {
|
|
22003
|
+
seenForAnchor.add(outgoing);
|
|
22004
|
+
if (!incomingAnchorTexts.has(outgoing)) {
|
|
22005
|
+
incomingAnchorTexts.set(outgoing, /* @__PURE__ */ new Set());
|
|
22006
|
+
}
|
|
22007
|
+
incomingAnchorTexts.get(outgoing).add(anchorText);
|
|
22008
|
+
}
|
|
20200
22009
|
}
|
|
20201
22010
|
}
|
|
20202
22011
|
stageEnd("links", linkStart);
|
|
@@ -20215,6 +22024,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20215
22024
|
});
|
|
20216
22025
|
}
|
|
20217
22026
|
}
|
|
22027
|
+
for (const record of customRecords) {
|
|
22028
|
+
const normalizedUrl = normalizeUrlPath(record.url);
|
|
22029
|
+
if (!precomputedRoutes.has(normalizedUrl)) {
|
|
22030
|
+
precomputedRoutes.set(normalizedUrl, {
|
|
22031
|
+
routeFile: "",
|
|
22032
|
+
routeResolution: "exact"
|
|
22033
|
+
});
|
|
22034
|
+
}
|
|
22035
|
+
}
|
|
20218
22036
|
for (const page of indexablePages) {
|
|
20219
22037
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20220
22038
|
if (routeMatch.routeResolution === "best-effort") {
|
|
@@ -20232,6 +22050,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20232
22050
|
} else {
|
|
20233
22051
|
routeExact += 1;
|
|
20234
22052
|
}
|
|
22053
|
+
const anchorSet = incomingAnchorTexts.get(page.url);
|
|
22054
|
+
let incomingAnchorText;
|
|
22055
|
+
if (anchorSet && anchorSet.size > 0) {
|
|
22056
|
+
let joined = "";
|
|
22057
|
+
for (const phrase of anchorSet) {
|
|
22058
|
+
const next2 = joined ? `${joined} ${phrase}` : phrase;
|
|
22059
|
+
if (next2.length > 500) break;
|
|
22060
|
+
joined = next2;
|
|
22061
|
+
}
|
|
22062
|
+
incomingAnchorText = joined || void 0;
|
|
22063
|
+
}
|
|
20235
22064
|
const indexedPage = {
|
|
20236
22065
|
url: page.url,
|
|
20237
22066
|
title: page.title,
|
|
@@ -20241,40 +22070,113 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20241
22070
|
generatedAt: nowIso(),
|
|
20242
22071
|
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
20243
22072
|
outgoingLinks: page.outgoingLinks.length,
|
|
22073
|
+
outgoingLinkUrls: page.outgoingLinks.map((l) => typeof l === "string" ? l : l.url),
|
|
20244
22074
|
depth: getUrlDepth(page.url),
|
|
20245
22075
|
tags: page.tags,
|
|
20246
22076
|
markdown: page.markdown,
|
|
20247
22077
|
description: page.description,
|
|
20248
|
-
keywords: page.keywords
|
|
22078
|
+
keywords: page.keywords,
|
|
22079
|
+
publishedAt: page.publishedAt,
|
|
22080
|
+
incomingAnchorText,
|
|
22081
|
+
meta: page.meta
|
|
20249
22082
|
};
|
|
20250
22083
|
pages.push(indexedPage);
|
|
20251
22084
|
this.logger.event("page_indexed", { url: page.url });
|
|
20252
22085
|
}
|
|
22086
|
+
const pageRecords = pages.map((p) => {
|
|
22087
|
+
const summary = buildPageSummary(p);
|
|
22088
|
+
return {
|
|
22089
|
+
url: p.url,
|
|
22090
|
+
title: p.title,
|
|
22091
|
+
markdown: p.markdown,
|
|
22092
|
+
projectId: scope.projectId,
|
|
22093
|
+
scopeName: scope.scopeName,
|
|
22094
|
+
routeFile: p.routeFile,
|
|
22095
|
+
routeResolution: p.routeResolution,
|
|
22096
|
+
incomingLinks: p.incomingLinks,
|
|
22097
|
+
outgoingLinks: p.outgoingLinks,
|
|
22098
|
+
outgoingLinkUrls: p.outgoingLinkUrls,
|
|
22099
|
+
depth: p.depth,
|
|
22100
|
+
tags: p.tags,
|
|
22101
|
+
indexedAt: p.generatedAt,
|
|
22102
|
+
summary,
|
|
22103
|
+
description: p.description,
|
|
22104
|
+
keywords: p.keywords,
|
|
22105
|
+
contentHash: buildPageContentHash(p),
|
|
22106
|
+
publishedAt: p.publishedAt,
|
|
22107
|
+
meta: p.meta
|
|
22108
|
+
};
|
|
22109
|
+
});
|
|
22110
|
+
const currentPageUrls = new Set(pageRecords.map((r) => r.url));
|
|
22111
|
+
const changedPages = pageRecords.filter(
|
|
22112
|
+
(r) => !existingPageHashes.has(r.url) || existingPageHashes.get(r.url) !== r.contentHash
|
|
22113
|
+
);
|
|
22114
|
+
const deletedPageUrls = [...existingPageHashes.keys()].filter((url) => !currentPageUrls.has(url));
|
|
20253
22115
|
if (!options.dryRun) {
|
|
20254
|
-
|
|
20255
|
-
|
|
20256
|
-
|
|
20257
|
-
|
|
20258
|
-
|
|
20259
|
-
|
|
20260
|
-
|
|
20261
|
-
|
|
20262
|
-
|
|
20263
|
-
|
|
20264
|
-
|
|
20265
|
-
|
|
20266
|
-
|
|
20267
|
-
|
|
20268
|
-
|
|
20269
|
-
|
|
20270
|
-
|
|
20271
|
-
|
|
20272
|
-
|
|
20273
|
-
|
|
20274
|
-
|
|
20275
|
-
|
|
22116
|
+
if (options.force) {
|
|
22117
|
+
await this.store.deletePages(scope);
|
|
22118
|
+
this.logger.info(`Upserting ${pageRecords.length} page summaries...`);
|
|
22119
|
+
const pageDocs = pageRecords.map((r) => ({
|
|
22120
|
+
id: r.url,
|
|
22121
|
+
data: r.summary ?? r.title,
|
|
22122
|
+
metadata: {
|
|
22123
|
+
title: r.title,
|
|
22124
|
+
url: r.url,
|
|
22125
|
+
description: r.description ?? "",
|
|
22126
|
+
keywords: r.keywords ?? [],
|
|
22127
|
+
summary: r.summary ?? "",
|
|
22128
|
+
tags: r.tags,
|
|
22129
|
+
markdown: r.markdown,
|
|
22130
|
+
routeFile: r.routeFile,
|
|
22131
|
+
routeResolution: r.routeResolution,
|
|
22132
|
+
incomingLinks: r.incomingLinks,
|
|
22133
|
+
outgoingLinks: r.outgoingLinks,
|
|
22134
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22135
|
+
depth: r.depth,
|
|
22136
|
+
indexedAt: r.indexedAt,
|
|
22137
|
+
contentHash: r.contentHash ?? "",
|
|
22138
|
+
publishedAt: r.publishedAt ?? null,
|
|
22139
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22140
|
+
}
|
|
22141
|
+
}));
|
|
22142
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22143
|
+
} else {
|
|
22144
|
+
if (changedPages.length > 0) {
|
|
22145
|
+
this.logger.info(`Upserting ${changedPages.length} changed page summaries...`);
|
|
22146
|
+
const pageDocs = changedPages.map((r) => ({
|
|
22147
|
+
id: r.url,
|
|
22148
|
+
data: r.summary ?? r.title,
|
|
22149
|
+
metadata: {
|
|
22150
|
+
title: r.title,
|
|
22151
|
+
url: r.url,
|
|
22152
|
+
description: r.description ?? "",
|
|
22153
|
+
keywords: r.keywords ?? [],
|
|
22154
|
+
summary: r.summary ?? "",
|
|
22155
|
+
tags: r.tags,
|
|
22156
|
+
markdown: r.markdown,
|
|
22157
|
+
routeFile: r.routeFile,
|
|
22158
|
+
routeResolution: r.routeResolution,
|
|
22159
|
+
incomingLinks: r.incomingLinks,
|
|
22160
|
+
outgoingLinks: r.outgoingLinks,
|
|
22161
|
+
outgoingLinkUrls: r.outgoingLinkUrls ?? [],
|
|
22162
|
+
depth: r.depth,
|
|
22163
|
+
indexedAt: r.indexedAt,
|
|
22164
|
+
contentHash: r.contentHash ?? "",
|
|
22165
|
+
publishedAt: r.publishedAt ?? null,
|
|
22166
|
+
...r.meta && Object.keys(r.meta).length > 0 ? { meta: r.meta } : {}
|
|
22167
|
+
}
|
|
22168
|
+
}));
|
|
22169
|
+
await this.store.upsertPages(pageDocs, scope);
|
|
22170
|
+
}
|
|
22171
|
+
if (deletedPageUrls.length > 0) {
|
|
22172
|
+
await this.store.deletePagesByIds(deletedPageUrls, scope);
|
|
22173
|
+
}
|
|
22174
|
+
}
|
|
20276
22175
|
}
|
|
22176
|
+
const pagesChanged = options.force ? pageRecords.length : changedPages.length;
|
|
22177
|
+
const pagesDeleted = deletedPageUrls.length;
|
|
20277
22178
|
stageEnd("pages", pagesStart);
|
|
22179
|
+
this.logger.info(`Page changes: ${pagesChanged} changed/new, ${pagesDeleted} deleted, ${pageRecords.length - changedPages.length} unchanged`);
|
|
20278
22180
|
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
20279
22181
|
const chunkStart = stageStart();
|
|
20280
22182
|
this.logger.info("Chunking pages...");
|
|
@@ -20283,6 +22185,18 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20283
22185
|
if (typeof maxChunks === "number") {
|
|
20284
22186
|
chunks = chunks.slice(0, maxChunks);
|
|
20285
22187
|
}
|
|
22188
|
+
if (this.hooks.transformChunk) {
|
|
22189
|
+
const transformed = [];
|
|
22190
|
+
for (const chunk of chunks) {
|
|
22191
|
+
const result = await this.hooks.transformChunk(chunk);
|
|
22192
|
+
if (result === null) {
|
|
22193
|
+
this.logger.debug(`Chunk ${chunk.chunkKey} skipped by transformChunk hook`);
|
|
22194
|
+
continue;
|
|
22195
|
+
}
|
|
22196
|
+
transformed.push(result);
|
|
22197
|
+
}
|
|
22198
|
+
chunks = transformed;
|
|
22199
|
+
}
|
|
20286
22200
|
for (const chunk of chunks) {
|
|
20287
22201
|
this.logger.event("chunked", {
|
|
20288
22202
|
url: chunk.url,
|
|
@@ -20295,7 +22209,12 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20295
22209
|
for (const chunk of chunks) {
|
|
20296
22210
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
20297
22211
|
}
|
|
20298
|
-
const
|
|
22212
|
+
const chunkHashStart = stageStart();
|
|
22213
|
+
const currentChunkKeys = chunks.map((c) => c.chunkKey);
|
|
22214
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.fetchContentHashesForKeys(currentChunkKeys, scope);
|
|
22215
|
+
stageEnd("chunk_hashes", chunkHashStart);
|
|
22216
|
+
this.logger.debug(`Fetched ${existingHashes.size} existing chunk hashes for ${currentChunkKeys.length} current keys`);
|
|
22217
|
+
let changedChunks = chunks.filter((chunk) => {
|
|
20299
22218
|
if (options.force) {
|
|
20300
22219
|
return true;
|
|
20301
22220
|
}
|
|
@@ -20308,37 +22227,45 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20308
22227
|
}
|
|
20309
22228
|
return existingHash !== chunk.contentHash;
|
|
20310
22229
|
});
|
|
20311
|
-
const
|
|
22230
|
+
const existingChunkIds = options.force ? /* @__PURE__ */ new Set() : await this.store.scanChunkIds(scope);
|
|
22231
|
+
const deletes = [...existingChunkIds].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
22232
|
+
if (this.hooks.beforeIndex) {
|
|
22233
|
+
changedChunks = await this.hooks.beforeIndex(changedChunks);
|
|
22234
|
+
}
|
|
20312
22235
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20313
22236
|
const upsertStart = stageStart();
|
|
20314
22237
|
let documentsUpserted = 0;
|
|
20315
22238
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20316
|
-
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash
|
|
20317
|
-
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
22239
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Vector...`);
|
|
20318
22240
|
const docs = changedChunks.map((chunk) => {
|
|
20319
|
-
const
|
|
20320
|
-
|
|
20321
|
-
|
|
20322
|
-
|
|
20323
|
-
|
|
20324
|
-
|
|
20325
|
-
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
20326
|
-
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
22241
|
+
const embeddingText = buildEmbeddingText(chunk, this.config.chunking.prependTitle);
|
|
22242
|
+
if (embeddingText.length > 2e3) {
|
|
22243
|
+
this.logger.warn(
|
|
22244
|
+
`Chunk ${chunk.chunkKey} text is ${embeddingText.length} chars (~${Math.round(embeddingText.length / 4)} tokens), which may exceed the 512-token model limit and be silently truncated.`
|
|
22245
|
+
);
|
|
22246
|
+
}
|
|
20327
22247
|
return {
|
|
20328
22248
|
id: chunk.chunkKey,
|
|
20329
|
-
|
|
22249
|
+
data: embeddingText,
|
|
20330
22250
|
metadata: {
|
|
20331
|
-
|
|
20332
|
-
scopeName: scope.scopeName,
|
|
22251
|
+
url: chunk.url,
|
|
20333
22252
|
path: chunk.path,
|
|
22253
|
+
title: chunk.title,
|
|
22254
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
22255
|
+
headingPath: chunk.headingPath.join(" > "),
|
|
20334
22256
|
snippet: chunk.snippet,
|
|
22257
|
+
chunkText: embeddingText,
|
|
22258
|
+
tags: chunk.tags,
|
|
20335
22259
|
ordinal: chunk.ordinal,
|
|
20336
22260
|
contentHash: chunk.contentHash,
|
|
20337
22261
|
depth: chunk.depth,
|
|
20338
22262
|
incomingLinks: chunk.incomingLinks,
|
|
20339
22263
|
routeFile: chunk.routeFile,
|
|
20340
22264
|
description: chunk.description ?? "",
|
|
20341
|
-
keywords:
|
|
22265
|
+
keywords: chunk.keywords ?? [],
|
|
22266
|
+
publishedAt: chunk.publishedAt ?? null,
|
|
22267
|
+
incomingAnchorText: chunk.incomingAnchorText ?? "",
|
|
22268
|
+
...chunk.meta && Object.keys(chunk.meta).length > 0 ? { meta: chunk.meta } : {}
|
|
20342
22269
|
}
|
|
20343
22270
|
};
|
|
20344
22271
|
});
|
|
@@ -20356,9 +22283,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20356
22283
|
} else {
|
|
20357
22284
|
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
20358
22285
|
}
|
|
22286
|
+
if (this.config.llmsTxt.enable && !options.dryRun) {
|
|
22287
|
+
const llmsStart = stageStart();
|
|
22288
|
+
await writeLlmsTxt(pages, this.config, this.cwd, this.logger);
|
|
22289
|
+
stageEnd("llms_txt", llmsStart);
|
|
22290
|
+
}
|
|
20359
22291
|
this.logger.info("Done.");
|
|
20360
|
-
|
|
22292
|
+
const stats = {
|
|
20361
22293
|
pagesProcessed: pages.length,
|
|
22294
|
+
pagesChanged,
|
|
22295
|
+
pagesDeleted,
|
|
20362
22296
|
chunksTotal: chunks.length,
|
|
20363
22297
|
chunksChanged: changedChunks.length,
|
|
20364
22298
|
documentsUpserted,
|
|
@@ -20367,6 +22301,10 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20367
22301
|
routeBestEffort,
|
|
20368
22302
|
stageTimingsMs
|
|
20369
22303
|
};
|
|
22304
|
+
if (this.hooks.afterIndex) {
|
|
22305
|
+
await this.hooks.afterIndex(stats);
|
|
22306
|
+
}
|
|
22307
|
+
return stats;
|
|
20370
22308
|
}
|
|
20371
22309
|
};
|
|
20372
22310
|
|
|
@@ -20388,9 +22326,6 @@ function shouldRunAutoIndex(options) {
|
|
|
20388
22326
|
if (explicit && /^(1|true|yes)$/i.test(explicit)) {
|
|
20389
22327
|
return true;
|
|
20390
22328
|
}
|
|
20391
|
-
if (process.env.CI && /^(1|true)$/i.test(process.env.CI)) {
|
|
20392
|
-
return true;
|
|
20393
|
-
}
|
|
20394
22329
|
return false;
|
|
20395
22330
|
}
|
|
20396
22331
|
function searchsocketVitePlugin(options = {}) {
|
|
@@ -20415,7 +22350,8 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20415
22350
|
const pipeline = await IndexPipeline.create({
|
|
20416
22351
|
cwd,
|
|
20417
22352
|
configPath: options.configPath,
|
|
20418
|
-
logger: logger3
|
|
22353
|
+
logger: logger3,
|
|
22354
|
+
hooks: options.hooks
|
|
20419
22355
|
});
|
|
20420
22356
|
const stats = await pipeline.run({
|
|
20421
22357
|
changedOnly: options.changedOnly ?? true,
|