searchsocket 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +848 -0
- package/dist/cli.js +3860 -0
- package/dist/cli.js.map +1 -0
- package/dist/client.cjs +36 -0
- package/dist/client.cjs.map +1 -0
- package/dist/client.d.cts +11 -0
- package/dist/client.d.ts +11 -0
- package/dist/client.js +34 -0
- package/dist/client.js.map +1 -0
- package/dist/index.cjs +20767 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +119 -0
- package/dist/index.d.ts +119 -0
- package/dist/index.js +20742 -0
- package/dist/index.js.map +1 -0
- package/dist/sveltekit.cjs +20578 -0
- package/dist/sveltekit.cjs.map +1 -0
- package/dist/sveltekit.d.cts +37 -0
- package/dist/sveltekit.d.ts +37 -0
- package/dist/sveltekit.js +20563 -0
- package/dist/sveltekit.js.map +1 -0
- package/dist/types-D1K46vwd.d.cts +403 -0
- package/dist/types-D1K46vwd.d.ts +403 -0
- package/package.json +86 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,3860 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/cli.ts
|
|
4
|
+
import fs9 from "fs";
|
|
5
|
+
import fsp from "fs/promises";
|
|
6
|
+
import path13 from "path";
|
|
7
|
+
import { execSync as execSync2 } from "child_process";
|
|
8
|
+
import { config as dotenvConfig } from "dotenv";
|
|
9
|
+
import chokidar from "chokidar";
|
|
10
|
+
import { Command } from "commander";
|
|
11
|
+
|
|
12
|
+
// package.json
|
|
13
|
+
// Package manifest data (package.json) embedded as a plain object literal.
var package_default = {
  name: "searchsocket",
  version: "0.2.0",
  description: "Semantic site search and MCP retrieval for SvelteKit static sites",
  license: "MIT",
  author: "Greg Priday <greg@siteorigin.com>",
  repository: {
    type: "git",
    url: "https://github.com/gregpriday/searchsocket.git"
  },
  homepage: "https://github.com/gregpriday/searchsocket",
  bugs: {
    url: "https://github.com/gregpriday/searchsocket/issues"
  },
  keywords: [
    "search",
    "semantic-search",
    "sveltekit",
    "mcp",
    "embeddings",
    "vector-search",
    "site-search",
    "static-site"
  ],
  type: "module",
  files: [
    "dist",
    "README.md"
  ],
  bin: {
    searchsocket: "dist/cli.js"
  },
  // Entry points: root library, SvelteKit integration, and client helper.
  exports: {
    ".": {
      types: "./dist/index.d.ts",
      import: "./dist/index.js",
      require: "./dist/index.cjs"
    },
    "./sveltekit": {
      types: "./dist/sveltekit.d.ts",
      import: "./dist/sveltekit.js",
      require: "./dist/sveltekit.cjs"
    },
    "./client": {
      types: "./dist/client.d.ts",
      import: "./dist/client.js",
      require: "./dist/client.cjs"
    }
  },
  scripts: {
    build: "tsup",
    clean: "rm -rf dist",
    typecheck: "tsc --noEmit",
    test: "vitest run",
    "test:watch": "vitest"
  },
  engines: {
    node: ">=20"
  },
  packageManager: "pnpm@10.29.2",
  dependencies: {
    "@libsql/client": "^0.17.0",
    "@modelcontextprotocol/sdk": "^1.26.0",
    cheerio: "^1.2.0",
    chokidar: "^5.0.0",
    commander: "^14.0.3",
    dotenv: "^17.3.1",
    express: "^5.2.1",
    "fast-glob": "^3.3.3",
    "gray-matter": "^4.0.3",
    jiti: "^2.6.1",
    openai: "^6.19.0",
    "p-limit": "^7.3.0",
    turndown: "^7.2.2",
    "turndown-plugin-gfm": "^1.0.2",
    zod: "^4.3.6"
  },
  devDependencies: {
    "@types/express": "^5.0.6",
    "@types/node": "^25.2.2",
    "@types/turndown": "^5.0.6",
    tsup: "^8.5.1",
    typescript: "^5.9.3",
    vitest: "^4.0.18"
  }
};
|
|
99
|
+
|
|
100
|
+
// src/config/load.ts
|
|
101
|
+
import fs from "fs";
|
|
102
|
+
import path from "path";
|
|
103
|
+
import { createJiti } from "jiti";
|
|
104
|
+
|
|
105
|
+
// src/config/schema.ts
|
|
106
|
+
import { z } from "zod";
|
|
107
|
+
// Validation schema for the user-supplied configuration object.
// Every top-level section is optional, and so is nearly every field inside
// one. The only required fields are `source.crawl.baseUrl` and
// `source.contentFiles.globs`, and only when their parent section is present.
var searchSocketConfigSchema = z.object({
  // Project identity.
  project: z.object({
    id: z.string().min(1).optional(),
    baseUrl: z.string().url().optional()
  }).optional(),

  // How the index scope name is chosen.
  scope: z.object({
    mode: z.enum(["fixed", "git", "env"]).optional(),
    fixed: z.string().min(1).optional(),
    envVar: z.string().min(1).optional(),
    sanitize: z.boolean().optional()
  }).optional(),

  // Where page content comes from.
  source: z.object({
    mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
    staticOutputDir: z.string().min(1).optional(),
    strictRouteMapping: z.boolean().optional(),
    crawl: z.object({
      baseUrl: z.string().url(),
      routes: z.array(z.string()).optional(),
      sitemapUrl: z.string().optional()
    }).optional(),
    contentFiles: z.object({
      globs: z.array(z.string()).min(1),
      baseDir: z.string().optional()
    }).optional(),
    build: z.object({
      outputDir: z.string().min(1).optional(),
      paramValues: z.record(z.string(), z.array(z.string())).optional(),
      exclude: z.array(z.string()).optional(),
      previewTimeout: z.number().int().positive().optional()
    }).optional()
  }).optional(),

  // HTML extraction rules.
  extract: z.object({
    mainSelector: z.string().optional(),
    dropTags: z.array(z.string()).optional(),
    dropSelectors: z.array(z.string()).optional(),
    ignoreAttr: z.string().optional(),
    noindexAttr: z.string().optional(),
    respectRobotsNoindex: z.boolean().optional()
  }).optional(),

  // Output transformation ("markdown" is the only accepted output).
  transform: z.object({
    output: z.literal("markdown").optional(),
    preserveCodeBlocks: z.boolean().optional(),
    preserveTables: z.boolean().optional()
  }).optional(),

  // Chunking ("hybrid" is the only accepted strategy).
  chunking: z.object({
    strategy: z.literal("hybrid").optional(),
    maxChars: z.number().int().positive().optional(),
    overlapChars: z.number().int().nonnegative().optional(),
    minChars: z.number().int().positive().optional(),
    headingPathDepth: z.number().int().positive().optional(),
    dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
    prependTitle: z.boolean().optional(),
    pageSummaryChunk: z.boolean().optional()
  }).optional(),

  // Embeddings ("openai" is the only accepted provider).
  embeddings: z.object({
    provider: z.literal("openai").optional(),
    model: z.string().min(1).optional(),
    apiKeyEnv: z.string().min(1).optional(),
    batchSize: z.number().int().positive().optional(),
    concurrency: z.number().int().positive().optional(),
    pricePer1kTokens: z.number().positive().optional()
  }).optional(),

  // Vector storage.
  vector: z.object({
    dimension: z.number().int().positive().optional(),
    turso: z.object({
      urlEnv: z.string().optional(),
      authTokenEnv: z.string().optional(),
      localPath: z.string().optional()
    }).optional()
  }).optional(),

  // Optional reranking.
  rerank: z.object({
    provider: z.enum(["none", "jina"]).optional(),
    topN: z.number().int().positive().optional(),
    jina: z.object({
      apiKeyEnv: z.string().optional(),
      model: z.string().optional()
    }).optional()
  }).optional(),

  // Ranking boosts and weights.
  ranking: z.object({
    enableIncomingLinkBoost: z.boolean().optional(),
    enableDepthBoost: z.boolean().optional(),
    pageWeights: z.record(z.string(), z.number().positive()).optional(),
    aggregationCap: z.number().int().positive().optional(),
    aggregationDecay: z.number().min(0).max(1).optional(),
    minChunkScoreRatio: z.number().min(0).max(1).optional(),
    weights: z.object({
      incomingLinks: z.number().optional(),
      depth: z.number().optional(),
      rerank: z.number().optional(),
      aggregation: z.number().optional()
    }).optional()
  }).optional(),

  // Search API endpoint settings.
  api: z.object({
    path: z.string().optional(),
    cors: z.object({
      allowOrigins: z.array(z.string()).optional()
    }).optional(),
    rateLimit: z.object({
      windowMs: z.number().int().positive().optional(),
      max: z.number().int().positive().optional()
    }).optional()
  }).optional(),

  // MCP server settings.
  mcp: z.object({
    enable: z.boolean().optional(),
    transport: z.enum(["stdio", "http"]).optional(),
    http: z.object({
      port: z.number().int().positive().optional(),
      path: z.string().optional()
    }).optional()
  }).optional(),

  // Local state directory settings.
  state: z.object({
    dir: z.string().optional(),
    writeMirror: z.boolean().optional()
  }).optional()
});
|
|
222
|
+
|
|
223
|
+
// src/config/defaults.ts
|
|
224
|
+
// Selectors removed from pages by default during extraction.
var DEFAULT_DROP_SELECTORS = [
  ".sidebar",
  ".toc",
  ".table-of-contents",
  ".breadcrumbs",
  ".breadcrumb",
  "[role='navigation']"
];

/**
 * Build a fresh configuration object populated with default values.
 *
 * @param {string} projectId - Value stored as `project.id`.
 * @returns {object} A new default configuration. Each call returns
 *   independent nested objects and arrays.
 */
function createDefaultConfig(projectId) {
  return {
    project: {
      id: projectId
    },
    scope: {
      mode: "fixed",
      fixed: "main",
      envVar: "SEARCHSOCKET_SCOPE",
      sanitize: true
    },
    source: {
      mode: "static-output",
      staticOutputDir: "build",
      strictRouteMapping: false
    },
    extract: {
      mainSelector: "main",
      dropTags: ["header", "nav", "footer", "aside"],
      // Copy the shared constant so that mutating one config's selector list
      // cannot leak into the module-level default or into other configs.
      dropSelectors: [...DEFAULT_DROP_SELECTORS],
      ignoreAttr: "data-search-ignore",
      noindexAttr: "data-search-noindex",
      respectRobotsNoindex: true
    },
    transform: {
      output: "markdown",
      preserveCodeBlocks: true,
      preserveTables: true
    },
    chunking: {
      strategy: "hybrid",
      maxChars: 2200,
      overlapChars: 200,
      minChars: 250,
      headingPathDepth: 3,
      dontSplitInside: ["code", "table", "blockquote"],
      prependTitle: true,
      pageSummaryChunk: true
    },
    embeddings: {
      provider: "openai",
      model: "text-embedding-3-small",
      apiKeyEnv: "OPENAI_API_KEY",
      batchSize: 64,
      concurrency: 4
    },
    vector: {
      turso: {
        urlEnv: "TURSO_DATABASE_URL",
        authTokenEnv: "TURSO_AUTH_TOKEN",
        localPath: ".searchsocket/vectors.db"
      }
    },
    rerank: {
      provider: "none",
      topN: 20,
      jina: {
        apiKeyEnv: "JINA_API_KEY",
        model: "jina-reranker-v2-base-multilingual"
      }
    },
    ranking: {
      enableIncomingLinkBoost: true,
      enableDepthBoost: true,
      pageWeights: {},
      aggregationCap: 5,
      aggregationDecay: 0.5,
      minChunkScoreRatio: 0.5,
      weights: {
        incomingLinks: 0.05,
        depth: 0.03,
        rerank: 1,
        aggregation: 0.1
      }
    },
    api: {
      path: "/api/search",
      cors: {
        allowOrigins: []
      }
    },
    mcp: {
      // Enabled unless NODE_ENV is exactly "production" at call time.
      enable: process.env.NODE_ENV !== "production",
      transport: "stdio",
      http: {
        port: 3338,
        path: "/mcp"
      }
    },
    state: {
      dir: ".searchsocket",
      writeMirror: false
    }
  };
}
|
|
327
|
+
|
|
328
|
+
// src/errors/index.ts
|
|
329
|
+
/**
 * Domain error carrying a machine-readable code and a numeric status.
 */
var SearchSocketError = class extends Error {
  code;
  status;
  /**
   * @param {string} code - Machine-readable error code (e.g. "CONFIG_MISSING").
   * @param {string} message - Human-readable description.
   * @param {number} [status=500] - Numeric status associated with the error.
   */
  constructor(code, message, status = 500) {
    super(message);
    // Identify the error type in stack traces and String(error) output
    // instead of the generic "Error".
    this.name = "SearchSocketError";
    this.code = code;
    this.status = status;
  }
};
|
|
338
|
+
|
|
339
|
+
// src/config/load.ts
|
|
340
|
+
/**
 * Derive a project id from the `name` in `<cwd>/package.json`, falling back
 * to the directory's base name.
 *
 * The result is always sanitized: every character outside `[a-zA-Z0-9._-]`
 * becomes "-". The directory name is used when package.json is absent, or
 * when its `name` is missing, empty, or not a string.
 *
 * @param {string} cwd - Project directory.
 * @returns {string} Sanitized project id.
 * @throws {SyntaxError} If package.json exists but is not valid JSON.
 */
function inferProjectId(cwd) {
  const sanitize = (value) => value.replace(/[^a-zA-Z0-9._-]/g, "-");
  const directoryName = path.basename(cwd);
  const packageJsonPath = path.join(cwd, "package.json");
  if (!fs.existsSync(packageJsonPath)) {
    return sanitize(directoryName);
  }
  const raw = JSON.parse(fs.readFileSync(packageJsonPath, "utf8"));
  // `raw` may be null or a non-object, and `name` may be absent, empty, or a
  // non-string; all of those fall back to the directory name.
  const declaredName = raw?.name;
  const usable = typeof declaredName === "string" && declaredName.length > 0;
  return sanitize(usable ? declaredName : directoryName);
}
|
|
348
|
+
/**
 * Decide which source mode to use.
 *
 * Precedence: an explicit `source.mode` in the user's input, then the first
 * configured section among build / crawl / contentFiles, then "static-output"
 * if the configured static output directory exists on disk.
 *
 * @param {string} cwd - Directory against which `staticOutputDir` is resolved.
 * @param {object} config - Merged config; `source.staticOutputDir` is read.
 * @param {object} parsedInput - Validated user input.
 * @returns {string} The selected source mode.
 * @throws {SearchSocketError} CONFIG_MISSING when nothing can be detected.
 */
function detectSourceMode(cwd, config, parsedInput) {
  const userSource = parsedInput.source;
  if (userSource?.mode) {
    return userSource.mode;
  }

  // A configured section implies its mode; order defines the precedence.
  const sectionModes = [
    ["build", userSource?.build],
    ["crawl", userSource?.crawl],
    ["content-files", userSource?.contentFiles]
  ];
  const implied = sectionModes.find(([, section]) => Boolean(section));
  if (implied) {
    return implied[0];
  }

  const staticOutputPath = path.resolve(cwd, config.source.staticOutputDir);
  if (!fs.existsSync(staticOutputPath)) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Unable to auto-detect source mode because ${staticOutputPath} does not exist. Set \`source.mode\` explicitly (static-output, crawl, content-files, or build).`
    );
  }
  return "static-output";
}
|
|
370
|
+
/**
 * Validate the user's raw configuration and layer it over the defaults.
 *
 * @param {string} cwd - Project directory. Used to infer the project id, as
 *   the fallback `contentFiles.baseDir`, and for source-mode detection.
 * @param {object} rawConfig - Unvalidated configuration object.
 * @returns {object} Fully merged configuration with `source.mode` resolved.
 * @throws {SearchSocketError} CONFIG_MISSING when validation fails, when the
 *   source mode cannot be detected, or when the selected mode lacks its
 *   required settings.
 */
function mergeConfig(cwd, rawConfig) {
  const projectId = rawConfig.project?.id ?? inferProjectId(cwd);
  const defaults = createDefaultConfig(projectId);

  const validation = searchSocketConfigSchema.safeParse(rawConfig);
  if (!validation.success) {
    const issueLines = validation.error.issues.map(
      (issue) => ` ${issue.path.join(".")}: ${issue.message}`
    );
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Invalid searchsocket.config.ts:\n${issueLines.join("\n")}`
    );
  }
  const parsed = validation.data;

  // Shallow overlay of one user section on top of its default section.
  const overlay = (section) => ({ ...defaults[section], ...parsed[section] });

  // Fills in every build setting; also produces the all-defaults object when
  // called with no argument.
  const withBuildDefaults = (build) => ({
    outputDir: build?.outputDir ?? ".svelte-kit/output",
    paramValues: build?.paramValues ?? {},
    exclude: build?.exclude ?? [],
    previewTimeout: build?.previewTimeout ?? 3e4
  });

  const userSource = parsed.source;
  const crawl = userSource?.crawl ? {
    ...defaults.source.crawl,
    ...userSource.crawl,
    routes: userSource.crawl.routes ?? []
  } : defaults.source.crawl;
  const contentFiles = userSource?.contentFiles ? {
    ...defaults.source.contentFiles,
    ...userSource.contentFiles,
    baseDir: userSource.contentFiles.baseDir ?? defaults.source.contentFiles?.baseDir ?? cwd
  } : defaults.source.contentFiles;
  const build = userSource?.build ? withBuildDefaults(userSource.build) : void 0;

  const userRateLimit = parsed.api?.rateLimit;
  const rateLimit = userRateLimit ? {
    windowMs: userRateLimit.windowMs ?? 6e4,
    max: userRateLimit.max ?? 60
  } : defaults.api.rateLimit;

  const merged = {
    ...defaults,
    project: overlay("project"),
    scope: overlay("scope"),
    source: {
      ...defaults.source,
      ...userSource,
      crawl,
      contentFiles,
      build
    },
    extract: overlay("extract"),
    transform: overlay("transform"),
    chunking: overlay("chunking"),
    embeddings: overlay("embeddings"),
    vector: {
      ...overlay("vector"),
      turso: {
        ...defaults.vector.turso,
        ...parsed.vector?.turso
      }
    },
    rerank: {
      ...overlay("rerank"),
      jina: {
        ...defaults.rerank.jina,
        ...parsed.rerank?.jina
      }
    },
    ranking: {
      ...overlay("ranking"),
      pageWeights: {
        ...defaults.ranking.pageWeights,
        ...parsed.ranking?.pageWeights
      },
      weights: {
        ...defaults.ranking.weights,
        ...parsed.ranking?.weights
      }
    },
    api: {
      ...overlay("api"),
      cors: {
        ...defaults.api.cors,
        ...parsed.api?.cors,
        allowOrigins: parsed.api?.cors?.allowOrigins ?? defaults.api.cors.allowOrigins
      },
      rateLimit
    },
    mcp: {
      ...overlay("mcp"),
      http: {
        ...defaults.mcp.http,
        ...parsed.mcp?.http
      }
    },
    state: overlay("state")
  };

  merged.project.id = projectId;
  merged.source.mode = detectSourceMode(cwd, merged, parsed);

  const mode = merged.source.mode;
  if (mode === "build" && !merged.source.build) {
    // Build mode chosen explicitly without a `build` section: use defaults.
    merged.source.build = withBuildDefaults();
  }
  if (mode === "crawl" && !merged.source.crawl?.baseUrl) {
    throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
  }
  const lacksGlobs = !merged.source.contentFiles || merged.source.contentFiles.globs.length === 0;
  if (mode === "content-files" && lacksGlobs) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`source.contentFiles.globs` is required when source.mode is content-files."
    );
  }
  return merged;
}
|
|
504
|
+
/**
 * Load the configuration file and merge it with the defaults.
 *
 * @param {object} [options]
 * @param {string} [options.cwd] - Project directory (defaults to process.cwd()).
 * @param {string} [options.configPath] - Config file path, resolved against
 *   `cwd` (defaults to "searchsocket.config.ts").
 * @param {boolean} [options.allowMissing] - When truthy and the file does not
 *   exist, merge a minimal static-output configuration instead of throwing.
 * @returns {Promise<object>} The merged configuration.
 * @throws {SearchSocketError} CONFIG_MISSING when the file is absent and
 *   `allowMissing` is not set.
 */
async function loadConfig(options = {}) {
  const cwd = path.resolve(options.cwd ?? process.cwd());
  const configPath = path.resolve(cwd, options.configPath ?? "searchsocket.config.ts");

  if (fs.existsSync(configPath)) {
    const jiti = createJiti(cwd, { interopDefault: true });
    const loaded = await jiti.import(configPath);
    // Use the default export when there is one, else the module object.
    return mergeConfig(cwd, loaded.default ?? loaded);
  }

  if (!options.allowMissing) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Configuration file not found at ${configPath}. Run \`searchsocket init\` first.`
    );
  }
  return mergeConfig(cwd, { source: { mode: "static-output" } });
}
|
|
525
|
+
/**
 * Create a minimal `searchsocket.config.ts` in `cwd` unless one already exists.
 *
 * The file is written with the exclusive-create flag ("wx"), so the existence
 * check and the write are one atomic step. A config created by another
 * process between a separate check and write can no longer be overwritten.
 *
 * @param {string} cwd - Directory that should contain the config file.
 * @returns {string} Path of the config file, whether created or pre-existing.
 * @throws {Error} Any file-system error other than "already exists".
 */
function writeMinimalConfig(cwd) {
  const target = path.join(cwd, "searchsocket.config.ts");
  // NOTE(review): leading whitespace is not recoverable from the reviewed
  // copy of this file; confirm whether the `embeddings` line was indented.
  const content = [
    "export default {",
    'embeddings: { apiKeyEnv: "OPENAI_API_KEY" }',
    "};",
    ""
  ].join("\n");
  try {
    fs.writeFileSync(target, content, { encoding: "utf8", flag: "wx" });
  } catch (error) {
    // An existing file is the expected "nothing to do" case; anything else
    // (permissions, missing directory, ...) is a real failure.
    if (error?.code !== "EEXIST") {
      throw error;
    }
  }
  return target;
}
|
|
537
|
+
|
|
538
|
+
// src/core/logger.ts
|
|
539
|
+
/**
 * Minimal console logger with plain-text and JSON-lines modes.
 *
 * - `json`: emit structured lines (`{ event, ts, data }`); `info` is suppressed.
 * - `verbose`: enable `debug`, and `event` output in plain-text mode.
 * - `stderrOnly`: route everything written through `writeOut` to stderr.
 *
 * In plain-text mode `warn` and `error` always go to stderr.
 */
var Logger = class {
  json;
  verbose;
  stderrOnly;
  constructor(opts = {}) {
    this.json = opts.json ?? false;
    this.verbose = opts.verbose ?? false;
    this.stderrOnly = opts.stderrOnly ?? false;
  }
  /** Write an informational line; silent in JSON mode. */
  info(message) {
    if (!this.json) {
      this.writeOut(`${message}\n`);
    }
  }
  /** Write a debug line; only when `verbose` is set. */
  debug(message) {
    if (!this.verbose) {
      return;
    }
    if (this.json) {
      this.logJson("debug", { message });
    } else {
      this.writeOut(`${message}\n`);
    }
  }
  /** Report a warning: a "warn" JSON entry, or a `WARN:` line on stderr. */
  warn(message) {
    if (this.json) {
      this.logJson("warn", { message });
    } else {
      process.stderr.write(`WARN: ${message}\n`);
    }
  }
  /** Report an error: an "error" JSON entry, or an `ERROR:` line on stderr. */
  error(message) {
    if (this.json) {
      this.logJson("error", { message });
    } else {
      process.stderr.write(`ERROR: ${message}\n`);
    }
  }
  /** Emit a named event; needs JSON or verbose mode to produce output. */
  event(event, data) {
    if (this.json) {
      this.logJson(event, data);
      return;
    }
    if (this.verbose) {
      const payload = data ? JSON.stringify(data) : "";
      this.writeOut(`[${event}] ${payload}\n`);
    }
  }
  /** Write raw text to stderr when `stderrOnly` is set, otherwise stdout. */
  writeOut(text) {
    const stream = this.stderrOnly ? process.stderr : process.stdout;
    stream.write(text);
  }
  /** Write one JSON line containing the event name, an ISO timestamp and data. */
  logJson(event, data) {
    const ts = (/* @__PURE__ */ new Date()).toISOString();
    this.writeOut(`${JSON.stringify({ event, ts, data })}\n`);
  }
};
|
|
610
|
+
|
|
611
|
+
// src/core/scope.ts
|
|
612
|
+
import { execSync } from "child_process";
|
|
613
|
+
|
|
614
|
+
// src/utils/text.ts
|
|
615
|
+
/**
 * Collapse every run of whitespace (including newlines) into a single space
 * and trim both ends.
 *
 * @param {string} input
 * @returns {string}
 */
function normalizeText(input) {
  const unixNewlines = input.replace(/\r\n/g, "\n");
  const singleSpaced = unixNewlines.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
618
|
+
/**
 * Normalize Markdown: convert CRLF to LF, strip trailing spaces and tabs from
 * every line, trim the document, and end it with exactly one newline.
 *
 * @param {string} input
 * @returns {string}
 */
function normalizeMarkdown(input) {
  const unixNewlines = input.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
  return `${withoutTrailingBlanks.trim()}\n`;
}
|
|
621
|
+
/**
 * Turn an arbitrary scope name into a lowercase identifier.
 *
 * Each run of characters outside `[a-z0-9._-]` becomes a single "-",
 * leading and trailing dashes are stripped, and the result is then cut to
 * at most 80 characters.
 *
 * @param {string} scopeName
 * @returns {string}
 */
function sanitizeScopeName(scopeName) {
  const lowered = scopeName.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9._-]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  return trimmed.slice(0, 80);
}
|
|
624
|
+
/**
 * Produce a short plain-text snippet from Markdown.
 *
 * Fenced code blocks are dropped, inline code keeps only its text, the
 * characters `# > * _ | -` become spaces, and whitespace is collapsed. Text
 * longer than `maxLen` is cut to `maxLen - 1` UTF-16 code units and suffixed
 * with an ellipsis.
 *
 * @param {string} markdown - Source Markdown.
 * @param {number} [maxLen=220] - Maximum snippet length in UTF-16 code units.
 * @returns {string} The snippet.
 */
function toSnippet(markdown, maxLen = 220) {
  const plain = markdown.replace(/```[\s\S]*?```/g, " ").replace(/`([^`]+)`/g, "$1").replace(/[#>*_|\-]/g, " ").replace(/\s+/g, " ").trim();
  if (plain.length <= maxLen) {
    return plain;
  }
  let head = plain.slice(0, Math.max(0, maxLen - 1));
  // The cut may land between the two halves of a surrogate pair (e.g. an
  // emoji). Drop a dangling high surrogate so the snippet stays well-formed.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }
  return `${head.trim()}\u2026`;
}
|
|
631
|
+
/**
 * Return the first paragraph of a Markdown document as a single line.
 *
 * Headings, blank lines, and fenced code blocks are skipped until text is
 * found. Collection then stops at the next blank line, heading, or fence
 * marker. Collected lines are trimmed and joined with spaces.
 *
 * @param {string} markdown
 * @returns {string} The paragraph text, or "" when none is found.
 */
function extractFirstParagraph(markdown) {
  const paragraph = [];
  let insideFence = false;
  for (const rawLine of markdown.split("\n")) {
    const text = rawLine.trim();
    const isFenceMarker = /^(```|~~~)/.test(text);
    if (isFenceMarker) {
      insideFence = !insideFence;
    } else if (insideFence) {
      // Body of a fenced code block: never part of the paragraph.
      continue;
    }
    const isBoundary = isFenceMarker || text === "" || /^#{1,6}\s/.test(text);
    if (!isBoundary) {
      paragraph.push(text);
      continue;
    }
    // A boundary ends the paragraph once one has started; before that it is
    // simply skipped.
    if (paragraph.length > 0) {
      break;
    }
  }
  return paragraph.join(" ");
}
|
|
655
|
+
|
|
656
|
+
// src/core/scope.ts
|
|
657
|
+
/**
 * Resolve the unsanitized scope name according to `config.scope.mode`.
 *
 * - "fixed": return `config.scope.fixed`.
 * - "env": return the value of the environment variable named by
 *   `config.scope.envVar`.
 * - anything else: ask git for the current branch name, falling back to
 *   `config.scope.fixed` if the command fails.
 *
 * @param {object} config - Merged configuration.
 * @returns {string} The raw scope name.
 * @throws {Error} In "env" mode when the variable is unset or empty.
 */
function resolveRawScopeName(config) {
  const { scope } = config;
  switch (scope.mode) {
    case "fixed":
      return scope.fixed;
    case "env": {
      const value = process.env[scope.envVar];
      if (value) {
        return value;
      }
      throw new Error(`Scope mode is env but ${scope.envVar} is not set.`);
    }
    default:
      try {
        // stderr is discarded so a failing git call prints nothing.
        const branch = execSync("git rev-parse --abbrev-ref HEAD", {
          encoding: "utf8",
          stdio: ["ignore", "pipe", "ignore"]
        });
        return branch.trim();
      } catch {
        return scope.fixed;
      }
  }
}
|
|
677
|
+
/**
 * Build the scope descriptor for a configuration.
 *
 * @param {object} config - Merged configuration.
 * @param {string} [override] - Scope name to use instead of the configured
 *   one; only a null or undefined override triggers resolution from config.
 * @returns {{projectId: string, scopeName: string, scopeId: string}}
 *   `scopeId` has the form `<projectId>:<scopeName>`.
 */
function resolveScope(config, override) {
  const projectId = config.project.id;
  const requested = override ?? resolveRawScopeName(config);
  let scopeName = requested;
  if (config.scope.sanitize) {
    scopeName = sanitizeScopeName(requested);
  }
  return { projectId, scopeName, scopeId: `${projectId}:${scopeName}` };
}
|
|
686
|
+
|
|
687
|
+
// src/core/state.ts
|
|
688
|
+
import fs2 from "fs";
|
|
689
|
+
import path2 from "path";
|
|
690
|
+
/**
 * Ensure the per-scope pages directory exists under the state directory.
 *
 * @param {string} cwd - Base directory against which `stateDir` is resolved.
 * @param {string} stateDir - State directory (relative or absolute).
 * @param {{scopeName: string}} scope - Scope whose name labels the directory.
 * @returns {{statePath: string, pagesPath: string}} Absolute state path and
 *   `<statePath>/pages/<scopeName>`, which is created recursively.
 */
function ensureStateDirs(cwd, stateDir, scope) {
  const statePath = path2.resolve(cwd, stateDir);
  const pagesPath = path2.join(statePath, "pages", scope.scopeName);
  const mkdirOptions = { recursive: true };
  fs2.mkdirSync(pagesPath, mkdirOptions);
  const locations = { statePath, pagesPath };
  return locations;
}
|
|
696
|
+
|
|
697
|
+
// src/embeddings/openai.ts
|
|
698
|
+
import OpenAI from "openai";
|
|
699
|
+
import pLimit from "p-limit";
|
|
700
|
+
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  const waitForTimer = (done) => {
    setTimeout(done, ms);
  };
  return new Promise(waitForTimer);
}
|
|
705
|
+
/**
 * Embeddings provider backed by the OpenAI client.
 *
 * Inputs are split into batches of `batchSize`. At most `concurrency` batch
 * requests run at once, and each request is retried on HTTP 429 and 5xx
 * responses.
 */
var OpenAIEmbeddingsProvider = class {
  client;
  batchSize;
  concurrency;
  /**
   * @param {{apiKey: string, batchSize: number, concurrency: number}} options
   * @throws {Error} If `batchSize` or `concurrency` is not a positive integer.
   */
  constructor(options) {
    for (const setting of ["batchSize", "concurrency"]) {
      const value = options[setting];
      const isPositiveInteger = Number.isInteger(value) && value > 0;
      if (!isPositiveInteger) {
        throw new Error(`Invalid ${setting}: ${value}. ${setting} must be a positive integer.`);
      }
    }
    this.client = new OpenAI({
      apiKey: options.apiKey
    });
    this.batchSize = options.batchSize;
    this.concurrency = options.concurrency;
  }
  /**
   * Heuristic token estimate: the larger of a characters/4 estimate and a
   * weighted count of words, punctuation and CJK characters.
   *
   * @param {string} text
   * @returns {number} 0 for blank text, otherwise at least 1.
   */
  estimateTokens(text) {
    const normalized = text.trim();
    if (!normalized) {
      return 0;
    }
    const countMatches = (pattern) => normalized.match(pattern)?.length ?? 0;
    const wordCount = countMatches(/[A-Za-z0-9_]+/g);
    const punctuationCount = countMatches(/[^\s\w]/g);
    const cjkCount = countMatches(/[\u3400-\u9fff]/g);
    const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
    const charEstimate = Math.ceil(normalized.length / 4);
    return Math.max(1, charEstimate, lexicalEstimate);
  }
  /**
   * Embed `texts` with `modelId`, returning one embedding per input in the
   * original order.
   *
   * @param {string[]} texts
   * @param {string} modelId
   * @returns {Promise<number[][]>}
   */
  async embedTexts(texts, modelId) {
    if (texts.length === 0) {
      return [];
    }
    const size = this.batchSize;
    const batchCount = Math.ceil(texts.length / size);
    const batches = Array.from(
      { length: batchCount },
      (_, batchNumber) => texts.slice(batchNumber * size, (batchNumber + 1) * size)
    );
    const limit = pLimit(this.concurrency);
    // Promise.all keeps results in batch order regardless of completion order.
    const perBatch = await Promise.all(
      batches.map((values) => limit(() => this.embedWithRetry(values, modelId)))
    );
    return perBatch.flat();
  }
  /**
   * Request embeddings for one batch, making up to 5 attempts.
   *
   * Only errors whose `status` is 429 or a number >= 500 are retried. The
   * delay before retry n is min(2^n * 300, 5000) ms. Any other error, or the
   * final failure, is rethrown unchanged.
   *
   * @param {string[]} texts
   * @param {string} modelId
   * @returns {Promise<number[][]>}
   */
  async embedWithRetry(texts, modelId) {
    const maxAttempts = 5;
    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
      try {
        const response = await this.client.embeddings.create({
          model: modelId,
          input: texts,
          encoding_format: "float"
        });
        return response.data.map((entry) => entry.embedding);
      } catch (error) {
        const status = error.status;
        const isServerError = typeof status === "number" && status >= 500;
        const retryable = status === 429 || isServerError;
        const attemptsLeft = attempt < maxAttempts;
        if (!(retryable && attemptsLeft)) {
          throw error;
        }
        await sleep(Math.min(2 ** attempt * 300, 5e3));
      }
    }
    throw new Error("Unreachable retry state");
  }
};
|
|
781
|
+
|
|
782
|
+
// src/embeddings/factory.ts
|
|
783
|
+
/**
 * Create the embeddings provider described by `config.embeddings`.
 *
 * @param {object} config - Merged configuration.
 * @returns {OpenAIEmbeddingsProvider}
 * @throws {SearchSocketError} CONFIG_MISSING when the provider is not
 *   "openai" or the API-key environment variable is unset or empty.
 */
function createEmbeddingsProvider(config) {
  const settings = config.embeddings;
  if (settings.provider !== "openai") {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Unsupported embeddings provider ${settings.provider}`
    );
  }

  // The key is read from the environment variable named in the config.
  const apiKey = process.env[settings.apiKeyEnv];
  if (!apiKey) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Missing embeddings API key env var: ${settings.apiKeyEnv}`
    );
  }

  const { batchSize, concurrency } = settings;
  return new OpenAIEmbeddingsProvider({ apiKey, batchSize, concurrency });
}
|
|
803
|
+
|
|
804
|
+
// src/indexing/pipeline.ts
|
|
805
|
+
import path11 from "path";
|
|
806
|
+
|
|
807
|
+
// src/vector/factory.ts
|
|
808
|
+
import fs3 from "fs";
|
|
809
|
+
import path3 from "path";
|
|
810
|
+
|
|
811
|
+
// src/vector/turso.ts
|
|
812
|
+
/**
 * Vector store backed by a libSQL/Turso client.
 *
 * Three tables are managed lazily (each is created on first use and the
 * creation is remembered per instance):
 *   - `registry`: one row per indexed scope (model id, timestamps, estimates)
 *   - `chunks`:   chunk metadata plus an `F32_BLOB` embedding and a cosine vector index
 *   - `pages`:    full page markdown keyed by (project_id, scope_name, url)
 *
 * The `client` only needs `execute(stmt)` and `batch(stmts)`.
 */
var TursoVectorStore = class {
  client;
  dimension;
  chunksReady = false;
  registryReady = false;
  pagesReady = false;

  /**
   * @param {{ client: object, dimension?: number }} opts
   *   `dimension` is optional; when absent the embedding width is taken from
   *   the first vector passed to `upsert`/`query`.
   */
  constructor(opts) {
    this.client = opts.client;
    this.dimension = opts.dimension;
  }

  /**
   * Creates the `registry` table if needed and back-fills the estimate
   * columns on databases that were created without them.
   */
  async ensureRegistry() {
    if (this.registryReady) return;
    await this.client.execute(`
      CREATE TABLE IF NOT EXISTS registry (
        scope_key TEXT PRIMARY KEY,
        project_id TEXT NOT NULL,
        scope_name TEXT NOT NULL,
        model_id TEXT NOT NULL,
        last_indexed_at TEXT NOT NULL,
        vector_count INTEGER,
        last_estimate_tokens INTEGER,
        last_estimate_cost_usd REAL,
        last_estimate_changed_chunks INTEGER
      )
    `);
    const estimateCols = [
      { name: "last_estimate_tokens", def: "INTEGER" },
      { name: "last_estimate_cost_usd", def: "REAL" },
      { name: "last_estimate_changed_chunks", def: "INTEGER" }
    ];
    for (const col of estimateCols) {
      try {
        await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
      } catch (error) {
        // Only "column already exists" is expected here; anything else
        // (including non-Error throwables) must surface to the caller.
        if (!isExpectedTursoError(error, "duplicate column")) {
          throw error;
        }
      }
    }
    this.registryReady = true;
  }

  /**
   * Creates the `chunks` table and its vector index if needed.
   * @param {number} dim Embedding width used for the `F32_BLOB` column.
   */
  async ensureChunks(dim) {
    if (this.chunksReady) return;
    await this.client.batch([
      `CREATE TABLE IF NOT EXISTS chunks (
        id TEXT PRIMARY KEY,
        project_id TEXT NOT NULL,
        scope_name TEXT NOT NULL,
        url TEXT NOT NULL,
        path TEXT NOT NULL,
        title TEXT NOT NULL,
        section_title TEXT NOT NULL DEFAULT '',
        heading_path TEXT NOT NULL DEFAULT '[]',
        snippet TEXT NOT NULL DEFAULT '',
        content_hash TEXT NOT NULL DEFAULT '',
        model_id TEXT NOT NULL DEFAULT '',
        depth INTEGER NOT NULL DEFAULT 0,
        incoming_links INTEGER NOT NULL DEFAULT 0,
        route_file TEXT NOT NULL DEFAULT '',
        tags TEXT NOT NULL DEFAULT '[]',
        embedding F32_BLOB(${dim})
      )`,
      `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
    ]);
    this.chunksReady = true;
  }

  /** Creates the `pages` table if needed. */
  async ensurePages() {
    if (this.pagesReady) return;
    await this.client.execute(`
      CREATE TABLE IF NOT EXISTS pages (
        project_id TEXT NOT NULL,
        scope_name TEXT NOT NULL,
        url TEXT NOT NULL,
        title TEXT NOT NULL,
        markdown TEXT NOT NULL,
        route_file TEXT NOT NULL DEFAULT '',
        route_resolution TEXT NOT NULL DEFAULT 'exact',
        incoming_links INTEGER NOT NULL DEFAULT 0,
        outgoing_links INTEGER NOT NULL DEFAULT 0,
        depth INTEGER NOT NULL DEFAULT 0,
        tags TEXT NOT NULL DEFAULT '[]',
        indexed_at TEXT NOT NULL,
        PRIMARY KEY (project_id, scope_name, url)
      )
    `);
    this.pagesReady = true;
  }

  /**
   * Probes for the `chunks` table without creating it.
   * @returns {Promise<boolean>} false only for a "no such table" error; other errors are rethrown.
   */
  async chunksTableExists() {
    try {
      await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
      return true;
    } catch (error) {
      if (isExpectedTursoError(error, "no such table")) {
        return false;
      }
      throw error;
    }
  }

  /**
   * Inserts or replaces chunk records in batches of 100 statements.
   * @param {Array<{ id: string, vector: number[], metadata: object }>} records
   * @param {object} _scope Unused; scope values are read from each record's metadata.
   */
  async upsert(records, _scope) {
    if (records.length === 0) return;
    const dim = this.dimension ?? records[0].vector.length;
    await this.ensureChunks(dim);
    const BATCH_SIZE = 100;
    for (let i = 0; i < records.length; i += BATCH_SIZE) {
      const batch = records.slice(i, i + BATCH_SIZE);
      const stmts = batch.map((r) => ({
        sql: `INSERT OR REPLACE INTO chunks
          (id, project_id, scope_name, url, path, title, section_title,
           heading_path, snippet, content_hash, model_id, depth,
           incoming_links, route_file, tags, embedding)
          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
        args: [
          r.id,
          r.metadata.projectId,
          r.metadata.scopeName,
          r.metadata.url,
          r.metadata.path,
          r.metadata.title,
          r.metadata.sectionTitle,
          JSON.stringify(r.metadata.headingPath),
          r.metadata.snippet,
          r.metadata.contentHash,
          r.metadata.modelId,
          r.metadata.depth,
          r.metadata.incomingLinks,
          r.metadata.routeFile,
          JSON.stringify(r.metadata.tags),
          JSON.stringify(r.vector)
        ]
      }));
      await this.client.batch(stmts);
    }
  }

  /**
   * Runs a top-K vector lookup and returns the hits that belong to `scope`,
   * sorted by descending score (score = 1 - cosine distance).
   *
   * Scope, `opts.pathPrefix` and `opts.tags` filters are applied to the rows
   * returned by the top-K lookup, so fewer than `opts.topK` hits can come back.
   *
   * @param {number[]} queryVector
   * @param {{ topK: number, pathPrefix?: string, tags?: string[] }} opts
   * @param {{ projectId: string, scopeName: string }} scope
   */
  async query(queryVector, opts, scope) {
    const dim = this.dimension ?? queryVector.length;
    await this.ensureChunks(dim);
    const queryJson = JSON.stringify(queryVector);
    const rs = await this.client.execute({
      sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
               c.section_title, c.heading_path, c.snippet, c.content_hash,
               c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
               vector_distance_cos(c.embedding, vector(?)) AS distance
            FROM vector_top_k('idx', vector(?), ?) AS v
            JOIN chunks AS c ON c.rowid = v.id`,
      args: [queryJson, queryJson, opts.topK]
    });

    // The prefix does not depend on the row, so normalise it once.
    let exactPath = null;
    let childPrefix = null;
    if (opts.pathPrefix) {
      const rawPrefix = opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}`;
      childPrefix = rawPrefix.endsWith("/") ? rawPrefix : `${rawPrefix}/`;
      exactPath = rawPrefix.replace(/\/$/, "");
    }
    const requiredTags = opts.tags && opts.tags.length > 0 ? opts.tags : null;

    const hits = [];
    for (const row of rs.rows) {
      const projectId = row.project_id;
      const scopeName = row.scope_name;
      if (projectId !== scope.projectId || scopeName !== scope.scopeName) {
        continue;
      }
      const rowPath = row.path;
      if (childPrefix !== null) {
        // Accept the prefix page itself or anything below "<prefix>/".
        const normalizedPath = rowPath.replace(/\/$/, "");
        if (normalizedPath !== exactPath && !rowPath.startsWith(childPrefix)) {
          continue;
        }
      }
      const tags = JSON.parse(row.tags || "[]");
      if (requiredTags !== null && !requiredTags.every((t) => tags.includes(t))) {
        continue;
      }
      hits.push({
        id: row.id,
        score: 1 - row.distance,
        metadata: {
          projectId,
          scopeName,
          url: row.url,
          path: rowPath,
          title: row.title,
          sectionTitle: row.section_title,
          headingPath: JSON.parse(row.heading_path || "[]"),
          snippet: row.snippet,
          contentHash: row.content_hash,
          modelId: row.model_id,
          depth: row.depth,
          incomingLinks: row.incoming_links,
          routeFile: row.route_file,
          tags
        }
      });
    }
    hits.sort((a, b) => b.score - a.score);
    return hits;
  }

  /** Deletes the given chunk ids within `scope`, 500 ids per statement. */
  async deleteByIds(ids, scope) {
    if (ids.length === 0) return;
    const BATCH_SIZE = 500;
    for (let i = 0; i < ids.length; i += BATCH_SIZE) {
      const batch = ids.slice(i, i + BATCH_SIZE);
      const placeholders = batch.map(() => "?").join(", ");
      await this.client.execute({
        sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
        args: [scope.projectId, scope.scopeName, ...batch]
      });
    }
  }

  /**
   * Removes every chunk, page and registry row for `scope`.
   * A missing `chunks` or `pages` table is tolerated; other errors propagate.
   */
  async deleteScope(scope) {
    await this.ensureRegistry();
    for (const table of ["chunks", "pages"]) {
      try {
        await this.client.execute({
          sql: `DELETE FROM ${table} WHERE project_id = ? AND scope_name = ?`,
          args: [scope.projectId, scope.scopeName]
        });
      } catch (error) {
        if (!isExpectedTursoError(error, "no such table")) {
          throw error;
        }
      }
    }
    await this.client.execute({
      sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
      args: [scope.projectId, scope.scopeName]
    });
  }

  /** Lists the registry entries recorded for one project. */
  async listScopes(scopeProjectId) {
    await this.ensureRegistry();
    const rs = await this.client.execute({
      sql: `SELECT project_id, scope_name, model_id, last_indexed_at, vector_count,
               last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks
            FROM registry WHERE project_id = ?`,
      args: [scopeProjectId]
    });
    return rs.rows.map((row) => ({
      projectId: row.project_id,
      scopeName: row.scope_name,
      modelId: row.model_id,
      lastIndexedAt: row.last_indexed_at,
      vectorCount: row.vector_count,
      lastEstimateTokens: row.last_estimate_tokens,
      lastEstimateCostUSD: row.last_estimate_cost_usd,
      lastEstimateChangedChunks: row.last_estimate_changed_chunks
    }));
  }

  /** Inserts or replaces the registry row keyed by "<projectId>:<scopeName>". */
  async recordScope(info) {
    await this.ensureRegistry();
    const key = `${info.projectId}:${info.scopeName}`;
    await this.client.execute({
      sql: `INSERT OR REPLACE INTO registry
              (scope_key, project_id, scope_name, model_id, last_indexed_at, vector_count,
               last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      args: [
        key,
        info.projectId,
        info.scopeName,
        info.modelId,
        info.lastIndexedAt,
        info.vectorCount ?? null,
        info.lastEstimateTokens ?? null,
        info.lastEstimateCostUSD ?? null,
        info.lastEstimateChangedChunks ?? null
      ]
    });
  }

  /**
   * @returns {Promise<Map<string, string>>} chunk id -> content hash for `scope`;
   *   an empty map when the `chunks` table does not exist yet.
   */
  async getContentHashes(scope) {
    const exists = await this.chunksTableExists();
    if (!exists) return /* @__PURE__ */ new Map();
    const rs = await this.client.execute({
      sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
      args: [scope.projectId, scope.scopeName]
    });
    const map = /* @__PURE__ */ new Map();
    for (const row of rs.rows) {
      map.set(row.id, row.content_hash);
    }
    return map;
  }

  /**
   * Inserts or replaces pages in batches of 100 statements.
   * @throws {Error} when any page's projectId/scopeName differs from `scope`
   *   (checked for all pages before anything is written).
   */
  async upsertPages(pages, scope) {
    if (pages.length === 0) return;
    await this.ensurePages();
    for (const page of pages) {
      if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
        throw new Error(
          `Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
        );
      }
    }
    const BATCH_SIZE = 100;
    for (let i = 0; i < pages.length; i += BATCH_SIZE) {
      const batch = pages.slice(i, i + BATCH_SIZE);
      const stmts = batch.map((p) => ({
        sql: `INSERT OR REPLACE INTO pages
                (project_id, scope_name, url, title, markdown, route_file,
                 route_resolution, incoming_links, outgoing_links, depth, tags, indexed_at)
              VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
        args: [
          p.projectId,
          p.scopeName,
          p.url,
          p.title,
          p.markdown,
          p.routeFile,
          p.routeResolution,
          p.incomingLinks,
          p.outgoingLinks,
          p.depth,
          JSON.stringify(p.tags),
          p.indexedAt
        ]
      }));
      await this.client.batch(stmts);
    }
  }

  /** @returns the stored page for `url` in `scope`, or null when there is none. */
  async getPage(url, scope) {
    await this.ensurePages();
    const rs = await this.client.execute({
      sql: `SELECT * FROM pages WHERE project_id = ? AND scope_name = ? AND url = ?`,
      args: [scope.projectId, scope.scopeName, url]
    });
    if (rs.rows.length === 0) return null;
    const row = rs.rows[0];
    return {
      url: row.url,
      title: row.title,
      markdown: row.markdown,
      projectId: row.project_id,
      scopeName: row.scope_name,
      routeFile: row.route_file,
      routeResolution: row.route_resolution,
      incomingLinks: row.incoming_links,
      outgoingLinks: row.outgoing_links,
      depth: row.depth,
      tags: JSON.parse(row.tags || "[]"),
      indexedAt: row.indexed_at
    };
  }

  /** Deletes every stored page for `scope`. */
  async deletePages(scope) {
    await this.ensurePages();
    await this.client.execute({
      sql: `DELETE FROM pages WHERE project_id = ? AND scope_name = ?`,
      args: [scope.projectId, scope.scopeName]
    });
  }

  /** @returns the model id recorded for `scope`, or null when the scope is not registered. */
  async getScopeModelId(scope) {
    await this.ensureRegistry();
    const rs = await this.client.execute({
      sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
      args: [scope.projectId, scope.scopeName]
    });
    if (rs.rows.length === 0) return null;
    return rs.rows[0].model_id;
  }

  /** Connectivity probe: runs `SELECT 1` and reports the failure message instead of throwing. */
  async health() {
    try {
      await this.client.execute("SELECT 1");
      return { ok: true };
    } catch (error) {
      return {
        ok: false,
        details: error instanceof Error ? error.message : "unknown error"
      };
    }
  }
};

/**
 * True only when `error` is an Error whose message contains `needle`.
 * Used to tell the few expected database errors apart from everything else.
 */
function isExpectedTursoError(error, needle) {
  return error instanceof Error && error.message.includes(needle);
}
|
|
1186
|
+
|
|
1187
|
+
// src/vector/factory.ts
|
|
1188
|
+
/**
 * Builds the vector store for the current configuration.
 *
 * When the environment variable named by `config.vector.turso.urlEnv` is set,
 * an HTTP libSQL client is created for that URL (with the token from
 * `authTokenEnv`). Otherwise a file-backed client is opened at
 * `turso.localPath` resolved against `cwd`, creating the parent directory first.
 *
 * @param {object} config Resolved configuration (reads `config.vector`).
 * @param {string} cwd Base directory for the local database path.
 * @returns {Promise<TursoVectorStore>}
 */
async function createVectorStore(config, cwd) {
  const turso = config.vector.turso;
  const remoteUrl = process.env[turso.urlEnv];
  let client;
  if (remoteUrl) {
    const httpModule = await import("@libsql/client/http");
    const authToken = process.env[turso.authTokenEnv];
    client = httpModule.createClient({ url: remoteUrl, authToken });
  } else {
    const fileModule = await import("@libsql/client");
    const databaseFile = path3.resolve(cwd, turso.localPath);
    // The client does not create missing directories for a file: URL.
    fs3.mkdirSync(path3.dirname(databaseFile), { recursive: true });
    client = fileModule.createClient({ url: `file:${databaseFile}` });
  }
  return new TursoVectorStore({
    client,
    dimension: config.vector.dimension
  });
}
|
|
1214
|
+
|
|
1215
|
+
// src/utils/hash.ts
|
|
1216
|
+
import { createHash } from "crypto";
|
|
1217
|
+
/**
 * Hex-encoded SHA-1 digest of `input`.
 * @param {string | Buffer} input
 * @returns {string} 40 lowercase hex characters.
 */
function sha1(input) {
  const hasher = createHash("sha1");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
1220
|
+
/**
 * Hex-encoded SHA-256 digest of `input`.
 * @param {string | Buffer} input
 * @returns {string} 64 lowercase hex characters.
 */
function sha256(input) {
  const hasher = createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
1223
|
+
|
|
1224
|
+
// src/utils/path.ts
|
|
1225
|
+
import path4 from "path";
|
|
1226
|
+
/**
 * Canonicalises a URL path: trims whitespace, guarantees a leading "/",
 * collapses runs of "/" into one, and drops a trailing "/" except for the root.
 * @param {string} rawPath
 * @returns {string}
 */
function normalizeUrlPath(rawPath) {
  const trimmed = rawPath.trim();
  const rooted = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
  const collapsed = rooted.replace(/\/+/g, "/");
  const hasTrailingSlash = collapsed.length > 1 && collapsed.endsWith("/");
  return hasTrailingSlash ? collapsed.slice(0, -1) : collapsed;
}
|
|
1237
|
+
/**
 * Maps a URL path to a relative markdown file name:
 * the root becomes "index.md", anything else "<path without leading slash>.md".
 * @param {string} urlPath
 * @returns {string}
 */
function urlPathToMirrorRelative(urlPath) {
  const route = normalizeUrlPath(urlPath);
  return route === "/" ? "index.md" : `${route.slice(1)}.md`;
}
|
|
1244
|
+
/**
 * Derives the URL path served by a static HTML file.
 *
 * "index.html" at the root maps to "/", "<dir>/index.html" maps to "/<dir>",
 * "<name>.html" maps to "/<name>"; any other file keeps its relative path.
 *
 * @param {string} filePath Path of the file on disk.
 * @param {string} rootDir Directory that corresponds to the site root.
 * @returns {string} Normalised URL path.
 */
function staticHtmlFileToUrl(filePath, rootDir) {
  // Use forward slashes regardless of the platform separator.
  const relative = path4.relative(rootDir, filePath).replace(/\\/g, "/");
  if (relative === "index.html") {
    return "/";
  }
  // Order matters: "/index.html" also ends with ".html".
  for (const suffix of ["/index.html", ".html"]) {
    if (relative.endsWith(suffix)) {
      return normalizeUrlPath(relative.slice(0, -suffix.length));
    }
  }
  return normalizeUrlPath(relative);
}
|
|
1257
|
+
/**
 * Number of non-empty path segments in a URL path ("/" has depth 0).
 * @param {string} urlPath
 * @returns {number}
 */
function getUrlDepth(urlPath) {
  if (urlPath === "/") {
    return 0;
  }
  let depth = 0;
  for (const segment of normalizeUrlPath(urlPath).split("/")) {
    if (segment) {
      depth += 1;
    }
  }
  return depth;
}
|
|
1263
|
+
/**
 * Turns a URL path into readable text: hyphens and underscores become spaces
 * and segments are joined with " / ". The root path yields an empty string.
 * @param {string} urlPath
 * @returns {string}
 */
function humanizeUrlPath(urlPath) {
  const route = normalizeUrlPath(urlPath);
  if (route === "/") {
    return "";
  }
  const words = [];
  for (const segment of route.slice(1).split("/")) {
    words.push(segment.replace(/[-_]/g, " "));
  }
  return words.join(" / ");
}
|
|
1268
|
+
/**
 * Returns `value` unchanged when it already starts with "/", otherwise prefixes one.
 * @param {string} value
 * @returns {string}
 */
function ensureLeadingSlash(value) {
  if (value.startsWith("/")) {
    return value;
  }
  return `/${value}`;
}
|
|
1271
|
+
/**
 * Joins a base URL and a route with exactly one "/" between them
 * (one trailing slash is removed from the base; a leading slash is added to the route if missing).
 * @param {string} baseUrl
 * @param {string} route
 * @returns {string}
 */
function joinUrl(baseUrl, route) {
  let origin = baseUrl;
  if (origin.endsWith("/")) {
    origin = origin.slice(0, -1);
  }
  const suffix = route.startsWith("/") ? route : `/${route}`;
  return `${origin}${suffix}`;
}
|
|
1276
|
+
|
|
1277
|
+
// src/indexing/chunker.ts
|
|
1278
|
+
// Matches text that begins with a Markdown code-fence marker (three backticks
// or three tildes). Callers in this module apply it to trimmed lines.
var FENCE_LINE_RE = /^(```|~~~)/;
|
|
1279
|
+
/**
 * Splits markdown into sections, one per ATX heading (`#` .. `######`).
 *
 * Text before the first heading becomes a section without a title. Each
 * section's text starts with its own heading line. Lines inside fenced code
 * blocks are never treated as headings.
 *
 * Fence tracking: a fence is closed only by a marker made of the same
 * character, at least as long as the opening marker, with nothing after it.
 * A shorter or different marker inside a code block therefore does not end
 * the block.
 *
 * @param {string} markdown Source text; both LF and CRLF line endings are accepted.
 * @param {number} headingPathDepth Maximum number of ancestor titles kept in `headingPath`.
 * @returns {Array<{ sectionTitle: string | undefined, headingPath: string[], text: string }>}
 */
function parseHeadingSections(markdown, headingPathDepth) {
  // Split on CRLF too: a trailing "\r" would otherwise stop the heading
  // pattern below from matching.
  const lines = markdown.split(/\r?\n/);
  const sections = [];
  const headingStack = [];
  let openFence = null;
  let current = {
    sectionTitle: void 0,
    headingPath: [],
    text: ""
  };
  const flush = () => {
    if (normalizeText(current.text)) {
      sections.push({
        sectionTitle: current.sectionTitle,
        headingPath: current.headingPath,
        text: current.text.trim()
      });
    }
  };
  for (const line of lines) {
    const trimmed = line.trim();
    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      if (openFence === null) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        trimmed.slice(marker.length).trim() === ""
      ) {
        openFence = null;
      }
    }
    const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)$/) : null;
    if (headingMatch) {
      flush();
      const level = (headingMatch[1] ?? "#").length;
      const title = (headingMatch[2] ?? "").trim();
      // Replace this level's entry and drop anything deeper.
      headingStack[level - 1] = title;
      headingStack.length = level;
      current = {
        sectionTitle: title,
        // Skipped levels leave holes in the stack; filter them out.
        headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
        text: `${line}\n`
      };
      continue;
    }
    current.text += `${line}\n`;
  }
  flush();
  if (sections.length === 0 && normalizeText(markdown)) {
    sections.push({
      sectionTitle: void 0,
      headingPath: [],
      text: markdown.trim()
    });
  }
  return sections;
}
|
|
1330
|
+
/**
 * Splits section text into blocks separated by blank lines.
 *
 * Blank lines inside a fenced code block do not split it. When
 * `config.dontSplitInside` contains "table" or "blockquote", consecutive
 * table rows or quote lines are consumed together.
 *
 * Fence tracking: a fence is closed only by a marker made of the same
 * character, at least as long as the opening marker, with nothing after it.
 * A shorter or different marker inside the code does not end the block.
 *
 * @param {string} text
 * @param {{ dontSplitInside: string[] }} config
 * @returns {string[]} Trimmed, non-empty blocks in document order.
 */
function blockify(text, config) {
  const lines = text.split("\n");
  const blocks = [];
  let openFence = null;
  let current = [];
  const flush = () => {
    const value = current.join("\n").trim();
    if (value) {
      blocks.push(value);
    }
    current = [];
  };
  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i] ?? "";
    const trimmed = line.trim();
    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      if (openFence === null) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        trimmed.slice(marker.length).trim() === ""
      ) {
        openFence = null;
      }
      current.push(line);
      continue;
    }
    if (openFence !== null) {
      current.push(line);
      continue;
    }
    const isTableLine = /^\|.*\|$/.test(trimmed) || /^\|?\s*:?-+:?\s*\|/.test(trimmed);
    const isQuoteLine = /^>/.test(trimmed);
    if (isTableLine && config.dontSplitInside.includes("table")) {
      current.push(line);
      // Pull in every following row that still starts with "|".
      while (i + 1 < lines.length) {
        const next = lines[i + 1];
        if (!next || !/^\|/.test(next.trim())) {
          break;
        }
        i += 1;
        current.push(lines[i] ?? "");
      }
      continue;
    }
    if (isQuoteLine && config.dontSplitInside.includes("blockquote")) {
      current.push(line);
      // Pull in every following line that still starts with ">".
      while (i + 1 < lines.length) {
        const next = lines[i + 1];
        if (!next || !/^>/.test(next.trim())) {
          break;
        }
        i += 1;
        current.push(lines[i] ?? "");
      }
      continue;
    }
    if (!trimmed) {
      flush();
      continue;
    }
    current.push(line);
  }
  flush();
  return blocks;
}
|
|
1389
|
+
/**
 * Reports whether a block must be kept whole according to `config.dontSplitInside`.
 *
 * A block counts as:
 *   - "code" when its first and last lines both start with a fence marker,
 *   - "table" when every line is empty or looks like a table row/divider,
 *   - "blockquote" when every line is empty or starts with ">".
 *
 * @param {string} block
 * @param {{ dontSplitInside: string[] }} config
 * @returns {boolean}
 */
function isProtectedBlock(block, config) {
  const rows = block.trim().split("\n").map((row) => row.trim());
  const guards = (kind) => config.dontSplitInside.includes(kind);

  const fenced = FENCE_LINE_RE.test(rows[0] ?? "") && FENCE_LINE_RE.test(rows[rows.length - 1] ?? "");
  if (fenced && guards("code")) {
    return true;
  }

  const looksLikeTableRow = (row) =>
    row.length === 0 || /^\|.*\|$/.test(row) || /^\|?\s*:?-+:?\s*\|/.test(row);
  if (rows.every(looksLikeTableRow) && guards("table")) {
    return true;
  }

  const looksLikeQuoteRow = (row) => row.length === 0 || row.startsWith(">");
  return rows.every(looksLikeQuoteRow) && guards("blockquote");
}
|
|
1410
|
+
/**
 * Cuts a block that exceeds `config.maxChars` into overlapping pieces.
 *
 * Blocks that already fit, or that `isProtectedBlock` marks as unsplittable,
 * are returned as a single trimmed piece. Otherwise each cut is moved back to
 * the last space, provided that space lies beyond 60% of `maxChars` from the
 * piece start, and the next piece begins `config.overlapChars` before the cut.
 *
 * @param {string} block
 * @param {{ maxChars: number, overlapChars: number, dontSplitInside: string[] }} config
 * @returns {string[]}
 */
function splitOversizedBlock(block, config) {
  const source = block.trim();
  const fitsAlready = source.length <= config.maxChars;
  if (fitsAlready || isProtectedBlock(source, config)) {
    return [source];
  }
  const pieces = [];
  let cursor = 0;
  while (cursor < source.length) {
    const hardEnd = Math.min(cursor + config.maxChars, source.length);
    let cut = hardEnd;
    if (hardEnd < source.length) {
      const lastSpace = source.lastIndexOf(" ", hardEnd);
      const earliestBreak = cursor + Math.floor(config.maxChars * 0.6);
      if (lastSpace > earliestBreak) {
        cut = lastSpace;
      }
    }
    const piece = source.slice(cursor, cut).trim();
    if (piece) {
      pieces.push(piece);
    }
    if (cut >= source.length) {
      break;
    }
    // Step back for the overlap, but never to or before the current start.
    const rewound = Math.max(0, cut - config.overlapChars);
    cursor = rewound > cursor ? rewound : cut;
  }
  return pieces.length > 0 ? pieces : [source];
}
|
|
1437
|
+
/**
 * Turns one heading section into chunk descriptors no longer than
 * `config.maxChars` where possible.
 *
 * Steps: (1) a section that already fits becomes a single chunk; (2) otherwise
 * its blocks (and pieces of oversized blocks) are packed greedily, joined by a
 * blank line, each new chunk being seeded with the last `overlapChars`
 * characters of the previous one when that still fits; (3) chunks shorter than
 * `config.minChars` are merged into their predecessor when the result fits.
 *
 * @param {{ sectionTitle?: string, headingPath: string[], text: string }} section
 * @param {{ maxChars: number, minChars: number, overlapChars: number, dontSplitInside: string[] }} config
 * @returns {Array<{ sectionTitle?: string, headingPath: string[], chunkText: string }>}
 */
function splitSection(section, config) {
  const body = section.text.trim();
  if (!body) {
    return [];
  }
  const describe = (chunkText) => ({
    sectionTitle: section.sectionTitle,
    headingPath: section.headingPath,
    chunkText
  });
  if (body.length <= config.maxChars) {
    return [describe(body)];
  }

  const glue = (head, tail) => `${head}\n\n${tail}`;
  const packed = [];
  let pending = "";
  for (const block of blockify(body, config)) {
    for (const piece of splitOversizedBlock(block, config)) {
      if (!pending) {
        pending = piece;
        continue;
      }
      const joined = glue(pending, piece);
      if (joined.length <= config.maxChars) {
        pending = joined;
        continue;
      }
      packed.push(pending);
      const carried = pending.slice(Math.max(0, pending.length - config.overlapChars)).trim();
      const seeded = carried ? glue(carried, piece) : piece;
      // Drop the overlap when it would push the new chunk over the limit.
      pending = seeded.length <= config.maxChars ? seeded : piece;
    }
  }
  if (pending.trim()) {
    packed.push(pending.trim());
  }

  const merged = [];
  for (const chunk of packed) {
    const previous = merged[merged.length - 1];
    const mergeable =
      merged.length > 0 &&
      chunk.length < config.minChars &&
      previous !== void 0 &&
      (previous?.length ?? 0) + 2 + chunk.length <= config.maxChars;
    if (mergeable) {
      merged[merged.length - 1] = glue(previous, chunk);
    } else {
      merged.push(chunk);
    }
  }
  return merged.map((chunkText) => describe(chunkText));
}
|
|
1500
|
+
/**
 * Builds the text of a page-level summary chunk.
 *
 * Parts, joined by blank lines: the page title, the humanised URL path (if
 * non-empty), the description or — when the description is null/undefined —
 * the first paragraph of the markdown (if non-empty), and the keywords as a
 * comma-separated list (if any).
 *
 * @param {{ title: string, url: string, markdown: string, description?: string, keywords?: string[] }} page
 * @returns {string}
 */
function buildSummaryChunkText(page) {
  const readablePath = humanizeUrlPath(page.url);
  const lead = page.description ?? extractFirstParagraph(page.markdown);
  const keywords = page.keywords;
  const hasKeywords = keywords && keywords.length > 0;
  return [
    page.title,
    ...(readablePath ? [readablePath] : []),
    ...(lead ? [lead] : []),
    ...(hasKeywords ? [keywords.join(", ")] : [])
  ].join("\n\n");
}
|
|
1511
|
+
/**
 * Text sent to the embedding model for a chunk.
 *
 * With `prependTitle` falsy this is the chunk text alone. Otherwise the text
 * is preceded by "<title> — <sectionTitle>" (or just the title when the chunk
 * has no section title) and a blank line.
 *
 * @param {{ title: string, sectionTitle?: string, chunkText: string }} chunk
 * @param {boolean} prependTitle
 * @returns {string}
 */
function buildEmbeddingText(chunk, prependTitle) {
  const { title, sectionTitle, chunkText } = chunk;
  if (!prependTitle) {
    return chunkText;
  }
  let heading = title;
  if (sectionTitle) {
    heading = `${title} \u2014 ${sectionTitle}`;
  }
  return `${heading}\n\n${chunkText}`;
}
|
|
1518
|
+
/**
 * Produces the chunk records for one page.
 *
 * When `config.chunking.pageSummaryChunk` is set, a summary chunk with ordinal
 * 0 comes first and the content chunks are numbered from 1; otherwise content
 * chunks are numbered from 0. Chunk keys are SHA-1 hashes of the scope name,
 * page URL and either "__summary__" or the chunk index plus the lower-cased
 * section title. `contentHash` is the SHA-256 of the normalised embedding text.
 *
 * @param {object} page Page with url, title, markdown, depth, incomingLinks, routeFile, tags.
 * @param {object} config Reads `config.chunking`.
 * @param {{ scopeName: string }} scope
 * @returns {object[]} Chunks in ordinal order.
 */
function chunkMirrorPage(page, config, scope) {
  const settings = config.chunking;
  const sections = parseHeadingSections(page.markdown, settings.headingPathDepth);
  const rawChunks = sections.flatMap((section) => splitSection(section, settings));

  // Shared record builder; the property order matches for both chunk kinds.
  const assemble = (chunkKey, ordinal, sectionTitle, headingPath, chunkText) => {
    const record = {
      chunkKey,
      ordinal,
      url: page.url,
      path: page.url,
      title: page.title,
      sectionTitle,
      headingPath,
      chunkText,
      snippet: toSnippet(chunkText),
      depth: page.depth,
      incomingLinks: page.incomingLinks,
      routeFile: page.routeFile,
      tags: page.tags,
      contentHash: ""
    };
    const embeddingText = buildEmbeddingText(record, settings.prependTitle);
    record.contentHash = sha256(normalizeText(embeddingText));
    return record;
  };

  const chunks = [];
  if (settings.pageSummaryChunk) {
    const summaryKey = sha1(`${scope.scopeName}|${page.url}|__summary__`);
    chunks.push(assemble(summaryKey, 0, void 0, [], buildSummaryChunkText(page)));
  }
  const ordinalOffset = settings.pageSummaryChunk ? 1 : 0;
  rawChunks.forEach((entry, index) => {
    const sectionTitleNormalized = normalizeText(entry.sectionTitle ?? "").toLowerCase();
    const chunkKey = sha1(`${scope.scopeName}|${page.url}|${index}|${sectionTitleNormalized}`);
    chunks.push(
      assemble(chunkKey, index + ordinalOffset, entry.sectionTitle, entry.headingPath, entry.chunkText)
    );
  });
  return chunks;
}
|
|
1574
|
+
|
|
1575
|
+
// src/indexing/extractor.ts
|
|
1576
|
+
import { load } from "cheerio";
|
|
1577
|
+
import matter from "gray-matter";
|
|
1578
|
+
import TurndownService from "turndown";
|
|
1579
|
+
import { gfm, highlightedCodeBlock, strikethrough, tables, taskListItems } from "turndown-plugin-gfm";
|
|
1580
|
+
/**
 * Reports whether the markdown contains a `<!-- noindex -->` comment
 * (case-insensitive, optional inner whitespace) outside fenced code blocks.
 *
 * Fence tracking: a fence is closed only by a marker made of the same
 * character, at least as long as the opening marker, with nothing after it.
 * A comment shown inside a nested code sample is therefore not treated as a
 * directive.
 *
 * @param {string} markdown
 * @returns {boolean}
 */
function hasTopLevelNoindexComment(markdown) {
  const lines = markdown.split(/\r?\n/);
  let openFence = null;
  for (const line of lines) {
    const trimmed = line.trim();
    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      if (openFence === null) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        trimmed.slice(marker.length).trim() === ""
      ) {
        openFence = null;
      }
      // Lines that start with a fence marker are never scanned for the directive.
      continue;
    }
    if (openFence === null && /<!--\s*noindex\s*-->/i.test(line)) {
      return true;
    }
  }
  return false;
}
|
|
1595
|
+
/**
 * Extracts indexable content from a rendered HTML page.
 *
 * Returns null when the page opts out (robots meta containing "noindex" while
 * `extract.respectRobotsNoindex` is on, or any element carrying
 * `extract.noindexAttr`) or when the converted markdown is empty.
 *
 * Content is taken from the first `extract.mainSelector` match, falling back
 * to `<body>`. Elements matching `dropTags`, `dropSelectors` or carrying
 * `ignoreAttr` are removed before conversion. Every remaining http(s) link
 * contributes its normalised pathname to `outgoingLinks`; the link's host is
 * not checked.
 *
 * @param {string} url URL path of the page.
 * @param {string} html Page markup.
 * @param {object} config Reads `config.extract` and `config.transform`.
 * @returns {{ url: string, title: string, markdown: string, outgoingLinks: string[],
 *   noindex: false, tags: string[], description?: string, keywords?: string[] } | null}
 */
function extractFromHtml(url, html, config) {
  const extractOptions = config.extract;
  const $ = load(html);
  const normalizedUrl = normalizeUrlPath(url);
  // Placeholder origin, used only to resolve relative hrefs.
  const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
  const metaContent = (selector) => $(selector).attr("content");

  // Title preference: <title>, then the first <h1> in the main area, then the URL path.
  const title =
    normalizeText($("title").first().text() || "") ||
    normalizeText($(`${extractOptions.mainSelector} h1`).first().text() || "") ||
    normalizedUrl;

  if (extractOptions.respectRobotsNoindex) {
    const robots = metaContent("meta[name='robots']") ?? "";
    if (/\bnoindex\b/i.test(robots)) {
      return null;
    }
  }
  if ($(`[${extractOptions.noindexAttr}]`).length > 0) {
    return null;
  }

  const description =
    metaContent("meta[name='description']")?.trim() ||
    metaContent("meta[property='og:description']")?.trim() ||
    void 0;
  const keywordsRaw = metaContent("meta[name='keywords']")?.trim();
  const keywords = keywordsRaw
    ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean)
    : void 0;

  // Work on a clone so the removals below do not touch the parsed document.
  const mainNode = $(extractOptions.mainSelector).first();
  const root = (mainNode.length ? mainNode : $("body").first()).clone();
  for (const unwanted of [...extractOptions.dropTags, ...extractOptions.dropSelectors]) {
    root.find(unwanted).remove();
  }
  root.find(`[${extractOptions.ignoreAttr}]`).remove();

  const outgoingLinks = [];
  root.find("a[href]").each((_index, node) => {
    const href = $(node).attr("href");
    const skipped = !href || ["#", "mailto:", "tel:"].some((prefix) => href.startsWith(prefix));
    if (skipped) {
      return;
    }
    let target;
    try {
      target = new URL(href, pageBaseUrl);
    } catch {
      // An href that cannot be parsed is ignored.
      return;
    }
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return;
    }
    outgoingLinks.push(normalizeUrlPath(target.pathname));
  });

  const turndown = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced"
  });
  const { preserveCodeBlocks, preserveTables } = config.transform;
  const plugins =
    preserveCodeBlocks && preserveTables
      ? [gfm]
      : [
          strikethrough,
          taskListItems,
          ...(preserveTables ? [tables] : []),
          ...(preserveCodeBlocks ? [highlightedCodeBlock] : [])
        ];
  for (const plugin of plugins) {
    turndown.use(plugin);
  }

  const markdown = normalizeMarkdown(turndown.turndown(root.html() ?? ""));
  if (!normalizeText(markdown)) {
    return null;
  }
  return {
    url: normalizedUrl,
    title,
    markdown,
    outgoingLinks: [...new Set(outgoingLinks)],
    noindex: false,
    // The first path segment doubles as the page's only tag.
    tags: normalizedUrl.split("/").filter(Boolean).slice(0, 1),
    description,
    keywords
  };
}
|
|
1667
|
+
/**
 * Turn a markdown source document into an extracted-page record.
 *
 * Returns `null` (page skipped) when the document carries a top-level noindex
 * comment, when its frontmatter sets `noindex: true` or
 * `searchsocket.noindex: true`, or when no text remains after normalization.
 *
 * @param {string} url - Page URL; normalized with `normalizeUrlPath`.
 * @param {string} markdown - Raw markdown, optionally with frontmatter.
 * @param {string} [title] - Explicit title; wins over the frontmatter title.
 * @returns {object|null} Extracted page (`url`, `title`, `markdown`,
 *   `outgoingLinks`, `noindex`, `tags`, `description`, `keywords`) or `null`.
 */
function extractFromMarkdown(url, markdown, title) {
  if (hasTopLevelNoindexComment(markdown)) {
    return null;
  }
  const { data: frontmatter, content } = matter(markdown);
  const optedOut = frontmatter.noindex === true || frontmatter.searchsocket?.noindex === true;
  if (optedOut) {
    return null;
  }
  const body = normalizeMarkdown(content);
  if (!normalizeText(body)) {
    return null;
  }
  const pageUrl = normalizeUrlPath(url);
  const frontmatterTitle = typeof frontmatter.title === "string" ? frontmatter.title : void 0;
  const trimmedDescription = typeof frontmatter.description === "string" ? frontmatter.description.trim() : "";
  // Keywords may be given as a list or as a comma-separated string.
  const rawKeywords = frontmatter.keywords;
  let keywordList;
  if (Array.isArray(rawKeywords)) {
    keywordList = rawKeywords.filter((entry) => typeof entry === "string" && entry.trim().length > 0).map((entry) => entry.trim());
  } else if (typeof rawKeywords === "string" && rawKeywords.trim()) {
    keywordList = rawKeywords.split(",").map((entry) => entry.trim()).filter(Boolean);
  }
  return {
    url: pageUrl,
    title: title ?? frontmatterTitle ?? pageUrl,
    markdown: body,
    outgoingLinks: [],
    noindex: false,
    // The first path segment doubles as the page's only tag.
    tags: pageUrl.split("/").filter(Boolean).slice(0, 1),
    description: trimmedDescription || void 0,
    keywords: keywordList && keywordList.length > 0 ? keywordList : void 0
  };
}
|
|
1702
|
+
|
|
1703
|
+
// src/indexing/mirror.ts
|
|
1704
|
+
import fs4 from "fs/promises";
|
|
1705
|
+
import path5 from "path";
|
|
1706
|
+
/**
 * Encode a value for use as a frontmatter scalar by JSON-stringifying it.
 *
 * @param {*} value - Value to encode.
 * @returns {string} The result of `JSON.stringify(value)`.
 */
function yamlString(value) {
  const encoded = JSON.stringify(value);
  return encoded;
}
|
|
1709
|
+
/**
 * Render a list as an inline, bracketed sequence of JSON-encoded items
 * separated by ", ".
 *
 * @param {Array} values - Items to encode.
 * @returns {string} e.g. `["a", "b"]`, or `[]` for an empty list.
 */
function yamlArray(values) {
  const encodedItems = values.map((item) => JSON.stringify(item));
  const inner = encodedItems.join(", ");
  return "[" + inner + "]";
}
|
|
1712
|
+
/**
 * Build the mirror file content for a page: a frontmatter header followed
 * immediately by the page's normalized markdown.
 *
 * @param {object} page - Page record; reads `url`, `title`, `scope`,
 *   `routeFile`, `routeResolution`, `generatedAt`, `incomingLinks`,
 *   `outgoingLinks`, `depth`, `tags` and `markdown`.
 * @returns {string} Frontmatter block plus markdown body.
 */
function buildMirrorMarkdown(page) {
  // Key order is significant: it is the order written to the mirror file.
  const fields = [
    ["url", yamlString(page.url)],
    ["title", yamlString(page.title)],
    ["scope", yamlString(page.scope)],
    ["routeFile", yamlString(page.routeFile)],
    ["routeResolution", yamlString(page.routeResolution)],
    ["generatedAt", yamlString(page.generatedAt)],
    ["incomingLinks", `${page.incomingLinks}`],
    ["outgoingLinks", `${page.outgoingLinks}`],
    ["depth", `${page.depth}`],
    ["tags", yamlArray(page.tags)]
  ];
  const headerLines = ["---"];
  for (const [key, rendered] of fields) {
    headerLines.push(`${key}: ${rendered}`);
  }
  // The trailing empty entry yields a newline after the closing delimiter.
  headerLines.push("---", "");
  const header = headerLines.join("\n");
  return header + normalizeMarkdown(page.markdown);
}
|
|
1730
|
+
/**
 * Blank out the first `generatedAt: ...` line of a mirror file so two
 * versions can be compared without regard to their generation timestamp.
 *
 * @param {string} content - Mirror file content.
 * @returns {string} Content with the first `generatedAt` line emptied.
 */
function stripGeneratedAt(content) {
  const generatedAtLine = /^generatedAt: .*$/m;
  return content.replace(generatedAtLine, "");
}
|
|
1733
|
+
/**
 * Write a page's mirror markdown file under `<statePath>/pages/<scopeName>/`.
 *
 * The write is skipped when a file already exists whose content matches the
 * new content once the `generatedAt` line is ignored.
 *
 * @param {string} statePath - State directory root.
 * @param {object} scope - Scope record; `scope.scopeName` names the subfolder.
 * @param {object} page - Page record passed to `buildMirrorMarkdown`.
 * @returns {Promise<string>} Path of the mirror file.
 */
async function writeMirrorPage(statePath, scope, page) {
  const outputPath = path5.join(statePath, "pages", scope.scopeName, urlPathToMirrorRelative(page.url));
  await fs4.mkdir(path5.dirname(outputPath), { recursive: true });
  const newContent = buildMirrorMarkdown(page);
  let previous = null;
  try {
    previous = await fs4.readFile(outputPath, "utf8");
  } catch {
    // Unreadable or missing file: fall through and (re)write it.
    previous = null;
  }
  const unchanged = previous !== null && stripGeneratedAt(previous) === stripGeneratedAt(newContent);
  if (!unchanged) {
    await fs4.writeFile(outputPath, newContent, "utf8");
  }
  return outputPath;
}
|
|
1748
|
+
/**
 * Reset a scope's mirror directory: remove `<statePath>/pages/<scopeName>`
 * recursively, then recreate it empty.
 *
 * @param {string} statePath - State directory root.
 * @param {object} scope - Scope record; `scope.scopeName` names the subfolder.
 * @returns {Promise<void>}
 */
async function cleanMirrorForScope(statePath, scope) {
  const scopeDir = path5.join(statePath, "pages", scope.scopeName);
  const recursive = true;
  await fs4.rm(scopeDir, { recursive, force: true });
  await fs4.mkdir(scopeDir, { recursive });
}
|
|
1753
|
+
|
|
1754
|
+
// src/indexing/route-mapper.ts
|
|
1755
|
+
import path6 from "path";
|
|
1756
|
+
import fg from "fast-glob";
|
|
1757
|
+
/**
 * Convert one route directory segment into a regex fragment and a
 * specificity score.
 *
 * @param {string} segment - A single path segment of a route file path.
 * @returns {{regex: string, score: number}} Regex source fragment (with its
 *   own leading slash where applicable) and the score added for the segment.
 */
function segmentToRegex(segment) {
  // Parenthesized groups contribute nothing to the URL.
  const isGroup = segment.startsWith("(") && segment.endsWith(")");
  if (isGroup) {
    return { regex: "", score: 0 };
  }
  // Bracketed parameter shapes, checked in this order.
  const parameterShapes = [
    { shape: /^\[\[\.\.\.[^\]]+\]\]$/, regex: "(?:/.+)?", score: -2 },
    { shape: /^\[\.\.\.[^\]]+\]$/, regex: "/.+", score: 0 },
    { shape: /^\[\[[^\]]+\]\]$/, regex: "(?:/[^/]+)?", score: -1 },
    { shape: /^\[[^\]]+\]$/, regex: "/[^/]+", score: 3 }
  ];
  for (const candidate of parameterShapes) {
    if (candidate.shape.test(segment)) {
      return { regex: candidate.regex, score: candidate.score };
    }
  }
  // Literal segment: escape regex metacharacters and score it highest.
  const escaped = segment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return { regex: `/${escaped}`, score: 10 };
}
|
|
1775
|
+
/**
 * Derive a URL-matching pattern from a route page file.
 *
 * @param {string} routeFile - Path of a `+page.*` file.
 * @param {string} cwd - Project root; `routeFile` is made relative to it.
 * @returns {{routeFile: string, regex: RegExp, score: number}} The
 *   forward-slash relative file path, an anchored regex for the URLs the
 *   route serves (trailing slash optional), and the summed segment scores.
 */
function routeFileToPattern(routeFile, cwd) {
  const relative = path6.relative(cwd, routeFile).replace(/\\/g, "/");
  const withoutPrefix = relative.replace(/^src\/routes\/?/, "");
  // `(^|\/)` lets the root page ("+page.svelte" with no parent directory) be
  // stripped too. Requiring a leading slash left "+page.svelte" behind as a
  // literal segment, so the root route could never match "/".
  const withoutPage = withoutPrefix.replace(/(^|\/)\+page\.[^/]+$/, "");
  const segments = withoutPage.split("/").filter(Boolean);
  let regex = "^";
  let score = 0;
  if (segments.length === 0) {
    regex += "/";
  } else {
    for (const segment of segments) {
      const converted = segmentToRegex(segment);
      regex += converted.regex;
      score += converted.score;
    }
  }
  regex += "/?$";
  return {
    routeFile: relative,
    regex: new RegExp(regex),
    score
  };
}
|
|
1798
|
+
/**
 * Find every `src/routes/**\/+page.svelte` file under `cwd` and turn each
 * into a route pattern, ordered by descending score and then by descending
 * route file path length.
 *
 * @param {string} cwd - Project root to search from.
 * @returns {Promise<Array<{routeFile: string, regex: RegExp, score: number}>>}
 */
async function buildRoutePatterns(cwd) {
  const pageFiles = await fg("src/routes/**/+page.svelte", { cwd, absolute: true });
  const patterns = pageFiles.map((pageFile) => routeFileToPattern(pageFile, cwd));
  patterns.sort((left, right) => {
    const scoreOrder = right.score - left.score;
    return scoreOrder || right.routeFile.length - left.routeFile.length;
  });
  return patterns;
}
|
|
1805
|
+
/**
 * Resolve a URL path to the route file that serves it.
 *
 * The first pattern whose regex matches wins and is reported as "exact".
 * Otherwise the result is "best-effort": the root page if present, else the
 * first pattern, else the literal default `src/routes/+page.svelte`.
 *
 * @param {string} urlPath - URL path to resolve.
 * @param {Array<{routeFile: string, regex: RegExp}>} patterns - Candidate
 *   patterns, tried in array order.
 * @returns {{routeFile: string, routeResolution: string}}
 */
function mapUrlToRoute(urlPath, patterns) {
  const target = normalizeUrlPath(urlPath);
  const matched = patterns.find((candidate) => candidate.regex.test(target));
  if (matched) {
    return { routeFile: matched.routeFile, routeResolution: "exact" };
  }
  const rootRoute = patterns.find((candidate) => candidate.routeFile === "src/routes/+page.svelte");
  const fallbackFile = rootRoute ? rootRoute.routeFile : (patterns[0]?.routeFile ?? "src/routes/+page.svelte");
  return { routeFile: fallbackFile, routeResolution: "best-effort" };
}
|
|
1828
|
+
|
|
1829
|
+
// src/indexing/sources/build/index.ts
|
|
1830
|
+
import pLimit2 from "p-limit";
|
|
1831
|
+
|
|
1832
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
1833
|
+
import fs5 from "fs/promises";
|
|
1834
|
+
import path7 from "path";
|
|
1835
|
+
/**
 * Map a manifest route id to the path of its `+page.svelte` file.
 *
 * @param {string} routeId - Route id such as `/` or `/blog/[slug]`.
 * @returns {string} `src/routes<routeId>/+page.svelte`; the root id maps to
 *   `src/routes/+page.svelte`.
 */
function routeIdToFile(routeId) {
  // The root id must not contribute its own slash, or it would be doubled.
  const directory = routeId === "/" ? "" : routeId;
  return `src/routes${directory}/+page.svelte`;
}
|
|
1841
|
+
/**
 * Convert a manifest route id to its URL by dropping parenthesized group
 * segments. Bracketed parameter segments are kept as they are.
 *
 * @param {string} routeId - Route id such as `/(app)/blog/[slug]`.
 * @returns {string} URL path; `/` when nothing remains.
 */
function routeIdToUrl(routeId) {
  if (routeId === "/") {
    return "/";
  }
  const kept = [];
  for (const part of routeId.split("/")) {
    const isGroup = part.startsWith("(") && part.endsWith(")");
    if (!isGroup) {
      kept.push(part);
    }
  }
  const url = kept.join("/");
  return url === "" ? "/" : url;
}
|
|
1845
|
+
/**
 * Read `<outputDir>/server/manifest-full.js` and list the page routes it
 * declares.
 *
 * The file is scanned textually: each `id: "..."` occurrence starts a block
 * that runs up to the next `id:` (or end of file), and the block counts as a
 * page when it contains `page: {`.
 *
 * @param {string} cwd - Project root.
 * @param {string} outputDir - Build output directory, resolved against `cwd`.
 * @returns {Promise<Array<{id: string, isPage: boolean, isDynamic: boolean, routeFile: string}>>}
 * @throws {SearchSocketError} `BUILD_MANIFEST_NOT_FOUND` when the manifest
 *   cannot be read.
 */
async function parseManifest(cwd, outputDir) {
  const manifestPath = path7.resolve(cwd, outputDir, "server", "manifest-full.js");
  let content;
  try {
    content = await fs5.readFile(manifestPath, "utf8");
  } catch {
    throw new SearchSocketError(
      "BUILD_MANIFEST_NOT_FOUND",
      `SvelteKit build manifest not found at ${manifestPath}. Run \`vite build\` first.`
    );
  }
  const idMatches = Array.from(content.matchAll(/id:\s*"([^"]+)"/g), (match) => ({
    id: match[1],
    index: match.index
  }));
  return idMatches.flatMap((current, position) => {
    const blockEnd = idMatches[position + 1]?.index ?? content.length;
    const block = content.slice(current.index, blockEnd);
    if (!/page:\s*\{/.test(block)) {
      return [];
    }
    return [
      {
        id: current.id,
        isPage: true,
        // Any "[" in the id marks a parameterized route.
        isDynamic: current.id.includes("["),
        routeFile: routeIdToFile(current.id)
      }
    ];
  });
}
|
|
1879
|
+
/**
 * Turn manifest routes into concrete URLs to fetch.
 *
 * Static routes pass through unchanged. Dynamic routes are expanded once per
 * configured parameter value, looked up by route id first and by URL second;
 * dynamic routes without values are skipped with a warning. Excluded URLs
 * are dropped both before and after expansion.
 *
 * @param {Array<{id: string, isDynamic: boolean, routeFile: string}>} routes
 * @param {Object<string, string[]>} paramValues - Values keyed by route id or URL.
 * @param {string[]} exclude - Exclusion patterns (see `isExcluded`).
 * @param {{warn: Function}} logger3 - Logger that receives skip warnings.
 * @returns {Array<{url: string, routeFile: string}>}
 */
function expandRoutes(routes, paramValues, exclude, logger3) {
  const expanded = [];
  for (const { id, isDynamic, routeFile } of routes) {
    const url = routeIdToUrl(id);
    if (isExcluded(url, exclude)) {
      continue;
    }
    if (!isDynamic) {
      expanded.push({ url, routeFile });
      continue;
    }
    const values = paramValues[id] ?? paramValues[url];
    const hasValues = values && values.length !== 0;
    if (hasValues) {
      for (const value of values) {
        const expandedUrl = expandDynamicUrl(url, value);
        if (isExcluded(expandedUrl, exclude)) {
          continue;
        }
        expanded.push({ url: expandedUrl, routeFile });
      }
    } else {
      logger3.warn(
        `Skipping dynamic route ${id}: no paramValues provided. Add paramValues["${id}"] or paramValues["${url}"] to your build config.`
      );
    }
  }
  return expanded;
}
|
|
1904
|
+
/**
 * Substitute a concrete value for the dynamic segments of a route URL.
 *
 * Every bracketed placeholder in `url` (`[param]`, `[[optional]]`,
 * `[...rest]`, `[[...rest]]`) is replaced with the same `value`.
 *
 * @param {string} url - Route URL that still contains bracketed placeholders.
 * @param {string} value - Text inserted in place of each placeholder.
 * @returns {string} The URL with all placeholders replaced.
 */
function expandDynamicUrl(url, value) {
  // A replacer function is used instead of passing `value` directly so that
  // sequences such as `$&`, `$1` or `$$` inside the value are inserted
  // literally rather than interpreted as replacement patterns.
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, () => value);
}
|
|
1907
|
+
/**
 * Check a URL against a list of exclusion patterns.
 *
 * A pattern ending in `/*` matches the URL with that prefix (slash included)
 * and also the bare prefix without its trailing slash; any other pattern
 * must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Exclusion patterns.
 * @returns {boolean} `true` when any pattern matches.
 */
function isExcluded(url, patterns) {
  for (const pattern of patterns) {
    let matches;
    if (pattern.endsWith("/*")) {
      const withSlash = pattern.slice(0, -1);
      const bare = withSlash.slice(0, -1);
      matches = url.startsWith(withSlash) || url === bare;
    } else {
      matches = url === pattern;
    }
    if (matches) {
      return true;
    }
  }
  return false;
}
|
|
1918
|
+
|
|
1919
|
+
// src/indexing/sources/build/preview-server.ts
|
|
1920
|
+
import net from "net";
|
|
1921
|
+
import path8 from "path";
|
|
1922
|
+
import fs6 from "fs";
|
|
1923
|
+
import { spawn } from "child_process";
|
|
1924
|
+
/**
 * Ask the OS for an unused TCP port on 127.0.0.1 by briefly listening on
 * port 0, reading the assigned port, and closing the listener again.
 *
 * @returns {Promise<number>} The port that was assigned; rejects when the
 *   listener errors or reports no usable address.
 */
function findFreePort() {
  return new Promise((resolve, reject) => {
    const probe = net.createServer();
    const onListening = () => {
      const address = probe.address();
      const usable = address && typeof address !== "string";
      if (usable) {
        const { port } = address;
        probe.close(() => resolve(port));
      } else {
        probe.close(() => reject(new Error("Failed to get port")));
      }
    };
    probe.listen(0, "127.0.0.1", onListening);
    probe.on("error", reject);
  });
}
|
|
1939
|
+
/**
 * Poll `url` until the preview server responds or the timeout elapses.
 *
 * Each round first checks whether the child process has exited, then issues
 * a request limited to 2 s; any response with status below 500 counts as
 * ready. Failed requests are ignored and retried after 250 ms.
 *
 * @param {string} url - URL to probe.
 * @param {number} timeout - Overall budget in milliseconds.
 * @param {{exitCode: (number|null)}} child - The spawned preview process.
 * @returns {Promise<void>} Resolves once the server is ready.
 * @throws {SearchSocketError} `BUILD_SERVER_FAILED` when the child exits
 *   first or the deadline passes.
 */
async function waitForReady(url, timeout, child) {
  const deadline = Date.now() + timeout;
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const probe = async () => {
    try {
      const res = await fetch(url, { signal: AbortSignal.timeout(2e3) });
      return res.status < 500;
    } catch {
      // Connection refused / timed out: the server is not up yet.
      return false;
    }
  };
  while (Date.now() < deadline) {
    if (child.exitCode !== null) {
      throw new SearchSocketError(
        "BUILD_SERVER_FAILED",
        `vite preview exited with code ${child.exitCode} before becoming ready.`
      );
    }
    if (await probe()) {
      return;
    }
    await pause(250);
  }
  throw new SearchSocketError(
    "BUILD_SERVER_FAILED",
    `vite preview did not become ready within ${timeout}ms. Check that \`vite build\` completed successfully.`
  );
}
|
|
1960
|
+
/**
 * Launch `vite preview` for the project on a free local port and wait until
 * it answers requests.
 *
 * @param {string} cwd - Project root; the vite binary is looked up in its
 *   `node_modules/.bin`.
 * @param {{previewTimeout: number}} options - Readiness timeout in ms.
 * @param {{event: Function, warn: Function}} logger3 - Logger for lifecycle
 *   events and captured stderr.
 * @returns {Promise<{baseUrl: string, port: number, shutdown: Function}>}
 *   Server address plus an async `shutdown()` that stops the process.
 * @throws {SearchSocketError} `BUILD_SERVER_FAILED` when the binary is
 *   missing or the server never becomes ready.
 */
async function startPreviewServer(cwd, options, logger3) {
  const viteBin = path8.join(cwd, "node_modules", ".bin", "vite");
  if (!fs6.existsSync(viteBin)) {
    throw new SearchSocketError(
      "BUILD_SERVER_FAILED",
      `vite binary not found at ${viteBin}. Ensure vite is installed.`
    );
  }
  const port = await findFreePort();
  const baseUrl = `http://127.0.0.1:${port}`;
  logger3.event("preview_server_starting", { port });
  const child = spawn(viteBin, ["preview", "--port", String(port), "--strictPort", "--host", "127.0.0.1"], {
    cwd,
    stdio: ["ignore", "pipe", "pipe"],
    env: { ...process.env }
  });
  // stdout is piped but not used; keep it flowing so the child can never
  // block on a full pipe.
  child.stdout?.resume();
  // Only the first 500 characters of stderr are ever reported, so stop
  // accumulating once that much has been captured.
  const stderrLimit = 500;
  let stderr = "";
  child.stderr?.on("data", (chunk) => {
    if (stderr.length < stderrLimit) {
      stderr += chunk.toString();
    }
  });
  // Without a listener, an "error" event from the child (e.g. the binary
  // cannot be executed) would be thrown as an uncaught exception.
  let spawnError;
  child.on("error", (error) => {
    spawnError = error;
  });
  // A process ended by a signal reports signalCode while exitCode stays null.
  const hasExited = () => child.exitCode !== null || child.signalCode !== null;
  const shutdown = async () => {
    if (hasExited()) return;
    await new Promise((resolve) => {
      // Escalate to SIGKILL if the process has not closed after 3 seconds.
      const forceKill = setTimeout(() => {
        if (!hasExited()) child.kill("SIGKILL");
        resolve();
      }, 3e3);
      child.once("close", () => {
        // Cancel the escalation timer so it does not keep the event loop
        // alive after a clean shutdown.
        clearTimeout(forceKill);
        resolve();
      });
      child.kill("SIGTERM");
    });
  };
  try {
    await waitForReady(baseUrl, options.previewTimeout, child);
  } catch (error) {
    await shutdown();
    if (spawnError) {
      logger3.warn(`vite preview failed to start: ${spawnError.message}`);
    }
    if (stderr) {
      logger3.warn(`vite preview stderr: ${stderr.slice(0, 500)}`);
    }
    throw error;
  }
  logger3.event("preview_server_ready", { port, baseUrl });
  return { baseUrl, port, shutdown };
}
|
|
2003
|
+
|
|
2004
|
+
// src/indexing/sources/build/index.ts
|
|
2005
|
+
// Module-level logger used by loadBuildPages for route discovery events and skip warnings.
var logger = new Logger();
|
|
2006
|
+
/**
 * Load pages for the "build" source: read routes from the build manifest,
 * expand them, start a preview server, and fetch each route's HTML.
 *
 * Routes are fetched with at most 8 requests in flight. A route whose fetch
 * fails is logged and skipped. The preview server is always shut down,
 * whether or not fetching succeeds.
 *
 * @param {string} cwd - Project root.
 * @param {object} config - Resolved config; reads `config.source.build`.
 * @param {number} [maxPages] - Optional cap on the number of routes fetched.
 * @returns {Promise<Array<object>>} Source pages carrying `url`, `html`,
 *   `sourcePath`, `outgoingLinks`, `routeFile` and `routeResolution`.
 * @throws {Error} When `config.source.build` is missing.
 */
async function loadBuildPages(cwd, config, maxPages) {
  const buildConfig = config.source.build;
  if (!buildConfig) {
    throw new Error("build source config is missing");
  }
  const routes = await parseManifest(cwd, buildConfig.outputDir);
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
  logger.event("build_routes_discovered", {
    manifestRoutes: routes.length,
    expandedRoutes: expanded.length
  });
  const selected = typeof maxPages === "number" ? expanded.slice(0, Math.max(0, Math.floor(maxPages))) : expanded;
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
  try {
    const concurrencyLimit = pLimit2(8);
    const fetchRoute = async (route) => {
      const response = await fetch(joinUrl(server.baseUrl, route.url));
      if (!response.ok) {
        throw new Error(`Failed to fetch ${route.url}: ${response.status} ${response.statusText}`);
      }
      return {
        url: normalizeUrlPath(route.url),
        html: await response.text(),
        sourcePath: route.routeFile,
        outgoingLinks: [],
        routeFile: route.routeFile,
        routeResolution: "exact"
      };
    };
    const results = await Promise.allSettled(
      selected.map((route) => concurrencyLimit(() => fetchRoute(route)))
    );
    const pages = [];
    results.forEach((result, position) => {
      if (!result) return;
      if (result.status === "fulfilled") {
        pages.push(result.value);
        return;
      }
      const route = selected[position]?.url ?? "unknown";
      const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
      logger.warn(`Skipping build route ${route}: ${reason}`);
    });
    return pages;
  } finally {
    await server.shutdown();
  }
}
|
|
2059
|
+
|
|
2060
|
+
// src/indexing/sources/content-files.ts
|
|
2061
|
+
import fs7 from "fs/promises";
|
|
2062
|
+
import path9 from "path";
|
|
2063
|
+
import fg2 from "fast-glob";
|
|
2064
|
+
/**
 * Derive a site URL from a content file path.
 *
 * `+page.svelte` files are treated as routes: a leading `src/routes` or
 * `routes` prefix is removed, segments starting with "(" are dropped, and
 * bracketed parameter segments become the placeholders `optional`, `splat`
 * or `param`. Any other file has a `.md` extension and a trailing `index`
 * removed, so `docs/index.md` maps to `docs` and a root `index.md` to `/`.
 *
 * @param {string} filePath - Path of the content file.
 * @param {string} baseDir - Directory that `filePath` is made relative to.
 * @returns {string} The result of `normalizeUrlPath` on the derived path.
 */
function filePathToUrl(filePath, baseDir) {
  const relative = path9.relative(baseDir, filePath).replace(/\\/g, "/");
  const segments = relative.split("/").filter(Boolean);
  if (/(^|\/)\+page\.svelte$/.test(relative)) {
    const routeSegments = segments.slice();
    if ((routeSegments[0] ?? "").toLowerCase() === "src" && (routeSegments[1] ?? "").toLowerCase() === "routes") {
      routeSegments.splice(0, 2);
    } else if ((routeSegments[0] ?? "").toLowerCase() === "routes") {
      routeSegments.splice(0, 1);
    }
    const routePath = routeSegments.filter((segment) => segment !== "+page.svelte").filter((segment) => segment && !segment.startsWith("(")).map(
      (segment) => segment.replace(/^\[\[[^\]]+\]\]$/, "optional").replace(/^\[\.\.\.[^\]]+\]$/, "splat").replace(/^\[[^\]]+\]$/, "param")
    ).join("/");
    return normalizeUrlPath(routePath || "/");
  }
  // `(^|\/)` also strips a bare root-level "index"; requiring a leading slash
  // left it in place and mapped the root index file to "/index".
  const noExt = relative.replace(/\.md$/i, "").replace(/(^|\/)index$/i, "");
  return normalizeUrlPath(noExt || "/");
}
|
|
2082
|
+
/**
 * Reduce Svelte component source to plain text: remove `<script>` and
 * `<style>` blocks, replace remaining tags and `{...}` expressions with
 * spaces, collapse whitespace runs, and trim.
 *
 * @param {string} source - Svelte component source.
 * @returns {string} Single-line text content.
 */
function normalizeSvelteToMarkdown(source) {
  // Applied in order; block removal must precede generic tag stripping.
  const passes = [
    [/<script[\s\S]*?<\/script>/g, ""],
    [/<style[\s\S]*?<\/style>/g, ""],
    [/<[^>]+>/g, " "],
    [/\{[^}]+\}/g, " "],
    [/\s+/g, " "]
  ];
  let text = source;
  for (const [pattern, replacement] of passes) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
2085
|
+
/**
 * Load pages for the "content-files" source by globbing files under the
 * configured base directory and reading each one.
 *
 * Markdown files are passed through as-is; every other matched file is
 * reduced to text with `normalizeSvelteToMarkdown`.
 *
 * @param {string} cwd - Project root.
 * @param {object} config - Resolved config; reads `config.source.contentFiles`
 *   (`baseDir`, `globs`).
 * @param {number} [maxPages] - Optional cap on the number of files read.
 * @returns {Promise<Array<{url: string, markdown: string, sourcePath: string, outgoingLinks: Array}>>}
 * @throws {Error} When `config.source.contentFiles` is missing.
 */
async function loadContentFilesPages(cwd, config, maxPages) {
  const contentConfig = config.source.contentFiles;
  if (!contentConfig) {
    throw new Error("content-files config is missing");
  }
  const baseDir = path9.resolve(cwd, contentConfig.baseDir);
  const files = await fg2(contentConfig.globs, {
    cwd: baseDir,
    absolute: true,
    onlyFiles: true
  });
  const limit = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
  const selected = typeof limit === "number" ? files.slice(0, limit) : files;
  const pages = [];
  for (const filePath of selected) {
    const raw = await fs7.readFile(filePath, "utf8");
    // Case-insensitive, matching how filePathToUrl strips the extension, so
    // a ".MD" file is not mistaken for a Svelte component and tag-stripped.
    const isMarkdown = /\.md$/i.test(filePath);
    const markdown = isMarkdown ? raw : normalizeSvelteToMarkdown(raw);
    pages.push({
      url: filePathToUrl(filePath, baseDir),
      markdown,
      sourcePath: path9.relative(cwd, filePath).replace(/\\/g, "/"),
      outgoingLinks: []
    });
  }
  return pages;
}
|
|
2111
|
+
|
|
2112
|
+
// src/indexing/sources/crawl.ts
|
|
2113
|
+
import { gunzipSync } from "zlib";
|
|
2114
|
+
import { load as cheerioLoad } from "cheerio";
|
|
2115
|
+
import pLimit3 from "p-limit";
|
|
2116
|
+
// Module-level logger used by loadCrawledPages to report routes it skips.
var logger2 = new Logger();
|
|
2117
|
+
/**
 * Collect the trimmed, non-empty text of every `<loc>` element in a sitemap
 * XML document, in document order.
 *
 * @param {string} xml - Sitemap or sitemap-index XML.
 * @returns {string[]} The `<loc>` values.
 */
function extractLocs(xml) {
  const $ = cheerioLoad(xml, { xmlMode: true });
  const found = [];
  $("loc").each((_position, element) => {
    const value = $(element).text().trim();
    if (!value) {
      return;
    }
    found.push(value);
  });
  return found;
}
|
|
2128
|
+
/**
 * Tell whether an XML document contains a `<sitemapindex>` element, i.e. is
 * an index of other sitemaps rather than a list of page URLs.
 *
 * @param {string} xml - Sitemap XML.
 * @returns {boolean}
 */
function isSitemapIndex(xml) {
  const $ = cheerioLoad(xml, { xmlMode: true });
  const indexElements = $("sitemapindex");
  return indexElements.length > 0;
}
|
|
2132
|
+
/**
 * Download a sitemap and return its XML text, decompressing `.gz` sitemaps.
 *
 * @param {string} url - Absolute sitemap URL.
 * @returns {Promise<string>} The sitemap XML.
 * @throws {Error} When the response status is not OK.
 */
async function fetchSitemapXml(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch sitemap ${url}: ${res.status} ${res.statusText}`);
  }
  if (!url.endsWith(".gz")) {
    return res.text();
  }
  const buffer = Buffer.from(await res.arrayBuffer());
  // A ".gz" URL may arrive already decompressed (e.g. when the server sends
  // it with Content-Encoding: gzip and fetch decodes it transparently).
  // Only gunzip when the payload really starts with the gzip magic bytes.
  const isGzip = buffer.length >= 2 && buffer[0] === 0x1f && buffer[1] === 0x8b;
  return (isGzip ? gunzipSync(buffer) : buffer).toString("utf8");
}
|
|
2143
|
+
/**
 * Resolve a sitemap reference to an absolute URL.
 *
 * @param {string} baseUrl - Site base URL used for relative references.
 * @param {string} candidate - Absolute `http(s)://` URL or a relative path.
 * @returns {string} `candidate` unchanged when it is already absolute,
 *   otherwise `joinUrl(baseUrl, candidate)`.
 */
function resolveSitemapUrl(baseUrl, candidate) {
  // Require a full scheme so relative paths that merely begin with "http"
  // (e.g. "http-archive/sitemap.xml") are still joined onto the base URL.
  const isAbsolute = /^https?:\/\//i.test(candidate);
  return isAbsolute ? candidate : joinUrl(baseUrl, candidate);
}
|
|
2146
|
+
/**
 * Extract the unique route paths listed in a sitemap document.
 *
 * A sitemap index is followed into each child sitemap in turn. For a regular
 * sitemap, each `<loc>` is parsed as a URL (relative ones against `baseUrl`),
 * non-http(s) and unparseable entries are skipped, and only the normalized
 * pathname is kept.
 *
 * @param {string} xml - Sitemap XML.
 * @param {string} baseUrl - Base URL for relative entries and child sitemaps.
 * @param {Set<string>} visitedSitemaps - Already-fetched sitemap URLs.
 * @returns {Promise<string[]>} De-duplicated route paths.
 */
async function parseSitemap(xml, baseUrl, visitedSitemaps) {
  const collected = [];
  if (isSitemapIndex(xml)) {
    for (const childUrl of extractLocs(xml)) {
      const childRoutes = await parseSitemapFromUrl(childUrl, baseUrl, visitedSitemaps);
      collected.push(...childRoutes);
    }
    return [...new Set(collected)];
  }
  for (const loc of extractLocs(xml)) {
    try {
      const isAbsolute = loc.startsWith("http://") || loc.startsWith("https://");
      const parsed = isAbsolute ? new URL(loc) : new URL(loc, baseUrl);
      if (["http:", "https:"].includes(parsed.protocol)) {
        collected.push(normalizeUrlPath(parsed.pathname));
      }
    } catch {
      // Entry could not be parsed as a URL; skip it.
    }
  }
  return [...new Set(collected)];
}
|
|
2170
|
+
/**
 * Fetch a sitemap by URL and parse it, visiting each sitemap URL at most
 * once per `visitedSitemaps` set.
 *
 * @param {string} url - Sitemap URL, absolute or relative to `baseUrl`.
 * @param {string} baseUrl - Site base URL.
 * @param {Set<string>} visitedSitemaps - Resolved sitemap URLs already seen;
 *   the resolved URL is added before fetching.
 * @returns {Promise<string[]>} Route paths, or `[]` for an already-visited
 *   sitemap.
 */
async function parseSitemapFromUrl(url, baseUrl, visitedSitemaps) {
  const sitemapUrl = resolveSitemapUrl(baseUrl, url);
  const alreadySeen = visitedSitemaps.has(sitemapUrl);
  if (!alreadySeen) {
    visitedSitemaps.add(sitemapUrl);
    const xml = await fetchSitemapXml(sitemapUrl);
    return parseSitemap(xml, baseUrl, visitedSitemaps);
  }
  return [];
}
|
|
2179
|
+
/**
 * Decide which routes the crawl source should fetch.
 *
 * Explicitly configured routes win (normalized and de-duplicated); otherwise
 * the configured sitemap is read; with neither, only `/` is crawled.
 *
 * @param {object} config - Resolved config; reads `config.source.crawl`
 *   (`routes`, `sitemapUrl`, `baseUrl`).
 * @returns {Promise<string[]>} Route paths; `[]` when crawl is not configured.
 */
async function resolveRoutes(config) {
  const crawlConfig = config.source.crawl;
  if (!crawlConfig) {
    return [];
  }
  if (crawlConfig.routes.length > 0) {
    const normalized = crawlConfig.routes.map((route) => normalizeUrlPath(ensureLeadingSlash(route)));
    return Array.from(new Set(normalized));
  }
  if (crawlConfig.sitemapUrl) {
    return parseSitemapFromUrl(crawlConfig.sitemapUrl, crawlConfig.baseUrl, /* @__PURE__ */ new Set());
  }
  return ["/"];
}
|
|
2192
|
+
/**
 * Load pages for the "crawl" source by fetching each resolved route from the
 * configured base URL.
 *
 * Routes are fetched with at most 8 requests in flight. A route whose fetch
 * fails or returns a non-OK status is logged and skipped.
 *
 * @param {object} config - Resolved config; reads `config.source.crawl`.
 * @param {number} [maxPages] - Optional cap on the number of routes fetched.
 * @returns {Promise<Array<{url: string, html: string, sourcePath: string, outgoingLinks: Array}>>}
 * @throws {Error} When `config.source.crawl` is missing.
 */
async function loadCrawledPages(config, maxPages) {
  const crawlConfig = config.source.crawl;
  if (!crawlConfig) {
    throw new Error("crawl source config is missing");
  }
  const routes = await resolveRoutes(config);
  const selected = typeof maxPages === "number" ? routes.slice(0, Math.max(0, Math.floor(maxPages))) : routes;
  const concurrencyLimit = pLimit3(8);
  const fetchRoute = async (route) => {
    const url = joinUrl(crawlConfig.baseUrl, route);
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch route ${route}: ${response.status} ${response.statusText}`);
    }
    return {
      url: normalizeUrlPath(route),
      html: await response.text(),
      sourcePath: url,
      outgoingLinks: []
    };
  };
  const results = await Promise.allSettled(
    selected.map((route) => concurrencyLimit(() => fetchRoute(route)))
  );
  const pages = [];
  results.forEach((result, position) => {
    if (!result) return;
    if (result.status === "fulfilled") {
      pages.push(result.value);
      return;
    }
    const route = selected[position] ?? "unknown";
    logger2.warn(`Skipping route ${route}: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`);
  });
  return pages;
}
|
|
2231
|
+
|
|
2232
|
+
// src/indexing/sources/static-output.ts
|
|
2233
|
+
import fs8 from "fs/promises";
|
|
2234
|
+
import path10 from "path";
|
|
2235
|
+
import fg3 from "fast-glob";
|
|
2236
|
+
/**
 * Load pages for the "static-output" source by reading every `.html` file
 * under the configured static output directory, one at a time.
 *
 * @param {string} cwd - Project root.
 * @param {object} config - Resolved config; reads `config.source.staticOutputDir`.
 * @param {number} [maxPages] - Optional cap on the number of files read.
 * @returns {Promise<Array<{url: string, html: string, sourcePath: string, outgoingLinks: Array}>>}
 */
async function loadStaticOutputPages(cwd, config, maxPages) {
  const outputDir = path10.resolve(cwd, config.source.staticOutputDir);
  const htmlFiles = await fg3(["**/*.html"], { cwd: outputDir, absolute: true });
  const selected = typeof maxPages === "number" ? htmlFiles.slice(0, Math.max(0, Math.floor(maxPages))) : htmlFiles;
  const pages = [];
  for (const filePath of selected) {
    const html = await fs8.readFile(filePath, "utf8");
    const url = staticHtmlFileToUrl(filePath, outputDir);
    // Report the source path relative to the project, with forward slashes.
    const sourcePath = path10.relative(cwd, filePath).replace(/\\/g, "/");
    pages.push({ url, html, sourcePath, outgoingLinks: [] });
  }
  return pages;
}
|
|
2256
|
+
|
|
2257
|
+
// src/utils/time.ts
|
|
2258
|
+
/**
 * Current time as an ISO 8601 string (`Date.prototype.toISOString`).
 *
 * @returns {string}
 */
function nowIso() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
|
|
2261
|
+
/**
 * Milliseconds elapsed since a `process.hrtime.bigint()` reading.
 *
 * @param {bigint} start - Earlier `process.hrtime.bigint()` value (nanoseconds).
 * @returns {number} Elapsed time in milliseconds, possibly fractional.
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
|
|
2264
|
+
|
|
2265
|
+
// src/indexing/pipeline.ts
|
|
2266
|
+
// Price in USD per 1,000 tokens, keyed by embedding model name.
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
  "text-embedding-3-small": 0.00002,
  "text-embedding-3-large": 0.00013,
  "text-embedding-ada-002": 0.0001
};
|
|
2271
|
+
// Default per-1K-token price in USD; same value as the "text-embedding-3-small" entry above.
// NOTE(review): presumably the fallback for models missing from the table — confirm at the use site.
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
2272
|
+
var IndexPipeline = class _IndexPipeline {
|
|
2273
|
+
cwd;
|
|
2274
|
+
config;
|
|
2275
|
+
embeddings;
|
|
2276
|
+
vectorStore;
|
|
2277
|
+
logger;
|
|
2278
|
+
constructor(options) {
|
|
2279
|
+
this.cwd = options.cwd;
|
|
2280
|
+
this.config = options.config;
|
|
2281
|
+
this.embeddings = options.embeddings;
|
|
2282
|
+
this.vectorStore = options.vectorStore;
|
|
2283
|
+
this.logger = options.logger;
|
|
2284
|
+
}
|
|
2285
|
+
static async create(options = {}) {
|
|
2286
|
+
const cwd = path11.resolve(options.cwd ?? process.cwd());
|
|
2287
|
+
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
2288
|
+
const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
|
|
2289
|
+
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
2290
|
+
return new _IndexPipeline({
|
|
2291
|
+
cwd,
|
|
2292
|
+
config,
|
|
2293
|
+
embeddings,
|
|
2294
|
+
vectorStore,
|
|
2295
|
+
logger: options.logger ?? new Logger()
|
|
2296
|
+
});
|
|
2297
|
+
}
|
|
2298
|
+
getConfig() {
|
|
2299
|
+
return this.config;
|
|
2300
|
+
}
|
|
2301
|
+
async run(rawOptions = {}) {
|
|
2302
|
+
const options = {
|
|
2303
|
+
changedOnly: rawOptions.changedOnly ?? true,
|
|
2304
|
+
force: rawOptions.force ?? false,
|
|
2305
|
+
dryRun: rawOptions.dryRun ?? false,
|
|
2306
|
+
...rawOptions
|
|
2307
|
+
};
|
|
2308
|
+
const stageTimingsMs = {};
|
|
2309
|
+
const stageStart = () => process.hrtime.bigint();
|
|
2310
|
+
const stageEnd = (name, start) => {
|
|
2311
|
+
stageTimingsMs[name] = Math.round(hrTimeMs(start));
|
|
2312
|
+
};
|
|
2313
|
+
const scope = resolveScope(this.config, options.scopeOverride);
|
|
2314
|
+
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
2315
|
+
if (options.force) {
|
|
2316
|
+
await cleanMirrorForScope(statePath, scope);
|
|
2317
|
+
}
|
|
2318
|
+
const manifestStart = stageStart();
|
|
2319
|
+
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
2320
|
+
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
2321
|
+
if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
|
|
2322
|
+
throw new SearchSocketError(
|
|
2323
|
+
"EMBEDDING_MODEL_MISMATCH",
|
|
2324
|
+
`Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
|
|
2325
|
+
);
|
|
2326
|
+
}
|
|
2327
|
+
stageEnd("manifest", manifestStart);
|
|
2328
|
+
const sourceStart = stageStart();
|
|
2329
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
2330
|
+
let sourcePages;
|
|
2331
|
+
if (sourceMode === "static-output") {
|
|
2332
|
+
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
2333
|
+
} else if (sourceMode === "crawl") {
|
|
2334
|
+
sourcePages = await loadCrawledPages(this.config, options.maxPages);
|
|
2335
|
+
} else if (sourceMode === "build") {
|
|
2336
|
+
sourcePages = await loadBuildPages(this.cwd, this.config, options.maxPages);
|
|
2337
|
+
} else {
|
|
2338
|
+
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
2339
|
+
}
|
|
2340
|
+
stageEnd("source", sourceStart);
|
|
2341
|
+
const routeStart = stageStart();
|
|
2342
|
+
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2343
|
+
stageEnd("route_map", routeStart);
|
|
2344
|
+
const extractStart = stageStart();
|
|
2345
|
+
const extractedPages = [];
|
|
2346
|
+
for (const sourcePage of sourcePages) {
|
|
2347
|
+
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
2348
|
+
if (!extracted) {
|
|
2349
|
+
this.logger.warn(
|
|
2350
|
+
`Page ${sourcePage.url} produced no extractable content and was skipped. Check extract.mainSelector, extract.dropTags, and extract.dropSelectors settings.`
|
|
2351
|
+
);
|
|
2352
|
+
continue;
|
|
2353
|
+
}
|
|
2354
|
+
extractedPages.push(extracted);
|
|
2355
|
+
this.logger.event("page_extracted", {
|
|
2356
|
+
url: extracted.url
|
|
2357
|
+
});
|
|
2358
|
+
}
|
|
2359
|
+
extractedPages.sort((a, b) => a.url.localeCompare(b.url));
|
|
2360
|
+
const uniquePages = [];
|
|
2361
|
+
const seenUrls = /* @__PURE__ */ new Set();
|
|
2362
|
+
for (const page of extractedPages) {
|
|
2363
|
+
if (seenUrls.has(page.url)) {
|
|
2364
|
+
this.logger.warn(
|
|
2365
|
+
`Duplicate page source for ${page.url}; keeping first extracted page and skipping the duplicate.`
|
|
2366
|
+
);
|
|
2367
|
+
continue;
|
|
2368
|
+
}
|
|
2369
|
+
seenUrls.add(page.url);
|
|
2370
|
+
uniquePages.push(page);
|
|
2371
|
+
}
|
|
2372
|
+
stageEnd("extract", extractStart);
|
|
2373
|
+
const linkStart = stageStart();
|
|
2374
|
+
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
2375
|
+
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
2376
|
+
for (const page of uniquePages) {
|
|
2377
|
+
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
2378
|
+
}
|
|
2379
|
+
for (const page of uniquePages) {
|
|
2380
|
+
for (const outgoing of page.outgoingLinks) {
|
|
2381
|
+
if (!pageSet.has(outgoing)) {
|
|
2382
|
+
continue;
|
|
2383
|
+
}
|
|
2384
|
+
incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
|
|
2385
|
+
}
|
|
2386
|
+
}
|
|
2387
|
+
stageEnd("links", linkStart);
|
|
2388
|
+
const mirrorStart = stageStart();
|
|
2389
|
+
const mirrorPages = [];
|
|
2390
|
+
let routeExact = 0;
|
|
2391
|
+
let routeBestEffort = 0;
|
|
2392
|
+
const precomputedRoutes = /* @__PURE__ */ new Map();
|
|
2393
|
+
for (const sp of sourcePages) {
|
|
2394
|
+
if (sp.routeFile) {
|
|
2395
|
+
precomputedRoutes.set(normalizeUrlPath(sp.url), {
|
|
2396
|
+
routeFile: sp.routeFile,
|
|
2397
|
+
routeResolution: sp.routeResolution ?? "exact"
|
|
2398
|
+
});
|
|
2399
|
+
}
|
|
2400
|
+
}
|
|
2401
|
+
for (const page of uniquePages) {
|
|
2402
|
+
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
2403
|
+
if (routeMatch.routeResolution === "best-effort") {
|
|
2404
|
+
if (this.config.source.strictRouteMapping) {
|
|
2405
|
+
throw new SearchSocketError(
|
|
2406
|
+
"ROUTE_MAPPING_FAILED",
|
|
2407
|
+
`Strict route mapping enabled: no exact route match for ${page.url} (resolved to ${routeMatch.routeFile}). Disable source.strictRouteMapping or add the missing route file.`,
|
|
2408
|
+
400
|
|
2409
|
+
);
|
|
2410
|
+
}
|
|
2411
|
+
this.logger.warn(
|
|
2412
|
+
`No exact route match for ${page.url}, falling back to ${routeMatch.routeFile}.`
|
|
2413
|
+
);
|
|
2414
|
+
routeBestEffort += 1;
|
|
2415
|
+
} else {
|
|
2416
|
+
routeExact += 1;
|
|
2417
|
+
}
|
|
2418
|
+
const mirror = {
|
|
2419
|
+
url: page.url,
|
|
2420
|
+
title: page.title,
|
|
2421
|
+
scope: scope.scopeName,
|
|
2422
|
+
routeFile: routeMatch.routeFile,
|
|
2423
|
+
routeResolution: routeMatch.routeResolution,
|
|
2424
|
+
generatedAt: nowIso(),
|
|
2425
|
+
incomingLinks: incomingLinkCount.get(page.url) ?? 0,
|
|
2426
|
+
outgoingLinks: page.outgoingLinks.length,
|
|
2427
|
+
depth: getUrlDepth(page.url),
|
|
2428
|
+
tags: page.tags,
|
|
2429
|
+
markdown: page.markdown,
|
|
2430
|
+
description: page.description,
|
|
2431
|
+
keywords: page.keywords
|
|
2432
|
+
};
|
|
2433
|
+
mirrorPages.push(mirror);
|
|
2434
|
+
if (this.config.state.writeMirror) {
|
|
2435
|
+
await writeMirrorPage(statePath, scope, mirror);
|
|
2436
|
+
}
|
|
2437
|
+
this.logger.event("markdown_written", { url: page.url });
|
|
2438
|
+
}
|
|
2439
|
+
if (!options.dryRun) {
|
|
2440
|
+
const pageRecords = mirrorPages.map((mp) => ({
|
|
2441
|
+
url: mp.url,
|
|
2442
|
+
title: mp.title,
|
|
2443
|
+
markdown: mp.markdown,
|
|
2444
|
+
projectId: scope.projectId,
|
|
2445
|
+
scopeName: scope.scopeName,
|
|
2446
|
+
routeFile: mp.routeFile,
|
|
2447
|
+
routeResolution: mp.routeResolution,
|
|
2448
|
+
incomingLinks: mp.incomingLinks,
|
|
2449
|
+
outgoingLinks: mp.outgoingLinks,
|
|
2450
|
+
depth: mp.depth,
|
|
2451
|
+
tags: mp.tags,
|
|
2452
|
+
indexedAt: mp.generatedAt
|
|
2453
|
+
}));
|
|
2454
|
+
await this.vectorStore.deletePages(scope);
|
|
2455
|
+
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
2456
|
+
}
|
|
2457
|
+
stageEnd("mirror", mirrorStart);
|
|
2458
|
+
const chunkStart = stageStart();
|
|
2459
|
+
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
2460
|
+
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
2461
|
+
if (typeof maxChunks === "number") {
|
|
2462
|
+
chunks = chunks.slice(0, maxChunks);
|
|
2463
|
+
}
|
|
2464
|
+
for (const chunk of chunks) {
|
|
2465
|
+
this.logger.event("chunked", {
|
|
2466
|
+
url: chunk.url,
|
|
2467
|
+
chunkKey: chunk.chunkKey
|
|
2468
|
+
});
|
|
2469
|
+
}
|
|
2470
|
+
stageEnd("chunk", chunkStart);
|
|
2471
|
+
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
2472
|
+
for (const chunk of chunks) {
|
|
2473
|
+
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
2474
|
+
}
|
|
2475
|
+
const changedChunks = chunks.filter((chunk) => {
|
|
2476
|
+
if (options.force) {
|
|
2477
|
+
return true;
|
|
2478
|
+
}
|
|
2479
|
+
const existingHash = existingHashes.get(chunk.chunkKey);
|
|
2480
|
+
if (!existingHash) {
|
|
2481
|
+
return true;
|
|
2482
|
+
}
|
|
2483
|
+
if (!options.changedOnly) {
|
|
2484
|
+
return true;
|
|
2485
|
+
}
|
|
2486
|
+
return existingHash !== chunk.contentHash;
|
|
2487
|
+
});
|
|
2488
|
+
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
2489
|
+
const embedStart = stageStart();
|
|
2490
|
+
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
2491
|
+
for (const chunk of changedChunks) {
|
|
2492
|
+
chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
|
|
2493
|
+
}
|
|
2494
|
+
const estimatedTokens = changedChunks.reduce(
|
|
2495
|
+
(sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
|
|
2496
|
+
0
|
|
2497
|
+
);
|
|
2498
|
+
const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
|
|
2499
|
+
const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
|
|
2500
|
+
let newEmbeddings = 0;
|
|
2501
|
+
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
2502
|
+
if (!options.dryRun && changedChunks.length > 0) {
|
|
2503
|
+
const embeddings = await this.embeddings.embedTexts(
|
|
2504
|
+
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
2505
|
+
this.config.embeddings.model
|
|
2506
|
+
);
|
|
2507
|
+
if (embeddings.length !== changedChunks.length) {
|
|
2508
|
+
throw new SearchSocketError(
|
|
2509
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
2510
|
+
`Embedding provider returned ${embeddings.length} vectors for ${changedChunks.length} chunks.`
|
|
2511
|
+
);
|
|
2512
|
+
}
|
|
2513
|
+
for (let i = 0; i < changedChunks.length; i += 1) {
|
|
2514
|
+
const chunk = changedChunks[i];
|
|
2515
|
+
const embedding = embeddings[i];
|
|
2516
|
+
if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
|
|
2517
|
+
throw new SearchSocketError(
|
|
2518
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
2519
|
+
`Embedding provider returned an invalid vector for chunk index ${i}.`
|
|
2520
|
+
);
|
|
2521
|
+
}
|
|
2522
|
+
vectorsByChunk.set(chunk.chunkKey, embedding);
|
|
2523
|
+
newEmbeddings += 1;
|
|
2524
|
+
this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
|
|
2525
|
+
}
|
|
2526
|
+
}
|
|
2527
|
+
stageEnd("embedding", embedStart);
|
|
2528
|
+
const syncStart = stageStart();
|
|
2529
|
+
if (!options.dryRun) {
|
|
2530
|
+
const upserts = [];
|
|
2531
|
+
for (const chunk of changedChunks) {
|
|
2532
|
+
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
2533
|
+
if (!vector) {
|
|
2534
|
+
continue;
|
|
2535
|
+
}
|
|
2536
|
+
upserts.push({
|
|
2537
|
+
id: chunk.chunkKey,
|
|
2538
|
+
vector,
|
|
2539
|
+
metadata: {
|
|
2540
|
+
projectId: scope.projectId,
|
|
2541
|
+
scopeName: scope.scopeName,
|
|
2542
|
+
url: chunk.url,
|
|
2543
|
+
path: chunk.path,
|
|
2544
|
+
title: chunk.title,
|
|
2545
|
+
sectionTitle: chunk.sectionTitle ?? "",
|
|
2546
|
+
headingPath: chunk.headingPath,
|
|
2547
|
+
snippet: chunk.snippet,
|
|
2548
|
+
contentHash: chunk.contentHash,
|
|
2549
|
+
modelId: this.config.embeddings.model,
|
|
2550
|
+
depth: chunk.depth,
|
|
2551
|
+
incomingLinks: chunk.incomingLinks,
|
|
2552
|
+
routeFile: chunk.routeFile,
|
|
2553
|
+
tags: chunk.tags
|
|
2554
|
+
}
|
|
2555
|
+
});
|
|
2556
|
+
}
|
|
2557
|
+
if (upserts.length > 0) {
|
|
2558
|
+
await this.vectorStore.upsert(upserts, scope);
|
|
2559
|
+
this.logger.event("upserted", { count: upserts.length });
|
|
2560
|
+
}
|
|
2561
|
+
if (deletes.length > 0) {
|
|
2562
|
+
await this.vectorStore.deleteByIds(deletes, scope);
|
|
2563
|
+
this.logger.event("deleted", { count: deletes.length });
|
|
2564
|
+
}
|
|
2565
|
+
}
|
|
2566
|
+
stageEnd("sync", syncStart);
|
|
2567
|
+
const finalizeStart = stageStart();
|
|
2568
|
+
if (!options.dryRun) {
|
|
2569
|
+
const scopeInfo = {
|
|
2570
|
+
projectId: scope.projectId,
|
|
2571
|
+
scopeName: scope.scopeName,
|
|
2572
|
+
modelId: this.config.embeddings.model,
|
|
2573
|
+
lastIndexedAt: nowIso(),
|
|
2574
|
+
vectorCount: chunks.length,
|
|
2575
|
+
lastEstimateTokens: estimatedTokens,
|
|
2576
|
+
lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
2577
|
+
lastEstimateChangedChunks: changedChunks.length
|
|
2578
|
+
};
|
|
2579
|
+
await this.vectorStore.recordScope(scopeInfo);
|
|
2580
|
+
this.logger.event("registry_updated", {
|
|
2581
|
+
scope: scope.scopeName,
|
|
2582
|
+
vectorCount: chunks.length
|
|
2583
|
+
});
|
|
2584
|
+
}
|
|
2585
|
+
stageEnd("finalize", finalizeStart);
|
|
2586
|
+
return {
|
|
2587
|
+
pagesProcessed: mirrorPages.length,
|
|
2588
|
+
chunksTotal: chunks.length,
|
|
2589
|
+
chunksChanged: changedChunks.length,
|
|
2590
|
+
newEmbeddings,
|
|
2591
|
+
deletes: deletes.length,
|
|
2592
|
+
estimatedTokens,
|
|
2593
|
+
estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
2594
|
+
routeExact,
|
|
2595
|
+
routeBestEffort,
|
|
2596
|
+
stageTimingsMs
|
|
2597
|
+
};
|
|
2598
|
+
}
|
|
2599
|
+
};
|
|
2600
|
+
|
|
2601
|
+
// src/mcp/server.ts
|
|
2602
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2603
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
2604
|
+
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
2605
|
+
import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js";
|
|
2606
|
+
import { z as z3 } from "zod";
|
|
2607
|
+
|
|
2608
|
+
// src/search/engine.ts
|
|
2609
|
+
import path12 from "path";
|
|
2610
|
+
import { z as z2 } from "zod";
|
|
2611
|
+
|
|
2612
|
+
// src/rerank/jina.ts
|
|
2613
|
+
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that fulfils once the timer fires.
 */
function sleep2(ms) {
  return new Promise((done) => {
    setTimeout(() => done(), ms);
  });
}
|
|
2618
|
+
/**
 * Client for the Jina rerank HTTP endpoint.
 *
 * Sends the query plus candidate texts to `https://api.jina.ai/v1/rerank`
 * and maps the returned indices back onto the caller's candidate ids.
 */
var JinaReranker = class {
  apiKey;
  model;
  maxRetries;

  /**
   * @param {{ apiKey: string, model: string, maxRetries?: number }} options
   *   `maxRetries` defaults to 4 (so at most five requests are made).
   */
  constructor(options) {
    this.apiKey = options.apiKey;
    this.model = options.model;
    this.maxRetries = options.maxRetries ?? 4;
  }

  /**
   * Rerank `candidates` against `query`.
   *
   * @param {string} query
   * @param {Array<{ id: string, text: string }>} candidates
   * @param {number} [topN] - Sent as `top_n`; defaults to `candidates.length`.
   * @returns {Promise<Array<{ id: string, score: number }>>} Entries sorted by
   *   descending score. Response entries that are malformed or reference an
   *   unknown index are dropped.
   * @throws {Error} On a non-retryable HTTP status, on exhausted retries, or
   *   when the response body does not have the expected shape.
   */
  async rerank(query, candidates, topN) {
    if (candidates.length === 0) {
      return [];
    }
    const body = {
      model: this.model,
      query,
      documents: candidates.map((candidate) => candidate.text),
      top_n: topN ?? candidates.length
    };

    // One initial attempt plus up to `maxRetries` retries.
    for (let attempt = 1; attempt <= this.maxRetries + 1; attempt += 1) {
      const canRetry = attempt <= this.maxRetries;
      // Exponential backoff, capped at four seconds.
      const backoffMs = Math.min(300 * 2 ** attempt, 4e3);

      let response;
      try {
        response = await fetch("https://api.jina.ai/v1/rerank", {
          method: "POST",
          headers: {
            "content-type": "application/json",
            authorization: `Bearer ${this.apiKey}`
          },
          body: JSON.stringify(body)
        });
      } catch (error) {
        // Network-level failure: retry while attempts remain, else surface it.
        if (canRetry) {
          await sleep2(backoffMs);
          continue;
        }
        throw error;
      }

      if (!response.ok) {
        // Only rate limiting and server-side errors are worth retrying.
        const retryable = response.status === 429 || response.status >= 500;
        if (retryable && canRetry) {
          await sleep2(backoffMs);
          continue;
        }
        const errorBody = await response.text();
        throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
      }

      const payload = await response.json();
      if (payload === null || typeof payload !== "object") {
        throw new Error("Invalid Jina rerank response format");
      }
      const rawResults = payload.results ?? payload.data ?? [];
      if (!Array.isArray(rawResults)) {
        throw new Error("Invalid Jina rerank response format");
      }

      return rawResults.flatMap((item) => {
        // Tolerate junk entries instead of crashing on property access.
        if (item === null || typeof item !== "object") {
          return [];
        }
        const index = item.index;
        if (typeof index !== "number" || index < 0 || index >= candidates.length) {
          return [];
        }
        const candidate = candidates[index];
        if (!candidate) {
          return [];
        }
        // Prefer `relevance_score`, fall back to `score`, and never let a
        // non-numeric value reach the numeric sort below.
        let score = 0;
        if (typeof item.relevance_score === "number") {
          score = item.relevance_score;
        } else if (typeof item.score === "number") {
          score = item.score;
        }
        return [{ id: candidate.id, score }];
      }).sort((a, b) => b.score - a.score);
    }
    throw new Error("Jina rerank request failed after retries");
  }
};
|
|
2692
|
+
|
|
2693
|
+
// src/rerank/factory.ts
|
|
2694
|
+
/**
 * Build the reranker selected by `config.rerank.provider`.
 *
 * @param {object} config - Reads `rerank.provider` and, for the "jina"
 *   provider, `rerank.jina.apiKeyEnv` and `rerank.jina.model`.
 * @returns {JinaReranker | null} `null` when the provider is anything other
 *   than "jina", or when the environment variable named by `apiKeyEnv` is
 *   unset or empty.
 */
function createReranker(config) {
  const provider = config.rerank.provider;
  if (provider !== "jina") {
    return null;
  }
  const { apiKeyEnv, model } = config.rerank.jina;
  const apiKey = process.env[apiKeyEnv];
  return apiKey ? new JinaReranker({ apiKey, model }) : null;
}
|
|
2710
|
+
|
|
2711
|
+
// src/search/ranking.ts
|
|
2712
|
+
/**
 * Clamp a value to a finite, non-negative number.
 *
 * @param {unknown} value
 * @returns {number} `value` when it is a finite number >= 0, otherwise 0
 *   (non-numbers, NaN, infinities and negatives all map to 0).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
2718
|
+
/**
 * Score vector-store hits and order them best-first.
 *
 * Starts from each hit's own `score` and, when the matching
 * `config.ranking.enable*` flag is on, adds a weighted incoming-link boost
 * (`log(1 + incomingLinks)`) and a weighted depth boost (`1 / (1 + depth)`).
 * Any non-finite result is reported as negative infinity so it sorts last.
 *
 * @param {Array<{ score: number, metadata: object }>} hits
 * @param {object} config - Reads `config.ranking`.
 * @returns {Array<{ hit: object, finalScore: number }>} New array sorted by
 *   descending `finalScore`.
 */
function rankHits(hits, config) {
  const scoreHit = (hit) => {
    const ranking = config.ranking;
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * ranking.weights.depth;
    }
    const finalScore = Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY;
    return { hit, finalScore };
  };

  // Two -Infinity scores subtract to NaN; treat that as a tie.
  const bestFirst = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  };

  return hits.map((hit) => scoreHit(hit)).sort(bestFirst);
}
|
|
2738
|
+
/**
 * Look up the ranking weight configured for a URL path.
 *
 * Trailing slashes are ignored on both sides (except for the root "/").
 * An exact pattern match wins; otherwise the longest pattern that is a
 * directory-style prefix of the URL (`pattern + "/"`) is used. The root
 * pattern "/" only ever matches exactly.
 *
 * @param {string} url
 * @param {Record<string, number>} pageWeights - Pattern -> weight.
 * @returns {number} The matching weight, or 1 when nothing matches.
 */
function findPageWeight(url, pageWeights) {
  const stripTrailingSlash = (value) =>
    value === "/" || !value.endsWith("/") ? value : value.slice(0, -1);

  const target = stripTrailingSlash(url);
  const rules = Object.entries(pageWeights).map(([pattern, weight]) => [
    stripTrailingSlash(pattern),
    weight
  ]);

  const exact = rules.find(([pattern]) => pattern === target);
  if (exact) {
    return exact[1];
  }

  let winningLength = 0;
  let winningWeight = 1;
  for (const [pattern, weight] of rules) {
    if (pattern === "/") {
      continue;
    }
    const prefix = `${pattern}/`;
    if (prefix.length > winningLength && target.startsWith(prefix)) {
      winningLength = prefix.length;
      winningWeight = weight;
    }
  }
  return winningWeight;
}
|
|
2759
|
+
/**
 * Collapse ranked chunk hits into one entry per page URL.
 *
 * A page's score is its best chunk score plus a weighted bonus built from the
 * next-best chunks (up to `aggregationCap` chunks in total, each scaled by
 * `aggregationDecay ** rank`), then multiplied by the page weight configured
 * for its URL. Non-finite page scores are reported as negative infinity.
 *
 * @param {Array<{ hit: object, finalScore: number }>} ranked
 * @param {object} config - Reads `config.ranking`.
 * @returns {Array<object>} Pages sorted by descending `pageScore`; each carries
 *   `url`, `title`, `routeFile`, `pageScore`, `bestChunk` and `matchingChunks`
 *   (all of the page's chunks, best first).
 */
function aggregateByPage(ranked, config) {
  const byUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const key = entry.hit.metadata.url;
    if (byUrl.has(key)) {
      byUrl.get(key).push(entry);
    } else {
      byUrl.set(key, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const finiteOr = (value, fallback) => (Number.isFinite(value) ? value : fallback);
  // Comparator factory; a NaN difference (two -Infinity scores) is a tie.
  const descendingBy = (field) => (left, right) => {
    const diff = right[field] - left[field];
    return Number.isNaN(diff) ? 0 : diff;
  };

  const pages = [];
  for (const [url, chunks] of byUrl) {
    chunks.sort(descendingBy("finalScore"));
    const best = chunks[0];

    // Rank 0 is the best chunk itself, so it contributes no bonus.
    const bonus = chunks
      .slice(0, aggregationCap)
      .reduce(
        (sum, chunk, rank) =>
          rank === 0 ? sum : sum + finiteOr(chunk.finalScore, 0) * Math.pow(aggregationDecay, rank),
        0
      );

    let pageScore =
      finiteOr(best.finalScore, Number.NEGATIVE_INFINITY) + bonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight !== 1) {
      pageScore *= pageWeight;
    }

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: finiteOr(pageScore, Number.NEGATIVE_INFINITY),
      bestChunk: best,
      matchingChunks: chunks
    });
  }
  return pages.sort(descendingBy("pageScore"));
}
|
|
2801
|
+
|
|
2802
|
+
// src/search/engine.ts
|
|
2803
|
+
// Validation schema applied by SearchEngine.search() to every request.
// Key order is kept as-is: the first validation issue's message is what
// search() reports back to the caller.
var requestSchema = z2.object({
  // Query text; trimmed, must be non-empty.
  q: z2.string().trim().min(1),
  // Number of results to return: positive integer, at most 100.
  topK: z2.number().int().positive().max(100).optional(),
  scope: z2.string().optional(),
  pathPrefix: z2.string().optional(),
  tags: z2.array(z2.string()).optional(),
  rerank: z2.boolean().optional(),
  // Result granularity; search() treats a missing value as "page".
  groupBy: z2.enum(["page", "chunk"]).optional()
});
|
|
2812
|
+
/**
 * Query-side entry point: embeds a query, asks the vector store for
 * candidates, applies ranking boosts and optional reranking, and shapes the
 * response either per page or per chunk.
 */
var SearchEngine = class _SearchEngine {
  cwd;
  config;
  embeddings;
  vectorStore;
  reranker;

  /**
   * @param {{ cwd: string, config: object, embeddings: object,
   *   vectorStore: object, reranker: object | null }} options
   *   Fully resolved collaborators; use `SearchEngine.create()` to build them.
   */
  constructor(options) {
    const { cwd, config, embeddings, vectorStore, reranker } = options;
    this.cwd = cwd;
    this.config = config;
    this.embeddings = embeddings;
    this.vectorStore = vectorStore;
    this.reranker = reranker;
  }

  /**
   * Build an engine, filling in anything the caller did not supply.
   *
   * @param {object} [options] - Optional `cwd`, `configPath`, `config`,
   *   `embeddingsProvider`, `vectorStore`, `reranker`. Passing `reranker: null`
   *   explicitly disables reranking; only `undefined` triggers the default.
   * @returns {Promise<SearchEngine>}
   */
  static async create(options = {}) {
    const cwd = path12.resolve(options.cwd ?? process.cwd());
    const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
    const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
    const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
    const reranker = options.reranker === void 0 ? createReranker(config) : options.reranker;
    return new _SearchEngine({ cwd, config, embeddings, vectorStore, reranker });
  }

  /** @returns {object} The configuration this engine was built with. */
  getConfig() {
    return this.config;
  }

  /**
   * Run a search.
   *
   * @param {object} request - Validated against `requestSchema`.
   * @returns {Promise<object>} `{ q, scope, results, meta }` where `meta`
   *   holds rounded stage timings, `usedRerank` and the embedding `modelId`.
   * @throws {SearchSocketError} INVALID_REQUEST for a malformed request,
   *   VECTOR_BACKEND_UNAVAILABLE when no usable query embedding comes back,
   *   plus whatever `assertModelCompatibility` / `rerankHits` raise.
   */
  async search(request) {
    const validation = requestSchema.safeParse(request);
    if (!validation.success) {
      const reason = validation.error.issues[0]?.message ?? "Invalid request";
      throw new SearchSocketError("INVALID_REQUEST", reason, 400);
    }
    const input = validation.data;
    const totalStart = process.hrtime.bigint();

    const resolvedScope = resolveScope(this.config, input.scope);
    await this.assertModelCompatibility(resolvedScope);

    const topK = input.topK ?? 10;
    const wantsRerank = Boolean(input.rerank);
    const groupByPage = (input.groupBy ?? "page") === "page";
    // Page grouping needs a wider candidate pool than chunk mode.
    const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);

    const embedStart = process.hrtime.bigint();
    const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
    const queryVector = queryEmbeddings[0];
    const vectorUsable =
      Boolean(queryVector) &&
      queryVector.length !== 0 &&
      !queryVector.some((value) => !Number.isFinite(value));
    if (!vectorUsable) {
      throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
    }
    const embedMs = hrTimeMs(embedStart);

    const vectorStart = process.hrtime.bigint();
    const hits = await this.vectorStore.query(
      queryVector,
      { topK: candidateK, pathPrefix: input.pathPrefix, tags: input.tags },
      resolvedScope
    );
    const vectorMs = hrTimeMs(vectorStart);

    const ranked = rankHits(hits, this.config);
    let ordered = ranked;
    let rerankMs = 0;
    let usedRerank = false;
    if (wantsRerank) {
      const rerankStart = process.hrtime.bigint();
      ordered = await this.rerankHits(input.q, ranked, topK);
      rerankMs = hrTimeMs(rerankStart);
      usedRerank = true;
    }

    const round6 = (value) => Number(value.toFixed(6));
    const titleOrUndefined = (metadata) => metadata.sectionTitle || void 0;

    const toPageResult = (page, minRatio) => {
      const bestScore = page.bestChunk.finalScore;
      const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
      // Keep at most five chunks that score close enough to the page's best.
      const meaningful = page.matchingChunks
        .filter((chunk) => chunk.finalScore >= minScore)
        .slice(0, 5);
      const bestMeta = page.bestChunk.hit.metadata;
      return {
        url: page.url,
        title: page.title,
        sectionTitle: titleOrUndefined(bestMeta),
        snippet: bestMeta.snippet,
        score: round6(page.pageScore),
        routeFile: page.routeFile,
        // Per-chunk detail is only attached when there is more than one.
        chunks:
          meaningful.length > 1
            ? meaningful.map((chunk) => ({
                sectionTitle: titleOrUndefined(chunk.hit.metadata),
                snippet: chunk.hit.metadata.snippet,
                headingPath: chunk.hit.metadata.headingPath,
                score: round6(chunk.finalScore)
              }))
            : void 0
      };
    };

    const toChunkResult = ({ hit, finalScore }) => ({
      url: hit.metadata.url,
      title: hit.metadata.title,
      sectionTitle: titleOrUndefined(hit.metadata),
      snippet: hit.metadata.snippet,
      score: round6(finalScore),
      routeFile: hit.metadata.routeFile
    });

    let results;
    if (groupByPage) {
      const pages = aggregateByPage(ordered, this.config);
      const minRatio = this.config.ranking.minChunkScoreRatio;
      results = pages.slice(0, topK).map((page) => toPageResult(page, minRatio));
    } else {
      results = ordered.slice(0, topK).map((entry) => toChunkResult(entry));
    }

    return {
      q: input.q,
      scope: resolvedScope.scopeName,
      results,
      meta: {
        timingsMs: {
          embed: Math.round(embedMs),
          vector: Math.round(vectorMs),
          rerank: Math.round(rerankMs),
          total: Math.round(hrTimeMs(totalStart))
        },
        usedRerank,
        modelId: this.config.embeddings.model
      }
    };
  }

  /**
   * Fetch one indexed page by path or absolute URL.
   *
   * @param {string} pathOrUrl
   * @param {string} [scope]
   * @returns {Promise<{ url: string, frontmatter: object, markdown: string }>}
   * @throws {SearchSocketError} INVALID_REQUEST (404) when the page is not indexed.
   */
  async getPage(pathOrUrl, scope) {
    const resolvedScope = resolveScope(this.config, scope);
    const urlPath = this.resolveInputPath(pathOrUrl);
    const page = await this.vectorStore.getPage(urlPath, resolvedScope);
    if (!page) {
      throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
    }
    const {
      url,
      title,
      routeFile,
      routeResolution,
      incomingLinks,
      outgoingLinks,
      depth,
      tags,
      indexedAt,
      markdown
    } = page;
    return {
      url,
      frontmatter: {
        url,
        title,
        routeFile,
        routeResolution,
        incomingLinks,
        outgoingLinks,
        depth,
        tags,
        indexedAt
      },
      markdown
    };
  }

  /** @returns {Promise<unknown>} Whatever the vector store's `health()` reports. */
  async health() {
    return this.vectorStore.health();
  }

  /**
   * Reduce a path or http(s) URL to a normalized URL path.
   *
   * @param {string} pathOrUrl
   * @returns {string} Result of `normalizeUrlPath` on the URL's pathname, or on
   *   the input with any query string / fragment removed.
   */
  resolveInputPath(pathOrUrl) {
    try {
      const looksAbsolute = /^https?:\/\//.test(pathOrUrl);
      if (looksAbsolute) {
        return normalizeUrlPath(new URL(pathOrUrl).pathname);
      }
    } catch {
      // Deliberate best effort: an unparsable URL is handled as a plain path below.
    }
    const [withoutQueryOrHash = pathOrUrl] = pathOrUrl.split(/[?#]/);
    return normalizeUrlPath(withoutQueryOrHash);
  }

  /**
   * Refuse to query a scope that was indexed with a different embedding model.
   *
   * @param {object} scope
   * @throws {SearchSocketError} EMBEDDING_MODEL_MISMATCH
   */
  async assertModelCompatibility(scope) {
    const indexedModel = await this.vectorStore.getScopeModelId(scope);
    const configuredModel = this.config.embeddings.model;
    if (!indexedModel || indexedModel === configuredModel) {
      return;
    }
    throw new SearchSocketError(
      "EMBEDDING_MODEL_MISMATCH",
      `Scope ${scope.scopeName} was indexed with ${indexedModel}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
    );
  }

  /**
   * Re-score ranked hits with the configured reranker.
   *
   * Entries the reranker scored get `rerankScore * weights.rerank` plus a tiny
   * share of their previous score as a tie-breaker; entries it did not score
   * keep their previous score.
   * NOTE(review): those two groups end up on different scales, so an unscored
   * entry can outrank a reranked one — confirm this is intended.
   *
   * @param {string} query
   * @param {Array<{ hit: object, finalScore: number }>} ranked
   * @param {number} topK
   * @returns {Promise<Array<{ hit: object, finalScore: number }>>} New array,
   *   best first.
   * @throws {SearchSocketError} INVALID_REQUEST when the provider is not
   *   "jina"; CONFIG_MISSING when no reranker instance is available.
   */
  async rerankHits(query, ranked, topK) {
    if (this.config.rerank.provider !== "jina") {
      throw new SearchSocketError(
        "INVALID_REQUEST",
        "rerank=true requested but rerank.provider is not configured as 'jina'.",
        400
      );
    }
    if (!this.reranker) {
      throw new SearchSocketError(
        "CONFIG_MISSING",
        `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
        400
      );
    }

    const candidates = ranked.map(({ hit }) => {
      const { title, sectionTitle, snippet } = hit.metadata;
      return {
        id: hit.id,
        text: [title, sectionTitle, snippet].filter(Boolean).join("\n")
      };
    });
    const requested = Math.max(topK, this.config.rerank.topN);
    const reranked = await this.reranker.rerank(query, candidates, requested);
    const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));

    const rescored = ranked.map((entry) => {
      const baseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
      const rerankScore = rerankScoreById.get(entry.hit.id);
      if (rerankScore !== void 0 && Number.isFinite(rerankScore)) {
        const blended = rerankScore * this.config.ranking.weights.rerank + baseScore * 1e-3;
        return { ...entry, finalScore: Number.isFinite(blended) ? blended : baseScore };
      }
      return { ...entry, finalScore: baseScore };
    });

    return rescored.sort((left, right) => {
      const diff = right.finalScore - left.finalScore;
      return Number.isNaN(diff) ? 0 : diff;
    });
  }
};
|
|
3022
|
+
|
|
3023
|
+
// src/mcp/server.ts
|
|
3024
|
+
/**
 * Build an MCP server exposing the search engine as two tools:
 * `search` and `get_page`. Both return the engine's result as one
 * pretty-printed JSON text block.
 *
 * @param {{ search: Function, getPage: Function }} engine
 * @returns {McpServer}
 */
function createServer(engine) {
  // NOTE(review): this version string is hard-coded and differs from the
  // package version (0.2.0) declared at the top of the file — confirm intent.
  const server = new McpServer({
    name: "searchsocket-mcp",
    version: "0.1.0"
  });

  // Shared response envelope for both tools.
  const asJsonText = (value) => ({
    content: [
      {
        type: "text",
        text: JSON.stringify(value, null, 2)
      }
    ]
  });

  const searchTool = {
    description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and topK.",
    inputSchema: {
      query: z3.string().min(1),
      scope: z3.string().optional(),
      topK: z3.number().int().positive().max(100).optional(),
      pathPrefix: z3.string().optional(),
      tags: z3.array(z3.string()).optional(),
      groupBy: z3.enum(["page", "chunk"]).optional()
    }
  };
  server.registerTool("search", searchTool, async (input) => {
    const { query, topK, scope, pathPrefix, tags, groupBy } = input;
    // The tool calls the field `query`; the engine expects `q`.
    const result = await engine.search({ q: query, topK, scope, pathPrefix, tags, groupBy });
    return asJsonText(result);
  });

  const getPageTool = {
    description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
    inputSchema: {
      pathOrUrl: z3.string().min(1),
      scope: z3.string().optional()
    }
  };
  server.registerTool("get_page", getPageTool, async (input) => {
    const page = await engine.getPage(input.pathOrUrl, input.scope);
    return asJsonText(page);
  });

  return server;
}
|
|
3084
|
+
/**
 * Route console output to stderr so that stdout carries nothing but the
 * stdio transport's protocol messages.
 *
 * `console.log`, `console.info` and `console.debug` all write to stdout in
 * Node.js, so all three are replaced (previously only `log` was, leaving
 * `info`/`debug` able to corrupt the stream). `console.warn` is replaced too
 * so every redirected line carries a uniform `[LEVEL]` prefix.
 *
 * Arguments are stringified with `String()` and joined by single spaces.
 * The replacement is permanent for the lifetime of the process.
 */
function redirectConsoleToStderr() {
  const writerFor = (label) => (...args) => {
    process.stderr.write(`[${label}] ${args.map(String).join(" ")}\n`);
  };
  console.log = writerFor("LOG");
  console.info = writerFor("INFO");
  console.debug = writerFor("DEBUG");
  console.warn = writerFor("WARN");
}
|
|
3096
|
+
/**
 * Start the MCP HTTP endpoint on 127.0.0.1.
 *
 * Every POST gets its own server (from `serverFactory`) and its own
 * transport created without a session id generator; both are closed when the
 * response closes. GET and DELETE on the endpoint are answered with 405.
 *
 * @param {() => object} serverFactory - Creates a fresh MCP server per request.
 * @param {object} config - Supplies `mcp.http.port` / `mcp.http.path` defaults.
 * @param {{ httpPort?: number, httpPath?: string }} opts - Overrides for the above.
 * @returns {Promise<void>} Resolves once the listener is accepting
 *   connections; rejects if the listener emits an error first.
 */
async function startHttpServer(serverFactory, config, opts) {
  const app = createMcpExpressApp();
  const port = opts.httpPort ?? config.mcp.http.port;
  const endpointPath = opts.httpPath ?? config.mcp.http.path;

  app.post(endpointPath, async (req, res) => {
    const server = serverFactory();
    const transport = new StreamableHTTPServerTransport({
      sessionIdGenerator: void 0
    });
    // Register cleanup before any work starts: if connect/handleRequest throws
    // (or the response finishes quickly) the transport and server are still
    // released once the response closes.
    res.on("close", () => {
      transport.close();
      server.close();
    });
    try {
      await server.connect(transport);
      await transport.handleRequest(req, res, req.body);
    } catch (error) {
      // Only report the failure if nothing has been written yet.
      if (!res.headersSent) {
        res.status(500).json({
          jsonrpc: "2.0",
          error: {
            code: -32603,
            message: error instanceof Error ? error.message : "Internal server error"
          },
          id: null
        });
      }
    }
  });

  // Only POST is supported. A 405 response must advertise the allowed methods.
  const rejectMethod = (_req, res) => {
    res.writeHead(405, { Allow: "POST", "Content-Type": "application/json" }).end(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32e3,
          message: "Method not allowed"
        },
        id: null
      })
    );
  };
  app.get(endpointPath, rejectMethod);
  app.delete(endpointPath, rejectMethod);

  await new Promise((resolve, reject) => {
    const instance = app.listen(port, "127.0.0.1", () => {
      process.stderr.write(`SearchSocket MCP HTTP server listening on http://127.0.0.1:${port}${endpointPath}\n`);
      resolve();
    });
    instance.once("error", reject);
    // Stop accepting connections, then exit, on Ctrl+C.
    process.on("SIGINT", async () => {
      await new Promise((shutdownResolve) => instance.close(() => shutdownResolve()));
      process.exit(0);
    });
  });
}
|
|
3162
|
+
/**
 * Load config, build the search engine, and start the MCP server on the
 * resolved transport.
 *
 * `options.transport` wins over `config.mcp.transport`. "http" delegates to
 * `startHttpServer`; every other value is served over stdio. Console output
 * is redirected to stderr only when the transport is exactly "stdio".
 *
 * @param {{ cwd?: string, configPath?: string, transport?: string,
 *           httpPort?: number, httpPath?: string }} [options]
 * @returns {Promise<void>}
 */
async function runMcpServer(options = {}) {
  const { cwd, configPath } = options;
  const config = await loadConfig({ cwd, configPath });
  const resolvedTransport = options.transport ?? config.mcp.transport;

  if (resolvedTransport === "stdio") {
    redirectConsoleToStderr();
  }

  const engine = await SearchEngine.create({ cwd, configPath, config });

  if (resolvedTransport === "http") {
    await startHttpServer(() => createServer(engine), config, options);
    return;
  }

  const server = createServer(engine);
  await server.connect(new StdioServerTransport());
}
|
|
3184
|
+
|
|
3185
|
+
// src/cli.ts
|
|
3186
|
+
/**
 * Convert a CLI option value to a strictly positive integer.
 *
 * Conversion uses `Number(value)`, so any numeric spelling that yields an
 * integer is accepted.
 *
 * @param {string|number} value - Raw option value.
 * @param {string} flag - Flag name used in the error message.
 * @returns {number} The parsed integer.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the value is not an
 *   integer greater than zero.
 */
function parsePositiveInt(value, flag) {
  const parsed = Number(value);
  const isPositiveInteger = Number.isInteger(parsed) && parsed > 0;
  if (isPositiveInteger) {
    return parsed;
  }
  throw new SearchSocketError("INVALID_REQUEST", `${flag} must be a positive integer`, 400);
}
|
|
3193
|
+
/**
 * Parse a duration such as "30d", "12h", "15m", "45s" or "500ms" into
 * milliseconds. Surrounding whitespace is ignored and the unit is
 * case-insensitive.
 *
 * @param {string} value - Duration text.
 * @returns {number} Duration in milliseconds.
 * @throws {SearchSocketError} INVALID_REQUEST (400) when the text does not
 *   match `<digits><unit>` or the unit is unsupported.
 */
function parseDurationMs(value) {
  const match = /^(\d+)(ms|s|m|h|d)$/i.exec(value.trim());
  if (match === null) {
    throw new SearchSocketError(
      "INVALID_REQUEST",
      "Duration must look like 30d, 12h, 15m, 45s, or 500ms",
      400
    );
  }

  const [, digits, rawUnit] = match;
  const amount = Number(digits);
  const unit = (rawUnit ?? "").toLowerCase();

  // Milliseconds per supported unit.
  const msPerUnit = new Map([
    ["ms", 1],
    ["s", 1e3],
    ["m", 6e4],
    ["h", 36e5],
    ["d", 864e5]
  ]);
  const factor = msPerUnit.get(unit);
  if (factor === undefined) {
    throw new SearchSocketError("INVALID_REQUEST", `Unsupported duration unit: ${unit}`, 400);
  }
  return amount * factor;
}
|
|
3219
|
+
/**
 * Render a dollar amount with six decimal places and a leading "$".
 *
 * @param {number} value - Amount in USD.
 * @returns {string} For example "$0.000240".
 */
function formatUsd(value) {
  const fixed = value.toFixed(6);
  return `$${fixed}`;
}
|
|
3222
|
+
/**
 * Write a human-readable summary of an indexing run to stdout, one line per
 * metric, followed by the per-stage timings.
 *
 * @param {object} stats - Run statistics; reads `pagesProcessed`,
 *   `chunksTotal`, `chunksChanged`, `newEmbeddings`, `deletes`,
 *   `estimatedTokens`, `estimatedCostUSD`, `routeExact`, `routeBestEffort`
 *   and `stageTimingsMs`.
 * @returns {void}
 */
function printIndexSummary(stats) {
  // Each metric is written as soon as it is formatted, one write per line.
  const emit = (line) => {
    process.stdout.write(`${line}\n`);
  };

  emit(`pages processed: ${stats.pagesProcessed}`);
  emit(`chunks total: ${stats.chunksTotal}`);
  emit(`chunks changed: ${stats.chunksChanged}`);
  emit(`embeddings created: ${stats.newEmbeddings}`);
  emit(`deletes: ${stats.deletes}`);
  emit(`estimated tokens: ${stats.estimatedTokens}`);
  emit(`estimated cost (USD): ${formatUsd(stats.estimatedCostUSD)}`);
  emit(`route mapping: ${stats.routeExact} exact, ${stats.routeBestEffort} best-effort`);
  emit("stage timings (ms):");
  Object.entries(stats.stageTimingsMs).forEach(([stage, ms]) => {
    emit(` ${stage}: ${ms}`);
  });
}
|
|
3245
|
+
/**
 * Build the absolute paths/globs the dev watcher should observe.
 *
 * `src/routes/**` is always included; further entries depend on
 * `config.source.mode`.
 *
 * @param {object} config - Resolved config (reads `config.source`).
 * @param {string} cwd - Base directory used to resolve every entry.
 * @returns {string[]} Absolute watch targets.
 */
function collectWatchPaths(config, cwd) {
  const { source } = config;
  const targets = ["src/routes/**"];

  switch (source.mode) {
    case "content-files":
      if (source.contentFiles) {
        const { baseDir, globs } = source.contentFiles;
        targets.push(...Array.from(globs, (pattern) => path13.join(baseDir, pattern)));
      }
      break;
    case "static-output":
      targets.push(source.staticOutputDir);
      break;
    case "crawl":
      targets.push("searchsocket.config.ts");
      break;
    case "build":
      if (source.build) {
        targets.push("searchsocket.config.ts", source.build.outputDir);
      }
      break;
    default:
      break;
  }

  return targets.map((target) => path13.resolve(cwd, target));
}
|
|
3264
|
+
/**
 * Create `<cwd>/.searchsocket` (and any missing parents) if it does not
 * exist yet.
 *
 * @param {string} cwd - Project directory.
 * @returns {string} Path of the state directory.
 */
function ensureStateDir(cwd) {
  const stateDir = path13.join(cwd, ".searchsocket");
  // `recursive: true` makes this a no-op when the directory already exists.
  fs9.mkdirSync(stateDir, { recursive: true });
  return stateDir;
}
|
|
3269
|
+
/**
 * Make sure `<cwd>/.gitignore` lists the SearchSocket local state files.
 *
 * Entries already present (compared after trimming each line) are left
 * alone; missing ones are appended under a "# SearchSocket local state"
 * heading. The file is created when absent and untouched when nothing is
 * missing.
 *
 * @param {string} cwd - Project directory.
 * @returns {void}
 */
function ensureGitignore(cwd) {
  const gitignorePath = path13.join(cwd, ".gitignore");
  const required = [
    ".searchsocket/vectors.db",
    ".searchsocket/vectors.db-shm",
    ".searchsocket/vectors.db-wal",
    ".searchsocket/manifest.json",
    ".searchsocket/registry.json"
  ];

  const content = fs9.existsSync(gitignorePath) ? fs9.readFileSync(gitignorePath, "utf8") : "";
  const present = new Set(content.split("\n").map((line) => line.trim()));
  const missing = required.filter((entry) => !present.has(entry));
  if (missing.length === 0) {
    return;
  }

  // Leading "" starts the block on a new line; trailing "" ends the file
  // with a newline.
  const block = ["", "# SearchSocket local state", ...missing, ""].join("\n");
  fs9.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
}
|
|
3293
|
+
/**
 * Read a scopes file: one scope name per line, surrounding whitespace
 * trimmed, blank lines ignored.
 *
 * @param {string} filePath - Path of the UTF-8 text file.
 * @returns {Set<string>} Distinct scope names.
 */
function readScopesFromFile(filePath) {
  const scopes = new Set();
  for (const line of fs9.readFileSync(filePath, "utf8").split(/\r?\n/)) {
    const name = line.trim();
    if (name) {
      scopes.add(name);
    }
  }
  return scopes;
}
|
|
3299
|
+
/**
 * List remote-tracking branch names (via `git branch -r`) to use as the set
 * of active scopes. A leading "origin/" is stripped from each name.
 *
 * Prints a warning to stdout when one or fewer branches come back. Any
 * failure (git missing, not a repository, ...) yields an empty set.
 *
 * @param {string} cwd - Directory to run git in.
 * @returns {Set<string>} Branch names, or an empty set on failure.
 */
function readRemoteGitBranches(cwd) {
  try {
    // Double quotes are honoured by both POSIX shells and cmd.exe. The
    // previous single quotes are not removed by cmd.exe, so on Windows they
    // would end up inside every branch name and nothing would match.
    const output = execSync2('git branch -r --format="%(refname:short)"', {
      cwd,
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"]
    });
    const scopes = output
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter(Boolean)
      .map((line) => line.replace(/^origin\//, ""));
    if (scopes.length <= 1) {
      process.stdout.write(
        "warning: git branch -r returned 1 or fewer branches. If running in CI, ensure the checkout step uses fetch-depth: 0 to avoid accidentally pruning active branch scopes.\n"
      );
    }
    return new Set(scopes);
  } catch {
    // Best effort: with no branch information the caller gets an empty set.
    return /* @__PURE__ */ new Set();
  }
}
|
|
3317
|
+
/**
 * Resolve the config for `dev`: load the config file when it exists,
 * otherwise fall back to `mergeConfig(cwd, {})`.
 *
 * @param {string} cwd - Project directory.
 * @param {string} [configPath] - Config path; "searchsocket.config.ts" is
 *   checked when omitted.
 * @returns {Promise<object>} Resolved config.
 */
async function loadResolvedConfigForDev(cwd, configPath) {
  const candidate = path13.resolve(cwd, configPath ?? "searchsocket.config.ts");
  return fs9.existsSync(candidate) ? loadConfig({ cwd, configPath }) : mergeConfig(cwd, {});
}
|
|
3324
|
+
/**
 * Return the options of a subcommand's parent (`--cwd`, `--config`).
 *
 * @param {{ parent?: { opts?: Function } | null }} command - Subcommand.
 * @returns {object} `parent.opts()` when the parent exposes an `opts`
 *   function, otherwise an empty object.
 */
function getRootOptions(command) {
  const parent = command.parent;
  const readOpts = parent?.opts;
  // Call with the parent as receiver so `opts` sees its own state.
  return typeof readOpts === "function" ? readOpts.call(parent) : {};
}
|
|
3332
|
+
/**
 * Run one indexing pass and report the outcome on stdout.
 *
 * @param {object} opts - `cwd`, `configPath`, `scope`, `changedOnly`,
 *   `force`, `dryRun`, `source`, `maxPages`, `maxChunks`, `verbose`, `json`.
 *   With `json` set the stats are printed as pretty JSON, otherwise as the
 *   plain-text summary.
 * @returns {Promise<void>}
 */
async function runIndexCommand(opts) {
  const { cwd, configPath, verbose, json } = opts;

  const logger3 = new Logger({ verbose, json });
  const pipeline = await IndexPipeline.create({ cwd, configPath, logger: logger3 });
  const stats = await pipeline.run({
    scopeOverride: opts.scope,
    changedOnly: opts.changedOnly,
    force: opts.force,
    dryRun: opts.dryRun,
    sourceOverride: opts.source,
    maxPages: opts.maxPages,
    maxChunks: opts.maxChunks,
    verbose
  });

  if (!json) {
    printIndexSummary(stats);
    return;
  }
  process.stdout.write(`${JSON.stringify(stats, null, 2)}\n`);
}
|
|
3359
|
+
// Root command: carries the global --cwd / --config options that every
// subcommand reads back through getRootOptions().
const program = new Command();
program
  .name("searchsocket")
  .description("Semantic site search and MCP retrieval for SvelteKit")
  .version(package_default.version)
  .option("-C, --cwd <path>", "working directory", process.cwd())
  .option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
3361
|
+
// `init`: write a minimal config, create the state directory, update
// .gitignore, then print SvelteKit integration snippets.
program
  .command("init")
  .description("Create searchsocket.config.ts and .searchsocket state directory")
  .action(async (_opts, command) => {
    const cwd = path13.resolve(getRootOptions(command).cwd ?? process.cwd());
    const configPath = writeMinimalConfig(cwd);
    const stateDir = ensureStateDir(cwd);
    ensureGitignore(cwd);

    // Written chunk by chunk, in this order.
    const chunks = [
      `created/verified config: ${configPath}\n`,
      `created/verified state dir: ${stateDir}\n\n`,
      "SvelteKit hook snippet:\n\n",
      'import { searchsocketHandle } from "searchsocket/sveltekit";\n\n',
      "export const handle = searchsocketHandle();\n\n",
      "Optional build-triggered indexing plugin:\n\n",
      'import { searchsocketVitePlugin } from "searchsocket/sveltekit";\n\n',
      "// svelte.config.js / vite plugins:\n",
      "// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n",
      "// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n"
    ];
    for (const chunk of chunks) {
      process.stdout.write(chunk);
    }
  });
|
|
3381
|
+
// `index`: one-shot indexing run; flags are forwarded to runIndexCommand().
program
  .command("index")
  .description("Index site content into markdown mirror + vector store")
  .option("--scope <name>", "scope override")
  .option("--changed-only", "only process changed chunks", true)
  .option("--no-changed-only", "re-index regardless of previous manifest")
  .option("--force", "force full mirror rebuild and re-upsert", false)
  .option("--dry-run", "compute plan and cost, no API writes", false)
  .option("--source <mode>", "source mode override: static-output|crawl|content-files|build")
  .option("--max-pages <n>", "limit pages processed")
  .option("--max-chunks <n>", "limit chunks processed")
  .option("--verbose", "verbose output", false)
  .option("--json", "emit JSON logs and summary", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());

    // A limit is only parsed (and validated) when the flag carries a value.
    const optionalLimit = (raw, flag) => (raw ? parsePositiveInt(raw, flag) : void 0);
    const maxPages = optionalLimit(opts.maxPages, "--max-pages");
    const maxChunks = optionalLimit(opts.maxChunks, "--max-chunks");

    await runIndexCommand({
      cwd,
      configPath: rootOpts?.config,
      scope: opts.scope,
      changedOnly: opts.changedOnly,
      force: opts.force,
      dryRun: opts.dryRun,
      source: opts.source,
      maxPages,
      maxChunks,
      verbose: opts.verbose,
      json: opts.json
    });
  });
|
|
3398
|
+
// `status`: print the resolved scope, vector backend health, the last index
// run for the scope, and every scope known to the registry. Sets exit code 1
// when the vector store cannot be reached or queried.
program
  .command("status")
  .description("Show scope, indexing state, backend health, and recent cost estimate")
  .option("--scope <name>", "scope override")
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });
    const scope = resolveScope(config, opts.scope);

    const print = (line) => {
      process.stdout.write(`${line}\n`);
    };
    const describeError = (error) => (error instanceof Error ? error.message : "unknown error");

    // Step 1: connect to the vector store and run its health check.
    let vectorStore;
    let health;
    try {
      vectorStore = await createVectorStore(config, cwd);
      health = await vectorStore.health();
    } catch (error) {
      print(`project: ${config.project.id}`);
      print(`vector health: error (${describeError(error)})`);
      process.exitCode = 1;
      return;
    }

    // Step 2: read the scope registry and the content hashes for this scope.
    let scopeRegistry;
    let scopeInfo;
    let hashes;
    try {
      scopeRegistry = await vectorStore.listScopes(config.project.id);
      scopeInfo = scopeRegistry.find((entry) => entry.scopeName === scope.scopeName);
      hashes = await vectorStore.getContentHashes(scope);
    } catch (error) {
      print(`project: ${config.project.id}`);
      print(`resolved scope: ${scope.scopeName}`);
      print(`vector health: error (${describeError(error)})`);
      process.exitCode = 1;
      return;
    }

    // Step 3: report.
    print(`project: ${config.project.id}`);
    print(`resolved scope: ${scope.scopeName}`);
    print(`embedding model: ${config.embeddings.model}`);

    // A URL in the configured env var means a remote database; otherwise the
    // local file path from config is reported.
    const tursoUrl = process.env[config.vector.turso.urlEnv];
    const vectorMode = tursoUrl ? `remote (${tursoUrl})` : `local (${config.vector.turso.localPath})`;
    print(`vector backend: turso/libsql (${vectorMode})`);
    print(`vector health: ${health.ok ? "ok" : `error (${health.details ?? "n/a"})`}`);

    if (!scopeInfo) {
      print(`last indexed (${scope.scopeName}): never`);
    } else {
      print(`last indexed (${scope.scopeName}): ${scopeInfo.lastIndexedAt ?? "never"}`);
      print(`tracked chunks: ${hashes.size}`);
      if (scopeInfo.lastEstimateTokens != null) {
        print(`last estimated tokens: ${scopeInfo.lastEstimateTokens}`);
      }
      if (scopeInfo.lastEstimateCostUSD != null) {
        print(`last estimated cost: ${formatUsd(scopeInfo.lastEstimateCostUSD)}`);
      }
    }

    if (scopeRegistry.length > 0) {
      print("\nregistry scopes:");
      scopeRegistry.forEach((item) => {
        print(
          ` - ${item.scopeName} model=${item.modelId} lastIndexedAt=${item.lastIndexedAt} vectors=${item.vectorCount ?? "unknown"}`
        );
      });
    }
  });
|
|
3476
|
+
// `dev`: index once, then watch the source paths and re-index (changed-only)
// after a 350 ms quiet period. With --mcp an HTTP MCP server is started
// alongside the watcher.
//
// --mcp-port / --mcp-path carry no CLI default so that, when omitted, the
// mcp.http.port / mcp.http.path values from config apply through the `??`
// fallback in startHttpServer.
program
  .command("dev")
  .description("Watch content files/routes and incrementally reindex on changes")
  .option("--scope <name>", "scope override")
  .option("--mcp", "start MCP server (http transport) alongside watcher", false)
  .option("--mcp-port <n>", "MCP HTTP port (default: mcp.http.port from config)")
  .option("--mcp-path <path>", "MCP HTTP path (default: mcp.http.path from config)")
  .option("--verbose", "verbose logs", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());

    // Validate the port before the first index run so a typo fails at once
    // instead of after indexing has already completed.
    const mcpPort = opts.mcp && opts.mcpPort != null ? parsePositiveInt(opts.mcpPort, "--mcp-port") : void 0;

    const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
    const watchPaths = collectWatchPaths(config, cwd);
    process.stdout.write("starting searchsocket dev watcher...\n");
    process.stdout.write(`watching:\n${watchPaths.map((entry) => ` - ${entry}`).join("\n")}\n`);

    let running = false; // an index run is in progress
    let pending = false; // a change arrived while a run was in progress
    let timer = null; // debounce timer for file events

    // Runs never overlap: a trigger during a run is remembered in `pending`
    // and causes exactly one follow-up run.
    const run = async () => {
      if (running) {
        pending = true;
        return;
      }
      running = true;
      try {
        await runIndexCommand({
          cwd,
          configPath: rootOpts?.config,
          scope: opts.scope,
          changedOnly: true,
          force: false,
          dryRun: false,
          verbose: opts.verbose,
          json: false
        });
      } catch (error) {
        // Keep watching after a failed run; just report it.
        process.stderr.write(`index error: ${error instanceof Error ? error.message : String(error)}\n`);
      } finally {
        running = false;
        if (pending) {
          pending = false;
          await run();
        }
      }
    };

    await run();

    const watcher = chokidar.watch(watchPaths, {
      ignoreInitial: true
    });
    watcher.on("all", (event, changedPath) => {
      process.stdout.write(`detected ${event}: ${changedPath}\n`);
      if (timer) {
        clearTimeout(timer);
      }
      timer = setTimeout(() => {
        void run();
      }, 350);
    });

    if (opts.mcp) {
      // Started in the background on purpose, but a startup failure (for
      // example the port being in use) must be reported rather than left as
      // an unhandled rejection.
      runMcpServer({
        cwd,
        configPath: rootOpts?.config,
        transport: "http",
        httpPort: mcpPort,
        httpPath: opts.mcpPath
      }).catch((error) => {
        process.stderr.write(`mcp server error: ${error instanceof Error ? error.message : String(error)}\n`);
      });
    }

    // Stay alive until Ctrl-C, then stop watching.
    await new Promise((resolve) => {
      process.on("SIGINT", () => {
        // Drop a pending debounce so no new index run starts during shutdown.
        if (timer) {
          clearTimeout(timer);
          timer = null;
        }
        void watcher.close().then(() => resolve());
      });
    });
  });
|
|
3545
|
+
// `clean`: remove the local state directory; with --remote also delete the
// vectors stored for the resolved scope.
program
  .command("clean")
  .description("Delete local state and optionally delete remote vectors for a scope")
  .option("--scope <name>", "scope override")
  .option("--remote", "delete remote scope vectors", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });
    const scope = resolveScope(config, opts.scope);

    const statePath = path13.join(cwd, config.state.dir);
    // `force: true` makes a missing directory a no-op.
    await fsp.rm(statePath, { recursive: true, force: true });
    process.stdout.write(`deleted local state directory: ${statePath}\n`);

    if (!opts.remote) {
      return;
    }
    const vectorStore = await createVectorStore(config, cwd);
    await vectorStore.deleteScope(scope);
    process.stdout.write(`deleted remote vectors for scope ${scope.scopeName}\n`);
  });
|
|
3561
|
+
// `prune`: list scopes considered stale and, with --apply, delete them.
//
// A scope is stale when it is missing from the keep list (taken from
// --scopes-file, otherwise from the remote git branches) or was last indexed
// longer ago than --older-than. An empty keep list disables the list check.
// The scope named "main" is never pruned.
program
  .command("prune")
  .description("List/delete stale scopes (dry-run by default)")
  .option("--apply", "apply deletions", false)
  .option("--scopes-file <path>", "file containing active scopes")
  .option("--older-than <duration>", "ttl cutoff like 30d")
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });
    const baseScope = resolveScope(config);

    let vectorStore;
    let scopes;
    try {
      vectorStore = await createVectorStore(config, cwd);
      scopes = await vectorStore.listScopes(config.project.id);
    } catch (error) {
      process.stderr.write(
        `error: failed to access Turso vector store: ${error instanceof Error ? error.message : String(error)}\n`
      );
      process.exitCode = 1;
      return;
    }
    process.stdout.write(`using remote registry\n`);

    let keepScopes = opts.scopesFile
      ? readScopesFromFile(path13.resolve(cwd, opts.scopesFile))
      : readRemoteGitBranches(cwd);
    if (config.scope.sanitize && keepScopes.size > 0) {
      // Sanitize the keep list so it is compared against names in the same
      // form.
      keepScopes = new Set([...keepScopes].map(sanitizeScopeName));
    }

    const olderThanMs = opts.olderThan ? parseDurationMs(opts.olderThan) : void 0;
    const now = Date.now();
    const listCheckEnabled = keepScopes.size > 0;

    const stale = scopes.filter((entry) => {
      if (entry.scopeName === "main") {
        return false;
      }
      // Each criterion is false while it is disabled, so a plain OR covers
      // list-only, ttl-only, both and neither.
      const staleByList = listCheckEnabled && !keepScopes.has(entry.scopeName);
      const staleByTtl = Boolean(olderThanMs) && now - Date.parse(entry.lastIndexedAt) > olderThanMs;
      return staleByList || staleByTtl;
    });

    if (stale.length === 0) {
      process.stdout.write("no stale scopes found\n");
      return;
    }

    process.stdout.write(`stale scopes (${stale.length}):\n`);
    stale.forEach((entry) => {
      process.stdout.write(` - ${entry.scopeName} lastIndexedAt=${entry.lastIndexedAt}\n`);
    });

    if (!opts.apply) {
      process.stdout.write("dry-run only. pass --apply to delete these scopes.\n");
      return;
    }

    let deleted = 0;
    for (const entry of stale) {
      const scope = {
        projectId: config.project.id,
        scopeName: entry.scopeName,
        scopeId: `${config.project.id}:${entry.scopeName}`
      };
      try {
        await vectorStore.deleteScope(scope);
        deleted += 1;
      } catch (error) {
        // One failed deletion does not stop the remaining ones.
        process.stdout.write(
          `failed to delete scope ${entry.scopeName}: ${error instanceof Error ? error.message : String(error)}\n`
        );
      }
    }
    process.stdout.write(`deleted scopes: ${deleted}\n`);
    if (baseScope.scopeName === "main") {
      process.stdout.write("main scope retained\n");
    }
  });
|
|
3652
|
+
// `doctor`: run a series of environment checks and print PASS/FAIL for each.
// Sets exit code 1 when any check fails. When the config cannot be loaded,
// only the "config parse" result is reported.
program
  .command("doctor")
  .description("Validate config, env vars, provider connectivity, and local write access")
  .action(async (_opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());

    const checks = [];
    const record = (name, ok, details) => {
      checks.push({ name, ok, details });
    };
    const reasonOf = (error, fallback = "unknown error") => (error instanceof Error ? error.message : fallback);

    let config = null;
    try {
      config = await loadConfig({ cwd, configPath: rootOpts?.config });
      record("config parse", true);
    } catch (error) {
      record("config parse", false, reasonOf(error));
    }

    if (config) {
      // --- environment variables -------------------------------------
      const embKeyEnv = config.embeddings.apiKeyEnv;
      const hasEmbKey = Boolean(process.env[embKeyEnv]);
      record(`env ${embKeyEnv}`, hasEmbKey, hasEmbKey ? void 0 : "missing");

      // Informational only: reports whether a remote URL or the local file
      // is in use; always passes.
      const tursoUrl = process.env[config.vector.turso.urlEnv];
      record(
        "turso/libsql",
        true,
        tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
      );

      if (config.rerank.provider === "jina") {
        const jinaKeyEnv = config.rerank.jina.apiKeyEnv;
        const hasJinaKey = Boolean(process.env[jinaKeyEnv]);
        record(`env ${jinaKeyEnv}`, hasJinaKey, hasJinaKey ? void 0 : "missing");
      }

      // --- source-mode specific checks -------------------------------
      switch (config.source.mode) {
        case "static-output": {
          const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
          const exists = fs9.existsSync(outputDir);
          record(
            "source: static output dir",
            exists,
            exists ? outputDir : `${outputDir} not found (run your build first)`
          );
          break;
        }
        case "build": {
          const buildConfig = config.source.build;
          if (!buildConfig) {
            record("source: build config", false, "source.build config missing");
            break;
          }
          const manifestPath = path13.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
          const manifestExists = fs9.existsSync(manifestPath);
          record(
            "source: build manifest",
            manifestExists,
            manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
          );
          const viteBin = path13.resolve(cwd, "node_modules", ".bin", "vite");
          const viteExists = fs9.existsSync(viteBin);
          record("source: vite binary", viteExists, viteExists ? viteBin : `${viteBin} not found (install vite)`);
          break;
        }
        case "content-files": {
          const contentConfig = config.source.contentFiles;
          if (!contentConfig) {
            record("source: content files", false, "source.contentFiles config missing");
            break;
          }
          const fg4 = await import("fast-glob");
          const baseDir = path13.resolve(cwd, contentConfig.baseDir);
          const files = await fg4.default(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
          const matched = files.length > 0;
          record(
            "source: content files",
            matched,
            matched
              ? `${files.length} files matched`
              : `no files matched globs ${contentConfig.globs.join(", ")} in ${baseDir}`
          );
          break;
        }
        default:
          break;
      }

      // --- embedding provider ----------------------------------------
      try {
        const provider = createEmbeddingsProvider(config);
        await provider.embedTexts(["searchsocket doctor ping"], config.embeddings.model);
        record("embedding provider connectivity", true);
      } catch (error) {
        record("embedding provider connectivity", false, reasonOf(error));
      }

      // --- vector backend --------------------------------------------
      let store = null;
      try {
        store = await createVectorStore(config, cwd);
        const health = await store.health();
        record("vector backend connectivity", health.ok, health.details);
      } catch (error) {
        record("vector backend connectivity", false, reasonOf(error));
      }

      if (store) {
        // Write probe: record a throwaway scope, then delete it again.
        try {
          const probeName = "_searchsocket_doctor_probe";
          const testScope = {
            projectId: config.project.id,
            scopeName: probeName,
            scopeId: `${config.project.id}:_searchsocket_doctor_probe`
          };
          await store.recordScope({
            projectId: testScope.projectId,
            scopeName: testScope.scopeName,
            modelId: config.embeddings.model,
            lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
            vectorCount: 0
          });
          await store.deleteScope(testScope);
          record("vector backend write permission", true);
        } catch (error) {
          record("vector backend write permission", false, reasonOf(error, "write test failed"));
        }
      }

      // --- local state directory -------------------------------------
      try {
        const scope = resolveScope(config);
        const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
        const testPath = path13.join(statePath, ".write-test");
        await fsp.writeFile(testPath, "ok\n", "utf8");
        await fsp.rm(testPath, { force: true });
        record("state directory writable", true);
      } catch (error) {
        record("state directory writable", false, reasonOf(error));
      }
    }

    // --- report ------------------------------------------------------
    for (const { name, ok, details } of checks) {
      const suffix = details ? ` (${details})` : "";
      process.stdout.write(`${ok ? "PASS" : "FAIL"} ${name}${suffix}\n`);
    }
    if (checks.some((check) => !check.ok)) {
      process.exitCode = 1;
    }
  });
|
|
3822
|
+
// `mcp`: run the MCP server over stdio or HTTP.
//
// None of the flags carries a CLI default: runMcpServer/startHttpServer fall
// back to config.mcp.transport / config.mcp.http.port / config.mcp.http.path
// through `??`, and a default here would always win and make those config
// values unreachable.
program
  .command("mcp")
  .description("Run SearchSocket MCP server")
  .option("--transport <transport>", "stdio|http (default: mcp.transport from config)")
  .option("--port <n>", "HTTP port (default: mcp.http.port from config)")
  .option("--path <path>", "HTTP path (default: mcp.http.path from config)")
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());

    // Reject typos up front: runMcpServer serves every value other than
    // "http" over stdio, so a misspelt transport would otherwise silently
    // start the wrong server.
    if (opts.transport != null && opts.transport !== "stdio" && opts.transport !== "http") {
      throw new SearchSocketError("INVALID_REQUEST", "--transport must be stdio or http", 400);
    }

    await runMcpServer({
      cwd,
      configPath: rootOpts?.config,
      transport: opts.transport,
      httpPort: opts.port != null ? parsePositiveInt(opts.port, "--port") : void 0,
      httpPath: opts.path
    });
  });
|
|
3833
|
+
// `search`: run one query against the indexed vectors and print the raw
// result as pretty JSON.
program
  .command("search")
  .description("Quick local CLI search against indexed vectors")
  .requiredOption("--q <query>", "search query")
  .option("--scope <name>", "scope override")
  .option("--top-k <n>", "top K results", "10")
  .option("--path-prefix <prefix>", "path prefix filter")
  .option("--rerank", "enable configured reranker", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const engine = await SearchEngine.create({ cwd, configPath: rootOpts?.config });

    const { q, scope, pathPrefix, rerank } = opts;
    const result = await engine.search({
      q,
      scope,
      topK: parsePositiveInt(opts.topK, "--top-k"),
      pathPrefix,
      rerank
    });

    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
  });
|
|
3850
|
+
/**
 * CLI entry point: load `.env` from the current working directory, then let
 * commander parse `process.argv` and run the selected subcommand.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const envFile = path13.resolve(process.cwd(), ".env");
  dotenvConfig({ path: envFile });
  await program.parseAsync(process.argv);
}

// Last-resort handler: print a one-line error to stderr and exit non-zero.
main().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`searchsocket error: ${message}\n`);
  process.exit(1);
});
|
|
3860
|
+
//# sourceMappingURL=cli.js.map
|