searchsocket 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,3860 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/cli.ts
4
+ import fs9 from "fs";
5
+ import fsp from "fs/promises";
6
+ import path13 from "path";
7
+ import { execSync as execSync2 } from "child_process";
8
+ import { config as dotenvConfig } from "dotenv";
9
+ import chokidar from "chokidar";
10
+ import { Command } from "commander";
11
+
12
+ // package.json
13
// Inlined copy of the package manifest (the bundle labels it "package.json").
// Key order mirrors the manifest so serialising this object reproduces it.
var package_default = {
  name: "searchsocket",
  version: "0.2.0",
  description: "Semantic site search and MCP retrieval for SvelteKit static sites",
  license: "MIT",
  author: "Greg Priday <greg@siteorigin.com>",
  repository: { type: "git", url: "https://github.com/gregpriday/searchsocket.git" },
  homepage: "https://github.com/gregpriday/searchsocket",
  bugs: { url: "https://github.com/gregpriday/searchsocket/issues" },
  keywords: [
    "search", "semantic-search", "sveltekit", "mcp",
    "embeddings", "vector-search", "site-search", "static-site"
  ],
  type: "module",
  files: ["dist", "README.md"],
  bin: { searchsocket: "dist/cli.js" },
  // One entry per public entry point, each with typings plus ESM and CJS builds.
  exports: {
    ".": { types: "./dist/index.d.ts", import: "./dist/index.js", require: "./dist/index.cjs" },
    "./sveltekit": { types: "./dist/sveltekit.d.ts", import: "./dist/sveltekit.js", require: "./dist/sveltekit.cjs" },
    "./client": { types: "./dist/client.d.ts", import: "./dist/client.js", require: "./dist/client.cjs" }
  },
  scripts: {
    build: "tsup",
    clean: "rm -rf dist",
    typecheck: "tsc --noEmit",
    test: "vitest run",
    "test:watch": "vitest"
  },
  engines: { node: ">=20" },
  packageManager: "pnpm@10.29.2",
  dependencies: {
    "@libsql/client": "^0.17.0",
    "@modelcontextprotocol/sdk": "^1.26.0",
    cheerio: "^1.2.0",
    chokidar: "^5.0.0",
    commander: "^14.0.3",
    dotenv: "^17.3.1",
    express: "^5.2.1",
    "fast-glob": "^3.3.3",
    "gray-matter": "^4.0.3",
    jiti: "^2.6.1",
    openai: "^6.19.0",
    "p-limit": "^7.3.0",
    turndown: "^7.2.2",
    "turndown-plugin-gfm": "^1.0.2",
    zod: "^4.3.6"
  },
  devDependencies: {
    "@types/express": "^5.0.6",
    "@types/node": "^25.2.2",
    "@types/turndown": "^5.0.6",
    tsup: "^8.5.1",
    typescript: "^5.9.3",
    vitest: "^4.0.18"
  }
};
99
+
100
+ // src/config/load.ts
101
+ import fs from "fs";
102
+ import path from "path";
103
+ import { createJiti } from "jiti";
104
+
105
+ // src/config/schema.ts
106
+ import { z } from "zod";
107
/**
 * Zod schema for the user-supplied configuration object.
 *
 * Every section and almost every field is optional. The only required fields
 * are `source.crawl.baseUrl` and `source.contentFiles.globs`, and only once
 * their parent object is present.
 */
var searchSocketConfigSchema = (() => {
  // Factories (not shared instances) so each field gets its own schema object.
  const section = (shape) => z.object(shape).optional();
  const text = () => z.string().optional();
  const nonEmptyText = () => z.string().min(1).optional();
  const flag = () => z.boolean().optional();
  const positiveInt = () => z.number().int().positive().optional();
  const anyNumber = () => z.number().optional();
  const unitInterval = () => z.number().min(0).max(1).optional();
  const textList = () => z.array(z.string()).optional();
  const oneOf = (values) => z.enum(values).optional();

  return z.object({
    project: section({
      id: nonEmptyText(),
      baseUrl: z.string().url().optional()
    }),
    scope: section({
      mode: oneOf(["fixed", "git", "env"]),
      fixed: nonEmptyText(),
      envVar: nonEmptyText(),
      sanitize: flag()
    }),
    source: section({
      mode: oneOf(["static-output", "crawl", "content-files", "build"]),
      staticOutputDir: nonEmptyText(),
      strictRouteMapping: flag(),
      crawl: section({
        baseUrl: z.string().url(),
        routes: textList(),
        sitemapUrl: text()
      }),
      contentFiles: section({
        globs: z.array(z.string()).min(1),
        baseDir: text()
      }),
      build: section({
        outputDir: nonEmptyText(),
        paramValues: z.record(z.string(), z.array(z.string())).optional(),
        exclude: textList(),
        previewTimeout: positiveInt()
      })
    }),
    extract: section({
      mainSelector: text(),
      dropTags: textList(),
      dropSelectors: textList(),
      ignoreAttr: text(),
      noindexAttr: text(),
      respectRobotsNoindex: flag()
    }),
    transform: section({
      output: z.literal("markdown").optional(),
      preserveCodeBlocks: flag(),
      preserveTables: flag()
    }),
    chunking: section({
      strategy: z.literal("hybrid").optional(),
      maxChars: positiveInt(),
      overlapChars: z.number().int().nonnegative().optional(),
      minChars: positiveInt(),
      headingPathDepth: positiveInt(),
      dontSplitInside: z.array(z.enum(["code", "table", "blockquote"])).optional(),
      prependTitle: flag(),
      pageSummaryChunk: flag()
    }),
    embeddings: section({
      provider: z.literal("openai").optional(),
      model: nonEmptyText(),
      apiKeyEnv: nonEmptyText(),
      batchSize: positiveInt(),
      concurrency: positiveInt(),
      pricePer1kTokens: z.number().positive().optional()
    }),
    vector: section({
      dimension: positiveInt(),
      turso: section({
        urlEnv: text(),
        authTokenEnv: text(),
        localPath: text()
      })
    }),
    rerank: section({
      provider: oneOf(["none", "jina"]),
      topN: positiveInt(),
      jina: section({
        apiKeyEnv: text(),
        model: text()
      })
    }),
    ranking: section({
      enableIncomingLinkBoost: flag(),
      enableDepthBoost: flag(),
      pageWeights: z.record(z.string(), z.number().positive()).optional(),
      aggregationCap: positiveInt(),
      aggregationDecay: unitInterval(),
      minChunkScoreRatio: unitInterval(),
      weights: section({
        incomingLinks: anyNumber(),
        depth: anyNumber(),
        rerank: anyNumber(),
        aggregation: anyNumber()
      })
    }),
    api: section({
      path: text(),
      cors: section({
        allowOrigins: textList()
      }),
      rateLimit: section({
        windowMs: positiveInt(),
        max: positiveInt()
      })
    }),
    mcp: section({
      enable: flag(),
      transport: oneOf(["stdio", "http"]),
      http: section({
        port: positiveInt(),
        path: text()
      })
    }),
    state: section({
      dir: text(),
      writeMirror: flag()
    })
  });
})();
222
+
223
+ // src/config/defaults.ts
224
// Selectors that default configurations list under `extract.dropSelectors`.
var DEFAULT_DROP_SELECTORS = [
  ".sidebar",
  ".toc",
  ".table-of-contents",
  ".breadcrumbs",
  ".breadcrumb",
  "[role='navigation']"
];

/**
 * Builds a fresh default configuration object.
 *
 * @param {string} projectId - Stored as `project.id`.
 * @returns {object} A new configuration whose nested objects and arrays are
 *   not shared with any other call.
 */
function createDefaultConfig(projectId) {
  return {
    project: {
      id: projectId
    },
    scope: {
      mode: "fixed",
      fixed: "main",
      envVar: "SEARCHSOCKET_SCOPE",
      sanitize: true
    },
    source: {
      mode: "static-output",
      staticOutputDir: "build",
      strictRouteMapping: false
    },
    extract: {
      mainSelector: "main",
      dropTags: ["header", "nav", "footer", "aside"],
      // Copy the list: handing out the module-level array itself would let a
      // caller that mutates one config change the defaults of every later one.
      dropSelectors: [...DEFAULT_DROP_SELECTORS],
      ignoreAttr: "data-search-ignore",
      noindexAttr: "data-search-noindex",
      respectRobotsNoindex: true
    },
    transform: {
      output: "markdown",
      preserveCodeBlocks: true,
      preserveTables: true
    },
    chunking: {
      strategy: "hybrid",
      maxChars: 2200,
      overlapChars: 200,
      minChars: 250,
      headingPathDepth: 3,
      dontSplitInside: ["code", "table", "blockquote"],
      prependTitle: true,
      pageSummaryChunk: true
    },
    embeddings: {
      provider: "openai",
      model: "text-embedding-3-small",
      apiKeyEnv: "OPENAI_API_KEY",
      batchSize: 64,
      concurrency: 4
    },
    vector: {
      turso: {
        urlEnv: "TURSO_DATABASE_URL",
        authTokenEnv: "TURSO_AUTH_TOKEN",
        localPath: ".searchsocket/vectors.db"
      }
    },
    rerank: {
      provider: "none",
      topN: 20,
      jina: {
        apiKeyEnv: "JINA_API_KEY",
        model: "jina-reranker-v2-base-multilingual"
      }
    },
    ranking: {
      enableIncomingLinkBoost: true,
      enableDepthBoost: true,
      pageWeights: {},
      aggregationCap: 5,
      aggregationDecay: 0.5,
      minChunkScoreRatio: 0.5,
      weights: {
        incomingLinks: 0.05,
        depth: 0.03,
        rerank: 1,
        aggregation: 0.1
      }
    },
    api: {
      path: "/api/search",
      cors: {
        allowOrigins: []
      }
    },
    mcp: {
      // Evaluated on every call, so the default follows the current NODE_ENV.
      enable: process.env.NODE_ENV !== "production",
      transport: "stdio",
      http: {
        port: 3338,
        path: "/mcp"
      }
    },
    state: {
      dir: ".searchsocket",
      writeMirror: false
    }
  };
}
327
+
328
+ // src/errors/index.ts
329
/**
 * Error type carrying a machine-readable code and a numeric status.
 */
var SearchSocketError = class extends Error {
  code;
  status;
  /**
   * @param {string} code - Machine-readable error code (e.g. "CONFIG_MISSING").
   * @param {string} message - Human-readable description.
   * @param {number} [status=500] - Numeric status stored on the error.
   */
  constructor(code, message, status = 500) {
    super(message);
    // Without this, instances report themselves as plain "Error" in stack
    // traces and `String(err)`, hiding where they came from.
    this.name = "SearchSocketError";
    this.code = code;
    this.status = status;
  }
};
338
+
339
+ // src/config/load.ts
340
/**
 * Derives a project id for the directory `cwd`.
 *
 * Without a package.json the directory name is returned as-is. Otherwise the
 * manifest's `name` is used (falling back to the directory name when `name`
 * is missing, empty, or not a string), with every character outside
 * `[a-zA-Z0-9._-]` replaced by "-".
 *
 * @param {string} cwd - Project directory.
 * @returns {string} The inferred project id.
 * @throws {SyntaxError} When package.json exists but is not valid JSON.
 */
function inferProjectId(cwd) {
  const directoryName = path.basename(cwd);
  const packageJsonPath = path.join(cwd, "package.json");
  if (!fs.existsSync(packageJsonPath)) {
    return directoryName;
  }
  const manifest = JSON.parse(fs.readFileSync(packageJsonPath, "utf8"));
  const declaredName = manifest?.name;
  // An empty or non-string name would otherwise give an empty id or a TypeError.
  const usableName = typeof declaredName === "string" && declaredName.length > 0;
  const baseName = usableName ? declaredName : directoryName;
  return baseName.replace(/[^a-zA-Z0-9._-]/g, "-");
}
348
/**
 * Picks the source mode for a configuration.
 *
 * Precedence: an explicit `source.mode` in the parsed input, then the first
 * of `source.build`, `source.crawl`, `source.contentFiles` that is present,
 * then "static-output" if the configured static output directory exists.
 *
 * @param {string} cwd - Directory that `staticOutputDir` is resolved against.
 * @param {object} config - Merged config; only `source.staticOutputDir` is read.
 * @param {object} parsedInput - The validated user input.
 * @returns {string} The selected mode.
 * @throws {SearchSocketError} CONFIG_MISSING when nothing matches.
 */
function detectSourceMode(cwd, config, parsedInput) {
  const userSource = parsedInput.source;
  if (userSource?.mode) {
    return userSource.mode;
  }

  // Checked in this order: the first section the user supplied wins.
  const impliedModes = [
    ["build", "build"],
    ["crawl", "crawl"],
    ["contentFiles", "content-files"]
  ];
  for (const [sectionKey, mode] of impliedModes) {
    if (userSource?.[sectionKey]) {
      return mode;
    }
  }

  const staticOutputPath = path.resolve(cwd, config.source.staticOutputDir);
  if (!fs.existsSync(staticOutputPath)) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Unable to auto-detect source mode because ${staticOutputPath} does not exist. Set \`source.mode\` explicitly (static-output, crawl, content-files, or build).`
    );
  }
  return "static-output";
}
370
/**
 * Validates a raw user configuration and layers it over the defaults.
 *
 * The project id comes from `rawConfig.project.id` or, failing that, from
 * `inferProjectId(cwd)`. After merging, `source.mode` is resolved via
 * `detectSourceMode` and the mode-specific requirements are checked.
 *
 * @param {string} cwd - Project directory.
 * @param {object} rawConfig - Configuration as exported by the user's file.
 * @returns {object} The fully resolved configuration.
 * @throws {SearchSocketError} CONFIG_MISSING when validation fails, when crawl
 *   mode has no `baseUrl`, or when content-files mode has no globs; errors
 *   from `detectSourceMode` propagate unchanged.
 */
function mergeConfig(cwd, rawConfig) {
  const projectId = rawConfig.project?.id ?? inferProjectId(cwd);
  const defaults = createDefaultConfig(projectId);

  const validation = searchSocketConfigSchema.safeParse(rawConfig);
  if (!validation.success) {
    const issueLines = validation.error.issues.map(
      (issue) => ` ${issue.path.join(".")}: ${issue.message}`
    );
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Invalid searchsocket.config.ts:\n${issueLines.join("\n")}`
    );
  }
  const parsed = validation.data;

  // Fills every build option, whether or not the user supplied a build section.
  const resolveBuild = (build = {}) => ({
    outputDir: build.outputDir ?? ".svelte-kit/output",
    paramValues: build.paramValues ?? {},
    exclude: build.exclude ?? [],
    previewTimeout: build.previewTimeout ?? 3e4
  });

  const userCrawl = parsed.source?.crawl;
  const userContentFiles = parsed.source?.contentFiles;
  const userBuild = parsed.source?.build;
  const userRateLimit = parsed.api?.rateLimit;

  const source = {
    ...defaults.source,
    ...parsed.source,
    crawl: userCrawl
      ? {
          ...defaults.source.crawl,
          ...userCrawl,
          routes: userCrawl.routes ?? []
        }
      : defaults.source.crawl,
    contentFiles: userContentFiles
      ? {
          ...defaults.source.contentFiles,
          ...userContentFiles,
          baseDir: userContentFiles.baseDir ?? defaults.source.contentFiles?.baseDir ?? cwd
        }
      : defaults.source.contentFiles,
    build: userBuild ? resolveBuild(userBuild) : void 0
  };

  const vector = {
    ...defaults.vector,
    ...parsed.vector,
    turso: { ...defaults.vector.turso, ...parsed.vector?.turso }
  };

  const rerank = {
    ...defaults.rerank,
    ...parsed.rerank,
    jina: { ...defaults.rerank.jina, ...parsed.rerank?.jina }
  };

  const ranking = {
    ...defaults.ranking,
    ...parsed.ranking,
    pageWeights: { ...defaults.ranking.pageWeights, ...parsed.ranking?.pageWeights },
    weights: { ...defaults.ranking.weights, ...parsed.ranking?.weights }
  };

  const api = {
    ...defaults.api,
    ...parsed.api,
    cors: {
      ...defaults.api.cors,
      ...parsed.api?.cors,
      allowOrigins: parsed.api?.cors?.allowOrigins ?? defaults.api.cors.allowOrigins
    },
    rateLimit: userRateLimit
      ? {
          windowMs: userRateLimit.windowMs ?? 6e4,
          max: userRateLimit.max ?? 60
        }
      : defaults.api.rateLimit
  };

  const mcp = {
    ...defaults.mcp,
    ...parsed.mcp,
    http: { ...defaults.mcp.http, ...parsed.mcp?.http }
  };

  const merged = {
    ...defaults,
    project: { ...defaults.project, ...parsed.project },
    scope: { ...defaults.scope, ...parsed.scope },
    source,
    extract: { ...defaults.extract, ...parsed.extract },
    transform: { ...defaults.transform, ...parsed.transform },
    chunking: { ...defaults.chunking, ...parsed.chunking },
    embeddings: { ...defaults.embeddings, ...parsed.embeddings },
    vector,
    rerank,
    ranking,
    api,
    mcp,
    state: { ...defaults.state, ...parsed.state }
  };

  merged.project.id = projectId;
  merged.source.mode = detectSourceMode(cwd, merged, parsed);

  const { mode } = merged.source;
  if (mode === "build" && !merged.source.build) {
    // Build mode was requested without a build section: use all defaults.
    merged.source.build = resolveBuild();
  }
  if (mode === "crawl" && !merged.source.crawl?.baseUrl) {
    throw new SearchSocketError("CONFIG_MISSING", "`source.crawl.baseUrl` is required when source.mode is crawl.");
  }
  const contentFiles = merged.source.contentFiles;
  if (mode === "content-files" && (!contentFiles || contentFiles.globs.length === 0)) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`source.contentFiles.globs` is required when source.mode is content-files."
    );
  }
  return merged;
}
504
/**
 * Loads and resolves the configuration file.
 *
 * @param {object} [options]
 * @param {string} [options.cwd] - Project directory; defaults to `process.cwd()`.
 * @param {string} [options.configPath] - Config file path relative to `cwd`;
 *   defaults to "searchsocket.config.ts".
 * @param {boolean} [options.allowMissing] - When truthy, a missing file yields
 *   a default static-output configuration instead of an error.
 * @returns {Promise<object>} The merged configuration.
 * @throws {SearchSocketError} CONFIG_MISSING when the file is absent and
 *   `allowMissing` is not set.
 */
async function loadConfig(options = {}) {
  const cwd = path.resolve(options.cwd ?? process.cwd());
  const configPath = path.resolve(cwd, options.configPath ?? "searchsocket.config.ts");

  if (fs.existsSync(configPath)) {
    const jiti = createJiti(cwd, { interopDefault: true });
    const loaded = await jiti.import(configPath);
    // Accept either a default export or the module object itself.
    return mergeConfig(cwd, loaded.default ?? loaded);
  }

  if (!options.allowMissing) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Configuration file not found at ${configPath}. Run \`searchsocket init\` first.`
    );
  }
  return mergeConfig(cwd, { source: { mode: "static-output" } });
}
525
/**
 * Creates `searchsocket.config.ts` in `cwd` unless it already exists.
 *
 * @param {string} cwd - Directory to write into.
 * @returns {string} Path of the config file, whether or not it was just created.
 */
function writeMinimalConfig(cwd) {
  const target = path.join(cwd, "searchsocket.config.ts");
  if (!fs.existsSync(target)) {
    // NOTE(review): leading whitespace inside this template is not visible in
    // the reviewed copy of the file - confirm the intended indentation.
    const content = `export default {
embeddings: { apiKeyEnv: "OPENAI_API_KEY" }
};
`;
    fs.writeFileSync(target, content, "utf8");
  }
  return target;
}
537
+
538
+ // src/core/logger.ts
539
/**
 * Minimal logger with plain-text and JSON-lines output.
 *
 * Options: `json` (emit JSON lines), `verbose` (enable debug and event
 * output), `stderrOnly` (send everything that would go to stdout to stderr).
 * All default to false.
 */
var Logger = class {
  json;
  verbose;
  stderrOnly;
  constructor(opts = {}) {
    this.json = opts.json ?? false;
    this.verbose = opts.verbose ?? false;
    this.stderrOnly = opts.stderrOnly ?? false;
  }
  /** Writes a plain line; suppressed entirely in JSON mode. */
  info(message) {
    if (!this.json) {
      this.writeOut(`${message}\n`);
    }
  }
  /** Writes a debug line or JSON entry, only when `verbose` is set. */
  debug(message) {
    if (!this.verbose) {
      return;
    }
    if (this.json) {
      this.logJson("debug", { message });
    } else {
      this.writeOut(`${message}\n`);
    }
  }
  /** Emits a warning: a JSON entry in JSON mode, otherwise "WARN: ..." on stderr. */
  warn(message) {
    if (this.json) {
      this.logJson("warn", { message });
    } else {
      process.stderr.write(`WARN: ${message}\n`);
    }
  }
  /** Emits an error: a JSON entry in JSON mode, otherwise "ERROR: ..." on stderr. */
  error(message) {
    if (this.json) {
      this.logJson("error", { message });
    } else {
      process.stderr.write(`ERROR: ${message}\n`);
    }
  }
  /** Emits a named event; silent unless JSON mode or `verbose` is on. */
  event(event, data) {
    if (this.json) {
      this.logJson(event, data);
    } else if (this.verbose) {
      const payload = data ? JSON.stringify(data) : "";
      this.writeOut(`[${event}] ${payload}\n`);
    }
  }
  /** Writes raw text to stdout, or to stderr when `stderrOnly` is set. */
  writeOut(text) {
    const stream = this.stderrOnly ? process.stderr : process.stdout;
    stream.write(text);
  }
  /** Writes one JSON line of the form `{ event, ts, data }`. */
  logJson(event, data) {
    const ts = (/* @__PURE__ */ new Date()).toISOString();
    this.writeOut(`${JSON.stringify({ event, ts, data })}\n`);
  }
};
610
+
611
+ // src/core/scope.ts
612
+ import { execSync } from "child_process";
613
+
614
+ // src/utils/text.ts
615
/**
 * Collapses every run of whitespace (including line breaks) into a single
 * space and trims both ends.
 *
 * @param {string} input
 * @returns {string}
 */
function normalizeText(input) {
  const unixNewlines = input.replace(/\r\n/g, "\n");
  const singleSpaced = unixNewlines.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
618
/**
 * Normalises markdown text: CRLF becomes LF, trailing spaces and tabs are
 * removed from every line, and the result is trimmed and terminated with
 * exactly one newline.
 *
 * @param {string} input
 * @returns {string}
 */
function normalizeMarkdown(input) {
  const unixNewlines = input.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
  return `${withoutTrailingBlanks.trim()}\n`;
}
621
+ function sanitizeScopeName(scopeName) {
622
+ return scopeName.toLowerCase().replace(/[^a-z0-9._-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
623
+ }
624
/**
 * Produces a short plain-text snippet from markdown.
 *
 * Fenced code blocks are dropped, inline code keeps its text, the characters
 * `# > * _ | -` become spaces, and whitespace is collapsed. Text longer than
 * `maxLen` is cut to `maxLen - 1` characters, trimmed, and given an ellipsis.
 *
 * @param {string} markdown
 * @param {number} [maxLen=220]
 * @returns {string}
 */
function toSnippet(markdown, maxLen = 220) {
  const withoutFences = markdown.replace(/```[\s\S]*?```/g, " ");
  const withoutBackticks = withoutFences.replace(/`([^`]+)`/g, "$1");
  const withoutMarkup = withoutBackticks.replace(/[#>*_|\-]/g, " ");
  const plain = withoutMarkup.replace(/\s+/g, " ").trim();

  if (plain.length > maxLen) {
    const head = plain.slice(0, Math.max(0, maxLen - 1)).trim();
    return `${head}\u2026`;
  }
  return plain;
}
631
/**
 * Returns the first paragraph of a markdown document as a single line.
 *
 * Headings, blank lines, and fenced code (``` or ~~~) are skipped until text
 * is found; the paragraph then ends at the next heading, blank line, or fence.
 *
 * @param {string} markdown
 * @returns {string} The paragraph's lines joined by spaces, or "" if none.
 */
function extractFirstParagraph(markdown) {
  const paragraph = [];
  let insideFence = false;

  for (const rawLine of markdown.split("\n")) {
    const text = rawLine.trim();
    const isFenceMarker = /^(```|~~~)/.test(text);
    if (isFenceMarker) {
      insideFence = !insideFence;
    } else if (insideFence) {
      // Code content never contributes to, or terminates, the paragraph.
      continue;
    }

    const isBoundary = isFenceMarker || text === "" || /^#{1,6}\s/.test(text);
    if (!isBoundary) {
      paragraph.push(text);
      continue;
    }
    if (paragraph.length > 0) {
      break;
    }
  }
  return paragraph.join(" ");
}
655
+
656
+ // src/core/scope.ts
657
/**
 * Resolves the unsanitised scope name from `config.scope`.
 *
 * - mode "fixed": returns `scope.fixed`.
 * - mode "env": returns the value of the environment variable named by
 *   `scope.envVar`, throwing when it is unset or empty.
 * - any other mode: asks git for the current branch name and falls back to
 *   `scope.fixed` if the command fails.
 *
 * @param {object} config
 * @returns {string}
 * @throws {Error} In env mode when the variable is not set.
 */
function resolveRawScopeName(config) {
  const { scope } = config;
  switch (scope.mode) {
    case "fixed":
      return scope.fixed;
    case "env": {
      const value = process.env[scope.envVar];
      if (value) {
        return value;
      }
      throw new Error(`Scope mode is env but ${scope.envVar} is not set.`);
    }
    default:
      try {
        const branch = execSync("git rev-parse --abbrev-ref HEAD", {
          encoding: "utf8",
          stdio: ["ignore", "pipe", "ignore"]
        });
        return branch.trim();
      } catch {
        return scope.fixed;
      }
  }
}
677
/**
 * Builds the scope descriptor for a configuration.
 *
 * @param {object} config - Reads `scope.sanitize` and `project.id`.
 * @param {string} [override] - Scope name to use instead of the configured
 *   one; a null or undefined value falls back to `resolveRawScopeName`.
 * @returns {{projectId: string, scopeName: string, scopeId: string}} where
 *   `scopeId` is `"<projectId>:<scopeName>"`.
 */
function resolveScope(config, override) {
  let scopeName = override ?? resolveRawScopeName(config);
  if (config.scope.sanitize) {
    scopeName = sanitizeScopeName(scopeName);
  }
  const projectId = config.project.id;
  return {
    projectId,
    scopeName,
    scopeId: `${projectId}:${scopeName}`
  };
}
686
+
687
+ // src/core/state.ts
688
+ import fs2 from "fs";
689
+ import path2 from "path";
690
/**
 * Ensures the per-scope pages directory exists under the state directory.
 *
 * @param {string} cwd - Base directory `stateDir` is resolved against.
 * @param {string} stateDir - State directory, absolute or relative to `cwd`.
 * @param {{scopeName: string}} scope - Scope whose pages directory is created.
 * @returns {{statePath: string, pagesPath: string}} The resolved state
 *   directory and `<statePath>/pages/<scopeName>`.
 */
function ensureStateDirs(cwd, stateDir, scope) {
  const stateRoot = path2.resolve(cwd, stateDir);
  const scopedPages = path2.join(stateRoot, "pages", scope.scopeName);
  // `recursive` creates missing parents and tolerates an existing directory.
  fs2.mkdirSync(scopedPages, { recursive: true });
  return { statePath: stateRoot, pagesPath: scopedPages };
}
696
+
697
+ // src/embeddings/openai.ts
698
+ import OpenAI from "openai";
699
+ import pLimit from "p-limit";
700
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => setTimeout(wake, ms));
}
705
/**
 * Embeddings provider backed by the OpenAI client.
 *
 * Texts are embedded in batches of `batchSize`, with at most `concurrency`
 * requests in flight, and each request is retried on HTTP 429 / 5xx.
 */
var OpenAIEmbeddingsProvider = class {
  client;
  batchSize;
  concurrency;
  /**
   * @param {{apiKey: string, batchSize: number, concurrency: number}} options
   * @throws {Error} When `batchSize` or `concurrency` is not a positive integer.
   */
  constructor(options) {
    for (const key of ["batchSize", "concurrency"]) {
      const value = options[key];
      if (!Number.isInteger(value) || value <= 0) {
        throw new Error(`Invalid ${key}: ${value}. ${key} must be a positive integer.`);
      }
    }
    this.client = new OpenAI({ apiKey: options.apiKey });
    this.batchSize = options.batchSize;
    this.concurrency = options.concurrency;
  }
  /**
   * Heuristic token estimate: the larger of a length-based figure (one token
   * per four characters) and a weighted count of words, punctuation, and
   * characters in the U+3400-U+9FFF range. Blank text counts as zero tokens.
   *
   * @param {string} text
   * @returns {number}
   */
  estimateTokens(text) {
    const normalized = text.trim();
    if (normalized === "") {
      return 0;
    }
    const countMatches = (pattern) => (normalized.match(pattern) ?? []).length;
    const words = countMatches(/[A-Za-z0-9_]+/g);
    const punctuation = countMatches(/[^\s\w]/g);
    const cjk = countMatches(/[\u3400-\u9fff]/g);
    const byLength = Math.ceil(normalized.length / 4);
    const byContent = Math.ceil(words * 1.25 + punctuation * 0.45 + cjk * 1.6);
    return Math.max(1, byLength, byContent);
  }
  /**
   * Embeds `texts` with model `modelId`, preserving input order.
   *
   * @param {string[]} texts
   * @param {string} modelId
   * @returns {Promise<number[][]>} One vector per input text.
   */
  async embedTexts(texts, modelId) {
    if (texts.length === 0) {
      return [];
    }
    const size = this.batchSize;
    const batchCount = Math.ceil(texts.length / size);
    const batches = Array.from(
      { length: batchCount },
      (_, batchIndex) => texts.slice(batchIndex * size, (batchIndex + 1) * size)
    );
    const limit = pLimit(this.concurrency);
    // Promise.all keeps results in batch order regardless of completion order.
    const perBatch = await Promise.all(
      batches.map((values) => limit(() => this.embedWithRetry(values, modelId)))
    );
    return perBatch.flat();
  }
  /**
   * Embeds one batch, retrying up to five attempts in total when the error's
   * `status` is 429 or >= 500. The wait before a retry doubles per attempt
   * (starting at 600 ms) and is capped at 5 s. Other errors are rethrown
   * immediately.
   *
   * @param {string[]} texts
   * @param {string} modelId
   * @returns {Promise<number[][]>}
   */
  async embedWithRetry(texts, modelId) {
    const maxAttempts = 5;
    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
      try {
        const response = await this.client.embeddings.create({
          model: modelId,
          input: texts,
          encoding_format: "float"
        });
        return response.data.map((entry) => entry.embedding);
      } catch (error) {
        const status = error.status;
        const retryable = status === 429 || typeof status === "number" && status >= 500;
        if (!retryable || attempt >= maxAttempts) {
          throw error;
        }
        await sleep(Math.min(2 ** attempt * 300, 5e3));
      }
    }
    throw new Error("Unreachable retry state");
  }
};
781
+
782
+ // src/embeddings/factory.ts
783
/**
 * Creates the embeddings provider described by `config.embeddings`.
 *
 * @param {object} config - Reads `embeddings.provider`, `apiKeyEnv`,
 *   `batchSize`, and `concurrency`.
 * @returns {OpenAIEmbeddingsProvider}
 * @throws {SearchSocketError} CONFIG_MISSING when the provider is not "openai"
 *   or the environment variable named by `apiKeyEnv` is unset or empty.
 */
function createEmbeddingsProvider(config) {
  const settings = config.embeddings;
  if (settings.provider !== "openai") {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Unsupported embeddings provider ${settings.provider}`
    );
  }

  // The key itself is never stored in config; only the variable name is.
  const apiKey = process.env[settings.apiKeyEnv];
  if (!apiKey) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      `Missing embeddings API key env var: ${settings.apiKeyEnv}`
    );
  }

  const { batchSize, concurrency } = settings;
  return new OpenAIEmbeddingsProvider({ apiKey, batchSize, concurrency });
}
803
+
804
+ // src/indexing/pipeline.ts
805
+ import path11 from "path";
806
+
807
+ // src/vector/factory.ts
808
+ import fs3 from "fs";
809
+ import path3 from "path";
810
+
811
+ // src/vector/turso.ts
812
+ var TursoVectorStore = class {
813
+ client;
814
+ dimension;
815
+ chunksReady = false;
816
+ registryReady = false;
817
+ pagesReady = false;
818
+ constructor(opts) {
819
+ this.client = opts.client;
820
+ this.dimension = opts.dimension;
821
+ }
822
+ async ensureRegistry() {
823
+ if (this.registryReady) return;
824
+ await this.client.execute(`
825
+ CREATE TABLE IF NOT EXISTS registry (
826
+ scope_key TEXT PRIMARY KEY,
827
+ project_id TEXT NOT NULL,
828
+ scope_name TEXT NOT NULL,
829
+ model_id TEXT NOT NULL,
830
+ last_indexed_at TEXT NOT NULL,
831
+ vector_count INTEGER,
832
+ last_estimate_tokens INTEGER,
833
+ last_estimate_cost_usd REAL,
834
+ last_estimate_changed_chunks INTEGER
835
+ )
836
+ `);
837
+ const estimateCols = [
838
+ { name: "last_estimate_tokens", def: "INTEGER" },
839
+ { name: "last_estimate_cost_usd", def: "REAL" },
840
+ { name: "last_estimate_changed_chunks", def: "INTEGER" }
841
+ ];
842
+ for (const col of estimateCols) {
843
+ try {
844
+ await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
845
+ } catch (error) {
846
+ if (error instanceof Error && !error.message.includes("duplicate column")) {
847
+ throw error;
848
+ }
849
+ }
850
+ }
851
+ this.registryReady = true;
852
+ }
853
+ async ensureChunks(dim) {
854
+ if (this.chunksReady) return;
855
+ await this.client.batch([
856
+ `CREATE TABLE IF NOT EXISTS chunks (
857
+ id TEXT PRIMARY KEY,
858
+ project_id TEXT NOT NULL,
859
+ scope_name TEXT NOT NULL,
860
+ url TEXT NOT NULL,
861
+ path TEXT NOT NULL,
862
+ title TEXT NOT NULL,
863
+ section_title TEXT NOT NULL DEFAULT '',
864
+ heading_path TEXT NOT NULL DEFAULT '[]',
865
+ snippet TEXT NOT NULL DEFAULT '',
866
+ content_hash TEXT NOT NULL DEFAULT '',
867
+ model_id TEXT NOT NULL DEFAULT '',
868
+ depth INTEGER NOT NULL DEFAULT 0,
869
+ incoming_links INTEGER NOT NULL DEFAULT 0,
870
+ route_file TEXT NOT NULL DEFAULT '',
871
+ tags TEXT NOT NULL DEFAULT '[]',
872
+ embedding F32_BLOB(${dim})
873
+ )`,
874
+ `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
875
+ ]);
876
+ this.chunksReady = true;
877
+ }
878
+ async ensurePages() {
879
+ if (this.pagesReady) return;
880
+ await this.client.execute(`
881
+ CREATE TABLE IF NOT EXISTS pages (
882
+ project_id TEXT NOT NULL,
883
+ scope_name TEXT NOT NULL,
884
+ url TEXT NOT NULL,
885
+ title TEXT NOT NULL,
886
+ markdown TEXT NOT NULL,
887
+ route_file TEXT NOT NULL DEFAULT '',
888
+ route_resolution TEXT NOT NULL DEFAULT 'exact',
889
+ incoming_links INTEGER NOT NULL DEFAULT 0,
890
+ outgoing_links INTEGER NOT NULL DEFAULT 0,
891
+ depth INTEGER NOT NULL DEFAULT 0,
892
+ tags TEXT NOT NULL DEFAULT '[]',
893
+ indexed_at TEXT NOT NULL,
894
+ PRIMARY KEY (project_id, scope_name, url)
895
+ )
896
+ `);
897
+ this.pagesReady = true;
898
+ }
899
+ async chunksTableExists() {
900
+ try {
901
+ await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
902
+ return true;
903
+ } catch (error) {
904
+ if (error instanceof Error && error.message.includes("no such table")) {
905
+ return false;
906
+ }
907
+ throw error;
908
+ }
909
+ }
910
+ async upsert(records, _scope) {
911
+ if (records.length === 0) return;
912
+ const dim = this.dimension ?? records[0].vector.length;
913
+ await this.ensureChunks(dim);
914
+ const BATCH_SIZE = 100;
915
+ for (let i = 0; i < records.length; i += BATCH_SIZE) {
916
+ const batch = records.slice(i, i + BATCH_SIZE);
917
+ const stmts = batch.map((r) => ({
918
+ sql: `INSERT OR REPLACE INTO chunks
919
+ (id, project_id, scope_name, url, path, title, section_title,
920
+ heading_path, snippet, content_hash, model_id, depth,
921
+ incoming_links, route_file, tags, embedding)
922
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
923
+ args: [
924
+ r.id,
925
+ r.metadata.projectId,
926
+ r.metadata.scopeName,
927
+ r.metadata.url,
928
+ r.metadata.path,
929
+ r.metadata.title,
930
+ r.metadata.sectionTitle,
931
+ JSON.stringify(r.metadata.headingPath),
932
+ r.metadata.snippet,
933
+ r.metadata.contentHash,
934
+ r.metadata.modelId,
935
+ r.metadata.depth,
936
+ r.metadata.incomingLinks,
937
+ r.metadata.routeFile,
938
+ JSON.stringify(r.metadata.tags),
939
+ JSON.stringify(r.vector)
940
+ ]
941
+ }));
942
+ await this.client.batch(stmts);
943
+ }
944
+ }
945
+ async query(queryVector, opts, scope) {
946
+ const dim = this.dimension ?? queryVector.length;
947
+ await this.ensureChunks(dim);
948
+ const queryJson = JSON.stringify(queryVector);
949
+ const rs = await this.client.execute({
950
+ sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
951
+ c.section_title, c.heading_path, c.snippet, c.content_hash,
952
+ c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
953
+ vector_distance_cos(c.embedding, vector(?)) AS distance
954
+ FROM vector_top_k('idx', vector(?), ?) AS v
955
+ JOIN chunks AS c ON c.rowid = v.id`,
956
+ args: [queryJson, queryJson, opts.topK]
957
+ });
958
+ let hits = [];
959
+ for (const row of rs.rows) {
960
+ const projectId = row.project_id;
961
+ const scopeName = row.scope_name;
962
+ if (projectId !== scope.projectId || scopeName !== scope.scopeName) {
963
+ continue;
964
+ }
965
+ const rowPath = row.path;
966
+ if (opts.pathPrefix) {
967
+ const rawPrefix = opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}`;
968
+ const prefix = rawPrefix.endsWith("/") ? rawPrefix : `${rawPrefix}/`;
969
+ const normalizedPath = rowPath.replace(/\/$/, "");
970
+ const normalizedPrefix = rawPrefix.replace(/\/$/, "");
971
+ if (normalizedPath !== normalizedPrefix && !rowPath.startsWith(prefix)) {
972
+ continue;
973
+ }
974
+ }
975
+ const tags = JSON.parse(row.tags || "[]");
976
+ if (opts.tags && opts.tags.length > 0) {
977
+ if (!opts.tags.every((t) => tags.includes(t))) {
978
+ continue;
979
+ }
980
+ }
981
+ const distance = row.distance;
982
+ const score = 1 - distance;
983
+ hits.push({
984
+ id: row.id,
985
+ score,
986
+ metadata: {
987
+ projectId,
988
+ scopeName,
989
+ url: row.url,
990
+ path: rowPath,
991
+ title: row.title,
992
+ sectionTitle: row.section_title,
993
+ headingPath: JSON.parse(row.heading_path || "[]"),
994
+ snippet: row.snippet,
995
+ contentHash: row.content_hash,
996
+ modelId: row.model_id,
997
+ depth: row.depth,
998
+ incomingLinks: row.incoming_links,
999
+ routeFile: row.route_file,
1000
+ tags
1001
+ }
1002
+ });
1003
+ }
1004
+ hits.sort((a, b) => b.score - a.score);
1005
+ return hits;
1006
+ }
1007
+ async deleteByIds(ids, scope) {
1008
+ if (ids.length === 0) return;
1009
+ const BATCH_SIZE = 500;
1010
+ for (let i = 0; i < ids.length; i += BATCH_SIZE) {
1011
+ const batch = ids.slice(i, i + BATCH_SIZE);
1012
+ const placeholders = batch.map(() => "?").join(", ");
1013
+ await this.client.execute({
1014
+ sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
1015
+ args: [scope.projectId, scope.scopeName, ...batch]
1016
+ });
1017
+ }
1018
+ }
1019
+ async deleteScope(scope) {
1020
+ await this.ensureRegistry();
1021
+ try {
1022
+ await this.client.execute({
1023
+ sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ?`,
1024
+ args: [scope.projectId, scope.scopeName]
1025
+ });
1026
+ } catch (error) {
1027
+ if (error instanceof Error && !error.message.includes("no such table")) {
1028
+ throw error;
1029
+ }
1030
+ }
1031
+ try {
1032
+ await this.client.execute({
1033
+ sql: `DELETE FROM pages WHERE project_id = ? AND scope_name = ?`,
1034
+ args: [scope.projectId, scope.scopeName]
1035
+ });
1036
+ } catch (error) {
1037
+ if (error instanceof Error && !error.message.includes("no such table")) {
1038
+ throw error;
1039
+ }
1040
+ }
1041
+ await this.client.execute({
1042
+ sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
1043
+ args: [scope.projectId, scope.scopeName]
1044
+ });
1045
+ }
1046
+ async listScopes(scopeProjectId) {
1047
+ await this.ensureRegistry();
1048
+ const rs = await this.client.execute({
1049
+ sql: `SELECT project_id, scope_name, model_id, last_indexed_at, vector_count,
1050
+ last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks
1051
+ FROM registry WHERE project_id = ?`,
1052
+ args: [scopeProjectId]
1053
+ });
1054
+ return rs.rows.map((row) => ({
1055
+ projectId: row.project_id,
1056
+ scopeName: row.scope_name,
1057
+ modelId: row.model_id,
1058
+ lastIndexedAt: row.last_indexed_at,
1059
+ vectorCount: row.vector_count,
1060
+ lastEstimateTokens: row.last_estimate_tokens,
1061
+ lastEstimateCostUSD: row.last_estimate_cost_usd,
1062
+ lastEstimateChangedChunks: row.last_estimate_changed_chunks
1063
+ }));
1064
+ }
1065
+ async recordScope(info) {
1066
+ await this.ensureRegistry();
1067
+ const key = `${info.projectId}:${info.scopeName}`;
1068
+ await this.client.execute({
1069
+ sql: `INSERT OR REPLACE INTO registry
1070
+ (scope_key, project_id, scope_name, model_id, last_indexed_at, vector_count,
1071
+ last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks)
1072
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
1073
+ args: [
1074
+ key,
1075
+ info.projectId,
1076
+ info.scopeName,
1077
+ info.modelId,
1078
+ info.lastIndexedAt,
1079
+ info.vectorCount ?? null,
1080
+ info.lastEstimateTokens ?? null,
1081
+ info.lastEstimateCostUSD ?? null,
1082
+ info.lastEstimateChangedChunks ?? null
1083
+ ]
1084
+ });
1085
+ }
1086
+ async getContentHashes(scope) {
1087
+ const exists = await this.chunksTableExists();
1088
+ if (!exists) return /* @__PURE__ */ new Map();
1089
+ const rs = await this.client.execute({
1090
+ sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
1091
+ args: [scope.projectId, scope.scopeName]
1092
+ });
1093
+ const map = /* @__PURE__ */ new Map();
1094
+ for (const row of rs.rows) {
1095
+ map.set(row.id, row.content_hash);
1096
+ }
1097
+ return map;
1098
+ }
1099
+ async upsertPages(pages, scope) {
1100
+ if (pages.length === 0) return;
1101
+ await this.ensurePages();
1102
+ for (const page of pages) {
1103
+ if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
1104
+ throw new Error(
1105
+ `Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
1106
+ );
1107
+ }
1108
+ }
1109
+ const BATCH_SIZE = 100;
1110
+ for (let i = 0; i < pages.length; i += BATCH_SIZE) {
1111
+ const batch = pages.slice(i, i + BATCH_SIZE);
1112
+ const stmts = batch.map((p) => ({
1113
+ sql: `INSERT OR REPLACE INTO pages
1114
+ (project_id, scope_name, url, title, markdown, route_file,
1115
+ route_resolution, incoming_links, outgoing_links, depth, tags, indexed_at)
1116
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
1117
+ args: [
1118
+ p.projectId,
1119
+ p.scopeName,
1120
+ p.url,
1121
+ p.title,
1122
+ p.markdown,
1123
+ p.routeFile,
1124
+ p.routeResolution,
1125
+ p.incomingLinks,
1126
+ p.outgoingLinks,
1127
+ p.depth,
1128
+ JSON.stringify(p.tags),
1129
+ p.indexedAt
1130
+ ]
1131
+ }));
1132
+ await this.client.batch(stmts);
1133
+ }
1134
+ }
1135
+ async getPage(url, scope) {
1136
+ await this.ensurePages();
1137
+ const rs = await this.client.execute({
1138
+ sql: `SELECT * FROM pages WHERE project_id = ? AND scope_name = ? AND url = ?`,
1139
+ args: [scope.projectId, scope.scopeName, url]
1140
+ });
1141
+ if (rs.rows.length === 0) return null;
1142
+ const row = rs.rows[0];
1143
+ return {
1144
+ url: row.url,
1145
+ title: row.title,
1146
+ markdown: row.markdown,
1147
+ projectId: row.project_id,
1148
+ scopeName: row.scope_name,
1149
+ routeFile: row.route_file,
1150
+ routeResolution: row.route_resolution,
1151
+ incomingLinks: row.incoming_links,
1152
+ outgoingLinks: row.outgoing_links,
1153
+ depth: row.depth,
1154
+ tags: JSON.parse(row.tags || "[]"),
1155
+ indexedAt: row.indexed_at
1156
+ };
1157
+ }
1158
+ async deletePages(scope) {
1159
+ await this.ensurePages();
1160
+ await this.client.execute({
1161
+ sql: `DELETE FROM pages WHERE project_id = ? AND scope_name = ?`,
1162
+ args: [scope.projectId, scope.scopeName]
1163
+ });
1164
+ }
1165
+ async getScopeModelId(scope) {
1166
+ await this.ensureRegistry();
1167
+ const rs = await this.client.execute({
1168
+ sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
1169
+ args: [scope.projectId, scope.scopeName]
1170
+ });
1171
+ if (rs.rows.length === 0) return null;
1172
+ return rs.rows[0].model_id;
1173
+ }
1174
+ async health() {
1175
+ try {
1176
+ await this.client.execute("SELECT 1");
1177
+ return { ok: true };
1178
+ } catch (error) {
1179
+ return {
1180
+ ok: false,
1181
+ details: error instanceof Error ? error.message : "unknown error"
1182
+ };
1183
+ }
1184
+ }
1185
+ };
1186
+
1187
+ // src/vector/factory.ts
1188
/**
 * Build a TursoVectorStore from configuration.
 *
 * When the environment variable named by `config.vector.turso.urlEnv` is set,
 * the HTTP libsql client is used against that URL (with the token from
 * `authTokenEnv`). Otherwise a local `file:` database is opened at
 * `turso.localPath` resolved against `cwd`, creating its directory first.
 *
 * @param {object} config - Resolved configuration (reads `config.vector`).
 * @param {string} cwd - Base directory for the local database path.
 * @returns {Promise<TursoVectorStore>}
 */
async function createVectorStore(config, cwd) {
  const tursoOptions = config.vector.turso;
  const remoteUrl = process.env[tursoOptions.urlEnv];
  let client;
  if (remoteUrl) {
    const { createClient: createHttpClient } = await import("@libsql/client/http");
    const authToken = process.env[tursoOptions.authTokenEnv];
    client = createHttpClient({ url: remoteUrl, authToken });
  } else {
    const { createClient: createLocalClient } = await import("@libsql/client");
    const databaseFile = path3.resolve(cwd, tursoOptions.localPath);
    fs3.mkdirSync(path3.dirname(databaseFile), { recursive: true });
    client = createLocalClient({ url: `file:${databaseFile}` });
  }
  return new TursoVectorStore({
    client,
    dimension: config.vector.dimension
  });
}
1214
+
1215
+ // src/utils/hash.ts
1216
+ import { createHash } from "crypto";
1217
/**
 * Hex-encoded SHA-1 digest of `input`.
 *
 * @param {string|Buffer} input - Data to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha1(input) {
  const hasher = createHash("sha1");
  hasher.update(input);
  return hasher.digest("hex");
}
1220
/**
 * Hex-encoded SHA-256 digest of `input`.
 *
 * @param {string|Buffer} input - Data to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256(input) {
  const hasher = createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
1223
+
1224
+ // src/utils/path.ts
1225
+ import path4 from "path";
1226
/**
 * Canonicalize a URL path: trim whitespace, force a leading slash, collapse
 * runs of slashes, and drop a trailing slash (except for the root "/").
 *
 * @param {string} rawPath - Path as found in a link or file mapping.
 * @returns {string} Normalized path, always starting with "/".
 */
function normalizeUrlPath(rawPath) {
  const trimmed = rawPath.trim();
  const rooted = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
  const collapsed = rooted.replace(/\/+/g, "/");
  const hasTrailingSlash = collapsed.length > 1 && collapsed.endsWith("/");
  return hasTrailingSlash ? collapsed.slice(0, -1) : collapsed;
}
1237
/**
 * Map a URL path to the relative file name of its markdown mirror.
 * The root path becomes "index.md"; any other path becomes "<path>.md"
 * without the leading slash.
 *
 * @param {string} urlPath - URL path of the page.
 * @returns {string} Relative mirror file path.
 */
function urlPathToMirrorRelative(urlPath) {
  const cleanPath = normalizeUrlPath(urlPath);
  return cleanPath === "/" ? "index.md" : `${cleanPath.slice(1)}.md`;
}
1244
/**
 * Derive the URL path served by a static HTML file.
 *
 * `index.html` at the root maps to "/", `<dir>/index.html` maps to `/<dir>`,
 * `<name>.html` maps to `/<name>`, and anything else maps to its normalized
 * relative path.
 *
 * @param {string} filePath - Path of the HTML file.
 * @param {string} rootDir - Directory the URL space is rooted at.
 * @returns {string} Normalized URL path.
 */
function staticHtmlFileToUrl(filePath, rootDir) {
  // Backslashes are converted so Windows-style relative paths yield URL separators.
  const relPosix = path4.relative(rootDir, filePath).replace(/\\/g, "/");
  if (relPosix === "index.html") {
    return "/";
  }
  // Order matters: "/index.html" must be tried before the shorter ".html".
  for (const suffix of ["/index.html", ".html"]) {
    if (relPosix.endsWith(suffix)) {
      return normalizeUrlPath(relPosix.slice(0, -suffix.length));
    }
  }
  return normalizeUrlPath(relPosix);
}
1257
/**
 * Count the non-empty segments of a URL path ("/" has depth 0).
 *
 * @param {string} urlPath - URL path to measure.
 * @returns {number} Number of path segments.
 */
function getUrlDepth(urlPath) {
  if (urlPath === "/") return 0;
  const segments = normalizeUrlPath(urlPath).split("/").filter(Boolean);
  return segments.length;
}
1263
/**
 * Turn a URL path into readable words: hyphens and underscores become
 * spaces and segments are joined with " / ". The root path yields "".
 *
 * @param {string} urlPath - URL path to humanize.
 * @returns {string} Human-readable rendering of the path.
 */
function humanizeUrlPath(urlPath) {
  const cleanPath = normalizeUrlPath(urlPath);
  if (cleanPath === "/") {
    return "";
  }
  const words = cleanPath
    .slice(1)
    .split("/")
    .map((segment) => segment.replace(/[-_]/g, " "));
  return words.join(" / ");
}
1268
/**
 * Prefix `value` with "/" unless it already starts with one.
 *
 * @param {string} value - Path fragment.
 * @returns {string} The fragment with a leading slash.
 */
function ensureLeadingSlash(value) {
  if (value.startsWith("/")) {
    return value;
  }
  return `/${value}`;
}
1271
/**
 * Join a base URL and a route with exactly one slash between them.
 * At most one trailing slash is removed from the base.
 *
 * @param {string} baseUrl - Origin or base URL.
 * @param {string} route - Route, with or without a leading slash.
 * @returns {string} The combined URL.
 */
function joinUrl(baseUrl, route) {
  const trimmedBase = baseUrl.replace(/\/$/, "");
  const rootedRoute = ensureLeadingSlash(route);
  return `${trimmedBase}${rootedRoute}`;
}
1276
+
1277
+ // src/indexing/chunker.ts
1278
// Matches a (trimmed) line that opens or closes a fenced code block.
var FENCE_LINE_RE = /^(```|~~~)/;

/**
 * Split markdown into sections, starting a new section at every ATX heading
 * (`#` to `######`) that is not inside a fenced code block.
 *
 * Each section records its own heading as `sectionTitle`, the chain of
 * enclosing headings (truncated to `headingPathDepth`) as `headingPath`, and
 * its trimmed text including the heading line. Sections whose text normalizes
 * to empty are dropped; if that leaves nothing but the document is not empty,
 * the whole document is returned as one untitled section.
 *
 * @param {string} markdown - Markdown document.
 * @param {number} headingPathDepth - Maximum number of heading-path entries kept.
 * @returns {{sectionTitle: (string|undefined), headingPath: string[], text: string}[]}
 */
function parseHeadingSections(markdown, headingPathDepth) {
  const sections = [];
  const headingStack = [];
  let insideFence = false;
  let pending = { sectionTitle: void 0, headingPath: [], text: "" };

  const commitPending = () => {
    if (!normalizeText(pending.text)) return;
    const { sectionTitle, headingPath, text } = pending;
    sections.push({ sectionTitle, headingPath, text: text.trim() });
  };

  for (const line of markdown.split("\n")) {
    if (FENCE_LINE_RE.test(line.trim())) {
      insideFence = !insideFence;
    }
    const heading = insideFence ? null : line.match(/^(#{1,6})\s+(.+)$/);
    if (!heading) {
      pending.text += `${line}\n`;
      continue;
    }
    commitPending();
    const level = (heading[1] ?? "#").length;
    const title = (heading[2] ?? "").trim();
    // Record this heading at its level and discard any deeper headings.
    headingStack[level - 1] = title;
    headingStack.length = level;
    pending = {
      sectionTitle: title,
      // Skipped levels leave holes in the stack; keep only the real entries.
      headingPath: headingStack.filter((entry) => Boolean(entry)).slice(0, headingPathDepth),
      text: `${line}\n`
    };
  }
  commitPending();

  if (sections.length === 0 && normalizeText(markdown)) {
    sections.push({ sectionTitle: void 0, headingPath: [], text: markdown.trim() });
  }
  return sections;
}
1330
/**
 * Split text into blocks separated by blank lines, keeping fenced code
 * together and, when listed in `config.dontSplitInside`, keeping runs of
 * table lines ("table") and blockquote lines ("blockquote") together.
 *
 * @param {string} text - Section text.
 * @param {{dontSplitInside: string[]}} config - Chunking options.
 * @returns {string[]} Trimmed, non-empty blocks in document order.
 */
function blockify(text, config) {
  const lines = text.split("\n");
  const blocks = [];
  let buffer = [];
  let insideFence = false;

  const emit = () => {
    const joined = buffer.join("\n").trim();
    if (joined) {
      blocks.push(joined);
    }
    buffer = [];
  };

  // Append the lines following `from` while they are non-empty and match
  // `startsRun`; returns the index of the last line consumed.
  const absorbRun = (from, startsRun) => {
    let cursor = from;
    while (cursor + 1 < lines.length) {
      const upcoming = lines[cursor + 1];
      if (!upcoming || !startsRun.test(upcoming.trim())) {
        break;
      }
      cursor += 1;
      buffer.push(lines[cursor] ?? "");
    }
    return cursor;
  };

  for (let idx = 0; idx < lines.length; idx += 1) {
    const line = lines[idx] ?? "";
    const stripped = line.trim();

    const isFenceLine = FENCE_LINE_RE.test(stripped);
    if (isFenceLine) {
      insideFence = !insideFence;
    }
    if (isFenceLine || insideFence) {
      // Fence delimiters and everything between them stay in the current block.
      buffer.push(line);
      continue;
    }

    const looksLikeTable = /^\|.*\|$/.test(stripped) || /^\|?\s*:?-+:?\s*\|/.test(stripped);
    if (looksLikeTable && config.dontSplitInside.includes("table")) {
      buffer.push(line);
      idx = absorbRun(idx, /^\|/);
      continue;
    }

    if (/^>/.test(stripped) && config.dontSplitInside.includes("blockquote")) {
      buffer.push(line);
      idx = absorbRun(idx, /^>/);
      continue;
    }

    if (stripped) {
      buffer.push(line);
    } else {
      emit();
    }
  }
  emit();
  return blocks;
}
1389
/**
 * Decide whether a block must not be split further.
 *
 * A block is protected when it is a fenced code block, a table, or a
 * blockquote and the matching kind ("code", "table", "blockquote") appears in
 * `config.dontSplitInside`.
 *
 * @param {string} block - Block text.
 * @param {{dontSplitInside: string[]}} config - Chunking options.
 * @returns {boolean} True when the block should be kept whole.
 */
function isProtectedBlock(block, config) {
  const rows = block.trim().split("\n").map((row) => row.trim());
  const allowed = (kind) => config.dontSplitInside.includes(kind);

  const firstRow = rows[0] ?? "";
  const lastRow = rows[rows.length - 1] ?? "";
  const fenced = FENCE_LINE_RE.test(firstRow) && FENCE_LINE_RE.test(lastRow);
  if (fenced && allowed("code")) {
    return true;
  }

  const isTableRow = (row) =>
    row.length === 0 || /^\|.*\|$/.test(row) || /^\|?\s*:?-+:?\s*\|/.test(row);
  if (rows.every(isTableRow) && allowed("table")) {
    return true;
  }

  const isQuoteRow = (row) => row.length === 0 || row.startsWith(">");
  return rows.every(isQuoteRow) && allowed("blockquote");
}
1410
/**
 * Cut a block that exceeds `config.maxChars` into overlapping pieces.
 *
 * Blocks that already fit, or that are protected (see isProtectedBlock), are
 * returned whole. Otherwise each piece is at most `maxChars` long, preferring
 * to end at a space that lies beyond 60% of the window, and the next piece
 * starts `overlapChars` before the previous end.
 *
 * @param {string} block - Block text.
 * @param {{maxChars: number, overlapChars: number, dontSplitInside: string[]}} config
 * @returns {string[]} Trimmed, non-empty pieces; never an empty array.
 */
function splitOversizedBlock(block, config) {
  const body = block.trim();
  if (body.length <= config.maxChars || isProtectedBlock(body, config)) {
    return [body];
  }
  const pieces = [];
  const minBreakOffset = Math.floor(config.maxChars * 0.6);
  let cursor = 0;
  while (cursor < body.length) {
    let stop = Math.min(cursor + config.maxChars, body.length);
    if (stop < body.length) {
      const lastSpace = body.lastIndexOf(" ", stop);
      if (lastSpace > cursor + minBreakOffset) {
        stop = lastSpace;
      }
    }
    const piece = body.slice(cursor, stop).trim();
    if (piece) {
      pieces.push(piece);
    }
    if (stop >= body.length) {
      break;
    }
    // Step back for overlap, but never to a position that would not advance.
    const rewound = Math.max(0, stop - config.overlapChars);
    cursor = rewound > cursor ? rewound : stop;
  }
  return pieces.length > 0 ? pieces : [body];
}
1437
/**
 * Break one heading section into chunks of at most `config.maxChars`.
 *
 * A section that already fits is returned as a single chunk. Otherwise its
 * blocks are packed greedily (joined by a blank line); when a chunk is
 * closed, the last `overlapChars` characters are carried into the next one
 * if the result still fits. Finally, chunks shorter than `minChars` are
 * merged into their predecessor when the combined length stays within
 * `maxChars`.
 *
 * @param {{sectionTitle: (string|undefined), headingPath: string[], text: string}} section
 * @param {{maxChars: number, minChars: number, overlapChars: number, dontSplitInside: string[]}} config
 * @returns {{sectionTitle: (string|undefined), headingPath: string[], chunkText: string}[]}
 */
function splitSection(section, config) {
  const text = section.text.trim();
  if (!text) {
    return [];
  }
  const { sectionTitle, headingPath } = section;
  const toChunk = (chunkText) => ({ sectionTitle, headingPath, chunkText });
  if (text.length <= config.maxChars) {
    return [toChunk(text)];
  }

  const SEPARATOR = "\n\n";
  const pieces = blockify(text, config).flatMap((block) => splitOversizedBlock(block, config));

  // Pass 1: greedy packing with overlap carried across chunk boundaries.
  const packed = [];
  let accumulator = "";
  for (const piece of pieces) {
    if (!accumulator) {
      accumulator = piece;
      continue;
    }
    const joined = `${accumulator}${SEPARATOR}${piece}`;
    if (joined.length <= config.maxChars) {
      accumulator = joined;
      continue;
    }
    packed.push(accumulator);
    const tail = accumulator.slice(Math.max(0, accumulator.length - config.overlapChars)).trim();
    const carried = tail ? `${tail}${SEPARATOR}${piece}` : piece;
    accumulator = carried.length <= config.maxChars ? carried : piece;
  }
  if (accumulator.trim()) {
    packed.push(accumulator.trim());
  }

  // Pass 2: fold undersized chunks into the chunk before them when they fit.
  const merged = [];
  for (const chunk of packed) {
    const lastIndex = merged.length - 1;
    const previous = merged[lastIndex];
    const fitsInPrevious =
      lastIndex >= 0 &&
      chunk.length < config.minChars &&
      previous !== void 0 &&
      previous.length + 2 + chunk.length <= config.maxChars;
    if (fitsInPrevious) {
      merged[lastIndex] = `${previous}${SEPARATOR}${chunk}`;
    } else {
      merged.push(chunk);
    }
  }
  return merged.map((chunkText) => toChunk(chunkText));
}
1500
/**
 * Compose the text of a page's summary chunk: the title, the humanized URL
 * path, a body (the page description, or the first paragraph of the markdown
 * when the description is null/undefined) and the comma-joined keywords,
 * separated by blank lines. Empty optional parts are omitted.
 *
 * @param {object} page - Page with title, url, markdown and optional
 *   description / keywords.
 * @returns {string} Summary text.
 */
function buildSummaryChunkText(page) {
  const humanized = humanizeUrlPath(page.url);
  const body = page.description ?? extractFirstParagraph(page.markdown);
  const hasKeywords = page.keywords && page.keywords.length > 0;
  return [
    page.title,
    ...(humanized ? [humanized] : []),
    ...(body ? [body] : []),
    ...(hasKeywords ? [page.keywords.join(", ")] : [])
  ].join("\n\n");
}
1511
/**
 * Build the text that is embedded for a chunk.
 *
 * With `prependTitle`, the chunk text is preceded by the page title (plus
 * " — <section title>" when the chunk has one) and a blank line; otherwise
 * the chunk text is returned as is.
 *
 * @param {{title: string, sectionTitle?: string, chunkText: string}} chunk
 * @param {boolean} prependTitle - Whether to prefix the title line.
 * @returns {string} Text to embed.
 */
function buildEmbeddingText(chunk, prependTitle) {
  if (!prependTitle) {
    return chunk.chunkText;
  }
  let heading;
  if (chunk.sectionTitle) {
    heading = `${chunk.title} \u2014 ${chunk.sectionTitle}`;
  } else {
    heading = chunk.title;
  }
  return `${heading}\n\n${chunk.chunkText}`;
}
1518
/**
 * Turn a mirrored page into the list of chunks to index.
 *
 * When `config.chunking.pageSummaryChunk` is set, a summary chunk (ordinal 0)
 * comes first; section chunks follow in document order. Chunk keys are SHA-1
 * hashes of scope name, page URL and position (plus the normalized section
 * title); content hashes are SHA-256 of the normalized embedding text.
 *
 * @param {object} page - Mirrored page (url, title, markdown, depth, ...).
 * @param {object} config - Resolved configuration (reads `config.chunking`).
 * @param {{scopeName: string}} scope - Scope the page belongs to.
 * @returns {object[]} Chunk records.
 */
function chunkMirrorPage(page, config, scope) {
  const { chunking } = config;
  const rawChunks = parseHeadingSections(page.markdown, chunking.headingPathDepth).flatMap(
    (section) => splitSection(section, chunking)
  );

  // Assemble one chunk record and stamp it with its content hash.
  const buildChunk = (chunkKey, ordinal, sectionTitle, headingPath, chunkText) => {
    const chunk = {
      chunkKey,
      ordinal,
      url: page.url,
      path: page.url,
      title: page.title,
      sectionTitle,
      headingPath,
      chunkText,
      snippet: toSnippet(chunkText),
      depth: page.depth,
      incomingLinks: page.incomingLinks,
      routeFile: page.routeFile,
      tags: page.tags,
      contentHash: ""
    };
    const embeddingText = buildEmbeddingText(chunk, chunking.prependTitle);
    chunk.contentHash = sha256(normalizeText(embeddingText));
    return chunk;
  };

  const chunks = [];
  if (chunking.pageSummaryChunk) {
    const summaryText = buildSummaryChunkText(page);
    const summaryKey = sha1(`${scope.scopeName}|${page.url}|__summary__`);
    chunks.push(buildChunk(summaryKey, 0, void 0, [], summaryText));
  }

  // Section ordinals are shifted by one when the summary chunk occupies ordinal 0.
  const ordinalOffset = chunking.pageSummaryChunk ? 1 : 0;
  rawChunks.forEach((entry, index) => {
    const normalizedSection = normalizeText(entry.sectionTitle ?? "").toLowerCase();
    const chunkKey = sha1(
      `${scope.scopeName}|${page.url}|${index}|${normalizedSection}`
    );
    chunks.push(
      buildChunk(chunkKey, index + ordinalOffset, entry.sectionTitle, entry.headingPath, entry.chunkText)
    );
  });
  return chunks;
}
1574
+
1575
+ // src/indexing/extractor.ts
1576
+ import { load } from "cheerio";
1577
+ import matter from "gray-matter";
1578
+ import TurndownService from "turndown";
1579
+ import { gfm, highlightedCodeBlock, strikethrough, tables, taskListItems } from "turndown-plugin-gfm";
1580
/**
 * Report whether the markdown contains a `<!-- noindex -->` comment outside
 * of fenced code blocks.
 *
 * Fences are tracked by delimiter: a block opened with N backticks (or
 * tildes) is only closed by a line consisting of at least N of the same
 * character. Lines such as `~~~` inside a ``` block, or ``` inside a ````
 * block, are therefore treated as code content rather than as fence toggles,
 * which previously caused top-level noindex comments to be missed and
 * comments inside code samples to be honoured.
 *
 * @param {string} markdown - Markdown source (LF or CRLF line endings).
 * @returns {boolean} True when a noindex comment appears outside any fence.
 */
function hasTopLevelNoindexComment(markdown) {
  const noindexComment = /<!--\s*noindex\s*-->/i;
  const fenceLine = /^(`{3,}|~{3,})(.*)$/;
  // While inside a fenced block: the delimiter character and run length that opened it.
  let openFence = null;
  for (const line of markdown.split(/\r?\n/)) {
    const fence = fenceLine.exec(line.trim());
    if (fence) {
      const marker = fence[1];
      if (openFence === null) {
        openFence = { char: marker[0], length: marker.length };
      } else if (
        marker[0] === openFence.char &&
        marker.length >= openFence.length &&
        fence[2].trim() === ""
      ) {
        openFence = null;
      }
      // Fence delimiters (and fence-like lines inside a block) are never scanned for the comment.
      continue;
    }
    if (openFence === null && noindexComment.test(line)) {
      return true;
    }
  }
  return false;
}
1595
/**
 * Extract indexable content from a rendered HTML page.
 *
 * Returns null when the page opts out of indexing (robots meta with
 * `noindex` when `respectRobotsNoindex` is on, or any element carrying the
 * configured noindex attribute) or when the converted markdown is empty.
 * Otherwise the main content (or `<body>` when the main selector matches
 * nothing) is stripped of dropped tags/selectors and ignored elements,
 * its links are collected, and it is converted to markdown.
 *
 * @param {string} url - URL path of the page.
 * @param {string} html - Full HTML document.
 * @param {object} config - Resolved configuration (reads `extract` and `transform`).
 * @returns {object|null} Extracted page or null when it must not be indexed.
 */
function extractFromHtml(url, html, config) {
  const $ = load(html);
  const { extract, transform } = config;
  const pagePath = normalizeUrlPath(url);
  // Placeholder origin so relative hrefs can be resolved with the URL API.
  const pageBaseUrl = new URL(`https://searchsocket.local${pagePath}`);

  const titleFromTag = normalizeText($("title").first().text() || "");
  const title =
    titleFromTag ||
    normalizeText($(`${extract.mainSelector} h1`).first().text() || "") ||
    pagePath;

  if (extract.respectRobotsNoindex) {
    const robots = $("meta[name='robots']").attr("content") ?? "";
    if (/\bnoindex\b/i.test(robots)) {
      return null;
    }
  }
  if ($(`[${extract.noindexAttr}]`).length > 0) {
    return null;
  }

  const metaContent = (selector) => $(selector).attr("content")?.trim();
  const description =
    metaContent("meta[name='description']") ||
    metaContent("meta[property='og:description']") ||
    void 0;
  const keywordsRaw = metaContent("meta[name='keywords']");
  const keywords = keywordsRaw
    ? keywordsRaw.split(",").map((keyword) => keyword.trim()).filter(Boolean)
    : void 0;

  // Work on a clone so removals do not affect the parsed document.
  const mainElement = $(extract.mainSelector).first();
  const root = (mainElement.length ? mainElement : $("body").first()).clone();
  const removals = [...extract.dropTags, ...extract.dropSelectors, `[${extract.ignoreAttr}]`];
  for (const selector of removals) {
    root.find(selector).remove();
  }

  // NOTE(review): only the pathname is kept and no origin check is made, so
  // absolute links to other hosts are recorded by their path too — confirm intended.
  const linkedPaths = new Set();
  root.find("a[href]").each((_index, node) => {
    const href = $(node).attr("href");
    const isSkippable =
      !href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:");
    if (isSkippable) {
      return;
    }
    try {
      const resolved = new URL(href, pageBaseUrl);
      if (["http:", "https:"].includes(resolved.protocol)) {
        linkedPaths.add(normalizeUrlPath(resolved.pathname));
      }
    } catch {
      // Unparseable hrefs are ignored.
    }
  });

  const turndown = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced"
  });
  const keepCode = transform.preserveCodeBlocks;
  const keepTables = transform.preserveTables;
  const plugins =
    keepCode && keepTables
      ? [gfm]
      : [
          strikethrough,
          taskListItems,
          ...(keepTables ? [tables] : []),
          ...(keepCode ? [highlightedCodeBlock] : [])
        ];
  for (const plugin of plugins) {
    turndown.use(plugin);
  }

  const markdown = normalizeMarkdown(turndown.turndown(root.html() ?? ""));
  if (!normalizeText(markdown)) {
    return null;
  }

  return {
    url: pagePath,
    title,
    markdown,
    outgoingLinks: [...linkedPaths],
    noindex: false,
    // The first path segment doubles as the page's only tag.
    tags: pagePath.split("/").filter(Boolean).slice(0, 1),
    description,
    keywords
  };
}
1667
/**
 * Extract indexable content from a markdown source file.
 *
 * Returns null when the file opts out of indexing (top-level noindex
 * comment, `noindex: true` or `searchsocket.noindex: true` in frontmatter)
 * or when its body is empty after normalization. The title falls back from
 * the `title` argument to the frontmatter title to the normalized URL.
 *
 * @param {string} url - URL path of the page.
 * @param {string} markdown - Markdown source including optional frontmatter.
 * @param {string} [title] - Explicit title, taking precedence over frontmatter.
 * @returns {object|null} Extracted page or null when it must not be indexed.
 */
function extractFromMarkdown(url, markdown, title) {
  if (hasTopLevelNoindexComment(markdown)) {
    return null;
  }
  const { data: frontmatter, content } = matter(markdown);
  if (frontmatter.noindex === true || frontmatter.searchsocket?.noindex === true) {
    return null;
  }
  const body = normalizeMarkdown(content);
  if (!normalizeText(body)) {
    return null;
  }

  // Accept keywords as a list of strings or as a comma-separated string.
  const readKeywords = (raw) => {
    let list;
    if (Array.isArray(raw)) {
      list = raw
        .filter((keyword) => typeof keyword === "string" && keyword.trim().length > 0)
        .map((keyword) => keyword.trim());
    } else if (typeof raw === "string" && raw.trim()) {
      list = raw.split(",").map((keyword) => keyword.trim()).filter(Boolean);
    }
    return list && list.length === 0 ? void 0 : list;
  };

  const frontmatterTitle = typeof frontmatter.title === "string" ? frontmatter.title : void 0;
  const pagePath = normalizeUrlPath(url);
  const rawDescription = frontmatter.description;
  const description =
    typeof rawDescription === "string" ? rawDescription.trim() || void 0 : void 0;

  return {
    url: pagePath,
    title: title ?? frontmatterTitle ?? pagePath,
    markdown: body,
    outgoingLinks: [],
    noindex: false,
    tags: pagePath.split("/").filter(Boolean).slice(0, 1),
    description,
    keywords: readKeywords(frontmatter.keywords)
  };
}
1702
+
1703
+ // src/indexing/mirror.ts
1704
+ import fs4 from "fs/promises";
1705
+ import path5 from "path";
1706
/**
 * Encode a value for use as a YAML scalar by emitting it as JSON.
 *
 * @param {*} value - Value to encode.
 * @returns {string} JSON encoding of the value.
 */
function yamlString(value) {
  const encoded = JSON.stringify(value);
  return encoded;
}
1709
/**
 * Encode a list as a YAML flow sequence of JSON-encoded items.
 *
 * @param {Array} values - Items to encode.
 * @returns {string} Text such as `["a", "b"]`.
 */
function yamlArray(values) {
  const items = values.map((item) => JSON.stringify(item));
  return `[${items.join(", ")}]`;
}
1712
/**
 * Render a mirror file: a YAML frontmatter block describing the page,
 * followed by its normalized markdown.
 *
 * @param {object} page - Mirror page (url, title, scope, routeFile,
 *   routeResolution, generatedAt, incomingLinks, outgoingLinks, depth, tags,
 *   markdown).
 * @returns {string} Complete mirror file content.
 */
function buildMirrorMarkdown(page) {
  const quotedKeys = ["url", "title", "scope", "routeFile", "routeResolution", "generatedAt"];
  const rawKeys = ["incomingLinks", "outgoingLinks", "depth"];
  const header = [
    "---",
    ...quotedKeys.map((key) => `${key}: ${yamlString(page[key])}`),
    ...rawKeys.map((key) => `${key}: ${page[key]}`),
    `tags: ${yamlArray(page.tags)}`,
    "---",
    // The trailing empty entry makes the join end with a newline after the closing fence.
    ""
  ].join("\n");
  return `${header}${normalizeMarkdown(page.markdown)}`;
}
1730
/**
 * Blank out the first `generatedAt: ...` line so two mirror files can be
 * compared regardless of when they were generated.
 *
 * @param {string} content - Mirror file content.
 * @returns {string} Content with the first generatedAt line emptied.
 */
function stripGeneratedAt(content) {
  const generatedAtLine = /^generatedAt: .*$/m;
  return content.replace(generatedAtLine, "");
}
1733
/**
 * Write a page's markdown mirror under `<statePath>/pages/<scopeName>/`.
 *
 * The file is left untouched when its existing content differs from the new
 * content only in the `generatedAt` line. A missing or unreadable existing
 * file simply results in a write.
 *
 * @param {string} statePath - Root of the state directory.
 * @param {{scopeName: string}} scope - Scope the page belongs to.
 * @param {object} page - Mirror page to render.
 * @returns {Promise<string>} Path of the mirror file.
 */
async function writeMirrorPage(statePath, scope, page) {
  const mirrorName = urlPathToMirrorRelative(page.url);
  const outputPath = path5.join(statePath, "pages", scope.scopeName, mirrorName);
  await fs4.mkdir(path5.dirname(outputPath), { recursive: true });
  const rendered = buildMirrorMarkdown(page);
  let unchanged = false;
  try {
    const onDisk = await fs4.readFile(outputPath, "utf8");
    unchanged = stripGeneratedAt(onDisk) === stripGeneratedAt(rendered);
  } catch {
    // No readable previous version: fall through and write the file.
  }
  if (!unchanged) {
    await fs4.writeFile(outputPath, rendered, "utf8");
  }
  return outputPath;
}
1748
/**
 * Reset a scope's mirror directory: delete `<statePath>/pages/<scopeName>`
 * recursively (ignoring a missing directory) and recreate it empty.
 *
 * @param {string} statePath - Root of the state directory.
 * @param {{scopeName: string}} scope - Scope whose mirror is cleared.
 * @returns {Promise<void>}
 */
async function cleanMirrorForScope(statePath, scope) {
  const scopeMirrorDir = path5.join(statePath, "pages", scope.scopeName);
  const recursively = { recursive: true };
  await fs4.rm(scopeMirrorDir, { ...recursively, force: true });
  await fs4.mkdir(scopeMirrorDir, recursively);
}
1753
+
1754
+ // src/indexing/route-mapper.ts
1755
+ import path6 from "path";
1756
+ import fg from "fast-glob";
1757
/**
 * Translate one route directory segment into a regex fragment and a
 * specificity score.
 *
 * - `(group)` contributes nothing to the URL (score 0).
 * - `[[...rest]]` optional rest parameter (score -2).
 * - `[...rest]` rest parameter (score 0).
 * - `[[param]]` optional parameter (score -1).
 * - `[param]` required parameter (score 3).
 * - anything else is a literal segment, regex-escaped (score 10).
 *
 * @param {string} segment - Directory name from the route path.
 * @returns {{regex: string, score: number}}
 */
function segmentToRegex(segment) {
  if (segment.startsWith("(") && segment.endsWith(")")) {
    return { regex: "", score: 0 };
  }
  // Ordered from most to least bracketed so each shape is tested before its prefix.
  const dynamicShapes = [
    [/^\[\[\.\.\.[^\]]+\]\]$/, "(?:/.+)?", -2],
    [/^\[\.\.\.[^\]]+\]$/, "/.+", 0],
    [/^\[\[[^\]]+\]\]$/, "(?:/[^/]+)?", -1],
    [/^\[[^\]]+\]$/, "/[^/]+", 3]
  ];
  for (const [shape, regex, score] of dynamicShapes) {
    if (shape.test(segment)) {
      return { regex, score };
    }
  }
  const escaped = segment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return { regex: `/${escaped}`, score: 10 };
}
1775
/**
 * Build the URL-matching pattern for a route file.
 *
 * The file path is made relative to `cwd`, the `src/routes/` prefix and the
 * trailing `+page.<ext>` file name are removed, and every remaining
 * directory segment is translated with segmentToRegex. The resulting regex
 * is anchored and tolerates one trailing slash.
 *
 * The `+page.<ext>` file name is stripped whether or not a directory
 * precedes it. Previously the root route (`src/routes/+page.svelte`) kept
 * the file name as a literal segment, so its pattern never matched "/" and
 * it was scored as a literal route.
 *
 * @param {string} routeFile - Path of a `+page.*` file.
 * @param {string} cwd - Project root the path is made relative to.
 * @returns {{routeFile: string, regex: RegExp, score: number}} `routeFile`
 *   is the relative path with forward slashes.
 */
function routeFileToPattern(routeFile, cwd) {
  const relative = path6.relative(cwd, routeFile).replace(/\\/g, "/");
  const routeDir = relative
    .replace(/^src\/routes\/?/, "")
    // `(?:^|\/)` also covers the root route, where nothing precedes "+page".
    .replace(/(?:^|\/)\+page\.[^/]+$/, "");
  const segments = routeDir.split("/").filter(Boolean);
  let regex = "^";
  let score = 0;
  if (segments.length === 0) {
    regex += "/";
  } else {
    for (const segment of segments) {
      const converted = segmentToRegex(segment);
      regex += converted.regex;
      score += converted.score;
    }
  }
  regex += "/?$";
  return {
    routeFile: relative,
    regex: new RegExp(regex),
    score
  };
}
1798
/**
 * Discover every `src/routes/**\/+page.svelte` file under `cwd` and build its
 * URL pattern, ordered by descending score and then by descending route file
 * path length.
 *
 * @param {string} cwd - Project root.
 * @returns {Promise<{routeFile: string, regex: RegExp, score: number}[]>}
 */
async function buildRoutePatterns(cwd) {
  const pageFiles = await fg("src/routes/**/+page.svelte", {
    cwd,
    absolute: true
  });
  const patterns = pageFiles.map((file) => routeFileToPattern(file, cwd));
  patterns.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) {
      return byScore;
    }
    return right.routeFile.length - left.routeFile.length;
  });
  return patterns;
}
1805
/**
 * Resolve a URL path to the route file that renders it.
 *
 * The first pattern whose regex matches wins ("exact"). Without a match the
 * root route is used if present, otherwise the first pattern, otherwise the
 * conventional root route path (all "best-effort").
 *
 * @param {string} urlPath - URL path to resolve.
 * @param {{routeFile: string, regex: RegExp}[]} patterns - Patterns in priority order.
 * @returns {{routeFile: string, routeResolution: string}}
 */
function mapUrlToRoute(urlPath, patterns) {
  const target = normalizeUrlPath(urlPath);
  const exact = patterns.find((candidate) => candidate.regex.test(target));
  if (exact) {
    return { routeFile: exact.routeFile, routeResolution: "exact" };
  }
  const rootRoute = patterns.find((candidate) => candidate.routeFile === "src/routes/+page.svelte");
  const routeFile = rootRoute
    ? rootRoute.routeFile
    : patterns[0]?.routeFile ?? "src/routes/+page.svelte";
  return { routeFile, routeResolution: "best-effort" };
}
1828
+
1829
+ // src/indexing/sources/build/index.ts
1830
+ import pLimit2 from "p-limit";
1831
+
1832
+ // src/indexing/sources/build/manifest-parser.ts
1833
+ import fs5 from "fs/promises";
1834
+ import path7 from "path";
1835
/**
 * Map a manifest route id to its `+page.svelte` file under `src/routes`.
 *
 * @param {string} routeId - Route id such as "/" or "/blog/[slug]".
 * @returns {string} Relative path of the page component.
 */
function routeIdToFile(routeId) {
  const routeDir = routeId === "/" ? "" : routeId;
  return `src/routes${routeDir}/+page.svelte`;
}
1841
/**
 * Convert a manifest route id to its URL by dropping `(group)` segments.
 * A route made only of groups maps to "/".
 *
 * @param {string} routeId - Route id such as "/(app)/about".
 * @returns {string} URL path (dynamic segments are left in place).
 */
function routeIdToUrl(routeId) {
  if (routeId === "/") {
    return "/";
  }
  const isGroup = (segment) => segment.startsWith("(") && segment.endsWith(")");
  const visibleSegments = routeId.split("/").filter((segment) => !isGroup(segment));
  return visibleSegments.join("/") || "/";
}
1845
/**
 * Read the SvelteKit build manifest and list its page routes.
 *
 * The manifest text is scanned for `id: "..."` entries; the text between one
 * id and the next is treated as that route's block, and a route counts as a
 * page when its block contains `page: {`. A route is dynamic when its id
 * contains "[".
 *
 * @param {string} cwd - Project root.
 * @param {string} outputDir - Build output directory, relative to `cwd`.
 * @returns {Promise<{id: string, isPage: boolean, isDynamic: boolean, routeFile: string}[]>}
 * @throws {SearchSocketError} BUILD_MANIFEST_NOT_FOUND when the manifest cannot be read.
 */
async function parseManifest(cwd, outputDir) {
  const manifestPath = path7.resolve(cwd, outputDir, "server", "manifest-full.js");
  let content;
  try {
    content = await fs5.readFile(manifestPath, "utf8");
  } catch {
    throw new SearchSocketError(
      "BUILD_MANIFEST_NOT_FOUND",
      `SvelteKit build manifest not found at ${manifestPath}. Run \`vite build\` first.`
    );
  }
  const idMarkers = [...content.matchAll(/id:\s*"([^"]+)"/g)];
  const routes = [];
  idMarkers.forEach((marker, position) => {
    const blockEnd = idMarkers[position + 1]?.index ?? content.length;
    const block = content.slice(marker.index, blockEnd);
    if (!/page:\s*\{/.test(block)) {
      return;
    }
    const id = marker[1];
    routes.push({
      id,
      isPage: true,
      isDynamic: id.includes("["),
      routeFile: routeIdToFile(id)
    });
  });
  return routes;
}
1879
/**
 * Turn manifest routes into concrete URLs to crawl.
 *
 * Static routes map to one URL. Dynamic routes are expanded once per value
 * found in `paramValues` (looked up by route id, then by URL); dynamic routes
 * without values are skipped with a warning. Excluded URLs are dropped both
 * before and after expansion.
 *
 * @param {{id: string, isDynamic: boolean, routeFile: string}[]} routes
 * @param {Object<string, string[]>} paramValues - Values per dynamic route.
 * @param {string[]} exclude - Exclusion patterns (see isExcluded).
 * @param {{warn: Function}} logger3 - Logger used for skipped routes.
 * @returns {{url: string, routeFile: string}[]}
 */
function expandRoutes(routes, paramValues, exclude, logger3) {
  const expanded = [];
  for (const { id, isDynamic, routeFile } of routes) {
    const url = routeIdToUrl(id);
    if (isExcluded(url, exclude)) {
      continue;
    }
    if (!isDynamic) {
      expanded.push({ url, routeFile });
      continue;
    }
    const values = paramValues[id] ?? paramValues[url];
    const hasValues = Boolean(values) && values.length !== 0;
    if (!hasValues) {
      logger3.warn(
        `Skipping dynamic route ${id}: no paramValues provided. Add paramValues["${id}"] or paramValues["${url}"] to your build config.`
      );
      continue;
    }
    for (const value of values) {
      const concreteUrl = expandDynamicUrl(url, value);
      if (isExcluded(concreteUrl, exclude)) {
        continue;
      }
      expanded.push({ url: concreteUrl, routeFile });
    }
  }
  return expanded;
}
1904
/**
 * Substitute `value` for every dynamic segment (`[param]`, `[[param]]`,
 * `[...rest]`, `[[...rest]]`) in a route URL.
 *
 * The value is inserted through a replacer function so it is used verbatim.
 * When passed as a replacement string, sequences such as `$&`, `$1` or `$$`
 * inside the value were interpreted as replacement patterns and corrupted
 * the URL.
 *
 * @param {string} url - Route URL containing dynamic segments.
 * @param {string} value - Text that replaces each dynamic segment.
 * @returns {string} URL with all dynamic segments replaced.
 */
function expandDynamicUrl(url, value) {
  const dynamicSegment = /\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g;
  return url.replace(dynamicSegment, () => value);
}
1907
/**
 * Test a URL against exclusion patterns.
 *
 * A pattern ending in "/*" excludes the directory itself and everything
 * below it; any other pattern must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Exclusion patterns.
 * @returns {boolean} True when some pattern excludes the URL.
 */
function isExcluded(url, patterns) {
  for (const pattern of patterns) {
    if (!pattern.endsWith("/*")) {
      if (url === pattern) {
        return true;
      }
      continue;
    }
    const directoryPrefix = pattern.slice(0, -1); // "/admin/*" -> "/admin/"
    const directoryItself = pattern.slice(0, -2); // "/admin/*" -> "/admin"
    if (url === directoryItself || url.startsWith(directoryPrefix)) {
      return true;
    }
  }
  return false;
}
1918
+
1919
+ // src/indexing/sources/build/preview-server.ts
1920
+ import net from "net";
1921
+ import path8 from "path";
1922
+ import fs6 from "fs";
1923
+ import { spawn } from "child_process";
1924
/**
 * Ask the operating system for a free TCP port on 127.0.0.1 by binding a
 * throwaway server to port 0, reading the assigned port and closing it.
 *
 * @returns {Promise<number>} The port number; rejects when the server errors
 *   or does not report a usable address.
 */
function findFreePort() {
  return new Promise((resolve, reject) => {
    const probe = net.createServer();
    probe.on("error", reject);
    probe.listen(0, "127.0.0.1", () => {
      const address = probe.address();
      const hasPort = address && typeof address !== "string";
      if (!hasPort) {
        probe.close(() => reject(new Error("Failed to get port")));
        return;
      }
      const { port } = address;
      probe.close(() => resolve(port));
    });
  });
}
1939
/**
 * Poll `url` until the preview server answers with a status below 500.
 *
 * Each attempt is limited to 2 seconds and attempts are spaced 250 ms apart;
 * request failures are treated as "not ready yet".
 *
 * @param {string} url - URL to probe.
 * @param {number} timeout - Overall budget in milliseconds.
 * @param {{exitCode: (number|null)}} child - Preview server process.
 * @returns {Promise<void>} Resolves once the server responds.
 * @throws {SearchSocketError} BUILD_SERVER_FAILED when the process exits
 *   first or the budget runs out.
 */
async function waitForReady(url, timeout, child) {
  const deadline = Date.now() + timeout;
  const pause = (milliseconds) => new Promise((resolve) => setTimeout(resolve, milliseconds));
  while (Date.now() < deadline) {
    if (child.exitCode !== null) {
      throw new SearchSocketError(
        "BUILD_SERVER_FAILED",
        `vite preview exited with code ${child.exitCode} before becoming ready.`
      );
    }
    let ready = false;
    try {
      const response = await fetch(url, { signal: AbortSignal.timeout(2e3) });
      ready = response.status < 500;
    } catch {
      // Connection refused or timed out: the server is not up yet.
    }
    if (ready) {
      return;
    }
    await pause(250);
  }
  throw new SearchSocketError(
    "BUILD_SERVER_FAILED",
    `vite preview did not become ready within ${timeout}ms. Check that \`vite build\` completed successfully.`
  );
}
1960
/**
 * Launch `vite preview` from the project's local install on a free loopback
 * port and wait until it answers HTTP requests.
 *
 * @param {string} cwd - Project root; `node_modules/.bin/vite` is resolved from here.
 * @param {{ previewTimeout: number }} options - `previewTimeout` is the readiness
 *   timeout in milliseconds.
 * @param {{ event: Function, warn: Function }} logger3 - Logger used for progress
 *   events and for reporting captured stderr on failure.
 * @returns {Promise<{ baseUrl: string, port: number, shutdown: () => Promise<void> }>}
 *   Server handle; `shutdown` sends SIGTERM and escalates to SIGKILL after 3 s.
 * @throws {SearchSocketError} `BUILD_SERVER_FAILED` when the vite binary is missing
 *   or the server does not become ready.
 */
async function startPreviewServer(cwd, options, logger3) {
  const viteBin = path8.join(cwd, "node_modules", ".bin", "vite");
  if (!fs6.existsSync(viteBin)) {
    throw new SearchSocketError(
      "BUILD_SERVER_FAILED",
      `vite binary not found at ${viteBin}. Ensure vite is installed.`
    );
  }
  const port = await findFreePort();
  const baseUrl = `http://127.0.0.1:${port}`;
  logger3.event("preview_server_starting", { port });
  const child = spawn(viteBin, ["preview", "--port", String(port), "--strictPort", "--host", "127.0.0.1"], {
    cwd,
    stdio: ["ignore", "pipe", "pipe"],
    env: { ...process.env }
  });
  // stdout is piped but nothing here reads it; switch it to flowing mode so a
  // chatty server cannot stall once the pipe buffer fills up.
  child.stdout?.resume();
  // Only the first 500 characters of stderr are ever reported, so stop
  // accumulating once that much has been captured.
  const maxStderrChars = 500;
  let stderr = "";
  child.stderr?.on("data", (chunk) => {
    if (stderr.length < maxStderrChars) {
      stderr += chunk.toString();
    }
  });
  // Without an 'error' listener a failed spawn (e.g. EACCES) is raised as an
  // uncaught exception. Capture it and let the readiness check report the failure.
  let spawnError = null;
  child.on("error", (error) => {
    spawnError = error;
  });
  const hasExited = () => child.exitCode !== null || child.signalCode !== null;
  const shutdown = async () => {
    if (hasExited()) return;
    child.kill("SIGTERM");
    let killTimer;
    await Promise.race([
      new Promise((resolve) => child.once("close", () => resolve())),
      new Promise((resolve) => {
        killTimer = setTimeout(() => {
          if (!hasExited()) child.kill("SIGKILL");
          resolve();
        }, 3e3);
      })
    ]);
    // Clear the escalation timer so it neither fires after a clean exit nor
    // keeps the event loop alive for three extra seconds.
    clearTimeout(killTimer);
  };
  try {
    await waitForReady(baseUrl, options.previewTimeout, child);
  } catch (error) {
    await shutdown();
    if (spawnError) {
      logger3.warn(`vite preview failed to start: ${spawnError.message}`);
    }
    if (stderr) {
      logger3.warn(`vite preview stderr: ${stderr.slice(0, 500)}`);
    }
    throw error;
  }
  logger3.event("preview_server_ready", { port, baseUrl });
  return { baseUrl, port, shutdown };
}
2003
+
2004
+ // src/indexing/sources/build/index.ts
2005
// Logger shared by the build-based page source.
var logger = new Logger();
/**
 * Collect rendered HTML for every route of a built site by serving the build
 * with a temporary preview server and fetching each route from it.
 *
 * Routes whose fetch fails are logged and skipped; the preview server is shut
 * down whether or not fetching succeeds.
 *
 * @param {string} cwd - Project root.
 * @param {object} config - Resolved configuration; `config.source.build` is required.
 * @param {number} [maxPages] - Optional cap on the number of routes fetched.
 * @returns {Promise<object[]>} One page record per successfully fetched route.
 * @throws {Error} When `config.source.build` is missing.
 */
async function loadBuildPages(cwd, config, maxPages) {
  const buildConfig = config.source.build;
  if (!buildConfig) {
    throw new Error("build source config is missing");
  }
  const routes = await parseManifest(cwd, buildConfig.outputDir);
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
  logger.event("build_routes_discovered", {
    manifestRoutes: routes.length,
    expandedRoutes: expanded.length
  });
  let selected = expanded;
  if (typeof maxPages === "number") {
    selected = expanded.slice(0, Math.max(0, Math.floor(maxPages)));
  }
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
  try {
    const limit = pLimit2(8);
    const fetchRoute = async (route) => {
      const response = await fetch(joinUrl(server.baseUrl, route.url));
      if (!response.ok) {
        throw new Error(`Failed to fetch ${route.url}: ${response.status} ${response.statusText}`);
      }
      return {
        url: normalizeUrlPath(route.url),
        html: await response.text(),
        sourcePath: route.routeFile,
        outgoingLinks: [],
        routeFile: route.routeFile,
        routeResolution: "exact"
      };
    };
    const outcomes = await Promise.allSettled(
      selected.map((route) => limit(() => fetchRoute(route)))
    );
    const pages = [];
    outcomes.forEach((outcome, index) => {
      if (!outcome) return;
      if (outcome.status === "fulfilled") {
        pages.push(outcome.value);
        return;
      }
      const route = selected[index]?.url ?? "unknown";
      const reason = outcome.reason instanceof Error ? outcome.reason.message : String(outcome.reason);
      logger.warn(`Skipping build route ${route}: ${reason}`);
    });
    return pages;
  } finally {
    await server.shutdown();
  }
}
2059
+
2060
+ // src/indexing/sources/content-files.ts
2061
+ import fs7 from "fs/promises";
2062
+ import path9 from "path";
2063
+ import fg2 from "fast-glob";
2064
/**
 * Derive a site URL path from a content file's location.
 *
 * `+page.svelte` files are treated as route files: a leading `src/routes` or
 * `routes` prefix and any `(group)` segments are dropped, and dynamic segments
 * are replaced by the placeholders `optional`, `splat` and `param`.
 * All other files have a trailing `.md` extension and a trailing `index`
 * segment removed, so `index.md` at the base directory maps to the root URL.
 *
 * @param {string} filePath - Path of the content file.
 * @param {string} baseDir - Directory the URL is computed relative to.
 * @returns {string} Result of `normalizeUrlPath` applied to the derived path.
 */
function filePathToUrl(filePath, baseDir) {
  const relative = path9.relative(baseDir, filePath).replace(/\\/g, "/");
  if (/(^|\/)\+page\.svelte$/.test(relative)) {
    const routeSegments = relative.split("/").filter(Boolean);
    if ((routeSegments[0] ?? "").toLowerCase() === "src" && (routeSegments[1] ?? "").toLowerCase() === "routes") {
      routeSegments.splice(0, 2);
    } else if ((routeSegments[0] ?? "").toLowerCase() === "routes") {
      routeSegments.splice(0, 1);
    }
    const routePath = routeSegments
      .filter((segment) => segment !== "+page.svelte")
      .filter((segment) => segment && !segment.startsWith("("))
      .map(
        (segment) => segment
          .replace(/^\[\[[^\]]+\]\]$/, "optional")
          .replace(/^\[\.\.\.[^\]]+\]$/, "splat")
          .replace(/^\[[^\]]+\]$/, "param")
      )
      .join("/");
    return normalizeUrlPath(routePath || "/");
  }
  // Match `index` both after a slash and as the whole path; requiring the slash
  // left a root-level index.md mapped to "/index" instead of "/".
  const noExt = relative.replace(/\.md$/i, "").replace(/(^|\/)index$/i, "");
  return normalizeUrlPath(noExt || "/");
}
2082
/**
 * Reduce Svelte component source to plain text.
 *
 * Removes `<script>` and `<style>` blocks, replaces remaining tags and
 * `{...}` expressions with spaces, then collapses whitespace and trims.
 *
 * @param {string} source - Raw component source.
 * @returns {string} Single-line text content.
 */
function normalizeSvelteToMarkdown(source) {
  const withoutScripts = source.replace(/<script[\s\S]*?<\/script>/g, "");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/g, "");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const withoutExpressions = withoutTags.replace(/\{[^}]+\}/g, " ");
  return withoutExpressions.replace(/\s+/g, " ").trim();
}
2085
/**
 * Load pages straight from content files matched by the configured globs.
 *
 * Files ending in ".md" are used verbatim; every other file is run through
 * `normalizeSvelteToMarkdown` first. Files are read one at a time, in the order
 * returned by the glob.
 *
 * @param {string} cwd - Project root.
 * @param {object} config - Resolved configuration; `config.source.contentFiles` is required.
 * @param {number} [maxPages] - Optional cap on the number of files read.
 * @returns {Promise<object[]>} One page record per file.
 * @throws {Error} When `config.source.contentFiles` is missing.
 */
async function loadContentFilesPages(cwd, config, maxPages) {
  const contentConfig = config.source.contentFiles;
  if (!contentConfig) {
    throw new Error("content-files config is missing");
  }
  const baseDir = path9.resolve(cwd, contentConfig.baseDir);
  const matches = await fg2(contentConfig.globs, {
    cwd: baseDir,
    absolute: true,
    onlyFiles: true
  });
  let selected = matches;
  if (typeof maxPages === "number") {
    selected = matches.slice(0, Math.max(0, Math.floor(maxPages)));
  }
  const pages = [];
  for (const filePath of selected) {
    const raw = await fs7.readFile(filePath, "utf8");
    let markdown = raw;
    if (!filePath.endsWith(".md")) {
      markdown = normalizeSvelteToMarkdown(raw);
    }
    pages.push({
      url: filePathToUrl(filePath, baseDir),
      markdown,
      sourcePath: path9.relative(cwd, filePath).replace(/\\/g, "/"),
      outgoingLinks: []
    });
  }
  return pages;
}
2111
+
2112
+ // src/indexing/sources/crawl.ts
2113
+ import { gunzipSync } from "zlib";
2114
+ import { load as cheerioLoad } from "cheerio";
2115
+ import pLimit3 from "p-limit";
2116
// Logger shared by the crawl-based page source.
var logger2 = new Logger();
/**
 * Collect the text of every `<loc>` element in a sitemap document.
 *
 * @param {string} xml - Sitemap XML.
 * @returns {string[]} Trimmed, non-empty `<loc>` values in document order.
 */
function extractLocs(xml) {
  const document = cheerioLoad(xml, { xmlMode: true });
  const collected = [];
  document("loc").each((_index, node) => {
    const value = document(node).text().trim();
    if (!value) {
      return;
    }
    collected.push(value);
  });
  return collected;
}
2128
/**
 * Tell whether a sitemap document is a sitemap index, i.e. contains a
 * `<sitemapindex>` element.
 *
 * @param {string} xml - Sitemap XML.
 * @returns {boolean} True when at least one `<sitemapindex>` element is present.
 */
function isSitemapIndex(xml) {
  const document = cheerioLoad(xml, { xmlMode: true });
  const indexElements = document("sitemapindex");
  return indexElements.length > 0;
}
2132
/**
 * Download a sitemap and return its XML text.
 *
 * URLs ending in ".gz" are gunzipped when the payload is actually gzip data.
 *
 * @param {string} url - Absolute sitemap URL.
 * @returns {Promise<string>} The sitemap XML.
 * @throws {Error} When the response status is not OK.
 */
async function fetchSitemapXml(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch sitemap ${url}: ${res.status} ${res.statusText}`);
  }
  if (!url.endsWith(".gz")) {
    return res.text();
  }
  const buffer = Buffer.from(await res.arrayBuffer());
  // gzip streams start with the magic bytes 0x1f 0x8b. Some servers deliver
  // ".gz" sitemaps with `Content-Encoding: gzip`, in which case fetch has
  // already decoded the body and a second gunzip would throw.
  const looksGzipped = buffer.length >= 2 && buffer[0] === 0x1f && buffer[1] === 0x8b;
  return looksGzipped ? gunzipSync(buffer).toString("utf8") : buffer.toString("utf8");
}
2143
/**
 * Turn a sitemap reference into an absolute URL.
 *
 * @param {string} baseUrl - Site base URL used for relative references.
 * @param {string} candidate - Absolute `http(s)://` URL or a path relative to `baseUrl`.
 * @returns {string} `candidate` unchanged when it is already an http(s) URL,
 *   otherwise `joinUrl(baseUrl, candidate)`.
 */
function resolveSitemapUrl(baseUrl, candidate) {
  // Require the full scheme separator: a bare "http" prefix also matches
  // relative paths such as "http-archive/sitemap.xml".
  return /^https?:\/\//i.test(candidate) ? candidate : joinUrl(baseUrl, candidate);
}
2146
/**
 * Extract the unique route paths referenced by a sitemap document.
 *
 * For a sitemap index, every child sitemap is loaded in order through
 * `parseSitemapFromUrl` and the results are merged. For a regular sitemap, each
 * `<loc>` is resolved against `baseUrl`; entries that are not http(s) or cannot
 * be parsed are ignored.
 *
 * @param {string} xml - Sitemap XML.
 * @param {string} baseUrl - Base URL for relative `<loc>` values.
 * @param {Set<string>} visitedSitemaps - Sitemap URLs already processed, passed
 *   through to `parseSitemapFromUrl`.
 * @returns {Promise<string[]>} De-duplicated route paths in first-seen order.
 */
async function parseSitemap(xml, baseUrl, visitedSitemaps) {
  const isIndex = isSitemapIndex(xml);
  const locs = extractLocs(xml);
  const unique = new Set();
  if (isIndex) {
    for (const childUrl of locs) {
      const childRoutes = await parseSitemapFromUrl(childUrl, baseUrl, visitedSitemaps);
      for (const route of childRoutes) {
        unique.add(route);
      }
    }
    return Array.from(unique);
  }
  for (const loc of locs) {
    try {
      const isAbsolute = loc.startsWith("http://") || loc.startsWith("https://");
      const parsed = isAbsolute ? new URL(loc) : new URL(loc, baseUrl);
      if (["http:", "https:"].includes(parsed.protocol)) {
        unique.add(normalizeUrlPath(parsed.pathname));
      }
    } catch {
      // Unparseable <loc> values are skipped on purpose.
    }
  }
  return Array.from(unique);
}
2170
/**
 * Fetch a sitemap by URL and parse it, visiting each sitemap at most once.
 *
 * @param {string} url - Sitemap URL or path relative to `baseUrl`.
 * @param {string} baseUrl - Site base URL.
 * @param {Set<string>} visitedSitemaps - Resolved sitemap URLs already handled;
 *   the resolved URL is added before fetching.
 * @returns {Promise<string[]>} Routes from the sitemap, or an empty array when
 *   it was already visited.
 */
async function parseSitemapFromUrl(url, baseUrl, visitedSitemaps) {
  const sitemapUrl = resolveSitemapUrl(baseUrl, url);
  const alreadyVisited = visitedSitemaps.has(sitemapUrl);
  if (!alreadyVisited) {
    visitedSitemaps.add(sitemapUrl);
    const document = await fetchSitemapXml(sitemapUrl);
    return parseSitemap(document, baseUrl, visitedSitemaps);
  }
  return [];
}
2179
/**
 * Work out which routes the crawl source should fetch.
 *
 * Precedence: explicitly configured routes, then the configured sitemap, then
 * just the root route.
 *
 * @param {object} config - Resolved configuration (`config.source.crawl` is read).
 * @returns {Promise<string[]>} Route paths; empty when no crawl config exists.
 */
async function resolveRoutes(config) {
  const crawlConfig = config.source.crawl;
  if (!crawlConfig) {
    return [];
  }
  const hasExplicitRoutes = crawlConfig.routes.length > 0;
  if (hasExplicitRoutes) {
    const normalized = crawlConfig.routes.map((route) => normalizeUrlPath(ensureLeadingSlash(route)));
    return Array.from(new Set(normalized));
  }
  if (crawlConfig.sitemapUrl) {
    return parseSitemapFromUrl(crawlConfig.sitemapUrl, crawlConfig.baseUrl, /* @__PURE__ */ new Set());
  }
  return ["/"];
}
2192
/**
 * Fetch HTML for each crawl route from the configured base URL.
 *
 * Requests go through a concurrency limiter created with `pLimit3(8)`. Routes
 * whose fetch fails or returns a non-OK status are logged and skipped.
 *
 * @param {object} config - Resolved configuration; `config.source.crawl` is required.
 * @param {number} [maxPages] - Optional cap on the number of routes fetched.
 * @returns {Promise<object[]>} One page record per successfully fetched route.
 * @throws {Error} When `config.source.crawl` is missing.
 */
async function loadCrawledPages(config, maxPages) {
  const crawlConfig = config.source.crawl;
  if (!crawlConfig) {
    throw new Error("crawl source config is missing");
  }
  const routes = await resolveRoutes(config);
  let selected = routes;
  if (typeof maxPages === "number") {
    selected = routes.slice(0, Math.max(0, Math.floor(maxPages)));
  }
  const limit = pLimit3(8);
  const fetchRoute = async (route) => {
    const url = joinUrl(crawlConfig.baseUrl, route);
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch route ${route}: ${response.status} ${response.statusText}`);
    }
    return {
      url: normalizeUrlPath(route),
      html: await response.text(),
      sourcePath: url,
      outgoingLinks: []
    };
  };
  const outcomes = await Promise.allSettled(
    selected.map((route) => limit(() => fetchRoute(route)))
  );
  const pages = [];
  outcomes.forEach((outcome, index) => {
    if (!outcome) return;
    if (outcome.status === "fulfilled") {
      pages.push(outcome.value);
      return;
    }
    const route = selected[index] ?? "unknown";
    const reason = outcome.reason instanceof Error ? outcome.reason.message : String(outcome.reason);
    logger2.warn(`Skipping route ${route}: ${reason}`);
  });
  return pages;
}
2231
+
2232
+ // src/indexing/sources/static-output.ts
2233
+ import fs8 from "fs/promises";
2234
+ import path10 from "path";
2235
+ import fg3 from "fast-glob";
2236
// Load every HTML file found under the static output directory as a page.
//
// cwd      - project root; config.source.staticOutputDir is resolved against it.
// config   - resolved configuration.
// maxPages - optional cap on the number of files read.
//
// Returns one page record per file; files are read sequentially in glob order.
async function loadStaticOutputPages(cwd, config, maxPages) {
  const outputDir = path10.resolve(cwd, config.source.staticOutputDir);
  const discovered = await fg3(["**/*.html"], {
    cwd: outputDir,
    absolute: true
  });
  let selected = discovered;
  if (typeof maxPages === "number") {
    selected = discovered.slice(0, Math.max(0, Math.floor(maxPages)));
  }
  const pages = [];
  for (const filePath of selected) {
    const html = await fs8.readFile(filePath, "utf8");
    const url = staticHtmlFileToUrl(filePath, outputDir);
    const sourcePath = path10.relative(cwd, filePath).replace(/\\/g, "/");
    pages.push({ url, html, sourcePath, outgoingLinks: [] });
  }
  return pages;
}
2256
+
2257
+ // src/utils/time.ts
2258
/**
 * Current time as an ISO 8601 string.
 *
 * @returns {string} `Date#toISOString()` of the moment of the call.
 */
function nowIso() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
2261
/**
 * Milliseconds elapsed since a `process.hrtime.bigint()` reading.
 *
 * @param {bigint} start - Earlier `process.hrtime.bigint()` value (nanoseconds).
 * @returns {number} Elapsed time in milliseconds, possibly fractional.
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
2264
+
2265
+ // src/indexing/pipeline.ts
2266
// Per-model prices (keyed by embedding model id) used for the indexing cost estimate.
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
  "text-embedding-3-small": 2e-5,
  "text-embedding-3-large": 13e-5,
  "text-embedding-ada-002": 1e-4
};
// Price used when neither the config nor the table above covers the model.
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
/**
 * Indexing pipeline: loads source pages, extracts and de-duplicates them,
 * computes link counts, maps pages to route files, chunks them, embeds the
 * chunks that changed and synchronises the vector store.
 */
var IndexPipeline = class _IndexPipeline {
  cwd;
  config;
  embeddings;
  vectorStore;
  logger;
  /**
   * @param {object} options - Fully resolved dependencies:
   *   `cwd`, `config`, `embeddings`, `vectorStore` and `logger`.
   */
  constructor(options) {
    this.cwd = options.cwd;
    this.config = options.config;
    this.embeddings = options.embeddings;
    this.vectorStore = options.vectorStore;
    this.logger = options.logger;
  }
  /**
   * Build a pipeline, creating any dependency that was not supplied.
   *
   * @param {object} [options] - Optional `cwd`, `config`, `configPath`,
   *   `embeddingsProvider`, `vectorStore` and `logger` overrides.
   * @returns {Promise<IndexPipeline>} A ready-to-run pipeline.
   */
  static async create(options = {}) {
    const cwd = path11.resolve(options.cwd ?? process.cwd());
    const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
    const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
    const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
    return new _IndexPipeline({
      cwd,
      config,
      embeddings,
      vectorStore,
      logger: options.logger ?? new Logger()
    });
  }
  /** @returns {object} The configuration this pipeline was created with. */
  getConfig() {
    return this.config;
  }
  /**
   * Run one indexing pass.
   *
   * @param {object} [rawOptions] - Run options. `changedOnly` (default true),
   *   `force` (default false) and `dryRun` (default false) fall back to their
   *   defaults when omitted or undefined. Also read: `scopeOverride`,
   *   `sourceOverride`, `maxPages`, `maxChunks`.
   * @returns {Promise<object>} Run statistics: page/chunk counts, embedding
   *   estimate, route-resolution counts and per-stage timings in milliseconds.
   * @throws {SearchSocketError} `EMBEDDING_MODEL_MISMATCH` when the scope was
   *   indexed with a different model and `force` is off; `ROUTE_MAPPING_FAILED`
   *   when strict route mapping is enabled and a page only matches best-effort;
   *   `VECTOR_BACKEND_UNAVAILABLE` when the embedding provider returns a wrong
   *   number of vectors or an invalid vector.
   */
  async run(rawOptions = {}) {
    // Spread the caller's options first and apply the defaults afterwards: with
    // the spread last, an explicitly passed `undefined` (e.g. an unset CLI flag)
    // would overwrite the default and silently disable `changedOnly`.
    const options = {
      ...rawOptions,
      changedOnly: rawOptions.changedOnly ?? true,
      force: rawOptions.force ?? false,
      dryRun: rawOptions.dryRun ?? false
    };
    const stageTimingsMs = {};
    const stageStart = () => process.hrtime.bigint();
    const stageEnd = (name, start) => {
      stageTimingsMs[name] = Math.round(hrTimeMs(start));
    };
    const scope = resolveScope(this.config, options.scopeOverride);
    const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
    if (options.force) {
      await cleanMirrorForScope(statePath, scope);
    }
    // Stage: manifest — read what the vector store already holds for this scope.
    const manifestStart = stageStart();
    const existingHashes = await this.vectorStore.getContentHashes(scope);
    const existingModelId = await this.vectorStore.getScopeModelId(scope);
    if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
      throw new SearchSocketError(
        "EMBEDDING_MODEL_MISMATCH",
        `Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
      );
    }
    stageEnd("manifest", manifestStart);
    // Stage: source — load raw pages from the selected source mode.
    const sourceStart = stageStart();
    const sourceMode = options.sourceOverride ?? this.config.source.mode;
    let sourcePages;
    if (sourceMode === "static-output") {
      sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
    } else if (sourceMode === "crawl") {
      sourcePages = await loadCrawledPages(this.config, options.maxPages);
    } else if (sourceMode === "build") {
      sourcePages = await loadBuildPages(this.cwd, this.config, options.maxPages);
    } else {
      sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
    }
    stageEnd("source", sourceStart);
    const routeStart = stageStart();
    const routePatterns = await buildRoutePatterns(this.cwd);
    stageEnd("route_map", routeStart);
    // Stage: extract — turn each source page into extracted content, then
    // sort by URL and keep only the first page per URL.
    const extractStart = stageStart();
    const extractedPages = [];
    for (const sourcePage of sourcePages) {
      const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
      if (!extracted) {
        this.logger.warn(
          `Page ${sourcePage.url} produced no extractable content and was skipped. Check extract.mainSelector, extract.dropTags, and extract.dropSelectors settings.`
        );
        continue;
      }
      extractedPages.push(extracted);
      this.logger.event("page_extracted", {
        url: extracted.url
      });
    }
    extractedPages.sort((a, b) => a.url.localeCompare(b.url));
    const uniquePages = [];
    const seenUrls = /* @__PURE__ */ new Set();
    for (const page of extractedPages) {
      if (seenUrls.has(page.url)) {
        this.logger.warn(
          `Duplicate page source for ${page.url}; keeping first extracted page and skipping the duplicate.`
        );
        continue;
      }
      seenUrls.add(page.url);
      uniquePages.push(page);
    }
    stageEnd("extract", extractStart);
    // Stage: links — count incoming links, considering only links that point
    // at pages in this run.
    const linkStart = stageStart();
    const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
    const incomingLinkCount = /* @__PURE__ */ new Map();
    for (const page of uniquePages) {
      incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
    }
    for (const page of uniquePages) {
      for (const outgoing of page.outgoingLinks) {
        if (!pageSet.has(outgoing)) {
          continue;
        }
        incomingLinkCount.set(outgoing, (incomingLinkCount.get(outgoing) ?? 0) + 1);
      }
    }
    stageEnd("links", linkStart);
    // Stage: mirror — resolve each page's route file and build the mirror record.
    const mirrorStart = stageStart();
    const mirrorPages = [];
    let routeExact = 0;
    let routeBestEffort = 0;
    // Source pages that already carry a route file take precedence over pattern matching.
    const precomputedRoutes = /* @__PURE__ */ new Map();
    for (const sp of sourcePages) {
      if (sp.routeFile) {
        precomputedRoutes.set(normalizeUrlPath(sp.url), {
          routeFile: sp.routeFile,
          routeResolution: sp.routeResolution ?? "exact"
        });
      }
    }
    for (const page of uniquePages) {
      const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
      if (routeMatch.routeResolution === "best-effort") {
        if (this.config.source.strictRouteMapping) {
          throw new SearchSocketError(
            "ROUTE_MAPPING_FAILED",
            `Strict route mapping enabled: no exact route match for ${page.url} (resolved to ${routeMatch.routeFile}). Disable source.strictRouteMapping or add the missing route file.`,
            400
          );
        }
        this.logger.warn(
          `No exact route match for ${page.url}, falling back to ${routeMatch.routeFile}.`
        );
        routeBestEffort += 1;
      } else {
        routeExact += 1;
      }
      const mirror = {
        url: page.url,
        title: page.title,
        scope: scope.scopeName,
        routeFile: routeMatch.routeFile,
        routeResolution: routeMatch.routeResolution,
        generatedAt: nowIso(),
        incomingLinks: incomingLinkCount.get(page.url) ?? 0,
        outgoingLinks: page.outgoingLinks.length,
        depth: getUrlDepth(page.url),
        tags: page.tags,
        markdown: page.markdown,
        description: page.description,
        keywords: page.keywords
      };
      mirrorPages.push(mirror);
      if (this.config.state.writeMirror) {
        await writeMirrorPage(statePath, scope, mirror);
      }
      this.logger.event("markdown_written", { url: page.url });
    }
    if (!options.dryRun) {
      const pageRecords = mirrorPages.map((mp) => ({
        url: mp.url,
        title: mp.title,
        markdown: mp.markdown,
        projectId: scope.projectId,
        scopeName: scope.scopeName,
        routeFile: mp.routeFile,
        routeResolution: mp.routeResolution,
        incomingLinks: mp.incomingLinks,
        outgoingLinks: mp.outgoingLinks,
        depth: mp.depth,
        tags: mp.tags,
        indexedAt: mp.generatedAt
      }));
      // Page records are replaced wholesale for the scope.
      await this.vectorStore.deletePages(scope);
      await this.vectorStore.upsertPages(pageRecords, scope);
    }
    stageEnd("mirror", mirrorStart);
    // Stage: chunk — split pages into chunks, optionally capped by maxChunks.
    const chunkStart = stageStart();
    let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
    const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
    if (typeof maxChunks === "number") {
      chunks = chunks.slice(0, maxChunks);
    }
    for (const chunk of chunks) {
      this.logger.event("chunked", {
        url: chunk.url,
        chunkKey: chunk.chunkKey
      });
    }
    stageEnd("chunk", chunkStart);
    const currentChunkMap = /* @__PURE__ */ new Map();
    for (const chunk of chunks) {
      currentChunkMap.set(chunk.chunkKey, chunk);
    }
    // A chunk is (re-)embedded when forced, when it is new, when changedOnly is
    // off, or when its content hash differs from the stored one.
    const changedChunks = chunks.filter((chunk) => {
      if (options.force) {
        return true;
      }
      const existingHash = existingHashes.get(chunk.chunkKey);
      if (!existingHash) {
        return true;
      }
      if (!options.changedOnly) {
        return true;
      }
      return existingHash !== chunk.contentHash;
    });
    // Stored chunk keys that no longer exist in this run are removed.
    const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
    // Stage: embedding — estimate cost, then embed the changed chunks.
    const embedStart = stageStart();
    const chunkTokenEstimates = /* @__PURE__ */ new Map();
    for (const chunk of changedChunks) {
      chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
    }
    const estimatedTokens = changedChunks.reduce(
      (sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
      0
    );
    const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
    const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
    let newEmbeddings = 0;
    const vectorsByChunk = /* @__PURE__ */ new Map();
    if (!options.dryRun && changedChunks.length > 0) {
      const embeddings = await this.embeddings.embedTexts(
        changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
        this.config.embeddings.model
      );
      if (embeddings.length !== changedChunks.length) {
        throw new SearchSocketError(
          "VECTOR_BACKEND_UNAVAILABLE",
          `Embedding provider returned ${embeddings.length} vectors for ${changedChunks.length} chunks.`
        );
      }
      for (let i = 0; i < changedChunks.length; i += 1) {
        const chunk = changedChunks[i];
        const embedding = embeddings[i];
        if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
          throw new SearchSocketError(
            "VECTOR_BACKEND_UNAVAILABLE",
            `Embedding provider returned an invalid vector for chunk index ${i}.`
          );
        }
        vectorsByChunk.set(chunk.chunkKey, embedding);
        newEmbeddings += 1;
        this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
      }
    }
    stageEnd("embedding", embedStart);
    // Stage: sync — write new vectors and remove stale ones.
    const syncStart = stageStart();
    if (!options.dryRun) {
      const upserts = [];
      for (const chunk of changedChunks) {
        const vector = vectorsByChunk.get(chunk.chunkKey);
        if (!vector) {
          continue;
        }
        upserts.push({
          id: chunk.chunkKey,
          vector,
          metadata: {
            projectId: scope.projectId,
            scopeName: scope.scopeName,
            url: chunk.url,
            path: chunk.path,
            title: chunk.title,
            sectionTitle: chunk.sectionTitle ?? "",
            headingPath: chunk.headingPath,
            snippet: chunk.snippet,
            contentHash: chunk.contentHash,
            modelId: this.config.embeddings.model,
            depth: chunk.depth,
            incomingLinks: chunk.incomingLinks,
            routeFile: chunk.routeFile,
            tags: chunk.tags
          }
        });
      }
      if (upserts.length > 0) {
        await this.vectorStore.upsert(upserts, scope);
        this.logger.event("upserted", { count: upserts.length });
      }
      if (deletes.length > 0) {
        await this.vectorStore.deleteByIds(deletes, scope);
        this.logger.event("deleted", { count: deletes.length });
      }
    }
    stageEnd("sync", syncStart);
    // Stage: finalize — record scope metadata in the vector store.
    const finalizeStart = stageStart();
    if (!options.dryRun) {
      const scopeInfo = {
        projectId: scope.projectId,
        scopeName: scope.scopeName,
        modelId: this.config.embeddings.model,
        lastIndexedAt: nowIso(),
        vectorCount: chunks.length,
        lastEstimateTokens: estimatedTokens,
        lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
        lastEstimateChangedChunks: changedChunks.length
      };
      await this.vectorStore.recordScope(scopeInfo);
      this.logger.event("registry_updated", {
        scope: scope.scopeName,
        vectorCount: chunks.length
      });
    }
    stageEnd("finalize", finalizeStart);
    return {
      pagesProcessed: mirrorPages.length,
      chunksTotal: chunks.length,
      chunksChanged: changedChunks.length,
      newEmbeddings,
      deletes: deletes.length,
      estimatedTokens,
      estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
      routeExact,
      routeBestEffort,
      stageTimingsMs
    };
  }
};
2600
+
2601
+ // src/mcp/server.ts
2602
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2603
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
2604
+ import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
2605
+ import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js";
2606
+ import { z as z3 } from "zod";
2607
+
2608
+ // src/search/engine.ts
2609
+ import path12 from "path";
2610
+ import { z as z2 } from "zod";
2611
+
2612
+ // src/rerank/jina.ts
2613
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves after the delay.
 */
function sleep2(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
2618
/**
 * Reranker backed by the Jina rerank HTTP API (`https://api.jina.ai/v1/rerank`).
 */
var JinaReranker = class {
  apiKey;
  model;
  maxRetries;
  /**
   * @param {{ apiKey: string, model: string, maxRetries?: number }} options -
   *   Credentials, model id and retry budget (defaults to 4 retries).
   */
  constructor(options) {
    this.apiKey = options.apiKey;
    this.model = options.model;
    this.maxRetries = options.maxRetries ?? 4;
  }
  /**
   * Rerank candidates against a query.
   *
   * Network failures, HTTP 429 and HTTP 5xx responses are retried with
   * exponential backoff capped at 4 seconds.
   *
   * @param {string} query - Search query.
   * @param {{ id: string, text: string }[]} candidates - Documents to rerank.
   * @param {number} [topN] - Number of results requested; defaults to all candidates.
   * @returns {Promise<{ id: string, score: number }[]>} Results sorted by
   *   descending score. Malformed entries and out-of-range indices are dropped.
   * @throws {Error} On a non-retryable HTTP error, exhausted retries, or a
   *   response whose results are not an array.
   */
  async rerank(query, candidates, topN) {
    if (candidates.length === 0) {
      return [];
    }
    const body = {
      model: this.model,
      query,
      documents: candidates.map((candidate) => candidate.text),
      top_n: topN ?? candidates.length
    };
    let attempt = 0;
    while (attempt <= this.maxRetries) {
      attempt += 1;
      let response;
      try {
        response = await fetch("https://api.jina.ai/v1/rerank", {
          method: "POST",
          headers: {
            "content-type": "application/json",
            authorization: `Bearer ${this.apiKey}`
          },
          body: JSON.stringify(body)
        });
      } catch (error) {
        if (attempt <= this.maxRetries) {
          await sleep2(Math.min(300 * 2 ** attempt, 4e3));
          continue;
        }
        throw error;
      }
      if (!response.ok) {
        const retryable = response.status === 429 || response.status >= 500;
        if (retryable && attempt <= this.maxRetries) {
          await sleep2(Math.min(300 * 2 ** attempt, 4e3));
          continue;
        }
        const errorBody = await response.text();
        throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
      }
      const payload = await response.json();
      const rawResults = payload.results ?? payload.data ?? [];
      if (!Array.isArray(rawResults)) {
        throw new Error("Invalid Jina rerank response format");
      }
      return rawResults.flatMap((item) => {
        // Drop null / primitive entries instead of throwing on property access.
        if (item === null || typeof item !== "object") {
          return [];
        }
        const index = item.index;
        if (typeof index !== "number" || index < 0 || index >= candidates.length) {
          return [];
        }
        const candidate = candidates[index];
        if (!candidate) {
          return [];
        }
        // Only finite numbers are accepted as scores so the comparator below
        // never sees NaN or a non-numeric value.
        let score = 0;
        if (Number.isFinite(item.relevance_score)) {
          score = item.relevance_score;
        } else if (Number.isFinite(item.score)) {
          score = item.score;
        }
        return [
          {
            id: candidate.id,
            score
          }
        ];
      }).sort((a, b) => b.score - a.score);
    }
    throw new Error("Jina rerank request failed after retries");
  }
};
2692
+
2693
+ // src/rerank/factory.ts
2694
/**
 * Create the reranker selected by the configuration.
 *
 * @param {object} config - Resolved configuration (`config.rerank` is read).
 * @returns {JinaReranker | null} A Jina reranker when the provider is "jina"
 *   and the environment variable named by `rerank.jina.apiKeyEnv` is set;
 *   otherwise null.
 */
function createReranker(config) {
  const { rerank } = config;
  if (rerank.provider !== "jina") {
    return null;
  }
  const apiKey = process.env[rerank.jina.apiKeyEnv];
  return apiKey ? new JinaReranker({ apiKey, model: rerank.jina.model }) : null;
}
2710
+
2711
+ // src/search/ranking.ts
2712
/**
 * Clamp a value to a finite, non-negative number.
 *
 * @param {unknown} value - Candidate number.
 * @returns {number} `value` when it is a finite number >= 0, otherwise 0.
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
2718
/**
 * Score vector-search hits and order them best first.
 *
 * The base score is the hit's own score (negative infinity when it is not
 * finite). Depending on the ranking flags, a logarithmic incoming-link boost
 * and an inverse depth boost are added, each scaled by its configured weight.
 *
 * @param {object[]} hits - Hits with `score` and `metadata.incomingLinks` / `metadata.depth`.
 * @param {object} config - Resolved configuration (`config.ranking` is read).
 * @returns {{ hit: object, finalScore: number }[]} Scored hits, descending by `finalScore`.
 */
function rankHits(hits, config) {
  const scored = [];
  for (const hit of hits) {
    let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (config.ranking.enableIncomingLinkBoost) {
      const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      score += incomingBoost * config.ranking.weights.incomingLinks;
    }
    if (config.ranking.enableDepthBoost) {
      const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      score += depthBoost * config.ranking.weights.depth;
    }
    const finalScore = Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY;
    scored.push({ hit, finalScore });
  }
  // Treat NaN differences (e.g. -Infinity vs -Infinity) as a tie.
  const byFinalScoreDesc = (left, right) => {
    const delta = right.finalScore - left.finalScore;
    return Number.isNaN(delta) ? 0 : delta;
  };
  return scored.sort(byFinalScoreDesc);
}
2738
/**
 * Look up the ranking weight configured for a URL.
 *
 * An exact match (ignoring a trailing slash) wins. Otherwise the weight of the
 * longest pattern that is a path prefix of the URL is used; the root pattern
 * "/" never acts as a prefix. Without any match the weight is 1.
 *
 * @param {string} url - Page URL path.
 * @param {Record<string, number>} pageWeights - Weights keyed by URL pattern.
 * @returns {number} The applicable weight.
 */
function findPageWeight(url, pageWeights) {
  const stripTrailingSlash = (value) => (value !== "/" && value.endsWith("/") ? value.slice(0, -1) : value);
  const target = stripTrailingSlash(url);
  const entries = Object.entries(pageWeights).map(([pattern, weight]) => [stripTrailingSlash(pattern), weight]);
  const exact = entries.find(([pattern]) => pattern === target);
  if (exact) {
    return exact[1];
  }
  let longestPrefixLength = 0;
  let resolvedWeight = 1;
  for (const [pattern, weight] of entries) {
    if (pattern === "/") {
      continue;
    }
    const prefix = `${pattern}/`;
    if (prefix.length > longestPrefixLength && target.startsWith(prefix)) {
      longestPrefixLength = prefix.length;
      resolvedWeight = weight;
    }
  }
  return resolvedWeight;
}
2759
/**
 * Group ranked chunk hits by page URL and compute a score per page.
 *
 * A page's score is its best chunk score plus a decayed bonus from its next
 * best chunks (up to `ranking.aggregationCap` chunks in total), scaled by
 * `ranking.weights.aggregation` and then multiplied by the page weight.
 *
 * @param {{ hit: object, finalScore: number }[]} ranked - Scored chunk hits.
 * @param {object} config - Resolved configuration (`config.ranking` is read).
 * @returns {object[]} Page results sorted by descending `pageScore`.
 */
function aggregateByPage(ranked, config) {
  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const key = entry.hit.metadata.url;
    const bucket = chunksByUrl.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      chunksByUrl.set(key, [entry]);
    }
  }
  // Descending comparator on a numeric field; NaN differences count as a tie.
  const descendingBy = (field) => (left, right) => {
    const delta = right[field] - left[field];
    return Number.isNaN(delta) ? 0 : delta;
  };
  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  for (const [url, chunks] of chunksByUrl) {
    chunks.sort(descendingBy("finalScore"));
    const best = chunks[0];
    const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
    // The best chunk (position 0) contributes through maxScore only.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, position) => {
      if (position === 0) {
        return sum;
      }
      const chunkScore = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + chunkScore * Math.pow(aggregationDecay, position);
    }, 0);
    let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight !== 1) {
      pageScore *= pageWeight;
    }
    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  }
  return pages.sort(descendingBy("pageScore"));
}
2801
+
2802
+ // src/search/engine.ts
2803
// Shape accepted by SearchEngine.search(); anything else is rejected with INVALID_REQUEST.
var requestSchema = z2.object({
  // Query text; trimmed first, so whitespace-only input fails the min(1) check.
  q: z2.string().trim().min(1),
  // Number of results to return: a positive integer no greater than 100.
  topK: z2.number().int().positive().max(100).optional(),
  // Scope name, passed to resolveScope().
  scope: z2.string().optional(),
  // Forwarded to the vector store query as a path filter.
  pathPrefix: z2.string().optional(),
  // Forwarded to the vector store query as a tag filter.
  tags: z2.array(z2.string()).optional(),
  // When true, hits are re-scored by the configured reranker.
  rerank: z2.boolean().optional(),
  // Result granularity; search() treats a missing value as "page".
  groupBy: z2.enum(["page", "chunk"]).optional()
});
2812
/**
 * Query-side entry point: embeds a query, searches the vector store, ranks
 * (and optionally reranks) the hits, and shapes them into page- or
 * chunk-level results. Also exposes indexed-page lookup and backend health.
 */
var SearchEngine = class _SearchEngine {
  cwd;
  config;
  embeddings;
  vectorStore;
  reranker;

  /**
   * @param options Fully resolved collaborators: `cwd`, `config`,
   *   `embeddings`, `vectorStore`, `reranker`. Use `create()` to have the
   *   missing ones built from config.
   */
  constructor(options) {
    const { cwd, config, embeddings, vectorStore, reranker } = options;
    this.cwd = cwd;
    this.config = config;
    this.embeddings = embeddings;
    this.vectorStore = vectorStore;
    this.reranker = reranker;
  }

  /**
   * Async factory. Anything not supplied in `options` is loaded or created
   * from config.
   *
   * @param options Optional `cwd`, `configPath`, `config`,
   *   `embeddingsProvider`, `vectorStore`, `reranker`.
   * @returns A ready-to-use engine.
   */
  static async create(options = {}) {
    const cwd = path12.resolve(options.cwd ?? process.cwd());

    let config = options.config;
    if (config == null) {
      config = await loadConfig({ cwd, configPath: options.configPath });
    }

    let embeddings = options.embeddingsProvider;
    if (embeddings == null) {
      embeddings = createEmbeddingsProvider(config);
    }

    let vectorStore = options.vectorStore;
    if (vectorStore == null) {
      vectorStore = await createVectorStore(config, cwd);
    }

    // Only `undefined` triggers the default; an explicit `null` is passed through unchanged.
    const reranker = options.reranker !== void 0 ? options.reranker : createReranker(config);

    return new _SearchEngine({ cwd, config, embeddings, vectorStore, reranker });
  }

  /** @returns The configuration object this engine was built with. */
  getConfig() {
    return this.config;
  }

  /**
   * Run a search.
   *
   * @param request Validated against `requestSchema`.
   * @returns `{ q, scope, results, meta }`, where `meta` holds stage timings,
   *   whether reranking ran, and the embedding model id.
   * @throws SearchSocketError `INVALID_REQUEST` for a malformed request,
   *   `VECTOR_BACKEND_UNAVAILABLE` when no usable query embedding is
   *   produced, plus whatever `assertModelCompatibility` / `rerankHits` throw.
   */
  async search(request) {
    const validation = requestSchema.safeParse(request);
    if (!validation.success) {
      const reason = validation.error.issues[0]?.message ?? "Invalid request";
      throw new SearchSocketError("INVALID_REQUEST", reason, 400);
    }
    const input = validation.data;
    const totalStart = process.hrtime.bigint();

    const resolvedScope = resolveScope(this.config, input.scope);
    await this.assertModelCompatibility(resolvedScope);

    const topK = input.topK ?? 10;
    const wantsRerank = Boolean(input.rerank);
    const grouping = input.groupBy ?? "page";
    const groupByPage = grouping === "page";
    // Page grouping collapses several chunks into one result, so fetch a wider candidate pool.
    const candidateK = Math.max(50, groupByPage ? topK * 10 : topK);

    const embedStart = process.hrtime.bigint();
    const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
    const queryVector = queryEmbeddings[0];
    const vectorUsable = queryVector && queryVector.length !== 0 && queryVector.every((value) => Number.isFinite(value));
    if (!vectorUsable) {
      throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
    }
    const embedMs = hrTimeMs(embedStart);

    const vectorStart = process.hrtime.bigint();
    const filters = {
      topK: candidateK,
      pathPrefix: input.pathPrefix,
      tags: input.tags
    };
    const hits = await this.vectorStore.query(queryVector, filters, resolvedScope);
    const vectorMs = hrTimeMs(vectorStart);

    const ranked = rankHits(hits, this.config);
    let ordered = ranked;
    let usedRerank = false;
    let rerankMs = 0;
    if (wantsRerank) {
      const rerankStart = process.hrtime.bigint();
      ordered = await this.rerankHits(input.q, ranked, topK);
      rerankMs = hrTimeMs(rerankStart);
      usedRerank = true;
    }

    const roundScore = (value) => Number(value.toFixed(6));
    const sectionTitleOf = (metadata) => metadata.sectionTitle || void 0;

    let results;
    if (groupByPage) {
      const pages = aggregateByPage(ordered, this.config);
      const minRatio = this.config.ranking.minChunkScoreRatio;
      results = pages.slice(0, topK).map((page) => {
        const bestMetadata = page.bestChunk.hit.metadata;
        const bestScore = page.bestChunk.finalScore;
        const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
        // Keep at most five chunks that score close enough to the page's best chunk.
        const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
        let chunks = void 0;
        if (meaningful.length > 1) {
          chunks = meaningful.map((c) => ({
            sectionTitle: sectionTitleOf(c.hit.metadata),
            snippet: c.hit.metadata.snippet,
            headingPath: c.hit.metadata.headingPath,
            score: roundScore(c.finalScore)
          }));
        }
        return {
          url: page.url,
          title: page.title,
          sectionTitle: sectionTitleOf(bestMetadata),
          snippet: bestMetadata.snippet,
          score: roundScore(page.pageScore),
          routeFile: page.routeFile,
          chunks
        };
      });
    } else {
      results = ordered.slice(0, topK).map((entry) => {
        const { metadata } = entry.hit;
        return {
          url: metadata.url,
          title: metadata.title,
          sectionTitle: sectionTitleOf(metadata),
          snippet: metadata.snippet,
          score: roundScore(entry.finalScore),
          routeFile: metadata.routeFile
        };
      });
    }

    return {
      q: input.q,
      scope: resolvedScope.scopeName,
      results,
      meta: {
        timingsMs: {
          embed: Math.round(embedMs),
          vector: Math.round(vectorMs),
          rerank: Math.round(rerankMs),
          total: Math.round(hrTimeMs(totalStart))
        },
        usedRerank,
        modelId: this.config.embeddings.model
      }
    };
  }

  /**
   * Fetch the indexed page for a path or absolute URL.
   *
   * @param pathOrUrl Site path or http(s) URL.
   * @param scope Optional scope name.
   * @returns `{ url, frontmatter, markdown }`.
   * @throws SearchSocketError `INVALID_REQUEST` (404) when the page is not indexed.
   */
  async getPage(pathOrUrl, scope) {
    const resolvedScope = resolveScope(this.config, scope);
    const urlPath = this.resolveInputPath(pathOrUrl);
    const page = await this.vectorStore.getPage(urlPath, resolvedScope);
    if (!page) {
      throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
    }
    const frontmatter = {
      url: page.url,
      title: page.title,
      routeFile: page.routeFile,
      routeResolution: page.routeResolution,
      incomingLinks: page.incomingLinks,
      outgoingLinks: page.outgoingLinks,
      depth: page.depth,
      tags: page.tags,
      indexedAt: page.indexedAt
    };
    return { url: page.url, frontmatter, markdown: page.markdown };
  }

  /** @returns Whatever the vector store reports from its own `health()`. */
  async health() {
    return this.vectorStore.health();
  }

  /**
   * Reduce user input to a normalized URL path.
   * Absolute http(s) URLs contribute only their pathname; other input has its
   * query string and fragment removed first.
   */
  resolveInputPath(pathOrUrl) {
    try {
      const looksAbsolute = /^https?:\/\//.test(pathOrUrl);
      if (looksAbsolute) {
        return normalizeUrlPath(new URL(pathOrUrl).pathname);
      }
    } catch {
      // Any failure here deliberately falls through to the plain-path handling below.
    }
    const [beforeQueryOrHash] = pathOrUrl.split(/[?#]/);
    return normalizeUrlPath(beforeQueryOrHash ?? pathOrUrl);
  }

  /**
   * Refuse to query a scope whose stored model id differs from the configured one.
   *
   * @throws SearchSocketError `EMBEDDING_MODEL_MISMATCH`.
   */
  async assertModelCompatibility(scope) {
    const indexedModel = await this.vectorStore.getScopeModelId(scope);
    if (!indexedModel || indexedModel === this.config.embeddings.model) {
      return;
    }
    throw new SearchSocketError(
      "EMBEDDING_MODEL_MISMATCH",
      `Scope ${scope.scopeName} was indexed with ${indexedModel}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
    );
  }

  /**
   * Re-score ranked hits with the configured reranker and re-sort them.
   * Entries the reranker returned no finite score for keep their base score.
   *
   * @throws SearchSocketError `INVALID_REQUEST` when the provider is not
   *   "jina"; `CONFIG_MISSING` when no reranker instance is available.
   */
  async rerankHits(query, ranked, topK) {
    if (this.config.rerank.provider !== "jina") {
      throw new SearchSocketError(
        "INVALID_REQUEST",
        "rerank=true requested but rerank.provider is not configured as 'jina'.",
        400
      );
    }
    if (!this.reranker) {
      throw new SearchSocketError(
        "CONFIG_MISSING",
        `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
        400
      );
    }

    const candidates = ranked.map((entry) => {
      const { id, metadata } = entry.hit;
      const parts = [metadata.title, metadata.sectionTitle, metadata.snippet].filter(Boolean);
      return { id, text: parts.join("\n") };
    });
    const limit = Math.max(topK, this.config.rerank.topN);
    const reranked = await this.reranker.rerank(query, candidates, limit);

    const rerankScoreById = /* @__PURE__ */ new Map();
    for (const item of reranked) {
      rerankScoreById.set(item.id, item.score);
    }

    const rescored = ranked.map((entry) => {
      const rerankScore = rerankScoreById.get(entry.hit.id);
      const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
      let finalScore = safeBaseScore;
      if (rerankScore !== void 0 && Number.isFinite(rerankScore)) {
        // The base score contributes only a small term, acting as a tie-breaker between equal rerank scores.
        const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
        if (Number.isFinite(combinedScore)) {
          finalScore = combinedScore;
        }
      }
      return { ...entry, finalScore };
    });

    return rescored.sort((left, right) => {
      const diff = right.finalScore - left.finalScore;
      return Number.isNaN(diff) ? 0 : diff;
    });
  }
};
3022
+
3023
+ // src/mcp/server.ts
3024
/**
 * Build an MCP server exposing two tools backed by `engine`:
 * `search` (semantic search) and `get_page` (indexed markdown lookup).
 * Both tools return their result as one pretty-printed JSON text item.
 *
 * @param engine Object providing `search(request)` and `getPage(pathOrUrl, scope)`.
 * @returns The configured McpServer (not yet connected to a transport).
 */
function createServer(engine) {
  // Wrap any payload as a single MCP text content item.
  const asJsonText = (payload) => ({
    content: [
      {
        type: "text",
        text: JSON.stringify(payload, null, 2)
      }
    ]
  });

  const server = new McpServer({
    name: "searchsocket-mcp",
    // NOTE(review): hard-coded; the bundled package.json reports 0.2.0 — confirm whether these should track each other.
    version: "0.1.0"
  });

  const searchTool = {
    description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and topK.",
    inputSchema: {
      query: z3.string().min(1),
      scope: z3.string().optional(),
      topK: z3.number().int().positive().max(100).optional(),
      pathPrefix: z3.string().optional(),
      tags: z3.array(z3.string()).optional(),
      groupBy: z3.enum(["page", "chunk"]).optional()
    }
  };
  server.registerTool("search", searchTool, async ({ query, topK, scope, pathPrefix, tags, groupBy }) => {
    // The tool calls its text field `query`; the engine request calls it `q`.
    const result = await engine.search({ q: query, topK, scope, pathPrefix, tags, groupBy });
    return asJsonText(result);
  });

  const getPageTool = {
    description: "Fetch indexed markdown for a specific path or URL, including frontmatter and routeFile mapping.",
    inputSchema: {
      pathOrUrl: z3.string().min(1),
      scope: z3.string().optional()
    }
  };
  server.registerTool("get_page", getPageTool, async ({ pathOrUrl, scope }) => {
    const page = await engine.getPage(pathOrUrl, scope);
    return asJsonText(page);
  });

  return server;
}
3084
/**
 * Route console output to stderr so stdout stays reserved for the stdio MCP
 * transport's protocol messages.
 *
 * Replaces `console.log`, `console.info`, `console.debug` and `console.warn`
 * with writers that emit `[LEVEL] <args joined by spaces>\n` on stderr.
 * `console.info` and `console.debug` are included because in Node.js they
 * write to stdout just like `console.log`; leaving them untouched would let
 * stray output leak into the protocol stream.
 */
function redirectConsoleToStderr() {
  const stderrWriter = (label) => (...args) => {
    process.stderr.write(`[${label}] ${args.map(String).join(" ")}\n`);
  };
  console.log = stderrWriter("LOG");
  console.info = stderrWriter("INFO");
  console.debug = stderrWriter("DEBUG");
  console.warn = stderrWriter("WARN");
}
3096
/**
 * Serve MCP over HTTP on 127.0.0.1 in stateless mode: every POST gets a fresh
 * server + transport pair, and GET/DELETE are answered with 405.
 *
 * @param serverFactory Returns a new MCP server for each request.
 * @param config Supplies defaults at `config.mcp.http.port` / `.path`.
 * @param opts Optional overrides `httpPort` and `httpPath`.
 * @returns Resolves once the listener is bound; rejects if binding fails.
 */
async function startHttpServer(serverFactory, config, opts) {
  const app = createMcpExpressApp();
  const port = opts.httpPort ?? config.mcp.http.port;
  const endpointPath = opts.httpPath ?? config.mcp.http.path;

  app.post(endpointPath, async (req, res) => {
    const server = serverFactory();
    const transport = new StreamableHTTPServerTransport({
      sessionIdGenerator: void 0
    });
    // Register cleanup before doing any work. Attaching it after handleRequest
    // meant it was never installed when connect/handleRequest threw, and could
    // miss a response that had already closed, leaking the per-request pair.
    res.on("close", () => {
      transport.close();
      server.close();
    });
    try {
      await server.connect(transport);
      await transport.handleRequest(req, res, req.body);
    } catch (error) {
      if (!res.headersSent) {
        res.status(500).json({
          jsonrpc: "2.0",
          error: {
            code: -32603,
            message: error instanceof Error ? error.message : "Internal server error"
          },
          id: null
        });
      }
    }
  });

  // Stateless mode has no session to stream from or terminate, so reject GET and DELETE.
  const methodNotAllowed = (_req, res) => {
    res.writeHead(405).end(
      JSON.stringify({
        jsonrpc: "2.0",
        error: {
          code: -32e3,
          message: "Method not allowed"
        },
        id: null
      })
    );
  };
  app.get(endpointPath, methodNotAllowed);
  app.delete(endpointPath, methodNotAllowed);

  await new Promise((resolve, reject) => {
    const instance = app.listen(port, "127.0.0.1", () => {
      process.stderr.write(`SearchSocket MCP HTTP server listening on http://127.0.0.1:${port}${endpointPath}\n`);
      resolve();
    });
    instance.once("error", reject);
    // Close the listener before exiting on Ctrl-C.
    process.on("SIGINT", async () => {
      await new Promise((shutdownResolve) => instance.close(() => shutdownResolve()));
      process.exit(0);
    });
  });
}
3162
/**
 * Load config, build a SearchEngine, and serve it over MCP using either the
 * HTTP or the stdio transport.
 *
 * @param options Optional `cwd`, `configPath`, `transport` (overrides
 *   `config.mcp.transport`), plus `httpPort` / `httpPath`, which are passed
 *   through to `startHttpServer`.
 */
async function runMcpServer(options = {}) {
  const { cwd, configPath } = options;
  const config = await loadConfig({ cwd, configPath });
  const transportKind = options.transport ?? config.mcp.transport;

  // Must happen before the engine is created so nothing it logs reaches stdout.
  if (transportKind === "stdio") {
    redirectConsoleToStderr();
  }

  const engine = await SearchEngine.create({ cwd, configPath, config });

  if (transportKind === "http") {
    await startHttpServer(() => createServer(engine), config, options);
    return;
  }

  await createServer(engine).connect(new StdioServerTransport());
}
3184
+
3185
+ // src/cli.ts
3186
/**
 * Convert a CLI option value to a positive integer.
 *
 * @param value Raw option value (converted with `Number()`).
 * @param flag Flag name used in the error message.
 * @returns The parsed integer.
 * @throws SearchSocketError `INVALID_REQUEST` (400) when the value is not an integer greater than zero.
 */
function parsePositiveInt(value, flag) {
  const parsed = Number(value);
  const isPositiveInteger = Number.isInteger(parsed) && parsed > 0;
  if (isPositiveInteger) {
    return parsed;
  }
  throw new SearchSocketError("INVALID_REQUEST", `${flag} must be a positive integer`, 400);
}
3193
/**
 * Parse a duration such as `30d`, `12h`, `15m`, `45s` or `500ms` into milliseconds.
 * Surrounding whitespace is ignored and the unit is case-insensitive.
 *
 * @param value Duration text.
 * @returns The duration in milliseconds.
 * @throws SearchSocketError `INVALID_REQUEST` (400) when the text does not match the expected form.
 */
function parseDurationMs(value) {
  const parts = value.trim().match(/^(\d+)(ms|s|m|h|d)$/i);
  if (!parts) {
    throw new SearchSocketError(
      "INVALID_REQUEST",
      "Duration must look like 30d, 12h, 15m, 45s, or 500ms",
      400
    );
  }
  const msPerUnit = /* @__PURE__ */ new Map([
    ["ms", 1],
    ["s", 1e3],
    ["m", 6e4],
    ["h", 36e5],
    ["d", 864e5]
  ]);
  const amount = Number(parts[1]);
  const unit = (parts[2] ?? "").toLowerCase();
  const factor = msPerUnit.get(unit);
  if (factor === void 0) {
    // The regex above only admits the units in the table, so this is a defensive guard.
    throw new SearchSocketError("INVALID_REQUEST", `Unsupported duration unit: ${unit}`, 400);
  }
  return amount * factor;
}
3219
/**
 * Format a number as a dollar amount with six decimal places.
 *
 * @param value Amount in USD.
 * @returns For example `"$0.001234"`.
 */
function formatUsd(value) {
  const sixDecimals = value.toFixed(6);
  return `$${sixDecimals}`;
}
3222
/**
 * Write a human-readable summary of an indexing run to stdout, one line per metric.
 *
 * @param stats Run statistics; reads `pagesProcessed`, `chunksTotal`,
 *   `chunksChanged`, `newEmbeddings`, `deletes`, `estimatedTokens`,
 *   `estimatedCostUSD`, `routeExact`, `routeBestEffort` and the
 *   `stageTimingsMs` map.
 */
function printIndexSummary(stats) {
  const emit = (line) => {
    process.stdout.write(`${line}\n`);
  };
  emit(`pages processed: ${stats.pagesProcessed}`);
  emit(`chunks total: ${stats.chunksTotal}`);
  emit(`chunks changed: ${stats.chunksChanged}`);
  emit(`embeddings created: ${stats.newEmbeddings}`);
  emit(`deletes: ${stats.deletes}`);
  emit(`estimated tokens: ${stats.estimatedTokens}`);
  emit(`estimated cost (USD): ${formatUsd(stats.estimatedCostUSD)}`);
  emit(`route mapping: ${stats.routeExact} exact, ${stats.routeBestEffort} best-effort`);
  emit("stage timings (ms):");
  Object.entries(stats.stageTimingsMs).forEach(([stage, ms]) => {
    emit(` ${stage}: ${ms}`);
  });
}
3245
/**
 * Work out which paths the dev watcher should observe for the configured source mode.
 * `src/routes/**` is always included; the rest depends on `config.source.mode`.
 *
 * @param config Reads `config.source`.
 * @param cwd Base directory the returned paths are resolved against.
 * @returns Absolute paths/globs.
 */
function collectWatchPaths(config, cwd) {
  const { source } = config;
  const watched = ["src/routes/**"];

  switch (source.mode) {
    case "content-files":
      if (source.contentFiles) {
        const { baseDir, globs } = source.contentFiles;
        for (const glob of globs) {
          watched.push(path13.join(baseDir, glob));
        }
      }
      break;
    case "static-output":
      watched.push(source.staticOutputDir);
      break;
    case "crawl":
      watched.push("searchsocket.config.ts");
      break;
    case "build":
      if (source.build) {
        watched.push("searchsocket.config.ts", source.build.outputDir);
      }
      break;
    default:
      break;
  }

  return watched.map((entry) => path13.resolve(cwd, entry));
}
3264
/**
 * Create `<cwd>/.searchsocket` if it does not already exist.
 *
 * @param cwd Project directory.
 * @returns Path of the state directory.
 */
function ensureStateDir(cwd) {
  const stateDir = path13.join(cwd, ".searchsocket");
  // recursive: true makes this a no-op when the directory is already present.
  fs9.mkdirSync(stateDir, { recursive: true });
  return stateDir;
}
3269
/**
 * Make sure `<cwd>/.gitignore` lists SearchSocket's local state files.
 * Entries already present (compared after trimming each line) are left
 * alone. Missing ones are appended under a "# SearchSocket local state"
 * heading. The file is created when absent and untouched when nothing is missing.
 *
 * @param cwd Project directory.
 */
function ensureGitignore(cwd) {
  const gitignorePath = path13.join(cwd, ".gitignore");
  const required = [
    ".searchsocket/vectors.db",
    ".searchsocket/vectors.db-shm",
    ".searchsocket/vectors.db-wal",
    ".searchsocket/manifest.json",
    ".searchsocket/registry.json"
  ];

  const existing = fs9.existsSync(gitignorePath) ? fs9.readFileSync(gitignorePath, "utf8") : "";
  const present = new Set(existing.split("\n").map((line) => line.trim()));
  const missing = required.filter((entry) => !present.has(entry));
  if (missing.length === 0) {
    return;
  }

  const addition = ["", "# SearchSocket local state", ...missing, ""].join("\n");
  fs9.writeFileSync(gitignorePath, existing.trimEnd() + addition, "utf8");
}
3293
/**
 * Read scope names from a text file, one per line.
 * Lines are trimmed and blank lines are dropped.
 *
 * @param filePath File to read (UTF-8).
 * @returns Set of scope names.
 */
function readScopesFromFile(filePath) {
  const scopes = /* @__PURE__ */ new Set();
  for (const line of fs9.readFileSync(filePath, "utf8").split(/\r?\n/)) {
    const name = line.trim();
    if (name) {
      scopes.add(name);
    }
  }
  return scopes;
}
3299
/**
 * List remote-tracking git branches as scope names (leading `origin/` removed).
 *
 * Best effort: if git cannot be run (not installed, not a repository, ...)
 * an empty set is returned instead of throwing. When one or zero branches
 * come back, a warning is printed, because a shallow CI checkout could make
 * every other scope look stale.
 *
 * @param cwd Directory to run git in.
 * @returns Set of branch/scope names.
 */
function readRemoteGitBranches(cwd) {
  try {
    // Double quotes are unwrapped by both POSIX shells and cmd.exe. The previous
    // single quotes reached git literally on Windows, so each branch came back
    // wrapped in quotes and never matched a scope name.
    const output = execSync2('git branch -r --format="%(refname:short)"', {
      cwd,
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"]
    });
    const scopes = output
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter(Boolean)
      .map((line) => line.replace(/^origin\//, ""));
    if (scopes.length <= 1) {
      process.stdout.write(
        "warning: git branch -r returned 1 or fewer branches. If running in CI, ensure the checkout step uses fetch-depth: 0 to avoid accidentally pruning active branch scopes.\n"
      );
    }
    return new Set(scopes);
  } catch {
    // Deliberate fallback: callers treat an empty set as "no branch list available".
    return /* @__PURE__ */ new Set();
  }
}
3317
/**
 * Resolve configuration for `dev`, which also works when no config file exists.
 *
 * @param cwd Project directory.
 * @param configPath Optional config path; defaults to `searchsocket.config.ts` under `cwd`.
 * @returns The loaded config when the file exists, otherwise the result of merging an empty config.
 */
async function loadResolvedConfigForDev(cwd, configPath) {
  const candidate = path13.resolve(cwd, configPath ?? "searchsocket.config.ts");
  const hasConfigFile = fs9.existsSync(candidate);
  return hasConfigFile ? loadConfig({ cwd, configPath }) : mergeConfig(cwd, {});
}
3324
/**
 * Read the options of a subcommand's parent (the root program).
 *
 * @param command Subcommand instance handed to an action callback.
 * @returns The parent's `opts()` result, or `{}` when there is no parent or it has no `opts` function.
 */
function getRootOptions(command) {
  const parent = command.parent;
  const readOpts = parent?.opts;
  if (typeof readOpts === "function") {
    // Invoke with the parent as receiver so `this` inside opts() is the parent.
    return readOpts.call(parent);
  }
  return {};
}
3332
/**
 * Run one indexing pass and report the result on stdout.
 *
 * @param opts Reads `cwd`, `configPath`, `scope`, `changedOnly`, `force`,
 *   `dryRun`, `source`, `maxPages`, `maxChunks`, `verbose`, `json`. With
 *   `json` set, the stats object is printed as JSON; otherwise a text summary is printed.
 */
async function runIndexCommand(opts) {
  const { cwd, configPath, verbose, json } = opts;
  const logger3 = new Logger({ verbose, json });
  const pipeline = await IndexPipeline.create({ cwd, configPath, logger: logger3 });

  const runOptions = {
    scopeOverride: opts.scope,
    changedOnly: opts.changedOnly,
    force: opts.force,
    dryRun: opts.dryRun,
    sourceOverride: opts.source,
    maxPages: opts.maxPages,
    maxChunks: opts.maxChunks,
    verbose
  };
  const stats = await pipeline.run(runOptions);

  if (!json) {
    printIndexSummary(stats);
    return;
  }
  process.stdout.write(`${JSON.stringify(stats, null, 2)}\n`);
}
3359
// Root CLI program. The global options declared here are read by subcommands via getRootOptions().
var program = new Command();
program
  .name("searchsocket")
  .description("Semantic site search and MCP retrieval for SvelteKit")
  .version(package_default.version)
  .option("-C, --cwd <path>", "working directory", process.cwd())
  .option("--config <path>", "config path (defaults to searchsocket.config.ts)");
3361
// `init`: scaffold the config file, state directory and .gitignore entries, then print setup snippets.
program
  .command("init")
  .description("Create searchsocket.config.ts and .searchsocket state directory")
  .action(async (_opts, command) => {
    const root = getRootOptions(command).cwd ?? process.cwd();
    const cwd = path13.resolve(root);
    const configPath = writeMinimalConfig(cwd);
    const stateDir = ensureStateDir(cwd);
    ensureGitignore(cwd);

    const print = (text) => {
      process.stdout.write(text);
    };
    print(`created/verified config: ${configPath}\n`);
    print(`created/verified state dir: ${stateDir}\n\n`);
    print("SvelteKit hook snippet:\n\n");
    print('import { searchsocketHandle } from "searchsocket/sveltekit";\n\n');
    print("export const handle = searchsocketHandle();\n\n");
    print("Optional build-triggered indexing plugin:\n\n");
    print('import { searchsocketVitePlugin } from "searchsocket/sveltekit";\n\n');
    print("// svelte.config.js / vite plugins:\n");
    print("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
    print("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
  });
3381
// `index`: run one indexing pass with the options given on the command line.
program
  .command("index")
  .description("Index site content into markdown mirror + vector store")
  .option("--scope <name>", "scope override")
  .option("--changed-only", "only process changed chunks", true)
  .option("--no-changed-only", "re-index regardless of previous manifest")
  .option("--force", "force full mirror rebuild and re-upsert", false)
  .option("--dry-run", "compute plan and cost, no API writes", false)
  .option("--source <mode>", "source mode override: static-output|crawl|content-files|build")
  .option("--max-pages <n>", "limit pages processed")
  .option("--max-chunks <n>", "limit chunks processed")
  .option("--verbose", "verbose output", false)
  .option("--json", "emit JSON logs and summary", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const configPath = rootOpts?.config;
    // Limits are validated only when supplied; an omitted flag stays undefined.
    const maxPages = opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0;
    const maxChunks = opts.maxChunks ? parsePositiveInt(opts.maxChunks, "--max-chunks") : void 0;
    const { scope, changedOnly, force, dryRun, source, verbose, json } = opts;
    await runIndexCommand({
      cwd,
      configPath,
      scope,
      changedOnly,
      force,
      dryRun,
      source,
      maxPages,
      maxChunks,
      verbose,
      json
    });
  });
3398
// `status`: print project/scope details, vector backend health and the registry of indexed scopes.
// On any backend failure a short report is printed and the exit code is set to 1.
program
  .command("status")
  .description("Show scope, indexing state, backend health, and recent cost estimate")
  .option("--scope <name>", "scope override")
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });
    const scope = resolveScope(config, opts.scope);

    const say = (line) => {
      process.stdout.write(`${line}\n`);
    };
    const reasonOf = (error) => (error instanceof Error ? error.message : "unknown error");

    // Step 1: connect to the vector store and ask for its health.
    let vectorStore;
    let health;
    try {
      vectorStore = await createVectorStore(config, cwd);
      health = await vectorStore.health();
    } catch (error) {
      say(`project: ${config.project.id}`);
      say(`vector health: error (${reasonOf(error)})`);
      process.exitCode = 1;
      return;
    }

    // Step 2: read the scope registry and the tracked content hashes.
    let scopeRegistry;
    let scopeInfo;
    let hashes;
    try {
      scopeRegistry = await vectorStore.listScopes(config.project.id);
      scopeInfo = scopeRegistry.find((entry) => entry.scopeName === scope.scopeName);
      hashes = await vectorStore.getContentHashes(scope);
    } catch (error) {
      say(`project: ${config.project.id}`);
      say(`resolved scope: ${scope.scopeName}`);
      say(`vector health: error (${reasonOf(error)})`);
      process.exitCode = 1;
      return;
    }

    // Step 3: the full report.
    say(`project: ${config.project.id}`);
    say(`resolved scope: ${scope.scopeName}`);
    say(`embedding model: ${config.embeddings.model}`);
    const tursoUrl = process.env[config.vector.turso.urlEnv];
    const vectorMode = tursoUrl ? `remote (${tursoUrl})` : `local (${config.vector.turso.localPath})`;
    say(`vector backend: turso/libsql (${vectorMode})`);
    say(`vector health: ${health.ok ? "ok" : `error (${health.details ?? "n/a"})`}`);

    if (!scopeInfo) {
      say(`last indexed (${scope.scopeName}): never`);
    } else {
      say(`last indexed (${scope.scopeName}): ${scopeInfo.lastIndexedAt ?? "never"}`);
      say(`tracked chunks: ${hashes.size}`);
      if (scopeInfo.lastEstimateTokens != null) {
        say(`last estimated tokens: ${scopeInfo.lastEstimateTokens}`);
      }
      if (scopeInfo.lastEstimateCostUSD != null) {
        say(`last estimated cost: ${formatUsd(scopeInfo.lastEstimateCostUSD)}`);
      }
    }

    if (scopeRegistry.length > 0) {
      process.stdout.write("\nregistry scopes:\n");
      for (const item of scopeRegistry) {
        say(` - ${item.scopeName} model=${item.modelId} lastIndexedAt=${item.lastIndexedAt} vectors=${item.vectorCount ?? "unknown"}`);
      }
    }
  });
3476
// `dev`: index once, then watch the source paths and re-index (debounced) on every change.
// Optionally starts the MCP HTTP server alongside the watcher. Runs until SIGINT.
program
  .command("dev")
  .description("Watch content files/routes and incrementally reindex on changes")
  .option("--scope <name>", "scope override")
  .option("--mcp", "start MCP server (http transport) alongside watcher", false)
  .option("--mcp-port <n>", "MCP HTTP port", "3338")
  .option("--mcp-path <path>", "MCP HTTP path", "/mcp")
  .option("--verbose", "verbose logs", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
    const watchPaths = collectWatchPaths(config, cwd);

    process.stdout.write("starting searchsocket dev watcher...\n");
    const watchList = watchPaths.map((entry) => ` - ${entry}`).join("\n");
    process.stdout.write(`watching:\n${watchList}\n`);

    let indexing = false;
    let rerunRequested = false;
    let debounceTimer = null;

    // Only one index pass runs at a time. A request that arrives mid-pass is
    // remembered and served by a single follow-up pass once the current one ends.
    const reindex = async () => {
      if (indexing) {
        rerunRequested = true;
        return;
      }
      indexing = true;
      try {
        await runIndexCommand({
          cwd,
          configPath: rootOpts?.config,
          scope: opts.scope,
          changedOnly: true,
          force: false,
          dryRun: false,
          verbose: opts.verbose,
          json: false
        });
      } catch (error) {
        // Report and keep watching; a failed pass must not stop the dev loop.
        const reason = error instanceof Error ? error.message : String(error);
        process.stderr.write(`index error: ${reason}\n`);
      } finally {
        indexing = false;
        if (rerunRequested) {
          rerunRequested = false;
          await reindex();
        }
      }
    };

    await reindex();

    const watcher = chokidar.watch(watchPaths, { ignoreInitial: true });
    watcher.on("all", (event, changedPath) => {
      process.stdout.write(`detected ${event}: ${changedPath}\n`);
      // Debounce: collapse a burst of file events into one pass 350 ms after the last event.
      if (debounceTimer) {
        clearTimeout(debounceTimer);
      }
      debounceTimer = setTimeout(() => {
        void reindex();
      }, 350);
    });

    if (opts.mcp) {
      // Started without awaiting so the watcher keeps running alongside the server.
      void runMcpServer({
        cwd,
        configPath: rootOpts?.config,
        transport: "http",
        httpPort: parsePositiveInt(opts.mcpPort, "--mcp-port"),
        httpPath: opts.mcpPath
      });
    }

    // Keep the action pending until Ctrl-C, then close the watcher.
    await new Promise((resolve) => {
      process.on("SIGINT", () => {
        void watcher.close().then(() => resolve());
      });
    });
  });
3545
// `clean`: remove the local state directory and, with --remote, also delete the scope's vectors.
program
  .command("clean")
  .description("Delete local state and optionally delete remote vectors for a scope")
  .option("--scope <name>", "scope override")
  .option("--remote", "delete remote scope vectors", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });
    const scope = resolveScope(config, opts.scope);

    const statePath = path13.join(cwd, config.state.dir);
    // force: true means a missing directory is not an error.
    await fsp.rm(statePath, { recursive: true, force: true });
    process.stdout.write(`deleted local state directory: ${statePath}\n`);

    if (!opts.remote) {
      return;
    }
    const vectorStore = await createVectorStore(config, cwd);
    await vectorStore.deleteScope(scope);
    process.stdout.write(`deleted remote vectors for scope ${scope.scopeName}\n`);
  });
3561
// `prune`: find scopes that are no longer wanted and, with --apply, delete them.
// A scope is stale when it is missing from the keep-list (scopes file or remote git
// branches) or when it was last indexed longer ago than --older-than. "main" is never pruned.
program
  .command("prune")
  .description("List/delete stale scopes (dry-run by default)")
  .option("--apply", "apply deletions", false)
  .option("--scopes-file <path>", "file containing active scopes")
  .option("--older-than <duration>", "ttl cutoff like 30d")
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });
    const baseScope = resolveScope(config);
    const reasonOf = (error) => (error instanceof Error ? error.message : String(error));

    let vectorStore;
    let scopes;
    try {
      vectorStore = await createVectorStore(config, cwd);
      scopes = await vectorStore.listScopes(config.project.id);
    } catch (error) {
      process.stderr.write(`error: failed to access Turso vector store: ${reasonOf(error)}\n`);
      process.exitCode = 1;
      return;
    }
    process.stdout.write(`using remote registry\n`);

    // The keep-list comes from the scopes file when given, otherwise from remote git branches.
    let keepScopes = opts.scopesFile
      ? readScopesFromFile(path13.resolve(cwd, opts.scopesFile))
      : readRemoteGitBranches(cwd);
    if (config.scope.sanitize && keepScopes.size > 0) {
      keepScopes = new Set([...keepScopes].map(sanitizeScopeName));
    }

    const olderThanMs = opts.olderThan ? parseDurationMs(opts.olderThan) : void 0;
    const now = Date.now();
    // An empty keep-list disables list-based pruning, so a failed branch lookup cannot mark everything stale.
    const useList = keepScopes.size > 0;
    const stale = scopes.filter((entry) => {
      if (entry.scopeName === "main") {
        return false;
      }
      const staleByList = useList && !keepScopes.has(entry.scopeName);
      const staleByTtl = Boolean(olderThanMs) && now - Date.parse(entry.lastIndexedAt) > olderThanMs;
      return staleByList || staleByTtl;
    });

    if (stale.length === 0) {
      process.stdout.write("no stale scopes found\n");
      return;
    }
    process.stdout.write(`stale scopes (${stale.length}):\n`);
    for (const entry of stale) {
      process.stdout.write(` - ${entry.scopeName} lastIndexedAt=${entry.lastIndexedAt}\n`);
    }
    if (!opts.apply) {
      process.stdout.write("dry-run only. pass --apply to delete these scopes.\n");
      return;
    }

    let deleted = 0;
    for (const { scopeName } of stale) {
      const projectId = config.project.id;
      const scope = { projectId, scopeName, scopeId: `${projectId}:${scopeName}` };
      try {
        await vectorStore.deleteScope(scope);
        deleted += 1;
      } catch (error) {
        // One failed deletion is reported and does not stop the remaining ones.
        process.stdout.write(`failed to delete scope ${scopeName}: ${reasonOf(error)}\n`);
      }
    }
    process.stdout.write(`deleted scopes: ${deleted}\n`);
    if (baseScope.scopeName === "main") {
      process.stdout.write("main scope retained\n");
    }
  });
3652
// `searchsocket doctor`: collects a list of named checks (config parsing, required
// env vars, source inputs, provider/backend connectivity, write access), prints one
// PASS/FAIL line per check, and sets a non-zero exit code when any check failed.
program
  .command("doctor")
  .description("Validate config, env vars, provider connectivity, and local write access")
  .action(async (_opts, command) => {
    const rootOpts = getRootOptions(command);
    const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
    const checks = [];

    // Turns a caught value into the text shown next to a FAIL line.
    const describeError = (error, fallback = "unknown error") =>
      error instanceof Error ? error.message : fallback;

    // 1. Config must parse; every later check depends on it.
    let config = null;
    try {
      config = await loadConfig({ cwd, configPath: rootOpts?.config });
      checks.push({ name: "config parse", ok: true });
    } catch (error) {
      checks.push({ name: "config parse", ok: false, details: describeError(error) });
    }

    if (config) {
      // 2. Embeddings API key env var.
      const embKey = process.env[config.embeddings.apiKeyEnv];
      checks.push({
        name: `env ${config.embeddings.apiKeyEnv}`,
        ok: Boolean(embKey),
        details: embKey ? undefined : "missing"
      });

      // 3. Informational only: reports whether the remote URL env var is set or the
      //    local file path is used. This check is always recorded as passing.
      const tursoUrl = process.env[config.vector.turso.urlEnv];
      checks.push({
        name: "turso/libsql",
        ok: true,
        details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
      });

      // 4. Reranker API key, only when the jina provider is configured.
      if (config.rerank.provider === "jina") {
        const jinaKey = process.env[config.rerank.jina.apiKeyEnv];
        checks.push({
          name: `env ${config.rerank.jina.apiKeyEnv}`,
          ok: Boolean(jinaKey),
          details: jinaKey ? undefined : "missing"
        });
      }

      // 5. Source-mode specific input checks. Unlisted modes add no check.
      switch (config.source.mode) {
        case "static-output": {
          const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
          const exists = fs9.existsSync(outputDir);
          checks.push({
            name: "source: static output dir",
            ok: exists,
            details: exists ? outputDir : `${outputDir} not found (run your build first)`
          });
          break;
        }
        case "build": {
          const buildConfig = config.source.build;
          if (!buildConfig) {
            checks.push({
              name: "source: build config",
              ok: false,
              details: "source.build config missing"
            });
            break;
          }
          const manifestPath = path13.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
          const manifestExists = fs9.existsSync(manifestPath);
          checks.push({
            name: "source: build manifest",
            ok: manifestExists,
            details: manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
          });
          const viteBin = path13.resolve(cwd, "node_modules", ".bin", "vite");
          const viteExists = fs9.existsSync(viteBin);
          checks.push({
            name: "source: vite binary",
            ok: viteExists,
            details: viteExists ? viteBin : `${viteBin} not found (install vite)`
          });
          break;
        }
        case "content-files": {
          const contentConfig = config.source.contentFiles;
          if (!contentConfig) {
            checks.push({
              name: "source: content files",
              ok: false,
              details: "source.contentFiles config missing"
            });
            break;
          }
          // fast-glob is loaded lazily, only for this mode.
          const { default: glob } = await import("fast-glob");
          const baseDir = path13.resolve(cwd, contentConfig.baseDir);
          const files = await glob(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
          const matched = files.length > 0;
          checks.push({
            name: "source: content files",
            ok: matched,
            details: matched
              ? `${files.length} files matched`
              : `no files matched globs ${contentConfig.globs.join(", ")} in ${baseDir}`
          });
          break;
        }
        default:
          break;
      }

      // 6. Embedding provider: embed a single short string with the configured model.
      try {
        const provider = createEmbeddingsProvider(config);
        await provider.embedTexts(["searchsocket doctor ping"], config.embeddings.model);
        checks.push({ name: "embedding provider connectivity", ok: true });
      } catch (error) {
        checks.push({
          name: "embedding provider connectivity",
          ok: false,
          details: describeError(error)
        });
      }

      // 7. Vector backend: create the store and report its health() result.
      let store = null;
      try {
        store = await createVectorStore(config, cwd);
        const { ok, details } = await store.health();
        checks.push({ name: "vector backend connectivity", ok, details });
      } catch (error) {
        checks.push({
          name: "vector backend connectivity",
          ok: false,
          details: describeError(error)
        });
      }

      // 8. Vector backend write access: record a probe scope, then delete it again.
      //    Skipped when the store could not be created.
      if (store) {
        try {
          const probeName = "_searchsocket_doctor_probe";
          const probeScope = {
            projectId: config.project.id,
            scopeName: probeName,
            scopeId: `${config.project.id}:${probeName}`
          };
          await store.recordScope({
            projectId: probeScope.projectId,
            scopeName: probeScope.scopeName,
            modelId: config.embeddings.model,
            lastIndexedAt: new Date().toISOString(),
            vectorCount: 0
          });
          await store.deleteScope(probeScope);
          checks.push({ name: "vector backend write permission", ok: true });
        } catch (error) {
          checks.push({
            name: "vector backend write permission",
            ok: false,
            details: describeError(error, "write test failed")
          });
        }
      }

      // 9. State directory: write and remove a small marker file.
      try {
        const scope = resolveScope(config);
        const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
        const markerPath = path13.join(statePath, ".write-test");
        await fsp.writeFile(markerPath, "ok\n", "utf8");
        await fsp.rm(markerPath, { force: true });
        checks.push({ name: "state directory writable", ok: true });
      } catch (error) {
        checks.push({
          name: "state directory writable",
          ok: false,
          details: describeError(error)
        });
      }
    }

    // Report: one line per check, with details in parentheses when present.
    for (const { ok, name, details } of checks) {
      process.stdout.write(`${ok ? "PASS" : "FAIL"} ${name}`);
      if (details) {
        process.stdout.write(` (${details})`);
      }
      process.stdout.write("\n");
    }

    // Any failed check results in exit code 1.
    if (checks.some((check) => !check.ok)) {
      process.exitCode = 1;
    }
  });
3822
// `searchsocket mcp`: starts the MCP server with the transport, port and path
// given on the command line (defaults: stdio, 3338, /mcp).
program
  .command("mcp")
  .description("Run SearchSocket MCP server")
  .option("--transport <transport>", "stdio|http", "stdio")
  .option("--port <n>", "HTTP port", "3338")
  .option("--path <path>", "HTTP path", "/mcp")
  .action(async (opts, command) => {
    const globalOpts = getRootOptions(command);
    const workDir = path13.resolve(globalOpts?.cwd ?? process.cwd());
    const { transport, port, path: httpPath } = opts;
    // --port arrives as a string; parsePositiveInt converts it before the server starts.
    const serverOptions = {
      cwd: workDir,
      configPath: globalOpts?.config,
      transport,
      httpPort: parsePositiveInt(port, "--port"),
      httpPath
    };
    await runMcpServer(serverOptions);
  });
3833
// `searchsocket search`: runs one query through SearchEngine and prints the
// result as pretty-printed JSON on stdout.
program
  .command("search")
  .description("Quick local CLI search against indexed vectors")
  .requiredOption("--q <query>", "search query")
  .option("--scope <name>", "scope override")
  .option("--top-k <n>", "top K results", "10")
  .option("--path-prefix <prefix>", "path prefix filter")
  .option("--rerank", "enable configured reranker", false)
  .action(async (opts, command) => {
    const globalOpts = getRootOptions(command);
    const workDir = path13.resolve(globalOpts?.cwd ?? process.cwd());
    const engine = await SearchEngine.create({
      cwd: workDir,
      configPath: globalOpts?.config
    });
    // The request is assembled after the engine exists, so --top-k is parsed at
    // the same point as before.
    const request = {
      q: opts.q,
      scope: opts.scope,
      topK: parsePositiveInt(opts.topK, "--top-k"),
      pathPrefix: opts.pathPrefix,
      rerank: opts.rerank
    };
    const result = await engine.search(request);
    const rendered = JSON.stringify(result, null, 2);
    process.stdout.write(`${rendered}\n`);
  });
3850
/**
 * CLI entry point: loads `.env` from the process working directory, then lets
 * commander parse `process.argv` and run the matching command.
 *
 * @returns {Promise<void>} settles once argument parsing (and the awaited command action) completes
 */
async function main() {
  const envFile = path13.resolve(process.cwd(), ".env");
  dotenvConfig({ path: envFile });
  await program.parseAsync(process.argv);
}
3854
// Run the CLI. Any rejection is reported on stderr as a single line, and the
// process is terminated with exit code 1.
main().catch((failure) => {
  let reason;
  if (failure instanceof Error) {
    reason = failure.message;
  } else {
    reason = String(failure);
  }
  process.stderr.write(`searchsocket error: ${reason}\n`);
  process.exit(1);
});
3860
+ //# sourceMappingURL=cli.js.map