searchsocket 0.3.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -39
- package/dist/cli.js +947 -1378
- package/dist/client.cjs +45 -0
- package/dist/client.d.cts +3 -2
- package/dist/client.d.ts +3 -2
- package/dist/client.js +45 -1
- package/dist/index.cjs +909 -1286
- package/dist/index.d.cts +73 -33
- package/dist/index.d.ts +73 -33
- package/dist/index.js +906 -1281
- package/dist/plugin-B_npJSux.d.cts +36 -0
- package/dist/plugin-M-aW0ev6.d.ts +36 -0
- package/dist/scroll.cjs +185 -0
- package/dist/scroll.d.cts +42 -0
- package/dist/scroll.d.ts +42 -0
- package/dist/scroll.js +183 -0
- package/dist/sveltekit.cjs +997 -1204
- package/dist/sveltekit.d.cts +3 -43
- package/dist/sveltekit.d.ts +3 -43
- package/dist/sveltekit.js +995 -1202
- package/dist/{types-BrG6XTUU.d.cts → types-Dk43uz25.d.cts} +50 -109
- package/dist/{types-BrG6XTUU.d.ts → types-Dk43uz25.d.ts} +50 -109
- package/package.json +10 -3
package/dist/cli.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
|
-
import
|
|
4
|
+
import fs8 from "fs";
|
|
5
5
|
import fsp from "fs/promises";
|
|
6
|
-
import
|
|
6
|
+
import path12 from "path";
|
|
7
7
|
import { execSync as execSync2 } from "child_process";
|
|
8
8
|
import { config as dotenvConfig } from "dotenv";
|
|
9
9
|
import chokidar from "chokidar";
|
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.
|
|
15
|
+
version: "0.5.0",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -58,6 +58,11 @@ var package_default = {
|
|
|
58
58
|
types: "./dist/client.d.ts",
|
|
59
59
|
import: "./dist/client.js",
|
|
60
60
|
require: "./dist/client.cjs"
|
|
61
|
+
},
|
|
62
|
+
"./scroll": {
|
|
63
|
+
types: "./dist/scroll.d.ts",
|
|
64
|
+
import: "./dist/scroll.js",
|
|
65
|
+
require: "./dist/scroll.cjs"
|
|
61
66
|
}
|
|
62
67
|
},
|
|
63
68
|
scripts: {
|
|
@@ -65,15 +70,16 @@ var package_default = {
|
|
|
65
70
|
clean: "rm -rf dist",
|
|
66
71
|
typecheck: "tsc --noEmit",
|
|
67
72
|
test: "vitest run",
|
|
68
|
-
"test:watch": "vitest"
|
|
73
|
+
"test:watch": "vitest",
|
|
74
|
+
"test:quality": "SEARCHSOCKET_QUALITY_TESTS=1 vitest run tests/quality.test.ts"
|
|
69
75
|
},
|
|
70
76
|
engines: {
|
|
71
77
|
node: ">=20"
|
|
72
78
|
},
|
|
73
79
|
packageManager: "pnpm@10.29.2",
|
|
74
80
|
dependencies: {
|
|
75
|
-
"@libsql/client": "^0.17.0",
|
|
76
81
|
"@modelcontextprotocol/sdk": "^1.26.0",
|
|
82
|
+
"@upstash/search": "^0.1.7",
|
|
77
83
|
cheerio: "^1.2.0",
|
|
78
84
|
chokidar: "^5.0.0",
|
|
79
85
|
commander: "^14.0.3",
|
|
@@ -91,6 +97,7 @@ var package_default = {
|
|
|
91
97
|
"@types/express": "^5.0.6",
|
|
92
98
|
"@types/node": "^25.2.2",
|
|
93
99
|
"@types/turndown": "^5.0.6",
|
|
100
|
+
jsdom: "^28.1.0",
|
|
94
101
|
tsup: "^8.5.1",
|
|
95
102
|
typescript: "^5.9.3",
|
|
96
103
|
vitest: "^4.0.18"
|
|
@@ -115,6 +122,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
115
122
|
envVar: z.string().min(1).optional(),
|
|
116
123
|
sanitize: z.boolean().optional()
|
|
117
124
|
}).optional(),
|
|
125
|
+
exclude: z.array(z.string()).optional(),
|
|
126
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
118
127
|
source: z.object({
|
|
119
128
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
120
129
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -162,29 +171,18 @@ var searchSocketConfigSchema = z.object({
|
|
|
162
171
|
prependTitle: z.boolean().optional(),
|
|
163
172
|
pageSummaryChunk: z.boolean().optional()
|
|
164
173
|
}).optional(),
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
batchSize: z.number().int().positive().optional(),
|
|
171
|
-
concurrency: z.number().int().positive().optional(),
|
|
172
|
-
pricePer1kTokens: z.number().positive().optional()
|
|
173
|
-
}).optional(),
|
|
174
|
-
vector: z.object({
|
|
175
|
-
dimension: z.number().int().positive().optional(),
|
|
176
|
-
turso: z.object({
|
|
177
|
-
url: z.string().url().optional(),
|
|
178
|
-
authToken: z.string().min(1).optional(),
|
|
179
|
-
urlEnv: z.string().optional(),
|
|
180
|
-
authTokenEnv: z.string().optional(),
|
|
181
|
-
localPath: z.string().optional()
|
|
182
|
-
}).optional()
|
|
174
|
+
upstash: z.object({
|
|
175
|
+
url: z.string().url().optional(),
|
|
176
|
+
token: z.string().min(1).optional(),
|
|
177
|
+
urlEnv: z.string().min(1).optional(),
|
|
178
|
+
tokenEnv: z.string().min(1).optional()
|
|
183
179
|
}).optional(),
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
180
|
+
search: z.object({
|
|
181
|
+
semanticWeight: z.number().min(0).max(1).optional(),
|
|
182
|
+
inputEnrichment: z.boolean().optional(),
|
|
183
|
+
reranking: z.boolean().optional(),
|
|
184
|
+
dualSearch: z.boolean().optional(),
|
|
185
|
+
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
188
186
|
}).optional(),
|
|
189
187
|
ranking: z.object({
|
|
190
188
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -194,11 +192,12 @@ var searchSocketConfigSchema = z.object({
|
|
|
194
192
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
195
193
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
196
194
|
minScore: z.number().min(0).max(1).optional(),
|
|
195
|
+
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
197
196
|
weights: z.object({
|
|
198
197
|
incomingLinks: z.number().optional(),
|
|
199
198
|
depth: z.number().optional(),
|
|
200
|
-
|
|
201
|
-
|
|
199
|
+
aggregation: z.number().optional(),
|
|
200
|
+
titleMatch: z.number().optional()
|
|
202
201
|
}).optional()
|
|
203
202
|
}).optional(),
|
|
204
203
|
api: z.object({
|
|
@@ -220,8 +219,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
220
219
|
}).optional()
|
|
221
220
|
}).optional(),
|
|
222
221
|
state: z.object({
|
|
223
|
-
dir: z.string().optional()
|
|
224
|
-
writeMirror: z.boolean().optional()
|
|
222
|
+
dir: z.string().optional()
|
|
225
223
|
}).optional()
|
|
226
224
|
});
|
|
227
225
|
|
|
@@ -245,6 +243,8 @@ function createDefaultConfig(projectId) {
|
|
|
245
243
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
246
244
|
sanitize: true
|
|
247
245
|
},
|
|
246
|
+
exclude: [],
|
|
247
|
+
respectRobotsTxt: true,
|
|
248
248
|
source: {
|
|
249
249
|
mode: "static-output",
|
|
250
250
|
staticOutputDir: "build",
|
|
@@ -273,24 +273,16 @@ function createDefaultConfig(projectId) {
|
|
|
273
273
|
prependTitle: true,
|
|
274
274
|
pageSummaryChunk: true
|
|
275
275
|
},
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
280
|
-
batchSize: 64,
|
|
281
|
-
concurrency: 4
|
|
276
|
+
upstash: {
|
|
277
|
+
urlEnv: "UPSTASH_SEARCH_REST_URL",
|
|
278
|
+
tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
|
|
282
279
|
},
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
},
|
|
290
|
-
rerank: {
|
|
291
|
-
enabled: false,
|
|
292
|
-
topN: 20,
|
|
293
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
280
|
+
search: {
|
|
281
|
+
semanticWeight: 0.75,
|
|
282
|
+
inputEnrichment: true,
|
|
283
|
+
reranking: true,
|
|
284
|
+
dualSearch: true,
|
|
285
|
+
pageSearchWeight: 0.3
|
|
294
286
|
},
|
|
295
287
|
ranking: {
|
|
296
288
|
enableIncomingLinkBoost: true,
|
|
@@ -299,12 +291,13 @@ function createDefaultConfig(projectId) {
|
|
|
299
291
|
aggregationCap: 5,
|
|
300
292
|
aggregationDecay: 0.5,
|
|
301
293
|
minChunkScoreRatio: 0.5,
|
|
302
|
-
minScore: 0,
|
|
294
|
+
minScore: 0.3,
|
|
295
|
+
scoreGapThreshold: 0.4,
|
|
303
296
|
weights: {
|
|
304
297
|
incomingLinks: 0.05,
|
|
305
298
|
depth: 0.03,
|
|
306
|
-
|
|
307
|
-
|
|
299
|
+
aggregation: 0.1,
|
|
300
|
+
titleMatch: 0.15
|
|
308
301
|
}
|
|
309
302
|
},
|
|
310
303
|
api: {
|
|
@@ -322,8 +315,7 @@ function createDefaultConfig(projectId) {
|
|
|
322
315
|
}
|
|
323
316
|
},
|
|
324
317
|
state: {
|
|
325
|
-
dir: ".searchsocket"
|
|
326
|
-
writeMirror: false
|
|
318
|
+
dir: ".searchsocket"
|
|
327
319
|
}
|
|
328
320
|
};
|
|
329
321
|
}
|
|
@@ -393,6 +385,8 @@ ${issues}`
|
|
|
393
385
|
...defaults.scope,
|
|
394
386
|
...parsed.scope
|
|
395
387
|
},
|
|
388
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
389
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
396
390
|
source: {
|
|
397
391
|
...defaults.source,
|
|
398
392
|
...parsed.source,
|
|
@@ -429,21 +423,13 @@ ${issues}`
|
|
|
429
423
|
...defaults.chunking,
|
|
430
424
|
...parsed.chunking
|
|
431
425
|
},
|
|
432
|
-
|
|
433
|
-
...defaults.
|
|
434
|
-
...parsed.
|
|
435
|
-
},
|
|
436
|
-
vector: {
|
|
437
|
-
...defaults.vector,
|
|
438
|
-
...parsed.vector,
|
|
439
|
-
turso: {
|
|
440
|
-
...defaults.vector.turso,
|
|
441
|
-
...parsed.vector?.turso
|
|
442
|
-
}
|
|
426
|
+
upstash: {
|
|
427
|
+
...defaults.upstash,
|
|
428
|
+
...parsed.upstash
|
|
443
429
|
},
|
|
444
|
-
|
|
445
|
-
...defaults.
|
|
446
|
-
...parsed.
|
|
430
|
+
search: {
|
|
431
|
+
...defaults.search,
|
|
432
|
+
...parsed.search
|
|
447
433
|
},
|
|
448
434
|
ranking: {
|
|
449
435
|
...defaults.ranking,
|
|
@@ -535,7 +521,8 @@ function writeMinimalConfig(cwd) {
|
|
|
535
521
|
return target;
|
|
536
522
|
}
|
|
537
523
|
const content = `export default {
|
|
538
|
-
|
|
524
|
+
// Upstash Search credentials (set via env vars or directly here)
|
|
525
|
+
// upstash: { urlEnv: "UPSTASH_SEARCH_REST_URL", tokenEnv: "UPSTASH_SEARCH_REST_TOKEN" }
|
|
539
526
|
};
|
|
540
527
|
`;
|
|
541
528
|
fs.writeFileSync(target, content, "utf8");
|
|
@@ -698,576 +685,246 @@ import fs2 from "fs";
|
|
|
698
685
|
import path2 from "path";
|
|
699
686
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
700
687
|
const statePath = path2.resolve(cwd, stateDir);
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
return { statePath, pagesPath };
|
|
704
|
-
}
|
|
705
|
-
|
|
706
|
-
// src/embeddings/jina.ts
|
|
707
|
-
import pLimit from "p-limit";
|
|
708
|
-
function sleep(ms) {
|
|
709
|
-
return new Promise((resolve) => {
|
|
710
|
-
setTimeout(resolve, ms);
|
|
711
|
-
});
|
|
712
|
-
}
|
|
713
|
-
var JinaEmbeddingsProvider = class {
|
|
714
|
-
apiKey;
|
|
715
|
-
batchSize;
|
|
716
|
-
concurrency;
|
|
717
|
-
defaultTask;
|
|
718
|
-
constructor(options) {
|
|
719
|
-
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
720
|
-
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
721
|
-
}
|
|
722
|
-
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
723
|
-
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
724
|
-
}
|
|
725
|
-
this.apiKey = options.apiKey;
|
|
726
|
-
this.batchSize = options.batchSize;
|
|
727
|
-
this.concurrency = options.concurrency;
|
|
728
|
-
this.defaultTask = options.task ?? "retrieval.passage";
|
|
729
|
-
}
|
|
730
|
-
estimateTokens(text) {
|
|
731
|
-
const normalized = text.trim();
|
|
732
|
-
if (!normalized) {
|
|
733
|
-
return 0;
|
|
734
|
-
}
|
|
735
|
-
const wordCount = normalized.match(/[A-Za-z0-9_]+/g)?.length ?? 0;
|
|
736
|
-
const punctuationCount = normalized.match(/[^\s\w]/g)?.length ?? 0;
|
|
737
|
-
const cjkCount = normalized.match(/[\u3400-\u9fff]/g)?.length ?? 0;
|
|
738
|
-
const charEstimate = Math.ceil(normalized.length / 4);
|
|
739
|
-
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
740
|
-
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
741
|
-
}
|
|
742
|
-
async embedTexts(texts, modelId, task) {
|
|
743
|
-
if (texts.length === 0) {
|
|
744
|
-
return [];
|
|
745
|
-
}
|
|
746
|
-
const batches = [];
|
|
747
|
-
for (let i = 0; i < texts.length; i += this.batchSize) {
|
|
748
|
-
batches.push({
|
|
749
|
-
index: i,
|
|
750
|
-
values: texts.slice(i, i + this.batchSize)
|
|
751
|
-
});
|
|
752
|
-
}
|
|
753
|
-
const outputs = new Array(batches.length);
|
|
754
|
-
const limit = pLimit(this.concurrency);
|
|
755
|
-
await Promise.all(
|
|
756
|
-
batches.map(
|
|
757
|
-
(batch, position) => limit(async () => {
|
|
758
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
759
|
-
})
|
|
760
|
-
)
|
|
761
|
-
);
|
|
762
|
-
return outputs.flat();
|
|
763
|
-
}
|
|
764
|
-
async embedWithRetry(texts, modelId, task) {
|
|
765
|
-
const maxAttempts = 5;
|
|
766
|
-
let attempt = 0;
|
|
767
|
-
while (attempt < maxAttempts) {
|
|
768
|
-
attempt += 1;
|
|
769
|
-
let response;
|
|
770
|
-
try {
|
|
771
|
-
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
772
|
-
method: "POST",
|
|
773
|
-
headers: {
|
|
774
|
-
"content-type": "application/json",
|
|
775
|
-
authorization: `Bearer ${this.apiKey}`
|
|
776
|
-
},
|
|
777
|
-
body: JSON.stringify({
|
|
778
|
-
model: modelId,
|
|
779
|
-
input: texts,
|
|
780
|
-
task
|
|
781
|
-
})
|
|
782
|
-
});
|
|
783
|
-
} catch (error) {
|
|
784
|
-
if (attempt >= maxAttempts) {
|
|
785
|
-
throw error;
|
|
786
|
-
}
|
|
787
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
788
|
-
continue;
|
|
789
|
-
}
|
|
790
|
-
if (!response.ok) {
|
|
791
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
792
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
793
|
-
const errorBody = await response.text();
|
|
794
|
-
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
795
|
-
}
|
|
796
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
797
|
-
continue;
|
|
798
|
-
}
|
|
799
|
-
const payload = await response.json();
|
|
800
|
-
if (!payload.data || !Array.isArray(payload.data)) {
|
|
801
|
-
throw new Error("Invalid Jina embeddings response format");
|
|
802
|
-
}
|
|
803
|
-
return payload.data.map((entry) => entry.embedding);
|
|
804
|
-
}
|
|
805
|
-
throw new Error("Unreachable retry state");
|
|
806
|
-
}
|
|
807
|
-
};
|
|
808
|
-
|
|
809
|
-
// src/embeddings/factory.ts
|
|
810
|
-
function createEmbeddingsProvider(config) {
|
|
811
|
-
if (config.embeddings.provider !== "jina") {
|
|
812
|
-
throw new SearchSocketError(
|
|
813
|
-
"CONFIG_MISSING",
|
|
814
|
-
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
815
|
-
);
|
|
816
|
-
}
|
|
817
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
818
|
-
if (!apiKey) {
|
|
819
|
-
throw new SearchSocketError(
|
|
820
|
-
"CONFIG_MISSING",
|
|
821
|
-
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
822
|
-
);
|
|
823
|
-
}
|
|
824
|
-
return new JinaEmbeddingsProvider({
|
|
825
|
-
apiKey,
|
|
826
|
-
batchSize: config.embeddings.batchSize,
|
|
827
|
-
concurrency: config.embeddings.concurrency
|
|
828
|
-
});
|
|
688
|
+
fs2.mkdirSync(statePath, { recursive: true });
|
|
689
|
+
return { statePath };
|
|
829
690
|
}
|
|
830
691
|
|
|
831
692
|
// src/indexing/pipeline.ts
|
|
832
|
-
import
|
|
833
|
-
|
|
834
|
-
// src/vector/factory.ts
|
|
835
|
-
import fs3 from "fs";
|
|
836
|
-
import path3 from "path";
|
|
693
|
+
import path10 from "path";
|
|
837
694
|
|
|
838
|
-
// src/
|
|
839
|
-
function
|
|
840
|
-
return
|
|
695
|
+
// src/vector/upstash.ts
|
|
696
|
+
function chunkIndexName(scope) {
|
|
697
|
+
return `${scope.projectId}--${scope.scopeName}`;
|
|
841
698
|
}
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
699
|
+
function pageIndexName(scope) {
|
|
700
|
+
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
701
|
+
}
|
|
702
|
+
var UpstashSearchStore = class {
|
|
845
703
|
client;
|
|
846
|
-
dimension;
|
|
847
|
-
chunksReady = false;
|
|
848
|
-
registryReady = false;
|
|
849
|
-
pagesReady = false;
|
|
850
704
|
constructor(opts) {
|
|
851
705
|
this.client = opts.client;
|
|
852
|
-
this.dimension = opts.dimension;
|
|
853
|
-
}
|
|
854
|
-
async ensureRegistry() {
|
|
855
|
-
if (this.registryReady) return;
|
|
856
|
-
await this.client.execute(`
|
|
857
|
-
CREATE TABLE IF NOT EXISTS registry (
|
|
858
|
-
scope_key TEXT PRIMARY KEY,
|
|
859
|
-
project_id TEXT NOT NULL,
|
|
860
|
-
scope_name TEXT NOT NULL,
|
|
861
|
-
model_id TEXT NOT NULL,
|
|
862
|
-
last_indexed_at TEXT NOT NULL,
|
|
863
|
-
vector_count INTEGER,
|
|
864
|
-
last_estimate_tokens INTEGER,
|
|
865
|
-
last_estimate_cost_usd REAL,
|
|
866
|
-
last_estimate_changed_chunks INTEGER
|
|
867
|
-
)
|
|
868
|
-
`);
|
|
869
|
-
const estimateCols = [
|
|
870
|
-
{ name: "last_estimate_tokens", def: "INTEGER" },
|
|
871
|
-
{ name: "last_estimate_cost_usd", def: "REAL" },
|
|
872
|
-
{ name: "last_estimate_changed_chunks", def: "INTEGER" }
|
|
873
|
-
];
|
|
874
|
-
for (const col of estimateCols) {
|
|
875
|
-
try {
|
|
876
|
-
await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
|
|
877
|
-
} catch (error) {
|
|
878
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
879
|
-
throw error;
|
|
880
|
-
}
|
|
881
|
-
}
|
|
882
|
-
}
|
|
883
|
-
this.registryReady = true;
|
|
884
|
-
}
|
|
885
|
-
async ensureChunks(dim) {
|
|
886
|
-
if (this.chunksReady) return;
|
|
887
|
-
const exists = await this.chunksTableExists();
|
|
888
|
-
if (exists) {
|
|
889
|
-
const currentDim = await this.getChunksDimension();
|
|
890
|
-
if (currentDim !== null && currentDim !== dim) {
|
|
891
|
-
await this.client.batch([
|
|
892
|
-
"DROP INDEX IF EXISTS idx",
|
|
893
|
-
"DROP TABLE IF EXISTS chunks"
|
|
894
|
-
]);
|
|
895
|
-
}
|
|
896
|
-
}
|
|
897
|
-
await this.client.batch([
|
|
898
|
-
`CREATE TABLE IF NOT EXISTS chunks (
|
|
899
|
-
id TEXT PRIMARY KEY,
|
|
900
|
-
project_id TEXT NOT NULL,
|
|
901
|
-
scope_name TEXT NOT NULL,
|
|
902
|
-
url TEXT NOT NULL,
|
|
903
|
-
path TEXT NOT NULL,
|
|
904
|
-
title TEXT NOT NULL,
|
|
905
|
-
section_title TEXT NOT NULL DEFAULT '',
|
|
906
|
-
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
907
|
-
snippet TEXT NOT NULL DEFAULT '',
|
|
908
|
-
chunk_text TEXT NOT NULL DEFAULT '',
|
|
909
|
-
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
910
|
-
content_hash TEXT NOT NULL DEFAULT '',
|
|
911
|
-
model_id TEXT NOT NULL DEFAULT '',
|
|
912
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
913
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
914
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
915
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
916
|
-
description TEXT NOT NULL DEFAULT '',
|
|
917
|
-
keywords TEXT NOT NULL DEFAULT '[]',
|
|
918
|
-
embedding F32_BLOB(${dim})
|
|
919
|
-
)`,
|
|
920
|
-
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
921
|
-
]);
|
|
922
|
-
this.chunksReady = true;
|
|
923
|
-
}
|
|
924
|
-
async ensurePages() {
|
|
925
|
-
if (this.pagesReady) return;
|
|
926
|
-
await this.client.execute(`
|
|
927
|
-
CREATE TABLE IF NOT EXISTS pages (
|
|
928
|
-
project_id TEXT NOT NULL,
|
|
929
|
-
scope_name TEXT NOT NULL,
|
|
930
|
-
url TEXT NOT NULL,
|
|
931
|
-
title TEXT NOT NULL,
|
|
932
|
-
markdown TEXT NOT NULL,
|
|
933
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
934
|
-
route_resolution TEXT NOT NULL DEFAULT 'exact',
|
|
935
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
936
|
-
outgoing_links INTEGER NOT NULL DEFAULT 0,
|
|
937
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
938
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
939
|
-
indexed_at TEXT NOT NULL,
|
|
940
|
-
PRIMARY KEY (project_id, scope_name, url)
|
|
941
|
-
)
|
|
942
|
-
`);
|
|
943
|
-
this.pagesReady = true;
|
|
944
706
|
}
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
|
|
948
|
-
return true;
|
|
949
|
-
} catch (error) {
|
|
950
|
-
if (error instanceof Error && error.message.includes("no such table")) {
|
|
951
|
-
return false;
|
|
952
|
-
}
|
|
953
|
-
throw error;
|
|
954
|
-
}
|
|
707
|
+
chunkIndex(scope) {
|
|
708
|
+
return this.client.index(chunkIndexName(scope));
|
|
955
709
|
}
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
959
|
-
*/
|
|
960
|
-
async getChunksDimension() {
|
|
961
|
-
try {
|
|
962
|
-
const rs = await this.client.execute(
|
|
963
|
-
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
964
|
-
);
|
|
965
|
-
if (rs.rows.length === 0) return null;
|
|
966
|
-
const sql = rs.rows[0].sql;
|
|
967
|
-
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
968
|
-
return match ? parseInt(match[1], 10) : null;
|
|
969
|
-
} catch {
|
|
970
|
-
return null;
|
|
971
|
-
}
|
|
710
|
+
pageIndex(scope) {
|
|
711
|
+
return this.client.index(pageIndexName(scope));
|
|
972
712
|
}
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
*/
|
|
977
|
-
async dropAllTables() {
|
|
978
|
-
await this.client.batch([
|
|
979
|
-
"DROP INDEX IF EXISTS idx",
|
|
980
|
-
"DROP TABLE IF EXISTS chunks",
|
|
981
|
-
"DROP TABLE IF EXISTS registry",
|
|
982
|
-
"DROP TABLE IF EXISTS pages"
|
|
983
|
-
]);
|
|
984
|
-
this.chunksReady = false;
|
|
985
|
-
this.registryReady = false;
|
|
986
|
-
this.pagesReady = false;
|
|
987
|
-
}
|
|
988
|
-
async upsert(records, _scope) {
|
|
989
|
-
if (records.length === 0) return;
|
|
990
|
-
const dim = this.dimension ?? records[0].vector.length;
|
|
991
|
-
await this.ensureChunks(dim);
|
|
713
|
+
async upsertChunks(chunks, scope) {
|
|
714
|
+
if (chunks.length === 0) return;
|
|
715
|
+
const index = this.chunkIndex(scope);
|
|
992
716
|
const BATCH_SIZE = 100;
|
|
993
|
-
for (let i = 0; i <
|
|
994
|
-
const batch =
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
r.metadata.title,
|
|
1008
|
-
r.metadata.sectionTitle,
|
|
1009
|
-
JSON.stringify(r.metadata.headingPath),
|
|
1010
|
-
r.metadata.snippet,
|
|
1011
|
-
r.metadata.chunkText,
|
|
1012
|
-
r.metadata.ordinal,
|
|
1013
|
-
r.metadata.contentHash,
|
|
1014
|
-
r.metadata.modelId,
|
|
1015
|
-
r.metadata.depth,
|
|
1016
|
-
r.metadata.incomingLinks,
|
|
1017
|
-
r.metadata.routeFile,
|
|
1018
|
-
JSON.stringify(r.metadata.tags),
|
|
1019
|
-
r.metadata.description ?? "",
|
|
1020
|
-
JSON.stringify(r.metadata.keywords ?? []),
|
|
1021
|
-
JSON.stringify(r.vector)
|
|
1022
|
-
]
|
|
1023
|
-
}));
|
|
1024
|
-
await this.client.batch(stmts);
|
|
1025
|
-
}
|
|
1026
|
-
}
|
|
1027
|
-
async query(queryVector, opts, scope) {
|
|
1028
|
-
const dim = this.dimension ?? queryVector.length;
|
|
1029
|
-
await this.ensureChunks(dim);
|
|
1030
|
-
const queryJson = JSON.stringify(queryVector);
|
|
1031
|
-
const rs = await this.client.execute({
|
|
1032
|
-
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
1033
|
-
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
1034
|
-
c.ordinal, c.content_hash,
|
|
1035
|
-
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
1036
|
-
c.description, c.keywords,
|
|
1037
|
-
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
1038
|
-
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
1039
|
-
JOIN chunks AS c ON c.rowid = v.id`,
|
|
1040
|
-
args: [queryJson, queryJson, opts.topK]
|
|
717
|
+
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
718
|
+
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
719
|
+
await index.upsert(batch);
|
|
720
|
+
}
|
|
721
|
+
}
|
|
722
|
+
async search(query, opts, scope) {
|
|
723
|
+
const index = this.chunkIndex(scope);
|
|
724
|
+
const results = await index.search({
|
|
725
|
+
query,
|
|
726
|
+
limit: opts.limit,
|
|
727
|
+
semanticWeight: opts.semanticWeight,
|
|
728
|
+
inputEnrichment: opts.inputEnrichment,
|
|
729
|
+
reranking: opts.reranking,
|
|
730
|
+
filter: opts.filter
|
|
1041
731
|
});
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
}
|
|
732
|
+
return results.map((doc) => ({
|
|
733
|
+
id: doc.id,
|
|
734
|
+
score: doc.score,
|
|
735
|
+
metadata: {
|
|
736
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
737
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
738
|
+
url: doc.content.url,
|
|
739
|
+
path: doc.metadata?.path ?? "",
|
|
740
|
+
title: doc.content.title,
|
|
741
|
+
sectionTitle: doc.content.sectionTitle,
|
|
742
|
+
headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
|
|
743
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
744
|
+
chunkText: doc.content.text,
|
|
745
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
746
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
747
|
+
depth: doc.metadata?.depth ?? 0,
|
|
748
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
749
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
750
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
751
|
+
description: doc.metadata?.description || void 0,
|
|
752
|
+
keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
|
|
1064
753
|
}
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
scopeName,
|
|
1079
|
-
url: row.url,
|
|
1080
|
-
path: rowPath,
|
|
1081
|
-
title: row.title,
|
|
1082
|
-
sectionTitle: row.section_title,
|
|
1083
|
-
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
1084
|
-
snippet: row.snippet,
|
|
1085
|
-
chunkText: row.chunk_text || "",
|
|
1086
|
-
ordinal: row.ordinal || 0,
|
|
1087
|
-
contentHash: row.content_hash,
|
|
1088
|
-
modelId: row.model_id,
|
|
1089
|
-
depth: row.depth,
|
|
1090
|
-
incomingLinks: row.incoming_links,
|
|
1091
|
-
routeFile: row.route_file,
|
|
1092
|
-
tags,
|
|
1093
|
-
description,
|
|
1094
|
-
keywords
|
|
1095
|
-
}
|
|
754
|
+
}));
|
|
755
|
+
}
|
|
756
|
+
async searchPages(query, opts, scope) {
|
|
757
|
+
const index = this.pageIndex(scope);
|
|
758
|
+
let results;
|
|
759
|
+
try {
|
|
760
|
+
results = await index.search({
|
|
761
|
+
query,
|
|
762
|
+
limit: opts.limit,
|
|
763
|
+
semanticWeight: opts.semanticWeight,
|
|
764
|
+
inputEnrichment: opts.inputEnrichment,
|
|
765
|
+
reranking: true,
|
|
766
|
+
filter: opts.filter
|
|
1096
767
|
});
|
|
768
|
+
} catch {
|
|
769
|
+
return [];
|
|
1097
770
|
}
|
|
1098
|
-
|
|
1099
|
-
|
|
771
|
+
return results.map((doc) => ({
|
|
772
|
+
id: doc.id,
|
|
773
|
+
score: doc.score,
|
|
774
|
+
title: doc.content.title,
|
|
775
|
+
url: doc.content.url,
|
|
776
|
+
description: doc.content.description ?? "",
|
|
777
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
778
|
+
depth: doc.metadata?.depth ?? 0,
|
|
779
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
780
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
781
|
+
}));
|
|
1100
782
|
}
|
|
1101
783
|
async deleteByIds(ids, scope) {
|
|
1102
784
|
if (ids.length === 0) return;
|
|
785
|
+
const index = this.chunkIndex(scope);
|
|
1103
786
|
const BATCH_SIZE = 500;
|
|
1104
787
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
1105
788
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
1106
|
-
|
|
1107
|
-
await this.client.execute({
|
|
1108
|
-
sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
|
|
1109
|
-
args: [scope.projectId, scope.scopeName, ...batch]
|
|
1110
|
-
});
|
|
789
|
+
await index.delete(batch);
|
|
1111
790
|
}
|
|
1112
791
|
}
|
|
1113
792
|
async deleteScope(scope) {
|
|
1114
|
-
await this.ensureRegistry();
|
|
1115
793
|
try {
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
});
|
|
1120
|
-
} catch (error) {
|
|
1121
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
1122
|
-
throw error;
|
|
1123
|
-
}
|
|
794
|
+
const chunkIdx = this.chunkIndex(scope);
|
|
795
|
+
await chunkIdx.deleteIndex();
|
|
796
|
+
} catch {
|
|
1124
797
|
}
|
|
1125
798
|
try {
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
});
|
|
1130
|
-
} catch (error) {
|
|
1131
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
1132
|
-
throw error;
|
|
1133
|
-
}
|
|
799
|
+
const pageIdx = this.pageIndex(scope);
|
|
800
|
+
await pageIdx.deleteIndex();
|
|
801
|
+
} catch {
|
|
1134
802
|
}
|
|
1135
|
-
await this.client.execute({
|
|
1136
|
-
sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
1137
|
-
args: [scope.projectId, scope.scopeName]
|
|
1138
|
-
});
|
|
1139
|
-
}
|
|
1140
|
-
async listScopes(scopeProjectId) {
|
|
1141
|
-
await this.ensureRegistry();
|
|
1142
|
-
const rs = await this.client.execute({
|
|
1143
|
-
sql: `SELECT project_id, scope_name, model_id, last_indexed_at, vector_count,
|
|
1144
|
-
last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks
|
|
1145
|
-
FROM registry WHERE project_id = ?`,
|
|
1146
|
-
args: [scopeProjectId]
|
|
1147
|
-
});
|
|
1148
|
-
return rs.rows.map((row) => ({
|
|
1149
|
-
projectId: row.project_id,
|
|
1150
|
-
scopeName: row.scope_name,
|
|
1151
|
-
modelId: row.model_id,
|
|
1152
|
-
lastIndexedAt: row.last_indexed_at,
|
|
1153
|
-
vectorCount: row.vector_count,
|
|
1154
|
-
lastEstimateTokens: row.last_estimate_tokens,
|
|
1155
|
-
lastEstimateCostUSD: row.last_estimate_cost_usd,
|
|
1156
|
-
lastEstimateChangedChunks: row.last_estimate_changed_chunks
|
|
1157
|
-
}));
|
|
1158
803
|
}
|
|
1159
|
-
async
|
|
1160
|
-
await this.
|
|
1161
|
-
const
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
804
|
+
async listScopes(projectId) {
|
|
805
|
+
const allIndexes = await this.client.listIndexes();
|
|
806
|
+
const prefix = `${projectId}--`;
|
|
807
|
+
const scopeNames = /* @__PURE__ */ new Set();
|
|
808
|
+
for (const name of allIndexes) {
|
|
809
|
+
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
810
|
+
const scopeName = name.slice(prefix.length);
|
|
811
|
+
scopeNames.add(scopeName);
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
const scopes = [];
|
|
815
|
+
for (const scopeName of scopeNames) {
|
|
816
|
+
const scope = {
|
|
817
|
+
projectId,
|
|
818
|
+
scopeName,
|
|
819
|
+
scopeId: `${projectId}:${scopeName}`
|
|
820
|
+
};
|
|
821
|
+
try {
|
|
822
|
+
const info = await this.chunkIndex(scope).info();
|
|
823
|
+
scopes.push({
|
|
824
|
+
projectId,
|
|
825
|
+
scopeName,
|
|
826
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
827
|
+
documentCount: info.documentCount
|
|
828
|
+
});
|
|
829
|
+
} catch {
|
|
830
|
+
scopes.push({
|
|
831
|
+
projectId,
|
|
832
|
+
scopeName,
|
|
833
|
+
lastIndexedAt: "unknown",
|
|
834
|
+
documentCount: 0
|
|
835
|
+
});
|
|
836
|
+
}
|
|
837
|
+
}
|
|
838
|
+
return scopes;
|
|
1179
839
|
}
|
|
1180
840
|
async getContentHashes(scope) {
|
|
1181
|
-
const exists = await this.chunksTableExists();
|
|
1182
|
-
if (!exists) return /* @__PURE__ */ new Map();
|
|
1183
|
-
const rs = await this.client.execute({
|
|
1184
|
-
sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
|
|
1185
|
-
args: [scope.projectId, scope.scopeName]
|
|
1186
|
-
});
|
|
1187
841
|
const map = /* @__PURE__ */ new Map();
|
|
1188
|
-
|
|
1189
|
-
|
|
842
|
+
const index = this.chunkIndex(scope);
|
|
843
|
+
let cursor = "0";
|
|
844
|
+
try {
|
|
845
|
+
for (; ; ) {
|
|
846
|
+
const result = await index.range({ cursor, limit: 100 });
|
|
847
|
+
for (const doc of result.documents) {
|
|
848
|
+
if (doc.metadata?.contentHash) {
|
|
849
|
+
map.set(doc.id, doc.metadata.contentHash);
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
853
|
+
cursor = result.nextCursor;
|
|
854
|
+
}
|
|
855
|
+
} catch {
|
|
1190
856
|
}
|
|
1191
857
|
return map;
|
|
1192
858
|
}
|
|
1193
859
|
async upsertPages(pages, scope) {
|
|
1194
860
|
if (pages.length === 0) return;
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
|
|
1198
|
-
throw new Error(
|
|
1199
|
-
`Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
|
|
1200
|
-
);
|
|
1201
|
-
}
|
|
1202
|
-
}
|
|
1203
|
-
const BATCH_SIZE = 100;
|
|
861
|
+
const index = this.pageIndex(scope);
|
|
862
|
+
const BATCH_SIZE = 50;
|
|
1204
863
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
1205
864
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
1206
|
-
const
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
p.
|
|
1213
|
-
p.
|
|
1214
|
-
p.
|
|
1215
|
-
p.
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
p.
|
|
1219
|
-
p.
|
|
1220
|
-
p.
|
|
1221
|
-
p.
|
|
1222
|
-
|
|
1223
|
-
p.
|
|
1224
|
-
|
|
865
|
+
const docs = batch.map((p) => ({
|
|
866
|
+
id: p.url,
|
|
867
|
+
content: {
|
|
868
|
+
title: p.title,
|
|
869
|
+
url: p.url,
|
|
870
|
+
type: "page",
|
|
871
|
+
description: p.description ?? "",
|
|
872
|
+
keywords: (p.keywords ?? []).join(","),
|
|
873
|
+
summary: p.summary ?? "",
|
|
874
|
+
tags: p.tags.join(",")
|
|
875
|
+
},
|
|
876
|
+
metadata: {
|
|
877
|
+
markdown: p.markdown,
|
|
878
|
+
projectId: p.projectId,
|
|
879
|
+
scopeName: p.scopeName,
|
|
880
|
+
routeFile: p.routeFile,
|
|
881
|
+
routeResolution: p.routeResolution,
|
|
882
|
+
incomingLinks: p.incomingLinks,
|
|
883
|
+
outgoingLinks: p.outgoingLinks,
|
|
884
|
+
depth: p.depth,
|
|
885
|
+
indexedAt: p.indexedAt
|
|
886
|
+
}
|
|
1225
887
|
}));
|
|
1226
|
-
await
|
|
888
|
+
await index.upsert(docs);
|
|
1227
889
|
}
|
|
1228
890
|
}
|
|
1229
891
|
async getPage(url, scope) {
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
892
|
+
const index = this.pageIndex(scope);
|
|
893
|
+
try {
|
|
894
|
+
const results = await index.fetch([url]);
|
|
895
|
+
const doc = results[0];
|
|
896
|
+
if (!doc) return null;
|
|
897
|
+
return {
|
|
898
|
+
url: doc.content.url,
|
|
899
|
+
title: doc.content.title,
|
|
900
|
+
markdown: doc.metadata.markdown,
|
|
901
|
+
projectId: doc.metadata.projectId,
|
|
902
|
+
scopeName: doc.metadata.scopeName,
|
|
903
|
+
routeFile: doc.metadata.routeFile,
|
|
904
|
+
routeResolution: doc.metadata.routeResolution,
|
|
905
|
+
incomingLinks: doc.metadata.incomingLinks,
|
|
906
|
+
outgoingLinks: doc.metadata.outgoingLinks,
|
|
907
|
+
depth: doc.metadata.depth,
|
|
908
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
909
|
+
indexedAt: doc.metadata.indexedAt,
|
|
910
|
+
summary: doc.content.summary || void 0,
|
|
911
|
+
description: doc.content.description || void 0,
|
|
912
|
+
keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
|
|
913
|
+
};
|
|
914
|
+
} catch {
|
|
915
|
+
return null;
|
|
916
|
+
}
|
|
1251
917
|
}
|
|
1252
918
|
async deletePages(scope) {
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
}
|
|
1258
|
-
}
|
|
1259
|
-
async getScopeModelId(scope) {
|
|
1260
|
-
await this.ensureRegistry();
|
|
1261
|
-
const rs = await this.client.execute({
|
|
1262
|
-
sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
1263
|
-
args: [scope.projectId, scope.scopeName]
|
|
1264
|
-
});
|
|
1265
|
-
if (rs.rows.length === 0) return null;
|
|
1266
|
-
return rs.rows[0].model_id;
|
|
919
|
+
try {
|
|
920
|
+
const index = this.pageIndex(scope);
|
|
921
|
+
await index.reset();
|
|
922
|
+
} catch {
|
|
923
|
+
}
|
|
1267
924
|
}
|
|
1268
925
|
async health() {
|
|
1269
926
|
try {
|
|
1270
|
-
await this.client.
|
|
927
|
+
await this.client.info();
|
|
1271
928
|
return { ok: true };
|
|
1272
929
|
} catch (error) {
|
|
1273
930
|
return {
|
|
@@ -1276,40 +933,34 @@ var TursoVectorStore = class {
|
|
|
1276
933
|
};
|
|
1277
934
|
}
|
|
1278
935
|
}
|
|
936
|
+
async dropAllIndexes(projectId) {
|
|
937
|
+
const allIndexes = await this.client.listIndexes();
|
|
938
|
+
const prefix = `${projectId}--`;
|
|
939
|
+
for (const name of allIndexes) {
|
|
940
|
+
if (name.startsWith(prefix)) {
|
|
941
|
+
try {
|
|
942
|
+
const index = this.client.index(name);
|
|
943
|
+
await index.deleteIndex();
|
|
944
|
+
} catch {
|
|
945
|
+
}
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
}
|
|
1279
949
|
};
|
|
1280
950
|
|
|
1281
951
|
// src/vector/factory.ts
|
|
1282
|
-
async function
|
|
1283
|
-
const
|
|
1284
|
-
const
|
|
1285
|
-
if (
|
|
1286
|
-
const { createClient: createClient2 } = await import("@libsql/client/http");
|
|
1287
|
-
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
1288
|
-
const client2 = createClient2({
|
|
1289
|
-
url: remoteUrl,
|
|
1290
|
-
authToken
|
|
1291
|
-
});
|
|
1292
|
-
return new TursoVectorStore({
|
|
1293
|
-
client: client2,
|
|
1294
|
-
dimension: config.vector.dimension
|
|
1295
|
-
});
|
|
1296
|
-
}
|
|
1297
|
-
if (isServerless()) {
|
|
952
|
+
/**
 * Builds an `UpstashSearchStore` from the `upstash` section of the config.
 *
 * The URL and token come from `config.upstash.url` / `config.upstash.token`
 * when set, otherwise from the environment variables named by
 * `config.upstash.urlEnv` / `config.upstash.tokenEnv`.
 *
 * @param {object} config - Resolved configuration with an `upstash` section.
 * @returns {Promise<UpstashSearchStore>} Store wrapping a new `Search` client.
 * @throws {SearchSocketError} Code "VECTOR_BACKEND_UNAVAILABLE" when the URL or
 *   token is missing.
 */
async function createUpstashStore(config) {
  const { upstash } = config;
  const url = upstash.url ?? process.env[upstash.urlEnv];
  const token = upstash.token ?? process.env[upstash.tokenEnv];
  const hasCredentials = Boolean(url) && Boolean(token);
  if (!hasCredentials) {
    throw new SearchSocketError(
      "VECTOR_BACKEND_UNAVAILABLE",
      `Missing Upstash Search credentials. Set ${upstash.urlEnv} and ${upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
    );
  }
  // The SDK is imported only after the credentials check has passed.
  const { Search } = await import("@upstash/search");
  return new UpstashSearchStore({ client: new Search({ url, token }) });
}
|
|
1314
965
|
|
|
1315
966
|
// src/utils/hash.ts
|
|
@@ -1322,7 +973,7 @@ function sha256(input) {
|
|
|
1322
973
|
}
|
|
1323
974
|
|
|
1324
975
|
// src/utils/path.ts
|
|
1325
|
-
import
|
|
976
|
+
import path3 from "path";
|
|
1326
977
|
function normalizeUrlPath(rawPath) {
|
|
1327
978
|
let out = rawPath.trim();
|
|
1328
979
|
if (!out.startsWith("/")) {
|
|
@@ -1334,15 +985,8 @@ function normalizeUrlPath(rawPath) {
|
|
|
1334
985
|
}
|
|
1335
986
|
return out;
|
|
1336
987
|
}
|
|
1337
|
-
function urlPathToMirrorRelative(urlPath) {
|
|
1338
|
-
const normalized = normalizeUrlPath(urlPath);
|
|
1339
|
-
if (normalized === "/") {
|
|
1340
|
-
return "index.md";
|
|
1341
|
-
}
|
|
1342
|
-
return `${normalized.slice(1)}.md`;
|
|
1343
|
-
}
|
|
1344
988
|
function staticHtmlFileToUrl(filePath, rootDir) {
|
|
1345
|
-
const relative =
|
|
989
|
+
const relative = path3.relative(rootDir, filePath).replace(/\\/g, "/");
|
|
1346
990
|
if (relative === "index.html") {
|
|
1347
991
|
return "/";
|
|
1348
992
|
}
|
|
@@ -1615,7 +1259,7 @@ function buildEmbeddingText(chunk, prependTitle) {
|
|
|
1615
1259
|
|
|
1616
1260
|
${chunk.chunkText}`;
|
|
1617
1261
|
}
|
|
1618
|
-
function
|
|
1262
|
+
function chunkPage(page, config, scope) {
|
|
1619
1263
|
const sections = parseHeadingSections(page.markdown, config.chunking.headingPathDepth);
|
|
1620
1264
|
const rawChunks = sections.flatMap((section) => splitSection(section, config.chunking));
|
|
1621
1265
|
const chunks = [];
|
|
@@ -1710,6 +1354,17 @@ function extractFromHtml(url, html, config) {
|
|
|
1710
1354
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
1711
1355
|
return null;
|
|
1712
1356
|
}
|
|
1357
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
1358
|
+
let weight;
|
|
1359
|
+
if (weightRaw !== void 0) {
|
|
1360
|
+
const parsed = Number(weightRaw);
|
|
1361
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
1362
|
+
weight = parsed;
|
|
1363
|
+
}
|
|
1364
|
+
}
|
|
1365
|
+
if (weight === 0) {
|
|
1366
|
+
return null;
|
|
1367
|
+
}
|
|
1713
1368
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
1714
1369
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
1715
1370
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -1765,7 +1420,8 @@ function extractFromHtml(url, html, config) {
|
|
|
1765
1420
|
noindex: false,
|
|
1766
1421
|
tags,
|
|
1767
1422
|
description,
|
|
1768
|
-
keywords
|
|
1423
|
+
keywords,
|
|
1424
|
+
weight
|
|
1769
1425
|
};
|
|
1770
1426
|
}
|
|
1771
1427
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -1778,6 +1434,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1778
1434
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
1779
1435
|
return null;
|
|
1780
1436
|
}
|
|
1437
|
+
let mdWeight;
|
|
1438
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
1439
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
1440
|
+
mdWeight = rawWeight;
|
|
1441
|
+
}
|
|
1442
|
+
if (mdWeight === 0) {
|
|
1443
|
+
return null;
|
|
1444
|
+
}
|
|
1781
1445
|
const content = parsed.content;
|
|
1782
1446
|
const normalized = normalizeMarkdown(content);
|
|
1783
1447
|
if (!normalizeText(normalized)) {
|
|
@@ -1800,63 +1464,13 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1800
1464
|
noindex: false,
|
|
1801
1465
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
1802
1466
|
description: fmDescription,
|
|
1803
|
-
keywords: fmKeywords
|
|
1467
|
+
keywords: fmKeywords,
|
|
1468
|
+
weight: mdWeight
|
|
1804
1469
|
};
|
|
1805
1470
|
}
|
|
1806
1471
|
|
|
1807
|
-
// src/indexing/mirror.ts
|
|
1808
|
-
import fs4 from "fs/promises";
|
|
1809
|
-
import path5 from "path";
|
|
1810
|
-
function yamlString(value) {
|
|
1811
|
-
return JSON.stringify(value);
|
|
1812
|
-
}
|
|
1813
|
-
function yamlArray(values) {
|
|
1814
|
-
return `[${values.map((v) => JSON.stringify(v)).join(", ")}]`;
|
|
1815
|
-
}
|
|
1816
|
-
function buildMirrorMarkdown(page) {
|
|
1817
|
-
const frontmatterLines = [
|
|
1818
|
-
"---",
|
|
1819
|
-
`url: ${yamlString(page.url)}`,
|
|
1820
|
-
`title: ${yamlString(page.title)}`,
|
|
1821
|
-
`scope: ${yamlString(page.scope)}`,
|
|
1822
|
-
`routeFile: ${yamlString(page.routeFile)}`,
|
|
1823
|
-
`routeResolution: ${yamlString(page.routeResolution)}`,
|
|
1824
|
-
`generatedAt: ${yamlString(page.generatedAt)}`,
|
|
1825
|
-
`incomingLinks: ${page.incomingLinks}`,
|
|
1826
|
-
`outgoingLinks: ${page.outgoingLinks}`,
|
|
1827
|
-
`depth: ${page.depth}`,
|
|
1828
|
-
`tags: ${yamlArray(page.tags)}`,
|
|
1829
|
-
"---",
|
|
1830
|
-
""
|
|
1831
|
-
];
|
|
1832
|
-
return `${frontmatterLines.join("\n")}${normalizeMarkdown(page.markdown)}`;
|
|
1833
|
-
}
|
|
1834
|
-
function stripGeneratedAt(content) {
|
|
1835
|
-
return content.replace(/^generatedAt: .*$/m, "");
|
|
1836
|
-
}
|
|
1837
|
-
async function writeMirrorPage(statePath, scope, page) {
|
|
1838
|
-
const relative = urlPathToMirrorRelative(page.url);
|
|
1839
|
-
const outputPath = path5.join(statePath, "pages", scope.scopeName, relative);
|
|
1840
|
-
await fs4.mkdir(path5.dirname(outputPath), { recursive: true });
|
|
1841
|
-
const newContent = buildMirrorMarkdown(page);
|
|
1842
|
-
try {
|
|
1843
|
-
const existing = await fs4.readFile(outputPath, "utf8");
|
|
1844
|
-
if (stripGeneratedAt(existing) === stripGeneratedAt(newContent)) {
|
|
1845
|
-
return outputPath;
|
|
1846
|
-
}
|
|
1847
|
-
} catch {
|
|
1848
|
-
}
|
|
1849
|
-
await fs4.writeFile(outputPath, newContent, "utf8");
|
|
1850
|
-
return outputPath;
|
|
1851
|
-
}
|
|
1852
|
-
async function cleanMirrorForScope(statePath, scope) {
|
|
1853
|
-
const target = path5.join(statePath, "pages", scope.scopeName);
|
|
1854
|
-
await fs4.rm(target, { recursive: true, force: true });
|
|
1855
|
-
await fs4.mkdir(target, { recursive: true });
|
|
1856
|
-
}
|
|
1857
|
-
|
|
1858
1472
|
// src/indexing/route-mapper.ts
|
|
1859
|
-
import
|
|
1473
|
+
import path4 from "path";
|
|
1860
1474
|
import fg from "fast-glob";
|
|
1861
1475
|
function segmentToRegex(segment) {
|
|
1862
1476
|
if (segment.startsWith("(") && segment.endsWith(")")) {
|
|
@@ -1877,7 +1491,7 @@ function segmentToRegex(segment) {
|
|
|
1877
1491
|
return { regex: `/${segment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`, score: 10 };
|
|
1878
1492
|
}
|
|
1879
1493
|
function routeFileToPattern(routeFile, cwd) {
|
|
1880
|
-
const relative =
|
|
1494
|
+
const relative = path4.relative(cwd, routeFile).replace(/\\/g, "/");
|
|
1881
1495
|
const withoutPrefix = relative.replace(/^src\/routes\/?/, "");
|
|
1882
1496
|
const withoutPage = withoutPrefix.replace(/\/\+page\.[^/]+$/, "");
|
|
1883
1497
|
const segments = withoutPage.split("/").filter(Boolean);
|
|
@@ -1932,11 +1546,43 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
1932
1546
|
|
|
1933
1547
|
// src/indexing/sources/build/index.ts
|
|
1934
1548
|
import { load as cheerioLoad } from "cheerio";
|
|
1935
|
-
import
|
|
1549
|
+
import pLimit from "p-limit";
|
|
1550
|
+
|
|
1551
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
1552
|
+
import fs3 from "fs/promises";
|
|
1553
|
+
import path5 from "path";
|
|
1554
|
+
|
|
1555
|
+
// src/utils/pattern.ts
|
|
1556
|
+
/**
 * Tests a URL path against one pattern.
 *
 * Both arguments first lose a single trailing "/" (the root "/" is kept).
 * Supported pattern forms:
 *   - "<prefix>/**": the prefix itself or anything below it ("/**" matches all)
 *   - "<prefix>/*" : exactly one non-empty segment below the prefix
 *   - anything else: exact equality
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} Whether the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const dropTrailingSlash = (value) =>
    value === "/" || !value.endsWith("/") ? value : value.slice(0, -1);
  const target = dropTrailingSlash(url);
  const glob = dropTrailingSlash(pattern);

  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(`${base}/`);
  }

  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      // Root-level wildcard: one segment, and not the root itself.
      return target !== "/" && !target.slice(1).includes("/");
    }
    if (!target.startsWith(`${base}/`)) {
      return false;
    }
    const remainder = target.slice(base.length + 1);
    return remainder.length > 0 && !remainder.includes("/");
  }

  return target === glob;
}
|
|
1578
|
+
/**
 * Reports whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns understood by `matchUrlPattern`.
 * @returns {boolean} `true` on the first match, `false` when nothing matches
 *   (including when there are no patterns).
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
|
|
1936
1584
|
|
|
1937
1585
|
// src/indexing/sources/build/manifest-parser.ts
|
|
1938
|
-
import fs5 from "fs/promises";
|
|
1939
|
-
import path7 from "path";
|
|
1940
1586
|
function routeIdToFile(routeId) {
|
|
1941
1587
|
if (routeId === "/") {
|
|
1942
1588
|
return "src/routes/+page.svelte";
|
|
@@ -1948,10 +1594,10 @@ function routeIdToUrl(routeId) {
|
|
|
1948
1594
|
return routeId.split("/").filter((seg) => !(seg.startsWith("(") && seg.endsWith(")"))).join("/") || "/";
|
|
1949
1595
|
}
|
|
1950
1596
|
async function parseManifest(cwd, outputDir) {
|
|
1951
|
-
const manifestPath =
|
|
1597
|
+
const manifestPath = path5.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
1952
1598
|
let content;
|
|
1953
1599
|
try {
|
|
1954
|
-
content = await
|
|
1600
|
+
content = await fs3.readFile(manifestPath, "utf8");
|
|
1955
1601
|
} catch {
|
|
1956
1602
|
throw new SearchSocketError(
|
|
1957
1603
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -2010,21 +1656,13 @@ function expandDynamicUrl(url, value) {
|
|
|
2010
1656
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
2011
1657
|
}
|
|
2012
1658
|
/**
 * Reports whether a URL matches any of the exclusion patterns.
 * Delegates entirely to `matchUrlPatterns`.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Exclusion patterns.
 * @returns {boolean} Whether the URL is excluded.
 */
function isExcluded(url, patterns) {
  const excluded = matchUrlPatterns(url, patterns);
  return excluded;
}
|
|
2023
1661
|
|
|
2024
1662
|
// src/indexing/sources/build/preview-server.ts
|
|
2025
1663
|
import net from "net";
|
|
2026
|
-
import
|
|
2027
|
-
import
|
|
1664
|
+
import path6 from "path";
|
|
1665
|
+
import fs4 from "fs";
|
|
2028
1666
|
import { spawn } from "child_process";
|
|
2029
1667
|
function findFreePort() {
|
|
2030
1668
|
return new Promise((resolve, reject) => {
|
|
@@ -2063,8 +1701,8 @@ async function waitForReady(url, timeout, child) {
|
|
|
2063
1701
|
);
|
|
2064
1702
|
}
|
|
2065
1703
|
async function startPreviewServer(cwd, options, logger3) {
|
|
2066
|
-
const viteBin =
|
|
2067
|
-
if (!
|
|
1704
|
+
const viteBin = path6.join(cwd, "node_modules", ".bin", "vite");
|
|
1705
|
+
if (!fs4.existsSync(viteBin)) {
|
|
2068
1706
|
throw new SearchSocketError(
|
|
2069
1707
|
"BUILD_SERVER_FAILED",
|
|
2070
1708
|
`vite binary not found at ${viteBin}. Ensure vite is installed.`
|
|
@@ -2138,7 +1776,7 @@ async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
|
2138
1776
|
const visited = /* @__PURE__ */ new Set();
|
|
2139
1777
|
const pages = [];
|
|
2140
1778
|
const queue = [];
|
|
2141
|
-
const limit =
|
|
1779
|
+
const limit = pLimit(8);
|
|
2142
1780
|
for (const seed of seedUrls) {
|
|
2143
1781
|
const normalized = normalizeUrlPath(seed);
|
|
2144
1782
|
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
@@ -2220,7 +1858,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
2220
1858
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
2221
1859
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
2222
1860
|
try {
|
|
2223
|
-
const concurrencyLimit =
|
|
1861
|
+
const concurrencyLimit = pLimit(8);
|
|
2224
1862
|
const results = await Promise.allSettled(
|
|
2225
1863
|
selected.map(
|
|
2226
1864
|
(route) => concurrencyLimit(async () => {
|
|
@@ -2260,11 +1898,11 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
2260
1898
|
}
|
|
2261
1899
|
|
|
2262
1900
|
// src/indexing/sources/content-files.ts
|
|
2263
|
-
import
|
|
2264
|
-
import
|
|
1901
|
+
import fs5 from "fs/promises";
|
|
1902
|
+
import path7 from "path";
|
|
2265
1903
|
import fg2 from "fast-glob";
|
|
2266
1904
|
function filePathToUrl(filePath, baseDir) {
|
|
2267
|
-
const relative =
|
|
1905
|
+
const relative = path7.relative(baseDir, filePath).replace(/\\/g, "/");
|
|
2268
1906
|
const segments = relative.split("/").filter(Boolean);
|
|
2269
1907
|
if (/(^|\/)\+page\.svelte$/.test(relative)) {
|
|
2270
1908
|
const routeSegments = segments.slice();
|
|
@@ -2289,7 +1927,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2289
1927
|
if (!contentConfig) {
|
|
2290
1928
|
throw new Error("content-files config is missing");
|
|
2291
1929
|
}
|
|
2292
|
-
const baseDir =
|
|
1930
|
+
const baseDir = path7.resolve(cwd, contentConfig.baseDir);
|
|
2293
1931
|
const files = await fg2(contentConfig.globs, {
|
|
2294
1932
|
cwd: baseDir,
|
|
2295
1933
|
absolute: true,
|
|
@@ -2299,12 +1937,12 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2299
1937
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
2300
1938
|
const pages = [];
|
|
2301
1939
|
for (const filePath of selected) {
|
|
2302
|
-
const raw = await
|
|
1940
|
+
const raw = await fs5.readFile(filePath, "utf8");
|
|
2303
1941
|
const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
|
|
2304
1942
|
pages.push({
|
|
2305
1943
|
url: filePathToUrl(filePath, baseDir),
|
|
2306
1944
|
markdown,
|
|
2307
|
-
sourcePath:
|
|
1945
|
+
sourcePath: path7.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
2308
1946
|
outgoingLinks: []
|
|
2309
1947
|
});
|
|
2310
1948
|
}
|
|
@@ -2314,7 +1952,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2314
1952
|
// src/indexing/sources/crawl.ts
|
|
2315
1953
|
import { gunzipSync } from "zlib";
|
|
2316
1954
|
import { load as cheerioLoad2 } from "cheerio";
|
|
2317
|
-
import
|
|
1955
|
+
import pLimit2 from "p-limit";
|
|
2318
1956
|
var logger2 = new Logger();
|
|
2319
1957
|
function extractLocs(xml) {
|
|
2320
1958
|
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
@@ -2399,7 +2037,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
2399
2037
|
const routes = await resolveRoutes(config);
|
|
2400
2038
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
2401
2039
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
2402
|
-
const concurrencyLimit =
|
|
2040
|
+
const concurrencyLimit = pLimit2(8);
|
|
2403
2041
|
const results = await Promise.allSettled(
|
|
2404
2042
|
selected.map(
|
|
2405
2043
|
(route) => concurrencyLimit(async () => {
|
|
@@ -2432,11 +2070,11 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
2432
2070
|
}
|
|
2433
2071
|
|
|
2434
2072
|
// src/indexing/sources/static-output.ts
|
|
2435
|
-
import
|
|
2436
|
-
import
|
|
2073
|
+
import fs6 from "fs/promises";
|
|
2074
|
+
import path8 from "path";
|
|
2437
2075
|
import fg3 from "fast-glob";
|
|
2438
2076
|
async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
2439
|
-
const outputDir =
|
|
2077
|
+
const outputDir = path8.resolve(cwd, config.source.staticOutputDir);
|
|
2440
2078
|
const htmlFiles = await fg3(["**/*.html"], {
|
|
2441
2079
|
cwd: outputDir,
|
|
2442
2080
|
absolute: true
|
|
@@ -2445,55 +2083,309 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
2445
2083
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
2446
2084
|
const pages = [];
|
|
2447
2085
|
for (const filePath of selected) {
|
|
2448
|
-
const html = await
|
|
2086
|
+
const html = await fs6.readFile(filePath, "utf8");
|
|
2449
2087
|
pages.push({
|
|
2450
2088
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
2451
2089
|
html,
|
|
2452
|
-
sourcePath:
|
|
2090
|
+
sourcePath: path8.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
2453
2091
|
outgoingLinks: []
|
|
2454
2092
|
});
|
|
2455
2093
|
}
|
|
2456
2094
|
return pages;
|
|
2457
2095
|
}
|
|
2458
2096
|
|
|
2459
|
-
// src/
|
|
2460
|
-
|
|
2461
|
-
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2097
|
+
// src/indexing/robots.ts
|
|
2098
|
+
import fs7 from "fs/promises";
|
|
2099
|
+
import path9 from "path";
|
|
2100
|
+
/**
 * Parses robots.txt content and returns the rules that apply to one user agent.
 *
 * Grouping follows the Robots Exclusion Protocol:
 *   - consecutive `User-agent` lines share the rules that follow them;
 *   - a `User-agent` line that comes after an `Allow`/`Disallow` line starts a
 *     new group, so rules of a later group never leak into an earlier one;
 *   - other directives (`Sitemap`, `Crawl-delay`, ...) are ignored and do not
 *     end the current group.
 * Rules from several groups naming the same agent are merged.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent to look up (case-insensitive, exact name).
 * @returns {{disallow: string[], allow: string[]}} Rules for `userAgent` when
 *   it has at least one rule, otherwise the "*" rules, otherwise empty lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // True once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of joining this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line marks the end of the group's User-agent header.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is intentionally ignored.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2472
|
-
var IndexPipeline = class _IndexPipeline {
|
|
2473
|
-
cwd;
|
|
2474
|
-
config;
|
|
2475
|
-
embeddings;
|
|
2476
|
-
vectorStore;
|
|
2477
|
-
logger;
|
|
2478
|
-
constructor(options) {
|
|
2479
|
-
this.cwd = options.cwd;
|
|
2480
|
-
this.config = options.config;
|
|
2481
|
-
this.embeddings = options.embeddings;
|
|
2482
|
-
this.vectorStore = options.vectorStore;
|
|
2483
|
-
this.logger = options.logger;
|
|
2135
|
+
/**
 * Tests a URL path against a single robots.txt path pattern.
 *
 * A pattern matches from the start of the path. "*" matches any run of
 * characters (including none) and a trailing "$" anchors the pattern to the
 * end of the path. A pattern without either behaves as a plain prefix.
 *
 * @param {string} urlPath - URL path to test.
 * @param {string} pattern - Non-empty robots.txt rule value.
 * @returns {boolean} Whether the pattern matches the path.
 */
function robotsPatternMatches(urlPath, pattern) {
  const anchored = pattern.endsWith("$");
  const segments = (anchored ? pattern.slice(0, -1) : pattern).split("*");
  const head = segments[0];
  if (!urlPath.startsWith(head)) return false;
  if (segments.length === 1) {
    return !anchored || urlPath.length === head.length;
  }
  // Match the middle literals left to right; taking the leftmost occurrence
  // leaves the most room for the segments that follow.
  let cursor = head.length;
  for (let i = 1; i < segments.length - 1; i++) {
    const found = urlPath.indexOf(segments[i], cursor);
    if (found === -1) return false;
    cursor = found + segments[i].length;
  }
  const tail = segments[segments.length - 1];
  if (anchored) {
    return urlPath.length - tail.length >= cursor && urlPath.endsWith(tail);
  }
  return urlPath.indexOf(tail, cursor) !== -1;
}

/**
 * Decides whether a URL path is blocked by a set of robots.txt rules.
 *
 * The longest matching `disallow` pattern is compared with the longest
 * matching `allow` pattern: the path is blocked only when the disallow
 * pattern is strictly longer, so a tie favours allowing the path.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules - Rules as returned by `parseRobotsTxt`.
 * @returns {boolean} `true` when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules) {
  const longestMatch = (patterns) => {
    let longest = "";
    for (const pattern of patterns) {
      if (pattern.length > longest.length && robotsPatternMatches(urlPath, pattern)) {
        longest = pattern;
      }
    }
    return longest;
  };
  const longestDisallow = longestMatch(rules.disallow);
  if (!longestDisallow) return false;
  const longestAllow = longestMatch(rules.allow);
  return longestAllow.length < longestDisallow.length;
}
|
|
2151
|
+
/**
 * Reads `robots.txt` from a directory and parses it with `parseRobotsTxt`.
 *
 * @param {string} dir - Directory expected to contain `robots.txt`.
 * @returns {Promise<{disallow: string[], allow: string[]}|null>} Parsed rules,
 *   or `null` when the file cannot be read or parsing fails.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsPath = path9.join(dir, "robots.txt");
    const text = await fs7.readFile(robotsPath, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    // Any failure is reported as "no rules".
    rules = null;
  }
  return rules;
}
|
|
2159
|
+
/**
 * Downloads `/robots.txt` relative to a base URL and parses it with
 * `parseRobotsTxt`.
 *
 * @param {string} baseUrl - Absolute URL of the site.
 * @returns {Promise<{disallow: string[], allow: string[]}|null>} Parsed rules,
 *   or `null` when the URL is invalid, the request fails, or the response
 *   status is not OK.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const response = await fetch(robotsUrl.href);
    if (response.ok) {
      const body = await response.text();
      return parseRobotsTxt(body);
    }
    return null;
  } catch {
    // Invalid URL, network error or parse failure: behave as if there is no robots.txt.
    return null;
  }
}
|
|
2170
|
+
|
|
2171
|
+
// src/search/ranking.ts
|
|
2172
|
+
/**
 * Clamps a value to a finite, non-negative number.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when it is a finite number >= 0; otherwise 0
 *   (negative numbers, NaN, infinities and non-numbers all give 0).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
2178
|
+
/**
 * Normalizes text for loose title/query comparison.
 *
 * Steps: compatibility-decompose (NFKD) and drop combining marks so accented
 * letters compare equal to their base letters, lower-case, remove everything
 * that is not a letter, a digit or whitespace (in any script), collapse runs
 * of whitespace to one space, and trim.
 *
 * Pure-ASCII input gives the same result as stripping `[^a-z0-9\s]`; the
 * Unicode-aware classes additionally keep non-English titles comparable
 * instead of reducing them to an empty string.
 *
 * @param {string|null|undefined} text - Text to normalize; nullish is treated as "".
 * @returns {string} The normalized comparison key.
 */
function normalizeForTitleMatch(text) {
  return String(text ?? "")
    .normalize("NFKD")
    .replace(/\p{M}+/gu, "")
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s]/gu, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
2181
|
+
/**
 * Applies ranking boosts to raw hits and orders them by final score, highest first.
 *
 * Starting from `hit.score` (non-finite scores count as -Infinity), each hit
 * may receive:
 *   - an incoming-link boost, log(1 + incomingLinks) * weights.incomingLinks,
 *     when `ranking.enableIncomingLinkBoost` is set;
 *   - a depth boost, 1 / (1 + depth) * weights.depth, when
 *     `ranking.enableDepthBoost` is set;
 *   - a flat `weights.titleMatch` bonus when a query is given, the weight is
 *     positive, and the normalized title contains the normalized query or the
 *     other way round.
 *
 * @param {Array<object>} hits - Hits with a `score` and a `metadata` object.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @param {string} [query] - Search query used for the title bonus.
 * @returns {Array<{hit: object, finalScore: number}>} A new array sorted by
 *   descending `finalScore`.
 */
function rankHits(hits, config, query) {
  const queryKey = query ? normalizeForTitleMatch(query) : "";
  const titleBonus = config.ranking.weights.titleMatch;

  // True when either normalized string contains the other (both non-empty).
  const titleMatchesQuery = (title) => {
    const titleKey = normalizeForTitleMatch(title);
    if (queryKey.length === 0 || titleKey.length === 0) {
      return false;
    }
    return titleKey.includes(queryKey) || queryKey.includes(titleKey);
  };

  const byFinalScoreDesc = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    // Two -Infinity scores subtract to NaN; treat them as equal.
    return Number.isNaN(diff) ? 0 : diff;
  };

  const scored = hits.map((hit) => {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (config.ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * config.ranking.weights.incomingLinks;
    }
    if (config.ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * config.ranking.weights.depth;
    }
    if (queryKey && titleBonus > 0 && titleMatchesQuery(hit.metadata.title)) {
      total += titleBonus;
    }
    return {
      hit,
      finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY
    };
  });

  return scored.sort(byFinalScoreDesc);
}
|
|
2209
|
+
/**
 * Trims a list of page results that is already sorted by descending score.
 *
 * Two independent filters, both driven by `config.ranking`:
 *   1. `minScore` (> 0): when the median `pageScore` is below it, every result
 *      is dropped.
 *   2. `scoreGapThreshold` (> 0): the list is cut before the first result whose
 *      relative drop from its predecessor, (prev - current) / prev, reaches the
 *      threshold. Predecessors with a score <= 0 are skipped.
 *
 * @param {Array<{pageScore: number}>} results - Results ordered best first.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<{pageScore: number}>} The input array itself when nothing is
 *   trimmed, otherwise a new (possibly empty) array.
 */
function trimByScoreGap(results, config) {
  if (results.length === 0) return results;

  const { scoreGapThreshold, minScore } = config.ranking;

  if (minScore > 0) {
    const ascending = results.map((entry) => entry.pageScore).sort((x, y) => x - y);
    const middle = Math.floor(ascending.length / 2);
    const hasEvenCount = ascending.length % 2 === 0;
    const median = hasEvenCount
      ? (ascending[middle - 1] + ascending[middle]) / 2
      : ascending[middle];
    if (median < minScore) {
      return [];
    }
  }

  if (scoreGapThreshold > 0) {
    const cutAt = results.findIndex((entry, position) => {
      if (position === 0) return false;
      const previous = results[position - 1].pageScore;
      return previous > 0 && (previous - entry.pageScore) / previous >= scoreGapThreshold;
    });
    if (cutAt !== -1) {
      return results.slice(0, cutAt);
    }
  }

  return results;
}
|
|
2233
|
+
/**
 * Looks up the ranking weight configured for a URL.
 *
 * Among all patterns that match the URL (per `matchUrlPattern`), the longest
 * pattern wins; when several have the same length, the first one listed wins.
 *
 * @param {string} url - URL path of the page.
 * @param {Object<string, number>} pageWeights - Pattern -> weight table.
 * @returns {number} The winning weight, or 1 when no pattern matches.
 */
function findPageWeight(url, pageWeights) {
  let winner = { pattern: "", weight: 1 };
  for (const pattern of Object.keys(pageWeights)) {
    // Only a strictly longer pattern can replace the current winner.
    if (pattern.length <= winner.pattern.length) continue;
    if (!matchUrlPattern(url, pattern)) continue;
    winner = { pattern, weight: pageWeights[pattern] };
  }
  return winner.weight;
}
|
|
2244
|
+
/**
 * Groups ranked chunk hits by page URL and computes one score per page.
 *
 * For each page the chunks are sorted by descending `finalScore`. The page
 * score is the best chunk's score plus an aggregation bonus: the next chunks
 * (up to `ranking.aggregationCap` chunks in total) each contribute
 * `score * aggregationDecay^position`, and the sum is scaled by
 * `ranking.weights.aggregation`. The result is then multiplied by the page
 * weight from `ranking.pageWeights`; pages with weight 0 are left out.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Output of `rankHits`.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<object>} Page entries (`url`, `title`, `routeFile`,
 *   `pageScore`, `bestChunk`, `matchingChunks`) sorted by descending `pageScore`.
 */
function aggregateByPage(ranked, config) {
  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    if (chunksByUrl.has(pageUrl)) {
      chunksByUrl.get(pageUrl).push(entry);
    } else {
      chunksByUrl.set(pageUrl, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;

  // Descending comparator on a numeric field; NaN differences count as ties.
  const descendingBy = (field) => (left, right) => {
    const diff = right[field] - left[field];
    return Number.isNaN(diff) ? 0 : diff;
  };

  const pages = [];
  chunksByUrl.forEach((chunks, url) => {
    chunks.sort(descendingBy("finalScore"));
    const best = chunks[0];
    const bestScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // Chunks after the best one, still limited by the cap, add a decayed bonus.
    const bonus = chunks.slice(1, aggregationCap).reduce((sum, chunk, offset) => {
      const contribution = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + contribution * Math.pow(aggregationDecay, offset + 1);
    }, 0);

    let pageScore = bestScore + bonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      return;
    }
    if (pageWeight !== 1) {
      pageScore *= pageWeight;
    }

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  });

  return pages.sort(descendingBy("pageScore"));
}
|
|
2287
|
+
/**
 * Combine page-level search hits with ranked chunk hits.
 *
 * - A chunk whose URL also has a page hit gets a blended score:
 *   `(1 - w) * chunk.finalScore + w * pageHit.score`, where `w` is
 *   `config.search.pageSearchWeight`. A non-finite blend falls back to the
 *   chunk's own score.
 * - A page hit with no chunk for its URL is added as a synthetic chunk
 *   scored `pageHit.score * w` (0 when that is not finite).
 * - Chunks without a page hit pass through unchanged.
 *
 * @param {Array} pageHits - Page-level hits (reads `url`, `score`, `title`,
 *   `description`, `depth`, `incomingLinks`, `routeFile`, `tags`).
 * @param {Array} rankedChunks - Entries shaped `{ hit, finalScore }`.
 * @param {Object} config - Reads `config.search.pageSearchWeight`.
 * @returns {Array} `rankedChunks` itself when there are no page hits,
 *   otherwise a new array sorted by `finalScore`, highest first.
 */
function mergePageAndChunkResults(pageHits, rankedChunks, config) {
  if (pageHits.length === 0) {
    return rankedChunks;
  }

  const pageWeight = config.search.pageSearchWeight;
  // Later duplicates of a URL replace the value but keep the first position.
  const pageHitByUrl = new Map(pageHits.map((pageHit) => [pageHit.url, pageHit]));
  const coveredUrls = /* @__PURE__ */ new Set();
  const merged = [];

  for (const chunk of rankedChunks) {
    const { url } = chunk.hit.metadata;
    const pageHit = pageHitByUrl.get(url);
    if (!pageHit) {
      merged.push(chunk);
      continue;
    }
    coveredUrls.add(url);
    const blended = (1 - pageWeight) * chunk.finalScore + pageWeight * pageHit.score;
    merged.push({
      hit: chunk.hit,
      finalScore: Number.isFinite(blended) ? blended : chunk.finalScore
    });
  }

  for (const [url, pageHit] of pageHitByUrl) {
    if (coveredUrls.has(url)) {
      continue;
    }
    const syntheticScore = pageHit.score * pageWeight;
    // The page has no chunk text of its own, so reuse description/title.
    const fallbackText = pageHit.description || pageHit.title;
    merged.push({
      hit: {
        id: `page:${url}`,
        score: pageHit.score,
        metadata: {
          projectId: "",
          scopeName: "",
          url: pageHit.url,
          path: pageHit.url,
          title: pageHit.title,
          sectionTitle: "",
          headingPath: [],
          snippet: fallbackText,
          chunkText: fallbackText,
          ordinal: 0,
          contentHash: "",
          depth: pageHit.depth,
          incomingLinks: pageHit.incomingLinks,
          routeFile: pageHit.routeFile,
          tags: pageHit.tags
        }
      },
      finalScore: Number.isFinite(syntheticScore) ? syntheticScore : 0
    });
  }

  return merged.sort((left, right) => {
    const delta = right.finalScore - left.finalScore;
    return Number.isNaN(delta) ? 0 : delta;
  });
}
|
|
2342
|
+
|
|
2343
|
+
// src/utils/time.ts
|
|
2344
|
+
/**
 * Current time as an ISO 8601 string.
 *
 * @returns {string} Result of `Date.prototype.toISOString()` for the moment of the call.
 */
function nowIso() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
|
|
2347
|
+
/**
 * Milliseconds elapsed since a `process.hrtime.bigint()` reading.
 *
 * @param {bigint} start - Earlier value returned by `process.hrtime.bigint()`.
 * @returns {number} Elapsed time in milliseconds (fractional).
 */
function hrTimeMs(start) {
  const NANOS_PER_MILLI = 1e6;
  const elapsedNanos = process.hrtime.bigint() - start;
  return Number(elapsedNanos) / NANOS_PER_MILLI;
}
|
|
2350
|
+
|
|
2351
|
+
// src/indexing/pipeline.ts
|
|
2352
|
+
/**
 * Build a plain-text summary for a page.
 *
 * The summary joins, with blank lines between them: the title, the
 * description (if truthy), the keywords joined by ", " (if any), and the
 * markdown body with its markup stripped. The result is cut to `maxChars`
 * characters and trimmed when it is longer than that.
 *
 * @param {Object} page - Reads `title`, `description`, `keywords`, `markdown`.
 * @param {number} [maxChars=3500] - Maximum length of the returned string.
 * @returns {string} The assembled summary.
 */
function buildPageSummary(page, maxChars = 3500) {
  // Applied in order; each step feeds the next.
  const substitutions = [
    [/```[\s\S]*?```/g, " "], // fenced code blocks are dropped
    [/`([^`]+)`/g, "$1"], // inline code keeps its text
    [/!?\[([^\]]*)\]\([^)]*\)/g, "$1"], // links and images keep their label
    [/^#{1,6}\s+/gm, ""], // heading markers
    [/[>*_|~\-]/g, " "], // remaining markup punctuation
    [/\s+/g, " "] // collapse whitespace runs
  ];

  let plainBody = page.markdown;
  for (const [pattern, replacement] of substitutions) {
    plainBody = plainBody.replace(pattern, replacement);
  }
  plainBody = plainBody.trim();

  const sections = [
    page.title,
    ...(page.description ? [page.description] : []),
    ...(page.keywords?.length > 0 ? [page.keywords.join(", ")] : []),
    ...(plainBody ? [plainBody] : [])
  ];

  const summary = sections.join("\n\n");
  return summary.length > maxChars ? summary.slice(0, maxChars).trim() : summary;
}
|
|
2368
|
+
var IndexPipeline = class _IndexPipeline {
|
|
2369
|
+
cwd;
|
|
2370
|
+
config;
|
|
2371
|
+
store;
|
|
2372
|
+
logger;
|
|
2373
|
+
constructor(options) {
|
|
2374
|
+
this.cwd = options.cwd;
|
|
2375
|
+
this.config = options.config;
|
|
2376
|
+
this.store = options.store;
|
|
2377
|
+
this.logger = options.logger;
|
|
2378
|
+
}
|
|
2379
|
+
static async create(options = {}) {
|
|
2380
|
+
const cwd = path10.resolve(options.cwd ?? process.cwd());
|
|
2381
|
+
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
2382
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
2383
|
+
return new _IndexPipeline({
|
|
2384
|
+
cwd,
|
|
2385
|
+
config,
|
|
2386
|
+
store,
|
|
2387
|
+
logger: options.logger ?? new Logger()
|
|
2388
|
+
});
|
|
2497
2389
|
}
|
|
2498
2390
|
getConfig() {
|
|
2499
2391
|
return this.config;
|
|
@@ -2511,25 +2403,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2511
2403
|
stageTimingsMs[name] = Math.round(hrTimeMs(start));
|
|
2512
2404
|
};
|
|
2513
2405
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
2514
|
-
|
|
2406
|
+
ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
2515
2407
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
2516
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode},
|
|
2408
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
|
|
2517
2409
|
if (options.force) {
|
|
2518
2410
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
2519
|
-
await cleanMirrorForScope(statePath, scope);
|
|
2520
2411
|
}
|
|
2521
2412
|
if (options.dryRun) {
|
|
2522
2413
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
2523
2414
|
}
|
|
2524
2415
|
const manifestStart = stageStart();
|
|
2525
|
-
const existingHashes = await this.
|
|
2526
|
-
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
2527
|
-
if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
|
|
2528
|
-
throw new SearchSocketError(
|
|
2529
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
2530
|
-
`Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
|
|
2531
|
-
);
|
|
2532
|
-
}
|
|
2416
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
2533
2417
|
stageEnd("manifest", manifestStart);
|
|
2534
2418
|
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
2535
2419
|
const sourceStart = stageStart();
|
|
@@ -2546,6 +2430,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2546
2430
|
}
|
|
2547
2431
|
stageEnd("source", sourceStart);
|
|
2548
2432
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
2433
|
+
const filterStart = stageStart();
|
|
2434
|
+
let filteredSourcePages = sourcePages;
|
|
2435
|
+
if (this.config.exclude.length > 0) {
|
|
2436
|
+
const beforeExclude = filteredSourcePages.length;
|
|
2437
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
2438
|
+
const url = normalizeUrlPath(p.url);
|
|
2439
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
2440
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
2441
|
+
return false;
|
|
2442
|
+
}
|
|
2443
|
+
return true;
|
|
2444
|
+
});
|
|
2445
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
2446
|
+
if (excludedCount > 0) {
|
|
2447
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
2448
|
+
}
|
|
2449
|
+
}
|
|
2450
|
+
if (this.config.respectRobotsTxt) {
|
|
2451
|
+
let robotsRules = null;
|
|
2452
|
+
if (sourceMode === "static-output") {
|
|
2453
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
2454
|
+
path10.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
2455
|
+
);
|
|
2456
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
2457
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
2458
|
+
path10.resolve(this.cwd, this.config.source.build.outputDir)
|
|
2459
|
+
);
|
|
2460
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
2461
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
2462
|
+
}
|
|
2463
|
+
if (robotsRules) {
|
|
2464
|
+
const beforeRobots = filteredSourcePages.length;
|
|
2465
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
2466
|
+
const url = normalizeUrlPath(p.url);
|
|
2467
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
2468
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
2469
|
+
return false;
|
|
2470
|
+
}
|
|
2471
|
+
return true;
|
|
2472
|
+
});
|
|
2473
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
2474
|
+
if (robotsExcluded > 0) {
|
|
2475
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
2476
|
+
}
|
|
2477
|
+
}
|
|
2478
|
+
}
|
|
2479
|
+
stageEnd("filter", filterStart);
|
|
2549
2480
|
const routeStart = stageStart();
|
|
2550
2481
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2551
2482
|
stageEnd("route_map", routeStart);
|
|
@@ -2553,7 +2484,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2553
2484
|
const extractStart = stageStart();
|
|
2554
2485
|
this.logger.info("Extracting content...");
|
|
2555
2486
|
const extractedPages = [];
|
|
2556
|
-
for (const sourcePage of
|
|
2487
|
+
for (const sourcePage of filteredSourcePages) {
|
|
2557
2488
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
2558
2489
|
if (!extracted) {
|
|
2559
2490
|
this.logger.warn(
|
|
@@ -2579,16 +2510,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2579
2510
|
seenUrls.add(page.url);
|
|
2580
2511
|
uniquePages.push(page);
|
|
2581
2512
|
}
|
|
2513
|
+
const indexablePages = [];
|
|
2514
|
+
for (const page of uniquePages) {
|
|
2515
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
2516
|
+
if (effectiveWeight === 0) {
|
|
2517
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
2518
|
+
continue;
|
|
2519
|
+
}
|
|
2520
|
+
indexablePages.push(page);
|
|
2521
|
+
}
|
|
2522
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
2523
|
+
if (zeroWeightCount > 0) {
|
|
2524
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
2525
|
+
}
|
|
2582
2526
|
stageEnd("extract", extractStart);
|
|
2583
|
-
const skippedPages =
|
|
2584
|
-
this.logger.info(`Extracted ${
|
|
2527
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
2528
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
2585
2529
|
const linkStart = stageStart();
|
|
2586
|
-
const pageSet = new Set(
|
|
2530
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
2587
2531
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
2588
|
-
for (const page of
|
|
2532
|
+
for (const page of indexablePages) {
|
|
2589
2533
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
2590
2534
|
}
|
|
2591
|
-
for (const page of
|
|
2535
|
+
for (const page of indexablePages) {
|
|
2592
2536
|
for (const outgoing of page.outgoingLinks) {
|
|
2593
2537
|
if (!pageSet.has(outgoing)) {
|
|
2594
2538
|
continue;
|
|
@@ -2598,9 +2542,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2598
2542
|
}
|
|
2599
2543
|
stageEnd("links", linkStart);
|
|
2600
2544
|
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
2601
|
-
const
|
|
2602
|
-
this.logger.info("
|
|
2603
|
-
const
|
|
2545
|
+
const pagesStart = stageStart();
|
|
2546
|
+
this.logger.info("Building indexed pages...");
|
|
2547
|
+
const pages = [];
|
|
2604
2548
|
let routeExact = 0;
|
|
2605
2549
|
let routeBestEffort = 0;
|
|
2606
2550
|
const precomputedRoutes = /* @__PURE__ */ new Map();
|
|
@@ -2612,7 +2556,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2612
2556
|
});
|
|
2613
2557
|
}
|
|
2614
2558
|
}
|
|
2615
|
-
for (const page of
|
|
2559
|
+
for (const page of indexablePages) {
|
|
2616
2560
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
2617
2561
|
if (routeMatch.routeResolution === "best-effort") {
|
|
2618
2562
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -2629,7 +2573,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2629
2573
|
} else {
|
|
2630
2574
|
routeExact += 1;
|
|
2631
2575
|
}
|
|
2632
|
-
const
|
|
2576
|
+
const indexedPage = {
|
|
2633
2577
|
url: page.url,
|
|
2634
2578
|
title: page.title,
|
|
2635
2579
|
scope: scope.scopeName,
|
|
@@ -2644,35 +2588,38 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2644
2588
|
description: page.description,
|
|
2645
2589
|
keywords: page.keywords
|
|
2646
2590
|
};
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
await writeMirrorPage(statePath, scope, mirror);
|
|
2650
|
-
}
|
|
2651
|
-
this.logger.event("markdown_written", { url: page.url });
|
|
2591
|
+
pages.push(indexedPage);
|
|
2592
|
+
this.logger.event("page_indexed", { url: page.url });
|
|
2652
2593
|
}
|
|
2653
2594
|
if (!options.dryRun) {
|
|
2654
|
-
const pageRecords =
|
|
2655
|
-
|
|
2656
|
-
|
|
2657
|
-
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
|
|
2662
|
-
|
|
2663
|
-
|
|
2664
|
-
|
|
2665
|
-
|
|
2666
|
-
|
|
2667
|
-
|
|
2668
|
-
|
|
2669
|
-
|
|
2595
|
+
const pageRecords = pages.map((p) => {
|
|
2596
|
+
const summary = buildPageSummary(p);
|
|
2597
|
+
return {
|
|
2598
|
+
url: p.url,
|
|
2599
|
+
title: p.title,
|
|
2600
|
+
markdown: p.markdown,
|
|
2601
|
+
projectId: scope.projectId,
|
|
2602
|
+
scopeName: scope.scopeName,
|
|
2603
|
+
routeFile: p.routeFile,
|
|
2604
|
+
routeResolution: p.routeResolution,
|
|
2605
|
+
incomingLinks: p.incomingLinks,
|
|
2606
|
+
outgoingLinks: p.outgoingLinks,
|
|
2607
|
+
depth: p.depth,
|
|
2608
|
+
tags: p.tags,
|
|
2609
|
+
indexedAt: p.generatedAt,
|
|
2610
|
+
summary,
|
|
2611
|
+
description: p.description,
|
|
2612
|
+
keywords: p.keywords
|
|
2613
|
+
};
|
|
2614
|
+
});
|
|
2615
|
+
await this.store.deletePages(scope);
|
|
2616
|
+
await this.store.upsertPages(pageRecords, scope);
|
|
2670
2617
|
}
|
|
2671
|
-
stageEnd("
|
|
2672
|
-
this.logger.info(`
|
|
2618
|
+
stageEnd("pages", pagesStart);
|
|
2619
|
+
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
2673
2620
|
const chunkStart = stageStart();
|
|
2674
2621
|
this.logger.info("Chunking pages...");
|
|
2675
|
-
let chunks =
|
|
2622
|
+
let chunks = pages.flatMap((page) => chunkPage(page, this.config, scope));
|
|
2676
2623
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
2677
2624
|
if (typeof maxChunks === "number") {
|
|
2678
2625
|
chunks = chunks.slice(0, maxChunks);
|
|
@@ -2704,125 +2651,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2704
2651
|
});
|
|
2705
2652
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
2706
2653
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
2707
|
-
const
|
|
2708
|
-
|
|
2709
|
-
for (const chunk of changedChunks) {
|
|
2710
|
-
chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
|
|
2711
|
-
}
|
|
2712
|
-
const estimatedTokens = changedChunks.reduce(
|
|
2713
|
-
(sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
|
|
2714
|
-
0
|
|
2715
|
-
);
|
|
2716
|
-
const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
|
|
2717
|
-
const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
|
|
2718
|
-
let newEmbeddings = 0;
|
|
2719
|
-
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
2654
|
+
const upsertStart = stageStart();
|
|
2655
|
+
let documentsUpserted = 0;
|
|
2720
2656
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
2721
|
-
this.logger.info(`
|
|
2722
|
-
const
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
|
|
2734
|
-
|
|
2735
|
-
const embedding = embeddings[i];
|
|
2736
|
-
if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
|
|
2737
|
-
throw new SearchSocketError(
|
|
2738
|
-
"VECTOR_BACKEND_UNAVAILABLE",
|
|
2739
|
-
`Embedding provider returned an invalid vector for chunk index ${i}.`
|
|
2740
|
-
);
|
|
2741
|
-
}
|
|
2742
|
-
vectorsByChunk.set(chunk.chunkKey, embedding);
|
|
2743
|
-
newEmbeddings += 1;
|
|
2744
|
-
this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
|
|
2745
|
-
}
|
|
2746
|
-
}
|
|
2747
|
-
stageEnd("embedding", embedStart);
|
|
2748
|
-
if (changedChunks.length > 0) {
|
|
2749
|
-
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
2750
|
-
} else {
|
|
2751
|
-
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
2752
|
-
}
|
|
2753
|
-
const syncStart = stageStart();
|
|
2754
|
-
if (!options.dryRun) {
|
|
2755
|
-
this.logger.info("Syncing vectors...");
|
|
2756
|
-
const upserts = [];
|
|
2757
|
-
for (const chunk of changedChunks) {
|
|
2758
|
-
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
2759
|
-
if (!vector) {
|
|
2760
|
-
continue;
|
|
2761
|
-
}
|
|
2762
|
-
upserts.push({
|
|
2657
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
|
|
2658
|
+
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
2659
|
+
const FIELD_OVERHEAD = 200;
|
|
2660
|
+
const MAX_TEXT_CHARS = UPSTASH_CONTENT_LIMIT - FIELD_OVERHEAD;
|
|
2661
|
+
const docs = changedChunks.map((chunk) => {
|
|
2662
|
+
const title = chunk.title;
|
|
2663
|
+
const sectionTitle = chunk.sectionTitle ?? "";
|
|
2664
|
+
const url = chunk.url;
|
|
2665
|
+
const tags = chunk.tags.join(",");
|
|
2666
|
+
const headingPath = chunk.headingPath.join(" > ");
|
|
2667
|
+
const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
|
|
2668
|
+
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
2669
|
+
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
2670
|
+
return {
|
|
2763
2671
|
id: chunk.chunkKey,
|
|
2764
|
-
|
|
2672
|
+
content: { title, sectionTitle, text, url, tags, headingPath },
|
|
2765
2673
|
metadata: {
|
|
2766
2674
|
projectId: scope.projectId,
|
|
2767
2675
|
scopeName: scope.scopeName,
|
|
2768
|
-
url: chunk.url,
|
|
2769
2676
|
path: chunk.path,
|
|
2770
|
-
title: chunk.title,
|
|
2771
|
-
sectionTitle: chunk.sectionTitle ?? "",
|
|
2772
|
-
headingPath: chunk.headingPath,
|
|
2773
2677
|
snippet: chunk.snippet,
|
|
2774
|
-
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
2775
2678
|
ordinal: chunk.ordinal,
|
|
2776
2679
|
contentHash: chunk.contentHash,
|
|
2777
|
-
modelId: this.config.embeddings.model,
|
|
2778
2680
|
depth: chunk.depth,
|
|
2779
2681
|
incomingLinks: chunk.incomingLinks,
|
|
2780
2682
|
routeFile: chunk.routeFile,
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
keywords: chunk.keywords
|
|
2683
|
+
description: chunk.description ?? "",
|
|
2684
|
+
keywords: (chunk.keywords ?? []).join(",")
|
|
2784
2685
|
}
|
|
2785
|
-
}
|
|
2786
|
-
}
|
|
2787
|
-
if (upserts.length > 0) {
|
|
2788
|
-
await this.vectorStore.upsert(upserts, scope);
|
|
2789
|
-
this.logger.event("upserted", { count: upserts.length });
|
|
2790
|
-
}
|
|
2791
|
-
if (deletes.length > 0) {
|
|
2792
|
-
await this.vectorStore.deleteByIds(deletes, scope);
|
|
2793
|
-
this.logger.event("deleted", { count: deletes.length });
|
|
2794
|
-
}
|
|
2795
|
-
}
|
|
2796
|
-
stageEnd("sync", syncStart);
|
|
2797
|
-
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
2798
|
-
const finalizeStart = stageStart();
|
|
2799
|
-
if (!options.dryRun) {
|
|
2800
|
-
const scopeInfo = {
|
|
2801
|
-
projectId: scope.projectId,
|
|
2802
|
-
scopeName: scope.scopeName,
|
|
2803
|
-
modelId: this.config.embeddings.model,
|
|
2804
|
-
lastIndexedAt: nowIso(),
|
|
2805
|
-
vectorCount: chunks.length,
|
|
2806
|
-
lastEstimateTokens: estimatedTokens,
|
|
2807
|
-
lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
2808
|
-
lastEstimateChangedChunks: changedChunks.length
|
|
2809
|
-
};
|
|
2810
|
-
await this.vectorStore.recordScope(scopeInfo);
|
|
2811
|
-
this.logger.event("registry_updated", {
|
|
2812
|
-
scope: scope.scopeName,
|
|
2813
|
-
vectorCount: chunks.length
|
|
2686
|
+
};
|
|
2814
2687
|
});
|
|
2688
|
+
await this.store.upsertChunks(docs, scope);
|
|
2689
|
+
documentsUpserted = docs.length;
|
|
2690
|
+
this.logger.event("upserted", { count: docs.length });
|
|
2691
|
+
}
|
|
2692
|
+
if (!options.dryRun && deletes.length > 0) {
|
|
2693
|
+
await this.store.deleteByIds(deletes, scope);
|
|
2694
|
+
this.logger.event("deleted", { count: deletes.length });
|
|
2695
|
+
}
|
|
2696
|
+
stageEnd("upsert", upsertStart);
|
|
2697
|
+
if (changedChunks.length > 0) {
|
|
2698
|
+
this.logger.info(`Upserted ${documentsUpserted} document${documentsUpserted === 1 ? "" : "s"} (${stageTimingsMs["upsert"]}ms)`);
|
|
2699
|
+
} else {
|
|
2700
|
+
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
2815
2701
|
}
|
|
2816
|
-
stageEnd("finalize", finalizeStart);
|
|
2817
2702
|
this.logger.info("Done.");
|
|
2818
2703
|
return {
|
|
2819
|
-
pagesProcessed:
|
|
2704
|
+
pagesProcessed: pages.length,
|
|
2820
2705
|
chunksTotal: chunks.length,
|
|
2821
2706
|
chunksChanged: changedChunks.length,
|
|
2822
|
-
|
|
2707
|
+
documentsUpserted,
|
|
2823
2708
|
deletes: deletes.length,
|
|
2824
|
-
estimatedTokens,
|
|
2825
|
-
estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
2826
2709
|
routeExact,
|
|
2827
2710
|
routeBestEffort,
|
|
2828
2711
|
stageTimingsMs
|
|
@@ -2838,233 +2721,33 @@ import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js
|
|
|
2838
2721
|
import { z as z3 } from "zod";
|
|
2839
2722
|
|
|
2840
2723
|
// src/search/engine.ts
|
|
2841
|
-
import
|
|
2724
|
+
import path11 from "path";
|
|
2842
2725
|
import { z as z2 } from "zod";
|
|
2843
|
-
|
|
2844
|
-
// src/rerank/jina.ts
|
|
2845
|
-
function sleep2(ms) {
|
|
2846
|
-
return new Promise((resolve) => {
|
|
2847
|
-
setTimeout(resolve, ms);
|
|
2848
|
-
});
|
|
2849
|
-
}
|
|
2850
|
-
var JinaReranker = class {
|
|
2851
|
-
apiKey;
|
|
2852
|
-
model;
|
|
2853
|
-
maxRetries;
|
|
2854
|
-
constructor(options) {
|
|
2855
|
-
this.apiKey = options.apiKey;
|
|
2856
|
-
this.model = options.model;
|
|
2857
|
-
this.maxRetries = options.maxRetries ?? 2;
|
|
2858
|
-
}
|
|
2859
|
-
async rerank(query, candidates, topN) {
|
|
2860
|
-
if (candidates.length === 0) {
|
|
2861
|
-
return [];
|
|
2862
|
-
}
|
|
2863
|
-
const body = {
|
|
2864
|
-
model: this.model,
|
|
2865
|
-
query,
|
|
2866
|
-
documents: candidates.map((candidate) => candidate.text),
|
|
2867
|
-
top_n: topN ?? candidates.length,
|
|
2868
|
-
return_documents: false
|
|
2869
|
-
};
|
|
2870
|
-
let attempt = 0;
|
|
2871
|
-
while (attempt <= this.maxRetries) {
|
|
2872
|
-
attempt += 1;
|
|
2873
|
-
let response;
|
|
2874
|
-
try {
|
|
2875
|
-
response = await fetch("https://api.jina.ai/v1/rerank", {
|
|
2876
|
-
method: "POST",
|
|
2877
|
-
headers: {
|
|
2878
|
-
"content-type": "application/json",
|
|
2879
|
-
authorization: `Bearer ${this.apiKey}`
|
|
2880
|
-
},
|
|
2881
|
-
body: JSON.stringify(body)
|
|
2882
|
-
});
|
|
2883
|
-
} catch (error) {
|
|
2884
|
-
if (attempt <= this.maxRetries) {
|
|
2885
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
2886
|
-
continue;
|
|
2887
|
-
}
|
|
2888
|
-
throw error;
|
|
2889
|
-
}
|
|
2890
|
-
if (!response.ok) {
|
|
2891
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
2892
|
-
if (retryable && attempt <= this.maxRetries) {
|
|
2893
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
2894
|
-
continue;
|
|
2895
|
-
}
|
|
2896
|
-
const errorBody = await response.text();
|
|
2897
|
-
throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
|
|
2898
|
-
}
|
|
2899
|
-
const payload = await response.json();
|
|
2900
|
-
const rawResults = payload.results ?? payload.data ?? [];
|
|
2901
|
-
if (!Array.isArray(rawResults)) {
|
|
2902
|
-
throw new Error("Invalid Jina rerank response format");
|
|
2903
|
-
}
|
|
2904
|
-
return rawResults.flatMap((item) => {
|
|
2905
|
-
const index = item.index;
|
|
2906
|
-
if (typeof index !== "number" || index < 0 || index >= candidates.length) {
|
|
2907
|
-
return [];
|
|
2908
|
-
}
|
|
2909
|
-
const candidate = candidates[index];
|
|
2910
|
-
if (!candidate) {
|
|
2911
|
-
return [];
|
|
2912
|
-
}
|
|
2913
|
-
const score = typeof item.relevance_score === "number" ? item.relevance_score : item.score ?? 0;
|
|
2914
|
-
return [
|
|
2915
|
-
{
|
|
2916
|
-
id: candidate.id,
|
|
2917
|
-
score
|
|
2918
|
-
}
|
|
2919
|
-
];
|
|
2920
|
-
}).sort((a, b) => b.score - a.score);
|
|
2921
|
-
}
|
|
2922
|
-
throw new Error("Jina rerank request failed after retries");
|
|
2923
|
-
}
|
|
2924
|
-
};
|
|
2925
|
-
|
|
2926
|
-
// src/rerank/factory.ts
|
|
2927
|
-
function createReranker(config) {
|
|
2928
|
-
if (!config.rerank.enabled) {
|
|
2929
|
-
return null;
|
|
2930
|
-
}
|
|
2931
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
2932
|
-
if (!apiKey) {
|
|
2933
|
-
return null;
|
|
2934
|
-
}
|
|
2935
|
-
return new JinaReranker({
|
|
2936
|
-
apiKey,
|
|
2937
|
-
model: config.rerank.model
|
|
2938
|
-
});
|
|
2939
|
-
}
|
|
2940
|
-
|
|
2941
|
-
// src/search/ranking.ts
|
|
2942
|
-
function nonNegativeOrZero(value) {
|
|
2943
|
-
if (!Number.isFinite(value)) {
|
|
2944
|
-
return 0;
|
|
2945
|
-
}
|
|
2946
|
-
return Math.max(0, value);
|
|
2947
|
-
}
|
|
2948
|
-
function rankHits(hits, config) {
|
|
2949
|
-
return hits.map((hit) => {
|
|
2950
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
2951
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
2952
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
2953
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
2954
|
-
}
|
|
2955
|
-
if (config.ranking.enableDepthBoost) {
|
|
2956
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
2957
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
2958
|
-
}
|
|
2959
|
-
return {
|
|
2960
|
-
hit,
|
|
2961
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
2962
|
-
};
|
|
2963
|
-
}).sort((a, b) => {
|
|
2964
|
-
const delta = b.finalScore - a.finalScore;
|
|
2965
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
2966
|
-
});
|
|
2967
|
-
}
|
|
2968
|
-
function findPageWeight(url, pageWeights) {
|
|
2969
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
2970
|
-
const normalizedUrl = norm(url);
|
|
2971
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
2972
|
-
if (norm(pattern) === normalizedUrl) {
|
|
2973
|
-
return weight;
|
|
2974
|
-
}
|
|
2975
|
-
}
|
|
2976
|
-
let bestPrefix = "";
|
|
2977
|
-
let bestWeight = 1;
|
|
2978
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
2979
|
-
const normalizedPattern = norm(pattern);
|
|
2980
|
-
if (normalizedPattern === "/") continue;
|
|
2981
|
-
const prefix = `${normalizedPattern}/`;
|
|
2982
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
2983
|
-
bestPrefix = prefix;
|
|
2984
|
-
bestWeight = weight;
|
|
2985
|
-
}
|
|
2986
|
-
}
|
|
2987
|
-
return bestWeight;
|
|
2988
|
-
}
|
|
2989
|
-
function aggregateByPage(ranked, config) {
|
|
2990
|
-
const groups = /* @__PURE__ */ new Map();
|
|
2991
|
-
for (const hit of ranked) {
|
|
2992
|
-
const url = hit.hit.metadata.url;
|
|
2993
|
-
const group = groups.get(url);
|
|
2994
|
-
if (group) group.push(hit);
|
|
2995
|
-
else groups.set(url, [hit]);
|
|
2996
|
-
}
|
|
2997
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
2998
|
-
const pages = [];
|
|
2999
|
-
for (const [url, chunks] of groups) {
|
|
3000
|
-
chunks.sort((a, b) => {
|
|
3001
|
-
const delta = b.finalScore - a.finalScore;
|
|
3002
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3003
|
-
});
|
|
3004
|
-
const best = chunks[0];
|
|
3005
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
3006
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
3007
|
-
let aggregationBonus = 0;
|
|
3008
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
3009
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
3010
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
3011
|
-
}
|
|
3012
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
3013
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
3014
|
-
if (pageWeight === 0) continue;
|
|
3015
|
-
if (pageWeight !== 1) {
|
|
3016
|
-
pageScore *= pageWeight;
|
|
3017
|
-
}
|
|
3018
|
-
pages.push({
|
|
3019
|
-
url,
|
|
3020
|
-
title: best.hit.metadata.title,
|
|
3021
|
-
routeFile: best.hit.metadata.routeFile,
|
|
3022
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
3023
|
-
bestChunk: best,
|
|
3024
|
-
matchingChunks: chunks
|
|
3025
|
-
});
|
|
3026
|
-
}
|
|
3027
|
-
return pages.sort((a, b) => {
|
|
3028
|
-
const delta = b.pageScore - a.pageScore;
|
|
3029
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3030
|
-
});
|
|
3031
|
-
}
|
|
3032
|
-
|
|
3033
|
-
// src/search/engine.ts
|
|
3034
2726
|
var requestSchema = z2.object({
|
|
3035
2727
|
q: z2.string().trim().min(1),
|
|
3036
2728
|
topK: z2.number().int().positive().max(100).optional(),
|
|
3037
2729
|
scope: z2.string().optional(),
|
|
3038
2730
|
pathPrefix: z2.string().optional(),
|
|
3039
2731
|
tags: z2.array(z2.string()).optional(),
|
|
3040
|
-
rerank: z2.boolean().optional(),
|
|
3041
2732
|
groupBy: z2.enum(["page", "chunk"]).optional()
|
|
3042
2733
|
});
|
|
3043
2734
|
var SearchEngine = class _SearchEngine {
|
|
3044
2735
|
cwd;
|
|
3045
2736
|
config;
|
|
3046
|
-
|
|
3047
|
-
vectorStore;
|
|
3048
|
-
reranker;
|
|
2737
|
+
store;
|
|
3049
2738
|
constructor(options) {
|
|
3050
2739
|
this.cwd = options.cwd;
|
|
3051
2740
|
this.config = options.config;
|
|
3052
|
-
this.
|
|
3053
|
-
this.vectorStore = options.vectorStore;
|
|
3054
|
-
this.reranker = options.reranker;
|
|
2741
|
+
this.store = options.store;
|
|
3055
2742
|
}
|
|
3056
2743
|
static async create(options = {}) {
|
|
3057
|
-
const cwd =
|
|
2744
|
+
const cwd = path11.resolve(options.cwd ?? process.cwd());
|
|
3058
2745
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
3059
|
-
const
|
|
3060
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
3061
|
-
const reranker = options.reranker === void 0 ? createReranker(config) : options.reranker;
|
|
2746
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
3062
2747
|
return new _SearchEngine({
|
|
3063
2748
|
cwd,
|
|
3064
2749
|
config,
|
|
3065
|
-
|
|
3066
|
-
vectorStore,
|
|
3067
|
-
reranker
|
|
2750
|
+
store
|
|
3068
2751
|
});
|
|
3069
2752
|
}
|
|
3070
2753
|
getConfig() {
|
|
@@ -3078,99 +2761,130 @@ var SearchEngine = class _SearchEngine {
|
|
|
3078
2761
|
const input = parsed.data;
|
|
3079
2762
|
const totalStart = process.hrtime.bigint();
|
|
3080
2763
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
3081
|
-
await this.assertModelCompatibility(resolvedScope);
|
|
3082
2764
|
const topK = input.topK ?? 10;
|
|
3083
|
-
const wantsRerank = Boolean(input.rerank);
|
|
3084
2765
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
3085
2766
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
3086
|
-
const
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
|
|
3090
|
-
|
|
3091
|
-
|
|
3092
|
-
|
|
3093
|
-
|
|
3094
|
-
|
|
3095
|
-
queryVector,
|
|
3096
|
-
{
|
|
3097
|
-
topK: candidateK,
|
|
3098
|
-
pathPrefix: input.pathPrefix,
|
|
3099
|
-
tags: input.tags
|
|
3100
|
-
},
|
|
3101
|
-
resolvedScope
|
|
3102
|
-
);
|
|
3103
|
-
const vectorMs = hrTimeMs(vectorStart);
|
|
3104
|
-
const ranked = rankHits(hits, this.config);
|
|
3105
|
-
let usedRerank = false;
|
|
3106
|
-
let rerankMs = 0;
|
|
3107
|
-
let ordered = ranked;
|
|
3108
|
-
if (wantsRerank) {
|
|
3109
|
-
const rerankStart = process.hrtime.bigint();
|
|
3110
|
-
ordered = await this.rerankHits(input.q, ranked, topK);
|
|
3111
|
-
rerankMs = hrTimeMs(rerankStart);
|
|
3112
|
-
usedRerank = true;
|
|
2767
|
+
const filterParts = [];
|
|
2768
|
+
if (input.pathPrefix) {
|
|
2769
|
+
const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
|
|
2770
|
+
filterParts.push(`url GLOB '${prefix}*'`);
|
|
2771
|
+
}
|
|
2772
|
+
if (input.tags && input.tags.length > 0) {
|
|
2773
|
+
for (const tag of input.tags) {
|
|
2774
|
+
filterParts.push(`tags GLOB '*${tag}*'`);
|
|
2775
|
+
}
|
|
3113
2776
|
}
|
|
3114
|
-
|
|
3115
|
-
const
|
|
2777
|
+
const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
|
|
2778
|
+
const useDualSearch = this.config.search.dualSearch && groupByPage;
|
|
2779
|
+
const searchStart = process.hrtime.bigint();
|
|
2780
|
+
let ranked;
|
|
2781
|
+
if (useDualSearch) {
|
|
2782
|
+
const chunkLimit = Math.max(topK * 10, 100);
|
|
2783
|
+
const pageLimit = 20;
|
|
2784
|
+
const [pageHits, chunkHits] = await Promise.all([
|
|
2785
|
+
this.store.searchPages(
|
|
2786
|
+
input.q,
|
|
2787
|
+
{
|
|
2788
|
+
limit: pageLimit,
|
|
2789
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
2790
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
2791
|
+
filter
|
|
2792
|
+
},
|
|
2793
|
+
resolvedScope
|
|
2794
|
+
),
|
|
2795
|
+
this.store.search(
|
|
2796
|
+
input.q,
|
|
2797
|
+
{
|
|
2798
|
+
limit: chunkLimit,
|
|
2799
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
2800
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
2801
|
+
reranking: false,
|
|
2802
|
+
filter
|
|
2803
|
+
},
|
|
2804
|
+
resolvedScope
|
|
2805
|
+
)
|
|
2806
|
+
]);
|
|
2807
|
+
const rankedChunks = rankHits(chunkHits, this.config, input.q);
|
|
2808
|
+
ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
|
|
2809
|
+
} else {
|
|
2810
|
+
const hits = await this.store.search(
|
|
2811
|
+
input.q,
|
|
2812
|
+
{
|
|
2813
|
+
limit: candidateK,
|
|
2814
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
2815
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
2816
|
+
reranking: this.config.search.reranking,
|
|
2817
|
+
filter
|
|
2818
|
+
},
|
|
2819
|
+
resolvedScope
|
|
2820
|
+
);
|
|
2821
|
+
ranked = rankHits(hits, this.config, input.q);
|
|
2822
|
+
}
|
|
2823
|
+
const searchMs = hrTimeMs(searchStart);
|
|
2824
|
+
const results = this.buildResults(ranked, topK, groupByPage, input.q);
|
|
2825
|
+
return {
|
|
2826
|
+
q: input.q,
|
|
2827
|
+
scope: resolvedScope.scopeName,
|
|
2828
|
+
results,
|
|
2829
|
+
meta: {
|
|
2830
|
+
timingsMs: {
|
|
2831
|
+
search: Math.round(searchMs),
|
|
2832
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
2833
|
+
}
|
|
2834
|
+
}
|
|
2835
|
+
};
|
|
2836
|
+
}
|
|
2837
|
+
ensureSnippet(hit) {
|
|
2838
|
+
const snippet = hit.hit.metadata.snippet;
|
|
2839
|
+
if (snippet && snippet.length >= 30) return snippet;
|
|
2840
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
2841
|
+
if (chunkText) return toSnippet(chunkText);
|
|
2842
|
+
return snippet || "";
|
|
2843
|
+
}
|
|
2844
|
+
buildResults(ordered, topK, groupByPage, _query) {
|
|
3116
2845
|
if (groupByPage) {
|
|
3117
2846
|
let pages = aggregateByPage(ordered, this.config);
|
|
3118
|
-
|
|
3119
|
-
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3120
|
-
}
|
|
2847
|
+
pages = trimByScoreGap(pages, this.config);
|
|
3121
2848
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
3122
|
-
|
|
2849
|
+
return pages.slice(0, topK).map((page) => {
|
|
3123
2850
|
const bestScore = page.bestChunk.finalScore;
|
|
3124
|
-
const
|
|
3125
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
2851
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
2852
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
3126
2853
|
return {
|
|
3127
2854
|
url: page.url,
|
|
3128
2855
|
title: page.title,
|
|
3129
2856
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
3130
|
-
snippet: page.bestChunk
|
|
2857
|
+
snippet: this.ensureSnippet(page.bestChunk),
|
|
3131
2858
|
score: Number(page.pageScore.toFixed(6)),
|
|
3132
2859
|
routeFile: page.routeFile,
|
|
3133
2860
|
chunks: meaningful.length > 1 ? meaningful.map((c) => ({
|
|
3134
2861
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
3135
|
-
snippet: c
|
|
2862
|
+
snippet: this.ensureSnippet(c),
|
|
3136
2863
|
headingPath: c.hit.metadata.headingPath,
|
|
3137
2864
|
score: Number(c.finalScore.toFixed(6))
|
|
3138
2865
|
})) : void 0
|
|
3139
2866
|
};
|
|
3140
2867
|
});
|
|
3141
2868
|
} else {
|
|
2869
|
+
let filtered = ordered;
|
|
2870
|
+
const minScore = this.config.ranking.minScore;
|
|
3142
2871
|
if (minScore > 0) {
|
|
3143
|
-
|
|
2872
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3144
2873
|
}
|
|
3145
|
-
|
|
2874
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
3146
2875
|
url: hit.metadata.url,
|
|
3147
2876
|
title: hit.metadata.title,
|
|
3148
2877
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
3149
|
-
snippet: hit
|
|
2878
|
+
snippet: this.ensureSnippet({ hit, finalScore }),
|
|
3150
2879
|
score: Number(finalScore.toFixed(6)),
|
|
3151
2880
|
routeFile: hit.metadata.routeFile
|
|
3152
2881
|
}));
|
|
3153
2882
|
}
|
|
3154
|
-
return {
|
|
3155
|
-
q: input.q,
|
|
3156
|
-
scope: resolvedScope.scopeName,
|
|
3157
|
-
results,
|
|
3158
|
-
meta: {
|
|
3159
|
-
timingsMs: {
|
|
3160
|
-
embed: Math.round(embedMs),
|
|
3161
|
-
vector: Math.round(vectorMs),
|
|
3162
|
-
rerank: Math.round(rerankMs),
|
|
3163
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
3164
|
-
},
|
|
3165
|
-
usedRerank,
|
|
3166
|
-
modelId: this.config.embeddings.model
|
|
3167
|
-
}
|
|
3168
|
-
};
|
|
3169
2883
|
}
|
|
3170
2884
|
async getPage(pathOrUrl, scope) {
|
|
3171
2885
|
const resolvedScope = resolveScope(this.config, scope);
|
|
3172
2886
|
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
3173
|
-
const page = await this.
|
|
2887
|
+
const page = await this.store.getPage(urlPath, resolvedScope);
|
|
3174
2888
|
if (!page) {
|
|
3175
2889
|
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
3176
2890
|
}
|
|
@@ -3191,7 +2905,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3191
2905
|
};
|
|
3192
2906
|
}
|
|
3193
2907
|
async health() {
|
|
3194
|
-
return this.
|
|
2908
|
+
return this.store.health();
|
|
3195
2909
|
}
|
|
3196
2910
|
resolveInputPath(pathOrUrl) {
|
|
3197
2911
|
try {
|
|
@@ -3203,90 +2917,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
3203
2917
|
const withoutQueryOrHash = pathOrUrl.split(/[?#]/)[0] ?? pathOrUrl;
|
|
3204
2918
|
return normalizeUrlPath(withoutQueryOrHash);
|
|
3205
2919
|
}
|
|
3206
|
-
async assertModelCompatibility(scope) {
|
|
3207
|
-
const modelId = await this.vectorStore.getScopeModelId(scope);
|
|
3208
|
-
if (modelId && modelId !== this.config.embeddings.model) {
|
|
3209
|
-
throw new SearchSocketError(
|
|
3210
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
3211
|
-
`Scope ${scope.scopeName} was indexed with ${modelId}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
|
|
3212
|
-
);
|
|
3213
|
-
}
|
|
3214
|
-
}
|
|
3215
|
-
async rerankHits(query, ranked, topK) {
|
|
3216
|
-
if (!this.config.rerank.enabled) {
|
|
3217
|
-
throw new SearchSocketError(
|
|
3218
|
-
"INVALID_REQUEST",
|
|
3219
|
-
"rerank=true requested but rerank.enabled is not set to true.",
|
|
3220
|
-
400
|
|
3221
|
-
);
|
|
3222
|
-
}
|
|
3223
|
-
if (!this.reranker) {
|
|
3224
|
-
throw new SearchSocketError(
|
|
3225
|
-
"CONFIG_MISSING",
|
|
3226
|
-
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
3227
|
-
400
|
|
3228
|
-
);
|
|
3229
|
-
}
|
|
3230
|
-
const pageGroups = /* @__PURE__ */ new Map();
|
|
3231
|
-
for (const entry of ranked) {
|
|
3232
|
-
const url = entry.hit.metadata.url;
|
|
3233
|
-
const group = pageGroups.get(url);
|
|
3234
|
-
if (group) group.push(entry);
|
|
3235
|
-
else pageGroups.set(url, [entry]);
|
|
3236
|
-
}
|
|
3237
|
-
const MAX_CHUNKS_PER_PAGE = 5;
|
|
3238
|
-
const MIN_CHUNKS_PER_PAGE = 1;
|
|
3239
|
-
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
3240
|
-
const MAX_DOC_CHARS = 2e3;
|
|
3241
|
-
const pageCandidates = [];
|
|
3242
|
-
for (const [url, chunks] of pageGroups) {
|
|
3243
|
-
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
3244
|
-
const bestScore = byScore[0].finalScore;
|
|
3245
|
-
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
3246
|
-
const selected = byScore.filter(
|
|
3247
|
-
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
3248
|
-
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
3249
|
-
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
3250
|
-
const first = selected[0].hit.metadata;
|
|
3251
|
-
const parts = [first.title];
|
|
3252
|
-
if (first.description) {
|
|
3253
|
-
parts.push(first.description);
|
|
3254
|
-
}
|
|
3255
|
-
if (first.keywords && first.keywords.length > 0) {
|
|
3256
|
-
parts.push(first.keywords.join(", "));
|
|
3257
|
-
}
|
|
3258
|
-
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3259
|
-
parts.push(body);
|
|
3260
|
-
let text = parts.join("\n\n");
|
|
3261
|
-
if (text.length > MAX_DOC_CHARS) {
|
|
3262
|
-
text = text.slice(0, MAX_DOC_CHARS);
|
|
3263
|
-
}
|
|
3264
|
-
pageCandidates.push({ id: url, text });
|
|
3265
|
-
}
|
|
3266
|
-
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
3267
|
-
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
3268
|
-
const reranked = await this.reranker.rerank(
|
|
3269
|
-
query,
|
|
3270
|
-
cappedCandidates,
|
|
3271
|
-
maxCandidates
|
|
3272
|
-
);
|
|
3273
|
-
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3274
|
-
return ranked.map((entry) => {
|
|
3275
|
-
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
3276
|
-
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
3277
|
-
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
3278
|
-
return { ...entry, finalScore: base };
|
|
3279
|
-
}
|
|
3280
|
-
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
3281
|
-
return {
|
|
3282
|
-
...entry,
|
|
3283
|
-
finalScore: Number.isFinite(combined) ? combined : base
|
|
3284
|
-
};
|
|
3285
|
-
}).sort((a, b) => {
|
|
3286
|
-
const delta = b.finalScore - a.finalScore;
|
|
3287
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3288
|
-
});
|
|
3289
|
-
}
|
|
3290
2920
|
};
|
|
3291
2921
|
|
|
3292
2922
|
// src/mcp/server.ts
|
|
@@ -3298,7 +2928,7 @@ function createServer(engine) {
|
|
|
3298
2928
|
server.registerTool(
|
|
3299
2929
|
"search",
|
|
3300
2930
|
{
|
|
3301
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
2931
|
+
description: "Semantic site search powered by Upstash Search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and groupBy.",
|
|
3302
2932
|
inputSchema: {
|
|
3303
2933
|
query: z3.string().min(1),
|
|
3304
2934
|
scope: z3.string().optional(),
|
|
@@ -3485,9 +3115,6 @@ function parseDurationMs(value) {
|
|
|
3485
3115
|
throw new SearchSocketError("INVALID_REQUEST", `Unsupported duration unit: ${unit}`, 400);
|
|
3486
3116
|
}
|
|
3487
3117
|
}
|
|
3488
|
-
function formatUsd(value) {
|
|
3489
|
-
return `$${value.toFixed(6)}`;
|
|
3490
|
-
}
|
|
3491
3118
|
function printIndexSummary(stats) {
|
|
3492
3119
|
process.stdout.write(`pages processed: ${stats.pagesProcessed}
|
|
3493
3120
|
`);
|
|
@@ -3495,13 +3122,9 @@ function printIndexSummary(stats) {
|
|
|
3495
3122
|
`);
|
|
3496
3123
|
process.stdout.write(`chunks changed: ${stats.chunksChanged}
|
|
3497
3124
|
`);
|
|
3498
|
-
process.stdout.write(`
|
|
3125
|
+
process.stdout.write(`documents upserted: ${stats.documentsUpserted}
|
|
3499
3126
|
`);
|
|
3500
3127
|
process.stdout.write(`deletes: ${stats.deletes}
|
|
3501
|
-
`);
|
|
3502
|
-
process.stdout.write(`estimated tokens: ${stats.estimatedTokens}
|
|
3503
|
-
`);
|
|
3504
|
-
process.stdout.write(`estimated cost (USD): ${formatUsd(stats.estimatedCostUSD)}
|
|
3505
3128
|
`);
|
|
3506
3129
|
process.stdout.write(`route mapping: ${stats.routeExact} exact, ${stats.routeBestEffort} best-effort
|
|
3507
3130
|
`);
|
|
@@ -3515,7 +3138,7 @@ function collectWatchPaths(config, cwd) {
|
|
|
3515
3138
|
const paths = ["src/routes/**"];
|
|
3516
3139
|
if (config.source.mode === "content-files" && config.source.contentFiles) {
|
|
3517
3140
|
for (const pattern of config.source.contentFiles.globs) {
|
|
3518
|
-
paths.push(
|
|
3141
|
+
paths.push(path12.join(config.source.contentFiles.baseDir, pattern));
|
|
3519
3142
|
}
|
|
3520
3143
|
}
|
|
3521
3144
|
if (config.source.mode === "static-output") {
|
|
@@ -3528,25 +3151,22 @@ function collectWatchPaths(config, cwd) {
|
|
|
3528
3151
|
paths.push("searchsocket.config.ts");
|
|
3529
3152
|
paths.push(config.source.build.outputDir);
|
|
3530
3153
|
}
|
|
3531
|
-
return paths.map((value) =>
|
|
3154
|
+
return paths.map((value) => path12.resolve(cwd, value));
|
|
3532
3155
|
}
|
|
3533
3156
|
/**
 * Ensures the SearchSocket state directory exists and returns its path.
 *
 * The directory name used to be hard-coded; it is now an optional parameter
 * so callers that know the configured state directory can pass it through.
 * Omitting it keeps the previous behaviour.
 *
 * @param {string} cwd - project root the state directory lives under.
 * @param {string} [stateDir=".searchsocket"] - directory name/path joined onto `cwd`.
 * @returns {string} path of the state directory (`path.join(cwd, stateDir)`).
 */
function ensureStateDir(cwd, stateDir = ".searchsocket") {
  const target = path12.join(cwd, stateDir);
  // `recursive: true` creates missing parents and tolerates an existing directory.
  fs8.mkdirSync(target, { recursive: true });
  return target;
}
|
|
3538
3161
|
function ensureGitignore(cwd) {
|
|
3539
|
-
const gitignorePath =
|
|
3162
|
+
const gitignorePath = path12.join(cwd, ".gitignore");
|
|
3540
3163
|
const entries = [
|
|
3541
|
-
".searchsocket/vectors.db",
|
|
3542
|
-
".searchsocket/vectors.db-shm",
|
|
3543
|
-
".searchsocket/vectors.db-wal",
|
|
3544
3164
|
".searchsocket/manifest.json",
|
|
3545
3165
|
".searchsocket/registry.json"
|
|
3546
3166
|
];
|
|
3547
3167
|
let content = "";
|
|
3548
|
-
if (
|
|
3549
|
-
content =
|
|
3168
|
+
if (fs8.existsSync(gitignorePath)) {
|
|
3169
|
+
content = fs8.readFileSync(gitignorePath, "utf8");
|
|
3550
3170
|
}
|
|
3551
3171
|
const lines = content.split("\n");
|
|
3552
3172
|
const missing = entries.filter((entry) => !lines.some((line) => line.trim() === entry));
|
|
@@ -3557,10 +3177,10 @@ function ensureGitignore(cwd) {
|
|
|
3557
3177
|
# SearchSocket local state
|
|
3558
3178
|
${missing.join("\n")}
|
|
3559
3179
|
`;
|
|
3560
|
-
|
|
3180
|
+
fs8.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
|
|
3561
3181
|
}
|
|
3562
3182
|
function readScopesFromFile(filePath) {
|
|
3563
|
-
const raw =
|
|
3183
|
+
const raw = fs8.readFileSync(filePath, "utf8");
|
|
3564
3184
|
return new Set(
|
|
3565
3185
|
raw.split(/\r?\n/).map((line) => line.trim()).filter(Boolean)
|
|
3566
3186
|
);
|
|
@@ -3584,8 +3204,8 @@ function readRemoteGitBranches(cwd) {
|
|
|
3584
3204
|
}
|
|
3585
3205
|
}
|
|
3586
3206
|
async function loadResolvedConfigForDev(cwd, configPath) {
|
|
3587
|
-
const resolvedConfigPath =
|
|
3588
|
-
if (
|
|
3207
|
+
const resolvedConfigPath = path12.resolve(cwd, configPath ?? "searchsocket.config.ts");
|
|
3208
|
+
if (fs8.existsSync(resolvedConfigPath)) {
|
|
3589
3209
|
return loadConfig({ cwd, configPath });
|
|
3590
3210
|
}
|
|
3591
3211
|
return mergeConfig(cwd, {});
|
|
@@ -3632,7 +3252,7 @@ var program = new Command();
|
|
|
3632
3252
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
3633
3253
|
program.command("init").description("Create searchsocket.config.ts and .searchsocket state directory").action(async (_opts, command) => {
|
|
3634
3254
|
const root = getRootOptions(command).cwd ?? process.cwd();
|
|
3635
|
-
const cwd =
|
|
3255
|
+
const cwd = path12.resolve(root);
|
|
3636
3256
|
const configPath = writeMinimalConfig(cwd);
|
|
3637
3257
|
const stateDir = ensureStateDir(cwd);
|
|
3638
3258
|
ensureGitignore(cwd);
|
|
@@ -3650,15 +3270,15 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3650
3270
|
process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
|
|
3651
3271
|
process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
|
|
3652
3272
|
});
|
|
3653
|
-
program.command("index").description("Index site content into
|
|
3273
|
+
program.command("index").description("Index site content into Upstash Search").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full rebuild", false).option("--dry-run", "compute plan, no writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3654
3274
|
const rootOpts = getRootOptions(command);
|
|
3655
|
-
const cwd =
|
|
3275
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3656
3276
|
await runIndexCommand({
|
|
3657
3277
|
cwd,
|
|
3658
3278
|
configPath: rootOpts?.config,
|
|
3659
3279
|
scope: opts.scope,
|
|
3660
3280
|
changedOnly: opts.changedOnly,
|
|
3661
|
-
force: opts.force,
|
|
3281
|
+
force: opts.force || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
3662
3282
|
dryRun: opts.dryRun,
|
|
3663
3283
|
source: opts.source,
|
|
3664
3284
|
maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
|
|
@@ -3668,16 +3288,16 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3668
3288
|
json: opts.json
|
|
3669
3289
|
});
|
|
3670
3290
|
});
|
|
3671
|
-
program.command("status").description("Show scope, indexing state, backend health
|
|
3291
|
+
program.command("status").description("Show scope, indexing state, and backend health").option("--scope <name>", "scope override").action(async (opts, command) => {
|
|
3672
3292
|
const rootOpts = getRootOptions(command);
|
|
3673
|
-
const cwd =
|
|
3293
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3674
3294
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3675
3295
|
const scope = resolveScope(config, opts.scope);
|
|
3676
|
-
let
|
|
3296
|
+
let store;
|
|
3677
3297
|
let health = { ok: false, details: "not checked" };
|
|
3678
3298
|
try {
|
|
3679
|
-
|
|
3680
|
-
health = await
|
|
3299
|
+
store = await createUpstashStore(config);
|
|
3300
|
+
health = await store.health();
|
|
3681
3301
|
} catch (error) {
|
|
3682
3302
|
health = {
|
|
3683
3303
|
ok: false,
|
|
@@ -3685,24 +3305,22 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3685
3305
|
};
|
|
3686
3306
|
process.stdout.write(`project: ${config.project.id}
|
|
3687
3307
|
`);
|
|
3688
|
-
process.stdout.write(`
|
|
3308
|
+
process.stdout.write(`backend health: error (${health.details})
|
|
3689
3309
|
`);
|
|
3690
3310
|
process.exitCode = 1;
|
|
3691
3311
|
return;
|
|
3692
3312
|
}
|
|
3693
3313
|
let scopeRegistry = [];
|
|
3694
3314
|
let scopeInfo;
|
|
3695
|
-
let hashes = /* @__PURE__ */ new Map();
|
|
3696
3315
|
try {
|
|
3697
|
-
scopeRegistry = await
|
|
3316
|
+
scopeRegistry = await store.listScopes(config.project.id);
|
|
3698
3317
|
scopeInfo = scopeRegistry.find((entry) => entry.scopeName === scope.scopeName);
|
|
3699
|
-
hashes = await vectorStore.getContentHashes(scope);
|
|
3700
3318
|
} catch (error) {
|
|
3701
3319
|
process.stdout.write(`project: ${config.project.id}
|
|
3702
3320
|
`);
|
|
3703
3321
|
process.stdout.write(`resolved scope: ${scope.scopeName}
|
|
3704
3322
|
`);
|
|
3705
|
-
process.stdout.write(`
|
|
3323
|
+
process.stdout.write(`backend health: error (${error instanceof Error ? error.message : "unknown error"})
|
|
3706
3324
|
`);
|
|
3707
3325
|
process.exitCode = 1;
|
|
3708
3326
|
return;
|
|
@@ -3711,25 +3329,15 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3711
3329
|
`);
|
|
3712
3330
|
process.stdout.write(`resolved scope: ${scope.scopeName}
|
|
3713
3331
|
`);
|
|
3714
|
-
process.stdout.write(`
|
|
3715
|
-
`);
|
|
3716
|
-
const tursoUrl = process.env[config.vector.turso.urlEnv];
|
|
3717
|
-
const vectorMode = tursoUrl ? `remote (${tursoUrl})` : `local (${config.vector.turso.localPath})`;
|
|
3718
|
-
process.stdout.write(`vector backend: turso/libsql (${vectorMode})
|
|
3332
|
+
process.stdout.write(`backend: upstash-search
|
|
3719
3333
|
`);
|
|
3720
|
-
process.stdout.write(`
|
|
3334
|
+
process.stdout.write(`backend health: ${health.ok ? "ok" : `error (${health.details ?? "n/a"})`}
|
|
3721
3335
|
`);
|
|
3722
3336
|
if (scopeInfo) {
|
|
3723
3337
|
process.stdout.write(`last indexed (${scope.scopeName}): ${scopeInfo.lastIndexedAt ?? "never"}
|
|
3724
3338
|
`);
|
|
3725
|
-
|
|
3726
|
-
`
|
|
3727
|
-
if (scopeInfo.lastEstimateTokens != null) {
|
|
3728
|
-
process.stdout.write(`last estimated tokens: ${scopeInfo.lastEstimateTokens}
|
|
3729
|
-
`);
|
|
3730
|
-
}
|
|
3731
|
-
if (scopeInfo.lastEstimateCostUSD != null) {
|
|
3732
|
-
process.stdout.write(`last estimated cost: ${formatUsd(scopeInfo.lastEstimateCostUSD)}
|
|
3339
|
+
if (scopeInfo.documentCount != null) {
|
|
3340
|
+
process.stdout.write(`documents: ${scopeInfo.documentCount}
|
|
3733
3341
|
`);
|
|
3734
3342
|
}
|
|
3735
3343
|
} else {
|
|
@@ -3740,7 +3348,7 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3740
3348
|
process.stdout.write("\nregistry scopes:\n");
|
|
3741
3349
|
for (const item of scopeRegistry) {
|
|
3742
3350
|
process.stdout.write(
|
|
3743
|
-
` - ${item.scopeName}
|
|
3351
|
+
` - ${item.scopeName} lastIndexedAt=${item.lastIndexedAt} documents=${item.documentCount ?? "unknown"}
|
|
3744
3352
|
`
|
|
3745
3353
|
);
|
|
3746
3354
|
}
|
|
@@ -3748,7 +3356,7 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3748
3356
|
});
|
|
3749
3357
|
program.command("dev").description("Watch content files/routes and incrementally reindex on changes").option("--scope <name>", "scope override").option("--mcp", "start MCP server (http transport) alongside watcher", false).option("--mcp-port <n>", "MCP HTTP port", "3338").option("--mcp-path <path>", "MCP HTTP path", "/mcp").option("--verbose", "verbose logs", false).action(async (opts, command) => {
|
|
3750
3358
|
const rootOpts = getRootOptions(command);
|
|
3751
|
-
const cwd =
|
|
3359
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3752
3360
|
const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
|
|
3753
3361
|
const watchPaths = collectWatchPaths(config, cwd);
|
|
3754
3362
|
process.stdout.write("starting searchsocket dev watcher...\n");
|
|
@@ -3815,45 +3423,44 @@ ${watchPaths.map((entry) => ` - ${entry}`).join("\n")}
|
|
|
3815
3423
|
});
|
|
3816
3424
|
});
|
|
3817
3425
|
});
|
|
3818
|
-
program.command("clean").description("Delete local state and optionally delete remote
|
|
3426
|
+
// `clean` command: removes the local state directory and, with --remote,
// drops the project's remote indexes.
// NOTE(review): `--scope` is declared but never read in this handler, and
// `--remote` drops every index for the project rather than a single scope —
// confirm that this matches the command description.
program
  .command("clean")
  .description("Delete local state and optionally delete remote indexes for a scope")
  .option("--scope <name>", "scope override")
  .option("--remote", "delete remote scope indexes", false)
  .action(async (opts, command) => {
    const rootOpts = getRootOptions(command);
    const workingDir = rootOpts?.cwd ?? process.cwd();
    const cwd = path12.resolve(workingDir);
    const config = await loadConfig({ cwd, configPath: rootOpts?.config });

    // Local state is always removed; `force: true` ignores a missing directory.
    const statePath = path12.join(cwd, config.state.dir);
    await fsp.rm(statePath, { recursive: true, force: true });
    process.stdout.write(`deleted local state directory: ${statePath}\n`);

    if (!opts.remote) {
      return;
    }

    const store = await createUpstashStore(config);
    await store.dropAllIndexes(config.project.id);
    process.stdout.write(`dropped all remote indexes for project ${config.project.id}\n`);
  });
|
|
3834
3441
|
program.command("prune").description("List/delete stale scopes (dry-run by default)").option("--apply", "apply deletions", false).option("--scopes-file <path>", "file containing active scopes").option("--older-than <duration>", "ttl cutoff like 30d").action(async (opts, command) => {
|
|
3835
3442
|
const rootOpts = getRootOptions(command);
|
|
3836
|
-
const cwd =
|
|
3443
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3837
3444
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3838
3445
|
const baseScope = resolveScope(config);
|
|
3839
|
-
let
|
|
3446
|
+
let store;
|
|
3840
3447
|
let scopes;
|
|
3841
3448
|
try {
|
|
3842
|
-
|
|
3843
|
-
scopes = await
|
|
3449
|
+
store = await createUpstashStore(config);
|
|
3450
|
+
scopes = await store.listScopes(config.project.id);
|
|
3844
3451
|
} catch (error) {
|
|
3845
3452
|
process.stderr.write(
|
|
3846
|
-
`error: failed to access
|
|
3453
|
+
`error: failed to access Upstash Search: ${error instanceof Error ? error.message : String(error)}
|
|
3847
3454
|
`
|
|
3848
3455
|
);
|
|
3849
3456
|
process.exitCode = 1;
|
|
3850
3457
|
return;
|
|
3851
3458
|
}
|
|
3852
|
-
process.stdout.write(`using
|
|
3459
|
+
process.stdout.write(`using Upstash Search
|
|
3853
3460
|
`);
|
|
3854
3461
|
let keepScopes = /* @__PURE__ */ new Set();
|
|
3855
3462
|
if (opts.scopesFile) {
|
|
3856
|
-
keepScopes = readScopesFromFile(
|
|
3463
|
+
keepScopes = readScopesFromFile(path12.resolve(cwd, opts.scopesFile));
|
|
3857
3464
|
} else {
|
|
3858
3465
|
keepScopes = readRemoteGitBranches(cwd);
|
|
3859
3466
|
}
|
|
@@ -3871,7 +3478,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3871
3478
|
staleByList = !keepScopes.has(entry.scopeName);
|
|
3872
3479
|
}
|
|
3873
3480
|
let staleByTtl = false;
|
|
3874
|
-
if (olderThanMs) {
|
|
3481
|
+
if (olderThanMs && entry.lastIndexedAt !== "unknown") {
|
|
3875
3482
|
staleByTtl = now - Date.parse(entry.lastIndexedAt) > olderThanMs;
|
|
3876
3483
|
}
|
|
3877
3484
|
if (keepScopes.size > 0 && olderThanMs) {
|
|
@@ -3907,7 +3514,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3907
3514
|
scopeId: `${config.project.id}:${entry.scopeName}`
|
|
3908
3515
|
};
|
|
3909
3516
|
try {
|
|
3910
|
-
await
|
|
3517
|
+
await store.deleteScope(scope);
|
|
3911
3518
|
deleted += 1;
|
|
3912
3519
|
} catch (error) {
|
|
3913
3520
|
process.stdout.write(
|
|
@@ -3924,7 +3531,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3924
3531
|
});
|
|
3925
3532
|
program.command("doctor").description("Validate config, env vars, provider connectivity, and local write access").action(async (_opts, command) => {
|
|
3926
3533
|
const rootOpts = getRootOptions(command);
|
|
3927
|
-
const cwd =
|
|
3534
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3928
3535
|
const checks = [];
|
|
3929
3536
|
let config = null;
|
|
3930
3537
|
try {
|
|
@@ -3938,23 +3545,21 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3938
3545
|
});
|
|
3939
3546
|
}
|
|
3940
3547
|
if (config) {
|
|
3941
|
-
const
|
|
3548
|
+
const upstashUrl = config.upstash.url ?? process.env[config.upstash.urlEnv];
|
|
3549
|
+
const upstashToken = config.upstash.token ?? process.env[config.upstash.tokenEnv];
|
|
3942
3550
|
checks.push({
|
|
3943
|
-
name: `env ${config.
|
|
3944
|
-
ok: Boolean(
|
|
3945
|
-
details:
|
|
3551
|
+
name: `env ${config.upstash.urlEnv}`,
|
|
3552
|
+
ok: Boolean(upstashUrl),
|
|
3553
|
+
details: upstashUrl ? void 0 : "missing"
|
|
3554
|
+
});
|
|
3555
|
+
checks.push({
|
|
3556
|
+
name: `env ${config.upstash.tokenEnv}`,
|
|
3557
|
+
ok: Boolean(upstashToken),
|
|
3558
|
+
details: upstashToken ? void 0 : "missing"
|
|
3946
3559
|
});
|
|
3947
|
-
{
|
|
3948
|
-
const tursoUrl = process.env[config.vector.turso.urlEnv];
|
|
3949
|
-
checks.push({
|
|
3950
|
-
name: "turso/libsql",
|
|
3951
|
-
ok: true,
|
|
3952
|
-
details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
|
|
3953
|
-
});
|
|
3954
|
-
}
|
|
3955
3560
|
if (config.source.mode === "static-output") {
|
|
3956
|
-
const outputDir =
|
|
3957
|
-
const exists =
|
|
3561
|
+
const outputDir = path12.resolve(cwd, config.source.staticOutputDir);
|
|
3562
|
+
const exists = fs8.existsSync(outputDir);
|
|
3958
3563
|
checks.push({
|
|
3959
3564
|
name: "source: static output dir",
|
|
3960
3565
|
ok: exists,
|
|
@@ -3963,15 +3568,15 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3963
3568
|
} else if (config.source.mode === "build") {
|
|
3964
3569
|
const buildConfig = config.source.build;
|
|
3965
3570
|
if (buildConfig) {
|
|
3966
|
-
const manifestPath =
|
|
3967
|
-
const manifestExists =
|
|
3571
|
+
const manifestPath = path12.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
|
|
3572
|
+
const manifestExists = fs8.existsSync(manifestPath);
|
|
3968
3573
|
checks.push({
|
|
3969
3574
|
name: "source: build manifest",
|
|
3970
3575
|
ok: manifestExists,
|
|
3971
3576
|
details: manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
|
|
3972
3577
|
});
|
|
3973
|
-
const viteBin =
|
|
3974
|
-
const viteExists =
|
|
3578
|
+
const viteBin = path12.resolve(cwd, "node_modules", ".bin", "vite");
|
|
3579
|
+
const viteExists = fs8.existsSync(viteBin);
|
|
3975
3580
|
checks.push({
|
|
3976
3581
|
name: "source: vite binary",
|
|
3977
3582
|
ok: viteExists,
|
|
@@ -3988,7 +3593,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3988
3593
|
const contentConfig = config.source.contentFiles;
|
|
3989
3594
|
if (contentConfig) {
|
|
3990
3595
|
const fg4 = await import("fast-glob");
|
|
3991
|
-
const baseDir =
|
|
3596
|
+
const baseDir = path12.resolve(cwd, contentConfig.baseDir);
|
|
3992
3597
|
const files = await fg4.default(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
|
|
3993
3598
|
checks.push({
|
|
3994
3599
|
name: "source: content files",
|
|
@@ -4003,61 +3608,26 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4003
3608
|
});
|
|
4004
3609
|
}
|
|
4005
3610
|
}
|
|
4006
|
-
try {
|
|
4007
|
-
const provider = createEmbeddingsProvider(config);
|
|
4008
|
-
await provider.embedTexts(["searchsocket doctor ping"], config.embeddings.model);
|
|
4009
|
-
checks.push({ name: "embedding provider connectivity", ok: true });
|
|
4010
|
-
} catch (error) {
|
|
4011
|
-
checks.push({
|
|
4012
|
-
name: "embedding provider connectivity",
|
|
4013
|
-
ok: false,
|
|
4014
|
-
details: error instanceof Error ? error.message : "unknown error"
|
|
4015
|
-
});
|
|
4016
|
-
}
|
|
4017
3611
|
let store = null;
|
|
4018
3612
|
try {
|
|
4019
|
-
store = await
|
|
3613
|
+
store = await createUpstashStore(config);
|
|
4020
3614
|
const health = await store.health();
|
|
4021
3615
|
checks.push({
|
|
4022
|
-
name: "
|
|
3616
|
+
name: "upstash search connectivity",
|
|
4023
3617
|
ok: health.ok,
|
|
4024
3618
|
details: health.details
|
|
4025
3619
|
});
|
|
4026
3620
|
} catch (error) {
|
|
4027
3621
|
checks.push({
|
|
4028
|
-
name: "
|
|
3622
|
+
name: "upstash search connectivity",
|
|
4029
3623
|
ok: false,
|
|
4030
3624
|
details: error instanceof Error ? error.message : "unknown error"
|
|
4031
3625
|
});
|
|
4032
3626
|
}
|
|
4033
|
-
if (store) {
|
|
4034
|
-
try {
|
|
4035
|
-
const testScope = {
|
|
4036
|
-
projectId: config.project.id,
|
|
4037
|
-
scopeName: "_searchsocket_doctor_probe",
|
|
4038
|
-
scopeId: `${config.project.id}:_searchsocket_doctor_probe`
|
|
4039
|
-
};
|
|
4040
|
-
await store.recordScope({
|
|
4041
|
-
projectId: testScope.projectId,
|
|
4042
|
-
scopeName: testScope.scopeName,
|
|
4043
|
-
modelId: config.embeddings.model,
|
|
4044
|
-
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4045
|
-
vectorCount: 0
|
|
4046
|
-
});
|
|
4047
|
-
await store.deleteScope(testScope);
|
|
4048
|
-
checks.push({ name: "vector backend write permission", ok: true });
|
|
4049
|
-
} catch (error) {
|
|
4050
|
-
checks.push({
|
|
4051
|
-
name: "vector backend write permission",
|
|
4052
|
-
ok: false,
|
|
4053
|
-
details: error instanceof Error ? error.message : "write test failed"
|
|
4054
|
-
});
|
|
4055
|
-
}
|
|
4056
|
-
}
|
|
4057
3627
|
try {
|
|
4058
3628
|
const scope = resolveScope(config);
|
|
4059
3629
|
const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
|
|
4060
|
-
const testPath =
|
|
3630
|
+
const testPath = path12.join(statePath, ".write-test");
|
|
4061
3631
|
await fsp.writeFile(testPath, "ok\n", "utf8");
|
|
4062
3632
|
await fsp.rm(testPath, { force: true });
|
|
4063
3633
|
checks.push({ name: "state directory writable", ok: true });
|
|
@@ -4086,7 +3656,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4086
3656
|
});
|
|
4087
3657
|
program.command("mcp").description("Run SearchSocket MCP server").option("--transport <transport>", "stdio|http", "stdio").option("--port <n>", "HTTP port", "3338").option("--path <path>", "HTTP path", "/mcp").action(async (opts, command) => {
|
|
4088
3658
|
const rootOpts = getRootOptions(command);
|
|
4089
|
-
const cwd =
|
|
3659
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4090
3660
|
await runMcpServer({
|
|
4091
3661
|
cwd,
|
|
4092
3662
|
configPath: rootOpts?.config,
|
|
@@ -4095,9 +3665,9 @@ program.command("mcp").description("Run SearchSocket MCP server").option("--tran
|
|
|
4095
3665
|
httpPath: opts.path
|
|
4096
3666
|
});
|
|
4097
3667
|
});
|
|
4098
|
-
program.command("search").description("Quick
|
|
3668
|
+
program.command("search").description("Quick CLI search against Upstash Search").requiredOption("--q <query>", "search query").option("--scope <name>", "scope override").option("--top-k <n>", "top K results", "10").option("--path-prefix <prefix>", "path prefix filter").action(async (opts, command) => {
|
|
4099
3669
|
const rootOpts = getRootOptions(command);
|
|
4100
|
-
const cwd =
|
|
3670
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4101
3671
|
const engine = await SearchEngine.create({
|
|
4102
3672
|
cwd,
|
|
4103
3673
|
configPath: rootOpts?.config
|
|
@@ -4106,14 +3676,13 @@ program.command("search").description("Quick local CLI search against indexed ve
|
|
|
4106
3676
|
q: opts.q,
|
|
4107
3677
|
scope: opts.scope,
|
|
4108
3678
|
topK: parsePositiveInt(opts.topK, "--top-k"),
|
|
4109
|
-
pathPrefix: opts.pathPrefix
|
|
4110
|
-
rerank: opts.rerank
|
|
3679
|
+
pathPrefix: opts.pathPrefix
|
|
4111
3680
|
});
|
|
4112
3681
|
process.stdout.write(`${JSON.stringify(result, null, 2)}
|
|
4113
3682
|
`);
|
|
4114
3683
|
});
|
|
4115
3684
|
async function main() {
|
|
4116
|
-
dotenvConfig({ path:
|
|
3685
|
+
dotenvConfig({ path: path12.resolve(process.cwd(), ".env") });
|
|
4117
3686
|
await program.parseAsync(process.argv);
|
|
4118
3687
|
}
|
|
4119
3688
|
main().catch((error) => {
|