searchsocket 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -31
- package/dist/cli.js +634 -1326
- package/dist/client.cjs +41 -117
- package/dist/client.d.cts +3 -17
- package/dist/client.d.ts +3 -17
- package/dist/client.js +41 -117
- package/dist/index.cjs +608 -1398
- package/dist/index.d.cts +73 -35
- package/dist/index.d.ts +73 -35
- package/dist/index.js +605 -1392
- package/dist/plugin-B_npJSux.d.cts +36 -0
- package/dist/plugin-M-aW0ev6.d.ts +36 -0
- package/dist/scroll.cjs +185 -0
- package/dist/scroll.d.cts +42 -0
- package/dist/scroll.d.ts +42 -0
- package/dist/scroll.js +183 -0
- package/dist/sveltekit.cjs +781 -1278
- package/dist/sveltekit.d.cts +3 -43
- package/dist/sveltekit.d.ts +3 -43
- package/dist/sveltekit.js +779 -1276
- package/dist/{types-z2dw3H6E.d.cts → types-Dk43uz25.d.cts} +46 -141
- package/dist/{types-z2dw3H6E.d.ts → types-Dk43uz25.d.ts} +46 -141
- package/package.json +10 -3
package/dist/cli.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
|
-
import
|
|
4
|
+
import fs8 from "fs";
|
|
5
5
|
import fsp from "fs/promises";
|
|
6
|
-
import
|
|
6
|
+
import path12 from "path";
|
|
7
7
|
import { execSync as execSync2 } from "child_process";
|
|
8
8
|
import { config as dotenvConfig } from "dotenv";
|
|
9
9
|
import chokidar from "chokidar";
|
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.
|
|
15
|
+
version: "0.5.0",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -58,6 +58,11 @@ var package_default = {
|
|
|
58
58
|
types: "./dist/client.d.ts",
|
|
59
59
|
import: "./dist/client.js",
|
|
60
60
|
require: "./dist/client.cjs"
|
|
61
|
+
},
|
|
62
|
+
"./scroll": {
|
|
63
|
+
types: "./dist/scroll.d.ts",
|
|
64
|
+
import: "./dist/scroll.js",
|
|
65
|
+
require: "./dist/scroll.cjs"
|
|
61
66
|
}
|
|
62
67
|
},
|
|
63
68
|
scripts: {
|
|
@@ -65,15 +70,16 @@ var package_default = {
|
|
|
65
70
|
clean: "rm -rf dist",
|
|
66
71
|
typecheck: "tsc --noEmit",
|
|
67
72
|
test: "vitest run",
|
|
68
|
-
"test:watch": "vitest"
|
|
73
|
+
"test:watch": "vitest",
|
|
74
|
+
"test:quality": "SEARCHSOCKET_QUALITY_TESTS=1 vitest run tests/quality.test.ts"
|
|
69
75
|
},
|
|
70
76
|
engines: {
|
|
71
77
|
node: ">=20"
|
|
72
78
|
},
|
|
73
79
|
packageManager: "pnpm@10.29.2",
|
|
74
80
|
dependencies: {
|
|
75
|
-
"@libsql/client": "^0.17.0",
|
|
76
81
|
"@modelcontextprotocol/sdk": "^1.26.0",
|
|
82
|
+
"@upstash/search": "^0.1.7",
|
|
77
83
|
cheerio: "^1.2.0",
|
|
78
84
|
chokidar: "^5.0.0",
|
|
79
85
|
commander: "^14.0.3",
|
|
@@ -91,6 +97,7 @@ var package_default = {
|
|
|
91
97
|
"@types/express": "^5.0.6",
|
|
92
98
|
"@types/node": "^25.2.2",
|
|
93
99
|
"@types/turndown": "^5.0.6",
|
|
100
|
+
jsdom: "^28.1.0",
|
|
94
101
|
tsup: "^8.5.1",
|
|
95
102
|
typescript: "^5.9.3",
|
|
96
103
|
vitest: "^4.0.18"
|
|
@@ -164,29 +171,18 @@ var searchSocketConfigSchema = z.object({
|
|
|
164
171
|
prependTitle: z.boolean().optional(),
|
|
165
172
|
pageSummaryChunk: z.boolean().optional()
|
|
166
173
|
}).optional(),
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
batchSize: z.number().int().positive().optional(),
|
|
173
|
-
concurrency: z.number().int().positive().optional(),
|
|
174
|
-
pricePer1kTokens: z.number().positive().optional()
|
|
175
|
-
}).optional(),
|
|
176
|
-
vector: z.object({
|
|
177
|
-
dimension: z.number().int().positive().optional(),
|
|
178
|
-
turso: z.object({
|
|
179
|
-
url: z.string().url().optional(),
|
|
180
|
-
authToken: z.string().min(1).optional(),
|
|
181
|
-
urlEnv: z.string().optional(),
|
|
182
|
-
authTokenEnv: z.string().optional(),
|
|
183
|
-
localPath: z.string().optional()
|
|
184
|
-
}).optional()
|
|
174
|
+
upstash: z.object({
|
|
175
|
+
url: z.string().url().optional(),
|
|
176
|
+
token: z.string().min(1).optional(),
|
|
177
|
+
urlEnv: z.string().min(1).optional(),
|
|
178
|
+
tokenEnv: z.string().min(1).optional()
|
|
185
179
|
}).optional(),
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
180
|
+
search: z.object({
|
|
181
|
+
semanticWeight: z.number().min(0).max(1).optional(),
|
|
182
|
+
inputEnrichment: z.boolean().optional(),
|
|
183
|
+
reranking: z.boolean().optional(),
|
|
184
|
+
dualSearch: z.boolean().optional(),
|
|
185
|
+
pageSearchWeight: z.number().min(0).max(1).optional()
|
|
190
186
|
}).optional(),
|
|
191
187
|
ranking: z.object({
|
|
192
188
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -196,11 +192,12 @@ var searchSocketConfigSchema = z.object({
|
|
|
196
192
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
197
193
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
198
194
|
minScore: z.number().min(0).max(1).optional(),
|
|
195
|
+
scoreGapThreshold: z.number().min(0).max(1).optional(),
|
|
199
196
|
weights: z.object({
|
|
200
197
|
incomingLinks: z.number().optional(),
|
|
201
198
|
depth: z.number().optional(),
|
|
202
|
-
|
|
203
|
-
|
|
199
|
+
aggregation: z.number().optional(),
|
|
200
|
+
titleMatch: z.number().optional()
|
|
204
201
|
}).optional()
|
|
205
202
|
}).optional(),
|
|
206
203
|
api: z.object({
|
|
@@ -222,8 +219,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
222
219
|
}).optional()
|
|
223
220
|
}).optional(),
|
|
224
221
|
state: z.object({
|
|
225
|
-
dir: z.string().optional()
|
|
226
|
-
writeMirror: z.boolean().optional()
|
|
222
|
+
dir: z.string().optional()
|
|
227
223
|
}).optional()
|
|
228
224
|
});
|
|
229
225
|
|
|
@@ -277,24 +273,16 @@ function createDefaultConfig(projectId) {
|
|
|
277
273
|
prependTitle: true,
|
|
278
274
|
pageSummaryChunk: true
|
|
279
275
|
},
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
284
|
-
batchSize: 64,
|
|
285
|
-
concurrency: 4
|
|
276
|
+
upstash: {
|
|
277
|
+
urlEnv: "UPSTASH_SEARCH_REST_URL",
|
|
278
|
+
tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
|
|
286
279
|
},
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
},
|
|
294
|
-
rerank: {
|
|
295
|
-
enabled: true,
|
|
296
|
-
topN: 20,
|
|
297
|
-
model: "jina-reranker-v3"
|
|
280
|
+
search: {
|
|
281
|
+
semanticWeight: 0.75,
|
|
282
|
+
inputEnrichment: true,
|
|
283
|
+
reranking: true,
|
|
284
|
+
dualSearch: true,
|
|
285
|
+
pageSearchWeight: 0.3
|
|
298
286
|
},
|
|
299
287
|
ranking: {
|
|
300
288
|
enableIncomingLinkBoost: true,
|
|
@@ -303,12 +291,13 @@ function createDefaultConfig(projectId) {
|
|
|
303
291
|
aggregationCap: 5,
|
|
304
292
|
aggregationDecay: 0.5,
|
|
305
293
|
minChunkScoreRatio: 0.5,
|
|
306
|
-
minScore: 0,
|
|
294
|
+
minScore: 0.3,
|
|
295
|
+
scoreGapThreshold: 0.4,
|
|
307
296
|
weights: {
|
|
308
297
|
incomingLinks: 0.05,
|
|
309
298
|
depth: 0.03,
|
|
310
|
-
|
|
311
|
-
|
|
299
|
+
aggregation: 0.1,
|
|
300
|
+
titleMatch: 0.15
|
|
312
301
|
}
|
|
313
302
|
},
|
|
314
303
|
api: {
|
|
@@ -326,8 +315,7 @@ function createDefaultConfig(projectId) {
|
|
|
326
315
|
}
|
|
327
316
|
},
|
|
328
317
|
state: {
|
|
329
|
-
dir: ".searchsocket"
|
|
330
|
-
writeMirror: false
|
|
318
|
+
dir: ".searchsocket"
|
|
331
319
|
}
|
|
332
320
|
};
|
|
333
321
|
}
|
|
@@ -435,21 +423,13 @@ ${issues}`
|
|
|
435
423
|
...defaults.chunking,
|
|
436
424
|
...parsed.chunking
|
|
437
425
|
},
|
|
438
|
-
|
|
439
|
-
...defaults.
|
|
440
|
-
...parsed.
|
|
426
|
+
upstash: {
|
|
427
|
+
...defaults.upstash,
|
|
428
|
+
...parsed.upstash
|
|
441
429
|
},
|
|
442
|
-
|
|
443
|
-
...defaults.
|
|
444
|
-
...parsed.
|
|
445
|
-
turso: {
|
|
446
|
-
...defaults.vector.turso,
|
|
447
|
-
...parsed.vector?.turso
|
|
448
|
-
}
|
|
449
|
-
},
|
|
450
|
-
rerank: {
|
|
451
|
-
...defaults.rerank,
|
|
452
|
-
...parsed.rerank
|
|
430
|
+
search: {
|
|
431
|
+
...defaults.search,
|
|
432
|
+
...parsed.search
|
|
453
433
|
},
|
|
454
434
|
ranking: {
|
|
455
435
|
...defaults.ranking,
|
|
@@ -541,7 +521,8 @@ function writeMinimalConfig(cwd) {
|
|
|
541
521
|
return target;
|
|
542
522
|
}
|
|
543
523
|
const content = `export default {
|
|
544
|
-
|
|
524
|
+
// Upstash Search credentials (set via env vars or directly here)
|
|
525
|
+
// upstash: { urlEnv: "UPSTASH_SEARCH_REST_URL", tokenEnv: "UPSTASH_SEARCH_REST_TOKEN" }
|
|
545
526
|
};
|
|
546
527
|
`;
|
|
547
528
|
fs.writeFileSync(target, content, "utf8");
|
|
@@ -704,576 +685,246 @@ import fs2 from "fs";
|
|
|
704
685
|
import path2 from "path";
|
|
705
686
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
706
687
|
const statePath = path2.resolve(cwd, stateDir);
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
return { statePath, pagesPath };
|
|
710
|
-
}
|
|
711
|
-
|
|
712
|
-
// src/embeddings/jina.ts
|
|
713
|
-
import pLimit from "p-limit";
|
|
714
|
-
function sleep(ms) {
|
|
715
|
-
return new Promise((resolve) => {
|
|
716
|
-
setTimeout(resolve, ms);
|
|
717
|
-
});
|
|
718
|
-
}
|
|
719
|
-
var JinaEmbeddingsProvider = class {
|
|
720
|
-
apiKey;
|
|
721
|
-
batchSize;
|
|
722
|
-
concurrency;
|
|
723
|
-
defaultTask;
|
|
724
|
-
constructor(options) {
|
|
725
|
-
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
726
|
-
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
727
|
-
}
|
|
728
|
-
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
729
|
-
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
730
|
-
}
|
|
731
|
-
this.apiKey = options.apiKey;
|
|
732
|
-
this.batchSize = options.batchSize;
|
|
733
|
-
this.concurrency = options.concurrency;
|
|
734
|
-
this.defaultTask = options.task ?? "retrieval.passage";
|
|
735
|
-
}
|
|
736
|
-
estimateTokens(text) {
|
|
737
|
-
const normalized = text.trim();
|
|
738
|
-
if (!normalized) {
|
|
739
|
-
return 0;
|
|
740
|
-
}
|
|
741
|
-
const wordCount = normalized.match(/[A-Za-z0-9_]+/g)?.length ?? 0;
|
|
742
|
-
const punctuationCount = normalized.match(/[^\s\w]/g)?.length ?? 0;
|
|
743
|
-
const cjkCount = normalized.match(/[\u3400-\u9fff]/g)?.length ?? 0;
|
|
744
|
-
const charEstimate = Math.ceil(normalized.length / 4);
|
|
745
|
-
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
746
|
-
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
747
|
-
}
|
|
748
|
-
async embedTexts(texts, modelId, task) {
|
|
749
|
-
if (texts.length === 0) {
|
|
750
|
-
return [];
|
|
751
|
-
}
|
|
752
|
-
const batches = [];
|
|
753
|
-
for (let i = 0; i < texts.length; i += this.batchSize) {
|
|
754
|
-
batches.push({
|
|
755
|
-
index: i,
|
|
756
|
-
values: texts.slice(i, i + this.batchSize)
|
|
757
|
-
});
|
|
758
|
-
}
|
|
759
|
-
const outputs = new Array(batches.length);
|
|
760
|
-
const limit = pLimit(this.concurrency);
|
|
761
|
-
await Promise.all(
|
|
762
|
-
batches.map(
|
|
763
|
-
(batch, position) => limit(async () => {
|
|
764
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
765
|
-
})
|
|
766
|
-
)
|
|
767
|
-
);
|
|
768
|
-
return outputs.flat();
|
|
769
|
-
}
|
|
770
|
-
async embedWithRetry(texts, modelId, task) {
|
|
771
|
-
const maxAttempts = 5;
|
|
772
|
-
let attempt = 0;
|
|
773
|
-
while (attempt < maxAttempts) {
|
|
774
|
-
attempt += 1;
|
|
775
|
-
let response;
|
|
776
|
-
try {
|
|
777
|
-
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
778
|
-
method: "POST",
|
|
779
|
-
headers: {
|
|
780
|
-
"content-type": "application/json",
|
|
781
|
-
authorization: `Bearer ${this.apiKey}`
|
|
782
|
-
},
|
|
783
|
-
body: JSON.stringify({
|
|
784
|
-
model: modelId,
|
|
785
|
-
input: texts,
|
|
786
|
-
task
|
|
787
|
-
})
|
|
788
|
-
});
|
|
789
|
-
} catch (error) {
|
|
790
|
-
if (attempt >= maxAttempts) {
|
|
791
|
-
throw error;
|
|
792
|
-
}
|
|
793
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
794
|
-
continue;
|
|
795
|
-
}
|
|
796
|
-
if (!response.ok) {
|
|
797
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
798
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
799
|
-
const errorBody = await response.text();
|
|
800
|
-
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
801
|
-
}
|
|
802
|
-
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
803
|
-
continue;
|
|
804
|
-
}
|
|
805
|
-
const payload = await response.json();
|
|
806
|
-
if (!payload.data || !Array.isArray(payload.data)) {
|
|
807
|
-
throw new Error("Invalid Jina embeddings response format");
|
|
808
|
-
}
|
|
809
|
-
return payload.data.map((entry) => entry.embedding);
|
|
810
|
-
}
|
|
811
|
-
throw new Error("Unreachable retry state");
|
|
812
|
-
}
|
|
813
|
-
};
|
|
814
|
-
|
|
815
|
-
// src/embeddings/factory.ts
|
|
816
|
-
function createEmbeddingsProvider(config) {
|
|
817
|
-
if (config.embeddings.provider !== "jina") {
|
|
818
|
-
throw new SearchSocketError(
|
|
819
|
-
"CONFIG_MISSING",
|
|
820
|
-
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
821
|
-
);
|
|
822
|
-
}
|
|
823
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
824
|
-
if (!apiKey) {
|
|
825
|
-
throw new SearchSocketError(
|
|
826
|
-
"CONFIG_MISSING",
|
|
827
|
-
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
828
|
-
);
|
|
829
|
-
}
|
|
830
|
-
return new JinaEmbeddingsProvider({
|
|
831
|
-
apiKey,
|
|
832
|
-
batchSize: config.embeddings.batchSize,
|
|
833
|
-
concurrency: config.embeddings.concurrency
|
|
834
|
-
});
|
|
688
|
+
fs2.mkdirSync(statePath, { recursive: true });
|
|
689
|
+
return { statePath };
|
|
835
690
|
}
|
|
836
691
|
|
|
837
692
|
// src/indexing/pipeline.ts
|
|
838
|
-
import
|
|
839
|
-
|
|
840
|
-
// src/vector/factory.ts
|
|
841
|
-
import fs3 from "fs";
|
|
842
|
-
import path3 from "path";
|
|
693
|
+
import path10 from "path";
|
|
843
694
|
|
|
844
|
-
// src/
|
|
845
|
-
function
|
|
846
|
-
return
|
|
695
|
+
// src/vector/upstash.ts
|
|
696
|
+
function chunkIndexName(scope) {
|
|
697
|
+
return `${scope.projectId}--${scope.scopeName}`;
|
|
847
698
|
}
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
699
|
+
function pageIndexName(scope) {
|
|
700
|
+
return `${scope.projectId}--${scope.scopeName}--pages`;
|
|
701
|
+
}
|
|
702
|
+
var UpstashSearchStore = class {
|
|
851
703
|
client;
|
|
852
|
-
dimension;
|
|
853
|
-
chunksReady = false;
|
|
854
|
-
registryReady = false;
|
|
855
|
-
pagesReady = false;
|
|
856
704
|
constructor(opts) {
|
|
857
705
|
this.client = opts.client;
|
|
858
|
-
this.dimension = opts.dimension;
|
|
859
|
-
}
|
|
860
|
-
async ensureRegistry() {
|
|
861
|
-
if (this.registryReady) return;
|
|
862
|
-
await this.client.execute(`
|
|
863
|
-
CREATE TABLE IF NOT EXISTS registry (
|
|
864
|
-
scope_key TEXT PRIMARY KEY,
|
|
865
|
-
project_id TEXT NOT NULL,
|
|
866
|
-
scope_name TEXT NOT NULL,
|
|
867
|
-
model_id TEXT NOT NULL,
|
|
868
|
-
last_indexed_at TEXT NOT NULL,
|
|
869
|
-
vector_count INTEGER,
|
|
870
|
-
last_estimate_tokens INTEGER,
|
|
871
|
-
last_estimate_cost_usd REAL,
|
|
872
|
-
last_estimate_changed_chunks INTEGER
|
|
873
|
-
)
|
|
874
|
-
`);
|
|
875
|
-
const estimateCols = [
|
|
876
|
-
{ name: "last_estimate_tokens", def: "INTEGER" },
|
|
877
|
-
{ name: "last_estimate_cost_usd", def: "REAL" },
|
|
878
|
-
{ name: "last_estimate_changed_chunks", def: "INTEGER" }
|
|
879
|
-
];
|
|
880
|
-
for (const col of estimateCols) {
|
|
881
|
-
try {
|
|
882
|
-
await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
|
|
883
|
-
} catch (error) {
|
|
884
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
885
|
-
throw error;
|
|
886
|
-
}
|
|
887
|
-
}
|
|
888
|
-
}
|
|
889
|
-
this.registryReady = true;
|
|
890
|
-
}
|
|
891
|
-
async ensureChunks(dim) {
|
|
892
|
-
if (this.chunksReady) return;
|
|
893
|
-
const exists = await this.chunksTableExists();
|
|
894
|
-
if (exists) {
|
|
895
|
-
const currentDim = await this.getChunksDimension();
|
|
896
|
-
if (currentDim !== null && currentDim !== dim) {
|
|
897
|
-
await this.client.batch([
|
|
898
|
-
"DROP INDEX IF EXISTS idx",
|
|
899
|
-
"DROP TABLE IF EXISTS chunks"
|
|
900
|
-
]);
|
|
901
|
-
}
|
|
902
|
-
}
|
|
903
|
-
await this.client.batch([
|
|
904
|
-
`CREATE TABLE IF NOT EXISTS chunks (
|
|
905
|
-
id TEXT PRIMARY KEY,
|
|
906
|
-
project_id TEXT NOT NULL,
|
|
907
|
-
scope_name TEXT NOT NULL,
|
|
908
|
-
url TEXT NOT NULL,
|
|
909
|
-
path TEXT NOT NULL,
|
|
910
|
-
title TEXT NOT NULL,
|
|
911
|
-
section_title TEXT NOT NULL DEFAULT '',
|
|
912
|
-
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
913
|
-
snippet TEXT NOT NULL DEFAULT '',
|
|
914
|
-
chunk_text TEXT NOT NULL DEFAULT '',
|
|
915
|
-
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
916
|
-
content_hash TEXT NOT NULL DEFAULT '',
|
|
917
|
-
model_id TEXT NOT NULL DEFAULT '',
|
|
918
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
919
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
920
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
921
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
922
|
-
description TEXT NOT NULL DEFAULT '',
|
|
923
|
-
keywords TEXT NOT NULL DEFAULT '[]',
|
|
924
|
-
embedding F32_BLOB(${dim})
|
|
925
|
-
)`,
|
|
926
|
-
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
927
|
-
]);
|
|
928
|
-
this.chunksReady = true;
|
|
929
|
-
}
|
|
930
|
-
async ensurePages() {
|
|
931
|
-
if (this.pagesReady) return;
|
|
932
|
-
await this.client.execute(`
|
|
933
|
-
CREATE TABLE IF NOT EXISTS pages (
|
|
934
|
-
project_id TEXT NOT NULL,
|
|
935
|
-
scope_name TEXT NOT NULL,
|
|
936
|
-
url TEXT NOT NULL,
|
|
937
|
-
title TEXT NOT NULL,
|
|
938
|
-
markdown TEXT NOT NULL,
|
|
939
|
-
route_file TEXT NOT NULL DEFAULT '',
|
|
940
|
-
route_resolution TEXT NOT NULL DEFAULT 'exact',
|
|
941
|
-
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
942
|
-
outgoing_links INTEGER NOT NULL DEFAULT 0,
|
|
943
|
-
depth INTEGER NOT NULL DEFAULT 0,
|
|
944
|
-
tags TEXT NOT NULL DEFAULT '[]',
|
|
945
|
-
indexed_at TEXT NOT NULL,
|
|
946
|
-
PRIMARY KEY (project_id, scope_name, url)
|
|
947
|
-
)
|
|
948
|
-
`);
|
|
949
|
-
this.pagesReady = true;
|
|
950
706
|
}
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
|
|
954
|
-
return true;
|
|
955
|
-
} catch (error) {
|
|
956
|
-
if (error instanceof Error && error.message.includes("no such table")) {
|
|
957
|
-
return false;
|
|
958
|
-
}
|
|
959
|
-
throw error;
|
|
960
|
-
}
|
|
707
|
+
chunkIndex(scope) {
|
|
708
|
+
return this.client.index(chunkIndexName(scope));
|
|
961
709
|
}
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
965
|
-
*/
|
|
966
|
-
async getChunksDimension() {
|
|
967
|
-
try {
|
|
968
|
-
const rs = await this.client.execute(
|
|
969
|
-
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
970
|
-
);
|
|
971
|
-
if (rs.rows.length === 0) return null;
|
|
972
|
-
const sql = rs.rows[0].sql;
|
|
973
|
-
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
974
|
-
return match ? parseInt(match[1], 10) : null;
|
|
975
|
-
} catch {
|
|
976
|
-
return null;
|
|
977
|
-
}
|
|
710
|
+
pageIndex(scope) {
|
|
711
|
+
return this.client.index(pageIndexName(scope));
|
|
978
712
|
}
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
*/
|
|
983
|
-
async dropAllTables() {
|
|
984
|
-
await this.client.batch([
|
|
985
|
-
"DROP INDEX IF EXISTS idx",
|
|
986
|
-
"DROP TABLE IF EXISTS chunks",
|
|
987
|
-
"DROP TABLE IF EXISTS registry",
|
|
988
|
-
"DROP TABLE IF EXISTS pages"
|
|
989
|
-
]);
|
|
990
|
-
this.chunksReady = false;
|
|
991
|
-
this.registryReady = false;
|
|
992
|
-
this.pagesReady = false;
|
|
993
|
-
}
|
|
994
|
-
async upsert(records, _scope) {
|
|
995
|
-
if (records.length === 0) return;
|
|
996
|
-
const dim = this.dimension ?? records[0].vector.length;
|
|
997
|
-
await this.ensureChunks(dim);
|
|
713
|
+
async upsertChunks(chunks, scope) {
|
|
714
|
+
if (chunks.length === 0) return;
|
|
715
|
+
const index = this.chunkIndex(scope);
|
|
998
716
|
const BATCH_SIZE = 100;
|
|
999
|
-
for (let i = 0; i <
|
|
1000
|
-
const batch =
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
r.metadata.title,
|
|
1014
|
-
r.metadata.sectionTitle,
|
|
1015
|
-
JSON.stringify(r.metadata.headingPath),
|
|
1016
|
-
r.metadata.snippet,
|
|
1017
|
-
r.metadata.chunkText,
|
|
1018
|
-
r.metadata.ordinal,
|
|
1019
|
-
r.metadata.contentHash,
|
|
1020
|
-
r.metadata.modelId,
|
|
1021
|
-
r.metadata.depth,
|
|
1022
|
-
r.metadata.incomingLinks,
|
|
1023
|
-
r.metadata.routeFile,
|
|
1024
|
-
JSON.stringify(r.metadata.tags),
|
|
1025
|
-
r.metadata.description ?? "",
|
|
1026
|
-
JSON.stringify(r.metadata.keywords ?? []),
|
|
1027
|
-
JSON.stringify(r.vector)
|
|
1028
|
-
]
|
|
1029
|
-
}));
|
|
1030
|
-
await this.client.batch(stmts);
|
|
1031
|
-
}
|
|
1032
|
-
}
|
|
1033
|
-
async query(queryVector, opts, scope) {
|
|
1034
|
-
const dim = this.dimension ?? queryVector.length;
|
|
1035
|
-
await this.ensureChunks(dim);
|
|
1036
|
-
const queryJson = JSON.stringify(queryVector);
|
|
1037
|
-
const rs = await this.client.execute({
|
|
1038
|
-
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
1039
|
-
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
1040
|
-
c.ordinal, c.content_hash,
|
|
1041
|
-
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
1042
|
-
c.description, c.keywords,
|
|
1043
|
-
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
1044
|
-
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
1045
|
-
JOIN chunks AS c ON c.rowid = v.id`,
|
|
1046
|
-
args: [queryJson, queryJson, opts.topK]
|
|
717
|
+
for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
|
|
718
|
+
const batch = chunks.slice(i, i + BATCH_SIZE);
|
|
719
|
+
await index.upsert(batch);
|
|
720
|
+
}
|
|
721
|
+
}
|
|
722
|
+
async search(query, opts, scope) {
|
|
723
|
+
const index = this.chunkIndex(scope);
|
|
724
|
+
const results = await index.search({
|
|
725
|
+
query,
|
|
726
|
+
limit: opts.limit,
|
|
727
|
+
semanticWeight: opts.semanticWeight,
|
|
728
|
+
inputEnrichment: opts.inputEnrichment,
|
|
729
|
+
reranking: opts.reranking,
|
|
730
|
+
filter: opts.filter
|
|
1047
731
|
});
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
}
|
|
732
|
+
return results.map((doc) => ({
|
|
733
|
+
id: doc.id,
|
|
734
|
+
score: doc.score,
|
|
735
|
+
metadata: {
|
|
736
|
+
projectId: doc.metadata?.projectId ?? "",
|
|
737
|
+
scopeName: doc.metadata?.scopeName ?? "",
|
|
738
|
+
url: doc.content.url,
|
|
739
|
+
path: doc.metadata?.path ?? "",
|
|
740
|
+
title: doc.content.title,
|
|
741
|
+
sectionTitle: doc.content.sectionTitle,
|
|
742
|
+
headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
|
|
743
|
+
snippet: doc.metadata?.snippet ?? "",
|
|
744
|
+
chunkText: doc.content.text,
|
|
745
|
+
ordinal: doc.metadata?.ordinal ?? 0,
|
|
746
|
+
contentHash: doc.metadata?.contentHash ?? "",
|
|
747
|
+
depth: doc.metadata?.depth ?? 0,
|
|
748
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
749
|
+
routeFile: doc.metadata?.routeFile ?? "",
|
|
750
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
751
|
+
description: doc.metadata?.description || void 0,
|
|
752
|
+
keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
|
|
1070
753
|
}
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
scopeName,
|
|
1085
|
-
url: row.url,
|
|
1086
|
-
path: rowPath,
|
|
1087
|
-
title: row.title,
|
|
1088
|
-
sectionTitle: row.section_title,
|
|
1089
|
-
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
1090
|
-
snippet: row.snippet,
|
|
1091
|
-
chunkText: row.chunk_text || "",
|
|
1092
|
-
ordinal: row.ordinal || 0,
|
|
1093
|
-
contentHash: row.content_hash,
|
|
1094
|
-
modelId: row.model_id,
|
|
1095
|
-
depth: row.depth,
|
|
1096
|
-
incomingLinks: row.incoming_links,
|
|
1097
|
-
routeFile: row.route_file,
|
|
1098
|
-
tags,
|
|
1099
|
-
description,
|
|
1100
|
-
keywords
|
|
1101
|
-
}
|
|
754
|
+
}));
|
|
755
|
+
}
|
|
756
|
+
async searchPages(query, opts, scope) {
|
|
757
|
+
const index = this.pageIndex(scope);
|
|
758
|
+
let results;
|
|
759
|
+
try {
|
|
760
|
+
results = await index.search({
|
|
761
|
+
query,
|
|
762
|
+
limit: opts.limit,
|
|
763
|
+
semanticWeight: opts.semanticWeight,
|
|
764
|
+
inputEnrichment: opts.inputEnrichment,
|
|
765
|
+
reranking: true,
|
|
766
|
+
filter: opts.filter
|
|
1102
767
|
});
|
|
768
|
+
} catch {
|
|
769
|
+
return [];
|
|
1103
770
|
}
|
|
1104
|
-
|
|
1105
|
-
|
|
771
|
+
return results.map((doc) => ({
|
|
772
|
+
id: doc.id,
|
|
773
|
+
score: doc.score,
|
|
774
|
+
title: doc.content.title,
|
|
775
|
+
url: doc.content.url,
|
|
776
|
+
description: doc.content.description ?? "",
|
|
777
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
778
|
+
depth: doc.metadata?.depth ?? 0,
|
|
779
|
+
incomingLinks: doc.metadata?.incomingLinks ?? 0,
|
|
780
|
+
routeFile: doc.metadata?.routeFile ?? ""
|
|
781
|
+
}));
|
|
1106
782
|
}
|
|
1107
783
|
async deleteByIds(ids, scope) {
|
|
1108
784
|
if (ids.length === 0) return;
|
|
785
|
+
const index = this.chunkIndex(scope);
|
|
1109
786
|
const BATCH_SIZE = 500;
|
|
1110
787
|
for (let i = 0; i < ids.length; i += BATCH_SIZE) {
|
|
1111
788
|
const batch = ids.slice(i, i + BATCH_SIZE);
|
|
1112
|
-
|
|
1113
|
-
await this.client.execute({
|
|
1114
|
-
sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
|
|
1115
|
-
args: [scope.projectId, scope.scopeName, ...batch]
|
|
1116
|
-
});
|
|
789
|
+
await index.delete(batch);
|
|
1117
790
|
}
|
|
1118
791
|
}
|
|
1119
792
|
async deleteScope(scope) {
|
|
1120
|
-
await this.ensureRegistry();
|
|
1121
793
|
try {
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
});
|
|
1126
|
-
} catch (error) {
|
|
1127
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
1128
|
-
throw error;
|
|
1129
|
-
}
|
|
794
|
+
const chunkIdx = this.chunkIndex(scope);
|
|
795
|
+
await chunkIdx.deleteIndex();
|
|
796
|
+
} catch {
|
|
1130
797
|
}
|
|
1131
798
|
try {
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
});
|
|
1136
|
-
} catch (error) {
|
|
1137
|
-
if (error instanceof Error && !error.message.includes("no such table")) {
|
|
1138
|
-
throw error;
|
|
1139
|
-
}
|
|
799
|
+
const pageIdx = this.pageIndex(scope);
|
|
800
|
+
await pageIdx.deleteIndex();
|
|
801
|
+
} catch {
|
|
1140
802
|
}
|
|
1141
|
-
await this.client.execute({
|
|
1142
|
-
sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
1143
|
-
args: [scope.projectId, scope.scopeName]
|
|
1144
|
-
});
|
|
1145
803
|
}
|
|
1146
|
-
async listScopes(
|
|
1147
|
-
await this.
|
|
1148
|
-
const
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
info.lastEstimateCostUSD ?? null,
|
|
1182
|
-
info.lastEstimateChangedChunks ?? null
|
|
1183
|
-
]
|
|
1184
|
-
});
|
|
804
|
+
async listScopes(projectId) {
|
|
805
|
+
const allIndexes = await this.client.listIndexes();
|
|
806
|
+
const prefix = `${projectId}--`;
|
|
807
|
+
const scopeNames = /* @__PURE__ */ new Set();
|
|
808
|
+
for (const name of allIndexes) {
|
|
809
|
+
if (name.startsWith(prefix) && !name.endsWith("--pages")) {
|
|
810
|
+
const scopeName = name.slice(prefix.length);
|
|
811
|
+
scopeNames.add(scopeName);
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
const scopes = [];
|
|
815
|
+
for (const scopeName of scopeNames) {
|
|
816
|
+
const scope = {
|
|
817
|
+
projectId,
|
|
818
|
+
scopeName,
|
|
819
|
+
scopeId: `${projectId}:${scopeName}`
|
|
820
|
+
};
|
|
821
|
+
try {
|
|
822
|
+
const info = await this.chunkIndex(scope).info();
|
|
823
|
+
scopes.push({
|
|
824
|
+
projectId,
|
|
825
|
+
scopeName,
|
|
826
|
+
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
827
|
+
documentCount: info.documentCount
|
|
828
|
+
});
|
|
829
|
+
} catch {
|
|
830
|
+
scopes.push({
|
|
831
|
+
projectId,
|
|
832
|
+
scopeName,
|
|
833
|
+
lastIndexedAt: "unknown",
|
|
834
|
+
documentCount: 0
|
|
835
|
+
});
|
|
836
|
+
}
|
|
837
|
+
}
|
|
838
|
+
return scopes;
|
|
1185
839
|
}
|
|
1186
840
|
async getContentHashes(scope) {
|
|
1187
|
-
const exists = await this.chunksTableExists();
|
|
1188
|
-
if (!exists) return /* @__PURE__ */ new Map();
|
|
1189
|
-
const rs = await this.client.execute({
|
|
1190
|
-
sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
|
|
1191
|
-
args: [scope.projectId, scope.scopeName]
|
|
1192
|
-
});
|
|
1193
841
|
const map = /* @__PURE__ */ new Map();
|
|
1194
|
-
|
|
1195
|
-
|
|
842
|
+
const index = this.chunkIndex(scope);
|
|
843
|
+
let cursor = "0";
|
|
844
|
+
try {
|
|
845
|
+
for (; ; ) {
|
|
846
|
+
const result = await index.range({ cursor, limit: 100 });
|
|
847
|
+
for (const doc of result.documents) {
|
|
848
|
+
if (doc.metadata?.contentHash) {
|
|
849
|
+
map.set(doc.id, doc.metadata.contentHash);
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
if (!result.nextCursor || result.nextCursor === "0") break;
|
|
853
|
+
cursor = result.nextCursor;
|
|
854
|
+
}
|
|
855
|
+
} catch {
|
|
1196
856
|
}
|
|
1197
857
|
return map;
|
|
1198
858
|
}
|
|
1199
859
|
async upsertPages(pages, scope) {
|
|
1200
860
|
if (pages.length === 0) return;
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
|
|
1204
|
-
throw new Error(
|
|
1205
|
-
`Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
|
|
1206
|
-
);
|
|
1207
|
-
}
|
|
1208
|
-
}
|
|
1209
|
-
const BATCH_SIZE = 100;
|
|
861
|
+
const index = this.pageIndex(scope);
|
|
862
|
+
const BATCH_SIZE = 50;
|
|
1210
863
|
for (let i = 0; i < pages.length; i += BATCH_SIZE) {
|
|
1211
864
|
const batch = pages.slice(i, i + BATCH_SIZE);
|
|
1212
|
-
const
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
p.
|
|
1219
|
-
p.
|
|
1220
|
-
p.
|
|
1221
|
-
p.
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
p.
|
|
1225
|
-
p.
|
|
1226
|
-
p.
|
|
1227
|
-
p.
|
|
1228
|
-
|
|
1229
|
-
p.
|
|
1230
|
-
|
|
865
|
+
const docs = batch.map((p) => ({
|
|
866
|
+
id: p.url,
|
|
867
|
+
content: {
|
|
868
|
+
title: p.title,
|
|
869
|
+
url: p.url,
|
|
870
|
+
type: "page",
|
|
871
|
+
description: p.description ?? "",
|
|
872
|
+
keywords: (p.keywords ?? []).join(","),
|
|
873
|
+
summary: p.summary ?? "",
|
|
874
|
+
tags: p.tags.join(",")
|
|
875
|
+
},
|
|
876
|
+
metadata: {
|
|
877
|
+
markdown: p.markdown,
|
|
878
|
+
projectId: p.projectId,
|
|
879
|
+
scopeName: p.scopeName,
|
|
880
|
+
routeFile: p.routeFile,
|
|
881
|
+
routeResolution: p.routeResolution,
|
|
882
|
+
incomingLinks: p.incomingLinks,
|
|
883
|
+
outgoingLinks: p.outgoingLinks,
|
|
884
|
+
depth: p.depth,
|
|
885
|
+
indexedAt: p.indexedAt
|
|
886
|
+
}
|
|
1231
887
|
}));
|
|
1232
|
-
await
|
|
888
|
+
await index.upsert(docs);
|
|
1233
889
|
}
|
|
1234
890
|
}
|
|
1235
891
|
async getPage(url, scope) {
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
892
|
+
const index = this.pageIndex(scope);
|
|
893
|
+
try {
|
|
894
|
+
const results = await index.fetch([url]);
|
|
895
|
+
const doc = results[0];
|
|
896
|
+
if (!doc) return null;
|
|
897
|
+
return {
|
|
898
|
+
url: doc.content.url,
|
|
899
|
+
title: doc.content.title,
|
|
900
|
+
markdown: doc.metadata.markdown,
|
|
901
|
+
projectId: doc.metadata.projectId,
|
|
902
|
+
scopeName: doc.metadata.scopeName,
|
|
903
|
+
routeFile: doc.metadata.routeFile,
|
|
904
|
+
routeResolution: doc.metadata.routeResolution,
|
|
905
|
+
incomingLinks: doc.metadata.incomingLinks,
|
|
906
|
+
outgoingLinks: doc.metadata.outgoingLinks,
|
|
907
|
+
depth: doc.metadata.depth,
|
|
908
|
+
tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
|
|
909
|
+
indexedAt: doc.metadata.indexedAt,
|
|
910
|
+
summary: doc.content.summary || void 0,
|
|
911
|
+
description: doc.content.description || void 0,
|
|
912
|
+
keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
|
|
913
|
+
};
|
|
914
|
+
} catch {
|
|
915
|
+
return null;
|
|
916
|
+
}
|
|
1257
917
|
}
|
|
1258
918
|
async deletePages(scope) {
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
}
|
|
1264
|
-
}
|
|
1265
|
-
async getScopeModelId(scope) {
|
|
1266
|
-
await this.ensureRegistry();
|
|
1267
|
-
const rs = await this.client.execute({
|
|
1268
|
-
sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
|
|
1269
|
-
args: [scope.projectId, scope.scopeName]
|
|
1270
|
-
});
|
|
1271
|
-
if (rs.rows.length === 0) return null;
|
|
1272
|
-
return rs.rows[0].model_id;
|
|
919
|
+
try {
|
|
920
|
+
const index = this.pageIndex(scope);
|
|
921
|
+
await index.reset();
|
|
922
|
+
} catch {
|
|
923
|
+
}
|
|
1273
924
|
}
|
|
1274
925
|
async health() {
|
|
1275
926
|
try {
|
|
1276
|
-
await this.client.
|
|
927
|
+
await this.client.info();
|
|
1277
928
|
return { ok: true };
|
|
1278
929
|
} catch (error) {
|
|
1279
930
|
return {
|
|
@@ -1282,40 +933,34 @@ var TursoVectorStore = class {
|
|
|
1282
933
|
};
|
|
1283
934
|
}
|
|
1284
935
|
}
|
|
936
|
+
async dropAllIndexes(projectId) {
|
|
937
|
+
const allIndexes = await this.client.listIndexes();
|
|
938
|
+
const prefix = `${projectId}--`;
|
|
939
|
+
for (const name of allIndexes) {
|
|
940
|
+
if (name.startsWith(prefix)) {
|
|
941
|
+
try {
|
|
942
|
+
const index = this.client.index(name);
|
|
943
|
+
await index.deleteIndex();
|
|
944
|
+
} catch {
|
|
945
|
+
}
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
}
|
|
1285
949
|
};
|
|
1286
950
|
|
|
1287
951
|
// src/vector/factory.ts
|
|
1288
|
-
async function
|
|
1289
|
-
const
|
|
1290
|
-
const
|
|
1291
|
-
if (
|
|
1292
|
-
const { createClient: createClient2 } = await import("@libsql/client/http");
|
|
1293
|
-
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
1294
|
-
const client2 = createClient2({
|
|
1295
|
-
url: remoteUrl,
|
|
1296
|
-
authToken
|
|
1297
|
-
});
|
|
1298
|
-
return new TursoVectorStore({
|
|
1299
|
-
client: client2,
|
|
1300
|
-
dimension: config.vector.dimension
|
|
1301
|
-
});
|
|
1302
|
-
}
|
|
1303
|
-
if (isServerless()) {
|
|
952
|
+
async function createUpstashStore(config) {
|
|
953
|
+
const url = config.upstash.url ?? process.env[config.upstash.urlEnv];
|
|
954
|
+
const token = config.upstash.token ?? process.env[config.upstash.tokenEnv];
|
|
955
|
+
if (!url || !token) {
|
|
1304
956
|
throw new SearchSocketError(
|
|
1305
957
|
"VECTOR_BACKEND_UNAVAILABLE",
|
|
1306
|
-
`
|
|
958
|
+
`Missing Upstash Search credentials. Set ${config.upstash.urlEnv} and ${config.upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
|
|
1307
959
|
);
|
|
1308
960
|
}
|
|
1309
|
-
const {
|
|
1310
|
-
const
|
|
1311
|
-
|
|
1312
|
-
const client = createClient({
|
|
1313
|
-
url: `file:${localPath}`
|
|
1314
|
-
});
|
|
1315
|
-
return new TursoVectorStore({
|
|
1316
|
-
client,
|
|
1317
|
-
dimension: config.vector.dimension
|
|
1318
|
-
});
|
|
961
|
+
const { Search } = await import("@upstash/search");
|
|
962
|
+
const client = new Search({ url, token });
|
|
963
|
+
return new UpstashSearchStore({ client });
|
|
1319
964
|
}
|
|
1320
965
|
|
|
1321
966
|
// src/utils/hash.ts
|
|
@@ -1328,7 +973,7 @@ function sha256(input) {
|
|
|
1328
973
|
}
|
|
1329
974
|
|
|
1330
975
|
// src/utils/path.ts
|
|
1331
|
-
import
|
|
976
|
+
import path3 from "path";
|
|
1332
977
|
function normalizeUrlPath(rawPath) {
|
|
1333
978
|
let out = rawPath.trim();
|
|
1334
979
|
if (!out.startsWith("/")) {
|
|
@@ -1340,15 +985,8 @@ function normalizeUrlPath(rawPath) {
|
|
|
1340
985
|
}
|
|
1341
986
|
return out;
|
|
1342
987
|
}
|
|
1343
|
-
function urlPathToMirrorRelative(urlPath) {
|
|
1344
|
-
const normalized = normalizeUrlPath(urlPath);
|
|
1345
|
-
if (normalized === "/") {
|
|
1346
|
-
return "index.md";
|
|
1347
|
-
}
|
|
1348
|
-
return `${normalized.slice(1)}.md`;
|
|
1349
|
-
}
|
|
1350
988
|
function staticHtmlFileToUrl(filePath, rootDir) {
|
|
1351
|
-
const relative =
|
|
989
|
+
const relative = path3.relative(rootDir, filePath).replace(/\\/g, "/");
|
|
1352
990
|
if (relative === "index.html") {
|
|
1353
991
|
return "/";
|
|
1354
992
|
}
|
|
@@ -1621,7 +1259,7 @@ function buildEmbeddingText(chunk, prependTitle) {
|
|
|
1621
1259
|
|
|
1622
1260
|
${chunk.chunkText}`;
|
|
1623
1261
|
}
|
|
1624
|
-
function
|
|
1262
|
+
function chunkPage(page, config, scope) {
|
|
1625
1263
|
const sections = parseHeadingSections(page.markdown, config.chunking.headingPathDepth);
|
|
1626
1264
|
const rawChunks = sections.flatMap((section) => splitSection(section, config.chunking));
|
|
1627
1265
|
const chunks = [];
|
|
@@ -1831,59 +1469,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1831
1469
|
};
|
|
1832
1470
|
}
|
|
1833
1471
|
|
|
1834
|
-
// src/indexing/mirror.ts
|
|
1835
|
-
import fs4 from "fs/promises";
|
|
1836
|
-
import path5 from "path";
|
|
1837
|
-
function yamlString(value) {
|
|
1838
|
-
return JSON.stringify(value);
|
|
1839
|
-
}
|
|
1840
|
-
function yamlArray(values) {
|
|
1841
|
-
return `[${values.map((v) => JSON.stringify(v)).join(", ")}]`;
|
|
1842
|
-
}
|
|
1843
|
-
function buildMirrorMarkdown(page) {
|
|
1844
|
-
const frontmatterLines = [
|
|
1845
|
-
"---",
|
|
1846
|
-
`url: ${yamlString(page.url)}`,
|
|
1847
|
-
`title: ${yamlString(page.title)}`,
|
|
1848
|
-
`scope: ${yamlString(page.scope)}`,
|
|
1849
|
-
`routeFile: ${yamlString(page.routeFile)}`,
|
|
1850
|
-
`routeResolution: ${yamlString(page.routeResolution)}`,
|
|
1851
|
-
`generatedAt: ${yamlString(page.generatedAt)}`,
|
|
1852
|
-
`incomingLinks: ${page.incomingLinks}`,
|
|
1853
|
-
`outgoingLinks: ${page.outgoingLinks}`,
|
|
1854
|
-
`depth: ${page.depth}`,
|
|
1855
|
-
`tags: ${yamlArray(page.tags)}`,
|
|
1856
|
-
"---",
|
|
1857
|
-
""
|
|
1858
|
-
];
|
|
1859
|
-
return `${frontmatterLines.join("\n")}${normalizeMarkdown(page.markdown)}`;
|
|
1860
|
-
}
|
|
1861
|
-
function stripGeneratedAt(content) {
|
|
1862
|
-
return content.replace(/^generatedAt: .*$/m, "");
|
|
1863
|
-
}
|
|
1864
|
-
async function writeMirrorPage(statePath, scope, page) {
|
|
1865
|
-
const relative = urlPathToMirrorRelative(page.url);
|
|
1866
|
-
const outputPath = path5.join(statePath, "pages", scope.scopeName, relative);
|
|
1867
|
-
await fs4.mkdir(path5.dirname(outputPath), { recursive: true });
|
|
1868
|
-
const newContent = buildMirrorMarkdown(page);
|
|
1869
|
-
try {
|
|
1870
|
-
const existing = await fs4.readFile(outputPath, "utf8");
|
|
1871
|
-
if (stripGeneratedAt(existing) === stripGeneratedAt(newContent)) {
|
|
1872
|
-
return outputPath;
|
|
1873
|
-
}
|
|
1874
|
-
} catch {
|
|
1875
|
-
}
|
|
1876
|
-
await fs4.writeFile(outputPath, newContent, "utf8");
|
|
1877
|
-
return outputPath;
|
|
1878
|
-
}
|
|
1879
|
-
async function cleanMirrorForScope(statePath, scope) {
|
|
1880
|
-
const target = path5.join(statePath, "pages", scope.scopeName);
|
|
1881
|
-
await fs4.rm(target, { recursive: true, force: true });
|
|
1882
|
-
await fs4.mkdir(target, { recursive: true });
|
|
1883
|
-
}
|
|
1884
|
-
|
|
1885
1472
|
// src/indexing/route-mapper.ts
|
|
1886
|
-
import
|
|
1473
|
+
import path4 from "path";
|
|
1887
1474
|
import fg from "fast-glob";
|
|
1888
1475
|
function segmentToRegex(segment) {
|
|
1889
1476
|
if (segment.startsWith("(") && segment.endsWith(")")) {
|
|
@@ -1904,7 +1491,7 @@ function segmentToRegex(segment) {
|
|
|
1904
1491
|
return { regex: `/${segment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`, score: 10 };
|
|
1905
1492
|
}
|
|
1906
1493
|
function routeFileToPattern(routeFile, cwd) {
|
|
1907
|
-
const relative =
|
|
1494
|
+
const relative = path4.relative(cwd, routeFile).replace(/\\/g, "/");
|
|
1908
1495
|
const withoutPrefix = relative.replace(/^src\/routes\/?/, "");
|
|
1909
1496
|
const withoutPage = withoutPrefix.replace(/\/\+page\.[^/]+$/, "");
|
|
1910
1497
|
const segments = withoutPage.split("/").filter(Boolean);
|
|
@@ -1959,11 +1546,11 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
1959
1546
|
|
|
1960
1547
|
// src/indexing/sources/build/index.ts
|
|
1961
1548
|
import { load as cheerioLoad } from "cheerio";
|
|
1962
|
-
import
|
|
1549
|
+
import pLimit from "p-limit";
|
|
1963
1550
|
|
|
1964
1551
|
// src/indexing/sources/build/manifest-parser.ts
|
|
1965
|
-
import
|
|
1966
|
-
import
|
|
1552
|
+
import fs3 from "fs/promises";
|
|
1553
|
+
import path5 from "path";
|
|
1967
1554
|
|
|
1968
1555
|
// src/utils/pattern.ts
|
|
1969
1556
|
function matchUrlPattern(url, pattern) {
|
|
@@ -2007,10 +1594,10 @@ function routeIdToUrl(routeId) {
|
|
|
2007
1594
|
return routeId.split("/").filter((seg) => !(seg.startsWith("(") && seg.endsWith(")"))).join("/") || "/";
|
|
2008
1595
|
}
|
|
2009
1596
|
async function parseManifest(cwd, outputDir) {
|
|
2010
|
-
const manifestPath =
|
|
1597
|
+
const manifestPath = path5.resolve(cwd, outputDir, "server", "manifest-full.js");
|
|
2011
1598
|
let content;
|
|
2012
1599
|
try {
|
|
2013
|
-
content = await
|
|
1600
|
+
content = await fs3.readFile(manifestPath, "utf8");
|
|
2014
1601
|
} catch {
|
|
2015
1602
|
throw new SearchSocketError(
|
|
2016
1603
|
"BUILD_MANIFEST_NOT_FOUND",
|
|
@@ -2074,8 +1661,8 @@ function isExcluded(url, patterns) {
|
|
|
2074
1661
|
|
|
2075
1662
|
// src/indexing/sources/build/preview-server.ts
|
|
2076
1663
|
import net from "net";
|
|
2077
|
-
import
|
|
2078
|
-
import
|
|
1664
|
+
import path6 from "path";
|
|
1665
|
+
import fs4 from "fs";
|
|
2079
1666
|
import { spawn } from "child_process";
|
|
2080
1667
|
function findFreePort() {
|
|
2081
1668
|
return new Promise((resolve, reject) => {
|
|
@@ -2114,8 +1701,8 @@ async function waitForReady(url, timeout, child) {
|
|
|
2114
1701
|
);
|
|
2115
1702
|
}
|
|
2116
1703
|
async function startPreviewServer(cwd, options, logger3) {
|
|
2117
|
-
const viteBin =
|
|
2118
|
-
if (!
|
|
1704
|
+
const viteBin = path6.join(cwd, "node_modules", ".bin", "vite");
|
|
1705
|
+
if (!fs4.existsSync(viteBin)) {
|
|
2119
1706
|
throw new SearchSocketError(
|
|
2120
1707
|
"BUILD_SERVER_FAILED",
|
|
2121
1708
|
`vite binary not found at ${viteBin}. Ensure vite is installed.`
|
|
@@ -2189,7 +1776,7 @@ async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
|
2189
1776
|
const visited = /* @__PURE__ */ new Set();
|
|
2190
1777
|
const pages = [];
|
|
2191
1778
|
const queue = [];
|
|
2192
|
-
const limit =
|
|
1779
|
+
const limit = pLimit(8);
|
|
2193
1780
|
for (const seed of seedUrls) {
|
|
2194
1781
|
const normalized = normalizeUrlPath(seed);
|
|
2195
1782
|
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
@@ -2271,7 +1858,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
2271
1858
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
2272
1859
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
2273
1860
|
try {
|
|
2274
|
-
const concurrencyLimit =
|
|
1861
|
+
const concurrencyLimit = pLimit(8);
|
|
2275
1862
|
const results = await Promise.allSettled(
|
|
2276
1863
|
selected.map(
|
|
2277
1864
|
(route) => concurrencyLimit(async () => {
|
|
@@ -2311,11 +1898,11 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
2311
1898
|
}
|
|
2312
1899
|
|
|
2313
1900
|
// src/indexing/sources/content-files.ts
|
|
2314
|
-
import
|
|
2315
|
-
import
|
|
1901
|
+
import fs5 from "fs/promises";
|
|
1902
|
+
import path7 from "path";
|
|
2316
1903
|
import fg2 from "fast-glob";
|
|
2317
1904
|
function filePathToUrl(filePath, baseDir) {
|
|
2318
|
-
const relative =
|
|
1905
|
+
const relative = path7.relative(baseDir, filePath).replace(/\\/g, "/");
|
|
2319
1906
|
const segments = relative.split("/").filter(Boolean);
|
|
2320
1907
|
if (/(^|\/)\+page\.svelte$/.test(relative)) {
|
|
2321
1908
|
const routeSegments = segments.slice();
|
|
@@ -2340,7 +1927,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2340
1927
|
if (!contentConfig) {
|
|
2341
1928
|
throw new Error("content-files config is missing");
|
|
2342
1929
|
}
|
|
2343
|
-
const baseDir =
|
|
1930
|
+
const baseDir = path7.resolve(cwd, contentConfig.baseDir);
|
|
2344
1931
|
const files = await fg2(contentConfig.globs, {
|
|
2345
1932
|
cwd: baseDir,
|
|
2346
1933
|
absolute: true,
|
|
@@ -2350,12 +1937,12 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2350
1937
|
const selected = typeof limit === "number" ? files.slice(0, limit) : files;
|
|
2351
1938
|
const pages = [];
|
|
2352
1939
|
for (const filePath of selected) {
|
|
2353
|
-
const raw = await
|
|
1940
|
+
const raw = await fs5.readFile(filePath, "utf8");
|
|
2354
1941
|
const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
|
|
2355
1942
|
pages.push({
|
|
2356
1943
|
url: filePathToUrl(filePath, baseDir),
|
|
2357
1944
|
markdown,
|
|
2358
|
-
sourcePath:
|
|
1945
|
+
sourcePath: path7.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
2359
1946
|
outgoingLinks: []
|
|
2360
1947
|
});
|
|
2361
1948
|
}
|
|
@@ -2365,7 +1952,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2365
1952
|
// src/indexing/sources/crawl.ts
|
|
2366
1953
|
import { gunzipSync } from "zlib";
|
|
2367
1954
|
import { load as cheerioLoad2 } from "cheerio";
|
|
2368
|
-
import
|
|
1955
|
+
import pLimit2 from "p-limit";
|
|
2369
1956
|
var logger2 = new Logger();
|
|
2370
1957
|
function extractLocs(xml) {
|
|
2371
1958
|
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
@@ -2450,7 +2037,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
2450
2037
|
const routes = await resolveRoutes(config);
|
|
2451
2038
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
2452
2039
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
2453
|
-
const concurrencyLimit =
|
|
2040
|
+
const concurrencyLimit = pLimit2(8);
|
|
2454
2041
|
const results = await Promise.allSettled(
|
|
2455
2042
|
selected.map(
|
|
2456
2043
|
(route) => concurrencyLimit(async () => {
|
|
@@ -2483,11 +2070,11 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
2483
2070
|
}
|
|
2484
2071
|
|
|
2485
2072
|
// src/indexing/sources/static-output.ts
|
|
2486
|
-
import
|
|
2487
|
-
import
|
|
2073
|
+
import fs6 from "fs/promises";
|
|
2074
|
+
import path8 from "path";
|
|
2488
2075
|
import fg3 from "fast-glob";
|
|
2489
2076
|
async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
2490
|
-
const outputDir =
|
|
2077
|
+
const outputDir = path8.resolve(cwd, config.source.staticOutputDir);
|
|
2491
2078
|
const htmlFiles = await fg3(["**/*.html"], {
|
|
2492
2079
|
cwd: outputDir,
|
|
2493
2080
|
absolute: true
|
|
@@ -2496,11 +2083,11 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
2496
2083
|
const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
|
|
2497
2084
|
const pages = [];
|
|
2498
2085
|
for (const filePath of selected) {
|
|
2499
|
-
const html = await
|
|
2086
|
+
const html = await fs6.readFile(filePath, "utf8");
|
|
2500
2087
|
pages.push({
|
|
2501
2088
|
url: staticHtmlFileToUrl(filePath, outputDir),
|
|
2502
2089
|
html,
|
|
2503
|
-
sourcePath:
|
|
2090
|
+
sourcePath: path8.relative(cwd, filePath).replace(/\\/g, "/"),
|
|
2504
2091
|
outgoingLinks: []
|
|
2505
2092
|
});
|
|
2506
2093
|
}
|
|
@@ -2508,8 +2095,8 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
2508
2095
|
}
|
|
2509
2096
|
|
|
2510
2097
|
// src/indexing/robots.ts
|
|
2511
|
-
import
|
|
2512
|
-
import
|
|
2098
|
+
import fs7 from "fs/promises";
|
|
2099
|
+
import path9 from "path";
|
|
2513
2100
|
function parseRobotsTxt(content, userAgent = "Searchsocket") {
|
|
2514
2101
|
const lines = content.split(/\r?\n/);
|
|
2515
2102
|
const agentGroups = /* @__PURE__ */ new Map();
|
|
@@ -2563,7 +2150,7 @@ function isBlockedByRobots(urlPath, rules) {
|
|
|
2563
2150
|
}
|
|
2564
2151
|
async function loadRobotsTxtFromDir(dir) {
|
|
2565
2152
|
try {
|
|
2566
|
-
const content = await
|
|
2153
|
+
const content = await fs7.readFile(path9.join(dir, "robots.txt"), "utf8");
|
|
2567
2154
|
return parseRobotsTxt(content);
|
|
2568
2155
|
} catch {
|
|
2569
2156
|
return null;
|
|
@@ -2588,7 +2175,12 @@ function nonNegativeOrZero(value) {
|
|
|
2588
2175
|
}
|
|
2589
2176
|
return Math.max(0, value);
|
|
2590
2177
|
}
|
|
2591
|
-
function
|
|
2178
|
+
/**
 * Normalise text for the case- and punctuation-insensitive title comparison.
 *
 * Steps: fold compatibility forms (NFKD), drop Latin-style combining
 * diacritics (U+0300-U+036F) so "Café" and "cafe" compare equal, recompose
 * (NFKC) so marks that are part of a letter in other scripts survive,
 * lower-case, remove everything that is not a letter, mark, digit or
 * whitespace, then collapse whitespace runs to single spaces and trim.
 *
 * Pure-ASCII input yields exactly what the previous ASCII-only
 * implementation produced; non-ASCII letters are now kept instead of being
 * erased.
 *
 * @param {string} text - Raw title or query text.
 * @returns {string} Normalised text; "" when `text` is not a string.
 */
function normalizeForTitleMatch(text) {
  // A missing title must not abort ranking for the whole result set.
  if (typeof text !== "string") return "";
  return text
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .normalize("NFKC")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
2181
|
+
function rankHits(hits, config, query) {
|
|
2182
|
+
const normalizedQuery = query ? normalizeForTitleMatch(query) : "";
|
|
2183
|
+
const titleMatchWeight = config.ranking.weights.titleMatch;
|
|
2592
2184
|
return hits.map((hit) => {
|
|
2593
2185
|
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
2594
2186
|
if (config.ranking.enableIncomingLinkBoost) {
|
|
@@ -2599,6 +2191,12 @@ function rankHits(hits, config) {
|
|
|
2599
2191
|
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
2600
2192
|
score += depthBoost * config.ranking.weights.depth;
|
|
2601
2193
|
}
|
|
2194
|
+
if (normalizedQuery && titleMatchWeight > 0) {
|
|
2195
|
+
const normalizedTitle = normalizeForTitleMatch(hit.metadata.title);
|
|
2196
|
+
if (normalizedQuery.length > 0 && normalizedTitle.length > 0 && (normalizedTitle.includes(normalizedQuery) || normalizedQuery.includes(normalizedTitle))) {
|
|
2197
|
+
score += titleMatchWeight;
|
|
2198
|
+
}
|
|
2199
|
+
}
|
|
2602
2200
|
return {
|
|
2603
2201
|
hit,
|
|
2604
2202
|
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
@@ -2608,6 +2206,30 @@ function rankHits(hits, config) {
|
|
|
2608
2206
|
return Number.isNaN(delta) ? 0 : delta;
|
|
2609
2207
|
});
|
|
2610
2208
|
}
|
|
2209
|
+
/**
 * Trim a page-result list using two heuristics read from `config.ranking`.
 *
 * 1. `minScore` (> 0): if the median `pageScore` of all results is below
 *    it, the whole list is discarded and `[]` is returned.
 * 2. `scoreGapThreshold` (> 0): walking the list in order, the list is cut
 *    before the first entry whose relative drop from its predecessor,
 *    `(prev - current) / prev`, reaches the threshold. Pairs whose
 *    predecessor score is not positive are skipped.
 *
 * @param {Array<{pageScore: number}>} results - Page results in ranked order.
 * @param {{ranking: {scoreGapThreshold: number, minScore: number}}} config
 * @returns {Array<{pageScore: number}>} The input array itself when nothing
 *   is trimmed, otherwise a new (possibly empty) array.
 */
function trimByScoreGap(results, config) {
  if (results.length === 0) return results;
  const { scoreGapThreshold, minScore } = config.ranking;

  if (minScore > 0) {
    const ascending = results.map((entry) => entry.pageScore).sort((a, b) => a - b);
    const half = Math.floor(ascending.length / 2);
    const isEven = ascending.length % 2 === 0;
    const median = isEven ? (ascending[half - 1] + ascending[half]) / 2 : ascending[half];
    if (median < minScore) return [];
  }

  if (!(scoreGapThreshold > 0) || results.length < 2) return results;

  const cutAt = results.findIndex((entry, position) => {
    if (position === 0) return false;
    const previous = results[position - 1].pageScore;
    return previous > 0 && (previous - entry.pageScore) / previous >= scoreGapThreshold;
  });
  return cutAt === -1 ? results : results.slice(0, cutAt);
}
|
|
2611
2233
|
function findPageWeight(url, pageWeights) {
|
|
2612
2234
|
let bestPattern = "";
|
|
2613
2235
|
let bestWeight = 1;
|
|
@@ -2662,6 +2284,61 @@ function aggregateByPage(ranked, config) {
|
|
|
2662
2284
|
return Number.isNaN(delta) ? 0 : delta;
|
|
2663
2285
|
});
|
|
2664
2286
|
}
|
|
2287
|
+
/**
 * Combine page-level hits with ranked chunk hits into one ranked list.
 *
 * - With no page hits, `rankedChunks` is returned untouched.
 * - A chunk whose URL has a page hit gets a blended score
 *   `(1 - w) * chunk.finalScore + w * pageHit.score`, where
 *   `w = config.search.pageSearchWeight`; a non-finite blend falls back to
 *   the chunk's own score.
 * - A page hit with no chunk becomes a synthetic chunk-shaped entry
 *   (id `page:<url>`) scored `pageHit.score * w` (0 if non-finite), whose
 *   snippet/chunkText is the page description, or the title when the
 *   description is empty.
 *
 * @param {Array<object>} pageHits - Page hits (url, title, description, score, ...).
 * @param {Array<{hit: object, finalScore: number}>} rankedChunks
 * @param {{search: {pageSearchWeight: number}}} config
 * @returns {Array<{hit: object, finalScore: number}>} Entries sorted by
 *   descending `finalScore`.
 */
function mergePageAndChunkResults(pageHits, rankedChunks, config) {
  if (pageHits.length === 0) return rankedChunks;

  const pageWeight = config.search.pageSearchWeight;
  const pageHitByUrl = new Map(pageHits.map((pageHit) => [pageHit.url, pageHit]));
  const urlsCoveredByChunks = new Set();
  const combined = [];

  for (const chunk of rankedChunks) {
    const chunkUrl = chunk.hit.metadata.url;
    const pageHit = pageHitByUrl.get(chunkUrl);
    if (!pageHit) {
      combined.push(chunk);
      continue;
    }
    urlsCoveredByChunks.add(chunkUrl);
    const blendedScore = (1 - pageWeight) * chunk.finalScore + pageWeight * pageHit.score;
    combined.push({
      hit: chunk.hit,
      finalScore: Number.isFinite(blendedScore) ? blendedScore : chunk.finalScore
    });
  }

  for (const [pageUrl, pageHit] of pageHitByUrl) {
    if (urlsCoveredByChunks.has(pageUrl)) continue;
    const weightedScore = pageHit.score * pageWeight;
    const displayText = pageHit.description || pageHit.title;
    combined.push({
      hit: {
        id: `page:${pageUrl}`,
        score: pageHit.score,
        metadata: {
          projectId: "",
          scopeName: "",
          url: pageHit.url,
          path: pageHit.url,
          title: pageHit.title,
          sectionTitle: "",
          headingPath: [],
          snippet: displayText,
          chunkText: displayText,
          ordinal: 0,
          contentHash: "",
          depth: pageHit.depth,
          incomingLinks: pageHit.incomingLinks,
          routeFile: pageHit.routeFile,
          tags: pageHit.tags
        }
      },
      finalScore: Number.isFinite(weightedScore) ? weightedScore : 0
    });
  }

  // `|| 0` turns a NaN difference (e.g. Infinity - Infinity) into "equal".
  return combined.sort((left, right) => right.finalScore - left.finalScore || 0);
}
|
|
2665
2342
|
|
|
2666
2343
|
// src/utils/time.ts
|
|
2667
2344
|
function nowIso() {
|
|
@@ -2672,34 +2349,41 @@ function hrTimeMs(start) {
|
|
|
2672
2349
|
}
|
|
2673
2350
|
|
|
2674
2351
|
// src/indexing/pipeline.ts
|
|
2675
|
-
|
|
2676
|
-
|
|
2677
|
-
|
|
2678
|
-
|
|
2679
|
-
|
|
2352
|
+
/**
 * Build a plain-text summary of a page, capped at `maxChars` UTF-16 units.
 *
 * The summary is the title, then (when present) the description, the
 * keywords joined with ", ", and the markdown body reduced to plain text,
 * all separated by blank lines. Body reduction removes fenced code blocks,
 * unwraps inline code and link/image syntax to their text, drops heading
 * markers, replaces `> * _ | ~ -` with spaces and collapses whitespace.
 *
 * @param {{title: string, description?: string, keywords?: string[], markdown?: string}} page
 * @param {number} [maxChars=3500] - Maximum length of the returned string.
 * @returns {string} The summary, truncated and trimmed when it exceeds `maxChars`.
 */
function buildPageSummary(page, maxChars = 3500) {
  const parts = [page.title];
  if (page.description) {
    parts.push(page.description);
  }
  if (page.keywords && page.keywords.length > 0) {
    parts.push(page.keywords.join(", "));
  }
  // A page without a markdown body still gets a title/description summary.
  const plainBody = (page.markdown ?? "")
    .replace(/```[\s\S]*?```/g, " ")
    .replace(/`([^`]+)`/g, "$1")
    .replace(/!?\[([^\]]*)\]\([^)]*\)/g, "$1")
    .replace(/^#{1,6}\s+/gm, "")
    .replace(/[>*_|~\-]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (plainBody) {
    parts.push(plainBody);
  }
  const joined = parts.join("\n\n");
  if (joined.length <= maxChars) return joined;

  let truncated = joined.slice(0, maxChars);
  // slice() counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair; drop a dangling high surrogate so the
  // summary never ends in a malformed character.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }
  return truncated.trim();
}
|
|
2680
2368
|
var IndexPipeline = class _IndexPipeline {
|
|
2681
2369
|
cwd;
|
|
2682
2370
|
config;
|
|
2683
|
-
|
|
2684
|
-
vectorStore;
|
|
2371
|
+
store;
|
|
2685
2372
|
logger;
|
|
2686
2373
|
constructor(options) {
|
|
2687
2374
|
this.cwd = options.cwd;
|
|
2688
2375
|
this.config = options.config;
|
|
2689
|
-
this.
|
|
2690
|
-
this.vectorStore = options.vectorStore;
|
|
2376
|
+
this.store = options.store;
|
|
2691
2377
|
this.logger = options.logger;
|
|
2692
2378
|
}
|
|
2693
2379
|
static async create(options = {}) {
|
|
2694
|
-
const cwd =
|
|
2380
|
+
const cwd = path10.resolve(options.cwd ?? process.cwd());
|
|
2695
2381
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
2696
|
-
const
|
|
2697
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
2382
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
2698
2383
|
return new _IndexPipeline({
|
|
2699
2384
|
cwd,
|
|
2700
2385
|
config,
|
|
2701
|
-
|
|
2702
|
-
vectorStore,
|
|
2386
|
+
store,
|
|
2703
2387
|
logger: options.logger ?? new Logger()
|
|
2704
2388
|
});
|
|
2705
2389
|
}
|
|
@@ -2719,25 +2403,17 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2719
2403
|
stageTimingsMs[name] = Math.round(hrTimeMs(start));
|
|
2720
2404
|
};
|
|
2721
2405
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
2722
|
-
|
|
2406
|
+
ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
2723
2407
|
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
2724
|
-
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode},
|
|
2408
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
|
|
2725
2409
|
if (options.force) {
|
|
2726
2410
|
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
2727
|
-
await cleanMirrorForScope(statePath, scope);
|
|
2728
2411
|
}
|
|
2729
2412
|
if (options.dryRun) {
|
|
2730
2413
|
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
2731
2414
|
}
|
|
2732
2415
|
const manifestStart = stageStart();
|
|
2733
|
-
const existingHashes = await this.
|
|
2734
|
-
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
2735
|
-
if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
|
|
2736
|
-
throw new SearchSocketError(
|
|
2737
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
2738
|
-
`Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
|
|
2739
|
-
);
|
|
2740
|
-
}
|
|
2416
|
+
const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
|
|
2741
2417
|
stageEnd("manifest", manifestStart);
|
|
2742
2418
|
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
2743
2419
|
const sourceStart = stageStart();
|
|
@@ -2775,11 +2451,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2775
2451
|
let robotsRules = null;
|
|
2776
2452
|
if (sourceMode === "static-output") {
|
|
2777
2453
|
robotsRules = await loadRobotsTxtFromDir(
|
|
2778
|
-
|
|
2454
|
+
path10.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
2779
2455
|
);
|
|
2780
2456
|
} else if (sourceMode === "build" && this.config.source.build) {
|
|
2781
2457
|
robotsRules = await loadRobotsTxtFromDir(
|
|
2782
|
-
|
|
2458
|
+
path10.resolve(this.cwd, this.config.source.build.outputDir)
|
|
2783
2459
|
);
|
|
2784
2460
|
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
2785
2461
|
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
@@ -2866,9 +2542,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2866
2542
|
}
|
|
2867
2543
|
stageEnd("links", linkStart);
|
|
2868
2544
|
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
2869
|
-
const
|
|
2870
|
-
this.logger.info("
|
|
2871
|
-
const
|
|
2545
|
+
const pagesStart = stageStart();
|
|
2546
|
+
this.logger.info("Building indexed pages...");
|
|
2547
|
+
const pages = [];
|
|
2872
2548
|
let routeExact = 0;
|
|
2873
2549
|
let routeBestEffort = 0;
|
|
2874
2550
|
const precomputedRoutes = /* @__PURE__ */ new Map();
|
|
@@ -2897,7 +2573,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2897
2573
|
} else {
|
|
2898
2574
|
routeExact += 1;
|
|
2899
2575
|
}
|
|
2900
|
-
const
|
|
2576
|
+
const indexedPage = {
|
|
2901
2577
|
url: page.url,
|
|
2902
2578
|
title: page.title,
|
|
2903
2579
|
scope: scope.scopeName,
|
|
@@ -2912,35 +2588,38 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2912
2588
|
description: page.description,
|
|
2913
2589
|
keywords: page.keywords
|
|
2914
2590
|
};
|
|
2915
|
-
|
|
2916
|
-
|
|
2917
|
-
await writeMirrorPage(statePath, scope, mirror);
|
|
2918
|
-
}
|
|
2919
|
-
this.logger.event("markdown_written", { url: page.url });
|
|
2591
|
+
pages.push(indexedPage);
|
|
2592
|
+
this.logger.event("page_indexed", { url: page.url });
|
|
2920
2593
|
}
|
|
2921
2594
|
if (!options.dryRun) {
|
|
2922
|
-
const pageRecords =
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2595
|
+
const pageRecords = pages.map((p) => {
|
|
2596
|
+
const summary = buildPageSummary(p);
|
|
2597
|
+
return {
|
|
2598
|
+
url: p.url,
|
|
2599
|
+
title: p.title,
|
|
2600
|
+
markdown: p.markdown,
|
|
2601
|
+
projectId: scope.projectId,
|
|
2602
|
+
scopeName: scope.scopeName,
|
|
2603
|
+
routeFile: p.routeFile,
|
|
2604
|
+
routeResolution: p.routeResolution,
|
|
2605
|
+
incomingLinks: p.incomingLinks,
|
|
2606
|
+
outgoingLinks: p.outgoingLinks,
|
|
2607
|
+
depth: p.depth,
|
|
2608
|
+
tags: p.tags,
|
|
2609
|
+
indexedAt: p.generatedAt,
|
|
2610
|
+
summary,
|
|
2611
|
+
description: p.description,
|
|
2612
|
+
keywords: p.keywords
|
|
2613
|
+
};
|
|
2614
|
+
});
|
|
2615
|
+
await this.store.deletePages(scope);
|
|
2616
|
+
await this.store.upsertPages(pageRecords, scope);
|
|
2938
2617
|
}
|
|
2939
|
-
stageEnd("
|
|
2940
|
-
this.logger.info(`
|
|
2618
|
+
stageEnd("pages", pagesStart);
|
|
2619
|
+
this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
|
|
2941
2620
|
const chunkStart = stageStart();
|
|
2942
2621
|
this.logger.info("Chunking pages...");
|
|
2943
|
-
let chunks =
|
|
2622
|
+
let chunks = pages.flatMap((page) => chunkPage(page, this.config, scope));
|
|
2944
2623
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
2945
2624
|
if (typeof maxChunks === "number") {
|
|
2946
2625
|
chunks = chunks.slice(0, maxChunks);
|
|
@@ -2972,125 +2651,61 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2972
2651
|
});
|
|
2973
2652
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
2974
2653
|
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
2975
|
-
const
|
|
2976
|
-
|
|
2977
|
-
for (const chunk of changedChunks) {
|
|
2978
|
-
chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
|
|
2979
|
-
}
|
|
2980
|
-
const estimatedTokens = changedChunks.reduce(
|
|
2981
|
-
(sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
|
|
2982
|
-
0
|
|
2983
|
-
);
|
|
2984
|
-
const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
|
|
2985
|
-
const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
|
|
2986
|
-
let newEmbeddings = 0;
|
|
2987
|
-
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
2654
|
+
const upsertStart = stageStart();
|
|
2655
|
+
let documentsUpserted = 0;
|
|
2988
2656
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
2989
|
-
this.logger.info(`
|
|
2990
|
-
const
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
const embedding = embeddings[i];
|
|
3004
|
-
if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
|
|
3005
|
-
throw new SearchSocketError(
|
|
3006
|
-
"VECTOR_BACKEND_UNAVAILABLE",
|
|
3007
|
-
`Embedding provider returned an invalid vector for chunk index ${i}.`
|
|
3008
|
-
);
|
|
3009
|
-
}
|
|
3010
|
-
vectorsByChunk.set(chunk.chunkKey, embedding);
|
|
3011
|
-
newEmbeddings += 1;
|
|
3012
|
-
this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
|
|
3013
|
-
}
|
|
3014
|
-
}
|
|
3015
|
-
stageEnd("embedding", embedStart);
|
|
3016
|
-
if (changedChunks.length > 0) {
|
|
3017
|
-
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
3018
|
-
} else {
|
|
3019
|
-
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
3020
|
-
}
|
|
3021
|
-
const syncStart = stageStart();
|
|
3022
|
-
if (!options.dryRun) {
|
|
3023
|
-
this.logger.info("Syncing vectors...");
|
|
3024
|
-
const upserts = [];
|
|
3025
|
-
for (const chunk of changedChunks) {
|
|
3026
|
-
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
3027
|
-
if (!vector) {
|
|
3028
|
-
continue;
|
|
3029
|
-
}
|
|
3030
|
-
upserts.push({
|
|
2657
|
+
this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
|
|
2658
|
+
const UPSTASH_CONTENT_LIMIT = 4096;
|
|
2659
|
+
const FIELD_OVERHEAD = 200;
|
|
2660
|
+
const MAX_TEXT_CHARS = UPSTASH_CONTENT_LIMIT - FIELD_OVERHEAD;
|
|
2661
|
+
const docs = changedChunks.map((chunk) => {
|
|
2662
|
+
const title = chunk.title;
|
|
2663
|
+
const sectionTitle = chunk.sectionTitle ?? "";
|
|
2664
|
+
const url = chunk.url;
|
|
2665
|
+
const tags = chunk.tags.join(",");
|
|
2666
|
+
const headingPath = chunk.headingPath.join(" > ");
|
|
2667
|
+
const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
|
|
2668
|
+
const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
|
|
2669
|
+
const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
|
|
2670
|
+
return {
|
|
3031
2671
|
id: chunk.chunkKey,
|
|
3032
|
-
|
|
2672
|
+
content: { title, sectionTitle, text, url, tags, headingPath },
|
|
3033
2673
|
metadata: {
|
|
3034
2674
|
projectId: scope.projectId,
|
|
3035
2675
|
scopeName: scope.scopeName,
|
|
3036
|
-
url: chunk.url,
|
|
3037
2676
|
path: chunk.path,
|
|
3038
|
-
title: chunk.title,
|
|
3039
|
-
sectionTitle: chunk.sectionTitle ?? "",
|
|
3040
|
-
headingPath: chunk.headingPath,
|
|
3041
2677
|
snippet: chunk.snippet,
|
|
3042
|
-
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
3043
2678
|
ordinal: chunk.ordinal,
|
|
3044
2679
|
contentHash: chunk.contentHash,
|
|
3045
|
-
modelId: this.config.embeddings.model,
|
|
3046
2680
|
depth: chunk.depth,
|
|
3047
2681
|
incomingLinks: chunk.incomingLinks,
|
|
3048
2682
|
routeFile: chunk.routeFile,
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
keywords: chunk.keywords
|
|
2683
|
+
description: chunk.description ?? "",
|
|
2684
|
+
keywords: (chunk.keywords ?? []).join(",")
|
|
3052
2685
|
}
|
|
3053
|
-
}
|
|
3054
|
-
}
|
|
3055
|
-
if (upserts.length > 0) {
|
|
3056
|
-
await this.vectorStore.upsert(upserts, scope);
|
|
3057
|
-
this.logger.event("upserted", { count: upserts.length });
|
|
3058
|
-
}
|
|
3059
|
-
if (deletes.length > 0) {
|
|
3060
|
-
await this.vectorStore.deleteByIds(deletes, scope);
|
|
3061
|
-
this.logger.event("deleted", { count: deletes.length });
|
|
3062
|
-
}
|
|
3063
|
-
}
|
|
3064
|
-
stageEnd("sync", syncStart);
|
|
3065
|
-
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
3066
|
-
const finalizeStart = stageStart();
|
|
3067
|
-
if (!options.dryRun) {
|
|
3068
|
-
const scopeInfo = {
|
|
3069
|
-
projectId: scope.projectId,
|
|
3070
|
-
scopeName: scope.scopeName,
|
|
3071
|
-
modelId: this.config.embeddings.model,
|
|
3072
|
-
lastIndexedAt: nowIso(),
|
|
3073
|
-
vectorCount: chunks.length,
|
|
3074
|
-
lastEstimateTokens: estimatedTokens,
|
|
3075
|
-
lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
3076
|
-
lastEstimateChangedChunks: changedChunks.length
|
|
3077
|
-
};
|
|
3078
|
-
await this.vectorStore.recordScope(scopeInfo);
|
|
3079
|
-
this.logger.event("registry_updated", {
|
|
3080
|
-
scope: scope.scopeName,
|
|
3081
|
-
vectorCount: chunks.length
|
|
2686
|
+
};
|
|
3082
2687
|
});
|
|
2688
|
+
await this.store.upsertChunks(docs, scope);
|
|
2689
|
+
documentsUpserted = docs.length;
|
|
2690
|
+
this.logger.event("upserted", { count: docs.length });
|
|
2691
|
+
}
|
|
2692
|
+
if (!options.dryRun && deletes.length > 0) {
|
|
2693
|
+
await this.store.deleteByIds(deletes, scope);
|
|
2694
|
+
this.logger.event("deleted", { count: deletes.length });
|
|
2695
|
+
}
|
|
2696
|
+
stageEnd("upsert", upsertStart);
|
|
2697
|
+
if (changedChunks.length > 0) {
|
|
2698
|
+
this.logger.info(`Upserted ${documentsUpserted} document${documentsUpserted === 1 ? "" : "s"} (${stageTimingsMs["upsert"]}ms)`);
|
|
2699
|
+
} else {
|
|
2700
|
+
this.logger.info("No chunks to upsert \u2014 all up to date");
|
|
3083
2701
|
}
|
|
3084
|
-
stageEnd("finalize", finalizeStart);
|
|
3085
2702
|
this.logger.info("Done.");
|
|
3086
2703
|
return {
|
|
3087
|
-
pagesProcessed:
|
|
2704
|
+
pagesProcessed: pages.length,
|
|
3088
2705
|
chunksTotal: chunks.length,
|
|
3089
2706
|
chunksChanged: changedChunks.length,
|
|
3090
|
-
|
|
2707
|
+
documentsUpserted,
|
|
3091
2708
|
deletes: deletes.length,
|
|
3092
|
-
estimatedTokens,
|
|
3093
|
-
estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
|
|
3094
2709
|
routeExact,
|
|
3095
2710
|
routeBestEffort,
|
|
3096
2711
|
stageTimingsMs
|
|
@@ -3106,142 +2721,33 @@ import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js
|
|
|
3106
2721
|
import { z as z3 } from "zod";
|
|
3107
2722
|
|
|
3108
2723
|
// src/search/engine.ts
|
|
3109
|
-
import
|
|
2724
|
+
import path11 from "path";
|
|
3110
2725
|
import { z as z2 } from "zod";
|
|
3111
|
-
|
|
3112
|
-
// src/rerank/jina.ts
|
|
3113
|
-
function sleep2(ms) {
|
|
3114
|
-
return new Promise((resolve) => {
|
|
3115
|
-
setTimeout(resolve, ms);
|
|
3116
|
-
});
|
|
3117
|
-
}
|
|
3118
|
-
var JinaReranker = class {
|
|
3119
|
-
apiKey;
|
|
3120
|
-
model;
|
|
3121
|
-
maxRetries;
|
|
3122
|
-
constructor(options) {
|
|
3123
|
-
this.apiKey = options.apiKey;
|
|
3124
|
-
this.model = options.model;
|
|
3125
|
-
this.maxRetries = options.maxRetries ?? 2;
|
|
3126
|
-
}
|
|
3127
|
-
async rerank(query, candidates, topN) {
|
|
3128
|
-
if (candidates.length === 0) {
|
|
3129
|
-
return [];
|
|
3130
|
-
}
|
|
3131
|
-
const body = {
|
|
3132
|
-
model: this.model,
|
|
3133
|
-
query,
|
|
3134
|
-
documents: candidates.map((candidate) => candidate.text),
|
|
3135
|
-
top_n: topN ?? candidates.length,
|
|
3136
|
-
return_documents: false
|
|
3137
|
-
};
|
|
3138
|
-
let attempt = 0;
|
|
3139
|
-
while (attempt <= this.maxRetries) {
|
|
3140
|
-
attempt += 1;
|
|
3141
|
-
let response;
|
|
3142
|
-
try {
|
|
3143
|
-
response = await fetch("https://api.jina.ai/v1/rerank", {
|
|
3144
|
-
method: "POST",
|
|
3145
|
-
headers: {
|
|
3146
|
-
"content-type": "application/json",
|
|
3147
|
-
authorization: `Bearer ${this.apiKey}`
|
|
3148
|
-
},
|
|
3149
|
-
body: JSON.stringify(body)
|
|
3150
|
-
});
|
|
3151
|
-
} catch (error) {
|
|
3152
|
-
if (attempt <= this.maxRetries) {
|
|
3153
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
3154
|
-
continue;
|
|
3155
|
-
}
|
|
3156
|
-
throw error;
|
|
3157
|
-
}
|
|
3158
|
-
if (!response.ok) {
|
|
3159
|
-
const retryable = response.status === 429 || response.status >= 500;
|
|
3160
|
-
if (retryable && attempt <= this.maxRetries) {
|
|
3161
|
-
await sleep2(Math.min(300 * 2 ** attempt, 4e3));
|
|
3162
|
-
continue;
|
|
3163
|
-
}
|
|
3164
|
-
const errorBody = await response.text();
|
|
3165
|
-
throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
|
|
3166
|
-
}
|
|
3167
|
-
const payload = await response.json();
|
|
3168
|
-
const rawResults = payload.results ?? payload.data ?? [];
|
|
3169
|
-
if (!Array.isArray(rawResults)) {
|
|
3170
|
-
throw new Error("Invalid Jina rerank response format");
|
|
3171
|
-
}
|
|
3172
|
-
return rawResults.flatMap((item) => {
|
|
3173
|
-
const index = item.index;
|
|
3174
|
-
if (typeof index !== "number" || index < 0 || index >= candidates.length) {
|
|
3175
|
-
return [];
|
|
3176
|
-
}
|
|
3177
|
-
const candidate = candidates[index];
|
|
3178
|
-
if (!candidate) {
|
|
3179
|
-
return [];
|
|
3180
|
-
}
|
|
3181
|
-
const score = typeof item.relevance_score === "number" ? item.relevance_score : item.score ?? 0;
|
|
3182
|
-
return [
|
|
3183
|
-
{
|
|
3184
|
-
id: candidate.id,
|
|
3185
|
-
score
|
|
3186
|
-
}
|
|
3187
|
-
];
|
|
3188
|
-
}).sort((a, b) => b.score - a.score);
|
|
3189
|
-
}
|
|
3190
|
-
throw new Error("Jina rerank request failed after retries");
|
|
3191
|
-
}
|
|
3192
|
-
};
|
|
3193
|
-
|
|
3194
|
-
// src/rerank/factory.ts
|
|
3195
|
-
function createReranker(config) {
|
|
3196
|
-
if (!config.rerank.enabled) {
|
|
3197
|
-
return null;
|
|
3198
|
-
}
|
|
3199
|
-
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
3200
|
-
if (!apiKey) {
|
|
3201
|
-
return null;
|
|
3202
|
-
}
|
|
3203
|
-
return new JinaReranker({
|
|
3204
|
-
apiKey,
|
|
3205
|
-
model: config.rerank.model
|
|
3206
|
-
});
|
|
3207
|
-
}
|
|
3208
|
-
|
|
3209
|
-
// src/search/engine.ts
|
|
3210
2726
|
var requestSchema = z2.object({
|
|
3211
2727
|
q: z2.string().trim().min(1),
|
|
3212
2728
|
topK: z2.number().int().positive().max(100).optional(),
|
|
3213
2729
|
scope: z2.string().optional(),
|
|
3214
2730
|
pathPrefix: z2.string().optional(),
|
|
3215
2731
|
tags: z2.array(z2.string()).optional(),
|
|
3216
|
-
|
|
3217
|
-
groupBy: z2.enum(["page", "chunk"]).optional(),
|
|
3218
|
-
stream: z2.boolean().optional()
|
|
2732
|
+
groupBy: z2.enum(["page", "chunk"]).optional()
|
|
3219
2733
|
});
|
|
3220
2734
|
var SearchEngine = class _SearchEngine {
|
|
3221
2735
|
cwd;
|
|
3222
2736
|
config;
|
|
3223
|
-
|
|
3224
|
-
vectorStore;
|
|
3225
|
-
reranker;
|
|
2737
|
+
store;
|
|
3226
2738
|
constructor(options) {
|
|
3227
2739
|
this.cwd = options.cwd;
|
|
3228
2740
|
this.config = options.config;
|
|
3229
|
-
this.
|
|
3230
|
-
this.vectorStore = options.vectorStore;
|
|
3231
|
-
this.reranker = options.reranker;
|
|
2741
|
+
this.store = options.store;
|
|
3232
2742
|
}
|
|
3233
2743
|
static async create(options = {}) {
|
|
3234
|
-
const cwd =
|
|
2744
|
+
const cwd = path11.resolve(options.cwd ?? process.cwd());
|
|
3235
2745
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
3236
|
-
const
|
|
3237
|
-
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
3238
|
-
const reranker = options.reranker === void 0 ? createReranker(config) : options.reranker;
|
|
2746
|
+
const store = options.store ?? await createUpstashStore(config);
|
|
3239
2747
|
return new _SearchEngine({
|
|
3240
2748
|
cwd,
|
|
3241
2749
|
config,
|
|
3242
|
-
|
|
3243
|
-
vectorStore,
|
|
3244
|
-
reranker
|
|
2750
|
+
store
|
|
3245
2751
|
});
|
|
3246
2752
|
}
|
|
3247
2753
|
getConfig() {
|
|
@@ -3255,142 +2761,90 @@ var SearchEngine = class _SearchEngine {
|
|
|
3255
2761
|
const input = parsed.data;
|
|
3256
2762
|
const totalStart = process.hrtime.bigint();
|
|
3257
2763
|
const resolvedScope = resolveScope(this.config, input.scope);
|
|
3258
|
-
await this.assertModelCompatibility(resolvedScope);
|
|
3259
2764
|
const topK = input.topK ?? 10;
|
|
3260
|
-
const wantsRerank = Boolean(input.rerank);
|
|
3261
2765
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
3262
2766
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
3263
|
-
const
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
|
|
3277
|
-
|
|
3278
|
-
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
|
|
3286
|
-
|
|
3287
|
-
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
|
|
2767
|
+
const filterParts = [];
|
|
2768
|
+
if (input.pathPrefix) {
|
|
2769
|
+
const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
|
|
2770
|
+
filterParts.push(`url GLOB '${prefix}*'`);
|
|
2771
|
+
}
|
|
2772
|
+
if (input.tags && input.tags.length > 0) {
|
|
2773
|
+
for (const tag of input.tags) {
|
|
2774
|
+
filterParts.push(`tags GLOB '*${tag}*'`);
|
|
2775
|
+
}
|
|
2776
|
+
}
|
|
2777
|
+
const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
|
|
2778
|
+
const useDualSearch = this.config.search.dualSearch && groupByPage;
|
|
2779
|
+
const searchStart = process.hrtime.bigint();
|
|
2780
|
+
let ranked;
|
|
2781
|
+
if (useDualSearch) {
|
|
2782
|
+
const chunkLimit = Math.max(topK * 10, 100);
|
|
2783
|
+
const pageLimit = 20;
|
|
2784
|
+
const [pageHits, chunkHits] = await Promise.all([
|
|
2785
|
+
this.store.searchPages(
|
|
2786
|
+
input.q,
|
|
2787
|
+
{
|
|
2788
|
+
limit: pageLimit,
|
|
2789
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
2790
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
2791
|
+
filter
|
|
2792
|
+
},
|
|
2793
|
+
resolvedScope
|
|
2794
|
+
),
|
|
2795
|
+
this.store.search(
|
|
2796
|
+
input.q,
|
|
2797
|
+
{
|
|
2798
|
+
limit: chunkLimit,
|
|
2799
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
2800
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
2801
|
+
reranking: false,
|
|
2802
|
+
filter
|
|
2803
|
+
},
|
|
2804
|
+
resolvedScope
|
|
2805
|
+
)
|
|
2806
|
+
]);
|
|
2807
|
+
const rankedChunks = rankHits(chunkHits, this.config, input.q);
|
|
2808
|
+
ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
|
|
2809
|
+
} else {
|
|
2810
|
+
const hits = await this.store.search(
|
|
2811
|
+
input.q,
|
|
2812
|
+
{
|
|
2813
|
+
limit: candidateK,
|
|
2814
|
+
semanticWeight: this.config.search.semanticWeight,
|
|
2815
|
+
inputEnrichment: this.config.search.inputEnrichment,
|
|
2816
|
+
reranking: this.config.search.reranking,
|
|
2817
|
+
filter
|
|
2818
|
+
},
|
|
2819
|
+
resolvedScope
|
|
2820
|
+
);
|
|
2821
|
+
ranked = rankHits(hits, this.config, input.q);
|
|
2822
|
+
}
|
|
2823
|
+
const searchMs = hrTimeMs(searchStart);
|
|
2824
|
+
const results = this.buildResults(ranked, topK, groupByPage, input.q);
|
|
3292
2825
|
return {
|
|
3293
2826
|
q: input.q,
|
|
3294
2827
|
scope: resolvedScope.scopeName,
|
|
3295
2828
|
results,
|
|
3296
2829
|
meta: {
|
|
3297
2830
|
timingsMs: {
|
|
3298
|
-
|
|
3299
|
-
vector: Math.round(vectorMs),
|
|
3300
|
-
rerank: Math.round(rerankMs),
|
|
2831
|
+
search: Math.round(searchMs),
|
|
3301
2832
|
total: Math.round(hrTimeMs(totalStart))
|
|
3302
|
-
},
|
|
3303
|
-
usedRerank,
|
|
3304
|
-
modelId: this.config.embeddings.model
|
|
3305
|
-
}
|
|
3306
|
-
};
|
|
3307
|
-
}
|
|
3308
|
-
async *searchStreaming(request) {
|
|
3309
|
-
const parsed = requestSchema.safeParse(request);
|
|
3310
|
-
if (!parsed.success) {
|
|
3311
|
-
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
3312
|
-
}
|
|
3313
|
-
const input = parsed.data;
|
|
3314
|
-
const wantsRerank = Boolean(input.rerank);
|
|
3315
|
-
if (!wantsRerank) {
|
|
3316
|
-
const response = await this.search(request);
|
|
3317
|
-
yield { phase: "initial", data: response };
|
|
3318
|
-
return;
|
|
3319
|
-
}
|
|
3320
|
-
const totalStart = process.hrtime.bigint();
|
|
3321
|
-
const resolvedScope = resolveScope(this.config, input.scope);
|
|
3322
|
-
await this.assertModelCompatibility(resolvedScope);
|
|
3323
|
-
const topK = input.topK ?? 10;
|
|
3324
|
-
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
3325
|
-
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
3326
|
-
const embedStart = process.hrtime.bigint();
|
|
3327
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
3328
|
-
const queryVector = queryEmbeddings[0];
|
|
3329
|
-
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
3330
|
-
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
3331
|
-
}
|
|
3332
|
-
const embedMs = hrTimeMs(embedStart);
|
|
3333
|
-
const vectorStart = process.hrtime.bigint();
|
|
3334
|
-
const hits = await this.vectorStore.query(
|
|
3335
|
-
queryVector,
|
|
3336
|
-
{
|
|
3337
|
-
topK: candidateK,
|
|
3338
|
-
pathPrefix: input.pathPrefix,
|
|
3339
|
-
tags: input.tags
|
|
3340
|
-
},
|
|
3341
|
-
resolvedScope
|
|
3342
|
-
);
|
|
3343
|
-
const vectorMs = hrTimeMs(vectorStart);
|
|
3344
|
-
const ranked = rankHits(hits, this.config);
|
|
3345
|
-
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
3346
|
-
yield {
|
|
3347
|
-
phase: "initial",
|
|
3348
|
-
data: {
|
|
3349
|
-
q: input.q,
|
|
3350
|
-
scope: resolvedScope.scopeName,
|
|
3351
|
-
results: initialResults,
|
|
3352
|
-
meta: {
|
|
3353
|
-
timingsMs: {
|
|
3354
|
-
embed: Math.round(embedMs),
|
|
3355
|
-
vector: Math.round(vectorMs),
|
|
3356
|
-
rerank: 0,
|
|
3357
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
3358
|
-
},
|
|
3359
|
-
usedRerank: false,
|
|
3360
|
-
modelId: this.config.embeddings.model
|
|
3361
|
-
}
|
|
3362
|
-
}
|
|
3363
|
-
};
|
|
3364
|
-
const rerankStart = process.hrtime.bigint();
|
|
3365
|
-
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
3366
|
-
const rerankMs = hrTimeMs(rerankStart);
|
|
3367
|
-
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
3368
|
-
yield {
|
|
3369
|
-
phase: "reranked",
|
|
3370
|
-
data: {
|
|
3371
|
-
q: input.q,
|
|
3372
|
-
scope: resolvedScope.scopeName,
|
|
3373
|
-
results: rerankedResults,
|
|
3374
|
-
meta: {
|
|
3375
|
-
timingsMs: {
|
|
3376
|
-
embed: Math.round(embedMs),
|
|
3377
|
-
vector: Math.round(vectorMs),
|
|
3378
|
-
rerank: Math.round(rerankMs),
|
|
3379
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
3380
|
-
},
|
|
3381
|
-
usedRerank: true,
|
|
3382
|
-
modelId: this.config.embeddings.model
|
|
3383
2833
|
}
|
|
3384
2834
|
}
|
|
3385
2835
|
};
|
|
3386
2836
|
}
|
|
3387
|
-
|
|
3388
|
-
const
|
|
2837
|
+
ensureSnippet(hit) {
|
|
2838
|
+
const snippet = hit.hit.metadata.snippet;
|
|
2839
|
+
if (snippet && snippet.length >= 30) return snippet;
|
|
2840
|
+
const chunkText = hit.hit.metadata.chunkText;
|
|
2841
|
+
if (chunkText) return toSnippet(chunkText);
|
|
2842
|
+
return snippet || "";
|
|
2843
|
+
}
|
|
2844
|
+
buildResults(ordered, topK, groupByPage, _query) {
|
|
3389
2845
|
if (groupByPage) {
|
|
3390
2846
|
let pages = aggregateByPage(ordered, this.config);
|
|
3391
|
-
|
|
3392
|
-
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3393
|
-
}
|
|
2847
|
+
pages = trimByScoreGap(pages, this.config);
|
|
3394
2848
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
3395
2849
|
return pages.slice(0, topK).map((page) => {
|
|
3396
2850
|
const bestScore = page.bestChunk.finalScore;
|
|
@@ -3400,12 +2854,12 @@ var SearchEngine = class _SearchEngine {
|
|
|
3400
2854
|
url: page.url,
|
|
3401
2855
|
title: page.title,
|
|
3402
2856
|
sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
|
|
3403
|
-
snippet: page.bestChunk
|
|
2857
|
+
snippet: this.ensureSnippet(page.bestChunk),
|
|
3404
2858
|
score: Number(page.pageScore.toFixed(6)),
|
|
3405
2859
|
routeFile: page.routeFile,
|
|
3406
2860
|
chunks: meaningful.length > 1 ? meaningful.map((c) => ({
|
|
3407
2861
|
sectionTitle: c.hit.metadata.sectionTitle || void 0,
|
|
3408
|
-
snippet: c
|
|
2862
|
+
snippet: this.ensureSnippet(c),
|
|
3409
2863
|
headingPath: c.hit.metadata.headingPath,
|
|
3410
2864
|
score: Number(c.finalScore.toFixed(6))
|
|
3411
2865
|
})) : void 0
|
|
@@ -3413,6 +2867,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3413
2867
|
});
|
|
3414
2868
|
} else {
|
|
3415
2869
|
let filtered = ordered;
|
|
2870
|
+
const minScore = this.config.ranking.minScore;
|
|
3416
2871
|
if (minScore > 0) {
|
|
3417
2872
|
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3418
2873
|
}
|
|
@@ -3420,7 +2875,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3420
2875
|
url: hit.metadata.url,
|
|
3421
2876
|
title: hit.metadata.title,
|
|
3422
2877
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
3423
|
-
snippet: hit
|
|
2878
|
+
snippet: this.ensureSnippet({ hit, finalScore }),
|
|
3424
2879
|
score: Number(finalScore.toFixed(6)),
|
|
3425
2880
|
routeFile: hit.metadata.routeFile
|
|
3426
2881
|
}));
|
|
@@ -3429,7 +2884,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3429
2884
|
async getPage(pathOrUrl, scope) {
|
|
3430
2885
|
const resolvedScope = resolveScope(this.config, scope);
|
|
3431
2886
|
const urlPath = this.resolveInputPath(pathOrUrl);
|
|
3432
|
-
const page = await this.
|
|
2887
|
+
const page = await this.store.getPage(urlPath, resolvedScope);
|
|
3433
2888
|
if (!page) {
|
|
3434
2889
|
throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
|
|
3435
2890
|
}
|
|
@@ -3450,7 +2905,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3450
2905
|
};
|
|
3451
2906
|
}
|
|
3452
2907
|
async health() {
|
|
3453
|
-
return this.
|
|
2908
|
+
return this.store.health();
|
|
3454
2909
|
}
|
|
3455
2910
|
resolveInputPath(pathOrUrl) {
|
|
3456
2911
|
try {
|
|
@@ -3462,94 +2917,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
3462
2917
|
const withoutQueryOrHash = pathOrUrl.split(/[?#]/)[0] ?? pathOrUrl;
|
|
3463
2918
|
return normalizeUrlPath(withoutQueryOrHash);
|
|
3464
2919
|
}
|
|
3465
|
-
async assertModelCompatibility(scope) {
|
|
3466
|
-
const modelId = await this.vectorStore.getScopeModelId(scope);
|
|
3467
|
-
if (modelId && modelId !== this.config.embeddings.model) {
|
|
3468
|
-
throw new SearchSocketError(
|
|
3469
|
-
"EMBEDDING_MODEL_MISMATCH",
|
|
3470
|
-
`Scope ${scope.scopeName} was indexed with ${modelId}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
|
|
3471
|
-
);
|
|
3472
|
-
}
|
|
3473
|
-
}
|
|
3474
|
-
async rerankHits(query, ranked, topK) {
|
|
3475
|
-
if (!this.config.rerank.enabled) {
|
|
3476
|
-
throw new SearchSocketError(
|
|
3477
|
-
"INVALID_REQUEST",
|
|
3478
|
-
"rerank=true requested but rerank.enabled is not set to true.",
|
|
3479
|
-
400
|
|
3480
|
-
);
|
|
3481
|
-
}
|
|
3482
|
-
if (!this.reranker) {
|
|
3483
|
-
throw new SearchSocketError(
|
|
3484
|
-
"CONFIG_MISSING",
|
|
3485
|
-
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
3486
|
-
400
|
|
3487
|
-
);
|
|
3488
|
-
}
|
|
3489
|
-
const pageGroups = /* @__PURE__ */ new Map();
|
|
3490
|
-
for (const entry of ranked) {
|
|
3491
|
-
const url = entry.hit.metadata.url;
|
|
3492
|
-
const group = pageGroups.get(url);
|
|
3493
|
-
if (group) group.push(entry);
|
|
3494
|
-
else pageGroups.set(url, [entry]);
|
|
3495
|
-
}
|
|
3496
|
-
const MAX_CHUNKS_PER_PAGE = 5;
|
|
3497
|
-
const MIN_CHUNKS_PER_PAGE = 1;
|
|
3498
|
-
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
3499
|
-
const MAX_DOC_CHARS = 2e3;
|
|
3500
|
-
const pageCandidates = [];
|
|
3501
|
-
for (const [url, chunks] of pageGroups) {
|
|
3502
|
-
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
3503
|
-
const bestScore = byScore[0].finalScore;
|
|
3504
|
-
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
3505
|
-
const selected = byScore.filter(
|
|
3506
|
-
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
3507
|
-
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
3508
|
-
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
3509
|
-
const first = selected[0].hit.metadata;
|
|
3510
|
-
const parts = [first.title];
|
|
3511
|
-
if (first.description) {
|
|
3512
|
-
parts.push(first.description);
|
|
3513
|
-
}
|
|
3514
|
-
if (first.keywords && first.keywords.length > 0) {
|
|
3515
|
-
parts.push(first.keywords.join(", "));
|
|
3516
|
-
}
|
|
3517
|
-
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3518
|
-
parts.push(body);
|
|
3519
|
-
let text = parts.join("\n\n");
|
|
3520
|
-
if (text.length > MAX_DOC_CHARS) {
|
|
3521
|
-
text = text.slice(0, MAX_DOC_CHARS);
|
|
3522
|
-
}
|
|
3523
|
-
pageCandidates.push({ id: url, text });
|
|
3524
|
-
}
|
|
3525
|
-
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
3526
|
-
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
3527
|
-
const reranked = await this.reranker.rerank(
|
|
3528
|
-
query,
|
|
3529
|
-
cappedCandidates,
|
|
3530
|
-
maxCandidates
|
|
3531
|
-
);
|
|
3532
|
-
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3533
|
-
return ranked.map((entry) => {
|
|
3534
|
-
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
3535
|
-
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
3536
|
-
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
3537
|
-
return { ...entry, finalScore: base };
|
|
3538
|
-
}
|
|
3539
|
-
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
3540
|
-
return {
|
|
3541
|
-
...entry,
|
|
3542
|
-
finalScore: Number.isFinite(combined) ? combined : base
|
|
3543
|
-
};
|
|
3544
|
-
}).sort((a, b) => {
|
|
3545
|
-
const delta = b.finalScore - a.finalScore;
|
|
3546
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3547
|
-
});
|
|
3548
|
-
}
|
|
3549
2920
|
};
|
|
3550
2921
|
|
|
3551
2922
|
// src/mcp/server.ts
|
|
3552
|
-
function createServer(engine
|
|
2923
|
+
function createServer(engine) {
|
|
3553
2924
|
const server = new McpServer({
|
|
3554
2925
|
name: "searchsocket-mcp",
|
|
3555
2926
|
version: "0.1.0"
|
|
@@ -3557,15 +2928,14 @@ function createServer(engine, config) {
|
|
|
3557
2928
|
server.registerTool(
|
|
3558
2929
|
"search",
|
|
3559
2930
|
{
|
|
3560
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and
|
|
2931
|
+
description: "Semantic site search powered by Upstash Search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and groupBy.",
|
|
3561
2932
|
inputSchema: {
|
|
3562
2933
|
query: z3.string().min(1),
|
|
3563
2934
|
scope: z3.string().optional(),
|
|
3564
2935
|
topK: z3.number().int().positive().max(100).optional(),
|
|
3565
2936
|
pathPrefix: z3.string().optional(),
|
|
3566
2937
|
tags: z3.array(z3.string()).optional(),
|
|
3567
|
-
groupBy: z3.enum(["page", "chunk"]).optional()
|
|
3568
|
-
rerank: z3.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
2938
|
+
groupBy: z3.enum(["page", "chunk"]).optional()
|
|
3569
2939
|
}
|
|
3570
2940
|
},
|
|
3571
2941
|
async (input) => {
|
|
@@ -3575,8 +2945,7 @@ function createServer(engine, config) {
|
|
|
3575
2945
|
scope: input.scope,
|
|
3576
2946
|
pathPrefix: input.pathPrefix,
|
|
3577
2947
|
tags: input.tags,
|
|
3578
|
-
groupBy: input.groupBy
|
|
3579
|
-
rerank: input.rerank ?? config.rerank.enabled
|
|
2948
|
+
groupBy: input.groupBy
|
|
3580
2949
|
});
|
|
3581
2950
|
return {
|
|
3582
2951
|
content: [
|
|
@@ -3704,10 +3073,10 @@ async function runMcpServer(options = {}) {
|
|
|
3704
3073
|
config
|
|
3705
3074
|
});
|
|
3706
3075
|
if (resolvedTransport === "http") {
|
|
3707
|
-
await startHttpServer(() => createServer(engine
|
|
3076
|
+
await startHttpServer(() => createServer(engine), config, options);
|
|
3708
3077
|
return;
|
|
3709
3078
|
}
|
|
3710
|
-
const server = createServer(engine
|
|
3079
|
+
const server = createServer(engine);
|
|
3711
3080
|
const stdioTransport = new StdioServerTransport();
|
|
3712
3081
|
await server.connect(stdioTransport);
|
|
3713
3082
|
}
|
|
@@ -3746,9 +3115,6 @@ function parseDurationMs(value) {
|
|
|
3746
3115
|
throw new SearchSocketError("INVALID_REQUEST", `Unsupported duration unit: ${unit}`, 400);
|
|
3747
3116
|
}
|
|
3748
3117
|
}
|
|
3749
|
-
function formatUsd(value) {
|
|
3750
|
-
return `$${value.toFixed(6)}`;
|
|
3751
|
-
}
|
|
3752
3118
|
function printIndexSummary(stats) {
|
|
3753
3119
|
process.stdout.write(`pages processed: ${stats.pagesProcessed}
|
|
3754
3120
|
`);
|
|
@@ -3756,13 +3122,9 @@ function printIndexSummary(stats) {
|
|
|
3756
3122
|
`);
|
|
3757
3123
|
process.stdout.write(`chunks changed: ${stats.chunksChanged}
|
|
3758
3124
|
`);
|
|
3759
|
-
process.stdout.write(`
|
|
3125
|
+
process.stdout.write(`documents upserted: ${stats.documentsUpserted}
|
|
3760
3126
|
`);
|
|
3761
3127
|
process.stdout.write(`deletes: ${stats.deletes}
|
|
3762
|
-
`);
|
|
3763
|
-
process.stdout.write(`estimated tokens: ${stats.estimatedTokens}
|
|
3764
|
-
`);
|
|
3765
|
-
process.stdout.write(`estimated cost (USD): ${formatUsd(stats.estimatedCostUSD)}
|
|
3766
3128
|
`);
|
|
3767
3129
|
process.stdout.write(`route mapping: ${stats.routeExact} exact, ${stats.routeBestEffort} best-effort
|
|
3768
3130
|
`);
|
|
@@ -3776,7 +3138,7 @@ function collectWatchPaths(config, cwd) {
|
|
|
3776
3138
|
const paths = ["src/routes/**"];
|
|
3777
3139
|
if (config.source.mode === "content-files" && config.source.contentFiles) {
|
|
3778
3140
|
for (const pattern of config.source.contentFiles.globs) {
|
|
3779
|
-
paths.push(
|
|
3141
|
+
paths.push(path12.join(config.source.contentFiles.baseDir, pattern));
|
|
3780
3142
|
}
|
|
3781
3143
|
}
|
|
3782
3144
|
if (config.source.mode === "static-output") {
|
|
@@ -3789,25 +3151,22 @@ function collectWatchPaths(config, cwd) {
|
|
|
3789
3151
|
paths.push("searchsocket.config.ts");
|
|
3790
3152
|
paths.push(config.source.build.outputDir);
|
|
3791
3153
|
}
|
|
3792
|
-
return paths.map((value) =>
|
|
3154
|
+
return paths.map((value) => path12.resolve(cwd, value));
|
|
3793
3155
|
}
|
|
3794
3156
|
function ensureStateDir(cwd) {
|
|
3795
|
-
const target =
|
|
3796
|
-
|
|
3157
|
+
const target = path12.join(cwd, ".searchsocket");
|
|
3158
|
+
fs8.mkdirSync(target, { recursive: true });
|
|
3797
3159
|
return target;
|
|
3798
3160
|
}
|
|
3799
3161
|
function ensureGitignore(cwd) {
|
|
3800
|
-
const gitignorePath =
|
|
3162
|
+
const gitignorePath = path12.join(cwd, ".gitignore");
|
|
3801
3163
|
const entries = [
|
|
3802
|
-
".searchsocket/vectors.db",
|
|
3803
|
-
".searchsocket/vectors.db-shm",
|
|
3804
|
-
".searchsocket/vectors.db-wal",
|
|
3805
3164
|
".searchsocket/manifest.json",
|
|
3806
3165
|
".searchsocket/registry.json"
|
|
3807
3166
|
];
|
|
3808
3167
|
let content = "";
|
|
3809
|
-
if (
|
|
3810
|
-
content =
|
|
3168
|
+
if (fs8.existsSync(gitignorePath)) {
|
|
3169
|
+
content = fs8.readFileSync(gitignorePath, "utf8");
|
|
3811
3170
|
}
|
|
3812
3171
|
const lines = content.split("\n");
|
|
3813
3172
|
const missing = entries.filter((entry) => !lines.some((line) => line.trim() === entry));
|
|
@@ -3818,10 +3177,10 @@ function ensureGitignore(cwd) {
|
|
|
3818
3177
|
# SearchSocket local state
|
|
3819
3178
|
${missing.join("\n")}
|
|
3820
3179
|
`;
|
|
3821
|
-
|
|
3180
|
+
fs8.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
|
|
3822
3181
|
}
|
|
3823
3182
|
function readScopesFromFile(filePath) {
|
|
3824
|
-
const raw =
|
|
3183
|
+
const raw = fs8.readFileSync(filePath, "utf8");
|
|
3825
3184
|
return new Set(
|
|
3826
3185
|
raw.split(/\r?\n/).map((line) => line.trim()).filter(Boolean)
|
|
3827
3186
|
);
|
|
@@ -3845,8 +3204,8 @@ function readRemoteGitBranches(cwd) {
|
|
|
3845
3204
|
}
|
|
3846
3205
|
}
|
|
3847
3206
|
async function loadResolvedConfigForDev(cwd, configPath) {
|
|
3848
|
-
const resolvedConfigPath =
|
|
3849
|
-
if (
|
|
3207
|
+
const resolvedConfigPath = path12.resolve(cwd, configPath ?? "searchsocket.config.ts");
|
|
3208
|
+
if (fs8.existsSync(resolvedConfigPath)) {
|
|
3850
3209
|
return loadConfig({ cwd, configPath });
|
|
3851
3210
|
}
|
|
3852
3211
|
return mergeConfig(cwd, {});
|
|
@@ -3893,7 +3252,7 @@ var program = new Command();
|
|
|
3893
3252
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
3894
3253
|
program.command("init").description("Create searchsocket.config.ts and .searchsocket state directory").action(async (_opts, command) => {
|
|
3895
3254
|
const root = getRootOptions(command).cwd ?? process.cwd();
|
|
3896
|
-
const cwd =
|
|
3255
|
+
const cwd = path12.resolve(root);
|
|
3897
3256
|
const configPath = writeMinimalConfig(cwd);
|
|
3898
3257
|
const stateDir = ensureStateDir(cwd);
|
|
3899
3258
|
ensureGitignore(cwd);
|
|
@@ -3911,9 +3270,9 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3911
3270
|
process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
|
|
3912
3271
|
process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
|
|
3913
3272
|
});
|
|
3914
|
-
program.command("index").description("Index site content into
|
|
3273
|
+
program.command("index").description("Index site content into Upstash Search").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full rebuild", false).option("--dry-run", "compute plan, no writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3915
3274
|
const rootOpts = getRootOptions(command);
|
|
3916
|
-
const cwd =
|
|
3275
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3917
3276
|
await runIndexCommand({
|
|
3918
3277
|
cwd,
|
|
3919
3278
|
configPath: rootOpts?.config,
|
|
@@ -3929,16 +3288,16 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3929
3288
|
json: opts.json
|
|
3930
3289
|
});
|
|
3931
3290
|
});
|
|
3932
|
-
program.command("status").description("Show scope, indexing state, backend health
|
|
3291
|
+
program.command("status").description("Show scope, indexing state, and backend health").option("--scope <name>", "scope override").action(async (opts, command) => {
|
|
3933
3292
|
const rootOpts = getRootOptions(command);
|
|
3934
|
-
const cwd =
|
|
3293
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3935
3294
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3936
3295
|
const scope = resolveScope(config, opts.scope);
|
|
3937
|
-
let
|
|
3296
|
+
let store;
|
|
3938
3297
|
let health = { ok: false, details: "not checked" };
|
|
3939
3298
|
try {
|
|
3940
|
-
|
|
3941
|
-
health = await
|
|
3299
|
+
store = await createUpstashStore(config);
|
|
3300
|
+
health = await store.health();
|
|
3942
3301
|
} catch (error) {
|
|
3943
3302
|
health = {
|
|
3944
3303
|
ok: false,
|
|
@@ -3946,24 +3305,22 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3946
3305
|
};
|
|
3947
3306
|
process.stdout.write(`project: ${config.project.id}
|
|
3948
3307
|
`);
|
|
3949
|
-
process.stdout.write(`
|
|
3308
|
+
process.stdout.write(`backend health: error (${health.details})
|
|
3950
3309
|
`);
|
|
3951
3310
|
process.exitCode = 1;
|
|
3952
3311
|
return;
|
|
3953
3312
|
}
|
|
3954
3313
|
let scopeRegistry = [];
|
|
3955
3314
|
let scopeInfo;
|
|
3956
|
-
let hashes = /* @__PURE__ */ new Map();
|
|
3957
3315
|
try {
|
|
3958
|
-
scopeRegistry = await
|
|
3316
|
+
scopeRegistry = await store.listScopes(config.project.id);
|
|
3959
3317
|
scopeInfo = scopeRegistry.find((entry) => entry.scopeName === scope.scopeName);
|
|
3960
|
-
hashes = await vectorStore.getContentHashes(scope);
|
|
3961
3318
|
} catch (error) {
|
|
3962
3319
|
process.stdout.write(`project: ${config.project.id}
|
|
3963
3320
|
`);
|
|
3964
3321
|
process.stdout.write(`resolved scope: ${scope.scopeName}
|
|
3965
3322
|
`);
|
|
3966
|
-
process.stdout.write(`
|
|
3323
|
+
process.stdout.write(`backend health: error (${error instanceof Error ? error.message : "unknown error"})
|
|
3967
3324
|
`);
|
|
3968
3325
|
process.exitCode = 1;
|
|
3969
3326
|
return;
|
|
@@ -3972,25 +3329,15 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3972
3329
|
`);
|
|
3973
3330
|
process.stdout.write(`resolved scope: ${scope.scopeName}
|
|
3974
3331
|
`);
|
|
3975
|
-
process.stdout.write(`
|
|
3976
|
-
`);
|
|
3977
|
-
const tursoUrl = process.env[config.vector.turso.urlEnv];
|
|
3978
|
-
const vectorMode = tursoUrl ? `remote (${tursoUrl})` : `local (${config.vector.turso.localPath})`;
|
|
3979
|
-
process.stdout.write(`vector backend: turso/libsql (${vectorMode})
|
|
3332
|
+
process.stdout.write(`backend: upstash-search
|
|
3980
3333
|
`);
|
|
3981
|
-
process.stdout.write(`
|
|
3334
|
+
process.stdout.write(`backend health: ${health.ok ? "ok" : `error (${health.details ?? "n/a"})`}
|
|
3982
3335
|
`);
|
|
3983
3336
|
if (scopeInfo) {
|
|
3984
3337
|
process.stdout.write(`last indexed (${scope.scopeName}): ${scopeInfo.lastIndexedAt ?? "never"}
|
|
3985
3338
|
`);
|
|
3986
|
-
|
|
3987
|
-
`
|
|
3988
|
-
if (scopeInfo.lastEstimateTokens != null) {
|
|
3989
|
-
process.stdout.write(`last estimated tokens: ${scopeInfo.lastEstimateTokens}
|
|
3990
|
-
`);
|
|
3991
|
-
}
|
|
3992
|
-
if (scopeInfo.lastEstimateCostUSD != null) {
|
|
3993
|
-
process.stdout.write(`last estimated cost: ${formatUsd(scopeInfo.lastEstimateCostUSD)}
|
|
3339
|
+
if (scopeInfo.documentCount != null) {
|
|
3340
|
+
process.stdout.write(`documents: ${scopeInfo.documentCount}
|
|
3994
3341
|
`);
|
|
3995
3342
|
}
|
|
3996
3343
|
} else {
|
|
@@ -4001,7 +3348,7 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
4001
3348
|
process.stdout.write("\nregistry scopes:\n");
|
|
4002
3349
|
for (const item of scopeRegistry) {
|
|
4003
3350
|
process.stdout.write(
|
|
4004
|
-
` - ${item.scopeName}
|
|
3351
|
+
` - ${item.scopeName} lastIndexedAt=${item.lastIndexedAt} documents=${item.documentCount ?? "unknown"}
|
|
4005
3352
|
`
|
|
4006
3353
|
);
|
|
4007
3354
|
}
|
|
@@ -4009,7 +3356,7 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
4009
3356
|
});
|
|
4010
3357
|
program.command("dev").description("Watch content files/routes and incrementally reindex on changes").option("--scope <name>", "scope override").option("--mcp", "start MCP server (http transport) alongside watcher", false).option("--mcp-port <n>", "MCP HTTP port", "3338").option("--mcp-path <path>", "MCP HTTP path", "/mcp").option("--verbose", "verbose logs", false).action(async (opts, command) => {
|
|
4011
3358
|
const rootOpts = getRootOptions(command);
|
|
4012
|
-
const cwd =
|
|
3359
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4013
3360
|
const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
|
|
4014
3361
|
const watchPaths = collectWatchPaths(config, cwd);
|
|
4015
3362
|
process.stdout.write("starting searchsocket dev watcher...\n");
|
|
@@ -4076,45 +3423,44 @@ ${watchPaths.map((entry) => ` - ${entry}`).join("\n")}
|
|
|
4076
3423
|
});
|
|
4077
3424
|
});
|
|
4078
3425
|
});
|
|
4079
|
-
program.command("clean").description("Delete local state and optionally delete remote
|
|
3426
|
+
program.command("clean").description("Delete local state and optionally delete remote indexes for a scope").option("--scope <name>", "scope override").option("--remote", "delete remote scope indexes", false).action(async (opts, command) => {
|
|
4080
3427
|
const rootOpts = getRootOptions(command);
|
|
4081
|
-
const cwd =
|
|
3428
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4082
3429
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
4083
|
-
const
|
|
4084
|
-
const statePath = path14.join(cwd, config.state.dir);
|
|
3430
|
+
const statePath = path12.join(cwd, config.state.dir);
|
|
4085
3431
|
await fsp.rm(statePath, { recursive: true, force: true });
|
|
4086
3432
|
process.stdout.write(`deleted local state directory: ${statePath}
|
|
4087
3433
|
`);
|
|
4088
3434
|
if (opts.remote) {
|
|
4089
|
-
const
|
|
4090
|
-
await
|
|
4091
|
-
process.stdout.write(`dropped all remote
|
|
3435
|
+
const store = await createUpstashStore(config);
|
|
3436
|
+
await store.dropAllIndexes(config.project.id);
|
|
3437
|
+
process.stdout.write(`dropped all remote indexes for project ${config.project.id}
|
|
4092
3438
|
`);
|
|
4093
3439
|
}
|
|
4094
3440
|
});
|
|
4095
3441
|
program.command("prune").description("List/delete stale scopes (dry-run by default)").option("--apply", "apply deletions", false).option("--scopes-file <path>", "file containing active scopes").option("--older-than <duration>", "ttl cutoff like 30d").action(async (opts, command) => {
|
|
4096
3442
|
const rootOpts = getRootOptions(command);
|
|
4097
|
-
const cwd =
|
|
3443
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4098
3444
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
4099
3445
|
const baseScope = resolveScope(config);
|
|
4100
|
-
let
|
|
3446
|
+
let store;
|
|
4101
3447
|
let scopes;
|
|
4102
3448
|
try {
|
|
4103
|
-
|
|
4104
|
-
scopes = await
|
|
3449
|
+
store = await createUpstashStore(config);
|
|
3450
|
+
scopes = await store.listScopes(config.project.id);
|
|
4105
3451
|
} catch (error) {
|
|
4106
3452
|
process.stderr.write(
|
|
4107
|
-
`error: failed to access
|
|
3453
|
+
`error: failed to access Upstash Search: ${error instanceof Error ? error.message : String(error)}
|
|
4108
3454
|
`
|
|
4109
3455
|
);
|
|
4110
3456
|
process.exitCode = 1;
|
|
4111
3457
|
return;
|
|
4112
3458
|
}
|
|
4113
|
-
process.stdout.write(`using
|
|
3459
|
+
process.stdout.write(`using Upstash Search
|
|
4114
3460
|
`);
|
|
4115
3461
|
let keepScopes = /* @__PURE__ */ new Set();
|
|
4116
3462
|
if (opts.scopesFile) {
|
|
4117
|
-
keepScopes = readScopesFromFile(
|
|
3463
|
+
keepScopes = readScopesFromFile(path12.resolve(cwd, opts.scopesFile));
|
|
4118
3464
|
} else {
|
|
4119
3465
|
keepScopes = readRemoteGitBranches(cwd);
|
|
4120
3466
|
}
|
|
@@ -4132,7 +3478,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
4132
3478
|
staleByList = !keepScopes.has(entry.scopeName);
|
|
4133
3479
|
}
|
|
4134
3480
|
let staleByTtl = false;
|
|
4135
|
-
if (olderThanMs) {
|
|
3481
|
+
if (olderThanMs && entry.lastIndexedAt !== "unknown") {
|
|
4136
3482
|
staleByTtl = now - Date.parse(entry.lastIndexedAt) > olderThanMs;
|
|
4137
3483
|
}
|
|
4138
3484
|
if (keepScopes.size > 0 && olderThanMs) {
|
|
@@ -4168,7 +3514,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
4168
3514
|
scopeId: `${config.project.id}:${entry.scopeName}`
|
|
4169
3515
|
};
|
|
4170
3516
|
try {
|
|
4171
|
-
await
|
|
3517
|
+
await store.deleteScope(scope);
|
|
4172
3518
|
deleted += 1;
|
|
4173
3519
|
} catch (error) {
|
|
4174
3520
|
process.stdout.write(
|
|
@@ -4185,7 +3531,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
4185
3531
|
});
|
|
4186
3532
|
program.command("doctor").description("Validate config, env vars, provider connectivity, and local write access").action(async (_opts, command) => {
|
|
4187
3533
|
const rootOpts = getRootOptions(command);
|
|
4188
|
-
const cwd =
|
|
3534
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4189
3535
|
const checks = [];
|
|
4190
3536
|
let config = null;
|
|
4191
3537
|
try {
|
|
@@ -4199,23 +3545,21 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4199
3545
|
});
|
|
4200
3546
|
}
|
|
4201
3547
|
if (config) {
|
|
4202
|
-
const
|
|
3548
|
+
const upstashUrl = config.upstash.url ?? process.env[config.upstash.urlEnv];
|
|
3549
|
+
const upstashToken = config.upstash.token ?? process.env[config.upstash.tokenEnv];
|
|
4203
3550
|
checks.push({
|
|
4204
|
-
name: `env ${config.
|
|
4205
|
-
ok: Boolean(
|
|
4206
|
-
details:
|
|
3551
|
+
name: `env ${config.upstash.urlEnv}`,
|
|
3552
|
+
ok: Boolean(upstashUrl),
|
|
3553
|
+
details: upstashUrl ? void 0 : "missing"
|
|
3554
|
+
});
|
|
3555
|
+
checks.push({
|
|
3556
|
+
name: `env ${config.upstash.tokenEnv}`,
|
|
3557
|
+
ok: Boolean(upstashToken),
|
|
3558
|
+
details: upstashToken ? void 0 : "missing"
|
|
4207
3559
|
});
|
|
4208
|
-
{
|
|
4209
|
-
const tursoUrl = process.env[config.vector.turso.urlEnv];
|
|
4210
|
-
checks.push({
|
|
4211
|
-
name: "turso/libsql",
|
|
4212
|
-
ok: true,
|
|
4213
|
-
details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
|
|
4214
|
-
});
|
|
4215
|
-
}
|
|
4216
3560
|
if (config.source.mode === "static-output") {
|
|
4217
|
-
const outputDir =
|
|
4218
|
-
const exists =
|
|
3561
|
+
const outputDir = path12.resolve(cwd, config.source.staticOutputDir);
|
|
3562
|
+
const exists = fs8.existsSync(outputDir);
|
|
4219
3563
|
checks.push({
|
|
4220
3564
|
name: "source: static output dir",
|
|
4221
3565
|
ok: exists,
|
|
@@ -4224,15 +3568,15 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4224
3568
|
} else if (config.source.mode === "build") {
|
|
4225
3569
|
const buildConfig = config.source.build;
|
|
4226
3570
|
if (buildConfig) {
|
|
4227
|
-
const manifestPath =
|
|
4228
|
-
const manifestExists =
|
|
3571
|
+
const manifestPath = path12.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
|
|
3572
|
+
const manifestExists = fs8.existsSync(manifestPath);
|
|
4229
3573
|
checks.push({
|
|
4230
3574
|
name: "source: build manifest",
|
|
4231
3575
|
ok: manifestExists,
|
|
4232
3576
|
details: manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
|
|
4233
3577
|
});
|
|
4234
|
-
const viteBin =
|
|
4235
|
-
const viteExists =
|
|
3578
|
+
const viteBin = path12.resolve(cwd, "node_modules", ".bin", "vite");
|
|
3579
|
+
const viteExists = fs8.existsSync(viteBin);
|
|
4236
3580
|
checks.push({
|
|
4237
3581
|
name: "source: vite binary",
|
|
4238
3582
|
ok: viteExists,
|
|
@@ -4249,7 +3593,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4249
3593
|
const contentConfig = config.source.contentFiles;
|
|
4250
3594
|
if (contentConfig) {
|
|
4251
3595
|
const fg4 = await import("fast-glob");
|
|
4252
|
-
const baseDir =
|
|
3596
|
+
const baseDir = path12.resolve(cwd, contentConfig.baseDir);
|
|
4253
3597
|
const files = await fg4.default(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
|
|
4254
3598
|
checks.push({
|
|
4255
3599
|
name: "source: content files",
|
|
@@ -4264,61 +3608,26 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4264
3608
|
});
|
|
4265
3609
|
}
|
|
4266
3610
|
}
|
|
4267
|
-
try {
|
|
4268
|
-
const provider = createEmbeddingsProvider(config);
|
|
4269
|
-
await provider.embedTexts(["searchsocket doctor ping"], config.embeddings.model);
|
|
4270
|
-
checks.push({ name: "embedding provider connectivity", ok: true });
|
|
4271
|
-
} catch (error) {
|
|
4272
|
-
checks.push({
|
|
4273
|
-
name: "embedding provider connectivity",
|
|
4274
|
-
ok: false,
|
|
4275
|
-
details: error instanceof Error ? error.message : "unknown error"
|
|
4276
|
-
});
|
|
4277
|
-
}
|
|
4278
3611
|
let store = null;
|
|
4279
3612
|
try {
|
|
4280
|
-
store = await
|
|
3613
|
+
store = await createUpstashStore(config);
|
|
4281
3614
|
const health = await store.health();
|
|
4282
3615
|
checks.push({
|
|
4283
|
-
name: "
|
|
3616
|
+
name: "upstash search connectivity",
|
|
4284
3617
|
ok: health.ok,
|
|
4285
3618
|
details: health.details
|
|
4286
3619
|
});
|
|
4287
3620
|
} catch (error) {
|
|
4288
3621
|
checks.push({
|
|
4289
|
-
name: "
|
|
3622
|
+
name: "upstash search connectivity",
|
|
4290
3623
|
ok: false,
|
|
4291
3624
|
details: error instanceof Error ? error.message : "unknown error"
|
|
4292
3625
|
});
|
|
4293
3626
|
}
|
|
4294
|
-
if (store) {
|
|
4295
|
-
try {
|
|
4296
|
-
const testScope = {
|
|
4297
|
-
projectId: config.project.id,
|
|
4298
|
-
scopeName: "_searchsocket_doctor_probe",
|
|
4299
|
-
scopeId: `${config.project.id}:_searchsocket_doctor_probe`
|
|
4300
|
-
};
|
|
4301
|
-
await store.recordScope({
|
|
4302
|
-
projectId: testScope.projectId,
|
|
4303
|
-
scopeName: testScope.scopeName,
|
|
4304
|
-
modelId: config.embeddings.model,
|
|
4305
|
-
lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4306
|
-
vectorCount: 0
|
|
4307
|
-
});
|
|
4308
|
-
await store.deleteScope(testScope);
|
|
4309
|
-
checks.push({ name: "vector backend write permission", ok: true });
|
|
4310
|
-
} catch (error) {
|
|
4311
|
-
checks.push({
|
|
4312
|
-
name: "vector backend write permission",
|
|
4313
|
-
ok: false,
|
|
4314
|
-
details: error instanceof Error ? error.message : "write test failed"
|
|
4315
|
-
});
|
|
4316
|
-
}
|
|
4317
|
-
}
|
|
4318
3627
|
try {
|
|
4319
3628
|
const scope = resolveScope(config);
|
|
4320
3629
|
const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
|
|
4321
|
-
const testPath =
|
|
3630
|
+
const testPath = path12.join(statePath, ".write-test");
|
|
4322
3631
|
await fsp.writeFile(testPath, "ok\n", "utf8");
|
|
4323
3632
|
await fsp.rm(testPath, { force: true });
|
|
4324
3633
|
checks.push({ name: "state directory writable", ok: true });
|
|
@@ -4347,7 +3656,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4347
3656
|
});
|
|
4348
3657
|
program.command("mcp").description("Run SearchSocket MCP server").option("--transport <transport>", "stdio|http", "stdio").option("--port <n>", "HTTP port", "3338").option("--path <path>", "HTTP path", "/mcp").action(async (opts, command) => {
|
|
4349
3658
|
const rootOpts = getRootOptions(command);
|
|
4350
|
-
const cwd =
|
|
3659
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4351
3660
|
await runMcpServer({
|
|
4352
3661
|
cwd,
|
|
4353
3662
|
configPath: rootOpts?.config,
|
|
@@ -4356,9 +3665,9 @@ program.command("mcp").description("Run SearchSocket MCP server").option("--tran
|
|
|
4356
3665
|
httpPath: opts.path
|
|
4357
3666
|
});
|
|
4358
3667
|
});
|
|
4359
|
-
program.command("search").description("Quick
|
|
3668
|
+
program.command("search").description("Quick CLI search against Upstash Search").requiredOption("--q <query>", "search query").option("--scope <name>", "scope override").option("--top-k <n>", "top K results", "10").option("--path-prefix <prefix>", "path prefix filter").action(async (opts, command) => {
|
|
4360
3669
|
const rootOpts = getRootOptions(command);
|
|
4361
|
-
const cwd =
|
|
3670
|
+
const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4362
3671
|
const engine = await SearchEngine.create({
|
|
4363
3672
|
cwd,
|
|
4364
3673
|
configPath: rootOpts?.config
|
|
@@ -4367,14 +3676,13 @@ program.command("search").description("Quick local CLI search against indexed ve
|
|
|
4367
3676
|
q: opts.q,
|
|
4368
3677
|
scope: opts.scope,
|
|
4369
3678
|
topK: parsePositiveInt(opts.topK, "--top-k"),
|
|
4370
|
-
pathPrefix: opts.pathPrefix
|
|
4371
|
-
rerank: opts.rerank
|
|
3679
|
+
pathPrefix: opts.pathPrefix
|
|
4372
3680
|
});
|
|
4373
3681
|
process.stdout.write(`${JSON.stringify(result, null, 2)}
|
|
4374
3682
|
`);
|
|
4375
3683
|
});
|
|
4376
3684
|
async function main() {
|
|
4377
|
-
dotenvConfig({ path:
|
|
3685
|
+
dotenvConfig({ path: path12.resolve(process.cwd(), ".env") });
|
|
4378
3686
|
await program.parseAsync(process.argv);
|
|
4379
3687
|
}
|
|
4380
3688
|
main().catch((error) => {
|