searchsocket 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +348 -111
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +3 -3
- package/dist/cli.js.map +0 -1
- package/dist/client.cjs.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/index.cjs.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/sveltekit.cjs.map +0 -1
- package/dist/sveltekit.js.map +0 -1
package/dist/cli.js
CHANGED
|
@@ -12,13 +12,13 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.2.0",
|
|
15
|
+
version: "0.3.0",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
19
19
|
repository: {
|
|
20
20
|
type: "git",
|
|
21
|
-
url: "https://github.com/gregpriday/searchsocket.git"
|
|
21
|
+
url: "git+https://github.com/gregpriday/searchsocket.git"
|
|
22
22
|
},
|
|
23
23
|
homepage: "https://github.com/gregpriday/searchsocket",
|
|
24
24
|
bugs: {
|
|
@@ -37,6 +37,7 @@ var package_default = {
|
|
|
37
37
|
type: "module",
|
|
38
38
|
files: [
|
|
39
39
|
"dist",
|
|
40
|
+
"!dist/**/*.map",
|
|
40
41
|
"README.md"
|
|
41
42
|
],
|
|
42
43
|
bin: {
|
|
@@ -81,7 +82,6 @@ var package_default = {
|
|
|
81
82
|
"fast-glob": "^3.3.3",
|
|
82
83
|
"gray-matter": "^4.0.3",
|
|
83
84
|
jiti: "^2.6.1",
|
|
84
|
-
openai: "^6.19.0",
|
|
85
85
|
"p-limit": "^7.3.0",
|
|
86
86
|
turndown: "^7.2.2",
|
|
87
87
|
"turndown-plugin-gfm": "^1.0.2",
|
|
@@ -132,7 +132,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
132
132
|
outputDir: z.string().min(1).optional(),
|
|
133
133
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
134
134
|
exclude: z.array(z.string()).optional(),
|
|
135
|
-
previewTimeout: z.number().int().positive().optional()
|
|
135
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
136
|
+
discover: z.boolean().optional(),
|
|
137
|
+
seedUrls: z.array(z.string()).optional(),
|
|
138
|
+
maxPages: z.number().int().positive().optional(),
|
|
139
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
136
140
|
}).optional()
|
|
137
141
|
}).optional(),
|
|
138
142
|
extract: z.object({
|
|
@@ -159,8 +163,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
159
163
|
pageSummaryChunk: z.boolean().optional()
|
|
160
164
|
}).optional(),
|
|
161
165
|
embeddings: z.object({
|
|
162
|
-
provider: z.literal("
|
|
166
|
+
provider: z.literal("jina").optional(),
|
|
163
167
|
model: z.string().min(1).optional(),
|
|
168
|
+
apiKey: z.string().min(1).optional(),
|
|
164
169
|
apiKeyEnv: z.string().min(1).optional(),
|
|
165
170
|
batchSize: z.number().int().positive().optional(),
|
|
166
171
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -169,18 +174,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
169
174
|
vector: z.object({
|
|
170
175
|
dimension: z.number().int().positive().optional(),
|
|
171
176
|
turso: z.object({
|
|
177
|
+
url: z.string().url().optional(),
|
|
178
|
+
authToken: z.string().min(1).optional(),
|
|
172
179
|
urlEnv: z.string().optional(),
|
|
173
180
|
authTokenEnv: z.string().optional(),
|
|
174
181
|
localPath: z.string().optional()
|
|
175
182
|
}).optional()
|
|
176
183
|
}).optional(),
|
|
177
184
|
rerank: z.object({
|
|
178
|
-
|
|
185
|
+
enabled: z.boolean().optional(),
|
|
179
186
|
topN: z.number().int().positive().optional(),
|
|
180
|
-
|
|
181
|
-
apiKeyEnv: z.string().optional(),
|
|
182
|
-
model: z.string().optional()
|
|
183
|
-
}).optional()
|
|
187
|
+
model: z.string().optional()
|
|
184
188
|
}).optional(),
|
|
185
189
|
ranking: z.object({
|
|
186
190
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -189,6 +193,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
189
193
|
aggregationCap: z.number().int().positive().optional(),
|
|
190
194
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
191
195
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
196
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
192
197
|
weights: z.object({
|
|
193
198
|
incomingLinks: z.number().optional(),
|
|
194
199
|
depth: z.number().optional(),
|
|
@@ -269,9 +274,9 @@ function createDefaultConfig(projectId) {
|
|
|
269
274
|
pageSummaryChunk: true
|
|
270
275
|
},
|
|
271
276
|
embeddings: {
|
|
272
|
-
provider: "
|
|
273
|
-
model: "
|
|
274
|
-
apiKeyEnv: "
|
|
277
|
+
provider: "jina",
|
|
278
|
+
model: "jina-embeddings-v3",
|
|
279
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
275
280
|
batchSize: 64,
|
|
276
281
|
concurrency: 4
|
|
277
282
|
},
|
|
@@ -283,12 +288,9 @@ function createDefaultConfig(projectId) {
|
|
|
283
288
|
}
|
|
284
289
|
},
|
|
285
290
|
rerank: {
|
|
286
|
-
|
|
291
|
+
enabled: false,
|
|
287
292
|
topN: 20,
|
|
288
|
-
|
|
289
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
290
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
291
|
-
}
|
|
293
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
292
294
|
},
|
|
293
295
|
ranking: {
|
|
294
296
|
enableIncomingLinkBoost: true,
|
|
@@ -297,6 +299,7 @@ function createDefaultConfig(projectId) {
|
|
|
297
299
|
aggregationCap: 5,
|
|
298
300
|
aggregationDecay: 0.5,
|
|
299
301
|
minChunkScoreRatio: 0.5,
|
|
302
|
+
minScore: 0,
|
|
300
303
|
weights: {
|
|
301
304
|
incomingLinks: 0.05,
|
|
302
305
|
depth: 0.03,
|
|
@@ -407,7 +410,11 @@ ${issues}`
|
|
|
407
410
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
408
411
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
409
412
|
exclude: parsed.source.build.exclude ?? [],
|
|
410
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
413
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
414
|
+
discover: parsed.source.build.discover ?? false,
|
|
415
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
416
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
417
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
411
418
|
} : void 0
|
|
412
419
|
},
|
|
413
420
|
extract: {
|
|
@@ -436,11 +443,7 @@ ${issues}`
|
|
|
436
443
|
},
|
|
437
444
|
rerank: {
|
|
438
445
|
...defaults.rerank,
|
|
439
|
-
...parsed.rerank
|
|
440
|
-
jina: {
|
|
441
|
-
...defaults.rerank.jina,
|
|
442
|
-
...parsed.rerank?.jina
|
|
443
|
-
}
|
|
446
|
+
...parsed.rerank
|
|
444
447
|
},
|
|
445
448
|
ranking: {
|
|
446
449
|
...defaults.ranking,
|
|
@@ -487,7 +490,11 @@ ${issues}`
|
|
|
487
490
|
outputDir: ".svelte-kit/output",
|
|
488
491
|
paramValues: {},
|
|
489
492
|
exclude: [],
|
|
490
|
-
previewTimeout: 3e4
|
|
493
|
+
previewTimeout: 3e4,
|
|
494
|
+
discover: false,
|
|
495
|
+
seedUrls: ["/"],
|
|
496
|
+
maxPages: 200,
|
|
497
|
+
maxDepth: 10
|
|
491
498
|
};
|
|
492
499
|
}
|
|
493
500
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -528,7 +535,7 @@ function writeMinimalConfig(cwd) {
|
|
|
528
535
|
return target;
|
|
529
536
|
}
|
|
530
537
|
const content = `export default {
|
|
531
|
-
embeddings: { apiKeyEnv: "
|
|
538
|
+
embeddings: { apiKeyEnv: "JINA_API_KEY" }
|
|
532
539
|
};
|
|
533
540
|
`;
|
|
534
541
|
fs.writeFileSync(target, content, "utf8");
|
|
@@ -539,14 +546,16 @@ function writeMinimalConfig(cwd) {
|
|
|
539
546
|
var Logger = class {
|
|
540
547
|
json;
|
|
541
548
|
verbose;
|
|
549
|
+
quiet;
|
|
542
550
|
stderrOnly;
|
|
543
551
|
constructor(opts = {}) {
|
|
544
552
|
this.json = opts.json ?? false;
|
|
545
553
|
this.verbose = opts.verbose ?? false;
|
|
554
|
+
this.quiet = opts.quiet ?? false;
|
|
546
555
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
547
556
|
}
|
|
548
557
|
info(message) {
|
|
549
|
-
if (this.json) {
|
|
558
|
+
if (this.quiet || this.json) {
|
|
550
559
|
return;
|
|
551
560
|
}
|
|
552
561
|
this.writeOut(`${message}
|
|
@@ -560,7 +569,7 @@ var Logger = class {
|
|
|
560
569
|
this.logJson("debug", { message });
|
|
561
570
|
return;
|
|
562
571
|
}
|
|
563
|
-
this.writeOut(
|
|
572
|
+
this.writeOut(` ${message}
|
|
564
573
|
`);
|
|
565
574
|
}
|
|
566
575
|
warn(message) {
|
|
@@ -587,7 +596,7 @@ var Logger = class {
|
|
|
587
596
|
this.logJson(event, data);
|
|
588
597
|
return;
|
|
589
598
|
}
|
|
590
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
599
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
591
600
|
`);
|
|
592
601
|
}
|
|
593
602
|
writeOut(text) {
|
|
@@ -694,18 +703,18 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
694
703
|
return { statePath, pagesPath };
|
|
695
704
|
}
|
|
696
705
|
|
|
697
|
-
// src/embeddings/
|
|
698
|
-
import OpenAI from "openai";
|
|
706
|
+
// src/embeddings/jina.ts
|
|
699
707
|
import pLimit from "p-limit";
|
|
700
708
|
function sleep(ms) {
|
|
701
709
|
return new Promise((resolve) => {
|
|
702
710
|
setTimeout(resolve, ms);
|
|
703
711
|
});
|
|
704
712
|
}
|
|
705
|
-
var OpenAIEmbeddingsProvider = class {
|
|
706
|
-
|
|
713
|
+
var JinaEmbeddingsProvider = class {
|
|
714
|
+
apiKey;
|
|
707
715
|
batchSize;
|
|
708
716
|
concurrency;
|
|
717
|
+
defaultTask;
|
|
709
718
|
constructor(options) {
|
|
710
719
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
711
720
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -713,11 +722,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
713
722
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
714
723
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
715
724
|
}
|
|
716
|
-
this.
|
|
717
|
-
apiKey: options.apiKey
|
|
718
|
-
});
|
|
725
|
+
this.apiKey = options.apiKey;
|
|
719
726
|
this.batchSize = options.batchSize;
|
|
720
727
|
this.concurrency = options.concurrency;
|
|
728
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
721
729
|
}
|
|
722
730
|
estimateTokens(text) {
|
|
723
731
|
const normalized = text.trim();
|
|
@@ -731,7 +739,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
731
739
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
732
740
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
733
741
|
}
|
|
734
|
-
async embedTexts(texts, modelId) {
|
|
742
|
+
async embedTexts(texts, modelId, task) {
|
|
735
743
|
if (texts.length === 0) {
|
|
736
744
|
return [];
|
|
737
745
|
}
|
|
@@ -747,33 +755,52 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
747
755
|
await Promise.all(
|
|
748
756
|
batches.map(
|
|
749
757
|
(batch, position) => limit(async () => {
|
|
750
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
758
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
751
759
|
})
|
|
752
760
|
)
|
|
753
761
|
);
|
|
754
762
|
return outputs.flat();
|
|
755
763
|
}
|
|
756
|
-
async embedWithRetry(texts, modelId) {
|
|
764
|
+
async embedWithRetry(texts, modelId, task) {
|
|
757
765
|
const maxAttempts = 5;
|
|
758
766
|
let attempt = 0;
|
|
759
767
|
while (attempt < maxAttempts) {
|
|
760
768
|
attempt += 1;
|
|
769
|
+
let response;
|
|
761
770
|
try {
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
771
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
772
|
+
method: "POST",
|
|
773
|
+
headers: {
|
|
774
|
+
"content-type": "application/json",
|
|
775
|
+
authorization: `Bearer ${this.apiKey}`
|
|
776
|
+
},
|
|
777
|
+
body: JSON.stringify({
|
|
778
|
+
model: modelId,
|
|
779
|
+
input: texts,
|
|
780
|
+
task
|
|
781
|
+
})
|
|
766
782
|
});
|
|
767
|
-
return response.data.map((entry) => entry.embedding);
|
|
768
783
|
} catch (error) {
|
|
769
|
-
|
|
770
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
771
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
784
|
+
if (attempt >= maxAttempts) {
|
|
772
785
|
throw error;
|
|
773
786
|
}
|
|
774
|
-
|
|
775
|
-
|
|
787
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
788
|
+
continue;
|
|
776
789
|
}
|
|
790
|
+
if (!response.ok) {
|
|
791
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
792
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
793
|
+
const errorBody = await response.text();
|
|
794
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
795
|
+
}
|
|
796
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
797
|
+
continue;
|
|
798
|
+
}
|
|
799
|
+
const payload = await response.json();
|
|
800
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
801
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
802
|
+
}
|
|
803
|
+
return payload.data.map((entry) => entry.embedding);
|
|
777
804
|
}
|
|
778
805
|
throw new Error("Unreachable retry state");
|
|
779
806
|
}
|
|
@@ -781,20 +808,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
781
808
|
|
|
782
809
|
// src/embeddings/factory.ts
|
|
783
810
|
function createEmbeddingsProvider(config) {
|
|
784
|
-
if (config.embeddings.provider !== "openai") {
|
|
811
|
+
if (config.embeddings.provider !== "jina") {
|
|
785
812
|
throw new SearchSocketError(
|
|
786
813
|
"CONFIG_MISSING",
|
|
787
814
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
788
815
|
);
|
|
789
816
|
}
|
|
790
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
817
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
791
818
|
if (!apiKey) {
|
|
792
819
|
throw new SearchSocketError(
|
|
793
820
|
"CONFIG_MISSING",
|
|
794
|
-
`Missing embeddings API key env var
|
|
821
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
795
822
|
);
|
|
796
823
|
}
|
|
797
|
-
return new OpenAIEmbeddingsProvider({
|
|
824
|
+
return new JinaEmbeddingsProvider({
|
|
798
825
|
apiKey,
|
|
799
826
|
batchSize: config.embeddings.batchSize,
|
|
800
827
|
concurrency: config.embeddings.concurrency
|
|
@@ -808,6 +835,11 @@ import path11 from "path";
|
|
|
808
835
|
import fs3 from "fs";
|
|
809
836
|
import path3 from "path";
|
|
810
837
|
|
|
838
|
+
// src/core/serverless.ts
|
|
839
|
+
/**
 * Report whether the given environment carries any of the serverless marker
 * variables this module knows about.
 *
 * The markers checked are VERCEL, NETLIFY, AWS_LAMBDA_FUNCTION_NAME,
 * FUNCTIONS_WORKER and CF_PAGES. A marker only counts when its value is
 * truthy, so a variable that is present but set to the empty string is
 * treated as absent.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment
 *   to inspect. Defaults to the live process environment; pass an explicit
 *   object to probe a different environment (for example in tests).
 * @returns {boolean} `true` when at least one marker variable is set to a
 *   non-empty value, otherwise `false`.
 */
function isServerless(env = process.env) {
  return !!(env.VERCEL || env.NETLIFY || env.AWS_LAMBDA_FUNCTION_NAME || env.FUNCTIONS_WORKER || env.CF_PAGES);
}
|
|
842
|
+
|
|
811
843
|
// src/vector/turso.ts
|
|
812
844
|
var TursoVectorStore = class {
|
|
813
845
|
client;
|
|
@@ -852,6 +884,16 @@ var TursoVectorStore = class {
|
|
|
852
884
|
}
|
|
853
885
|
async ensureChunks(dim) {
|
|
854
886
|
if (this.chunksReady) return;
|
|
887
|
+
const exists = await this.chunksTableExists();
|
|
888
|
+
if (exists) {
|
|
889
|
+
const currentDim = await this.getChunksDimension();
|
|
890
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
891
|
+
await this.client.batch([
|
|
892
|
+
"DROP INDEX IF EXISTS idx",
|
|
893
|
+
"DROP TABLE IF EXISTS chunks"
|
|
894
|
+
]);
|
|
895
|
+
}
|
|
896
|
+
}
|
|
855
897
|
await this.client.batch([
|
|
856
898
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
857
899
|
id TEXT PRIMARY KEY,
|
|
@@ -863,6 +905,8 @@ var TursoVectorStore = class {
|
|
|
863
905
|
section_title TEXT NOT NULL DEFAULT '',
|
|
864
906
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
865
907
|
snippet TEXT NOT NULL DEFAULT '',
|
|
908
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
909
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
866
910
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
867
911
|
model_id TEXT NOT NULL DEFAULT '',
|
|
868
912
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
@@ -873,6 +917,19 @@ var TursoVectorStore = class {
|
|
|
873
917
|
)`,
|
|
874
918
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
875
919
|
]);
|
|
920
|
+
const chunkMigrationCols = [
|
|
921
|
+
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
922
|
+
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
923
|
+
];
|
|
924
|
+
for (const col of chunkMigrationCols) {
|
|
925
|
+
try {
|
|
926
|
+
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
927
|
+
} catch (error) {
|
|
928
|
+
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
929
|
+
throw error;
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
}
|
|
876
933
|
this.chunksReady = true;
|
|
877
934
|
}
|
|
878
935
|
async ensurePages() {
|
|
@@ -907,6 +964,38 @@ var TursoVectorStore = class {
|
|
|
907
964
|
throw error;
|
|
908
965
|
}
|
|
909
966
|
}
|
|
967
|
+
/**
|
|
968
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
969
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
970
|
+
*/
|
|
971
|
+
async getChunksDimension() {
|
|
972
|
+
try {
|
|
973
|
+
const rs = await this.client.execute(
|
|
974
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
975
|
+
);
|
|
976
|
+
if (rs.rows.length === 0) return null;
|
|
977
|
+
const sql = rs.rows[0].sql;
|
|
978
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
979
|
+
return match ? parseInt(match[1], 10) : null;
|
|
980
|
+
} catch {
|
|
981
|
+
return null;
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
/**
|
|
985
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
986
|
+
* Used by `clean --remote` for a full reset.
|
|
987
|
+
*/
|
|
988
|
+
async dropAllTables() {
|
|
989
|
+
await this.client.batch([
|
|
990
|
+
"DROP INDEX IF EXISTS idx",
|
|
991
|
+
"DROP TABLE IF EXISTS chunks",
|
|
992
|
+
"DROP TABLE IF EXISTS registry",
|
|
993
|
+
"DROP TABLE IF EXISTS pages"
|
|
994
|
+
]);
|
|
995
|
+
this.chunksReady = false;
|
|
996
|
+
this.registryReady = false;
|
|
997
|
+
this.pagesReady = false;
|
|
998
|
+
}
|
|
910
999
|
async upsert(records, _scope) {
|
|
911
1000
|
if (records.length === 0) return;
|
|
912
1001
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -917,9 +1006,9 @@ var TursoVectorStore = class {
|
|
|
917
1006
|
const stmts = batch.map((r) => ({
|
|
918
1007
|
sql: `INSERT OR REPLACE INTO chunks
|
|
919
1008
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
920
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
1009
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
921
1010
|
incoming_links, route_file, tags, embedding)
|
|
922
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
1011
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
923
1012
|
args: [
|
|
924
1013
|
r.id,
|
|
925
1014
|
r.metadata.projectId,
|
|
@@ -930,6 +1019,8 @@ var TursoVectorStore = class {
|
|
|
930
1019
|
r.metadata.sectionTitle,
|
|
931
1020
|
JSON.stringify(r.metadata.headingPath),
|
|
932
1021
|
r.metadata.snippet,
|
|
1022
|
+
r.metadata.chunkText,
|
|
1023
|
+
r.metadata.ordinal,
|
|
933
1024
|
r.metadata.contentHash,
|
|
934
1025
|
r.metadata.modelId,
|
|
935
1026
|
r.metadata.depth,
|
|
@@ -948,7 +1039,8 @@ var TursoVectorStore = class {
|
|
|
948
1039
|
const queryJson = JSON.stringify(queryVector);
|
|
949
1040
|
const rs = await this.client.execute({
|
|
950
1041
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
951
|
-
c.section_title, c.heading_path, c.snippet, c.content_hash,
|
|
1042
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
1043
|
+
c.ordinal, c.content_hash,
|
|
952
1044
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
953
1045
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
954
1046
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
@@ -992,6 +1084,8 @@ var TursoVectorStore = class {
|
|
|
992
1084
|
sectionTitle: row.section_title,
|
|
993
1085
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
994
1086
|
snippet: row.snippet,
|
|
1087
|
+
chunkText: row.chunk_text || "",
|
|
1088
|
+
ordinal: row.ordinal || 0,
|
|
995
1089
|
contentHash: row.content_hash,
|
|
996
1090
|
modelId: row.model_id,
|
|
997
1091
|
depth: row.depth,
|
|
@@ -1187,10 +1281,10 @@ var TursoVectorStore = class {
|
|
|
1187
1281
|
// src/vector/factory.ts
|
|
1188
1282
|
async function createVectorStore(config, cwd) {
|
|
1189
1283
|
const turso = config.vector.turso;
|
|
1190
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
1284
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
1191
1285
|
if (remoteUrl) {
|
|
1192
1286
|
const { createClient: createClient2 } = await import("@libsql/client/http");
|
|
1193
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
1287
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
1194
1288
|
const client2 = createClient2({
|
|
1195
1289
|
url: remoteUrl,
|
|
1196
1290
|
authToken
|
|
@@ -1200,6 +1294,12 @@ async function createVectorStore(config, cwd) {
|
|
|
1200
1294
|
dimension: config.vector.dimension
|
|
1201
1295
|
});
|
|
1202
1296
|
}
|
|
1297
|
+
if (isServerless()) {
|
|
1298
|
+
throw new SearchSocketError(
|
|
1299
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
1300
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
1301
|
+
);
|
|
1302
|
+
}
|
|
1203
1303
|
const { createClient } = await import("@libsql/client");
|
|
1204
1304
|
const localPath = path3.resolve(cwd, turso.localPath);
|
|
1205
1305
|
fs3.mkdirSync(path3.dirname(localPath), { recursive: true });
|
|
@@ -1827,6 +1927,7 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
1827
1927
|
}
|
|
1828
1928
|
|
|
1829
1929
|
// src/indexing/sources/build/index.ts
|
|
1930
|
+
import { load as cheerioLoad } from "cheerio";
|
|
1830
1931
|
import pLimit2 from "p-limit";
|
|
1831
1932
|
|
|
1832
1933
|
// src/indexing/sources/build/manifest-parser.ts
|
|
@@ -2003,11 +2104,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
2003
2104
|
|
|
2004
2105
|
// src/indexing/sources/build/index.ts
|
|
2005
2106
|
var logger = new Logger();
|
|
2107
|
+
/**
 * Collect the same-origin page paths linked from an HTML document.
 *
 * @param {string} html - Markup to scan for `<a href>` elements.
 * @param {string} pageUrl - Path of the page the markup came from; appended
 *   to `baseOrigin` to form the base for resolving relative hrefs.
 * @param {string} baseOrigin - Origin that links must share to be kept.
 * @returns {string[]} De-duplicated paths, each passed through
 *   `normalizeUrlPath`, in order of first appearance.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
  const $ = cheerioLoad(html);
  const ignoredPrefixes = ["#", "mailto:", "tel:", "javascript:"];
  const resolutionBase = `${baseOrigin}${pageUrl}`;
  const uniquePaths = /* @__PURE__ */ new Set();
  $("a[href]").each((_i, el) => {
    const href = $(el).attr("href");
    if (!href) {
      return;
    }
    if (ignoredPrefixes.some((prefix) => href.startsWith(prefix))) {
      return;
    }
    try {
      const target = new URL(href, resolutionBase);
      const sameOrigin = target.origin === baseOrigin;
      const isHttp = target.protocol === "http:" || target.protocol === "https:";
      if (sameOrigin && isHttp) {
        uniquePaths.add(normalizeUrlPath(target.pathname));
      }
    } catch {
      // Hrefs that cannot be resolved are ignored.
    }
  });
  return [...uniquePaths];
}
|
|
2125
|
+
/**
 * Crawl the preview server starting from the configured seed URLs and return
 * the HTML pages that were fetched successfully.
 *
 * URLs are taken from a FIFO queue in batches no larger than the number of
 * pages still allowed, and fetched with at most 8 requests in flight. Links
 * are only followed from pages whose depth is below `maxDepth`. Excluded
 * URLs (per `isExcluded`) are never queued.
 *
 * @param {{ baseUrl: string }} server - Running preview server; only
 *   `baseUrl` is read here.
 * @param {object} buildConfig - Build source config; `seedUrls`, `maxDepth`,
 *   `exclude` and `maxPages` are read.
 * @param {number} [pipelineMaxPages] - Optional extra cap. When it is a
 *   number it is floored, clamped to >= 0, and the smaller of it and
 *   `buildConfig.maxPages` is used.
 * @returns {Promise<Array<{ url: string, html: string, sourcePath: string, outgoingLinks: string[] }>>}
 *   One entry per fetched HTML page; `outgoingLinks` is always empty here.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;
  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];
  // `visited` holds every URL that has ever been queued, not only those
  // already fetched, so a URL is never queued twice.
  const visited = /* @__PURE__ */ new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2(8);
  for (const seed of seedUrls) {
    const normalized = normalizeUrlPath(seed);
    if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
      visited.add(normalized);
      queue.push({ url: normalized, depth: 0 });
    }
  }
  while (queue.length > 0 && pages.length < effectiveMax) {
    // Never fetch more than can still be accepted, so `pages` cannot exceed
    // the cap even if every request in the batch succeeds.
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            const links = extractLinksFromHtml(html, item.url, baseOrigin);
            for (const link of links) {
              if (!visited.has(link) && !isExcluded(link, exclude)) {
                visited.add(link);
                queue.push({ url: link, depth: item.depth + 1 });
              }
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            outgoingLinks: []
          };
        })
      )
    );
    results.forEach((result, index) => {
      if (result.status === "rejected") {
        // A thrown fetch or body read would otherwise vanish without a
        // trace; report it the same way as a non-OK response.
        const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
        logger.warn(`Skipping ${batch[index].url}: ${reason}`);
        return;
      }
      if (result.value) {
        pages.push(result.value);
      }
    });
  }
  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
|
|
2006
2196
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
2007
2197
|
const buildConfig = config.source.build;
|
|
2008
2198
|
if (!buildConfig) {
|
|
2009
2199
|
throw new Error("build source config is missing");
|
|
2010
2200
|
}
|
|
2201
|
+
if (buildConfig.discover) {
|
|
2202
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
2203
|
+
try {
|
|
2204
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
2205
|
+
} finally {
|
|
2206
|
+
await server2.shutdown();
|
|
2207
|
+
}
|
|
2208
|
+
}
|
|
2011
2209
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
2012
2210
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
2013
2211
|
logger.event("build_routes_discovered", {
|
|
@@ -2111,11 +2309,11 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2111
2309
|
|
|
2112
2310
|
// src/indexing/sources/crawl.ts
|
|
2113
2311
|
import { gunzipSync } from "zlib";
|
|
2114
|
-
import { load as cheerioLoad } from "cheerio";
|
|
2312
|
+
import { load as cheerioLoad2 } from "cheerio";
|
|
2115
2313
|
import pLimit3 from "p-limit";
|
|
2116
2314
|
var logger2 = new Logger();
|
|
2117
2315
|
function extractLocs(xml) {
|
|
2118
|
-
const $ =
|
|
2316
|
+
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
2119
2317
|
const locs = [];
|
|
2120
2318
|
$("loc").each((_i, el) => {
|
|
2121
2319
|
const text = $(el).text().trim();
|
|
@@ -2126,7 +2324,7 @@ function extractLocs(xml) {
|
|
|
2126
2324
|
return locs;
|
|
2127
2325
|
}
|
|
2128
2326
|
function isSitemapIndex(xml) {
|
|
2129
|
-
const $ =
|
|
2327
|
+
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
2130
2328
|
return $("sitemapindex").length > 0;
|
|
2131
2329
|
}
|
|
2132
2330
|
async function fetchSitemapXml(url) {
|
|
@@ -2264,9 +2462,7 @@ function hrTimeMs(start) {
|
|
|
2264
2462
|
|
|
2265
2463
|
// src/indexing/pipeline.ts
|
|
2266
2464
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
2267
|
-
"
|
|
2268
|
-
"text-embedding-3-large": 13e-5,
|
|
2269
|
-
"text-embedding-ada-002": 1e-4
|
|
2465
|
+
"jina-embeddings-v3": 2e-5
|
|
2270
2466
|
};
|
|
2271
2467
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
2272
2468
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -2312,9 +2508,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2312
2508
|
};
|
|
2313
2509
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
2314
2510
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
2511
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
2512
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
2315
2513
|
if (options.force) {
|
|
2514
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
2316
2515
|
await cleanMirrorForScope(statePath, scope);
|
|
2317
2516
|
}
|
|
2517
|
+
if (options.dryRun) {
|
|
2518
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
2519
|
+
}
|
|
2318
2520
|
const manifestStart = stageStart();
|
|
2319
2521
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
2320
2522
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -2325,8 +2527,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2325
2527
|
);
|
|
2326
2528
|
}
|
|
2327
2529
|
stageEnd("manifest", manifestStart);
|
|
2530
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
2328
2531
|
const sourceStart = stageStart();
|
|
2329
|
-
|
|
2532
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
2330
2533
|
let sourcePages;
|
|
2331
2534
|
if (sourceMode === "static-output") {
|
|
2332
2535
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -2338,10 +2541,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2338
2541
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
2339
2542
|
}
|
|
2340
2543
|
stageEnd("source", sourceStart);
|
|
2544
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
2341
2545
|
const routeStart = stageStart();
|
|
2342
2546
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2343
2547
|
stageEnd("route_map", routeStart);
|
|
2548
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
2344
2549
|
const extractStart = stageStart();
|
|
2550
|
+
this.logger.info("Extracting content...");
|
|
2345
2551
|
const extractedPages = [];
|
|
2346
2552
|
for (const sourcePage of sourcePages) {
|
|
2347
2553
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -2370,6 +2576,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2370
2576
|
uniquePages.push(page);
|
|
2371
2577
|
}
|
|
2372
2578
|
stageEnd("extract", extractStart);
|
|
2579
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
2580
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
2373
2581
|
const linkStart = stageStart();
|
|
2374
2582
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
2375
2583
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -2385,7 +2593,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2385
2593
|
}
|
|
2386
2594
|
}
|
|
2387
2595
|
stageEnd("links", linkStart);
|
|
2596
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
2388
2597
|
const mirrorStart = stageStart();
|
|
2598
|
+
this.logger.info("Writing mirror pages...");
|
|
2389
2599
|
const mirrorPages = [];
|
|
2390
2600
|
let routeExact = 0;
|
|
2391
2601
|
let routeBestEffort = 0;
|
|
@@ -2455,7 +2665,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2455
2665
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
2456
2666
|
}
|
|
2457
2667
|
stageEnd("mirror", mirrorStart);
|
|
2668
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
2458
2669
|
const chunkStart = stageStart();
|
|
2670
|
+
this.logger.info("Chunking pages...");
|
|
2459
2671
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
2460
2672
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
2461
2673
|
if (typeof maxChunks === "number") {
|
|
@@ -2468,6 +2680,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2468
2680
|
});
|
|
2469
2681
|
}
|
|
2470
2682
|
stageEnd("chunk", chunkStart);
|
|
2683
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
2471
2684
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
2472
2685
|
for (const chunk of chunks) {
|
|
2473
2686
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -2486,6 +2699,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2486
2699
|
return existingHash !== chunk.contentHash;
|
|
2487
2700
|
});
|
|
2488
2701
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
2702
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
2489
2703
|
const embedStart = stageStart();
|
|
2490
2704
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
2491
2705
|
for (const chunk of changedChunks) {
|
|
@@ -2500,9 +2714,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2500
2714
|
let newEmbeddings = 0;
|
|
2501
2715
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
2502
2716
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
2717
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
2503
2718
|
const embeddings = await this.embeddings.embedTexts(
|
|
2504
2719
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
2505
|
-
this.config.embeddings.model
|
|
2720
|
+
this.config.embeddings.model,
|
|
2721
|
+
"retrieval.passage"
|
|
2506
2722
|
);
|
|
2507
2723
|
if (embeddings.length !== changedChunks.length) {
|
|
2508
2724
|
throw new SearchSocketError(
|
|
@@ -2525,8 +2741,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2525
2741
|
}
|
|
2526
2742
|
}
|
|
2527
2743
|
stageEnd("embedding", embedStart);
|
|
2744
|
+
if (changedChunks.length > 0) {
|
|
2745
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
2746
|
+
} else {
|
|
2747
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
2748
|
+
}
|
|
2528
2749
|
const syncStart = stageStart();
|
|
2529
2750
|
if (!options.dryRun) {
|
|
2751
|
+
this.logger.info("Syncing vectors...");
|
|
2530
2752
|
const upserts = [];
|
|
2531
2753
|
for (const chunk of changedChunks) {
|
|
2532
2754
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -2545,6 +2767,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2545
2767
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
2546
2768
|
headingPath: chunk.headingPath,
|
|
2547
2769
|
snippet: chunk.snippet,
|
|
2770
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
2771
|
+
ordinal: chunk.ordinal,
|
|
2548
2772
|
contentHash: chunk.contentHash,
|
|
2549
2773
|
modelId: this.config.embeddings.model,
|
|
2550
2774
|
depth: chunk.depth,
|
|
@@ -2564,6 +2788,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2564
2788
|
}
|
|
2565
2789
|
}
|
|
2566
2790
|
stageEnd("sync", syncStart);
|
|
2791
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
2567
2792
|
const finalizeStart = stageStart();
|
|
2568
2793
|
if (!options.dryRun) {
|
|
2569
2794
|
const scopeInfo = {
|
|
@@ -2583,6 +2808,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2583
2808
|
});
|
|
2584
2809
|
}
|
|
2585
2810
|
stageEnd("finalize", finalizeStart);
|
|
2811
|
+
this.logger.info("Done.");
|
|
2586
2812
|
return {
|
|
2587
2813
|
pagesProcessed: mirrorPages.length,
|
|
2588
2814
|
chunksTotal: chunks.length,
|
|
@@ -2692,20 +2918,17 @@ var JinaReranker = class {
|
|
|
2692
2918
|
|
|
2693
2919
|
// src/rerank/factory.ts
|
|
2694
2920
|
function createReranker(config) {
|
|
2695
|
-
if (config.rerank.
|
|
2921
|
+
if (!config.rerank.enabled) {
|
|
2696
2922
|
return null;
|
|
2697
2923
|
}
|
|
2698
|
-
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
return null;
|
|
2702
|
-
}
|
|
2703
|
-
return new JinaReranker({
|
|
2704
|
-
apiKey,
|
|
2705
|
-
model: config.rerank.jina.model
|
|
2706
|
-
});
|
|
2924
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
2925
|
+
if (!apiKey) {
|
|
2926
|
+
return null;
|
|
2707
2927
|
}
|
|
2708
|
-
return
|
|
2928
|
+
return new JinaReranker({
|
|
2929
|
+
apiKey,
|
|
2930
|
+
model: config.rerank.model
|
|
2931
|
+
});
|
|
2709
2932
|
}
|
|
2710
2933
|
|
|
2711
2934
|
// src/search/ranking.ts
|
|
@@ -2853,7 +3076,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
2853
3076
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
2854
3077
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
2855
3078
|
const embedStart = process.hrtime.bigint();
|
|
2856
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
3079
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
2857
3080
|
const queryVector = queryEmbeddings[0];
|
|
2858
3081
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
2859
3082
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -2881,13 +3104,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
2881
3104
|
usedRerank = true;
|
|
2882
3105
|
}
|
|
2883
3106
|
let results;
|
|
3107
|
+
const minScore = this.config.ranking.minScore;
|
|
2884
3108
|
if (groupByPage) {
|
|
2885
|
-
|
|
3109
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
3110
|
+
if (minScore > 0) {
|
|
3111
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3112
|
+
}
|
|
2886
3113
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
2887
3114
|
results = pages.slice(0, topK).map((page) => {
|
|
2888
3115
|
const bestScore = page.bestChunk.finalScore;
|
|
2889
|
-
const
|
|
2890
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
3116
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
3117
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
2891
3118
|
return {
|
|
2892
3119
|
url: page.url,
|
|
2893
3120
|
title: page.title,
|
|
@@ -2904,6 +3131,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
2904
3131
|
};
|
|
2905
3132
|
});
|
|
2906
3133
|
} else {
|
|
3134
|
+
if (minScore > 0) {
|
|
3135
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3136
|
+
}
|
|
2907
3137
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
2908
3138
|
url: hit.metadata.url,
|
|
2909
3139
|
title: hit.metadata.title,
|
|
@@ -2975,43 +3205,54 @@ var SearchEngine = class _SearchEngine {
|
|
|
2975
3205
|
}
|
|
2976
3206
|
}
|
|
2977
3207
|
async rerankHits(query, ranked, topK) {
|
|
2978
|
-
if (this.config.rerank.
|
|
3208
|
+
if (!this.config.rerank.enabled) {
|
|
2979
3209
|
throw new SearchSocketError(
|
|
2980
3210
|
"INVALID_REQUEST",
|
|
2981
|
-
"rerank=true requested but rerank.
|
|
3211
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
2982
3212
|
400
|
|
2983
3213
|
);
|
|
2984
3214
|
}
|
|
2985
3215
|
if (!this.reranker) {
|
|
2986
3216
|
throw new SearchSocketError(
|
|
2987
3217
|
"CONFIG_MISSING",
|
|
2988
|
-
`rerank=true requested but ${this.config.
|
|
3218
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
2989
3219
|
400
|
|
2990
3220
|
);
|
|
2991
3221
|
}
|
|
2992
|
-
const
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
3222
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
3223
|
+
for (const entry of ranked) {
|
|
3224
|
+
const url = entry.hit.metadata.url;
|
|
3225
|
+
const group = pageGroups.get(url);
|
|
3226
|
+
if (group) group.push(entry);
|
|
3227
|
+
else pageGroups.set(url, [entry]);
|
|
3228
|
+
}
|
|
3229
|
+
const pageCandidates = [];
|
|
3230
|
+
for (const [url, chunks] of pageGroups) {
|
|
3231
|
+
const sorted = [...chunks].sort(
|
|
3232
|
+
(a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
|
|
3233
|
+
);
|
|
3234
|
+
const title = sorted[0].hit.metadata.title;
|
|
3235
|
+
const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3236
|
+
pageCandidates.push({ id: url, text: `${title}
|
|
3237
|
+
|
|
3238
|
+
${body}` });
|
|
3239
|
+
}
|
|
2996
3240
|
const reranked = await this.reranker.rerank(
|
|
2997
3241
|
query,
|
|
2998
|
-
|
|
3242
|
+
pageCandidates,
|
|
2999
3243
|
Math.max(topK, this.config.rerank.topN)
|
|
3000
3244
|
);
|
|
3001
|
-
const
|
|
3245
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3002
3246
|
return ranked.map((entry) => {
|
|
3003
|
-
const
|
|
3004
|
-
const
|
|
3005
|
-
if (
|
|
3006
|
-
return {
|
|
3007
|
-
...entry,
|
|
3008
|
-
finalScore: safeBaseScore
|
|
3009
|
-
};
|
|
3247
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
3248
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
3249
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
3250
|
+
return { ...entry, finalScore: base };
|
|
3010
3251
|
}
|
|
3011
|
-
const
|
|
3252
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
3012
3253
|
return {
|
|
3013
3254
|
...entry,
|
|
3014
|
-
finalScore: Number.isFinite(
|
|
3255
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
3015
3256
|
};
|
|
3016
3257
|
}).sort((a, b) => {
|
|
3017
3258
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -3331,6 +3572,7 @@ function getRootOptions(command) {
|
|
|
3331
3572
|
}
|
|
3332
3573
|
async function runIndexCommand(opts) {
|
|
3333
3574
|
const logger3 = new Logger({
|
|
3575
|
+
quiet: opts.quiet,
|
|
3334
3576
|
verbose: opts.verbose,
|
|
3335
3577
|
json: opts.json
|
|
3336
3578
|
});
|
|
@@ -3354,7 +3596,9 @@ async function runIndexCommand(opts) {
|
|
|
3354
3596
|
`);
|
|
3355
3597
|
return;
|
|
3356
3598
|
}
|
|
3357
|
-
|
|
3599
|
+
if (!opts.quiet) {
|
|
3600
|
+
printIndexSummary(stats);
|
|
3601
|
+
}
|
|
3358
3602
|
}
|
|
3359
3603
|
var program = new Command();
|
|
3360
3604
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
@@ -3378,7 +3622,7 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3378
3622
|
process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
|
|
3379
3623
|
process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
|
|
3380
3624
|
});
|
|
3381
|
-
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3625
|
+
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3382
3626
|
const rootOpts = getRootOptions(command);
|
|
3383
3627
|
const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3384
3628
|
await runIndexCommand({
|
|
@@ -3391,6 +3635,7 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3391
3635
|
source: opts.source,
|
|
3392
3636
|
maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
|
|
3393
3637
|
maxChunks: opts.maxChunks ? parsePositiveInt(opts.maxChunks, "--max-chunks") : void 0,
|
|
3638
|
+
quiet: opts.quiet,
|
|
3394
3639
|
verbose: opts.verbose,
|
|
3395
3640
|
json: opts.json
|
|
3396
3641
|
});
|
|
@@ -3553,8 +3798,8 @@ program.command("clean").description("Delete local state and optionally delete r
|
|
|
3553
3798
|
`);
|
|
3554
3799
|
if (opts.remote) {
|
|
3555
3800
|
const vectorStore = await createVectorStore(config, cwd);
|
|
3556
|
-
await vectorStore.
|
|
3557
|
-
process.stdout.write(`
|
|
3801
|
+
await vectorStore.dropAllTables();
|
|
3802
|
+
process.stdout.write(`dropped all remote tables (chunks, registry, pages)
|
|
3558
3803
|
`);
|
|
3559
3804
|
}
|
|
3560
3805
|
});
|
|
@@ -3679,14 +3924,6 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3679
3924
|
details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
|
|
3680
3925
|
});
|
|
3681
3926
|
}
|
|
3682
|
-
if (config.rerank.provider === "jina") {
|
|
3683
|
-
const jinaKey = process.env[config.rerank.jina.apiKeyEnv];
|
|
3684
|
-
checks.push({
|
|
3685
|
-
name: `env ${config.rerank.jina.apiKeyEnv}`,
|
|
3686
|
-
ok: Boolean(jinaKey),
|
|
3687
|
-
details: jinaKey ? void 0 : "missing"
|
|
3688
|
-
});
|
|
3689
|
-
}
|
|
3690
3927
|
if (config.source.mode === "static-output") {
|
|
3691
3928
|
const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
|
|
3692
3929
|
const exists = fs9.existsSync(outputDir);
|