searchsocket 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +346 -110
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +1 -2
package/dist/cli.js
CHANGED
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.
|
|
15
|
+
version: "0.3.0",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -82,7 +82,6 @@ var package_default = {
|
|
|
82
82
|
"fast-glob": "^3.3.3",
|
|
83
83
|
"gray-matter": "^4.0.3",
|
|
84
84
|
jiti: "^2.6.1",
|
|
85
|
-
openai: "^6.19.0",
|
|
86
85
|
"p-limit": "^7.3.0",
|
|
87
86
|
turndown: "^7.2.2",
|
|
88
87
|
"turndown-plugin-gfm": "^1.0.2",
|
|
@@ -133,7 +132,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
133
132
|
outputDir: z.string().min(1).optional(),
|
|
134
133
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
135
134
|
exclude: z.array(z.string()).optional(),
|
|
136
|
-
previewTimeout: z.number().int().positive().optional()
|
|
135
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
136
|
+
discover: z.boolean().optional(),
|
|
137
|
+
seedUrls: z.array(z.string()).optional(),
|
|
138
|
+
maxPages: z.number().int().positive().optional(),
|
|
139
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
137
140
|
}).optional()
|
|
138
141
|
}).optional(),
|
|
139
142
|
extract: z.object({
|
|
@@ -160,8 +163,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
160
163
|
pageSummaryChunk: z.boolean().optional()
|
|
161
164
|
}).optional(),
|
|
162
165
|
embeddings: z.object({
|
|
163
|
-
provider: z.literal("
|
|
166
|
+
provider: z.literal("jina").optional(),
|
|
164
167
|
model: z.string().min(1).optional(),
|
|
168
|
+
apiKey: z.string().min(1).optional(),
|
|
165
169
|
apiKeyEnv: z.string().min(1).optional(),
|
|
166
170
|
batchSize: z.number().int().positive().optional(),
|
|
167
171
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -170,18 +174,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
170
174
|
vector: z.object({
|
|
171
175
|
dimension: z.number().int().positive().optional(),
|
|
172
176
|
turso: z.object({
|
|
177
|
+
url: z.string().url().optional(),
|
|
178
|
+
authToken: z.string().min(1).optional(),
|
|
173
179
|
urlEnv: z.string().optional(),
|
|
174
180
|
authTokenEnv: z.string().optional(),
|
|
175
181
|
localPath: z.string().optional()
|
|
176
182
|
}).optional()
|
|
177
183
|
}).optional(),
|
|
178
184
|
rerank: z.object({
|
|
179
|
-
|
|
185
|
+
enabled: z.boolean().optional(),
|
|
180
186
|
topN: z.number().int().positive().optional(),
|
|
181
|
-
|
|
182
|
-
apiKeyEnv: z.string().optional(),
|
|
183
|
-
model: z.string().optional()
|
|
184
|
-
}).optional()
|
|
187
|
+
model: z.string().optional()
|
|
185
188
|
}).optional(),
|
|
186
189
|
ranking: z.object({
|
|
187
190
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -190,6 +193,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
190
193
|
aggregationCap: z.number().int().positive().optional(),
|
|
191
194
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
192
195
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
196
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
193
197
|
weights: z.object({
|
|
194
198
|
incomingLinks: z.number().optional(),
|
|
195
199
|
depth: z.number().optional(),
|
|
@@ -270,9 +274,9 @@ function createDefaultConfig(projectId) {
|
|
|
270
274
|
pageSummaryChunk: true
|
|
271
275
|
},
|
|
272
276
|
embeddings: {
|
|
273
|
-
provider: "
|
|
274
|
-
model: "
|
|
275
|
-
apiKeyEnv: "
|
|
277
|
+
provider: "jina",
|
|
278
|
+
model: "jina-embeddings-v3",
|
|
279
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
276
280
|
batchSize: 64,
|
|
277
281
|
concurrency: 4
|
|
278
282
|
},
|
|
@@ -284,12 +288,9 @@ function createDefaultConfig(projectId) {
|
|
|
284
288
|
}
|
|
285
289
|
},
|
|
286
290
|
rerank: {
|
|
287
|
-
|
|
291
|
+
enabled: false,
|
|
288
292
|
topN: 20,
|
|
289
|
-
|
|
290
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
291
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
292
|
-
}
|
|
293
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
293
294
|
},
|
|
294
295
|
ranking: {
|
|
295
296
|
enableIncomingLinkBoost: true,
|
|
@@ -298,6 +299,7 @@ function createDefaultConfig(projectId) {
|
|
|
298
299
|
aggregationCap: 5,
|
|
299
300
|
aggregationDecay: 0.5,
|
|
300
301
|
minChunkScoreRatio: 0.5,
|
|
302
|
+
minScore: 0,
|
|
301
303
|
weights: {
|
|
302
304
|
incomingLinks: 0.05,
|
|
303
305
|
depth: 0.03,
|
|
@@ -408,7 +410,11 @@ ${issues}`
|
|
|
408
410
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
409
411
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
410
412
|
exclude: parsed.source.build.exclude ?? [],
|
|
411
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
413
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
414
|
+
discover: parsed.source.build.discover ?? false,
|
|
415
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
416
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
417
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
412
418
|
} : void 0
|
|
413
419
|
},
|
|
414
420
|
extract: {
|
|
@@ -437,11 +443,7 @@ ${issues}`
|
|
|
437
443
|
},
|
|
438
444
|
rerank: {
|
|
439
445
|
...defaults.rerank,
|
|
440
|
-
...parsed.rerank
|
|
441
|
-
jina: {
|
|
442
|
-
...defaults.rerank.jina,
|
|
443
|
-
...parsed.rerank?.jina
|
|
444
|
-
}
|
|
446
|
+
...parsed.rerank
|
|
445
447
|
},
|
|
446
448
|
ranking: {
|
|
447
449
|
...defaults.ranking,
|
|
@@ -488,7 +490,11 @@ ${issues}`
|
|
|
488
490
|
outputDir: ".svelte-kit/output",
|
|
489
491
|
paramValues: {},
|
|
490
492
|
exclude: [],
|
|
491
|
-
previewTimeout: 3e4
|
|
493
|
+
previewTimeout: 3e4,
|
|
494
|
+
discover: false,
|
|
495
|
+
seedUrls: ["/"],
|
|
496
|
+
maxPages: 200,
|
|
497
|
+
maxDepth: 10
|
|
492
498
|
};
|
|
493
499
|
}
|
|
494
500
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -529,7 +535,7 @@ function writeMinimalConfig(cwd) {
|
|
|
529
535
|
return target;
|
|
530
536
|
}
|
|
531
537
|
const content = `export default {
|
|
532
|
-
embeddings: { apiKeyEnv: "
|
|
538
|
+
embeddings: { apiKeyEnv: "JINA_API_KEY" }
|
|
533
539
|
};
|
|
534
540
|
`;
|
|
535
541
|
fs.writeFileSync(target, content, "utf8");
|
|
@@ -540,14 +546,16 @@ function writeMinimalConfig(cwd) {
|
|
|
540
546
|
var Logger = class {
|
|
541
547
|
json;
|
|
542
548
|
verbose;
|
|
549
|
+
quiet;
|
|
543
550
|
stderrOnly;
|
|
544
551
|
constructor(opts = {}) {
|
|
545
552
|
this.json = opts.json ?? false;
|
|
546
553
|
this.verbose = opts.verbose ?? false;
|
|
554
|
+
this.quiet = opts.quiet ?? false;
|
|
547
555
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
548
556
|
}
|
|
549
557
|
info(message) {
|
|
550
|
-
if (this.json) {
|
|
558
|
+
if (this.quiet || this.json) {
|
|
551
559
|
return;
|
|
552
560
|
}
|
|
553
561
|
this.writeOut(`${message}
|
|
@@ -561,7 +569,7 @@ var Logger = class {
|
|
|
561
569
|
this.logJson("debug", { message });
|
|
562
570
|
return;
|
|
563
571
|
}
|
|
564
|
-
this.writeOut(
|
|
572
|
+
this.writeOut(` ${message}
|
|
565
573
|
`);
|
|
566
574
|
}
|
|
567
575
|
warn(message) {
|
|
@@ -588,7 +596,7 @@ var Logger = class {
|
|
|
588
596
|
this.logJson(event, data);
|
|
589
597
|
return;
|
|
590
598
|
}
|
|
591
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
599
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
592
600
|
`);
|
|
593
601
|
}
|
|
594
602
|
writeOut(text) {
|
|
@@ -695,18 +703,18 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
695
703
|
return { statePath, pagesPath };
|
|
696
704
|
}
|
|
697
705
|
|
|
698
|
-
// src/embeddings/
|
|
699
|
-
import OpenAI from "openai";
|
|
706
|
+
// src/embeddings/jina.ts
|
|
700
707
|
import pLimit from "p-limit";
|
|
701
708
|
function sleep(ms) {
|
|
702
709
|
return new Promise((resolve) => {
|
|
703
710
|
setTimeout(resolve, ms);
|
|
704
711
|
});
|
|
705
712
|
}
|
|
706
|
-
var
|
|
707
|
-
|
|
713
|
+
var JinaEmbeddingsProvider = class {
|
|
714
|
+
apiKey;
|
|
708
715
|
batchSize;
|
|
709
716
|
concurrency;
|
|
717
|
+
defaultTask;
|
|
710
718
|
constructor(options) {
|
|
711
719
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
712
720
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -714,11 +722,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
714
722
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
715
723
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
716
724
|
}
|
|
717
|
-
this.
|
|
718
|
-
apiKey: options.apiKey
|
|
719
|
-
});
|
|
725
|
+
this.apiKey = options.apiKey;
|
|
720
726
|
this.batchSize = options.batchSize;
|
|
721
727
|
this.concurrency = options.concurrency;
|
|
728
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
722
729
|
}
|
|
723
730
|
estimateTokens(text) {
|
|
724
731
|
const normalized = text.trim();
|
|
@@ -732,7 +739,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
732
739
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
733
740
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
734
741
|
}
|
|
735
|
-
async embedTexts(texts, modelId) {
|
|
742
|
+
async embedTexts(texts, modelId, task) {
|
|
736
743
|
if (texts.length === 0) {
|
|
737
744
|
return [];
|
|
738
745
|
}
|
|
@@ -748,33 +755,52 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
748
755
|
await Promise.all(
|
|
749
756
|
batches.map(
|
|
750
757
|
(batch, position) => limit(async () => {
|
|
751
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
758
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
752
759
|
})
|
|
753
760
|
)
|
|
754
761
|
);
|
|
755
762
|
return outputs.flat();
|
|
756
763
|
}
|
|
757
|
-
async embedWithRetry(texts, modelId) {
|
|
764
|
+
async embedWithRetry(texts, modelId, task) {
|
|
758
765
|
const maxAttempts = 5;
|
|
759
766
|
let attempt = 0;
|
|
760
767
|
while (attempt < maxAttempts) {
|
|
761
768
|
attempt += 1;
|
|
769
|
+
let response;
|
|
762
770
|
try {
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
771
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
772
|
+
method: "POST",
|
|
773
|
+
headers: {
|
|
774
|
+
"content-type": "application/json",
|
|
775
|
+
authorization: `Bearer ${this.apiKey}`
|
|
776
|
+
},
|
|
777
|
+
body: JSON.stringify({
|
|
778
|
+
model: modelId,
|
|
779
|
+
input: texts,
|
|
780
|
+
task
|
|
781
|
+
})
|
|
767
782
|
});
|
|
768
|
-
return response.data.map((entry) => entry.embedding);
|
|
769
783
|
} catch (error) {
|
|
770
|
-
|
|
771
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
772
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
784
|
+
if (attempt >= maxAttempts) {
|
|
773
785
|
throw error;
|
|
774
786
|
}
|
|
775
|
-
|
|
776
|
-
|
|
787
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
788
|
+
continue;
|
|
777
789
|
}
|
|
790
|
+
if (!response.ok) {
|
|
791
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
792
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
793
|
+
const errorBody = await response.text();
|
|
794
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
795
|
+
}
|
|
796
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
797
|
+
continue;
|
|
798
|
+
}
|
|
799
|
+
const payload = await response.json();
|
|
800
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
801
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
802
|
+
}
|
|
803
|
+
return payload.data.map((entry) => entry.embedding);
|
|
778
804
|
}
|
|
779
805
|
throw new Error("Unreachable retry state");
|
|
780
806
|
}
|
|
@@ -782,20 +808,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
782
808
|
|
|
783
809
|
// src/embeddings/factory.ts
|
|
784
810
|
function createEmbeddingsProvider(config) {
|
|
785
|
-
if (config.embeddings.provider !== "
|
|
811
|
+
if (config.embeddings.provider !== "jina") {
|
|
786
812
|
throw new SearchSocketError(
|
|
787
813
|
"CONFIG_MISSING",
|
|
788
814
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
789
815
|
);
|
|
790
816
|
}
|
|
791
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
817
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
792
818
|
if (!apiKey) {
|
|
793
819
|
throw new SearchSocketError(
|
|
794
820
|
"CONFIG_MISSING",
|
|
795
|
-
`Missing embeddings API key env var
|
|
821
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
796
822
|
);
|
|
797
823
|
}
|
|
798
|
-
return new
|
|
824
|
+
return new JinaEmbeddingsProvider({
|
|
799
825
|
apiKey,
|
|
800
826
|
batchSize: config.embeddings.batchSize,
|
|
801
827
|
concurrency: config.embeddings.concurrency
|
|
@@ -809,6 +835,11 @@ import path11 from "path";
|
|
|
809
835
|
import fs3 from "fs";
|
|
810
836
|
import path3 from "path";
|
|
811
837
|
|
|
838
|
+
// src/core/serverless.ts
|
|
839
|
+
function isServerless() {
|
|
840
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
841
|
+
}
|
|
842
|
+
|
|
812
843
|
// src/vector/turso.ts
|
|
813
844
|
var TursoVectorStore = class {
|
|
814
845
|
client;
|
|
@@ -853,6 +884,16 @@ var TursoVectorStore = class {
|
|
|
853
884
|
}
|
|
854
885
|
async ensureChunks(dim) {
|
|
855
886
|
if (this.chunksReady) return;
|
|
887
|
+
const exists = await this.chunksTableExists();
|
|
888
|
+
if (exists) {
|
|
889
|
+
const currentDim = await this.getChunksDimension();
|
|
890
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
891
|
+
await this.client.batch([
|
|
892
|
+
"DROP INDEX IF EXISTS idx",
|
|
893
|
+
"DROP TABLE IF EXISTS chunks"
|
|
894
|
+
]);
|
|
895
|
+
}
|
|
896
|
+
}
|
|
856
897
|
await this.client.batch([
|
|
857
898
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
858
899
|
id TEXT PRIMARY KEY,
|
|
@@ -864,6 +905,8 @@ var TursoVectorStore = class {
|
|
|
864
905
|
section_title TEXT NOT NULL DEFAULT '',
|
|
865
906
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
866
907
|
snippet TEXT NOT NULL DEFAULT '',
|
|
908
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
909
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
867
910
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
868
911
|
model_id TEXT NOT NULL DEFAULT '',
|
|
869
912
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
@@ -874,6 +917,19 @@ var TursoVectorStore = class {
|
|
|
874
917
|
)`,
|
|
875
918
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
876
919
|
]);
|
|
920
|
+
const chunkMigrationCols = [
|
|
921
|
+
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
922
|
+
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
923
|
+
];
|
|
924
|
+
for (const col of chunkMigrationCols) {
|
|
925
|
+
try {
|
|
926
|
+
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
927
|
+
} catch (error) {
|
|
928
|
+
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
929
|
+
throw error;
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
}
|
|
877
933
|
this.chunksReady = true;
|
|
878
934
|
}
|
|
879
935
|
async ensurePages() {
|
|
@@ -908,6 +964,38 @@ var TursoVectorStore = class {
|
|
|
908
964
|
throw error;
|
|
909
965
|
}
|
|
910
966
|
}
|
|
967
|
+
/**
|
|
968
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
969
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
970
|
+
*/
|
|
971
|
+
async getChunksDimension() {
|
|
972
|
+
try {
|
|
973
|
+
const rs = await this.client.execute(
|
|
974
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
975
|
+
);
|
|
976
|
+
if (rs.rows.length === 0) return null;
|
|
977
|
+
const sql = rs.rows[0].sql;
|
|
978
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
979
|
+
return match ? parseInt(match[1], 10) : null;
|
|
980
|
+
} catch {
|
|
981
|
+
return null;
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
/**
|
|
985
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
986
|
+
* Used by `clean --remote` for a full reset.
|
|
987
|
+
*/
|
|
988
|
+
async dropAllTables() {
|
|
989
|
+
await this.client.batch([
|
|
990
|
+
"DROP INDEX IF EXISTS idx",
|
|
991
|
+
"DROP TABLE IF EXISTS chunks",
|
|
992
|
+
"DROP TABLE IF EXISTS registry",
|
|
993
|
+
"DROP TABLE IF EXISTS pages"
|
|
994
|
+
]);
|
|
995
|
+
this.chunksReady = false;
|
|
996
|
+
this.registryReady = false;
|
|
997
|
+
this.pagesReady = false;
|
|
998
|
+
}
|
|
911
999
|
async upsert(records, _scope) {
|
|
912
1000
|
if (records.length === 0) return;
|
|
913
1001
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -918,9 +1006,9 @@ var TursoVectorStore = class {
|
|
|
918
1006
|
const stmts = batch.map((r) => ({
|
|
919
1007
|
sql: `INSERT OR REPLACE INTO chunks
|
|
920
1008
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
921
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
1009
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
922
1010
|
incoming_links, route_file, tags, embedding)
|
|
923
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
1011
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
924
1012
|
args: [
|
|
925
1013
|
r.id,
|
|
926
1014
|
r.metadata.projectId,
|
|
@@ -931,6 +1019,8 @@ var TursoVectorStore = class {
|
|
|
931
1019
|
r.metadata.sectionTitle,
|
|
932
1020
|
JSON.stringify(r.metadata.headingPath),
|
|
933
1021
|
r.metadata.snippet,
|
|
1022
|
+
r.metadata.chunkText,
|
|
1023
|
+
r.metadata.ordinal,
|
|
934
1024
|
r.metadata.contentHash,
|
|
935
1025
|
r.metadata.modelId,
|
|
936
1026
|
r.metadata.depth,
|
|
@@ -949,7 +1039,8 @@ var TursoVectorStore = class {
|
|
|
949
1039
|
const queryJson = JSON.stringify(queryVector);
|
|
950
1040
|
const rs = await this.client.execute({
|
|
951
1041
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
952
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
1042
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
1043
|
+
c.ordinal, c.content_hash,
|
|
953
1044
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
954
1045
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
955
1046
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
@@ -993,6 +1084,8 @@ var TursoVectorStore = class {
|
|
|
993
1084
|
sectionTitle: row.section_title,
|
|
994
1085
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
995
1086
|
snippet: row.snippet,
|
|
1087
|
+
chunkText: row.chunk_text || "",
|
|
1088
|
+
ordinal: row.ordinal || 0,
|
|
996
1089
|
contentHash: row.content_hash,
|
|
997
1090
|
modelId: row.model_id,
|
|
998
1091
|
depth: row.depth,
|
|
@@ -1188,10 +1281,10 @@ var TursoVectorStore = class {
|
|
|
1188
1281
|
// src/vector/factory.ts
|
|
1189
1282
|
async function createVectorStore(config, cwd) {
|
|
1190
1283
|
const turso = config.vector.turso;
|
|
1191
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
1284
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
1192
1285
|
if (remoteUrl) {
|
|
1193
1286
|
const { createClient: createClient2 } = await import("@libsql/client/http");
|
|
1194
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
1287
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
1195
1288
|
const client2 = createClient2({
|
|
1196
1289
|
url: remoteUrl,
|
|
1197
1290
|
authToken
|
|
@@ -1201,6 +1294,12 @@ async function createVectorStore(config, cwd) {
|
|
|
1201
1294
|
dimension: config.vector.dimension
|
|
1202
1295
|
});
|
|
1203
1296
|
}
|
|
1297
|
+
if (isServerless()) {
|
|
1298
|
+
throw new SearchSocketError(
|
|
1299
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
1300
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
1301
|
+
);
|
|
1302
|
+
}
|
|
1204
1303
|
const { createClient } = await import("@libsql/client");
|
|
1205
1304
|
const localPath = path3.resolve(cwd, turso.localPath);
|
|
1206
1305
|
fs3.mkdirSync(path3.dirname(localPath), { recursive: true });
|
|
@@ -1828,6 +1927,7 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
1828
1927
|
}
|
|
1829
1928
|
|
|
1830
1929
|
// src/indexing/sources/build/index.ts
|
|
1930
|
+
import { load as cheerioLoad } from "cheerio";
|
|
1831
1931
|
import pLimit2 from "p-limit";
|
|
1832
1932
|
|
|
1833
1933
|
// src/indexing/sources/build/manifest-parser.ts
|
|
@@ -2004,11 +2104,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
2004
2104
|
|
|
2005
2105
|
// src/indexing/sources/build/index.ts
|
|
2006
2106
|
var logger = new Logger();
|
|
2107
|
+
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
|
|
2108
|
+
const $ = cheerioLoad(html);
|
|
2109
|
+
const links = [];
|
|
2110
|
+
$("a[href]").each((_i, el) => {
|
|
2111
|
+
const href = $(el).attr("href");
|
|
2112
|
+
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
2113
|
+
return;
|
|
2114
|
+
}
|
|
2115
|
+
try {
|
|
2116
|
+
const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
|
|
2117
|
+
if (resolved.origin !== baseOrigin) return;
|
|
2118
|
+
if (!["http:", "https:"].includes(resolved.protocol)) return;
|
|
2119
|
+
links.push(normalizeUrlPath(resolved.pathname));
|
|
2120
|
+
} catch {
|
|
2121
|
+
}
|
|
2122
|
+
});
|
|
2123
|
+
return [...new Set(links)];
|
|
2124
|
+
}
|
|
2125
|
+
async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
2126
|
+
const { seedUrls, maxDepth, exclude } = buildConfig;
|
|
2127
|
+
const baseOrigin = new URL(server.baseUrl).origin;
|
|
2128
|
+
let effectiveMax = buildConfig.maxPages;
|
|
2129
|
+
if (typeof pipelineMaxPages === "number") {
|
|
2130
|
+
const floored = Math.max(0, Math.floor(pipelineMaxPages));
|
|
2131
|
+
effectiveMax = Math.min(effectiveMax, floored);
|
|
2132
|
+
}
|
|
2133
|
+
if (effectiveMax === 0) return [];
|
|
2134
|
+
const visited = /* @__PURE__ */ new Set();
|
|
2135
|
+
const pages = [];
|
|
2136
|
+
const queue = [];
|
|
2137
|
+
const limit = pLimit2(8);
|
|
2138
|
+
for (const seed of seedUrls) {
|
|
2139
|
+
const normalized = normalizeUrlPath(seed);
|
|
2140
|
+
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
2141
|
+
visited.add(normalized);
|
|
2142
|
+
queue.push({ url: normalized, depth: 0 });
|
|
2143
|
+
}
|
|
2144
|
+
}
|
|
2145
|
+
while (queue.length > 0 && pages.length < effectiveMax) {
|
|
2146
|
+
const remaining = effectiveMax - pages.length;
|
|
2147
|
+
const batch = queue.splice(0, remaining);
|
|
2148
|
+
const results = await Promise.allSettled(
|
|
2149
|
+
batch.map(
|
|
2150
|
+
(item) => limit(async () => {
|
|
2151
|
+
const fullUrl = joinUrl(server.baseUrl, item.url);
|
|
2152
|
+
const response = await fetch(fullUrl);
|
|
2153
|
+
if (!response.ok) {
|
|
2154
|
+
logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
|
|
2155
|
+
return null;
|
|
2156
|
+
}
|
|
2157
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
2158
|
+
if (!contentType.includes("text/html")) {
|
|
2159
|
+
return null;
|
|
2160
|
+
}
|
|
2161
|
+
const html = await response.text();
|
|
2162
|
+
if (item.depth < maxDepth) {
|
|
2163
|
+
const links = extractLinksFromHtml(html, item.url, baseOrigin);
|
|
2164
|
+
for (const link of links) {
|
|
2165
|
+
if (!visited.has(link) && !isExcluded(link, exclude)) {
|
|
2166
|
+
visited.add(link);
|
|
2167
|
+
queue.push({ url: link, depth: item.depth + 1 });
|
|
2168
|
+
}
|
|
2169
|
+
}
|
|
2170
|
+
}
|
|
2171
|
+
return {
|
|
2172
|
+
url: item.url,
|
|
2173
|
+
html,
|
|
2174
|
+
sourcePath: fullUrl,
|
|
2175
|
+
outgoingLinks: []
|
|
2176
|
+
};
|
|
2177
|
+
})
|
|
2178
|
+
)
|
|
2179
|
+
);
|
|
2180
|
+
for (const result of results) {
|
|
2181
|
+
if (result.status === "fulfilled" && result.value) {
|
|
2182
|
+
pages.push(result.value);
|
|
2183
|
+
}
|
|
2184
|
+
}
|
|
2185
|
+
}
|
|
2186
|
+
if (pages.length >= effectiveMax && queue.length > 0) {
|
|
2187
|
+
logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
|
|
2188
|
+
}
|
|
2189
|
+
logger.event("build_discover_complete", {
|
|
2190
|
+
pagesFound: pages.length,
|
|
2191
|
+
urlsVisited: visited.size,
|
|
2192
|
+
urlsSkipped: queue.length
|
|
2193
|
+
});
|
|
2194
|
+
return pages;
|
|
2195
|
+
}
|
|
2007
2196
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
2008
2197
|
const buildConfig = config.source.build;
|
|
2009
2198
|
if (!buildConfig) {
|
|
2010
2199
|
throw new Error("build source config is missing");
|
|
2011
2200
|
}
|
|
2201
|
+
if (buildConfig.discover) {
|
|
2202
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
2203
|
+
try {
|
|
2204
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
2205
|
+
} finally {
|
|
2206
|
+
await server2.shutdown();
|
|
2207
|
+
}
|
|
2208
|
+
}
|
|
2012
2209
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
2013
2210
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
2014
2211
|
logger.event("build_routes_discovered", {
|
|
@@ -2112,11 +2309,11 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2112
2309
|
|
|
2113
2310
|
// src/indexing/sources/crawl.ts
|
|
2114
2311
|
import { gunzipSync } from "zlib";
|
|
2115
|
-
import { load as
|
|
2312
|
+
import { load as cheerioLoad2 } from "cheerio";
|
|
2116
2313
|
import pLimit3 from "p-limit";
|
|
2117
2314
|
var logger2 = new Logger();
|
|
2118
2315
|
function extractLocs(xml) {
|
|
2119
|
-
const $ =
|
|
2316
|
+
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
2120
2317
|
const locs = [];
|
|
2121
2318
|
$("loc").each((_i, el) => {
|
|
2122
2319
|
const text = $(el).text().trim();
|
|
@@ -2127,7 +2324,7 @@ function extractLocs(xml) {
|
|
|
2127
2324
|
return locs;
|
|
2128
2325
|
}
|
|
2129
2326
|
function isSitemapIndex(xml) {
|
|
2130
|
-
const $ =
|
|
2327
|
+
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
2131
2328
|
return $("sitemapindex").length > 0;
|
|
2132
2329
|
}
|
|
2133
2330
|
async function fetchSitemapXml(url) {
|
|
@@ -2265,9 +2462,7 @@ function hrTimeMs(start) {
|
|
|
2265
2462
|
|
|
2266
2463
|
// src/indexing/pipeline.ts
|
|
2267
2464
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
2268
|
-
"
|
|
2269
|
-
"text-embedding-3-large": 13e-5,
|
|
2270
|
-
"text-embedding-ada-002": 1e-4
|
|
2465
|
+
"jina-embeddings-v3": 2e-5
|
|
2271
2466
|
};
|
|
2272
2467
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
2273
2468
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -2313,9 +2508,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2313
2508
|
};
|
|
2314
2509
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
2315
2510
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
2511
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
2512
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
2316
2513
|
if (options.force) {
|
|
2514
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
2317
2515
|
await cleanMirrorForScope(statePath, scope);
|
|
2318
2516
|
}
|
|
2517
|
+
if (options.dryRun) {
|
|
2518
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
2519
|
+
}
|
|
2319
2520
|
const manifestStart = stageStart();
|
|
2320
2521
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
2321
2522
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -2326,8 +2527,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2326
2527
|
);
|
|
2327
2528
|
}
|
|
2328
2529
|
stageEnd("manifest", manifestStart);
|
|
2530
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
2329
2531
|
const sourceStart = stageStart();
|
|
2330
|
-
|
|
2532
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
2331
2533
|
let sourcePages;
|
|
2332
2534
|
if (sourceMode === "static-output") {
|
|
2333
2535
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -2339,10 +2541,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2339
2541
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
2340
2542
|
}
|
|
2341
2543
|
stageEnd("source", sourceStart);
|
|
2544
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
2342
2545
|
const routeStart = stageStart();
|
|
2343
2546
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2344
2547
|
stageEnd("route_map", routeStart);
|
|
2548
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
2345
2549
|
const extractStart = stageStart();
|
|
2550
|
+
this.logger.info("Extracting content...");
|
|
2346
2551
|
const extractedPages = [];
|
|
2347
2552
|
for (const sourcePage of sourcePages) {
|
|
2348
2553
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -2371,6 +2576,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2371
2576
|
uniquePages.push(page);
|
|
2372
2577
|
}
|
|
2373
2578
|
stageEnd("extract", extractStart);
|
|
2579
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
2580
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
2374
2581
|
const linkStart = stageStart();
|
|
2375
2582
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
2376
2583
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -2386,7 +2593,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2386
2593
|
}
|
|
2387
2594
|
}
|
|
2388
2595
|
stageEnd("links", linkStart);
|
|
2596
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
2389
2597
|
const mirrorStart = stageStart();
|
|
2598
|
+
this.logger.info("Writing mirror pages...");
|
|
2390
2599
|
const mirrorPages = [];
|
|
2391
2600
|
let routeExact = 0;
|
|
2392
2601
|
let routeBestEffort = 0;
|
|
@@ -2456,7 +2665,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2456
2665
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
2457
2666
|
}
|
|
2458
2667
|
stageEnd("mirror", mirrorStart);
|
|
2668
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
2459
2669
|
const chunkStart = stageStart();
|
|
2670
|
+
this.logger.info("Chunking pages...");
|
|
2460
2671
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
2461
2672
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
2462
2673
|
if (typeof maxChunks === "number") {
|
|
@@ -2469,6 +2680,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2469
2680
|
});
|
|
2470
2681
|
}
|
|
2471
2682
|
stageEnd("chunk", chunkStart);
|
|
2683
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
2472
2684
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
2473
2685
|
for (const chunk of chunks) {
|
|
2474
2686
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -2487,6 +2699,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2487
2699
|
return existingHash !== chunk.contentHash;
|
|
2488
2700
|
});
|
|
2489
2701
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
2702
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
2490
2703
|
const embedStart = stageStart();
|
|
2491
2704
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
2492
2705
|
for (const chunk of changedChunks) {
|
|
@@ -2501,9 +2714,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2501
2714
|
let newEmbeddings = 0;
|
|
2502
2715
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
2503
2716
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
2717
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
2504
2718
|
const embeddings = await this.embeddings.embedTexts(
|
|
2505
2719
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
2506
|
-
this.config.embeddings.model
|
|
2720
|
+
this.config.embeddings.model,
|
|
2721
|
+
"retrieval.passage"
|
|
2507
2722
|
);
|
|
2508
2723
|
if (embeddings.length !== changedChunks.length) {
|
|
2509
2724
|
throw new SearchSocketError(
|
|
@@ -2526,8 +2741,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2526
2741
|
}
|
|
2527
2742
|
}
|
|
2528
2743
|
stageEnd("embedding", embedStart);
|
|
2744
|
+
if (changedChunks.length > 0) {
|
|
2745
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
2746
|
+
} else {
|
|
2747
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
2748
|
+
}
|
|
2529
2749
|
const syncStart = stageStart();
|
|
2530
2750
|
if (!options.dryRun) {
|
|
2751
|
+
this.logger.info("Syncing vectors...");
|
|
2531
2752
|
const upserts = [];
|
|
2532
2753
|
for (const chunk of changedChunks) {
|
|
2533
2754
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -2546,6 +2767,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2546
2767
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
2547
2768
|
headingPath: chunk.headingPath,
|
|
2548
2769
|
snippet: chunk.snippet,
|
|
2770
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
2771
|
+
ordinal: chunk.ordinal,
|
|
2549
2772
|
contentHash: chunk.contentHash,
|
|
2550
2773
|
modelId: this.config.embeddings.model,
|
|
2551
2774
|
depth: chunk.depth,
|
|
@@ -2565,6 +2788,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2565
2788
|
}
|
|
2566
2789
|
}
|
|
2567
2790
|
stageEnd("sync", syncStart);
|
|
2791
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
2568
2792
|
const finalizeStart = stageStart();
|
|
2569
2793
|
if (!options.dryRun) {
|
|
2570
2794
|
const scopeInfo = {
|
|
@@ -2584,6 +2808,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2584
2808
|
});
|
|
2585
2809
|
}
|
|
2586
2810
|
stageEnd("finalize", finalizeStart);
|
|
2811
|
+
this.logger.info("Done.");
|
|
2587
2812
|
return {
|
|
2588
2813
|
pagesProcessed: mirrorPages.length,
|
|
2589
2814
|
chunksTotal: chunks.length,
|
|
@@ -2693,20 +2918,17 @@ var JinaReranker = class {
|
|
|
2693
2918
|
|
|
2694
2919
|
// src/rerank/factory.ts
|
|
2695
2920
|
function createReranker(config) {
|
|
2696
|
-
if (config.rerank.
|
|
2921
|
+
if (!config.rerank.enabled) {
|
|
2697
2922
|
return null;
|
|
2698
2923
|
}
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
return null;
|
|
2703
|
-
}
|
|
2704
|
-
return new JinaReranker({
|
|
2705
|
-
apiKey,
|
|
2706
|
-
model: config.rerank.jina.model
|
|
2707
|
-
});
|
|
2924
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
2925
|
+
if (!apiKey) {
|
|
2926
|
+
return null;
|
|
2708
2927
|
}
|
|
2709
|
-
return
|
|
2928
|
+
return new JinaReranker({
|
|
2929
|
+
apiKey,
|
|
2930
|
+
model: config.rerank.model
|
|
2931
|
+
});
|
|
2710
2932
|
}
|
|
2711
2933
|
|
|
2712
2934
|
// src/search/ranking.ts
|
|
@@ -2854,7 +3076,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
2854
3076
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
2855
3077
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
2856
3078
|
const embedStart = process.hrtime.bigint();
|
|
2857
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
3079
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
2858
3080
|
const queryVector = queryEmbeddings[0];
|
|
2859
3081
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
2860
3082
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -2882,13 +3104,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
2882
3104
|
usedRerank = true;
|
|
2883
3105
|
}
|
|
2884
3106
|
let results;
|
|
3107
|
+
const minScore = this.config.ranking.minScore;
|
|
2885
3108
|
if (groupByPage) {
|
|
2886
|
-
|
|
3109
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
3110
|
+
if (minScore > 0) {
|
|
3111
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3112
|
+
}
|
|
2887
3113
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
2888
3114
|
results = pages.slice(0, topK).map((page) => {
|
|
2889
3115
|
const bestScore = page.bestChunk.finalScore;
|
|
2890
|
-
const
|
|
2891
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
3116
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
3117
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
2892
3118
|
return {
|
|
2893
3119
|
url: page.url,
|
|
2894
3120
|
title: page.title,
|
|
@@ -2905,6 +3131,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
2905
3131
|
};
|
|
2906
3132
|
});
|
|
2907
3133
|
} else {
|
|
3134
|
+
if (minScore > 0) {
|
|
3135
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3136
|
+
}
|
|
2908
3137
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
2909
3138
|
url: hit.metadata.url,
|
|
2910
3139
|
title: hit.metadata.title,
|
|
@@ -2976,43 +3205,54 @@ var SearchEngine = class _SearchEngine {
|
|
|
2976
3205
|
}
|
|
2977
3206
|
}
|
|
2978
3207
|
async rerankHits(query, ranked, topK) {
|
|
2979
|
-
if (this.config.rerank.
|
|
3208
|
+
if (!this.config.rerank.enabled) {
|
|
2980
3209
|
throw new SearchSocketError(
|
|
2981
3210
|
"INVALID_REQUEST",
|
|
2982
|
-
"rerank=true requested but rerank.
|
|
3211
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
2983
3212
|
400
|
|
2984
3213
|
);
|
|
2985
3214
|
}
|
|
2986
3215
|
if (!this.reranker) {
|
|
2987
3216
|
throw new SearchSocketError(
|
|
2988
3217
|
"CONFIG_MISSING",
|
|
2989
|
-
`rerank=true requested but ${this.config.
|
|
3218
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
2990
3219
|
400
|
|
2991
3220
|
);
|
|
2992
3221
|
}
|
|
2993
|
-
const
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
3222
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
3223
|
+
for (const entry of ranked) {
|
|
3224
|
+
const url = entry.hit.metadata.url;
|
|
3225
|
+
const group = pageGroups.get(url);
|
|
3226
|
+
if (group) group.push(entry);
|
|
3227
|
+
else pageGroups.set(url, [entry]);
|
|
3228
|
+
}
|
|
3229
|
+
const pageCandidates = [];
|
|
3230
|
+
for (const [url, chunks] of pageGroups) {
|
|
3231
|
+
const sorted = [...chunks].sort(
|
|
3232
|
+
(a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
|
|
3233
|
+
);
|
|
3234
|
+
const title = sorted[0].hit.metadata.title;
|
|
3235
|
+
const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3236
|
+
pageCandidates.push({ id: url, text: `${title}
|
|
3237
|
+
|
|
3238
|
+
${body}` });
|
|
3239
|
+
}
|
|
2997
3240
|
const reranked = await this.reranker.rerank(
|
|
2998
3241
|
query,
|
|
2999
|
-
|
|
3242
|
+
pageCandidates,
|
|
3000
3243
|
Math.max(topK, this.config.rerank.topN)
|
|
3001
3244
|
);
|
|
3002
|
-
const
|
|
3245
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3003
3246
|
return ranked.map((entry) => {
|
|
3004
|
-
const
|
|
3005
|
-
const
|
|
3006
|
-
if (
|
|
3007
|
-
return {
|
|
3008
|
-
...entry,
|
|
3009
|
-
finalScore: safeBaseScore
|
|
3010
|
-
};
|
|
3247
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
3248
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
3249
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
3250
|
+
return { ...entry, finalScore: base };
|
|
3011
3251
|
}
|
|
3012
|
-
const
|
|
3252
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
3013
3253
|
return {
|
|
3014
3254
|
...entry,
|
|
3015
|
-
finalScore: Number.isFinite(
|
|
3255
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
3016
3256
|
};
|
|
3017
3257
|
}).sort((a, b) => {
|
|
3018
3258
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -3332,6 +3572,7 @@ function getRootOptions(command) {
|
|
|
3332
3572
|
}
|
|
3333
3573
|
async function runIndexCommand(opts) {
|
|
3334
3574
|
const logger3 = new Logger({
|
|
3575
|
+
quiet: opts.quiet,
|
|
3335
3576
|
verbose: opts.verbose,
|
|
3336
3577
|
json: opts.json
|
|
3337
3578
|
});
|
|
@@ -3355,7 +3596,9 @@ async function runIndexCommand(opts) {
|
|
|
3355
3596
|
`);
|
|
3356
3597
|
return;
|
|
3357
3598
|
}
|
|
3358
|
-
|
|
3599
|
+
if (!opts.quiet) {
|
|
3600
|
+
printIndexSummary(stats);
|
|
3601
|
+
}
|
|
3359
3602
|
}
|
|
3360
3603
|
var program = new Command();
|
|
3361
3604
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
@@ -3379,7 +3622,7 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3379
3622
|
process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
|
|
3380
3623
|
process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
|
|
3381
3624
|
});
|
|
3382
|
-
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3625
|
+
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3383
3626
|
const rootOpts = getRootOptions(command);
|
|
3384
3627
|
const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3385
3628
|
await runIndexCommand({
|
|
@@ -3392,6 +3635,7 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3392
3635
|
source: opts.source,
|
|
3393
3636
|
maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
|
|
3394
3637
|
maxChunks: opts.maxChunks ? parsePositiveInt(opts.maxChunks, "--max-chunks") : void 0,
|
|
3638
|
+
quiet: opts.quiet,
|
|
3395
3639
|
verbose: opts.verbose,
|
|
3396
3640
|
json: opts.json
|
|
3397
3641
|
});
|
|
@@ -3554,8 +3798,8 @@ program.command("clean").description("Delete local state and optionally delete r
|
|
|
3554
3798
|
`);
|
|
3555
3799
|
if (opts.remote) {
|
|
3556
3800
|
const vectorStore = await createVectorStore(config, cwd);
|
|
3557
|
-
await vectorStore.
|
|
3558
|
-
process.stdout.write(`
|
|
3801
|
+
await vectorStore.dropAllTables();
|
|
3802
|
+
process.stdout.write(`dropped all remote tables (chunks, registry, pages)
|
|
3559
3803
|
`);
|
|
3560
3804
|
}
|
|
3561
3805
|
});
|
|
@@ -3680,14 +3924,6 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3680
3924
|
details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
|
|
3681
3925
|
});
|
|
3682
3926
|
}
|
|
3683
|
-
if (config.rerank.provider === "jina") {
|
|
3684
|
-
const jinaKey = process.env[config.rerank.jina.apiKeyEnv];
|
|
3685
|
-
checks.push({
|
|
3686
|
-
name: `env ${config.rerank.jina.apiKeyEnv}`,
|
|
3687
|
-
ok: Boolean(jinaKey),
|
|
3688
|
-
details: jinaKey ? void 0 : "missing"
|
|
3689
|
-
});
|
|
3690
|
-
}
|
|
3691
3927
|
if (config.source.mode === "static-output") {
|
|
3692
3928
|
const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
|
|
3693
3929
|
const exists = fs9.existsSync(outputDir);
|