searchsocket 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +370 -115
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +391 -109
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +389 -108
- package/dist/sveltekit.cjs +374 -109
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +373 -107
- package/dist/{types-D1K46vwd.d.cts → types-BrG6XTUU.d.cts} +29 -13
- package/dist/{types-D1K46vwd.d.ts → types-BrG6XTUU.d.ts} +29 -13
- package/package.json +1 -2
package/dist/cli.js
CHANGED
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.2.1",
|
|
15
|
+
version: "0.3.1",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -82,7 +82,6 @@ var package_default = {
|
|
|
82
82
|
"fast-glob": "^3.3.3",
|
|
83
83
|
"gray-matter": "^4.0.3",
|
|
84
84
|
jiti: "^2.6.1",
|
|
85
|
-
openai: "^6.19.0",
|
|
86
85
|
"p-limit": "^7.3.0",
|
|
87
86
|
turndown: "^7.2.2",
|
|
88
87
|
"turndown-plugin-gfm": "^1.0.2",
|
|
@@ -133,7 +132,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
133
132
|
outputDir: z.string().min(1).optional(),
|
|
134
133
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
135
134
|
exclude: z.array(z.string()).optional(),
|
|
136
|
-
previewTimeout: z.number().int().positive().optional()
|
|
135
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
136
|
+
discover: z.boolean().optional(),
|
|
137
|
+
seedUrls: z.array(z.string()).optional(),
|
|
138
|
+
maxPages: z.number().int().positive().optional(),
|
|
139
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
137
140
|
}).optional()
|
|
138
141
|
}).optional(),
|
|
139
142
|
extract: z.object({
|
|
@@ -160,8 +163,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
160
163
|
pageSummaryChunk: z.boolean().optional()
|
|
161
164
|
}).optional(),
|
|
162
165
|
embeddings: z.object({
|
|
163
|
-
provider: z.literal("openai").optional(),
|
|
166
|
+
provider: z.literal("jina").optional(),
|
|
164
167
|
model: z.string().min(1).optional(),
|
|
168
|
+
apiKey: z.string().min(1).optional(),
|
|
165
169
|
apiKeyEnv: z.string().min(1).optional(),
|
|
166
170
|
batchSize: z.number().int().positive().optional(),
|
|
167
171
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -170,18 +174,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
170
174
|
vector: z.object({
|
|
171
175
|
dimension: z.number().int().positive().optional(),
|
|
172
176
|
turso: z.object({
|
|
177
|
+
url: z.string().url().optional(),
|
|
178
|
+
authToken: z.string().min(1).optional(),
|
|
173
179
|
urlEnv: z.string().optional(),
|
|
174
180
|
authTokenEnv: z.string().optional(),
|
|
175
181
|
localPath: z.string().optional()
|
|
176
182
|
}).optional()
|
|
177
183
|
}).optional(),
|
|
178
184
|
rerank: z.object({
|
|
179
|
-
|
|
185
|
+
enabled: z.boolean().optional(),
|
|
180
186
|
topN: z.number().int().positive().optional(),
|
|
181
|
-
|
|
182
|
-
apiKeyEnv: z.string().optional(),
|
|
183
|
-
model: z.string().optional()
|
|
184
|
-
}).optional()
|
|
187
|
+
model: z.string().optional()
|
|
185
188
|
}).optional(),
|
|
186
189
|
ranking: z.object({
|
|
187
190
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -190,6 +193,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
190
193
|
aggregationCap: z.number().int().positive().optional(),
|
|
191
194
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
192
195
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
196
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
193
197
|
weights: z.object({
|
|
194
198
|
incomingLinks: z.number().optional(),
|
|
195
199
|
depth: z.number().optional(),
|
|
@@ -270,9 +274,9 @@ function createDefaultConfig(projectId) {
|
|
|
270
274
|
pageSummaryChunk: true
|
|
271
275
|
},
|
|
272
276
|
embeddings: {
|
|
273
|
-
provider: "
|
|
274
|
-
model: "
|
|
275
|
-
apiKeyEnv: "
|
|
277
|
+
provider: "jina",
|
|
278
|
+
model: "jina-embeddings-v3",
|
|
279
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
276
280
|
batchSize: 64,
|
|
277
281
|
concurrency: 4
|
|
278
282
|
},
|
|
@@ -284,12 +288,9 @@ function createDefaultConfig(projectId) {
|
|
|
284
288
|
}
|
|
285
289
|
},
|
|
286
290
|
rerank: {
|
|
287
|
-
|
|
291
|
+
enabled: false,
|
|
288
292
|
topN: 20,
|
|
289
|
-
|
|
290
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
291
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
292
|
-
}
|
|
293
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
293
294
|
},
|
|
294
295
|
ranking: {
|
|
295
296
|
enableIncomingLinkBoost: true,
|
|
@@ -298,6 +299,7 @@ function createDefaultConfig(projectId) {
|
|
|
298
299
|
aggregationCap: 5,
|
|
299
300
|
aggregationDecay: 0.5,
|
|
300
301
|
minChunkScoreRatio: 0.5,
|
|
302
|
+
minScore: 0,
|
|
301
303
|
weights: {
|
|
302
304
|
incomingLinks: 0.05,
|
|
303
305
|
depth: 0.03,
|
|
@@ -408,7 +410,11 @@ ${issues}`
|
|
|
408
410
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
409
411
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
410
412
|
exclude: parsed.source.build.exclude ?? [],
|
|
411
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
413
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
414
|
+
discover: parsed.source.build.discover ?? false,
|
|
415
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
416
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
417
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
412
418
|
} : void 0
|
|
413
419
|
},
|
|
414
420
|
extract: {
|
|
@@ -437,11 +443,7 @@ ${issues}`
|
|
|
437
443
|
},
|
|
438
444
|
rerank: {
|
|
439
445
|
...defaults.rerank,
|
|
440
|
-
...parsed.rerank
|
|
441
|
-
jina: {
|
|
442
|
-
...defaults.rerank.jina,
|
|
443
|
-
...parsed.rerank?.jina
|
|
444
|
-
}
|
|
446
|
+
...parsed.rerank
|
|
445
447
|
},
|
|
446
448
|
ranking: {
|
|
447
449
|
...defaults.ranking,
|
|
@@ -488,7 +490,11 @@ ${issues}`
|
|
|
488
490
|
outputDir: ".svelte-kit/output",
|
|
489
491
|
paramValues: {},
|
|
490
492
|
exclude: [],
|
|
491
|
-
previewTimeout: 3e4
|
|
493
|
+
previewTimeout: 3e4,
|
|
494
|
+
discover: false,
|
|
495
|
+
seedUrls: ["/"],
|
|
496
|
+
maxPages: 200,
|
|
497
|
+
maxDepth: 10
|
|
492
498
|
};
|
|
493
499
|
}
|
|
494
500
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -529,7 +535,7 @@ function writeMinimalConfig(cwd) {
|
|
|
529
535
|
return target;
|
|
530
536
|
}
|
|
531
537
|
const content = `export default {
|
|
532
|
-
embeddings: { apiKeyEnv: "
|
|
538
|
+
embeddings: { apiKeyEnv: "JINA_API_KEY" }
|
|
533
539
|
};
|
|
534
540
|
`;
|
|
535
541
|
fs.writeFileSync(target, content, "utf8");
|
|
@@ -540,14 +546,16 @@ function writeMinimalConfig(cwd) {
|
|
|
540
546
|
var Logger = class {
|
|
541
547
|
json;
|
|
542
548
|
verbose;
|
|
549
|
+
quiet;
|
|
543
550
|
stderrOnly;
|
|
544
551
|
constructor(opts = {}) {
|
|
545
552
|
this.json = opts.json ?? false;
|
|
546
553
|
this.verbose = opts.verbose ?? false;
|
|
554
|
+
this.quiet = opts.quiet ?? false;
|
|
547
555
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
548
556
|
}
|
|
549
557
|
info(message) {
|
|
550
|
-
if (this.json) {
|
|
558
|
+
if (this.quiet || this.json) {
|
|
551
559
|
return;
|
|
552
560
|
}
|
|
553
561
|
this.writeOut(`${message}
|
|
@@ -561,7 +569,7 @@ var Logger = class {
|
|
|
561
569
|
this.logJson("debug", { message });
|
|
562
570
|
return;
|
|
563
571
|
}
|
|
564
|
-
this.writeOut(
|
|
572
|
+
this.writeOut(` ${message}
|
|
565
573
|
`);
|
|
566
574
|
}
|
|
567
575
|
warn(message) {
|
|
@@ -588,7 +596,7 @@ var Logger = class {
|
|
|
588
596
|
this.logJson(event, data);
|
|
589
597
|
return;
|
|
590
598
|
}
|
|
591
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
599
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
592
600
|
`);
|
|
593
601
|
}
|
|
594
602
|
writeOut(text) {
|
|
@@ -695,18 +703,18 @@ function ensureStateDirs(cwd, stateDir, scope) {
|
|
|
695
703
|
return { statePath, pagesPath };
|
|
696
704
|
}
|
|
697
705
|
|
|
698
|
-
// src/embeddings/
|
|
699
|
-
import OpenAI from "openai";
|
|
706
|
+
// src/embeddings/jina.ts
|
|
700
707
|
import pLimit from "p-limit";
|
|
701
708
|
function sleep(ms) {
|
|
702
709
|
return new Promise((resolve) => {
|
|
703
710
|
setTimeout(resolve, ms);
|
|
704
711
|
});
|
|
705
712
|
}
|
|
706
|
-
var OpenAIEmbeddingsProvider = class {
|
|
707
|
-
|
|
713
|
+
var JinaEmbeddingsProvider = class {
|
|
714
|
+
apiKey;
|
|
708
715
|
batchSize;
|
|
709
716
|
concurrency;
|
|
717
|
+
defaultTask;
|
|
710
718
|
constructor(options) {
|
|
711
719
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
712
720
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -714,11 +722,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
714
722
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
715
723
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
716
724
|
}
|
|
717
|
-
this.
|
|
718
|
-
apiKey: options.apiKey
|
|
719
|
-
});
|
|
725
|
+
this.apiKey = options.apiKey;
|
|
720
726
|
this.batchSize = options.batchSize;
|
|
721
727
|
this.concurrency = options.concurrency;
|
|
728
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
722
729
|
}
|
|
723
730
|
estimateTokens(text) {
|
|
724
731
|
const normalized = text.trim();
|
|
@@ -732,7 +739,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
732
739
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
733
740
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
734
741
|
}
|
|
735
|
-
async embedTexts(texts, modelId) {
|
|
742
|
+
async embedTexts(texts, modelId, task) {
|
|
736
743
|
if (texts.length === 0) {
|
|
737
744
|
return [];
|
|
738
745
|
}
|
|
@@ -748,33 +755,52 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
748
755
|
await Promise.all(
|
|
749
756
|
batches.map(
|
|
750
757
|
(batch, position) => limit(async () => {
|
|
751
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
758
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
752
759
|
})
|
|
753
760
|
)
|
|
754
761
|
);
|
|
755
762
|
return outputs.flat();
|
|
756
763
|
}
|
|
757
|
-
async embedWithRetry(texts, modelId) {
|
|
764
|
+
async embedWithRetry(texts, modelId, task) {
|
|
758
765
|
const maxAttempts = 5;
|
|
759
766
|
let attempt = 0;
|
|
760
767
|
while (attempt < maxAttempts) {
|
|
761
768
|
attempt += 1;
|
|
769
|
+
let response;
|
|
762
770
|
try {
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
771
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
772
|
+
method: "POST",
|
|
773
|
+
headers: {
|
|
774
|
+
"content-type": "application/json",
|
|
775
|
+
authorization: `Bearer ${this.apiKey}`
|
|
776
|
+
},
|
|
777
|
+
body: JSON.stringify({
|
|
778
|
+
model: modelId,
|
|
779
|
+
input: texts,
|
|
780
|
+
task
|
|
781
|
+
})
|
|
767
782
|
});
|
|
768
|
-
return response.data.map((entry) => entry.embedding);
|
|
769
783
|
} catch (error) {
|
|
770
|
-
|
|
771
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
772
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
784
|
+
if (attempt >= maxAttempts) {
|
|
773
785
|
throw error;
|
|
774
786
|
}
|
|
775
|
-
|
|
776
|
-
|
|
787
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
788
|
+
continue;
|
|
789
|
+
}
|
|
790
|
+
if (!response.ok) {
|
|
791
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
792
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
793
|
+
const errorBody = await response.text();
|
|
794
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
795
|
+
}
|
|
796
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
797
|
+
continue;
|
|
798
|
+
}
|
|
799
|
+
const payload = await response.json();
|
|
800
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
801
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
777
802
|
}
|
|
803
|
+
return payload.data.map((entry) => entry.embedding);
|
|
778
804
|
}
|
|
779
805
|
throw new Error("Unreachable retry state");
|
|
780
806
|
}
|
|
@@ -782,20 +808,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
782
808
|
|
|
783
809
|
// src/embeddings/factory.ts
|
|
784
810
|
function createEmbeddingsProvider(config) {
|
|
785
|
-
if (config.embeddings.provider !== "openai") {
|
|
811
|
+
if (config.embeddings.provider !== "jina") {
|
|
786
812
|
throw new SearchSocketError(
|
|
787
813
|
"CONFIG_MISSING",
|
|
788
814
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
789
815
|
);
|
|
790
816
|
}
|
|
791
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
817
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
792
818
|
if (!apiKey) {
|
|
793
819
|
throw new SearchSocketError(
|
|
794
820
|
"CONFIG_MISSING",
|
|
795
|
-
`Missing embeddings API key env var
|
|
821
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
796
822
|
);
|
|
797
823
|
}
|
|
798
|
-
return new OpenAIEmbeddingsProvider({
|
|
824
|
+
return new JinaEmbeddingsProvider({
|
|
799
825
|
apiKey,
|
|
800
826
|
batchSize: config.embeddings.batchSize,
|
|
801
827
|
concurrency: config.embeddings.concurrency
|
|
@@ -809,6 +835,11 @@ import path11 from "path";
|
|
|
809
835
|
import fs3 from "fs";
|
|
810
836
|
import path3 from "path";
|
|
811
837
|
|
|
838
|
+
// src/core/serverless.ts
|
|
839
|
+
function isServerless() {
|
|
840
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
841
|
+
}
|
|
842
|
+
|
|
812
843
|
// src/vector/turso.ts
|
|
813
844
|
var TursoVectorStore = class {
|
|
814
845
|
client;
|
|
@@ -853,6 +884,16 @@ var TursoVectorStore = class {
|
|
|
853
884
|
}
|
|
854
885
|
async ensureChunks(dim) {
|
|
855
886
|
if (this.chunksReady) return;
|
|
887
|
+
const exists = await this.chunksTableExists();
|
|
888
|
+
if (exists) {
|
|
889
|
+
const currentDim = await this.getChunksDimension();
|
|
890
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
891
|
+
await this.client.batch([
|
|
892
|
+
"DROP INDEX IF EXISTS idx",
|
|
893
|
+
"DROP TABLE IF EXISTS chunks"
|
|
894
|
+
]);
|
|
895
|
+
}
|
|
896
|
+
}
|
|
856
897
|
await this.client.batch([
|
|
857
898
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
858
899
|
id TEXT PRIMARY KEY,
|
|
@@ -864,12 +905,16 @@ var TursoVectorStore = class {
|
|
|
864
905
|
section_title TEXT NOT NULL DEFAULT '',
|
|
865
906
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
866
907
|
snippet TEXT NOT NULL DEFAULT '',
|
|
908
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
909
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
867
910
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
868
911
|
model_id TEXT NOT NULL DEFAULT '',
|
|
869
912
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
870
913
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
871
914
|
route_file TEXT NOT NULL DEFAULT '',
|
|
872
915
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
916
|
+
description TEXT NOT NULL DEFAULT '',
|
|
917
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
873
918
|
embedding F32_BLOB(${dim})
|
|
874
919
|
)`,
|
|
875
920
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
@@ -908,6 +953,38 @@ var TursoVectorStore = class {
|
|
|
908
953
|
throw error;
|
|
909
954
|
}
|
|
910
955
|
}
|
|
956
|
+
/**
|
|
957
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
958
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
959
|
+
*/
|
|
960
|
+
async getChunksDimension() {
|
|
961
|
+
try {
|
|
962
|
+
const rs = await this.client.execute(
|
|
963
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
964
|
+
);
|
|
965
|
+
if (rs.rows.length === 0) return null;
|
|
966
|
+
const sql = rs.rows[0].sql;
|
|
967
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
968
|
+
return match ? parseInt(match[1], 10) : null;
|
|
969
|
+
} catch {
|
|
970
|
+
return null;
|
|
971
|
+
}
|
|
972
|
+
}
|
|
973
|
+
/**
|
|
974
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
975
|
+
* Used by `clean --remote` for a full reset.
|
|
976
|
+
*/
|
|
977
|
+
async dropAllTables() {
|
|
978
|
+
await this.client.batch([
|
|
979
|
+
"DROP INDEX IF EXISTS idx",
|
|
980
|
+
"DROP TABLE IF EXISTS chunks",
|
|
981
|
+
"DROP TABLE IF EXISTS registry",
|
|
982
|
+
"DROP TABLE IF EXISTS pages"
|
|
983
|
+
]);
|
|
984
|
+
this.chunksReady = false;
|
|
985
|
+
this.registryReady = false;
|
|
986
|
+
this.pagesReady = false;
|
|
987
|
+
}
|
|
911
988
|
async upsert(records, _scope) {
|
|
912
989
|
if (records.length === 0) return;
|
|
913
990
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -918,9 +995,9 @@ var TursoVectorStore = class {
|
|
|
918
995
|
const stmts = batch.map((r) => ({
|
|
919
996
|
sql: `INSERT OR REPLACE INTO chunks
|
|
920
997
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
921
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
922
|
-
incoming_links, route_file, tags, embedding)
|
|
923
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
998
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
999
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
1000
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
924
1001
|
args: [
|
|
925
1002
|
r.id,
|
|
926
1003
|
r.metadata.projectId,
|
|
@@ -931,12 +1008,16 @@ var TursoVectorStore = class {
|
|
|
931
1008
|
r.metadata.sectionTitle,
|
|
932
1009
|
JSON.stringify(r.metadata.headingPath),
|
|
933
1010
|
r.metadata.snippet,
|
|
1011
|
+
r.metadata.chunkText,
|
|
1012
|
+
r.metadata.ordinal,
|
|
934
1013
|
r.metadata.contentHash,
|
|
935
1014
|
r.metadata.modelId,
|
|
936
1015
|
r.metadata.depth,
|
|
937
1016
|
r.metadata.incomingLinks,
|
|
938
1017
|
r.metadata.routeFile,
|
|
939
1018
|
JSON.stringify(r.metadata.tags),
|
|
1019
|
+
r.metadata.description ?? "",
|
|
1020
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
940
1021
|
JSON.stringify(r.vector)
|
|
941
1022
|
]
|
|
942
1023
|
}));
|
|
@@ -949,8 +1030,10 @@ var TursoVectorStore = class {
|
|
|
949
1030
|
const queryJson = JSON.stringify(queryVector);
|
|
950
1031
|
const rs = await this.client.execute({
|
|
951
1032
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
952
|
-
c.section_title, c.heading_path, c.snippet, c.content_hash,
|
|
1033
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
1034
|
+
c.ordinal, c.content_hash,
|
|
953
1035
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
1036
|
+
c.description, c.keywords,
|
|
954
1037
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
955
1038
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
956
1039
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -981,6 +1064,12 @@ var TursoVectorStore = class {
|
|
|
981
1064
|
}
|
|
982
1065
|
const distance = row.distance;
|
|
983
1066
|
const score = 1 - distance;
|
|
1067
|
+
const description = row.description || void 0;
|
|
1068
|
+
const keywords = (() => {
|
|
1069
|
+
const raw = row.keywords || "[]";
|
|
1070
|
+
const parsed = JSON.parse(raw);
|
|
1071
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
1072
|
+
})();
|
|
984
1073
|
hits.push({
|
|
985
1074
|
id: row.id,
|
|
986
1075
|
score,
|
|
@@ -993,12 +1082,16 @@ var TursoVectorStore = class {
|
|
|
993
1082
|
sectionTitle: row.section_title,
|
|
994
1083
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
995
1084
|
snippet: row.snippet,
|
|
1085
|
+
chunkText: row.chunk_text || "",
|
|
1086
|
+
ordinal: row.ordinal || 0,
|
|
996
1087
|
contentHash: row.content_hash,
|
|
997
1088
|
modelId: row.model_id,
|
|
998
1089
|
depth: row.depth,
|
|
999
1090
|
incomingLinks: row.incoming_links,
|
|
1000
1091
|
routeFile: row.route_file,
|
|
1001
|
-
tags
|
|
1092
|
+
tags,
|
|
1093
|
+
description,
|
|
1094
|
+
keywords
|
|
1002
1095
|
}
|
|
1003
1096
|
});
|
|
1004
1097
|
}
|
|
@@ -1188,10 +1281,10 @@ var TursoVectorStore = class {
|
|
|
1188
1281
|
// src/vector/factory.ts
|
|
1189
1282
|
async function createVectorStore(config, cwd) {
|
|
1190
1283
|
const turso = config.vector.turso;
|
|
1191
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
1284
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
1192
1285
|
if (remoteUrl) {
|
|
1193
1286
|
const { createClient: createClient2 } = await import("@libsql/client/http");
|
|
1194
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
1287
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
1195
1288
|
const client2 = createClient2({
|
|
1196
1289
|
url: remoteUrl,
|
|
1197
1290
|
authToken
|
|
@@ -1201,6 +1294,12 @@ async function createVectorStore(config, cwd) {
|
|
|
1201
1294
|
dimension: config.vector.dimension
|
|
1202
1295
|
});
|
|
1203
1296
|
}
|
|
1297
|
+
if (isServerless()) {
|
|
1298
|
+
throw new SearchSocketError(
|
|
1299
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
1300
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
1301
|
+
);
|
|
1302
|
+
}
|
|
1204
1303
|
const { createClient } = await import("@libsql/client");
|
|
1205
1304
|
const localPath = path3.resolve(cwd, turso.localPath);
|
|
1206
1305
|
fs3.mkdirSync(path3.dirname(localPath), { recursive: true });
|
|
@@ -1537,7 +1636,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
1537
1636
|
incomingLinks: page.incomingLinks,
|
|
1538
1637
|
routeFile: page.routeFile,
|
|
1539
1638
|
tags: page.tags,
|
|
1540
|
-
contentHash: ""
|
|
1639
|
+
contentHash: "",
|
|
1640
|
+
description: page.description,
|
|
1641
|
+
keywords: page.keywords
|
|
1541
1642
|
};
|
|
1542
1643
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
1543
1644
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -1564,7 +1665,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
1564
1665
|
incomingLinks: page.incomingLinks,
|
|
1565
1666
|
routeFile: page.routeFile,
|
|
1566
1667
|
tags: page.tags,
|
|
1567
|
-
contentHash: ""
|
|
1668
|
+
contentHash: "",
|
|
1669
|
+
description: page.description,
|
|
1670
|
+
keywords: page.keywords
|
|
1568
1671
|
};
|
|
1569
1672
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
1570
1673
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -1828,6 +1931,7 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
1828
1931
|
}
|
|
1829
1932
|
|
|
1830
1933
|
// src/indexing/sources/build/index.ts
|
|
1934
|
+
import { load as cheerioLoad } from "cheerio";
|
|
1831
1935
|
import pLimit2 from "p-limit";
|
|
1832
1936
|
|
|
1833
1937
|
// src/indexing/sources/build/manifest-parser.ts
|
|
@@ -2004,11 +2108,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
2004
2108
|
|
|
2005
2109
|
// src/indexing/sources/build/index.ts
|
|
2006
2110
|
var logger = new Logger();
|
|
2111
|
+
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
|
|
2112
|
+
const $ = cheerioLoad(html);
|
|
2113
|
+
const links = [];
|
|
2114
|
+
$("a[href]").each((_i, el) => {
|
|
2115
|
+
const href = $(el).attr("href");
|
|
2116
|
+
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
2117
|
+
return;
|
|
2118
|
+
}
|
|
2119
|
+
try {
|
|
2120
|
+
const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
|
|
2121
|
+
if (resolved.origin !== baseOrigin) return;
|
|
2122
|
+
if (!["http:", "https:"].includes(resolved.protocol)) return;
|
|
2123
|
+
links.push(normalizeUrlPath(resolved.pathname));
|
|
2124
|
+
} catch {
|
|
2125
|
+
}
|
|
2126
|
+
});
|
|
2127
|
+
return [...new Set(links)];
|
|
2128
|
+
}
|
|
2129
|
+
async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
2130
|
+
const { seedUrls, maxDepth, exclude } = buildConfig;
|
|
2131
|
+
const baseOrigin = new URL(server.baseUrl).origin;
|
|
2132
|
+
let effectiveMax = buildConfig.maxPages;
|
|
2133
|
+
if (typeof pipelineMaxPages === "number") {
|
|
2134
|
+
const floored = Math.max(0, Math.floor(pipelineMaxPages));
|
|
2135
|
+
effectiveMax = Math.min(effectiveMax, floored);
|
|
2136
|
+
}
|
|
2137
|
+
if (effectiveMax === 0) return [];
|
|
2138
|
+
const visited = /* @__PURE__ */ new Set();
|
|
2139
|
+
const pages = [];
|
|
2140
|
+
const queue = [];
|
|
2141
|
+
const limit = pLimit2(8);
|
|
2142
|
+
for (const seed of seedUrls) {
|
|
2143
|
+
const normalized = normalizeUrlPath(seed);
|
|
2144
|
+
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
2145
|
+
visited.add(normalized);
|
|
2146
|
+
queue.push({ url: normalized, depth: 0 });
|
|
2147
|
+
}
|
|
2148
|
+
}
|
|
2149
|
+
while (queue.length > 0 && pages.length < effectiveMax) {
|
|
2150
|
+
const remaining = effectiveMax - pages.length;
|
|
2151
|
+
const batch = queue.splice(0, remaining);
|
|
2152
|
+
const results = await Promise.allSettled(
|
|
2153
|
+
batch.map(
|
|
2154
|
+
(item) => limit(async () => {
|
|
2155
|
+
const fullUrl = joinUrl(server.baseUrl, item.url);
|
|
2156
|
+
const response = await fetch(fullUrl);
|
|
2157
|
+
if (!response.ok) {
|
|
2158
|
+
logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
|
|
2159
|
+
return null;
|
|
2160
|
+
}
|
|
2161
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
2162
|
+
if (!contentType.includes("text/html")) {
|
|
2163
|
+
return null;
|
|
2164
|
+
}
|
|
2165
|
+
const html = await response.text();
|
|
2166
|
+
if (item.depth < maxDepth) {
|
|
2167
|
+
const links = extractLinksFromHtml(html, item.url, baseOrigin);
|
|
2168
|
+
for (const link of links) {
|
|
2169
|
+
if (!visited.has(link) && !isExcluded(link, exclude)) {
|
|
2170
|
+
visited.add(link);
|
|
2171
|
+
queue.push({ url: link, depth: item.depth + 1 });
|
|
2172
|
+
}
|
|
2173
|
+
}
|
|
2174
|
+
}
|
|
2175
|
+
return {
|
|
2176
|
+
url: item.url,
|
|
2177
|
+
html,
|
|
2178
|
+
sourcePath: fullUrl,
|
|
2179
|
+
outgoingLinks: []
|
|
2180
|
+
};
|
|
2181
|
+
})
|
|
2182
|
+
)
|
|
2183
|
+
);
|
|
2184
|
+
for (const result of results) {
|
|
2185
|
+
if (result.status === "fulfilled" && result.value) {
|
|
2186
|
+
pages.push(result.value);
|
|
2187
|
+
}
|
|
2188
|
+
}
|
|
2189
|
+
}
|
|
2190
|
+
if (pages.length >= effectiveMax && queue.length > 0) {
|
|
2191
|
+
logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
|
|
2192
|
+
}
|
|
2193
|
+
logger.event("build_discover_complete", {
|
|
2194
|
+
pagesFound: pages.length,
|
|
2195
|
+
urlsVisited: visited.size,
|
|
2196
|
+
urlsSkipped: queue.length
|
|
2197
|
+
});
|
|
2198
|
+
return pages;
|
|
2199
|
+
}
|
|
2007
2200
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
2008
2201
|
const buildConfig = config.source.build;
|
|
2009
2202
|
if (!buildConfig) {
|
|
2010
2203
|
throw new Error("build source config is missing");
|
|
2011
2204
|
}
|
|
2205
|
+
if (buildConfig.discover) {
|
|
2206
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
2207
|
+
try {
|
|
2208
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
2209
|
+
} finally {
|
|
2210
|
+
await server2.shutdown();
|
|
2211
|
+
}
|
|
2212
|
+
}
|
|
2012
2213
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
2013
2214
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
2014
2215
|
logger.event("build_routes_discovered", {
|
|
@@ -2112,11 +2313,11 @@ async function loadContentFilesPages(cwd, config, maxPages) {
|
|
|
2112
2313
|
|
|
2113
2314
|
// src/indexing/sources/crawl.ts
|
|
2114
2315
|
import { gunzipSync } from "zlib";
|
|
2115
|
-
import { load as
|
|
2316
|
+
import { load as cheerioLoad2 } from "cheerio";
|
|
2116
2317
|
import pLimit3 from "p-limit";
|
|
2117
2318
|
var logger2 = new Logger();
|
|
2118
2319
|
function extractLocs(xml) {
|
|
2119
|
-
const $ =
|
|
2320
|
+
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
2120
2321
|
const locs = [];
|
|
2121
2322
|
$("loc").each((_i, el) => {
|
|
2122
2323
|
const text = $(el).text().trim();
|
|
@@ -2127,7 +2328,7 @@ function extractLocs(xml) {
|
|
|
2127
2328
|
return locs;
|
|
2128
2329
|
}
|
|
2129
2330
|
function isSitemapIndex(xml) {
|
|
2130
|
-
const $ =
|
|
2331
|
+
const $ = cheerioLoad2(xml, { xmlMode: true });
|
|
2131
2332
|
return $("sitemapindex").length > 0;
|
|
2132
2333
|
}
|
|
2133
2334
|
async function fetchSitemapXml(url) {
|
|
@@ -2265,9 +2466,7 @@ function hrTimeMs(start) {
|
|
|
2265
2466
|
|
|
2266
2467
|
// src/indexing/pipeline.ts
|
|
2267
2468
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
2268
|
-
"
|
|
2269
|
-
"text-embedding-3-large": 13e-5,
|
|
2270
|
-
"text-embedding-ada-002": 1e-4
|
|
2469
|
+
"jina-embeddings-v3": 2e-5
|
|
2271
2470
|
};
|
|
2272
2471
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
2273
2472
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -2313,9 +2512,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2313
2512
|
};
|
|
2314
2513
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
2315
2514
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
2515
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
2516
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
2316
2517
|
if (options.force) {
|
|
2518
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
2317
2519
|
await cleanMirrorForScope(statePath, scope);
|
|
2318
2520
|
}
|
|
2521
|
+
if (options.dryRun) {
|
|
2522
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
2523
|
+
}
|
|
2319
2524
|
const manifestStart = stageStart();
|
|
2320
2525
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
2321
2526
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -2326,8 +2531,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2326
2531
|
);
|
|
2327
2532
|
}
|
|
2328
2533
|
stageEnd("manifest", manifestStart);
|
|
2534
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
2329
2535
|
const sourceStart = stageStart();
|
|
2330
|
-
|
|
2536
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
2331
2537
|
let sourcePages;
|
|
2332
2538
|
if (sourceMode === "static-output") {
|
|
2333
2539
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -2339,10 +2545,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2339
2545
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
2340
2546
|
}
|
|
2341
2547
|
stageEnd("source", sourceStart);
|
|
2548
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
2342
2549
|
const routeStart = stageStart();
|
|
2343
2550
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2344
2551
|
stageEnd("route_map", routeStart);
|
|
2552
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
2345
2553
|
const extractStart = stageStart();
|
|
2554
|
+
this.logger.info("Extracting content...");
|
|
2346
2555
|
const extractedPages = [];
|
|
2347
2556
|
for (const sourcePage of sourcePages) {
|
|
2348
2557
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -2371,6 +2580,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2371
2580
|
uniquePages.push(page);
|
|
2372
2581
|
}
|
|
2373
2582
|
stageEnd("extract", extractStart);
|
|
2583
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
2584
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
2374
2585
|
const linkStart = stageStart();
|
|
2375
2586
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
2376
2587
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -2386,7 +2597,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2386
2597
|
}
|
|
2387
2598
|
}
|
|
2388
2599
|
stageEnd("links", linkStart);
|
|
2600
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
2389
2601
|
const mirrorStart = stageStart();
|
|
2602
|
+
this.logger.info("Writing mirror pages...");
|
|
2390
2603
|
const mirrorPages = [];
|
|
2391
2604
|
let routeExact = 0;
|
|
2392
2605
|
let routeBestEffort = 0;
|
|
@@ -2456,7 +2669,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2456
2669
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
2457
2670
|
}
|
|
2458
2671
|
stageEnd("mirror", mirrorStart);
|
|
2672
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
2459
2673
|
const chunkStart = stageStart();
|
|
2674
|
+
this.logger.info("Chunking pages...");
|
|
2460
2675
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
2461
2676
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
2462
2677
|
if (typeof maxChunks === "number") {
|
|
@@ -2469,6 +2684,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2469
2684
|
});
|
|
2470
2685
|
}
|
|
2471
2686
|
stageEnd("chunk", chunkStart);
|
|
2687
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
2472
2688
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
2473
2689
|
for (const chunk of chunks) {
|
|
2474
2690
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -2487,6 +2703,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2487
2703
|
return existingHash !== chunk.contentHash;
|
|
2488
2704
|
});
|
|
2489
2705
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
2706
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
2490
2707
|
const embedStart = stageStart();
|
|
2491
2708
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
2492
2709
|
for (const chunk of changedChunks) {
|
|
@@ -2501,9 +2718,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2501
2718
|
let newEmbeddings = 0;
|
|
2502
2719
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
2503
2720
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
2721
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
2504
2722
|
const embeddings = await this.embeddings.embedTexts(
|
|
2505
2723
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
2506
|
-
this.config.embeddings.model
|
|
2724
|
+
this.config.embeddings.model,
|
|
2725
|
+
"retrieval.passage"
|
|
2507
2726
|
);
|
|
2508
2727
|
if (embeddings.length !== changedChunks.length) {
|
|
2509
2728
|
throw new SearchSocketError(
|
|
@@ -2526,8 +2745,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2526
2745
|
}
|
|
2527
2746
|
}
|
|
2528
2747
|
stageEnd("embedding", embedStart);
|
|
2748
|
+
if (changedChunks.length > 0) {
|
|
2749
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
2750
|
+
} else {
|
|
2751
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
2752
|
+
}
|
|
2529
2753
|
const syncStart = stageStart();
|
|
2530
2754
|
if (!options.dryRun) {
|
|
2755
|
+
this.logger.info("Syncing vectors...");
|
|
2531
2756
|
const upserts = [];
|
|
2532
2757
|
for (const chunk of changedChunks) {
|
|
2533
2758
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -2546,12 +2771,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2546
2771
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
2547
2772
|
headingPath: chunk.headingPath,
|
|
2548
2773
|
snippet: chunk.snippet,
|
|
2774
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
2775
|
+
ordinal: chunk.ordinal,
|
|
2549
2776
|
contentHash: chunk.contentHash,
|
|
2550
2777
|
modelId: this.config.embeddings.model,
|
|
2551
2778
|
depth: chunk.depth,
|
|
2552
2779
|
incomingLinks: chunk.incomingLinks,
|
|
2553
2780
|
routeFile: chunk.routeFile,
|
|
2554
|
-
tags: chunk.tags
|
|
2781
|
+
tags: chunk.tags,
|
|
2782
|
+
description: chunk.description,
|
|
2783
|
+
keywords: chunk.keywords
|
|
2555
2784
|
}
|
|
2556
2785
|
});
|
|
2557
2786
|
}
|
|
@@ -2565,6 +2794,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2565
2794
|
}
|
|
2566
2795
|
}
|
|
2567
2796
|
stageEnd("sync", syncStart);
|
|
2797
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
2568
2798
|
const finalizeStart = stageStart();
|
|
2569
2799
|
if (!options.dryRun) {
|
|
2570
2800
|
const scopeInfo = {
|
|
@@ -2584,6 +2814,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2584
2814
|
});
|
|
2585
2815
|
}
|
|
2586
2816
|
stageEnd("finalize", finalizeStart);
|
|
2817
|
+
this.logger.info("Done.");
|
|
2587
2818
|
return {
|
|
2588
2819
|
pagesProcessed: mirrorPages.length,
|
|
2589
2820
|
chunksTotal: chunks.length,
|
|
@@ -2693,20 +2924,17 @@ var JinaReranker = class {
|
|
|
2693
2924
|
|
|
2694
2925
|
// src/rerank/factory.ts
|
|
2695
2926
|
function createReranker(config) {
|
|
2696
|
-
if (config.rerank.
|
|
2927
|
+
if (!config.rerank.enabled) {
|
|
2697
2928
|
return null;
|
|
2698
2929
|
}
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
return null;
|
|
2703
|
-
}
|
|
2704
|
-
return new JinaReranker({
|
|
2705
|
-
apiKey,
|
|
2706
|
-
model: config.rerank.jina.model
|
|
2707
|
-
});
|
|
2930
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
2931
|
+
if (!apiKey) {
|
|
2932
|
+
return null;
|
|
2708
2933
|
}
|
|
2709
|
-
return
|
|
2934
|
+
return new JinaReranker({
|
|
2935
|
+
apiKey,
|
|
2936
|
+
model: config.rerank.model
|
|
2937
|
+
});
|
|
2710
2938
|
}
|
|
2711
2939
|
|
|
2712
2940
|
// src/search/ranking.ts
|
|
@@ -2854,7 +3082,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
2854
3082
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
2855
3083
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
2856
3084
|
const embedStart = process.hrtime.bigint();
|
|
2857
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
3085
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
2858
3086
|
const queryVector = queryEmbeddings[0];
|
|
2859
3087
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
2860
3088
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -2882,13 +3110,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
2882
3110
|
usedRerank = true;
|
|
2883
3111
|
}
|
|
2884
3112
|
let results;
|
|
3113
|
+
const minScore = this.config.ranking.minScore;
|
|
2885
3114
|
if (groupByPage) {
|
|
2886
|
-
|
|
3115
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
3116
|
+
if (minScore > 0) {
|
|
3117
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3118
|
+
}
|
|
2887
3119
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
2888
3120
|
results = pages.slice(0, topK).map((page) => {
|
|
2889
3121
|
const bestScore = page.bestChunk.finalScore;
|
|
2890
|
-
const
|
|
2891
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
3122
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
3123
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
2892
3124
|
return {
|
|
2893
3125
|
url: page.url,
|
|
2894
3126
|
title: page.title,
|
|
@@ -2905,6 +3137,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
2905
3137
|
};
|
|
2906
3138
|
});
|
|
2907
3139
|
} else {
|
|
3140
|
+
if (minScore > 0) {
|
|
3141
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3142
|
+
}
|
|
2908
3143
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
2909
3144
|
url: hit.metadata.url,
|
|
2910
3145
|
title: hit.metadata.title,
|
|
@@ -2976,43 +3211,67 @@ var SearchEngine = class _SearchEngine {
|
|
|
2976
3211
|
}
|
|
2977
3212
|
}
|
|
2978
3213
|
async rerankHits(query, ranked, topK) {
|
|
2979
|
-
if (this.config.rerank.
|
|
3214
|
+
if (!this.config.rerank.enabled) {
|
|
2980
3215
|
throw new SearchSocketError(
|
|
2981
3216
|
"INVALID_REQUEST",
|
|
2982
|
-
"rerank=true requested but rerank.
|
|
3217
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
2983
3218
|
400
|
|
2984
3219
|
);
|
|
2985
3220
|
}
|
|
2986
3221
|
if (!this.reranker) {
|
|
2987
3222
|
throw new SearchSocketError(
|
|
2988
3223
|
"CONFIG_MISSING",
|
|
2989
|
-
`rerank=true requested but ${this.config.
|
|
3224
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
2990
3225
|
400
|
|
2991
3226
|
);
|
|
2992
3227
|
}
|
|
2993
|
-
const
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
3228
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
3229
|
+
for (const entry of ranked) {
|
|
3230
|
+
const url = entry.hit.metadata.url;
|
|
3231
|
+
const group = pageGroups.get(url);
|
|
3232
|
+
if (group) group.push(entry);
|
|
3233
|
+
else pageGroups.set(url, [entry]);
|
|
3234
|
+
}
|
|
3235
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
3236
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
3237
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
3238
|
+
const pageCandidates = [];
|
|
3239
|
+
for (const [url, chunks] of pageGroups) {
|
|
3240
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
3241
|
+
const bestScore = byScore[0].finalScore;
|
|
3242
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
3243
|
+
const selected = byScore.filter(
|
|
3244
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
3245
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
3246
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
3247
|
+
const first = selected[0].hit.metadata;
|
|
3248
|
+
const parts = [first.title];
|
|
3249
|
+
if (first.description) {
|
|
3250
|
+
parts.push(first.description);
|
|
3251
|
+
}
|
|
3252
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
3253
|
+
parts.push(first.keywords.join(", "));
|
|
3254
|
+
}
|
|
3255
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3256
|
+
parts.push(body);
|
|
3257
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
3258
|
+
}
|
|
2997
3259
|
const reranked = await this.reranker.rerank(
|
|
2998
3260
|
query,
|
|
2999
|
-
|
|
3261
|
+
pageCandidates,
|
|
3000
3262
|
Math.max(topK, this.config.rerank.topN)
|
|
3001
3263
|
);
|
|
3002
|
-
const
|
|
3264
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3003
3265
|
return ranked.map((entry) => {
|
|
3004
|
-
const
|
|
3005
|
-
const
|
|
3006
|
-
if (
|
|
3007
|
-
return {
|
|
3008
|
-
...entry,
|
|
3009
|
-
finalScore: safeBaseScore
|
|
3010
|
-
};
|
|
3266
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
3267
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
3268
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
3269
|
+
return { ...entry, finalScore: base };
|
|
3011
3270
|
}
|
|
3012
|
-
const
|
|
3271
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
3013
3272
|
return {
|
|
3014
3273
|
...entry,
|
|
3015
|
-
finalScore: Number.isFinite(
|
|
3274
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
3016
3275
|
};
|
|
3017
3276
|
}).sort((a, b) => {
|
|
3018
3277
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -3332,6 +3591,7 @@ function getRootOptions(command) {
|
|
|
3332
3591
|
}
|
|
3333
3592
|
async function runIndexCommand(opts) {
|
|
3334
3593
|
const logger3 = new Logger({
|
|
3594
|
+
quiet: opts.quiet,
|
|
3335
3595
|
verbose: opts.verbose,
|
|
3336
3596
|
json: opts.json
|
|
3337
3597
|
});
|
|
@@ -3355,7 +3615,9 @@ async function runIndexCommand(opts) {
|
|
|
3355
3615
|
`);
|
|
3356
3616
|
return;
|
|
3357
3617
|
}
|
|
3358
|
-
|
|
3618
|
+
if (!opts.quiet) {
|
|
3619
|
+
printIndexSummary(stats);
|
|
3620
|
+
}
|
|
3359
3621
|
}
|
|
3360
3622
|
var program = new Command();
|
|
3361
3623
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
@@ -3379,7 +3641,7 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3379
3641
|
process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
|
|
3380
3642
|
process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
|
|
3381
3643
|
});
|
|
3382
|
-
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3644
|
+
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3383
3645
|
const rootOpts = getRootOptions(command);
|
|
3384
3646
|
const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3385
3647
|
await runIndexCommand({
|
|
@@ -3392,6 +3654,7 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3392
3654
|
source: opts.source,
|
|
3393
3655
|
maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
|
|
3394
3656
|
maxChunks: opts.maxChunks ? parsePositiveInt(opts.maxChunks, "--max-chunks") : void 0,
|
|
3657
|
+
quiet: opts.quiet,
|
|
3395
3658
|
verbose: opts.verbose,
|
|
3396
3659
|
json: opts.json
|
|
3397
3660
|
});
|
|
@@ -3554,8 +3817,8 @@ program.command("clean").description("Delete local state and optionally delete r
|
|
|
3554
3817
|
`);
|
|
3555
3818
|
if (opts.remote) {
|
|
3556
3819
|
const vectorStore = await createVectorStore(config, cwd);
|
|
3557
|
-
await vectorStore.
|
|
3558
|
-
process.stdout.write(`
|
|
3820
|
+
await vectorStore.dropAllTables();
|
|
3821
|
+
process.stdout.write(`dropped all remote tables (chunks, registry, pages)
|
|
3559
3822
|
`);
|
|
3560
3823
|
}
|
|
3561
3824
|
});
|
|
@@ -3680,14 +3943,6 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3680
3943
|
details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
|
|
3681
3944
|
});
|
|
3682
3945
|
}
|
|
3683
|
-
if (config.rerank.provider === "jina") {
|
|
3684
|
-
const jinaKey = process.env[config.rerank.jina.apiKeyEnv];
|
|
3685
|
-
checks.push({
|
|
3686
|
-
name: `env ${config.rerank.jina.apiKeyEnv}`,
|
|
3687
|
-
ok: Boolean(jinaKey),
|
|
3688
|
-
details: jinaKey ? void 0 : "missing"
|
|
3689
|
-
});
|
|
3690
|
-
}
|
|
3691
3946
|
if (config.source.mode === "static-output") {
|
|
3692
3947
|
const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
|
|
3693
3948
|
const exists = fs9.existsSync(outputDir);
|