searchsocket 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +348 -111
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +3 -3
- package/dist/cli.js.map +0 -1
- package/dist/client.cjs.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/index.cjs.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/sveltekit.cjs.map +0 -1
- package/dist/sveltekit.js.map +0 -1
package/dist/sveltekit.cjs
CHANGED
|
@@ -4,8 +4,7 @@ var fs = require('fs');
|
|
|
4
4
|
var path = require('path');
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
|
-
var
|
|
8
|
-
var pLimit = require('p-limit');
|
|
7
|
+
var pLimit2 = require('p-limit');
|
|
9
8
|
var child_process = require('child_process');
|
|
10
9
|
var crypto = require('crypto');
|
|
11
10
|
var cheerio = require('cheerio');
|
|
@@ -19,8 +18,7 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
|
19
18
|
|
|
20
19
|
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
21
20
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
22
|
-
var
|
|
23
|
-
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
21
|
+
var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
|
|
24
22
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
25
23
|
var fs4__default = /*#__PURE__*/_interopDefault(fs4);
|
|
26
24
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
@@ -16629,7 +16627,11 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16629
16627
|
outputDir: zod.z.string().min(1).optional(),
|
|
16630
16628
|
paramValues: zod.z.record(zod.z.string(), zod.z.array(zod.z.string())).optional(),
|
|
16631
16629
|
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16632
|
-
previewTimeout: zod.z.number().int().positive().optional()
|
|
16630
|
+
previewTimeout: zod.z.number().int().positive().optional(),
|
|
16631
|
+
discover: zod.z.boolean().optional(),
|
|
16632
|
+
seedUrls: zod.z.array(zod.z.string()).optional(),
|
|
16633
|
+
maxPages: zod.z.number().int().positive().optional(),
|
|
16634
|
+
maxDepth: zod.z.number().int().nonnegative().optional()
|
|
16633
16635
|
}).optional()
|
|
16634
16636
|
}).optional(),
|
|
16635
16637
|
extract: zod.z.object({
|
|
@@ -16656,8 +16658,9 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16656
16658
|
pageSummaryChunk: zod.z.boolean().optional()
|
|
16657
16659
|
}).optional(),
|
|
16658
16660
|
embeddings: zod.z.object({
|
|
16659
|
-
provider: zod.z.literal("
|
|
16661
|
+
provider: zod.z.literal("jina").optional(),
|
|
16660
16662
|
model: zod.z.string().min(1).optional(),
|
|
16663
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16661
16664
|
apiKeyEnv: zod.z.string().min(1).optional(),
|
|
16662
16665
|
batchSize: zod.z.number().int().positive().optional(),
|
|
16663
16666
|
concurrency: zod.z.number().int().positive().optional(),
|
|
@@ -16666,18 +16669,17 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16666
16669
|
vector: zod.z.object({
|
|
16667
16670
|
dimension: zod.z.number().int().positive().optional(),
|
|
16668
16671
|
turso: zod.z.object({
|
|
16672
|
+
url: zod.z.string().url().optional(),
|
|
16673
|
+
authToken: zod.z.string().min(1).optional(),
|
|
16669
16674
|
urlEnv: zod.z.string().optional(),
|
|
16670
16675
|
authTokenEnv: zod.z.string().optional(),
|
|
16671
16676
|
localPath: zod.z.string().optional()
|
|
16672
16677
|
}).optional()
|
|
16673
16678
|
}).optional(),
|
|
16674
16679
|
rerank: zod.z.object({
|
|
16675
|
-
|
|
16680
|
+
enabled: zod.z.boolean().optional(),
|
|
16676
16681
|
topN: zod.z.number().int().positive().optional(),
|
|
16677
|
-
|
|
16678
|
-
apiKeyEnv: zod.z.string().optional(),
|
|
16679
|
-
model: zod.z.string().optional()
|
|
16680
|
-
}).optional()
|
|
16682
|
+
model: zod.z.string().optional()
|
|
16681
16683
|
}).optional(),
|
|
16682
16684
|
ranking: zod.z.object({
|
|
16683
16685
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
@@ -16686,6 +16688,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16686
16688
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16687
16689
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16688
16690
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16691
|
+
minScore: zod.z.number().min(0).max(1).optional(),
|
|
16689
16692
|
weights: zod.z.object({
|
|
16690
16693
|
incomingLinks: zod.z.number().optional(),
|
|
16691
16694
|
depth: zod.z.number().optional(),
|
|
@@ -16766,9 +16769,9 @@ function createDefaultConfig(projectId) {
|
|
|
16766
16769
|
pageSummaryChunk: true
|
|
16767
16770
|
},
|
|
16768
16771
|
embeddings: {
|
|
16769
|
-
provider: "
|
|
16770
|
-
model: "
|
|
16771
|
-
apiKeyEnv: "
|
|
16772
|
+
provider: "jina",
|
|
16773
|
+
model: "jina-embeddings-v3",
|
|
16774
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16772
16775
|
batchSize: 64,
|
|
16773
16776
|
concurrency: 4
|
|
16774
16777
|
},
|
|
@@ -16780,12 +16783,9 @@ function createDefaultConfig(projectId) {
|
|
|
16780
16783
|
}
|
|
16781
16784
|
},
|
|
16782
16785
|
rerank: {
|
|
16783
|
-
|
|
16786
|
+
enabled: false,
|
|
16784
16787
|
topN: 20,
|
|
16785
|
-
|
|
16786
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16787
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16788
|
-
}
|
|
16788
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16789
16789
|
},
|
|
16790
16790
|
ranking: {
|
|
16791
16791
|
enableIncomingLinkBoost: true,
|
|
@@ -16794,6 +16794,7 @@ function createDefaultConfig(projectId) {
|
|
|
16794
16794
|
aggregationCap: 5,
|
|
16795
16795
|
aggregationDecay: 0.5,
|
|
16796
16796
|
minChunkScoreRatio: 0.5,
|
|
16797
|
+
minScore: 0,
|
|
16797
16798
|
weights: {
|
|
16798
16799
|
incomingLinks: 0.05,
|
|
16799
16800
|
depth: 0.03,
|
|
@@ -16920,7 +16921,11 @@ ${issues}`
|
|
|
16920
16921
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16921
16922
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16922
16923
|
exclude: parsed.source.build.exclude ?? [],
|
|
16923
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16924
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16925
|
+
discover: parsed.source.build.discover ?? false,
|
|
16926
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16927
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16928
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16924
16929
|
} : void 0
|
|
16925
16930
|
},
|
|
16926
16931
|
extract: {
|
|
@@ -16949,11 +16954,7 @@ ${issues}`
|
|
|
16949
16954
|
},
|
|
16950
16955
|
rerank: {
|
|
16951
16956
|
...defaults.rerank,
|
|
16952
|
-
...parsed.rerank
|
|
16953
|
-
jina: {
|
|
16954
|
-
...defaults.rerank.jina,
|
|
16955
|
-
...parsed.rerank?.jina
|
|
16956
|
-
}
|
|
16957
|
+
...parsed.rerank
|
|
16957
16958
|
},
|
|
16958
16959
|
ranking: {
|
|
16959
16960
|
...defaults.ranking,
|
|
@@ -17000,7 +17001,11 @@ ${issues}`
|
|
|
17000
17001
|
outputDir: ".svelte-kit/output",
|
|
17001
17002
|
paramValues: {},
|
|
17002
17003
|
exclude: [],
|
|
17003
|
-
previewTimeout: 3e4
|
|
17004
|
+
previewTimeout: 3e4,
|
|
17005
|
+
discover: false,
|
|
17006
|
+
seedUrls: ["/"],
|
|
17007
|
+
maxPages: 200,
|
|
17008
|
+
maxDepth: 10
|
|
17004
17009
|
};
|
|
17005
17010
|
}
|
|
17006
17011
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17035,15 +17040,21 @@ async function loadConfig(options = {}) {
|
|
|
17035
17040
|
const raw = loaded.default ?? loaded;
|
|
17036
17041
|
return mergeConfig(cwd, raw);
|
|
17037
17042
|
}
|
|
17043
|
+
|
|
17044
|
+
// src/core/serverless.ts
|
|
17045
|
+
function isServerless() {
|
|
17046
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
17047
|
+
}
|
|
17038
17048
|
function sleep(ms) {
|
|
17039
17049
|
return new Promise((resolve) => {
|
|
17040
17050
|
setTimeout(resolve, ms);
|
|
17041
17051
|
});
|
|
17042
17052
|
}
|
|
17043
|
-
var
|
|
17044
|
-
|
|
17053
|
+
var JinaEmbeddingsProvider = class {
|
|
17054
|
+
apiKey;
|
|
17045
17055
|
batchSize;
|
|
17046
17056
|
concurrency;
|
|
17057
|
+
defaultTask;
|
|
17047
17058
|
constructor(options) {
|
|
17048
17059
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17049
17060
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17051,11 +17062,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17051
17062
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17052
17063
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17053
17064
|
}
|
|
17054
|
-
this.
|
|
17055
|
-
apiKey: options.apiKey
|
|
17056
|
-
});
|
|
17065
|
+
this.apiKey = options.apiKey;
|
|
17057
17066
|
this.batchSize = options.batchSize;
|
|
17058
17067
|
this.concurrency = options.concurrency;
|
|
17068
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17059
17069
|
}
|
|
17060
17070
|
estimateTokens(text) {
|
|
17061
17071
|
const normalized = text.trim();
|
|
@@ -17069,7 +17079,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17069
17079
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17070
17080
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17071
17081
|
}
|
|
17072
|
-
async embedTexts(texts, modelId) {
|
|
17082
|
+
async embedTexts(texts, modelId, task) {
|
|
17073
17083
|
if (texts.length === 0) {
|
|
17074
17084
|
return [];
|
|
17075
17085
|
}
|
|
@@ -17081,37 +17091,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17081
17091
|
});
|
|
17082
17092
|
}
|
|
17083
17093
|
const outputs = new Array(batches.length);
|
|
17084
|
-
const limit =
|
|
17094
|
+
const limit = pLimit2__default.default(this.concurrency);
|
|
17085
17095
|
await Promise.all(
|
|
17086
17096
|
batches.map(
|
|
17087
17097
|
(batch, position) => limit(async () => {
|
|
17088
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17098
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17089
17099
|
})
|
|
17090
17100
|
)
|
|
17091
17101
|
);
|
|
17092
17102
|
return outputs.flat();
|
|
17093
17103
|
}
|
|
17094
|
-
async embedWithRetry(texts, modelId) {
|
|
17104
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17095
17105
|
const maxAttempts = 5;
|
|
17096
17106
|
let attempt = 0;
|
|
17097
17107
|
while (attempt < maxAttempts) {
|
|
17098
17108
|
attempt += 1;
|
|
17109
|
+
let response;
|
|
17099
17110
|
try {
|
|
17100
|
-
|
|
17101
|
-
|
|
17102
|
-
|
|
17103
|
-
|
|
17111
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17112
|
+
method: "POST",
|
|
17113
|
+
headers: {
|
|
17114
|
+
"content-type": "application/json",
|
|
17115
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17116
|
+
},
|
|
17117
|
+
body: JSON.stringify({
|
|
17118
|
+
model: modelId,
|
|
17119
|
+
input: texts,
|
|
17120
|
+
task
|
|
17121
|
+
})
|
|
17104
17122
|
});
|
|
17105
|
-
return response.data.map((entry) => entry.embedding);
|
|
17106
17123
|
} catch (error) {
|
|
17107
|
-
|
|
17108
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17109
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17124
|
+
if (attempt >= maxAttempts) {
|
|
17110
17125
|
throw error;
|
|
17111
17126
|
}
|
|
17112
|
-
|
|
17113
|
-
|
|
17127
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17128
|
+
continue;
|
|
17129
|
+
}
|
|
17130
|
+
if (!response.ok) {
|
|
17131
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17132
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17133
|
+
const errorBody = await response.text();
|
|
17134
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17135
|
+
}
|
|
17136
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17137
|
+
continue;
|
|
17114
17138
|
}
|
|
17139
|
+
const payload = await response.json();
|
|
17140
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17141
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17142
|
+
}
|
|
17143
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17115
17144
|
}
|
|
17116
17145
|
throw new Error("Unreachable retry state");
|
|
17117
17146
|
}
|
|
@@ -17119,20 +17148,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17119
17148
|
|
|
17120
17149
|
// src/embeddings/factory.ts
|
|
17121
17150
|
function createEmbeddingsProvider(config) {
|
|
17122
|
-
if (config.embeddings.provider !== "
|
|
17151
|
+
if (config.embeddings.provider !== "jina") {
|
|
17123
17152
|
throw new SearchSocketError(
|
|
17124
17153
|
"CONFIG_MISSING",
|
|
17125
17154
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17126
17155
|
);
|
|
17127
17156
|
}
|
|
17128
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17157
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17129
17158
|
if (!apiKey) {
|
|
17130
17159
|
throw new SearchSocketError(
|
|
17131
17160
|
"CONFIG_MISSING",
|
|
17132
|
-
`Missing embeddings API key env var
|
|
17161
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17133
17162
|
);
|
|
17134
17163
|
}
|
|
17135
|
-
return new
|
|
17164
|
+
return new JinaEmbeddingsProvider({
|
|
17136
17165
|
apiKey,
|
|
17137
17166
|
batchSize: config.embeddings.batchSize,
|
|
17138
17167
|
concurrency: config.embeddings.concurrency
|
|
@@ -17295,20 +17324,17 @@ var JinaReranker = class {
|
|
|
17295
17324
|
|
|
17296
17325
|
// src/rerank/factory.ts
|
|
17297
17326
|
function createReranker(config) {
|
|
17298
|
-
if (config.rerank.
|
|
17327
|
+
if (!config.rerank.enabled) {
|
|
17299
17328
|
return null;
|
|
17300
17329
|
}
|
|
17301
|
-
|
|
17302
|
-
|
|
17303
|
-
|
|
17304
|
-
return null;
|
|
17305
|
-
}
|
|
17306
|
-
return new JinaReranker({
|
|
17307
|
-
apiKey,
|
|
17308
|
-
model: config.rerank.jina.model
|
|
17309
|
-
});
|
|
17330
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17331
|
+
if (!apiKey) {
|
|
17332
|
+
return null;
|
|
17310
17333
|
}
|
|
17311
|
-
return
|
|
17334
|
+
return new JinaReranker({
|
|
17335
|
+
apiKey,
|
|
17336
|
+
model: config.rerank.model
|
|
17337
|
+
});
|
|
17312
17338
|
}
|
|
17313
17339
|
|
|
17314
17340
|
// src/utils/time.ts
|
|
@@ -17413,6 +17439,16 @@ var TursoVectorStore = class {
|
|
|
17413
17439
|
}
|
|
17414
17440
|
async ensureChunks(dim) {
|
|
17415
17441
|
if (this.chunksReady) return;
|
|
17442
|
+
const exists = await this.chunksTableExists();
|
|
17443
|
+
if (exists) {
|
|
17444
|
+
const currentDim = await this.getChunksDimension();
|
|
17445
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17446
|
+
await this.client.batch([
|
|
17447
|
+
"DROP INDEX IF EXISTS idx",
|
|
17448
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17449
|
+
]);
|
|
17450
|
+
}
|
|
17451
|
+
}
|
|
17416
17452
|
await this.client.batch([
|
|
17417
17453
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17418
17454
|
id TEXT PRIMARY KEY,
|
|
@@ -17424,6 +17460,8 @@ var TursoVectorStore = class {
|
|
|
17424
17460
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17425
17461
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17426
17462
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17463
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17464
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17427
17465
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17428
17466
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17429
17467
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
@@ -17434,6 +17472,19 @@ var TursoVectorStore = class {
|
|
|
17434
17472
|
)`,
|
|
17435
17473
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17436
17474
|
]);
|
|
17475
|
+
const chunkMigrationCols = [
|
|
17476
|
+
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17477
|
+
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17478
|
+
];
|
|
17479
|
+
for (const col of chunkMigrationCols) {
|
|
17480
|
+
try {
|
|
17481
|
+
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17482
|
+
} catch (error) {
|
|
17483
|
+
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17484
|
+
throw error;
|
|
17485
|
+
}
|
|
17486
|
+
}
|
|
17487
|
+
}
|
|
17437
17488
|
this.chunksReady = true;
|
|
17438
17489
|
}
|
|
17439
17490
|
async ensurePages() {
|
|
@@ -17468,6 +17519,38 @@ var TursoVectorStore = class {
|
|
|
17468
17519
|
throw error;
|
|
17469
17520
|
}
|
|
17470
17521
|
}
|
|
17522
|
+
/**
|
|
17523
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17524
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17525
|
+
*/
|
|
17526
|
+
async getChunksDimension() {
|
|
17527
|
+
try {
|
|
17528
|
+
const rs = await this.client.execute(
|
|
17529
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17530
|
+
);
|
|
17531
|
+
if (rs.rows.length === 0) return null;
|
|
17532
|
+
const sql = rs.rows[0].sql;
|
|
17533
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17534
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17535
|
+
} catch {
|
|
17536
|
+
return null;
|
|
17537
|
+
}
|
|
17538
|
+
}
|
|
17539
|
+
/**
|
|
17540
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17541
|
+
* Used by `clean --remote` for a full reset.
|
|
17542
|
+
*/
|
|
17543
|
+
async dropAllTables() {
|
|
17544
|
+
await this.client.batch([
|
|
17545
|
+
"DROP INDEX IF EXISTS idx",
|
|
17546
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17547
|
+
"DROP TABLE IF EXISTS registry",
|
|
17548
|
+
"DROP TABLE IF EXISTS pages"
|
|
17549
|
+
]);
|
|
17550
|
+
this.chunksReady = false;
|
|
17551
|
+
this.registryReady = false;
|
|
17552
|
+
this.pagesReady = false;
|
|
17553
|
+
}
|
|
17471
17554
|
async upsert(records, _scope) {
|
|
17472
17555
|
if (records.length === 0) return;
|
|
17473
17556
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17478,9 +17561,9 @@ var TursoVectorStore = class {
|
|
|
17478
17561
|
const stmts = batch.map((r) => ({
|
|
17479
17562
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17480
17563
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17481
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17564
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17482
17565
|
incoming_links, route_file, tags, embedding)
|
|
17483
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17566
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17484
17567
|
args: [
|
|
17485
17568
|
r.id,
|
|
17486
17569
|
r.metadata.projectId,
|
|
@@ -17491,6 +17574,8 @@ var TursoVectorStore = class {
|
|
|
17491
17574
|
r.metadata.sectionTitle,
|
|
17492
17575
|
JSON.stringify(r.metadata.headingPath),
|
|
17493
17576
|
r.metadata.snippet,
|
|
17577
|
+
r.metadata.chunkText,
|
|
17578
|
+
r.metadata.ordinal,
|
|
17494
17579
|
r.metadata.contentHash,
|
|
17495
17580
|
r.metadata.modelId,
|
|
17496
17581
|
r.metadata.depth,
|
|
@@ -17509,7 +17594,8 @@ var TursoVectorStore = class {
|
|
|
17509
17594
|
const queryJson = JSON.stringify(queryVector);
|
|
17510
17595
|
const rs = await this.client.execute({
|
|
17511
17596
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17512
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17597
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17598
|
+
c.ordinal, c.content_hash,
|
|
17513
17599
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17514
17600
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17515
17601
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
@@ -17553,6 +17639,8 @@ var TursoVectorStore = class {
|
|
|
17553
17639
|
sectionTitle: row.section_title,
|
|
17554
17640
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17555
17641
|
snippet: row.snippet,
|
|
17642
|
+
chunkText: row.chunk_text || "",
|
|
17643
|
+
ordinal: row.ordinal || 0,
|
|
17556
17644
|
contentHash: row.content_hash,
|
|
17557
17645
|
modelId: row.model_id,
|
|
17558
17646
|
depth: row.depth,
|
|
@@ -17748,10 +17836,10 @@ var TursoVectorStore = class {
|
|
|
17748
17836
|
// src/vector/factory.ts
|
|
17749
17837
|
async function createVectorStore(config, cwd) {
|
|
17750
17838
|
const turso = config.vector.turso;
|
|
17751
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17839
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17752
17840
|
if (remoteUrl) {
|
|
17753
17841
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17754
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17842
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17755
17843
|
const client2 = createClient2({
|
|
17756
17844
|
url: remoteUrl,
|
|
17757
17845
|
authToken
|
|
@@ -17761,6 +17849,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17761
17849
|
dimension: config.vector.dimension
|
|
17762
17850
|
});
|
|
17763
17851
|
}
|
|
17852
|
+
if (isServerless()) {
|
|
17853
|
+
throw new SearchSocketError(
|
|
17854
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17855
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17856
|
+
);
|
|
17857
|
+
}
|
|
17764
17858
|
const { createClient } = await import('@libsql/client');
|
|
17765
17859
|
const localPath = path__default.default.resolve(cwd, turso.localPath);
|
|
17766
17860
|
fs__default.default.mkdirSync(path__default.default.dirname(localPath), { recursive: true });
|
|
@@ -17918,7 +18012,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
17918
18012
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17919
18013
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
17920
18014
|
const embedStart = process.hrtime.bigint();
|
|
17921
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
18015
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
17922
18016
|
const queryVector = queryEmbeddings[0];
|
|
17923
18017
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
17924
18018
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -17946,13 +18040,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
17946
18040
|
usedRerank = true;
|
|
17947
18041
|
}
|
|
17948
18042
|
let results;
|
|
18043
|
+
const minScore = this.config.ranking.minScore;
|
|
17949
18044
|
if (groupByPage) {
|
|
17950
|
-
|
|
18045
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
18046
|
+
if (minScore > 0) {
|
|
18047
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18048
|
+
}
|
|
17951
18049
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
17952
18050
|
results = pages.slice(0, topK).map((page) => {
|
|
17953
18051
|
const bestScore = page.bestChunk.finalScore;
|
|
17954
|
-
const
|
|
17955
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18052
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18053
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
17956
18054
|
return {
|
|
17957
18055
|
url: page.url,
|
|
17958
18056
|
title: page.title,
|
|
@@ -17969,6 +18067,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
17969
18067
|
};
|
|
17970
18068
|
});
|
|
17971
18069
|
} else {
|
|
18070
|
+
if (minScore > 0) {
|
|
18071
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18072
|
+
}
|
|
17972
18073
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
17973
18074
|
url: hit.metadata.url,
|
|
17974
18075
|
title: hit.metadata.title,
|
|
@@ -18040,43 +18141,54 @@ var SearchEngine = class _SearchEngine {
|
|
|
18040
18141
|
}
|
|
18041
18142
|
}
|
|
18042
18143
|
async rerankHits(query, ranked, topK) {
|
|
18043
|
-
if (this.config.rerank.
|
|
18144
|
+
if (!this.config.rerank.enabled) {
|
|
18044
18145
|
throw new SearchSocketError(
|
|
18045
18146
|
"INVALID_REQUEST",
|
|
18046
|
-
"rerank=true requested but rerank.
|
|
18147
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
18047
18148
|
400
|
|
18048
18149
|
);
|
|
18049
18150
|
}
|
|
18050
18151
|
if (!this.reranker) {
|
|
18051
18152
|
throw new SearchSocketError(
|
|
18052
18153
|
"CONFIG_MISSING",
|
|
18053
|
-
`rerank=true requested but ${this.config.
|
|
18154
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
18054
18155
|
400
|
|
18055
18156
|
);
|
|
18056
18157
|
}
|
|
18057
|
-
const
|
|
18058
|
-
|
|
18059
|
-
|
|
18060
|
-
|
|
18158
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
18159
|
+
for (const entry of ranked) {
|
|
18160
|
+
const url = entry.hit.metadata.url;
|
|
18161
|
+
const group = pageGroups.get(url);
|
|
18162
|
+
if (group) group.push(entry);
|
|
18163
|
+
else pageGroups.set(url, [entry]);
|
|
18164
|
+
}
|
|
18165
|
+
const pageCandidates = [];
|
|
18166
|
+
for (const [url, chunks] of pageGroups) {
|
|
18167
|
+
const sorted = [...chunks].sort(
|
|
18168
|
+
(a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
|
|
18169
|
+
);
|
|
18170
|
+
const title = sorted[0].hit.metadata.title;
|
|
18171
|
+
const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18172
|
+
pageCandidates.push({ id: url, text: `${title}
|
|
18173
|
+
|
|
18174
|
+
${body}` });
|
|
18175
|
+
}
|
|
18061
18176
|
const reranked = await this.reranker.rerank(
|
|
18062
18177
|
query,
|
|
18063
|
-
|
|
18178
|
+
pageCandidates,
|
|
18064
18179
|
Math.max(topK, this.config.rerank.topN)
|
|
18065
18180
|
);
|
|
18066
|
-
const
|
|
18181
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18067
18182
|
return ranked.map((entry) => {
|
|
18068
|
-
const
|
|
18069
|
-
const
|
|
18070
|
-
if (
|
|
18071
|
-
return {
|
|
18072
|
-
...entry,
|
|
18073
|
-
finalScore: safeBaseScore
|
|
18074
|
-
};
|
|
18183
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
18184
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
18185
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
18186
|
+
return { ...entry, finalScore: base };
|
|
18075
18187
|
}
|
|
18076
|
-
const
|
|
18188
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
18077
18189
|
return {
|
|
18078
18190
|
...entry,
|
|
18079
|
-
finalScore: Number.isFinite(
|
|
18191
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
18080
18192
|
};
|
|
18081
18193
|
}).sort((a, b) => {
|
|
18082
18194
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -18116,13 +18228,21 @@ function searchsocketHandle(options = {}) {
|
|
|
18116
18228
|
let rateLimiter = null;
|
|
18117
18229
|
const getConfig = async () => {
|
|
18118
18230
|
if (!configPromise) {
|
|
18119
|
-
|
|
18120
|
-
|
|
18121
|
-
|
|
18122
|
-
})
|
|
18231
|
+
let configP;
|
|
18232
|
+
if (options.config) {
|
|
18233
|
+
configP = Promise.resolve(options.config);
|
|
18234
|
+
} else if (options.rawConfig) {
|
|
18235
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18236
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
18237
|
+
} else {
|
|
18238
|
+
configP = loadConfig({
|
|
18239
|
+
cwd: options.cwd,
|
|
18240
|
+
configPath: options.configPath
|
|
18241
|
+
});
|
|
18242
|
+
}
|
|
18123
18243
|
configPromise = configP.then((config) => {
|
|
18124
18244
|
apiPath = apiPath ?? config.api.path;
|
|
18125
|
-
if (config.api.rateLimit) {
|
|
18245
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
18126
18246
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
18127
18247
|
}
|
|
18128
18248
|
return config;
|
|
@@ -18132,10 +18252,9 @@ function searchsocketHandle(options = {}) {
|
|
|
18132
18252
|
};
|
|
18133
18253
|
const getEngine = async () => {
|
|
18134
18254
|
if (!enginePromise) {
|
|
18135
|
-
const config =
|
|
18255
|
+
const config = await getConfig();
|
|
18136
18256
|
enginePromise = SearchEngine.create({
|
|
18137
18257
|
cwd: options.cwd,
|
|
18138
|
-
configPath: options.configPath,
|
|
18139
18258
|
config
|
|
18140
18259
|
});
|
|
18141
18260
|
}
|
|
@@ -19670,14 +19789,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19670
19789
|
var Logger = class {
|
|
19671
19790
|
json;
|
|
19672
19791
|
verbose;
|
|
19792
|
+
quiet;
|
|
19673
19793
|
stderrOnly;
|
|
19674
19794
|
constructor(opts = {}) {
|
|
19675
19795
|
this.json = opts.json ?? false;
|
|
19676
19796
|
this.verbose = opts.verbose ?? false;
|
|
19797
|
+
this.quiet = opts.quiet ?? false;
|
|
19677
19798
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19678
19799
|
}
|
|
19679
19800
|
info(message) {
|
|
19680
|
-
if (this.json) {
|
|
19801
|
+
if (this.quiet || this.json) {
|
|
19681
19802
|
return;
|
|
19682
19803
|
}
|
|
19683
19804
|
this.writeOut(`${message}
|
|
@@ -19691,7 +19812,7 @@ var Logger = class {
|
|
|
19691
19812
|
this.logJson("debug", { message });
|
|
19692
19813
|
return;
|
|
19693
19814
|
}
|
|
19694
|
-
this.writeOut(
|
|
19815
|
+
this.writeOut(` ${message}
|
|
19695
19816
|
`);
|
|
19696
19817
|
}
|
|
19697
19818
|
warn(message) {
|
|
@@ -19718,7 +19839,7 @@ var Logger = class {
|
|
|
19718
19839
|
this.logJson(event, data);
|
|
19719
19840
|
return;
|
|
19720
19841
|
}
|
|
19721
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19842
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19722
19843
|
`);
|
|
19723
19844
|
}
|
|
19724
19845
|
writeOut(text) {
|
|
@@ -19903,11 +20024,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19903
20024
|
|
|
19904
20025
|
// src/indexing/sources/build/index.ts
|
|
19905
20026
|
var logger = new Logger();
|
|
20027
|
+
/**
 * Collect the same-origin link paths referenced by the anchors of an HTML page.
 *
 * @param {string} html - Raw HTML markup of the page.
 * @param {string} pageUrl - Path of the page; appended to `baseOrigin` to form the
 *   base against which relative hrefs are resolved.
 * @param {string} baseOrigin - Origin that a resolved link must match to be kept.
 * @returns {string[]} De-duplicated paths (passed through `normalizeUrlPath`),
 *   in order of first appearance.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
  // Hrefs that can never point at a crawlable page are dropped before URL parsing.
  const ignoredPrefixes = ["#", "mailto:", "tel:", "javascript:"];
  const resolutionBase = `${baseOrigin}${pageUrl}`;
  const $ = cheerio.load(html);
  // A Set keeps first-seen insertion order while discarding repeats.
  const uniquePaths = /* @__PURE__ */ new Set();
  for (const anchor of $("a[href]").toArray()) {
    const href = $(anchor).attr("href");
    if (!href) continue;
    if (ignoredPrefixes.some((prefix) => href.startsWith(prefix))) continue;
    try {
      const target = new URL(href, resolutionBase);
      const isSameOrigin = target.origin === baseOrigin;
      const isWebProtocol = target.protocol === "http:" || target.protocol === "https:";
      if (isSameOrigin && isWebProtocol) {
        uniquePaths.add(normalizeUrlPath(target.pathname));
      }
    } catch {
      // Best effort: an href that cannot be resolved is simply not a crawl candidate.
    }
  }
  return [...uniquePaths];
}
|
|
20045
|
+
/**
 * Crawl the preview server starting from the configured seed URLs and collect
 * the HTML pages that were fetched successfully.
 *
 * URLs are taken from a FIFO queue in batches no larger than the remaining page
 * budget and fetched with at most 8 requests in flight. Links found on a page
 * are queued only while the page's depth is below `maxDepth`, and only when
 * they have not been seen before and are not excluded.
 *
 * @param {{ baseUrl: string }} server - Running preview server to fetch from.
 * @param {object} buildConfig - Build source config; reads `seedUrls`,
 *   `maxDepth`, `exclude` and `maxPages`.
 * @param {number} [pipelineMaxPages] - Optional additional cap; floored,
 *   clamped to >= 0 and combined with `buildConfig.maxPages` via `Math.min`.
 * @returns {Promise<Array<{url: string, html: string, sourcePath: string, outgoingLinks: string[]}>>}
 *   Pages that responded OK with a `text/html` content type.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;
  // NOTE(review): assumes buildConfig.maxPages is always a number (e.g. defaulted
  // by config validation); a non-numeric value would make the loop below a no-op.
  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];
  const visited = /* @__PURE__ */ new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2__default.default(8);
  for (const seed of seedUrls) {
    const normalized = normalizeUrlPath(seed);
    if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
      visited.add(normalized);
      queue.push({ url: normalized, depth: 0 });
    }
  }
  while (queue.length > 0 && pages.length < effectiveMax) {
    // Never fetch more URLs than there is page budget left.
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            const links = extractLinksFromHtml(html, item.url, baseOrigin);
            for (const link of links) {
              if (!visited.has(link) && !isExcluded(link, exclude)) {
                visited.add(link);
                queue.push({ url: link, depth: item.depth + 1 });
              }
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            // Links extracted above only feed the crawl queue; they are not recorded here.
            outgoingLinks: []
          };
        })
      )
    );
    // allSettled preserves input order, so results[i] belongs to batch[i].
    for (const [index, result] of results.entries()) {
      if (result.status === "rejected") {
        // Surface fetch/read failures instead of dropping them silently,
        // mirroring the warning already emitted for non-OK responses.
        const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
        logger.warn(`Skipping ${batch[index].url}: ${reason}`);
        continue;
      }
      if (result.value) {
        pages.push(result.value);
      }
    }
  }
  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
|
|
19906
20116
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19907
20117
|
const buildConfig = config.source.build;
|
|
19908
20118
|
if (!buildConfig) {
|
|
19909
20119
|
throw new Error("build source config is missing");
|
|
19910
20120
|
}
|
|
20121
|
+
if (buildConfig.discover) {
|
|
20122
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
20123
|
+
try {
|
|
20124
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
20125
|
+
} finally {
|
|
20126
|
+
await server2.shutdown();
|
|
20127
|
+
}
|
|
20128
|
+
}
|
|
19911
20129
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19912
20130
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19913
20131
|
logger.event("build_routes_discovered", {
|
|
@@ -19918,7 +20136,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19918
20136
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19919
20137
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19920
20138
|
try {
|
|
19921
|
-
const concurrencyLimit =
|
|
20139
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
19922
20140
|
const results = await Promise.allSettled(
|
|
19923
20141
|
selected.map(
|
|
19924
20142
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20087,7 +20305,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
20087
20305
|
const routes = await resolveRoutes(config);
|
|
20088
20306
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
20089
20307
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
20090
|
-
const concurrencyLimit =
|
|
20308
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
20091
20309
|
const results = await Promise.allSettled(
|
|
20092
20310
|
selected.map(
|
|
20093
20311
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20141,9 +20359,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20141
20359
|
|
|
20142
20360
|
// src/indexing/pipeline.ts
|
|
20143
20361
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20144
|
-
"
|
|
20145
|
-
"text-embedding-3-large": 13e-5,
|
|
20146
|
-
"text-embedding-ada-002": 1e-4
|
|
20362
|
+
"jina-embeddings-v3": 2e-5
|
|
20147
20363
|
};
|
|
20148
20364
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
20149
20365
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -20189,9 +20405,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20189
20405
|
};
|
|
20190
20406
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20191
20407
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
20408
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20409
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
20192
20410
|
if (options.force) {
|
|
20411
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20193
20412
|
await cleanMirrorForScope(statePath, scope);
|
|
20194
20413
|
}
|
|
20414
|
+
if (options.dryRun) {
|
|
20415
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
20416
|
+
}
|
|
20195
20417
|
const manifestStart = stageStart();
|
|
20196
20418
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
20197
20419
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -20202,8 +20424,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20202
20424
|
);
|
|
20203
20425
|
}
|
|
20204
20426
|
stageEnd("manifest", manifestStart);
|
|
20427
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
20205
20428
|
const sourceStart = stageStart();
|
|
20206
|
-
|
|
20429
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20207
20430
|
let sourcePages;
|
|
20208
20431
|
if (sourceMode === "static-output") {
|
|
20209
20432
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -20215,10 +20438,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20215
20438
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
20216
20439
|
}
|
|
20217
20440
|
stageEnd("source", sourceStart);
|
|
20441
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20218
20442
|
const routeStart = stageStart();
|
|
20219
20443
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20220
20444
|
stageEnd("route_map", routeStart);
|
|
20445
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
20221
20446
|
const extractStart = stageStart();
|
|
20447
|
+
this.logger.info("Extracting content...");
|
|
20222
20448
|
const extractedPages = [];
|
|
20223
20449
|
for (const sourcePage of sourcePages) {
|
|
20224
20450
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -20247,6 +20473,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20247
20473
|
uniquePages.push(page);
|
|
20248
20474
|
}
|
|
20249
20475
|
stageEnd("extract", extractStart);
|
|
20476
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
20477
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20250
20478
|
const linkStart = stageStart();
|
|
20251
20479
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
20252
20480
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -20262,7 +20490,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20262
20490
|
}
|
|
20263
20491
|
}
|
|
20264
20492
|
stageEnd("links", linkStart);
|
|
20493
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
20265
20494
|
const mirrorStart = stageStart();
|
|
20495
|
+
this.logger.info("Writing mirror pages...");
|
|
20266
20496
|
const mirrorPages = [];
|
|
20267
20497
|
let routeExact = 0;
|
|
20268
20498
|
let routeBestEffort = 0;
|
|
@@ -20332,7 +20562,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20332
20562
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
20333
20563
|
}
|
|
20334
20564
|
stageEnd("mirror", mirrorStart);
|
|
20565
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
20335
20566
|
const chunkStart = stageStart();
|
|
20567
|
+
this.logger.info("Chunking pages...");
|
|
20336
20568
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
20337
20569
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
20338
20570
|
if (typeof maxChunks === "number") {
|
|
@@ -20345,6 +20577,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20345
20577
|
});
|
|
20346
20578
|
}
|
|
20347
20579
|
stageEnd("chunk", chunkStart);
|
|
20580
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
20348
20581
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
20349
20582
|
for (const chunk of chunks) {
|
|
20350
20583
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -20363,6 +20596,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20363
20596
|
return existingHash !== chunk.contentHash;
|
|
20364
20597
|
});
|
|
20365
20598
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20599
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20366
20600
|
const embedStart = stageStart();
|
|
20367
20601
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
20368
20602
|
for (const chunk of changedChunks) {
|
|
@@ -20377,9 +20611,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20377
20611
|
let newEmbeddings = 0;
|
|
20378
20612
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
20379
20613
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20614
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
20380
20615
|
const embeddings = await this.embeddings.embedTexts(
|
|
20381
20616
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
20382
|
-
this.config.embeddings.model
|
|
20617
|
+
this.config.embeddings.model,
|
|
20618
|
+
"retrieval.passage"
|
|
20383
20619
|
);
|
|
20384
20620
|
if (embeddings.length !== changedChunks.length) {
|
|
20385
20621
|
throw new SearchSocketError(
|
|
@@ -20402,8 +20638,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20402
20638
|
}
|
|
20403
20639
|
}
|
|
20404
20640
|
stageEnd("embedding", embedStart);
|
|
20641
|
+
if (changedChunks.length > 0) {
|
|
20642
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20643
|
+
} else {
|
|
20644
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20645
|
+
}
|
|
20405
20646
|
const syncStart = stageStart();
|
|
20406
20647
|
if (!options.dryRun) {
|
|
20648
|
+
this.logger.info("Syncing vectors...");
|
|
20407
20649
|
const upserts = [];
|
|
20408
20650
|
for (const chunk of changedChunks) {
|
|
20409
20651
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -20422,6 +20664,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20422
20664
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
20423
20665
|
headingPath: chunk.headingPath,
|
|
20424
20666
|
snippet: chunk.snippet,
|
|
20667
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20668
|
+
ordinal: chunk.ordinal,
|
|
20425
20669
|
contentHash: chunk.contentHash,
|
|
20426
20670
|
modelId: this.config.embeddings.model,
|
|
20427
20671
|
depth: chunk.depth,
|
|
@@ -20441,6 +20685,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20441
20685
|
}
|
|
20442
20686
|
}
|
|
20443
20687
|
stageEnd("sync", syncStart);
|
|
20688
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
20444
20689
|
const finalizeStart = stageStart();
|
|
20445
20690
|
if (!options.dryRun) {
|
|
20446
20691
|
const scopeInfo = {
|
|
@@ -20460,6 +20705,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20460
20705
|
});
|
|
20461
20706
|
}
|
|
20462
20707
|
stageEnd("finalize", finalizeStart);
|
|
20708
|
+
this.logger.info("Done.");
|
|
20463
20709
|
return {
|
|
20464
20710
|
pagesProcessed: mirrorPages.length,
|
|
20465
20711
|
chunksTotal: chunks.length,
|