searchsocket 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +370 -115
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +391 -109
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +389 -108
- package/dist/sveltekit.cjs +374 -109
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +373 -107
- package/dist/{types-D1K46vwd.d.cts → types-BrG6XTUU.d.cts} +29 -13
- package/dist/{types-D1K46vwd.d.ts → types-BrG6XTUU.d.ts} +29 -13
- package/package.json +1 -2
package/dist/index.js
CHANGED
|
@@ -3,8 +3,7 @@ import path from 'path';
|
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import
|
|
7
|
-
import pLimit from 'p-limit';
|
|
6
|
+
import pLimit2 from 'p-limit';
|
|
8
7
|
import { createHash } from 'crypto';
|
|
9
8
|
import { load } from 'cheerio';
|
|
10
9
|
import matter from 'gray-matter';
|
|
@@ -16620,7 +16619,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
16620
16619
|
outputDir: z.string().min(1).optional(),
|
|
16621
16620
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
16622
16621
|
exclude: z.array(z.string()).optional(),
|
|
16623
|
-
previewTimeout: z.number().int().positive().optional()
|
|
16622
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
16623
|
+
discover: z.boolean().optional(),
|
|
16624
|
+
seedUrls: z.array(z.string()).optional(),
|
|
16625
|
+
maxPages: z.number().int().positive().optional(),
|
|
16626
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
16624
16627
|
}).optional()
|
|
16625
16628
|
}).optional(),
|
|
16626
16629
|
extract: z.object({
|
|
@@ -16647,8 +16650,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
16647
16650
|
pageSummaryChunk: z.boolean().optional()
|
|
16648
16651
|
}).optional(),
|
|
16649
16652
|
embeddings: z.object({
|
|
16650
|
-
provider: z.literal("
|
|
16653
|
+
provider: z.literal("jina").optional(),
|
|
16651
16654
|
model: z.string().min(1).optional(),
|
|
16655
|
+
apiKey: z.string().min(1).optional(),
|
|
16652
16656
|
apiKeyEnv: z.string().min(1).optional(),
|
|
16653
16657
|
batchSize: z.number().int().positive().optional(),
|
|
16654
16658
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -16657,18 +16661,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
16657
16661
|
vector: z.object({
|
|
16658
16662
|
dimension: z.number().int().positive().optional(),
|
|
16659
16663
|
turso: z.object({
|
|
16664
|
+
url: z.string().url().optional(),
|
|
16665
|
+
authToken: z.string().min(1).optional(),
|
|
16660
16666
|
urlEnv: z.string().optional(),
|
|
16661
16667
|
authTokenEnv: z.string().optional(),
|
|
16662
16668
|
localPath: z.string().optional()
|
|
16663
16669
|
}).optional()
|
|
16664
16670
|
}).optional(),
|
|
16665
16671
|
rerank: z.object({
|
|
16666
|
-
|
|
16672
|
+
enabled: z.boolean().optional(),
|
|
16667
16673
|
topN: z.number().int().positive().optional(),
|
|
16668
|
-
|
|
16669
|
-
apiKeyEnv: z.string().optional(),
|
|
16670
|
-
model: z.string().optional()
|
|
16671
|
-
}).optional()
|
|
16674
|
+
model: z.string().optional()
|
|
16672
16675
|
}).optional(),
|
|
16673
16676
|
ranking: z.object({
|
|
16674
16677
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -16677,6 +16680,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16677
16680
|
aggregationCap: z.number().int().positive().optional(),
|
|
16678
16681
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16679
16682
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16683
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
16680
16684
|
weights: z.object({
|
|
16681
16685
|
incomingLinks: z.number().optional(),
|
|
16682
16686
|
depth: z.number().optional(),
|
|
@@ -16757,9 +16761,9 @@ function createDefaultConfig(projectId) {
|
|
|
16757
16761
|
pageSummaryChunk: true
|
|
16758
16762
|
},
|
|
16759
16763
|
embeddings: {
|
|
16760
|
-
provider: "
|
|
16761
|
-
model: "
|
|
16762
|
-
apiKeyEnv: "
|
|
16764
|
+
provider: "jina",
|
|
16765
|
+
model: "jina-embeddings-v3",
|
|
16766
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16763
16767
|
batchSize: 64,
|
|
16764
16768
|
concurrency: 4
|
|
16765
16769
|
},
|
|
@@ -16771,12 +16775,9 @@ function createDefaultConfig(projectId) {
|
|
|
16771
16775
|
}
|
|
16772
16776
|
},
|
|
16773
16777
|
rerank: {
|
|
16774
|
-
|
|
16778
|
+
enabled: false,
|
|
16775
16779
|
topN: 20,
|
|
16776
|
-
|
|
16777
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16778
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16779
|
-
}
|
|
16780
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16780
16781
|
},
|
|
16781
16782
|
ranking: {
|
|
16782
16783
|
enableIncomingLinkBoost: true,
|
|
@@ -16785,6 +16786,7 @@ function createDefaultConfig(projectId) {
|
|
|
16785
16786
|
aggregationCap: 5,
|
|
16786
16787
|
aggregationDecay: 0.5,
|
|
16787
16788
|
minChunkScoreRatio: 0.5,
|
|
16789
|
+
minScore: 0,
|
|
16788
16790
|
weights: {
|
|
16789
16791
|
incomingLinks: 0.05,
|
|
16790
16792
|
depth: 0.03,
|
|
@@ -16911,7 +16913,11 @@ ${issues}`
|
|
|
16911
16913
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16912
16914
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16913
16915
|
exclude: parsed.source.build.exclude ?? [],
|
|
16914
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16916
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16917
|
+
discover: parsed.source.build.discover ?? false,
|
|
16918
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16919
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16920
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16915
16921
|
} : void 0
|
|
16916
16922
|
},
|
|
16917
16923
|
extract: {
|
|
@@ -16940,11 +16946,7 @@ ${issues}`
|
|
|
16940
16946
|
},
|
|
16941
16947
|
rerank: {
|
|
16942
16948
|
...defaults.rerank,
|
|
16943
|
-
...parsed.rerank
|
|
16944
|
-
jina: {
|
|
16945
|
-
...defaults.rerank.jina,
|
|
16946
|
-
...parsed.rerank?.jina
|
|
16947
|
-
}
|
|
16949
|
+
...parsed.rerank
|
|
16948
16950
|
},
|
|
16949
16951
|
ranking: {
|
|
16950
16952
|
...defaults.ranking,
|
|
@@ -16991,7 +16993,11 @@ ${issues}`
|
|
|
16991
16993
|
outputDir: ".svelte-kit/output",
|
|
16992
16994
|
paramValues: {},
|
|
16993
16995
|
exclude: [],
|
|
16994
|
-
previewTimeout: 3e4
|
|
16996
|
+
previewTimeout: 3e4,
|
|
16997
|
+
discover: false,
|
|
16998
|
+
seedUrls: ["/"],
|
|
16999
|
+
maxPages: 200,
|
|
17000
|
+
maxDepth: 10
|
|
16995
17001
|
};
|
|
16996
17002
|
}
|
|
16997
17003
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17005,6 +17011,21 @@ ${issues}`
|
|
|
17005
17011
|
}
|
|
17006
17012
|
return merged;
|
|
17007
17013
|
}
|
|
17014
|
+
function mergeConfigServerless(rawConfig) {
|
|
17015
|
+
if (!rawConfig.project?.id) {
|
|
17016
|
+
throw new SearchSocketError(
|
|
17017
|
+
"CONFIG_MISSING",
|
|
17018
|
+
"`project.id` is required for serverless config (cannot infer from package.json)."
|
|
17019
|
+
);
|
|
17020
|
+
}
|
|
17021
|
+
if (!rawConfig.source?.mode) {
|
|
17022
|
+
throw new SearchSocketError(
|
|
17023
|
+
"CONFIG_MISSING",
|
|
17024
|
+
"`source.mode` is required for serverless config (cannot auto-detect from filesystem)."
|
|
17025
|
+
);
|
|
17026
|
+
}
|
|
17027
|
+
return mergeConfig(process.cwd(), rawConfig);
|
|
17028
|
+
}
|
|
17008
17029
|
async function loadConfig(options = {}) {
|
|
17009
17030
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
17010
17031
|
const configPath = path.resolve(cwd, options.configPath ?? "searchsocket.config.ts");
|
|
@@ -17027,6 +17048,11 @@ async function loadConfig(options = {}) {
|
|
|
17027
17048
|
return mergeConfig(cwd, raw);
|
|
17028
17049
|
}
|
|
17029
17050
|
|
|
17051
|
+
// src/core/serverless.ts
|
|
17052
|
+
function isServerless() {
|
|
17053
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
17054
|
+
}
|
|
17055
|
+
|
|
17030
17056
|
// src/utils/text.ts
|
|
17031
17057
|
function normalizeText(input) {
|
|
17032
17058
|
return input.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim();
|
|
@@ -17104,10 +17130,11 @@ function sleep(ms) {
|
|
|
17104
17130
|
setTimeout(resolve, ms);
|
|
17105
17131
|
});
|
|
17106
17132
|
}
|
|
17107
|
-
var
|
|
17108
|
-
|
|
17133
|
+
var JinaEmbeddingsProvider = class {
|
|
17134
|
+
apiKey;
|
|
17109
17135
|
batchSize;
|
|
17110
17136
|
concurrency;
|
|
17137
|
+
defaultTask;
|
|
17111
17138
|
constructor(options) {
|
|
17112
17139
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17113
17140
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17115,11 +17142,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17115
17142
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17116
17143
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17117
17144
|
}
|
|
17118
|
-
this.
|
|
17119
|
-
apiKey: options.apiKey
|
|
17120
|
-
});
|
|
17145
|
+
this.apiKey = options.apiKey;
|
|
17121
17146
|
this.batchSize = options.batchSize;
|
|
17122
17147
|
this.concurrency = options.concurrency;
|
|
17148
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17123
17149
|
}
|
|
17124
17150
|
estimateTokens(text) {
|
|
17125
17151
|
const normalized = text.trim();
|
|
@@ -17133,7 +17159,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17133
17159
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17134
17160
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17135
17161
|
}
|
|
17136
|
-
async embedTexts(texts, modelId) {
|
|
17162
|
+
async embedTexts(texts, modelId, task) {
|
|
17137
17163
|
if (texts.length === 0) {
|
|
17138
17164
|
return [];
|
|
17139
17165
|
}
|
|
@@ -17145,37 +17171,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17145
17171
|
});
|
|
17146
17172
|
}
|
|
17147
17173
|
const outputs = new Array(batches.length);
|
|
17148
|
-
const limit =
|
|
17174
|
+
const limit = pLimit2(this.concurrency);
|
|
17149
17175
|
await Promise.all(
|
|
17150
17176
|
batches.map(
|
|
17151
17177
|
(batch, position) => limit(async () => {
|
|
17152
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17178
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17153
17179
|
})
|
|
17154
17180
|
)
|
|
17155
17181
|
);
|
|
17156
17182
|
return outputs.flat();
|
|
17157
17183
|
}
|
|
17158
|
-
async embedWithRetry(texts, modelId) {
|
|
17184
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17159
17185
|
const maxAttempts = 5;
|
|
17160
17186
|
let attempt = 0;
|
|
17161
17187
|
while (attempt < maxAttempts) {
|
|
17162
17188
|
attempt += 1;
|
|
17189
|
+
let response;
|
|
17163
17190
|
try {
|
|
17164
|
-
|
|
17165
|
-
|
|
17166
|
-
|
|
17167
|
-
|
|
17191
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17192
|
+
method: "POST",
|
|
17193
|
+
headers: {
|
|
17194
|
+
"content-type": "application/json",
|
|
17195
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17196
|
+
},
|
|
17197
|
+
body: JSON.stringify({
|
|
17198
|
+
model: modelId,
|
|
17199
|
+
input: texts,
|
|
17200
|
+
task
|
|
17201
|
+
})
|
|
17168
17202
|
});
|
|
17169
|
-
return response.data.map((entry) => entry.embedding);
|
|
17170
17203
|
} catch (error) {
|
|
17171
|
-
|
|
17172
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17173
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17204
|
+
if (attempt >= maxAttempts) {
|
|
17174
17205
|
throw error;
|
|
17175
17206
|
}
|
|
17176
|
-
|
|
17177
|
-
|
|
17207
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17208
|
+
continue;
|
|
17209
|
+
}
|
|
17210
|
+
if (!response.ok) {
|
|
17211
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17212
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17213
|
+
const errorBody = await response.text();
|
|
17214
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17215
|
+
}
|
|
17216
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17217
|
+
continue;
|
|
17218
|
+
}
|
|
17219
|
+
const payload = await response.json();
|
|
17220
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17221
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17178
17222
|
}
|
|
17223
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17179
17224
|
}
|
|
17180
17225
|
throw new Error("Unreachable retry state");
|
|
17181
17226
|
}
|
|
@@ -17183,20 +17228,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17183
17228
|
|
|
17184
17229
|
// src/embeddings/factory.ts
|
|
17185
17230
|
function createEmbeddingsProvider(config) {
|
|
17186
|
-
if (config.embeddings.provider !== "
|
|
17231
|
+
if (config.embeddings.provider !== "jina") {
|
|
17187
17232
|
throw new SearchSocketError(
|
|
17188
17233
|
"CONFIG_MISSING",
|
|
17189
17234
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17190
17235
|
);
|
|
17191
17236
|
}
|
|
17192
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17237
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17193
17238
|
if (!apiKey) {
|
|
17194
17239
|
throw new SearchSocketError(
|
|
17195
17240
|
"CONFIG_MISSING",
|
|
17196
|
-
`Missing embeddings API key env var
|
|
17241
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17197
17242
|
);
|
|
17198
17243
|
}
|
|
17199
|
-
return new
|
|
17244
|
+
return new JinaEmbeddingsProvider({
|
|
17200
17245
|
apiKey,
|
|
17201
17246
|
batchSize: config.embeddings.batchSize,
|
|
17202
17247
|
concurrency: config.embeddings.concurrency
|
|
@@ -17286,20 +17331,17 @@ var JinaReranker = class {
|
|
|
17286
17331
|
|
|
17287
17332
|
// src/rerank/factory.ts
|
|
17288
17333
|
function createReranker(config) {
|
|
17289
|
-
if (config.rerank.
|
|
17334
|
+
if (!config.rerank.enabled) {
|
|
17290
17335
|
return null;
|
|
17291
17336
|
}
|
|
17292
|
-
|
|
17293
|
-
|
|
17294
|
-
|
|
17295
|
-
return null;
|
|
17296
|
-
}
|
|
17297
|
-
return new JinaReranker({
|
|
17298
|
-
apiKey,
|
|
17299
|
-
model: config.rerank.jina.model
|
|
17300
|
-
});
|
|
17337
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17338
|
+
if (!apiKey) {
|
|
17339
|
+
return null;
|
|
17301
17340
|
}
|
|
17302
|
-
return
|
|
17341
|
+
return new JinaReranker({
|
|
17342
|
+
apiKey,
|
|
17343
|
+
model: config.rerank.model
|
|
17344
|
+
});
|
|
17303
17345
|
}
|
|
17304
17346
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
17305
17347
|
const statePath = path.resolve(cwd, stateDir);
|
|
@@ -17352,6 +17394,16 @@ var TursoVectorStore = class {
|
|
|
17352
17394
|
}
|
|
17353
17395
|
async ensureChunks(dim) {
|
|
17354
17396
|
if (this.chunksReady) return;
|
|
17397
|
+
const exists = await this.chunksTableExists();
|
|
17398
|
+
if (exists) {
|
|
17399
|
+
const currentDim = await this.getChunksDimension();
|
|
17400
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17401
|
+
await this.client.batch([
|
|
17402
|
+
"DROP INDEX IF EXISTS idx",
|
|
17403
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17404
|
+
]);
|
|
17405
|
+
}
|
|
17406
|
+
}
|
|
17355
17407
|
await this.client.batch([
|
|
17356
17408
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17357
17409
|
id TEXT PRIMARY KEY,
|
|
@@ -17363,12 +17415,16 @@ var TursoVectorStore = class {
|
|
|
17363
17415
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17364
17416
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17365
17417
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17418
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17419
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17366
17420
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17367
17421
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17368
17422
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
17369
17423
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17370
17424
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17371
17425
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17426
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17427
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17372
17428
|
embedding F32_BLOB(${dim})
|
|
17373
17429
|
)`,
|
|
17374
17430
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
@@ -17407,6 +17463,38 @@ var TursoVectorStore = class {
|
|
|
17407
17463
|
throw error;
|
|
17408
17464
|
}
|
|
17409
17465
|
}
|
|
17466
|
+
/**
|
|
17467
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17468
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17469
|
+
*/
|
|
17470
|
+
async getChunksDimension() {
|
|
17471
|
+
try {
|
|
17472
|
+
const rs = await this.client.execute(
|
|
17473
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17474
|
+
);
|
|
17475
|
+
if (rs.rows.length === 0) return null;
|
|
17476
|
+
const sql = rs.rows[0].sql;
|
|
17477
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17478
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17479
|
+
} catch {
|
|
17480
|
+
return null;
|
|
17481
|
+
}
|
|
17482
|
+
}
|
|
17483
|
+
/**
|
|
17484
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17485
|
+
* Used by `clean --remote` for a full reset.
|
|
17486
|
+
*/
|
|
17487
|
+
async dropAllTables() {
|
|
17488
|
+
await this.client.batch([
|
|
17489
|
+
"DROP INDEX IF EXISTS idx",
|
|
17490
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17491
|
+
"DROP TABLE IF EXISTS registry",
|
|
17492
|
+
"DROP TABLE IF EXISTS pages"
|
|
17493
|
+
]);
|
|
17494
|
+
this.chunksReady = false;
|
|
17495
|
+
this.registryReady = false;
|
|
17496
|
+
this.pagesReady = false;
|
|
17497
|
+
}
|
|
17410
17498
|
async upsert(records, _scope) {
|
|
17411
17499
|
if (records.length === 0) return;
|
|
17412
17500
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17417,9 +17505,9 @@ var TursoVectorStore = class {
|
|
|
17417
17505
|
const stmts = batch.map((r) => ({
|
|
17418
17506
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17419
17507
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17420
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17421
|
-
incoming_links, route_file, tags, embedding)
|
|
17422
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17508
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17509
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17510
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17423
17511
|
args: [
|
|
17424
17512
|
r.id,
|
|
17425
17513
|
r.metadata.projectId,
|
|
@@ -17430,12 +17518,16 @@ var TursoVectorStore = class {
|
|
|
17430
17518
|
r.metadata.sectionTitle,
|
|
17431
17519
|
JSON.stringify(r.metadata.headingPath),
|
|
17432
17520
|
r.metadata.snippet,
|
|
17521
|
+
r.metadata.chunkText,
|
|
17522
|
+
r.metadata.ordinal,
|
|
17433
17523
|
r.metadata.contentHash,
|
|
17434
17524
|
r.metadata.modelId,
|
|
17435
17525
|
r.metadata.depth,
|
|
17436
17526
|
r.metadata.incomingLinks,
|
|
17437
17527
|
r.metadata.routeFile,
|
|
17438
17528
|
JSON.stringify(r.metadata.tags),
|
|
17529
|
+
r.metadata.description ?? "",
|
|
17530
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17439
17531
|
JSON.stringify(r.vector)
|
|
17440
17532
|
]
|
|
17441
17533
|
}));
|
|
@@ -17448,8 +17540,10 @@ var TursoVectorStore = class {
|
|
|
17448
17540
|
const queryJson = JSON.stringify(queryVector);
|
|
17449
17541
|
const rs = await this.client.execute({
|
|
17450
17542
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17451
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17543
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17544
|
+
c.ordinal, c.content_hash,
|
|
17452
17545
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17546
|
+
c.description, c.keywords,
|
|
17453
17547
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17454
17548
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17455
17549
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17480,6 +17574,12 @@ var TursoVectorStore = class {
|
|
|
17480
17574
|
}
|
|
17481
17575
|
const distance = row.distance;
|
|
17482
17576
|
const score = 1 - distance;
|
|
17577
|
+
const description = row.description || void 0;
|
|
17578
|
+
const keywords = (() => {
|
|
17579
|
+
const raw = row.keywords || "[]";
|
|
17580
|
+
const parsed = JSON.parse(raw);
|
|
17581
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17582
|
+
})();
|
|
17483
17583
|
hits.push({
|
|
17484
17584
|
id: row.id,
|
|
17485
17585
|
score,
|
|
@@ -17492,12 +17592,16 @@ var TursoVectorStore = class {
|
|
|
17492
17592
|
sectionTitle: row.section_title,
|
|
17493
17593
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17494
17594
|
snippet: row.snippet,
|
|
17595
|
+
chunkText: row.chunk_text || "",
|
|
17596
|
+
ordinal: row.ordinal || 0,
|
|
17495
17597
|
contentHash: row.content_hash,
|
|
17496
17598
|
modelId: row.model_id,
|
|
17497
17599
|
depth: row.depth,
|
|
17498
17600
|
incomingLinks: row.incoming_links,
|
|
17499
17601
|
routeFile: row.route_file,
|
|
17500
|
-
tags
|
|
17602
|
+
tags,
|
|
17603
|
+
description,
|
|
17604
|
+
keywords
|
|
17501
17605
|
}
|
|
17502
17606
|
});
|
|
17503
17607
|
}
|
|
@@ -17687,10 +17791,10 @@ var TursoVectorStore = class {
|
|
|
17687
17791
|
// src/vector/factory.ts
|
|
17688
17792
|
async function createVectorStore(config, cwd) {
|
|
17689
17793
|
const turso = config.vector.turso;
|
|
17690
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17794
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17691
17795
|
if (remoteUrl) {
|
|
17692
17796
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17693
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17797
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17694
17798
|
const client2 = createClient2({
|
|
17695
17799
|
url: remoteUrl,
|
|
17696
17800
|
authToken
|
|
@@ -17700,6 +17804,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17700
17804
|
dimension: config.vector.dimension
|
|
17701
17805
|
});
|
|
17702
17806
|
}
|
|
17807
|
+
if (isServerless()) {
|
|
17808
|
+
throw new SearchSocketError(
|
|
17809
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17810
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17811
|
+
);
|
|
17812
|
+
}
|
|
17703
17813
|
const { createClient } = await import('@libsql/client');
|
|
17704
17814
|
const localPath = path.resolve(cwd, turso.localPath);
|
|
17705
17815
|
fs.mkdirSync(path.dirname(localPath), { recursive: true });
|
|
@@ -18030,7 +18140,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18030
18140
|
incomingLinks: page.incomingLinks,
|
|
18031
18141
|
routeFile: page.routeFile,
|
|
18032
18142
|
tags: page.tags,
|
|
18033
|
-
contentHash: ""
|
|
18143
|
+
contentHash: "",
|
|
18144
|
+
description: page.description,
|
|
18145
|
+
keywords: page.keywords
|
|
18034
18146
|
};
|
|
18035
18147
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18036
18148
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18057,7 +18169,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18057
18169
|
incomingLinks: page.incomingLinks,
|
|
18058
18170
|
routeFile: page.routeFile,
|
|
18059
18171
|
tags: page.tags,
|
|
18060
|
-
contentHash: ""
|
|
18172
|
+
contentHash: "",
|
|
18173
|
+
description: page.description,
|
|
18174
|
+
keywords: page.keywords
|
|
18061
18175
|
};
|
|
18062
18176
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18063
18177
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19138,14 +19252,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19138
19252
|
var Logger = class {
|
|
19139
19253
|
json;
|
|
19140
19254
|
verbose;
|
|
19255
|
+
quiet;
|
|
19141
19256
|
stderrOnly;
|
|
19142
19257
|
constructor(opts = {}) {
|
|
19143
19258
|
this.json = opts.json ?? false;
|
|
19144
19259
|
this.verbose = opts.verbose ?? false;
|
|
19260
|
+
this.quiet = opts.quiet ?? false;
|
|
19145
19261
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19146
19262
|
}
|
|
19147
19263
|
info(message) {
|
|
19148
|
-
if (this.json) {
|
|
19264
|
+
if (this.quiet || this.json) {
|
|
19149
19265
|
return;
|
|
19150
19266
|
}
|
|
19151
19267
|
this.writeOut(`${message}
|
|
@@ -19159,7 +19275,7 @@ var Logger = class {
|
|
|
19159
19275
|
this.logJson("debug", { message });
|
|
19160
19276
|
return;
|
|
19161
19277
|
}
|
|
19162
|
-
this.writeOut(
|
|
19278
|
+
this.writeOut(` ${message}
|
|
19163
19279
|
`);
|
|
19164
19280
|
}
|
|
19165
19281
|
warn(message) {
|
|
@@ -19186,7 +19302,7 @@ var Logger = class {
|
|
|
19186
19302
|
this.logJson(event, data);
|
|
19187
19303
|
return;
|
|
19188
19304
|
}
|
|
19189
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19305
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19190
19306
|
`);
|
|
19191
19307
|
}
|
|
19192
19308
|
writeOut(text) {
|
|
@@ -19371,11 +19487,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19371
19487
|
|
|
19372
19488
|
// src/indexing/sources/build/index.ts
|
|
19373
19489
|
var logger = new Logger();
|
|
19490
|
+
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
|
|
19491
|
+
const $ = load(html);
|
|
19492
|
+
const links = [];
|
|
19493
|
+
$("a[href]").each((_i, el) => {
|
|
19494
|
+
const href = $(el).attr("href");
|
|
19495
|
+
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
19496
|
+
return;
|
|
19497
|
+
}
|
|
19498
|
+
try {
|
|
19499
|
+
const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
|
|
19500
|
+
if (resolved.origin !== baseOrigin) return;
|
|
19501
|
+
if (!["http:", "https:"].includes(resolved.protocol)) return;
|
|
19502
|
+
links.push(normalizeUrlPath(resolved.pathname));
|
|
19503
|
+
} catch {
|
|
19504
|
+
}
|
|
19505
|
+
});
|
|
19506
|
+
return [...new Set(links)];
|
|
19507
|
+
}
|
|
19508
|
+
async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
19509
|
+
const { seedUrls, maxDepth, exclude } = buildConfig;
|
|
19510
|
+
const baseOrigin = new URL(server.baseUrl).origin;
|
|
19511
|
+
let effectiveMax = buildConfig.maxPages;
|
|
19512
|
+
if (typeof pipelineMaxPages === "number") {
|
|
19513
|
+
const floored = Math.max(0, Math.floor(pipelineMaxPages));
|
|
19514
|
+
effectiveMax = Math.min(effectiveMax, floored);
|
|
19515
|
+
}
|
|
19516
|
+
if (effectiveMax === 0) return [];
|
|
19517
|
+
const visited = /* @__PURE__ */ new Set();
|
|
19518
|
+
const pages = [];
|
|
19519
|
+
const queue = [];
|
|
19520
|
+
const limit = pLimit2(8);
|
|
19521
|
+
for (const seed of seedUrls) {
|
|
19522
|
+
const normalized = normalizeUrlPath(seed);
|
|
19523
|
+
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
19524
|
+
visited.add(normalized);
|
|
19525
|
+
queue.push({ url: normalized, depth: 0 });
|
|
19526
|
+
}
|
|
19527
|
+
}
|
|
19528
|
+
while (queue.length > 0 && pages.length < effectiveMax) {
|
|
19529
|
+
const remaining = effectiveMax - pages.length;
|
|
19530
|
+
const batch = queue.splice(0, remaining);
|
|
19531
|
+
const results = await Promise.allSettled(
|
|
19532
|
+
batch.map(
|
|
19533
|
+
(item) => limit(async () => {
|
|
19534
|
+
const fullUrl = joinUrl(server.baseUrl, item.url);
|
|
19535
|
+
const response = await fetch(fullUrl);
|
|
19536
|
+
if (!response.ok) {
|
|
19537
|
+
logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
|
|
19538
|
+
return null;
|
|
19539
|
+
}
|
|
19540
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
19541
|
+
if (!contentType.includes("text/html")) {
|
|
19542
|
+
return null;
|
|
19543
|
+
}
|
|
19544
|
+
const html = await response.text();
|
|
19545
|
+
if (item.depth < maxDepth) {
|
|
19546
|
+
const links = extractLinksFromHtml(html, item.url, baseOrigin);
|
|
19547
|
+
for (const link of links) {
|
|
19548
|
+
if (!visited.has(link) && !isExcluded(link, exclude)) {
|
|
19549
|
+
visited.add(link);
|
|
19550
|
+
queue.push({ url: link, depth: item.depth + 1 });
|
|
19551
|
+
}
|
|
19552
|
+
}
|
|
19553
|
+
}
|
|
19554
|
+
return {
|
|
19555
|
+
url: item.url,
|
|
19556
|
+
html,
|
|
19557
|
+
sourcePath: fullUrl,
|
|
19558
|
+
outgoingLinks: []
|
|
19559
|
+
};
|
|
19560
|
+
})
|
|
19561
|
+
)
|
|
19562
|
+
);
|
|
19563
|
+
for (const result of results) {
|
|
19564
|
+
if (result.status === "fulfilled" && result.value) {
|
|
19565
|
+
pages.push(result.value);
|
|
19566
|
+
}
|
|
19567
|
+
}
|
|
19568
|
+
}
|
|
19569
|
+
if (pages.length >= effectiveMax && queue.length > 0) {
|
|
19570
|
+
logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
|
|
19571
|
+
}
|
|
19572
|
+
logger.event("build_discover_complete", {
|
|
19573
|
+
pagesFound: pages.length,
|
|
19574
|
+
urlsVisited: visited.size,
|
|
19575
|
+
urlsSkipped: queue.length
|
|
19576
|
+
});
|
|
19577
|
+
return pages;
|
|
19578
|
+
}
|
|
19374
19579
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19375
19580
|
const buildConfig = config.source.build;
|
|
19376
19581
|
if (!buildConfig) {
|
|
19377
19582
|
throw new Error("build source config is missing");
|
|
19378
19583
|
}
|
|
19584
|
+
if (buildConfig.discover) {
|
|
19585
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19586
|
+
try {
|
|
19587
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
19588
|
+
} finally {
|
|
19589
|
+
await server2.shutdown();
|
|
19590
|
+
}
|
|
19591
|
+
}
|
|
19379
19592
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19380
19593
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19381
19594
|
logger.event("build_routes_discovered", {
|
|
@@ -19386,7 +19599,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19386
19599
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19387
19600
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19388
19601
|
try {
|
|
19389
|
-
const concurrencyLimit =
|
|
19602
|
+
const concurrencyLimit = pLimit2(8);
|
|
19390
19603
|
const results = await Promise.allSettled(
|
|
19391
19604
|
selected.map(
|
|
19392
19605
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19555,7 +19768,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
19555
19768
|
const routes = await resolveRoutes(config);
|
|
19556
19769
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
19557
19770
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
19558
|
-
const concurrencyLimit =
|
|
19771
|
+
const concurrencyLimit = pLimit2(8);
|
|
19559
19772
|
const results = await Promise.allSettled(
|
|
19560
19773
|
selected.map(
|
|
19561
19774
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19617,9 +19830,7 @@ function hrTimeMs(start) {
|
|
|
19617
19830
|
|
|
19618
19831
|
// src/indexing/pipeline.ts
|
|
19619
19832
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19620
|
-
"
|
|
19621
|
-
"text-embedding-3-large": 13e-5,
|
|
19622
|
-
"text-embedding-ada-002": 1e-4
|
|
19833
|
+
"jina-embeddings-v3": 2e-5
|
|
19623
19834
|
};
|
|
19624
19835
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
19625
19836
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -19665,9 +19876,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19665
19876
|
};
|
|
19666
19877
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19667
19878
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
19879
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19880
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
19668
19881
|
if (options.force) {
|
|
19882
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19669
19883
|
await cleanMirrorForScope(statePath, scope);
|
|
19670
19884
|
}
|
|
19885
|
+
if (options.dryRun) {
|
|
19886
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19887
|
+
}
|
|
19671
19888
|
const manifestStart = stageStart();
|
|
19672
19889
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
19673
19890
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -19678,8 +19895,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19678
19895
|
);
|
|
19679
19896
|
}
|
|
19680
19897
|
stageEnd("manifest", manifestStart);
|
|
19898
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
19681
19899
|
const sourceStart = stageStart();
|
|
19682
|
-
|
|
19900
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19683
19901
|
let sourcePages;
|
|
19684
19902
|
if (sourceMode === "static-output") {
|
|
19685
19903
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -19691,10 +19909,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19691
19909
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
19692
19910
|
}
|
|
19693
19911
|
stageEnd("source", sourceStart);
|
|
19912
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
19694
19913
|
const routeStart = stageStart();
|
|
19695
19914
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19696
19915
|
stageEnd("route_map", routeStart);
|
|
19916
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
19697
19917
|
const extractStart = stageStart();
|
|
19918
|
+
this.logger.info("Extracting content...");
|
|
19698
19919
|
const extractedPages = [];
|
|
19699
19920
|
for (const sourcePage of sourcePages) {
|
|
19700
19921
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -19723,6 +19944,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19723
19944
|
uniquePages.push(page);
|
|
19724
19945
|
}
|
|
19725
19946
|
stageEnd("extract", extractStart);
|
|
19947
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
19948
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19726
19949
|
const linkStart = stageStart();
|
|
19727
19950
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
19728
19951
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -19738,7 +19961,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19738
19961
|
}
|
|
19739
19962
|
}
|
|
19740
19963
|
stageEnd("links", linkStart);
|
|
19964
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
19741
19965
|
const mirrorStart = stageStart();
|
|
19966
|
+
this.logger.info("Writing mirror pages...");
|
|
19742
19967
|
const mirrorPages = [];
|
|
19743
19968
|
let routeExact = 0;
|
|
19744
19969
|
let routeBestEffort = 0;
|
|
@@ -19808,7 +20033,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19808
20033
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
19809
20034
|
}
|
|
19810
20035
|
stageEnd("mirror", mirrorStart);
|
|
20036
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
19811
20037
|
const chunkStart = stageStart();
|
|
20038
|
+
this.logger.info("Chunking pages...");
|
|
19812
20039
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
19813
20040
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
19814
20041
|
if (typeof maxChunks === "number") {
|
|
@@ -19821,6 +20048,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19821
20048
|
});
|
|
19822
20049
|
}
|
|
19823
20050
|
stageEnd("chunk", chunkStart);
|
|
20051
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
19824
20052
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
19825
20053
|
for (const chunk of chunks) {
|
|
19826
20054
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -19839,6 +20067,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19839
20067
|
return existingHash !== chunk.contentHash;
|
|
19840
20068
|
});
|
|
19841
20069
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20070
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19842
20071
|
const embedStart = stageStart();
|
|
19843
20072
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
19844
20073
|
for (const chunk of changedChunks) {
|
|
@@ -19853,9 +20082,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19853
20082
|
let newEmbeddings = 0;
|
|
19854
20083
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
19855
20084
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20085
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
19856
20086
|
const embeddings = await this.embeddings.embedTexts(
|
|
19857
20087
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
19858
|
-
this.config.embeddings.model
|
|
20088
|
+
this.config.embeddings.model,
|
|
20089
|
+
"retrieval.passage"
|
|
19859
20090
|
);
|
|
19860
20091
|
if (embeddings.length !== changedChunks.length) {
|
|
19861
20092
|
throw new SearchSocketError(
|
|
@@ -19878,8 +20109,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19878
20109
|
}
|
|
19879
20110
|
}
|
|
19880
20111
|
stageEnd("embedding", embedStart);
|
|
20112
|
+
if (changedChunks.length > 0) {
|
|
20113
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20114
|
+
} else {
|
|
20115
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20116
|
+
}
|
|
19881
20117
|
const syncStart = stageStart();
|
|
19882
20118
|
if (!options.dryRun) {
|
|
20119
|
+
this.logger.info("Syncing vectors...");
|
|
19883
20120
|
const upserts = [];
|
|
19884
20121
|
for (const chunk of changedChunks) {
|
|
19885
20122
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -19898,12 +20135,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19898
20135
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
19899
20136
|
headingPath: chunk.headingPath,
|
|
19900
20137
|
snippet: chunk.snippet,
|
|
20138
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20139
|
+
ordinal: chunk.ordinal,
|
|
19901
20140
|
contentHash: chunk.contentHash,
|
|
19902
20141
|
modelId: this.config.embeddings.model,
|
|
19903
20142
|
depth: chunk.depth,
|
|
19904
20143
|
incomingLinks: chunk.incomingLinks,
|
|
19905
20144
|
routeFile: chunk.routeFile,
|
|
19906
|
-
tags: chunk.tags
|
|
20145
|
+
tags: chunk.tags,
|
|
20146
|
+
description: chunk.description,
|
|
20147
|
+
keywords: chunk.keywords
|
|
19907
20148
|
}
|
|
19908
20149
|
});
|
|
19909
20150
|
}
|
|
@@ -19917,6 +20158,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19917
20158
|
}
|
|
19918
20159
|
}
|
|
19919
20160
|
stageEnd("sync", syncStart);
|
|
20161
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
19920
20162
|
const finalizeStart = stageStart();
|
|
19921
20163
|
if (!options.dryRun) {
|
|
19922
20164
|
const scopeInfo = {
|
|
@@ -19936,6 +20178,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19936
20178
|
});
|
|
19937
20179
|
}
|
|
19938
20180
|
stageEnd("finalize", finalizeStart);
|
|
20181
|
+
this.logger.info("Done.");
|
|
19939
20182
|
return {
|
|
19940
20183
|
pagesProcessed: mirrorPages.length,
|
|
19941
20184
|
chunksTotal: chunks.length,
|
|
@@ -20096,7 +20339,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20096
20339
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20097
20340
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20098
20341
|
const embedStart = process.hrtime.bigint();
|
|
20099
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
20342
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20100
20343
|
const queryVector = queryEmbeddings[0];
|
|
20101
20344
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20102
20345
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -20124,13 +20367,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
20124
20367
|
usedRerank = true;
|
|
20125
20368
|
}
|
|
20126
20369
|
let results;
|
|
20370
|
+
const minScore = this.config.ranking.minScore;
|
|
20127
20371
|
if (groupByPage) {
|
|
20128
|
-
|
|
20372
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
20373
|
+
if (minScore > 0) {
|
|
20374
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20375
|
+
}
|
|
20129
20376
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20130
20377
|
results = pages.slice(0, topK).map((page) => {
|
|
20131
20378
|
const bestScore = page.bestChunk.finalScore;
|
|
20132
|
-
const
|
|
20133
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20379
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20380
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
20134
20381
|
return {
|
|
20135
20382
|
url: page.url,
|
|
20136
20383
|
title: page.title,
|
|
@@ -20147,6 +20394,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
20147
20394
|
};
|
|
20148
20395
|
});
|
|
20149
20396
|
} else {
|
|
20397
|
+
if (minScore > 0) {
|
|
20398
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20399
|
+
}
|
|
20150
20400
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20151
20401
|
url: hit.metadata.url,
|
|
20152
20402
|
title: hit.metadata.title,
|
|
@@ -20218,43 +20468,67 @@ var SearchEngine = class _SearchEngine {
|
|
|
20218
20468
|
}
|
|
20219
20469
|
}
|
|
20220
20470
|
async rerankHits(query, ranked, topK) {
|
|
20221
|
-
if (this.config.rerank.
|
|
20471
|
+
if (!this.config.rerank.enabled) {
|
|
20222
20472
|
throw new SearchSocketError(
|
|
20223
20473
|
"INVALID_REQUEST",
|
|
20224
|
-
"rerank=true requested but rerank.
|
|
20474
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
20225
20475
|
400
|
|
20226
20476
|
);
|
|
20227
20477
|
}
|
|
20228
20478
|
if (!this.reranker) {
|
|
20229
20479
|
throw new SearchSocketError(
|
|
20230
20480
|
"CONFIG_MISSING",
|
|
20231
|
-
`rerank=true requested but ${this.config.
|
|
20481
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
20232
20482
|
400
|
|
20233
20483
|
);
|
|
20234
20484
|
}
|
|
20235
|
-
const
|
|
20236
|
-
|
|
20237
|
-
|
|
20238
|
-
|
|
20485
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
20486
|
+
for (const entry of ranked) {
|
|
20487
|
+
const url = entry.hit.metadata.url;
|
|
20488
|
+
const group = pageGroups.get(url);
|
|
20489
|
+
if (group) group.push(entry);
|
|
20490
|
+
else pageGroups.set(url, [entry]);
|
|
20491
|
+
}
|
|
20492
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20493
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20494
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20495
|
+
const pageCandidates = [];
|
|
20496
|
+
for (const [url, chunks] of pageGroups) {
|
|
20497
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
20498
|
+
const bestScore = byScore[0].finalScore;
|
|
20499
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
20500
|
+
const selected = byScore.filter(
|
|
20501
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
20502
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
20503
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
20504
|
+
const first = selected[0].hit.metadata;
|
|
20505
|
+
const parts = [first.title];
|
|
20506
|
+
if (first.description) {
|
|
20507
|
+
parts.push(first.description);
|
|
20508
|
+
}
|
|
20509
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
20510
|
+
parts.push(first.keywords.join(", "));
|
|
20511
|
+
}
|
|
20512
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20513
|
+
parts.push(body);
|
|
20514
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
20515
|
+
}
|
|
20239
20516
|
const reranked = await this.reranker.rerank(
|
|
20240
20517
|
query,
|
|
20241
|
-
|
|
20518
|
+
pageCandidates,
|
|
20242
20519
|
Math.max(topK, this.config.rerank.topN)
|
|
20243
20520
|
);
|
|
20244
|
-
const
|
|
20521
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20245
20522
|
return ranked.map((entry) => {
|
|
20246
|
-
const
|
|
20247
|
-
const
|
|
20248
|
-
if (
|
|
20249
|
-
return {
|
|
20250
|
-
...entry,
|
|
20251
|
-
finalScore: safeBaseScore
|
|
20252
|
-
};
|
|
20523
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
20524
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
20525
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
20526
|
+
return { ...entry, finalScore: base };
|
|
20253
20527
|
}
|
|
20254
|
-
const
|
|
20528
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
20255
20529
|
return {
|
|
20256
20530
|
...entry,
|
|
20257
|
-
finalScore: Number.isFinite(
|
|
20531
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
20258
20532
|
};
|
|
20259
20533
|
}).sort((a, b) => {
|
|
20260
20534
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -20452,13 +20726,21 @@ function searchsocketHandle(options = {}) {
|
|
|
20452
20726
|
let rateLimiter = null;
|
|
20453
20727
|
const getConfig = async () => {
|
|
20454
20728
|
if (!configPromise) {
|
|
20455
|
-
|
|
20456
|
-
|
|
20457
|
-
|
|
20458
|
-
})
|
|
20729
|
+
let configP;
|
|
20730
|
+
if (options.config) {
|
|
20731
|
+
configP = Promise.resolve(options.config);
|
|
20732
|
+
} else if (options.rawConfig) {
|
|
20733
|
+
const cwd = options.cwd ?? process.cwd();
|
|
20734
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
20735
|
+
} else {
|
|
20736
|
+
configP = loadConfig({
|
|
20737
|
+
cwd: options.cwd,
|
|
20738
|
+
configPath: options.configPath
|
|
20739
|
+
});
|
|
20740
|
+
}
|
|
20459
20741
|
configPromise = configP.then((config) => {
|
|
20460
20742
|
apiPath = apiPath ?? config.api.path;
|
|
20461
|
-
if (config.api.rateLimit) {
|
|
20743
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
20462
20744
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20463
20745
|
}
|
|
20464
20746
|
return config;
|
|
@@ -20468,10 +20750,9 @@ function searchsocketHandle(options = {}) {
|
|
|
20468
20750
|
};
|
|
20469
20751
|
const getEngine = async () => {
|
|
20470
20752
|
if (!enginePromise) {
|
|
20471
|
-
const config =
|
|
20753
|
+
const config = await getConfig();
|
|
20472
20754
|
enginePromise = SearchEngine.create({
|
|
20473
20755
|
cwd: options.cwd,
|
|
20474
|
-
configPath: options.configPath,
|
|
20475
20756
|
config
|
|
20476
20757
|
});
|
|
20477
20758
|
}
|
|
@@ -20737,6 +21018,6 @@ function createSearchClient(options = {}) {
|
|
|
20737
21018
|
*)
|
|
20738
21019
|
*/
|
|
20739
21020
|
|
|
20740
|
-
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, loadConfig, mergeConfig, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21021
|
+
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
20741
21022
|
//# sourceMappingURL=index.js.map
|
|
20742
21023
|
//# sourceMappingURL=index.js.map
|