searchsocket 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +348 -111
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +3 -3
- package/dist/cli.js.map +0 -1
- package/dist/client.cjs.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/index.cjs.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/sveltekit.cjs.map +0 -1
- package/dist/sveltekit.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -3,8 +3,7 @@ import path from 'path';
|
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
5
|
import { execSync, spawn } from 'child_process';
|
|
6
|
-
import
|
|
7
|
-
import pLimit from 'p-limit';
|
|
6
|
+
import pLimit2 from 'p-limit';
|
|
8
7
|
import { createHash } from 'crypto';
|
|
9
8
|
import { load } from 'cheerio';
|
|
10
9
|
import matter from 'gray-matter';
|
|
@@ -16620,7 +16619,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
16620
16619
|
outputDir: z.string().min(1).optional(),
|
|
16621
16620
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
16622
16621
|
exclude: z.array(z.string()).optional(),
|
|
16623
|
-
previewTimeout: z.number().int().positive().optional()
|
|
16622
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
16623
|
+
discover: z.boolean().optional(),
|
|
16624
|
+
seedUrls: z.array(z.string()).optional(),
|
|
16625
|
+
maxPages: z.number().int().positive().optional(),
|
|
16626
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
16624
16627
|
}).optional()
|
|
16625
16628
|
}).optional(),
|
|
16626
16629
|
extract: z.object({
|
|
@@ -16647,8 +16650,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
16647
16650
|
pageSummaryChunk: z.boolean().optional()
|
|
16648
16651
|
}).optional(),
|
|
16649
16652
|
embeddings: z.object({
|
|
16650
|
-
provider: z.literal("
|
|
16653
|
+
provider: z.literal("jina").optional(),
|
|
16651
16654
|
model: z.string().min(1).optional(),
|
|
16655
|
+
apiKey: z.string().min(1).optional(),
|
|
16652
16656
|
apiKeyEnv: z.string().min(1).optional(),
|
|
16653
16657
|
batchSize: z.number().int().positive().optional(),
|
|
16654
16658
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -16657,18 +16661,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
16657
16661
|
vector: z.object({
|
|
16658
16662
|
dimension: z.number().int().positive().optional(),
|
|
16659
16663
|
turso: z.object({
|
|
16664
|
+
url: z.string().url().optional(),
|
|
16665
|
+
authToken: z.string().min(1).optional(),
|
|
16660
16666
|
urlEnv: z.string().optional(),
|
|
16661
16667
|
authTokenEnv: z.string().optional(),
|
|
16662
16668
|
localPath: z.string().optional()
|
|
16663
16669
|
}).optional()
|
|
16664
16670
|
}).optional(),
|
|
16665
16671
|
rerank: z.object({
|
|
16666
|
-
|
|
16672
|
+
enabled: z.boolean().optional(),
|
|
16667
16673
|
topN: z.number().int().positive().optional(),
|
|
16668
|
-
|
|
16669
|
-
apiKeyEnv: z.string().optional(),
|
|
16670
|
-
model: z.string().optional()
|
|
16671
|
-
}).optional()
|
|
16674
|
+
model: z.string().optional()
|
|
16672
16675
|
}).optional(),
|
|
16673
16676
|
ranking: z.object({
|
|
16674
16677
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -16677,6 +16680,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16677
16680
|
aggregationCap: z.number().int().positive().optional(),
|
|
16678
16681
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16679
16682
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16683
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
16680
16684
|
weights: z.object({
|
|
16681
16685
|
incomingLinks: z.number().optional(),
|
|
16682
16686
|
depth: z.number().optional(),
|
|
@@ -16757,9 +16761,9 @@ function createDefaultConfig(projectId) {
|
|
|
16757
16761
|
pageSummaryChunk: true
|
|
16758
16762
|
},
|
|
16759
16763
|
embeddings: {
|
|
16760
|
-
provider: "
|
|
16761
|
-
model: "
|
|
16762
|
-
apiKeyEnv: "
|
|
16764
|
+
provider: "jina",
|
|
16765
|
+
model: "jina-embeddings-v3",
|
|
16766
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16763
16767
|
batchSize: 64,
|
|
16764
16768
|
concurrency: 4
|
|
16765
16769
|
},
|
|
@@ -16771,12 +16775,9 @@ function createDefaultConfig(projectId) {
|
|
|
16771
16775
|
}
|
|
16772
16776
|
},
|
|
16773
16777
|
rerank: {
|
|
16774
|
-
|
|
16778
|
+
enabled: false,
|
|
16775
16779
|
topN: 20,
|
|
16776
|
-
|
|
16777
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16778
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16779
|
-
}
|
|
16780
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16780
16781
|
},
|
|
16781
16782
|
ranking: {
|
|
16782
16783
|
enableIncomingLinkBoost: true,
|
|
@@ -16785,6 +16786,7 @@ function createDefaultConfig(projectId) {
|
|
|
16785
16786
|
aggregationCap: 5,
|
|
16786
16787
|
aggregationDecay: 0.5,
|
|
16787
16788
|
minChunkScoreRatio: 0.5,
|
|
16789
|
+
minScore: 0,
|
|
16788
16790
|
weights: {
|
|
16789
16791
|
incomingLinks: 0.05,
|
|
16790
16792
|
depth: 0.03,
|
|
@@ -16911,7 +16913,11 @@ ${issues}`
|
|
|
16911
16913
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16912
16914
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16913
16915
|
exclude: parsed.source.build.exclude ?? [],
|
|
16914
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16916
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16917
|
+
discover: parsed.source.build.discover ?? false,
|
|
16918
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16919
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16920
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16915
16921
|
} : void 0
|
|
16916
16922
|
},
|
|
16917
16923
|
extract: {
|
|
@@ -16940,11 +16946,7 @@ ${issues}`
|
|
|
16940
16946
|
},
|
|
16941
16947
|
rerank: {
|
|
16942
16948
|
...defaults.rerank,
|
|
16943
|
-
...parsed.rerank
|
|
16944
|
-
jina: {
|
|
16945
|
-
...defaults.rerank.jina,
|
|
16946
|
-
...parsed.rerank?.jina
|
|
16947
|
-
}
|
|
16949
|
+
...parsed.rerank
|
|
16948
16950
|
},
|
|
16949
16951
|
ranking: {
|
|
16950
16952
|
...defaults.ranking,
|
|
@@ -16991,7 +16993,11 @@ ${issues}`
|
|
|
16991
16993
|
outputDir: ".svelte-kit/output",
|
|
16992
16994
|
paramValues: {},
|
|
16993
16995
|
exclude: [],
|
|
16994
|
-
previewTimeout: 3e4
|
|
16996
|
+
previewTimeout: 3e4,
|
|
16997
|
+
discover: false,
|
|
16998
|
+
seedUrls: ["/"],
|
|
16999
|
+
maxPages: 200,
|
|
17000
|
+
maxDepth: 10
|
|
16995
17001
|
};
|
|
16996
17002
|
}
|
|
16997
17003
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17005,6 +17011,21 @@ ${issues}`
|
|
|
17005
17011
|
}
|
|
17006
17012
|
return merged;
|
|
17007
17013
|
}
|
|
17014
|
+
function mergeConfigServerless(rawConfig) {
|
|
17015
|
+
if (!rawConfig.project?.id) {
|
|
17016
|
+
throw new SearchSocketError(
|
|
17017
|
+
"CONFIG_MISSING",
|
|
17018
|
+
"`project.id` is required for serverless config (cannot infer from package.json)."
|
|
17019
|
+
);
|
|
17020
|
+
}
|
|
17021
|
+
if (!rawConfig.source?.mode) {
|
|
17022
|
+
throw new SearchSocketError(
|
|
17023
|
+
"CONFIG_MISSING",
|
|
17024
|
+
"`source.mode` is required for serverless config (cannot auto-detect from filesystem)."
|
|
17025
|
+
);
|
|
17026
|
+
}
|
|
17027
|
+
return mergeConfig(process.cwd(), rawConfig);
|
|
17028
|
+
}
|
|
17008
17029
|
async function loadConfig(options = {}) {
|
|
17009
17030
|
const cwd = path.resolve(options.cwd ?? process.cwd());
|
|
17010
17031
|
const configPath = path.resolve(cwd, options.configPath ?? "searchsocket.config.ts");
|
|
@@ -17027,6 +17048,11 @@ async function loadConfig(options = {}) {
|
|
|
17027
17048
|
return mergeConfig(cwd, raw);
|
|
17028
17049
|
}
|
|
17029
17050
|
|
|
17051
|
+
// src/core/serverless.ts
|
|
17052
|
+
function isServerless() {
|
|
17053
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
17054
|
+
}
|
|
17055
|
+
|
|
17030
17056
|
// src/utils/text.ts
|
|
17031
17057
|
function normalizeText(input) {
|
|
17032
17058
|
return input.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim();
|
|
@@ -17104,10 +17130,11 @@ function sleep(ms) {
|
|
|
17104
17130
|
setTimeout(resolve, ms);
|
|
17105
17131
|
});
|
|
17106
17132
|
}
|
|
17107
|
-
var
|
|
17108
|
-
|
|
17133
|
+
var JinaEmbeddingsProvider = class {
|
|
17134
|
+
apiKey;
|
|
17109
17135
|
batchSize;
|
|
17110
17136
|
concurrency;
|
|
17137
|
+
defaultTask;
|
|
17111
17138
|
constructor(options) {
|
|
17112
17139
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17113
17140
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17115,11 +17142,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17115
17142
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17116
17143
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17117
17144
|
}
|
|
17118
|
-
this.
|
|
17119
|
-
apiKey: options.apiKey
|
|
17120
|
-
});
|
|
17145
|
+
this.apiKey = options.apiKey;
|
|
17121
17146
|
this.batchSize = options.batchSize;
|
|
17122
17147
|
this.concurrency = options.concurrency;
|
|
17148
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17123
17149
|
}
|
|
17124
17150
|
estimateTokens(text) {
|
|
17125
17151
|
const normalized = text.trim();
|
|
@@ -17133,7 +17159,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17133
17159
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17134
17160
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17135
17161
|
}
|
|
17136
|
-
async embedTexts(texts, modelId) {
|
|
17162
|
+
async embedTexts(texts, modelId, task) {
|
|
17137
17163
|
if (texts.length === 0) {
|
|
17138
17164
|
return [];
|
|
17139
17165
|
}
|
|
@@ -17145,37 +17171,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17145
17171
|
});
|
|
17146
17172
|
}
|
|
17147
17173
|
const outputs = new Array(batches.length);
|
|
17148
|
-
const limit =
|
|
17174
|
+
const limit = pLimit2(this.concurrency);
|
|
17149
17175
|
await Promise.all(
|
|
17150
17176
|
batches.map(
|
|
17151
17177
|
(batch, position) => limit(async () => {
|
|
17152
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17178
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17153
17179
|
})
|
|
17154
17180
|
)
|
|
17155
17181
|
);
|
|
17156
17182
|
return outputs.flat();
|
|
17157
17183
|
}
|
|
17158
|
-
async embedWithRetry(texts, modelId) {
|
|
17184
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17159
17185
|
const maxAttempts = 5;
|
|
17160
17186
|
let attempt = 0;
|
|
17161
17187
|
while (attempt < maxAttempts) {
|
|
17162
17188
|
attempt += 1;
|
|
17189
|
+
let response;
|
|
17163
17190
|
try {
|
|
17164
|
-
|
|
17165
|
-
|
|
17166
|
-
|
|
17167
|
-
|
|
17191
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17192
|
+
method: "POST",
|
|
17193
|
+
headers: {
|
|
17194
|
+
"content-type": "application/json",
|
|
17195
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17196
|
+
},
|
|
17197
|
+
body: JSON.stringify({
|
|
17198
|
+
model: modelId,
|
|
17199
|
+
input: texts,
|
|
17200
|
+
task
|
|
17201
|
+
})
|
|
17168
17202
|
});
|
|
17169
|
-
return response.data.map((entry) => entry.embedding);
|
|
17170
17203
|
} catch (error) {
|
|
17171
|
-
|
|
17172
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17173
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17204
|
+
if (attempt >= maxAttempts) {
|
|
17174
17205
|
throw error;
|
|
17175
17206
|
}
|
|
17176
|
-
|
|
17177
|
-
|
|
17207
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17208
|
+
continue;
|
|
17209
|
+
}
|
|
17210
|
+
if (!response.ok) {
|
|
17211
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17212
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17213
|
+
const errorBody = await response.text();
|
|
17214
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17215
|
+
}
|
|
17216
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17217
|
+
continue;
|
|
17178
17218
|
}
|
|
17219
|
+
const payload = await response.json();
|
|
17220
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17221
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17222
|
+
}
|
|
17223
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17179
17224
|
}
|
|
17180
17225
|
throw new Error("Unreachable retry state");
|
|
17181
17226
|
}
|
|
@@ -17183,20 +17228,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17183
17228
|
|
|
17184
17229
|
// src/embeddings/factory.ts
|
|
17185
17230
|
function createEmbeddingsProvider(config) {
|
|
17186
|
-
if (config.embeddings.provider !== "
|
|
17231
|
+
if (config.embeddings.provider !== "jina") {
|
|
17187
17232
|
throw new SearchSocketError(
|
|
17188
17233
|
"CONFIG_MISSING",
|
|
17189
17234
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17190
17235
|
);
|
|
17191
17236
|
}
|
|
17192
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17237
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17193
17238
|
if (!apiKey) {
|
|
17194
17239
|
throw new SearchSocketError(
|
|
17195
17240
|
"CONFIG_MISSING",
|
|
17196
|
-
`Missing embeddings API key env var
|
|
17241
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17197
17242
|
);
|
|
17198
17243
|
}
|
|
17199
|
-
return new
|
|
17244
|
+
return new JinaEmbeddingsProvider({
|
|
17200
17245
|
apiKey,
|
|
17201
17246
|
batchSize: config.embeddings.batchSize,
|
|
17202
17247
|
concurrency: config.embeddings.concurrency
|
|
@@ -17286,20 +17331,17 @@ var JinaReranker = class {
|
|
|
17286
17331
|
|
|
17287
17332
|
// src/rerank/factory.ts
|
|
17288
17333
|
function createReranker(config) {
|
|
17289
|
-
if (config.rerank.
|
|
17334
|
+
if (!config.rerank.enabled) {
|
|
17290
17335
|
return null;
|
|
17291
17336
|
}
|
|
17292
|
-
|
|
17293
|
-
|
|
17294
|
-
|
|
17295
|
-
return null;
|
|
17296
|
-
}
|
|
17297
|
-
return new JinaReranker({
|
|
17298
|
-
apiKey,
|
|
17299
|
-
model: config.rerank.jina.model
|
|
17300
|
-
});
|
|
17337
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17338
|
+
if (!apiKey) {
|
|
17339
|
+
return null;
|
|
17301
17340
|
}
|
|
17302
|
-
return
|
|
17341
|
+
return new JinaReranker({
|
|
17342
|
+
apiKey,
|
|
17343
|
+
model: config.rerank.model
|
|
17344
|
+
});
|
|
17303
17345
|
}
|
|
17304
17346
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
17305
17347
|
const statePath = path.resolve(cwd, stateDir);
|
|
@@ -17352,6 +17394,16 @@ var TursoVectorStore = class {
|
|
|
17352
17394
|
}
|
|
17353
17395
|
async ensureChunks(dim) {
|
|
17354
17396
|
if (this.chunksReady) return;
|
|
17397
|
+
const exists = await this.chunksTableExists();
|
|
17398
|
+
if (exists) {
|
|
17399
|
+
const currentDim = await this.getChunksDimension();
|
|
17400
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17401
|
+
await this.client.batch([
|
|
17402
|
+
"DROP INDEX IF EXISTS idx",
|
|
17403
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17404
|
+
]);
|
|
17405
|
+
}
|
|
17406
|
+
}
|
|
17355
17407
|
await this.client.batch([
|
|
17356
17408
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17357
17409
|
id TEXT PRIMARY KEY,
|
|
@@ -17363,6 +17415,8 @@ var TursoVectorStore = class {
|
|
|
17363
17415
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17364
17416
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17365
17417
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17418
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17419
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17366
17420
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17367
17421
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17368
17422
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
@@ -17373,6 +17427,19 @@ var TursoVectorStore = class {
|
|
|
17373
17427
|
)`,
|
|
17374
17428
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17375
17429
|
]);
|
|
17430
|
+
const chunkMigrationCols = [
|
|
17431
|
+
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17432
|
+
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17433
|
+
];
|
|
17434
|
+
for (const col of chunkMigrationCols) {
|
|
17435
|
+
try {
|
|
17436
|
+
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17437
|
+
} catch (error) {
|
|
17438
|
+
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17439
|
+
throw error;
|
|
17440
|
+
}
|
|
17441
|
+
}
|
|
17442
|
+
}
|
|
17376
17443
|
this.chunksReady = true;
|
|
17377
17444
|
}
|
|
17378
17445
|
async ensurePages() {
|
|
@@ -17407,6 +17474,38 @@ var TursoVectorStore = class {
|
|
|
17407
17474
|
throw error;
|
|
17408
17475
|
}
|
|
17409
17476
|
}
|
|
17477
|
+
/**
|
|
17478
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17479
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17480
|
+
*/
|
|
17481
|
+
async getChunksDimension() {
|
|
17482
|
+
try {
|
|
17483
|
+
const rs = await this.client.execute(
|
|
17484
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17485
|
+
);
|
|
17486
|
+
if (rs.rows.length === 0) return null;
|
|
17487
|
+
const sql = rs.rows[0].sql;
|
|
17488
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17489
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17490
|
+
} catch {
|
|
17491
|
+
return null;
|
|
17492
|
+
}
|
|
17493
|
+
}
|
|
17494
|
+
/**
|
|
17495
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17496
|
+
* Used by `clean --remote` for a full reset.
|
|
17497
|
+
*/
|
|
17498
|
+
async dropAllTables() {
|
|
17499
|
+
await this.client.batch([
|
|
17500
|
+
"DROP INDEX IF EXISTS idx",
|
|
17501
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17502
|
+
"DROP TABLE IF EXISTS registry",
|
|
17503
|
+
"DROP TABLE IF EXISTS pages"
|
|
17504
|
+
]);
|
|
17505
|
+
this.chunksReady = false;
|
|
17506
|
+
this.registryReady = false;
|
|
17507
|
+
this.pagesReady = false;
|
|
17508
|
+
}
|
|
17410
17509
|
async upsert(records, _scope) {
|
|
17411
17510
|
if (records.length === 0) return;
|
|
17412
17511
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17417,9 +17516,9 @@ var TursoVectorStore = class {
|
|
|
17417
17516
|
const stmts = batch.map((r) => ({
|
|
17418
17517
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17419
17518
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17420
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17519
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17421
17520
|
incoming_links, route_file, tags, embedding)
|
|
17422
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17521
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17423
17522
|
args: [
|
|
17424
17523
|
r.id,
|
|
17425
17524
|
r.metadata.projectId,
|
|
@@ -17430,6 +17529,8 @@ var TursoVectorStore = class {
|
|
|
17430
17529
|
r.metadata.sectionTitle,
|
|
17431
17530
|
JSON.stringify(r.metadata.headingPath),
|
|
17432
17531
|
r.metadata.snippet,
|
|
17532
|
+
r.metadata.chunkText,
|
|
17533
|
+
r.metadata.ordinal,
|
|
17433
17534
|
r.metadata.contentHash,
|
|
17434
17535
|
r.metadata.modelId,
|
|
17435
17536
|
r.metadata.depth,
|
|
@@ -17448,7 +17549,8 @@ var TursoVectorStore = class {
|
|
|
17448
17549
|
const queryJson = JSON.stringify(queryVector);
|
|
17449
17550
|
const rs = await this.client.execute({
|
|
17450
17551
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17451
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17552
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17553
|
+
c.ordinal, c.content_hash,
|
|
17452
17554
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17453
17555
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17454
17556
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
@@ -17492,6 +17594,8 @@ var TursoVectorStore = class {
|
|
|
17492
17594
|
sectionTitle: row.section_title,
|
|
17493
17595
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17494
17596
|
snippet: row.snippet,
|
|
17597
|
+
chunkText: row.chunk_text || "",
|
|
17598
|
+
ordinal: row.ordinal || 0,
|
|
17495
17599
|
contentHash: row.content_hash,
|
|
17496
17600
|
modelId: row.model_id,
|
|
17497
17601
|
depth: row.depth,
|
|
@@ -17687,10 +17791,10 @@ var TursoVectorStore = class {
|
|
|
17687
17791
|
// src/vector/factory.ts
|
|
17688
17792
|
async function createVectorStore(config, cwd) {
|
|
17689
17793
|
const turso = config.vector.turso;
|
|
17690
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17794
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17691
17795
|
if (remoteUrl) {
|
|
17692
17796
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17693
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17797
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17694
17798
|
const client2 = createClient2({
|
|
17695
17799
|
url: remoteUrl,
|
|
17696
17800
|
authToken
|
|
@@ -17700,6 +17804,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17700
17804
|
dimension: config.vector.dimension
|
|
17701
17805
|
});
|
|
17702
17806
|
}
|
|
17807
|
+
if (isServerless()) {
|
|
17808
|
+
throw new SearchSocketError(
|
|
17809
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17810
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17811
|
+
);
|
|
17812
|
+
}
|
|
17703
17813
|
const { createClient } = await import('@libsql/client');
|
|
17704
17814
|
const localPath = path.resolve(cwd, turso.localPath);
|
|
17705
17815
|
fs.mkdirSync(path.dirname(localPath), { recursive: true });
|
|
@@ -19138,14 +19248,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19138
19248
|
var Logger = class {
|
|
19139
19249
|
json;
|
|
19140
19250
|
verbose;
|
|
19251
|
+
quiet;
|
|
19141
19252
|
stderrOnly;
|
|
19142
19253
|
constructor(opts = {}) {
|
|
19143
19254
|
this.json = opts.json ?? false;
|
|
19144
19255
|
this.verbose = opts.verbose ?? false;
|
|
19256
|
+
this.quiet = opts.quiet ?? false;
|
|
19145
19257
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19146
19258
|
}
|
|
19147
19259
|
info(message) {
|
|
19148
|
-
if (this.json) {
|
|
19260
|
+
if (this.quiet || this.json) {
|
|
19149
19261
|
return;
|
|
19150
19262
|
}
|
|
19151
19263
|
this.writeOut(`${message}
|
|
@@ -19159,7 +19271,7 @@ var Logger = class {
|
|
|
19159
19271
|
this.logJson("debug", { message });
|
|
19160
19272
|
return;
|
|
19161
19273
|
}
|
|
19162
|
-
this.writeOut(
|
|
19274
|
+
this.writeOut(` ${message}
|
|
19163
19275
|
`);
|
|
19164
19276
|
}
|
|
19165
19277
|
warn(message) {
|
|
@@ -19186,7 +19298,7 @@ var Logger = class {
|
|
|
19186
19298
|
this.logJson(event, data);
|
|
19187
19299
|
return;
|
|
19188
19300
|
}
|
|
19189
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19301
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19190
19302
|
`);
|
|
19191
19303
|
}
|
|
19192
19304
|
writeOut(text) {
|
|
@@ -19371,11 +19483,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19371
19483
|
|
|
19372
19484
|
// src/indexing/sources/build/index.ts
|
|
19373
19485
|
var logger = new Logger();
|
|
19486
|
+
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
|
|
19487
|
+
const $ = load(html);
|
|
19488
|
+
const links = [];
|
|
19489
|
+
$("a[href]").each((_i, el) => {
|
|
19490
|
+
const href = $(el).attr("href");
|
|
19491
|
+
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
19492
|
+
return;
|
|
19493
|
+
}
|
|
19494
|
+
try {
|
|
19495
|
+
const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
|
|
19496
|
+
if (resolved.origin !== baseOrigin) return;
|
|
19497
|
+
if (!["http:", "https:"].includes(resolved.protocol)) return;
|
|
19498
|
+
links.push(normalizeUrlPath(resolved.pathname));
|
|
19499
|
+
} catch {
|
|
19500
|
+
}
|
|
19501
|
+
});
|
|
19502
|
+
return [...new Set(links)];
|
|
19503
|
+
}
|
|
19504
|
+
async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
19505
|
+
const { seedUrls, maxDepth, exclude } = buildConfig;
|
|
19506
|
+
const baseOrigin = new URL(server.baseUrl).origin;
|
|
19507
|
+
let effectiveMax = buildConfig.maxPages;
|
|
19508
|
+
if (typeof pipelineMaxPages === "number") {
|
|
19509
|
+
const floored = Math.max(0, Math.floor(pipelineMaxPages));
|
|
19510
|
+
effectiveMax = Math.min(effectiveMax, floored);
|
|
19511
|
+
}
|
|
19512
|
+
if (effectiveMax === 0) return [];
|
|
19513
|
+
const visited = /* @__PURE__ */ new Set();
|
|
19514
|
+
const pages = [];
|
|
19515
|
+
const queue = [];
|
|
19516
|
+
const limit = pLimit2(8);
|
|
19517
|
+
for (const seed of seedUrls) {
|
|
19518
|
+
const normalized = normalizeUrlPath(seed);
|
|
19519
|
+
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
19520
|
+
visited.add(normalized);
|
|
19521
|
+
queue.push({ url: normalized, depth: 0 });
|
|
19522
|
+
}
|
|
19523
|
+
}
|
|
19524
|
+
while (queue.length > 0 && pages.length < effectiveMax) {
|
|
19525
|
+
const remaining = effectiveMax - pages.length;
|
|
19526
|
+
const batch = queue.splice(0, remaining);
|
|
19527
|
+
const results = await Promise.allSettled(
|
|
19528
|
+
batch.map(
|
|
19529
|
+
(item) => limit(async () => {
|
|
19530
|
+
const fullUrl = joinUrl(server.baseUrl, item.url);
|
|
19531
|
+
const response = await fetch(fullUrl);
|
|
19532
|
+
if (!response.ok) {
|
|
19533
|
+
logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
|
|
19534
|
+
return null;
|
|
19535
|
+
}
|
|
19536
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
19537
|
+
if (!contentType.includes("text/html")) {
|
|
19538
|
+
return null;
|
|
19539
|
+
}
|
|
19540
|
+
const html = await response.text();
|
|
19541
|
+
if (item.depth < maxDepth) {
|
|
19542
|
+
const links = extractLinksFromHtml(html, item.url, baseOrigin);
|
|
19543
|
+
for (const link of links) {
|
|
19544
|
+
if (!visited.has(link) && !isExcluded(link, exclude)) {
|
|
19545
|
+
visited.add(link);
|
|
19546
|
+
queue.push({ url: link, depth: item.depth + 1 });
|
|
19547
|
+
}
|
|
19548
|
+
}
|
|
19549
|
+
}
|
|
19550
|
+
return {
|
|
19551
|
+
url: item.url,
|
|
19552
|
+
html,
|
|
19553
|
+
sourcePath: fullUrl,
|
|
19554
|
+
outgoingLinks: []
|
|
19555
|
+
};
|
|
19556
|
+
})
|
|
19557
|
+
)
|
|
19558
|
+
);
|
|
19559
|
+
for (const result of results) {
|
|
19560
|
+
if (result.status === "fulfilled" && result.value) {
|
|
19561
|
+
pages.push(result.value);
|
|
19562
|
+
}
|
|
19563
|
+
}
|
|
19564
|
+
}
|
|
19565
|
+
if (pages.length >= effectiveMax && queue.length > 0) {
|
|
19566
|
+
logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
|
|
19567
|
+
}
|
|
19568
|
+
logger.event("build_discover_complete", {
|
|
19569
|
+
pagesFound: pages.length,
|
|
19570
|
+
urlsVisited: visited.size,
|
|
19571
|
+
urlsSkipped: queue.length
|
|
19572
|
+
});
|
|
19573
|
+
return pages;
|
|
19574
|
+
}
|
|
19374
19575
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19375
19576
|
const buildConfig = config.source.build;
|
|
19376
19577
|
if (!buildConfig) {
|
|
19377
19578
|
throw new Error("build source config is missing");
|
|
19378
19579
|
}
|
|
19580
|
+
if (buildConfig.discover) {
|
|
19581
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19582
|
+
try {
|
|
19583
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
19584
|
+
} finally {
|
|
19585
|
+
await server2.shutdown();
|
|
19586
|
+
}
|
|
19587
|
+
}
|
|
19379
19588
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19380
19589
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19381
19590
|
logger.event("build_routes_discovered", {
|
|
@@ -19386,7 +19595,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19386
19595
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19387
19596
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19388
19597
|
try {
|
|
19389
|
-
const concurrencyLimit =
|
|
19598
|
+
const concurrencyLimit = pLimit2(8);
|
|
19390
19599
|
const results = await Promise.allSettled(
|
|
19391
19600
|
selected.map(
|
|
19392
19601
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19555,7 +19764,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
19555
19764
|
const routes = await resolveRoutes(config);
|
|
19556
19765
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
19557
19766
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
19558
|
-
const concurrencyLimit =
|
|
19767
|
+
const concurrencyLimit = pLimit2(8);
|
|
19559
19768
|
const results = await Promise.allSettled(
|
|
19560
19769
|
selected.map(
|
|
19561
19770
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19617,9 +19826,7 @@ function hrTimeMs(start) {
|
|
|
19617
19826
|
|
|
19618
19827
|
// src/indexing/pipeline.ts
|
|
19619
19828
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19620
|
-
"
|
|
19621
|
-
"text-embedding-3-large": 13e-5,
|
|
19622
|
-
"text-embedding-ada-002": 1e-4
|
|
19829
|
+
"jina-embeddings-v3": 2e-5
|
|
19623
19830
|
};
|
|
19624
19831
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
19625
19832
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -19665,9 +19872,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19665
19872
|
};
|
|
19666
19873
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19667
19874
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
19875
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19876
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
19668
19877
|
if (options.force) {
|
|
19878
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19669
19879
|
await cleanMirrorForScope(statePath, scope);
|
|
19670
19880
|
}
|
|
19881
|
+
if (options.dryRun) {
|
|
19882
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19883
|
+
}
|
|
19671
19884
|
const manifestStart = stageStart();
|
|
19672
19885
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
19673
19886
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -19678,8 +19891,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19678
19891
|
);
|
|
19679
19892
|
}
|
|
19680
19893
|
stageEnd("manifest", manifestStart);
|
|
19894
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
19681
19895
|
const sourceStart = stageStart();
|
|
19682
|
-
|
|
19896
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19683
19897
|
let sourcePages;
|
|
19684
19898
|
if (sourceMode === "static-output") {
|
|
19685
19899
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -19691,10 +19905,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19691
19905
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
19692
19906
|
}
|
|
19693
19907
|
stageEnd("source", sourceStart);
|
|
19908
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
19694
19909
|
const routeStart = stageStart();
|
|
19695
19910
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19696
19911
|
stageEnd("route_map", routeStart);
|
|
19912
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
19697
19913
|
const extractStart = stageStart();
|
|
19914
|
+
this.logger.info("Extracting content...");
|
|
19698
19915
|
const extractedPages = [];
|
|
19699
19916
|
for (const sourcePage of sourcePages) {
|
|
19700
19917
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -19723,6 +19940,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19723
19940
|
uniquePages.push(page);
|
|
19724
19941
|
}
|
|
19725
19942
|
stageEnd("extract", extractStart);
|
|
19943
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
19944
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19726
19945
|
const linkStart = stageStart();
|
|
19727
19946
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
19728
19947
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -19738,7 +19957,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19738
19957
|
}
|
|
19739
19958
|
}
|
|
19740
19959
|
stageEnd("links", linkStart);
|
|
19960
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
19741
19961
|
const mirrorStart = stageStart();
|
|
19962
|
+
this.logger.info("Writing mirror pages...");
|
|
19742
19963
|
const mirrorPages = [];
|
|
19743
19964
|
let routeExact = 0;
|
|
19744
19965
|
let routeBestEffort = 0;
|
|
@@ -19808,7 +20029,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19808
20029
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
19809
20030
|
}
|
|
19810
20031
|
stageEnd("mirror", mirrorStart);
|
|
20032
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
19811
20033
|
const chunkStart = stageStart();
|
|
20034
|
+
this.logger.info("Chunking pages...");
|
|
19812
20035
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
19813
20036
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
19814
20037
|
if (typeof maxChunks === "number") {
|
|
@@ -19821,6 +20044,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19821
20044
|
});
|
|
19822
20045
|
}
|
|
19823
20046
|
stageEnd("chunk", chunkStart);
|
|
20047
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
19824
20048
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
19825
20049
|
for (const chunk of chunks) {
|
|
19826
20050
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -19839,6 +20063,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19839
20063
|
return existingHash !== chunk.contentHash;
|
|
19840
20064
|
});
|
|
19841
20065
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20066
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19842
20067
|
const embedStart = stageStart();
|
|
19843
20068
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
19844
20069
|
for (const chunk of changedChunks) {
|
|
@@ -19853,9 +20078,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19853
20078
|
let newEmbeddings = 0;
|
|
19854
20079
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
19855
20080
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20081
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
19856
20082
|
const embeddings = await this.embeddings.embedTexts(
|
|
19857
20083
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
19858
|
-
this.config.embeddings.model
|
|
20084
|
+
this.config.embeddings.model,
|
|
20085
|
+
"retrieval.passage"
|
|
19859
20086
|
);
|
|
19860
20087
|
if (embeddings.length !== changedChunks.length) {
|
|
19861
20088
|
throw new SearchSocketError(
|
|
@@ -19878,8 +20105,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19878
20105
|
}
|
|
19879
20106
|
}
|
|
19880
20107
|
stageEnd("embedding", embedStart);
|
|
20108
|
+
if (changedChunks.length > 0) {
|
|
20109
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20110
|
+
} else {
|
|
20111
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20112
|
+
}
|
|
19881
20113
|
const syncStart = stageStart();
|
|
19882
20114
|
if (!options.dryRun) {
|
|
20115
|
+
this.logger.info("Syncing vectors...");
|
|
19883
20116
|
const upserts = [];
|
|
19884
20117
|
for (const chunk of changedChunks) {
|
|
19885
20118
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -19898,6 +20131,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19898
20131
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
19899
20132
|
headingPath: chunk.headingPath,
|
|
19900
20133
|
snippet: chunk.snippet,
|
|
20134
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20135
|
+
ordinal: chunk.ordinal,
|
|
19901
20136
|
contentHash: chunk.contentHash,
|
|
19902
20137
|
modelId: this.config.embeddings.model,
|
|
19903
20138
|
depth: chunk.depth,
|
|
@@ -19917,6 +20152,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19917
20152
|
}
|
|
19918
20153
|
}
|
|
19919
20154
|
stageEnd("sync", syncStart);
|
|
20155
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
19920
20156
|
const finalizeStart = stageStart();
|
|
19921
20157
|
if (!options.dryRun) {
|
|
19922
20158
|
const scopeInfo = {
|
|
@@ -19936,6 +20172,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19936
20172
|
});
|
|
19937
20173
|
}
|
|
19938
20174
|
stageEnd("finalize", finalizeStart);
|
|
20175
|
+
this.logger.info("Done.");
|
|
19939
20176
|
return {
|
|
19940
20177
|
pagesProcessed: mirrorPages.length,
|
|
19941
20178
|
chunksTotal: chunks.length,
|
|
@@ -20096,7 +20333,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20096
20333
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20097
20334
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20098
20335
|
const embedStart = process.hrtime.bigint();
|
|
20099
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
20336
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20100
20337
|
const queryVector = queryEmbeddings[0];
|
|
20101
20338
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20102
20339
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -20124,13 +20361,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
20124
20361
|
usedRerank = true;
|
|
20125
20362
|
}
|
|
20126
20363
|
let results;
|
|
20364
|
+
const minScore = this.config.ranking.minScore;
|
|
20127
20365
|
if (groupByPage) {
|
|
20128
|
-
|
|
20366
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
20367
|
+
if (minScore > 0) {
|
|
20368
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20369
|
+
}
|
|
20129
20370
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20130
20371
|
results = pages.slice(0, topK).map((page) => {
|
|
20131
20372
|
const bestScore = page.bestChunk.finalScore;
|
|
20132
|
-
const
|
|
20133
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20373
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20374
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
20134
20375
|
return {
|
|
20135
20376
|
url: page.url,
|
|
20136
20377
|
title: page.title,
|
|
@@ -20147,6 +20388,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
20147
20388
|
};
|
|
20148
20389
|
});
|
|
20149
20390
|
} else {
|
|
20391
|
+
if (minScore > 0) {
|
|
20392
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20393
|
+
}
|
|
20150
20394
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20151
20395
|
url: hit.metadata.url,
|
|
20152
20396
|
title: hit.metadata.title,
|
|
@@ -20218,43 +20462,54 @@ var SearchEngine = class _SearchEngine {
|
|
|
20218
20462
|
}
|
|
20219
20463
|
}
|
|
20220
20464
|
async rerankHits(query, ranked, topK) {
|
|
20221
|
-
if (this.config.rerank.
|
|
20465
|
+
if (!this.config.rerank.enabled) {
|
|
20222
20466
|
throw new SearchSocketError(
|
|
20223
20467
|
"INVALID_REQUEST",
|
|
20224
|
-
"rerank=true requested but rerank.
|
|
20468
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
20225
20469
|
400
|
|
20226
20470
|
);
|
|
20227
20471
|
}
|
|
20228
20472
|
if (!this.reranker) {
|
|
20229
20473
|
throw new SearchSocketError(
|
|
20230
20474
|
"CONFIG_MISSING",
|
|
20231
|
-
`rerank=true requested but ${this.config.
|
|
20475
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
20232
20476
|
400
|
|
20233
20477
|
);
|
|
20234
20478
|
}
|
|
20235
|
-
const
|
|
20236
|
-
|
|
20237
|
-
|
|
20238
|
-
|
|
20479
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
20480
|
+
for (const entry of ranked) {
|
|
20481
|
+
const url = entry.hit.metadata.url;
|
|
20482
|
+
const group = pageGroups.get(url);
|
|
20483
|
+
if (group) group.push(entry);
|
|
20484
|
+
else pageGroups.set(url, [entry]);
|
|
20485
|
+
}
|
|
20486
|
+
const pageCandidates = [];
|
|
20487
|
+
for (const [url, chunks] of pageGroups) {
|
|
20488
|
+
const sorted = [...chunks].sort(
|
|
20489
|
+
(a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
|
|
20490
|
+
);
|
|
20491
|
+
const title = sorted[0].hit.metadata.title;
|
|
20492
|
+
const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20493
|
+
pageCandidates.push({ id: url, text: `${title}
|
|
20494
|
+
|
|
20495
|
+
${body}` });
|
|
20496
|
+
}
|
|
20239
20497
|
const reranked = await this.reranker.rerank(
|
|
20240
20498
|
query,
|
|
20241
|
-
|
|
20499
|
+
pageCandidates,
|
|
20242
20500
|
Math.max(topK, this.config.rerank.topN)
|
|
20243
20501
|
);
|
|
20244
|
-
const
|
|
20502
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20245
20503
|
return ranked.map((entry) => {
|
|
20246
|
-
const
|
|
20247
|
-
const
|
|
20248
|
-
if (
|
|
20249
|
-
return {
|
|
20250
|
-
...entry,
|
|
20251
|
-
finalScore: safeBaseScore
|
|
20252
|
-
};
|
|
20504
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
20505
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
20506
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
20507
|
+
return { ...entry, finalScore: base };
|
|
20253
20508
|
}
|
|
20254
|
-
const
|
|
20509
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
20255
20510
|
return {
|
|
20256
20511
|
...entry,
|
|
20257
|
-
finalScore: Number.isFinite(
|
|
20512
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
20258
20513
|
};
|
|
20259
20514
|
}).sort((a, b) => {
|
|
20260
20515
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -20452,13 +20707,21 @@ function searchsocketHandle(options = {}) {
|
|
|
20452
20707
|
let rateLimiter = null;
|
|
20453
20708
|
const getConfig = async () => {
|
|
20454
20709
|
if (!configPromise) {
|
|
20455
|
-
|
|
20456
|
-
|
|
20457
|
-
|
|
20458
|
-
})
|
|
20710
|
+
let configP;
|
|
20711
|
+
if (options.config) {
|
|
20712
|
+
configP = Promise.resolve(options.config);
|
|
20713
|
+
} else if (options.rawConfig) {
|
|
20714
|
+
const cwd = options.cwd ?? process.cwd();
|
|
20715
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
20716
|
+
} else {
|
|
20717
|
+
configP = loadConfig({
|
|
20718
|
+
cwd: options.cwd,
|
|
20719
|
+
configPath: options.configPath
|
|
20720
|
+
});
|
|
20721
|
+
}
|
|
20459
20722
|
configPromise = configP.then((config) => {
|
|
20460
20723
|
apiPath = apiPath ?? config.api.path;
|
|
20461
|
-
if (config.api.rateLimit) {
|
|
20724
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
20462
20725
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20463
20726
|
}
|
|
20464
20727
|
return config;
|
|
@@ -20468,10 +20731,9 @@ function searchsocketHandle(options = {}) {
|
|
|
20468
20731
|
};
|
|
20469
20732
|
const getEngine = async () => {
|
|
20470
20733
|
if (!enginePromise) {
|
|
20471
|
-
const config =
|
|
20734
|
+
const config = await getConfig();
|
|
20472
20735
|
enginePromise = SearchEngine.create({
|
|
20473
20736
|
cwd: options.cwd,
|
|
20474
|
-
configPath: options.configPath,
|
|
20475
20737
|
config
|
|
20476
20738
|
});
|
|
20477
20739
|
}
|
|
@@ -20737,6 +20999,6 @@ function createSearchClient(options = {}) {
|
|
|
20737
20999
|
*)
|
|
20738
21000
|
*/
|
|
20739
21001
|
|
|
20740
|
-
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, loadConfig, mergeConfig, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21002
|
+
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
20741
21003
|
//# sourceMappingURL=index.js.map
|
|
20742
21004
|
//# sourceMappingURL=index.js.map
|