searchsocket 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +348 -111
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +3 -3
- package/dist/cli.js.map +0 -1
- package/dist/client.cjs.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/index.cjs.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/sveltekit.cjs.map +0 -1
- package/dist/sveltekit.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -5,8 +5,7 @@ var path = require('path');
|
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
7
|
var child_process = require('child_process');
|
|
8
|
-
var
|
|
9
|
-
var pLimit = require('p-limit');
|
|
8
|
+
var pLimit2 = require('p-limit');
|
|
10
9
|
var crypto = require('crypto');
|
|
11
10
|
var cheerio = require('cheerio');
|
|
12
11
|
var matter = require('gray-matter');
|
|
@@ -23,8 +22,7 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
|
23
22
|
|
|
24
23
|
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
25
24
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
26
|
-
var
|
|
27
|
-
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
25
|
+
var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
|
|
28
26
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
29
27
|
var fs4__default = /*#__PURE__*/_interopDefault(fs4);
|
|
30
28
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
@@ -16633,7 +16631,11 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16633
16631
|
outputDir: zod.z.string().min(1).optional(),
|
|
16634
16632
|
paramValues: zod.z.record(zod.z.string(), zod.z.array(zod.z.string())).optional(),
|
|
16635
16633
|
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16636
|
-
previewTimeout: zod.z.number().int().positive().optional()
|
|
16634
|
+
previewTimeout: zod.z.number().int().positive().optional(),
|
|
16635
|
+
discover: zod.z.boolean().optional(),
|
|
16636
|
+
seedUrls: zod.z.array(zod.z.string()).optional(),
|
|
16637
|
+
maxPages: zod.z.number().int().positive().optional(),
|
|
16638
|
+
maxDepth: zod.z.number().int().nonnegative().optional()
|
|
16637
16639
|
}).optional()
|
|
16638
16640
|
}).optional(),
|
|
16639
16641
|
extract: zod.z.object({
|
|
@@ -16660,8 +16662,9 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16660
16662
|
pageSummaryChunk: zod.z.boolean().optional()
|
|
16661
16663
|
}).optional(),
|
|
16662
16664
|
embeddings: zod.z.object({
|
|
16663
|
-
provider: zod.z.literal("
|
|
16665
|
+
provider: zod.z.literal("jina").optional(),
|
|
16664
16666
|
model: zod.z.string().min(1).optional(),
|
|
16667
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16665
16668
|
apiKeyEnv: zod.z.string().min(1).optional(),
|
|
16666
16669
|
batchSize: zod.z.number().int().positive().optional(),
|
|
16667
16670
|
concurrency: zod.z.number().int().positive().optional(),
|
|
@@ -16670,18 +16673,17 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16670
16673
|
vector: zod.z.object({
|
|
16671
16674
|
dimension: zod.z.number().int().positive().optional(),
|
|
16672
16675
|
turso: zod.z.object({
|
|
16676
|
+
url: zod.z.string().url().optional(),
|
|
16677
|
+
authToken: zod.z.string().min(1).optional(),
|
|
16673
16678
|
urlEnv: zod.z.string().optional(),
|
|
16674
16679
|
authTokenEnv: zod.z.string().optional(),
|
|
16675
16680
|
localPath: zod.z.string().optional()
|
|
16676
16681
|
}).optional()
|
|
16677
16682
|
}).optional(),
|
|
16678
16683
|
rerank: zod.z.object({
|
|
16679
|
-
|
|
16684
|
+
enabled: zod.z.boolean().optional(),
|
|
16680
16685
|
topN: zod.z.number().int().positive().optional(),
|
|
16681
|
-
|
|
16682
|
-
apiKeyEnv: zod.z.string().optional(),
|
|
16683
|
-
model: zod.z.string().optional()
|
|
16684
|
-
}).optional()
|
|
16686
|
+
model: zod.z.string().optional()
|
|
16685
16687
|
}).optional(),
|
|
16686
16688
|
ranking: zod.z.object({
|
|
16687
16689
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
@@ -16690,6 +16692,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16690
16692
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16691
16693
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16692
16694
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16695
|
+
minScore: zod.z.number().min(0).max(1).optional(),
|
|
16693
16696
|
weights: zod.z.object({
|
|
16694
16697
|
incomingLinks: zod.z.number().optional(),
|
|
16695
16698
|
depth: zod.z.number().optional(),
|
|
@@ -16770,9 +16773,9 @@ function createDefaultConfig(projectId) {
|
|
|
16770
16773
|
pageSummaryChunk: true
|
|
16771
16774
|
},
|
|
16772
16775
|
embeddings: {
|
|
16773
|
-
provider: "
|
|
16774
|
-
model: "
|
|
16775
|
-
apiKeyEnv: "
|
|
16776
|
+
provider: "jina",
|
|
16777
|
+
model: "jina-embeddings-v3",
|
|
16778
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16776
16779
|
batchSize: 64,
|
|
16777
16780
|
concurrency: 4
|
|
16778
16781
|
},
|
|
@@ -16784,12 +16787,9 @@ function createDefaultConfig(projectId) {
|
|
|
16784
16787
|
}
|
|
16785
16788
|
},
|
|
16786
16789
|
rerank: {
|
|
16787
|
-
|
|
16790
|
+
enabled: false,
|
|
16788
16791
|
topN: 20,
|
|
16789
|
-
|
|
16790
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16791
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16792
|
-
}
|
|
16792
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16793
16793
|
},
|
|
16794
16794
|
ranking: {
|
|
16795
16795
|
enableIncomingLinkBoost: true,
|
|
@@ -16798,6 +16798,7 @@ function createDefaultConfig(projectId) {
|
|
|
16798
16798
|
aggregationCap: 5,
|
|
16799
16799
|
aggregationDecay: 0.5,
|
|
16800
16800
|
minChunkScoreRatio: 0.5,
|
|
16801
|
+
minScore: 0,
|
|
16801
16802
|
weights: {
|
|
16802
16803
|
incomingLinks: 0.05,
|
|
16803
16804
|
depth: 0.03,
|
|
@@ -16924,7 +16925,11 @@ ${issues}`
|
|
|
16924
16925
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16925
16926
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16926
16927
|
exclude: parsed.source.build.exclude ?? [],
|
|
16927
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16928
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16929
|
+
discover: parsed.source.build.discover ?? false,
|
|
16930
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16931
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16932
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16928
16933
|
} : void 0
|
|
16929
16934
|
},
|
|
16930
16935
|
extract: {
|
|
@@ -16953,11 +16958,7 @@ ${issues}`
|
|
|
16953
16958
|
},
|
|
16954
16959
|
rerank: {
|
|
16955
16960
|
...defaults.rerank,
|
|
16956
|
-
...parsed.rerank
|
|
16957
|
-
jina: {
|
|
16958
|
-
...defaults.rerank.jina,
|
|
16959
|
-
...parsed.rerank?.jina
|
|
16960
|
-
}
|
|
16961
|
+
...parsed.rerank
|
|
16961
16962
|
},
|
|
16962
16963
|
ranking: {
|
|
16963
16964
|
...defaults.ranking,
|
|
@@ -17004,7 +17005,11 @@ ${issues}`
|
|
|
17004
17005
|
outputDir: ".svelte-kit/output",
|
|
17005
17006
|
paramValues: {},
|
|
17006
17007
|
exclude: [],
|
|
17007
|
-
previewTimeout: 3e4
|
|
17008
|
+
previewTimeout: 3e4,
|
|
17009
|
+
discover: false,
|
|
17010
|
+
seedUrls: ["/"],
|
|
17011
|
+
maxPages: 200,
|
|
17012
|
+
maxDepth: 10
|
|
17008
17013
|
};
|
|
17009
17014
|
}
|
|
17010
17015
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17018,6 +17023,21 @@ ${issues}`
|
|
|
17018
17023
|
}
|
|
17019
17024
|
return merged;
|
|
17020
17025
|
}
|
|
17026
|
+
/**
 * Merge a raw config for serverless use, where nothing can be inferred from
 * the filesystem or package.json.
 *
 * Both `project.id` and `source.mode` must be supplied explicitly; once they
 * are present the config is passed to `mergeConfig` together with
 * `process.cwd()`.
 *
 * @param {object} rawConfig - User-supplied configuration object.
 * @returns Whatever `mergeConfig(process.cwd(), rawConfig)` returns.
 * @throws {SearchSocketError} `CONFIG_MISSING` when `project.id` or
 *   `source.mode` is missing, including when `rawConfig` itself is
 *   null or undefined.
 */
function mergeConfigServerless(rawConfig) {
  // Optional chaining starts at rawConfig so that a missing config object is
  // reported as CONFIG_MISSING rather than as an opaque TypeError.
  if (!rawConfig?.project?.id) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`project.id` is required for serverless config (cannot infer from package.json)."
    );
  }
  if (!rawConfig.source?.mode) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`source.mode` is required for serverless config (cannot auto-detect from filesystem)."
    );
  }
  return mergeConfig(process.cwd(), rawConfig);
}
|
|
17021
17041
|
async function loadConfig(options = {}) {
|
|
17022
17042
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
17023
17043
|
const configPath = path__default.default.resolve(cwd, options.configPath ?? "searchsocket.config.ts");
|
|
@@ -17040,6 +17060,11 @@ async function loadConfig(options = {}) {
|
|
|
17040
17060
|
return mergeConfig(cwd, raw);
|
|
17041
17061
|
}
|
|
17042
17062
|
|
|
17063
|
+
// src/core/serverless.ts
|
|
17064
|
+
/**
 * Report whether any of the known serverless marker environment variables
 * is set to a non-empty value.
 *
 * Detection is purely environment based. When the runtime exposes no
 * `process` global (or no `process.env`), there is nothing to inspect and the
 * function returns false instead of throwing a ReferenceError.
 *
 * @returns {boolean} true when at least one marker variable is truthy.
 */
function isServerless() {
  const env = globalThis.process?.env;
  if (!env) {
    return false;
  }
  const markers = [
    "VERCEL",
    "NETLIFY",
    "AWS_LAMBDA_FUNCTION_NAME",
    "FUNCTIONS_WORKER",
    "CF_PAGES"
  ];
  // Truthiness (not mere presence) is what counts: an empty string is "unset".
  return markers.some((name) => Boolean(env[name]));
}
|
|
17067
|
+
|
|
17043
17068
|
// src/utils/text.ts
|
|
17044
17069
|
function normalizeText(input) {
|
|
17045
17070
|
return input.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim();
|
|
@@ -17117,10 +17142,11 @@ function sleep(ms) {
|
|
|
17117
17142
|
setTimeout(resolve, ms);
|
|
17118
17143
|
});
|
|
17119
17144
|
}
|
|
17120
|
-
var
|
|
17121
|
-
|
|
17145
|
+
var JinaEmbeddingsProvider = class {
|
|
17146
|
+
apiKey;
|
|
17122
17147
|
batchSize;
|
|
17123
17148
|
concurrency;
|
|
17149
|
+
defaultTask;
|
|
17124
17150
|
constructor(options) {
|
|
17125
17151
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17126
17152
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17128,11 +17154,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17128
17154
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17129
17155
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17130
17156
|
}
|
|
17131
|
-
this.
|
|
17132
|
-
apiKey: options.apiKey
|
|
17133
|
-
});
|
|
17157
|
+
this.apiKey = options.apiKey;
|
|
17134
17158
|
this.batchSize = options.batchSize;
|
|
17135
17159
|
this.concurrency = options.concurrency;
|
|
17160
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17136
17161
|
}
|
|
17137
17162
|
estimateTokens(text) {
|
|
17138
17163
|
const normalized = text.trim();
|
|
@@ -17146,7 +17171,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17146
17171
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17147
17172
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17148
17173
|
}
|
|
17149
|
-
async embedTexts(texts, modelId) {
|
|
17174
|
+
async embedTexts(texts, modelId, task) {
|
|
17150
17175
|
if (texts.length === 0) {
|
|
17151
17176
|
return [];
|
|
17152
17177
|
}
|
|
@@ -17158,37 +17183,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17158
17183
|
});
|
|
17159
17184
|
}
|
|
17160
17185
|
const outputs = new Array(batches.length);
|
|
17161
|
-
const limit =
|
|
17186
|
+
const limit = pLimit2__default.default(this.concurrency);
|
|
17162
17187
|
await Promise.all(
|
|
17163
17188
|
batches.map(
|
|
17164
17189
|
(batch, position) => limit(async () => {
|
|
17165
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17190
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17166
17191
|
})
|
|
17167
17192
|
)
|
|
17168
17193
|
);
|
|
17169
17194
|
return outputs.flat();
|
|
17170
17195
|
}
|
|
17171
|
-
async embedWithRetry(texts, modelId) {
|
|
17196
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17172
17197
|
const maxAttempts = 5;
|
|
17173
17198
|
let attempt = 0;
|
|
17174
17199
|
while (attempt < maxAttempts) {
|
|
17175
17200
|
attempt += 1;
|
|
17201
|
+
let response;
|
|
17176
17202
|
try {
|
|
17177
|
-
|
|
17178
|
-
|
|
17179
|
-
|
|
17180
|
-
|
|
17203
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17204
|
+
method: "POST",
|
|
17205
|
+
headers: {
|
|
17206
|
+
"content-type": "application/json",
|
|
17207
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17208
|
+
},
|
|
17209
|
+
body: JSON.stringify({
|
|
17210
|
+
model: modelId,
|
|
17211
|
+
input: texts,
|
|
17212
|
+
task
|
|
17213
|
+
})
|
|
17181
17214
|
});
|
|
17182
|
-
return response.data.map((entry) => entry.embedding);
|
|
17183
17215
|
} catch (error) {
|
|
17184
|
-
|
|
17185
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17186
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17216
|
+
if (attempt >= maxAttempts) {
|
|
17187
17217
|
throw error;
|
|
17188
17218
|
}
|
|
17189
|
-
|
|
17190
|
-
|
|
17219
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17220
|
+
continue;
|
|
17221
|
+
}
|
|
17222
|
+
if (!response.ok) {
|
|
17223
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17224
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17225
|
+
const errorBody = await response.text();
|
|
17226
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17227
|
+
}
|
|
17228
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17229
|
+
continue;
|
|
17191
17230
|
}
|
|
17231
|
+
const payload = await response.json();
|
|
17232
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17233
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17234
|
+
}
|
|
17235
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17192
17236
|
}
|
|
17193
17237
|
throw new Error("Unreachable retry state");
|
|
17194
17238
|
}
|
|
@@ -17196,20 +17240,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17196
17240
|
|
|
17197
17241
|
// src/embeddings/factory.ts
|
|
17198
17242
|
function createEmbeddingsProvider(config) {
|
|
17199
|
-
if (config.embeddings.provider !== "
|
|
17243
|
+
if (config.embeddings.provider !== "jina") {
|
|
17200
17244
|
throw new SearchSocketError(
|
|
17201
17245
|
"CONFIG_MISSING",
|
|
17202
17246
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17203
17247
|
);
|
|
17204
17248
|
}
|
|
17205
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17249
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17206
17250
|
if (!apiKey) {
|
|
17207
17251
|
throw new SearchSocketError(
|
|
17208
17252
|
"CONFIG_MISSING",
|
|
17209
|
-
`Missing embeddings API key env var
|
|
17253
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17210
17254
|
);
|
|
17211
17255
|
}
|
|
17212
|
-
return new
|
|
17256
|
+
return new JinaEmbeddingsProvider({
|
|
17213
17257
|
apiKey,
|
|
17214
17258
|
batchSize: config.embeddings.batchSize,
|
|
17215
17259
|
concurrency: config.embeddings.concurrency
|
|
@@ -17299,20 +17343,17 @@ var JinaReranker = class {
|
|
|
17299
17343
|
|
|
17300
17344
|
// src/rerank/factory.ts
|
|
17301
17345
|
function createReranker(config) {
|
|
17302
|
-
if (config.rerank.
|
|
17346
|
+
if (!config.rerank.enabled) {
|
|
17303
17347
|
return null;
|
|
17304
17348
|
}
|
|
17305
|
-
|
|
17306
|
-
|
|
17307
|
-
|
|
17308
|
-
return null;
|
|
17309
|
-
}
|
|
17310
|
-
return new JinaReranker({
|
|
17311
|
-
apiKey,
|
|
17312
|
-
model: config.rerank.jina.model
|
|
17313
|
-
});
|
|
17349
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17350
|
+
if (!apiKey) {
|
|
17351
|
+
return null;
|
|
17314
17352
|
}
|
|
17315
|
-
return
|
|
17353
|
+
return new JinaReranker({
|
|
17354
|
+
apiKey,
|
|
17355
|
+
model: config.rerank.model
|
|
17356
|
+
});
|
|
17316
17357
|
}
|
|
17317
17358
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
17318
17359
|
const statePath = path__default.default.resolve(cwd, stateDir);
|
|
@@ -17365,6 +17406,16 @@ var TursoVectorStore = class {
|
|
|
17365
17406
|
}
|
|
17366
17407
|
async ensureChunks(dim) {
|
|
17367
17408
|
if (this.chunksReady) return;
|
|
17409
|
+
const exists = await this.chunksTableExists();
|
|
17410
|
+
if (exists) {
|
|
17411
|
+
const currentDim = await this.getChunksDimension();
|
|
17412
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17413
|
+
await this.client.batch([
|
|
17414
|
+
"DROP INDEX IF EXISTS idx",
|
|
17415
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17416
|
+
]);
|
|
17417
|
+
}
|
|
17418
|
+
}
|
|
17368
17419
|
await this.client.batch([
|
|
17369
17420
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17370
17421
|
id TEXT PRIMARY KEY,
|
|
@@ -17376,6 +17427,8 @@ var TursoVectorStore = class {
|
|
|
17376
17427
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17377
17428
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17378
17429
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17430
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17431
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17379
17432
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17380
17433
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17381
17434
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
@@ -17386,6 +17439,19 @@ var TursoVectorStore = class {
|
|
|
17386
17439
|
)`,
|
|
17387
17440
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17388
17441
|
]);
|
|
17442
|
+
const chunkMigrationCols = [
|
|
17443
|
+
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17444
|
+
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17445
|
+
];
|
|
17446
|
+
for (const col of chunkMigrationCols) {
|
|
17447
|
+
try {
|
|
17448
|
+
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17449
|
+
} catch (error) {
|
|
17450
|
+
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17451
|
+
throw error;
|
|
17452
|
+
}
|
|
17453
|
+
}
|
|
17454
|
+
}
|
|
17389
17455
|
this.chunksReady = true;
|
|
17390
17456
|
}
|
|
17391
17457
|
async ensurePages() {
|
|
@@ -17420,6 +17486,38 @@ var TursoVectorStore = class {
|
|
|
17420
17486
|
throw error;
|
|
17421
17487
|
}
|
|
17422
17488
|
}
|
|
17489
|
+
/**
|
|
17490
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17491
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17492
|
+
*/
|
|
17493
|
+
async getChunksDimension() {
|
|
17494
|
+
try {
|
|
17495
|
+
const rs = await this.client.execute(
|
|
17496
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17497
|
+
);
|
|
17498
|
+
if (rs.rows.length === 0) return null;
|
|
17499
|
+
const sql = rs.rows[0].sql;
|
|
17500
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17501
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17502
|
+
} catch {
|
|
17503
|
+
return null;
|
|
17504
|
+
}
|
|
17505
|
+
}
|
|
17506
|
+
/**
|
|
17507
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17508
|
+
* Used by `clean --remote` for a full reset.
|
|
17509
|
+
*/
|
|
17510
|
+
async dropAllTables() {
|
|
17511
|
+
await this.client.batch([
|
|
17512
|
+
"DROP INDEX IF EXISTS idx",
|
|
17513
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17514
|
+
"DROP TABLE IF EXISTS registry",
|
|
17515
|
+
"DROP TABLE IF EXISTS pages"
|
|
17516
|
+
]);
|
|
17517
|
+
this.chunksReady = false;
|
|
17518
|
+
this.registryReady = false;
|
|
17519
|
+
this.pagesReady = false;
|
|
17520
|
+
}
|
|
17423
17521
|
async upsert(records, _scope) {
|
|
17424
17522
|
if (records.length === 0) return;
|
|
17425
17523
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17430,9 +17528,9 @@ var TursoVectorStore = class {
|
|
|
17430
17528
|
const stmts = batch.map((r) => ({
|
|
17431
17529
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17432
17530
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17433
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17531
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17434
17532
|
incoming_links, route_file, tags, embedding)
|
|
17435
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17533
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17436
17534
|
args: [
|
|
17437
17535
|
r.id,
|
|
17438
17536
|
r.metadata.projectId,
|
|
@@ -17443,6 +17541,8 @@ var TursoVectorStore = class {
|
|
|
17443
17541
|
r.metadata.sectionTitle,
|
|
17444
17542
|
JSON.stringify(r.metadata.headingPath),
|
|
17445
17543
|
r.metadata.snippet,
|
|
17544
|
+
r.metadata.chunkText,
|
|
17545
|
+
r.metadata.ordinal,
|
|
17446
17546
|
r.metadata.contentHash,
|
|
17447
17547
|
r.metadata.modelId,
|
|
17448
17548
|
r.metadata.depth,
|
|
@@ -17461,7 +17561,8 @@ var TursoVectorStore = class {
|
|
|
17461
17561
|
const queryJson = JSON.stringify(queryVector);
|
|
17462
17562
|
const rs = await this.client.execute({
|
|
17463
17563
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17464
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17564
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17565
|
+
c.ordinal, c.content_hash,
|
|
17465
17566
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17466
17567
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17467
17568
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
@@ -17505,6 +17606,8 @@ var TursoVectorStore = class {
|
|
|
17505
17606
|
sectionTitle: row.section_title,
|
|
17506
17607
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17507
17608
|
snippet: row.snippet,
|
|
17609
|
+
chunkText: row.chunk_text || "",
|
|
17610
|
+
ordinal: row.ordinal || 0,
|
|
17508
17611
|
contentHash: row.content_hash,
|
|
17509
17612
|
modelId: row.model_id,
|
|
17510
17613
|
depth: row.depth,
|
|
@@ -17700,10 +17803,10 @@ var TursoVectorStore = class {
|
|
|
17700
17803
|
// src/vector/factory.ts
|
|
17701
17804
|
async function createVectorStore(config, cwd) {
|
|
17702
17805
|
const turso = config.vector.turso;
|
|
17703
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17806
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17704
17807
|
if (remoteUrl) {
|
|
17705
17808
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17706
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17809
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17707
17810
|
const client2 = createClient2({
|
|
17708
17811
|
url: remoteUrl,
|
|
17709
17812
|
authToken
|
|
@@ -17713,6 +17816,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17713
17816
|
dimension: config.vector.dimension
|
|
17714
17817
|
});
|
|
17715
17818
|
}
|
|
17819
|
+
if (isServerless()) {
|
|
17820
|
+
throw new SearchSocketError(
|
|
17821
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17822
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17823
|
+
);
|
|
17824
|
+
}
|
|
17716
17825
|
const { createClient } = await import('@libsql/client');
|
|
17717
17826
|
const localPath = path__default.default.resolve(cwd, turso.localPath);
|
|
17718
17827
|
fs__default.default.mkdirSync(path__default.default.dirname(localPath), { recursive: true });
|
|
@@ -19151,14 +19260,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19151
19260
|
var Logger = class {
|
|
19152
19261
|
json;
|
|
19153
19262
|
verbose;
|
|
19263
|
+
quiet;
|
|
19154
19264
|
stderrOnly;
|
|
19155
19265
|
constructor(opts = {}) {
|
|
19156
19266
|
this.json = opts.json ?? false;
|
|
19157
19267
|
this.verbose = opts.verbose ?? false;
|
|
19268
|
+
this.quiet = opts.quiet ?? false;
|
|
19158
19269
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19159
19270
|
}
|
|
19160
19271
|
info(message) {
|
|
19161
|
-
if (this.json) {
|
|
19272
|
+
if (this.quiet || this.json) {
|
|
19162
19273
|
return;
|
|
19163
19274
|
}
|
|
19164
19275
|
this.writeOut(`${message}
|
|
@@ -19172,7 +19283,7 @@ var Logger = class {
|
|
|
19172
19283
|
this.logJson("debug", { message });
|
|
19173
19284
|
return;
|
|
19174
19285
|
}
|
|
19175
|
-
this.writeOut(
|
|
19286
|
+
this.writeOut(` ${message}
|
|
19176
19287
|
`);
|
|
19177
19288
|
}
|
|
19178
19289
|
warn(message) {
|
|
@@ -19199,7 +19310,7 @@ var Logger = class {
|
|
|
19199
19310
|
this.logJson(event, data);
|
|
19200
19311
|
return;
|
|
19201
19312
|
}
|
|
19202
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19313
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19203
19314
|
`);
|
|
19204
19315
|
}
|
|
19205
19316
|
writeOut(text) {
|
|
@@ -19384,11 +19495,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19384
19495
|
|
|
19385
19496
|
// src/indexing/sources/build/index.ts
|
|
19386
19497
|
var logger = new Logger();
|
|
19498
|
+
/**
 * Collect the same-origin link paths found in an HTML document.
 *
 * Each `<a href>` is resolved against `${baseOrigin}${pageUrl}`. Anchors,
 * `mailto:`, `tel:` and `javascript:` hrefs are ignored, as are links whose
 * origin differs from `baseOrigin` or whose protocol is not http(s). The
 * pathname of every remaining link is passed through `normalizeUrlPath`.
 *
 * @param {string} html - Page markup to scan.
 * @param {string} pageUrl - Path of the page, used as the base for relative hrefs.
 * @param {string} baseOrigin - Origin that links must share to be kept.
 * @returns {string[]} De-duplicated paths in order of first appearance.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
  const $ = cheerio.load(html);
  const skippedPrefixes = ["#", "mailto:", "tel:", "javascript:"];
  const allowedProtocols = ["http:", "https:"];
  const base = `${baseOrigin}${pageUrl}`;
  const uniquePaths = /* @__PURE__ */ new Set();
  $("a[href]").each((_index, anchor) => {
    const href = $(anchor).attr("href");
    if (!href) {
      return;
    }
    if (skippedPrefixes.some((prefix) => href.startsWith(prefix))) {
      return;
    }
    try {
      const target = new URL(href, base);
      const sameOrigin = target.origin === baseOrigin;
      if (sameOrigin && allowedProtocols.includes(target.protocol)) {
        uniquePaths.add(normalizeUrlPath(target.pathname));
      }
    } catch {
      // Unparseable hrefs are ignored.
    }
  });
  return Array.from(uniquePaths);
}
|
|
19516
|
+
/**
 * Crawl the preview server starting from the configured seed URLs and return
 * the HTML pages that were fetched.
 *
 * The queue is consumed front-to-back in batches no larger than the number of
 * pages still allowed; links found on a page are appended to the queue when
 * the page's depth is below `maxDepth`. At most 8 fetches run at once.
 *
 * @param {{ baseUrl: string }} server - Running preview server.
 * @param {object} buildConfig - Build source config; reads `seedUrls`,
 *   `maxDepth`, `exclude` and `maxPages`.
 * @param {number} [pipelineMaxPages] - Optional additional page cap; it is
 *   floored, clamped to >= 0, and combined with `buildConfig.maxPages` via min.
 * @returns {Promise<Array<{url: string, html: string, sourcePath: string, outgoingLinks: string[]}>>}
 *   Fetched pages. Non-OK responses, non-HTML responses and failed fetches are
 *   skipped.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;
  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];
  const visited = /* @__PURE__ */ new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2__default.default(8);
  for (const seed of seedUrls) {
    const normalized = normalizeUrlPath(seed);
    if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
      visited.add(normalized);
      queue.push({ url: normalized, depth: 0 });
    }
  }
  while (queue.length > 0 && pages.length < effectiveMax) {
    // Never fetch more than the number of pages still allowed.
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            const links = extractLinksFromHtml(html, item.url, baseOrigin);
            for (const link of links) {
              if (!visited.has(link) && !isExcluded(link, exclude)) {
                visited.add(link);
                queue.push({ url: link, depth: item.depth + 1 });
              }
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            outgoingLinks: []
          };
        })
      )
    );
    results.forEach((result, index) => {
      if (result.status === "rejected") {
        // A failed fetch is still skipped, but no longer silently: report
        // which URL failed and why so a truncated crawl can be diagnosed.
        const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
        logger.warn(`Skipping ${batch[index].url}: ${reason}`);
        return;
      }
      if (result.value) {
        pages.push(result.value);
      }
    });
  }
  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
|
|
19387
19587
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19388
19588
|
const buildConfig = config.source.build;
|
|
19389
19589
|
if (!buildConfig) {
|
|
19390
19590
|
throw new Error("build source config is missing");
|
|
19391
19591
|
}
|
|
19592
|
+
if (buildConfig.discover) {
|
|
19593
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19594
|
+
try {
|
|
19595
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
19596
|
+
} finally {
|
|
19597
|
+
await server2.shutdown();
|
|
19598
|
+
}
|
|
19599
|
+
}
|
|
19392
19600
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19393
19601
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19394
19602
|
logger.event("build_routes_discovered", {
|
|
@@ -19399,7 +19607,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19399
19607
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19400
19608
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19401
19609
|
try {
|
|
19402
|
-
const concurrencyLimit =
|
|
19610
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
19403
19611
|
const results = await Promise.allSettled(
|
|
19404
19612
|
selected.map(
|
|
19405
19613
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19568,7 +19776,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
19568
19776
|
const routes = await resolveRoutes(config);
|
|
19569
19777
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
19570
19778
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
19571
|
-
const concurrencyLimit =
|
|
19779
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
19572
19780
|
const results = await Promise.allSettled(
|
|
19573
19781
|
selected.map(
|
|
19574
19782
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19630,9 +19838,7 @@ function hrTimeMs(start) {
|
|
|
19630
19838
|
|
|
19631
19839
|
// src/indexing/pipeline.ts
|
|
19632
19840
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19633
|
-
"
|
|
19634
|
-
"text-embedding-3-large": 13e-5,
|
|
19635
|
-
"text-embedding-ada-002": 1e-4
|
|
19841
|
+
"jina-embeddings-v3": 2e-5
|
|
19636
19842
|
};
|
|
19637
19843
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
19638
19844
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -19678,9 +19884,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19678
19884
|
};
|
|
19679
19885
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19680
19886
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
19887
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19888
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
19681
19889
|
if (options.force) {
|
|
19890
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19682
19891
|
await cleanMirrorForScope(statePath, scope);
|
|
19683
19892
|
}
|
|
19893
|
+
if (options.dryRun) {
|
|
19894
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19895
|
+
}
|
|
19684
19896
|
const manifestStart = stageStart();
|
|
19685
19897
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
19686
19898
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -19691,8 +19903,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19691
19903
|
);
|
|
19692
19904
|
}
|
|
19693
19905
|
stageEnd("manifest", manifestStart);
|
|
19906
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
19694
19907
|
const sourceStart = stageStart();
|
|
19695
|
-
|
|
19908
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19696
19909
|
let sourcePages;
|
|
19697
19910
|
if (sourceMode === "static-output") {
|
|
19698
19911
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -19704,10 +19917,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19704
19917
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
19705
19918
|
}
|
|
19706
19919
|
stageEnd("source", sourceStart);
|
|
19920
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
19707
19921
|
const routeStart = stageStart();
|
|
19708
19922
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19709
19923
|
stageEnd("route_map", routeStart);
|
|
19924
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
19710
19925
|
const extractStart = stageStart();
|
|
19926
|
+
this.logger.info("Extracting content...");
|
|
19711
19927
|
const extractedPages = [];
|
|
19712
19928
|
for (const sourcePage of sourcePages) {
|
|
19713
19929
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -19736,6 +19952,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19736
19952
|
uniquePages.push(page);
|
|
19737
19953
|
}
|
|
19738
19954
|
stageEnd("extract", extractStart);
|
|
19955
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
19956
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19739
19957
|
const linkStart = stageStart();
|
|
19740
19958
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
19741
19959
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -19751,7 +19969,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19751
19969
|
}
|
|
19752
19970
|
}
|
|
19753
19971
|
stageEnd("links", linkStart);
|
|
19972
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
19754
19973
|
const mirrorStart = stageStart();
|
|
19974
|
+
this.logger.info("Writing mirror pages...");
|
|
19755
19975
|
const mirrorPages = [];
|
|
19756
19976
|
let routeExact = 0;
|
|
19757
19977
|
let routeBestEffort = 0;
|
|
@@ -19821,7 +20041,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19821
20041
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
19822
20042
|
}
|
|
19823
20043
|
stageEnd("mirror", mirrorStart);
|
|
20044
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
19824
20045
|
const chunkStart = stageStart();
|
|
20046
|
+
this.logger.info("Chunking pages...");
|
|
19825
20047
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
19826
20048
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
19827
20049
|
if (typeof maxChunks === "number") {
|
|
@@ -19834,6 +20056,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19834
20056
|
});
|
|
19835
20057
|
}
|
|
19836
20058
|
stageEnd("chunk", chunkStart);
|
|
20059
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
19837
20060
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
19838
20061
|
for (const chunk of chunks) {
|
|
19839
20062
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -19852,6 +20075,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19852
20075
|
return existingHash !== chunk.contentHash;
|
|
19853
20076
|
});
|
|
19854
20077
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20078
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19855
20079
|
const embedStart = stageStart();
|
|
19856
20080
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
19857
20081
|
for (const chunk of changedChunks) {
|
|
@@ -19866,9 +20090,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19866
20090
|
let newEmbeddings = 0;
|
|
19867
20091
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
19868
20092
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20093
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
19869
20094
|
const embeddings = await this.embeddings.embedTexts(
|
|
19870
20095
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
19871
|
-
this.config.embeddings.model
|
|
20096
|
+
this.config.embeddings.model,
|
|
20097
|
+
"retrieval.passage"
|
|
19872
20098
|
);
|
|
19873
20099
|
if (embeddings.length !== changedChunks.length) {
|
|
19874
20100
|
throw new SearchSocketError(
|
|
@@ -19891,8 +20117,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19891
20117
|
}
|
|
19892
20118
|
}
|
|
19893
20119
|
stageEnd("embedding", embedStart);
|
|
20120
|
+
if (changedChunks.length > 0) {
|
|
20121
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20122
|
+
} else {
|
|
20123
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20124
|
+
}
|
|
19894
20125
|
const syncStart = stageStart();
|
|
19895
20126
|
if (!options.dryRun) {
|
|
20127
|
+
this.logger.info("Syncing vectors...");
|
|
19896
20128
|
const upserts = [];
|
|
19897
20129
|
for (const chunk of changedChunks) {
|
|
19898
20130
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -19911,6 +20143,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19911
20143
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
19912
20144
|
headingPath: chunk.headingPath,
|
|
19913
20145
|
snippet: chunk.snippet,
|
|
20146
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20147
|
+
ordinal: chunk.ordinal,
|
|
19914
20148
|
contentHash: chunk.contentHash,
|
|
19915
20149
|
modelId: this.config.embeddings.model,
|
|
19916
20150
|
depth: chunk.depth,
|
|
@@ -19930,6 +20164,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19930
20164
|
}
|
|
19931
20165
|
}
|
|
19932
20166
|
stageEnd("sync", syncStart);
|
|
20167
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
19933
20168
|
const finalizeStart = stageStart();
|
|
19934
20169
|
if (!options.dryRun) {
|
|
19935
20170
|
const scopeInfo = {
|
|
@@ -19949,6 +20184,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19949
20184
|
});
|
|
19950
20185
|
}
|
|
19951
20186
|
stageEnd("finalize", finalizeStart);
|
|
20187
|
+
this.logger.info("Done.");
|
|
19952
20188
|
return {
|
|
19953
20189
|
pagesProcessed: mirrorPages.length,
|
|
19954
20190
|
chunksTotal: chunks.length,
|
|
@@ -20109,7 +20345,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20109
20345
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20110
20346
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20111
20347
|
const embedStart = process.hrtime.bigint();
|
|
20112
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
20348
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20113
20349
|
const queryVector = queryEmbeddings[0];
|
|
20114
20350
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20115
20351
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -20137,13 +20373,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
20137
20373
|
usedRerank = true;
|
|
20138
20374
|
}
|
|
20139
20375
|
let results;
|
|
20376
|
+
const minScore = this.config.ranking.minScore;
|
|
20140
20377
|
if (groupByPage) {
|
|
20141
|
-
|
|
20378
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
20379
|
+
if (minScore > 0) {
|
|
20380
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20381
|
+
}
|
|
20142
20382
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20143
20383
|
results = pages.slice(0, topK).map((page) => {
|
|
20144
20384
|
const bestScore = page.bestChunk.finalScore;
|
|
20145
|
-
const
|
|
20146
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20385
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20386
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
20147
20387
|
return {
|
|
20148
20388
|
url: page.url,
|
|
20149
20389
|
title: page.title,
|
|
@@ -20160,6 +20400,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
20160
20400
|
};
|
|
20161
20401
|
});
|
|
20162
20402
|
} else {
|
|
20403
|
+
if (minScore > 0) {
|
|
20404
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20405
|
+
}
|
|
20163
20406
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20164
20407
|
url: hit.metadata.url,
|
|
20165
20408
|
title: hit.metadata.title,
|
|
@@ -20231,43 +20474,54 @@ var SearchEngine = class _SearchEngine {
|
|
|
20231
20474
|
}
|
|
20232
20475
|
}
|
|
20233
20476
|
async rerankHits(query, ranked, topK) {
|
|
20234
|
-
if (this.config.rerank.
|
|
20477
|
+
if (!this.config.rerank.enabled) {
|
|
20235
20478
|
throw new SearchSocketError(
|
|
20236
20479
|
"INVALID_REQUEST",
|
|
20237
|
-
"rerank=true requested but rerank.
|
|
20480
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
20238
20481
|
400
|
|
20239
20482
|
);
|
|
20240
20483
|
}
|
|
20241
20484
|
if (!this.reranker) {
|
|
20242
20485
|
throw new SearchSocketError(
|
|
20243
20486
|
"CONFIG_MISSING",
|
|
20244
|
-
`rerank=true requested but ${this.config.
|
|
20487
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
20245
20488
|
400
|
|
20246
20489
|
);
|
|
20247
20490
|
}
|
|
20248
|
-
const
|
|
20249
|
-
|
|
20250
|
-
|
|
20251
|
-
|
|
20491
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
20492
|
+
for (const entry of ranked) {
|
|
20493
|
+
const url = entry.hit.metadata.url;
|
|
20494
|
+
const group = pageGroups.get(url);
|
|
20495
|
+
if (group) group.push(entry);
|
|
20496
|
+
else pageGroups.set(url, [entry]);
|
|
20497
|
+
}
|
|
20498
|
+
const pageCandidates = [];
|
|
20499
|
+
for (const [url, chunks] of pageGroups) {
|
|
20500
|
+
const sorted = [...chunks].sort(
|
|
20501
|
+
(a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
|
|
20502
|
+
);
|
|
20503
|
+
const title = sorted[0].hit.metadata.title;
|
|
20504
|
+
const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20505
|
+
pageCandidates.push({ id: url, text: `${title}
|
|
20506
|
+
|
|
20507
|
+
${body}` });
|
|
20508
|
+
}
|
|
20252
20509
|
const reranked = await this.reranker.rerank(
|
|
20253
20510
|
query,
|
|
20254
|
-
|
|
20511
|
+
pageCandidates,
|
|
20255
20512
|
Math.max(topK, this.config.rerank.topN)
|
|
20256
20513
|
);
|
|
20257
|
-
const
|
|
20514
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20258
20515
|
return ranked.map((entry) => {
|
|
20259
|
-
const
|
|
20260
|
-
const
|
|
20261
|
-
if (
|
|
20262
|
-
return {
|
|
20263
|
-
...entry,
|
|
20264
|
-
finalScore: safeBaseScore
|
|
20265
|
-
};
|
|
20516
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
20517
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
20518
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
20519
|
+
return { ...entry, finalScore: base };
|
|
20266
20520
|
}
|
|
20267
|
-
const
|
|
20521
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
20268
20522
|
return {
|
|
20269
20523
|
...entry,
|
|
20270
|
-
finalScore: Number.isFinite(
|
|
20524
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
20271
20525
|
};
|
|
20272
20526
|
}).sort((a, b) => {
|
|
20273
20527
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -20465,13 +20719,21 @@ function searchsocketHandle(options = {}) {
|
|
|
20465
20719
|
let rateLimiter = null;
|
|
20466
20720
|
const getConfig = async () => {
|
|
20467
20721
|
if (!configPromise) {
|
|
20468
|
-
|
|
20469
|
-
|
|
20470
|
-
|
|
20471
|
-
})
|
|
20722
|
+
let configP;
|
|
20723
|
+
if (options.config) {
|
|
20724
|
+
configP = Promise.resolve(options.config);
|
|
20725
|
+
} else if (options.rawConfig) {
|
|
20726
|
+
const cwd = options.cwd ?? process.cwd();
|
|
20727
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
20728
|
+
} else {
|
|
20729
|
+
configP = loadConfig({
|
|
20730
|
+
cwd: options.cwd,
|
|
20731
|
+
configPath: options.configPath
|
|
20732
|
+
});
|
|
20733
|
+
}
|
|
20472
20734
|
configPromise = configP.then((config) => {
|
|
20473
20735
|
apiPath = apiPath ?? config.api.path;
|
|
20474
|
-
if (config.api.rateLimit) {
|
|
20736
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
20475
20737
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20476
20738
|
}
|
|
20477
20739
|
return config;
|
|
@@ -20481,10 +20743,9 @@ function searchsocketHandle(options = {}) {
|
|
|
20481
20743
|
};
|
|
20482
20744
|
const getEngine = async () => {
|
|
20483
20745
|
if (!enginePromise) {
|
|
20484
|
-
const config =
|
|
20746
|
+
const config = await getConfig();
|
|
20485
20747
|
enginePromise = SearchEngine.create({
|
|
20486
20748
|
cwd: options.cwd,
|
|
20487
|
-
configPath: options.configPath,
|
|
20488
20749
|
config
|
|
20489
20750
|
});
|
|
20490
20751
|
}
|
|
@@ -20757,8 +21018,10 @@ exports.createEmbeddingsProvider = createEmbeddingsProvider;
|
|
|
20757
21018
|
exports.createReranker = createReranker;
|
|
20758
21019
|
exports.createSearchClient = createSearchClient;
|
|
20759
21020
|
exports.createVectorStore = createVectorStore;
|
|
21021
|
+
exports.isServerless = isServerless;
|
|
20760
21022
|
exports.loadConfig = loadConfig;
|
|
20761
21023
|
exports.mergeConfig = mergeConfig;
|
|
21024
|
+
exports.mergeConfigServerless = mergeConfigServerless;
|
|
20762
21025
|
exports.resolveScope = resolveScope;
|
|
20763
21026
|
exports.runMcpServer = runMcpServer;
|
|
20764
21027
|
exports.searchsocketHandle = searchsocketHandle;
|