searchsocket 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +370 -115
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +391 -109
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +389 -108
- package/dist/sveltekit.cjs +374 -109
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +373 -107
- package/dist/{types-D1K46vwd.d.cts → types-BrG6XTUU.d.cts} +29 -13
- package/dist/{types-D1K46vwd.d.ts → types-BrG6XTUU.d.ts} +29 -13
- package/package.json +1 -2
package/dist/index.cjs
CHANGED
|
@@ -5,8 +5,7 @@ var path = require('path');
|
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
7
|
var child_process = require('child_process');
|
|
8
|
-
var
|
|
9
|
-
var pLimit = require('p-limit');
|
|
8
|
+
var pLimit2 = require('p-limit');
|
|
10
9
|
var crypto = require('crypto');
|
|
11
10
|
var cheerio = require('cheerio');
|
|
12
11
|
var matter = require('gray-matter');
|
|
@@ -23,8 +22,7 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
|
23
22
|
|
|
24
23
|
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
25
24
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
26
|
-
var
|
|
27
|
-
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
25
|
+
var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
|
|
28
26
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
29
27
|
var fs4__default = /*#__PURE__*/_interopDefault(fs4);
|
|
30
28
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
@@ -16633,7 +16631,11 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16633
16631
|
outputDir: zod.z.string().min(1).optional(),
|
|
16634
16632
|
paramValues: zod.z.record(zod.z.string(), zod.z.array(zod.z.string())).optional(),
|
|
16635
16633
|
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16636
|
-
previewTimeout: zod.z.number().int().positive().optional()
|
|
16634
|
+
previewTimeout: zod.z.number().int().positive().optional(),
|
|
16635
|
+
discover: zod.z.boolean().optional(),
|
|
16636
|
+
seedUrls: zod.z.array(zod.z.string()).optional(),
|
|
16637
|
+
maxPages: zod.z.number().int().positive().optional(),
|
|
16638
|
+
maxDepth: zod.z.number().int().nonnegative().optional()
|
|
16637
16639
|
}).optional()
|
|
16638
16640
|
}).optional(),
|
|
16639
16641
|
extract: zod.z.object({
|
|
@@ -16660,8 +16662,9 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16660
16662
|
pageSummaryChunk: zod.z.boolean().optional()
|
|
16661
16663
|
}).optional(),
|
|
16662
16664
|
embeddings: zod.z.object({
|
|
16663
|
-
provider: zod.z.literal("
|
|
16665
|
+
provider: zod.z.literal("jina").optional(),
|
|
16664
16666
|
model: zod.z.string().min(1).optional(),
|
|
16667
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16665
16668
|
apiKeyEnv: zod.z.string().min(1).optional(),
|
|
16666
16669
|
batchSize: zod.z.number().int().positive().optional(),
|
|
16667
16670
|
concurrency: zod.z.number().int().positive().optional(),
|
|
@@ -16670,18 +16673,17 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16670
16673
|
vector: zod.z.object({
|
|
16671
16674
|
dimension: zod.z.number().int().positive().optional(),
|
|
16672
16675
|
turso: zod.z.object({
|
|
16676
|
+
url: zod.z.string().url().optional(),
|
|
16677
|
+
authToken: zod.z.string().min(1).optional(),
|
|
16673
16678
|
urlEnv: zod.z.string().optional(),
|
|
16674
16679
|
authTokenEnv: zod.z.string().optional(),
|
|
16675
16680
|
localPath: zod.z.string().optional()
|
|
16676
16681
|
}).optional()
|
|
16677
16682
|
}).optional(),
|
|
16678
16683
|
rerank: zod.z.object({
|
|
16679
|
-
|
|
16684
|
+
enabled: zod.z.boolean().optional(),
|
|
16680
16685
|
topN: zod.z.number().int().positive().optional(),
|
|
16681
|
-
|
|
16682
|
-
apiKeyEnv: zod.z.string().optional(),
|
|
16683
|
-
model: zod.z.string().optional()
|
|
16684
|
-
}).optional()
|
|
16686
|
+
model: zod.z.string().optional()
|
|
16685
16687
|
}).optional(),
|
|
16686
16688
|
ranking: zod.z.object({
|
|
16687
16689
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
@@ -16690,6 +16692,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16690
16692
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16691
16693
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16692
16694
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16695
|
+
minScore: zod.z.number().min(0).max(1).optional(),
|
|
16693
16696
|
weights: zod.z.object({
|
|
16694
16697
|
incomingLinks: zod.z.number().optional(),
|
|
16695
16698
|
depth: zod.z.number().optional(),
|
|
@@ -16770,9 +16773,9 @@ function createDefaultConfig(projectId) {
|
|
|
16770
16773
|
pageSummaryChunk: true
|
|
16771
16774
|
},
|
|
16772
16775
|
embeddings: {
|
|
16773
|
-
provider: "
|
|
16774
|
-
model: "
|
|
16775
|
-
apiKeyEnv: "
|
|
16776
|
+
provider: "jina",
|
|
16777
|
+
model: "jina-embeddings-v3",
|
|
16778
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16776
16779
|
batchSize: 64,
|
|
16777
16780
|
concurrency: 4
|
|
16778
16781
|
},
|
|
@@ -16784,12 +16787,9 @@ function createDefaultConfig(projectId) {
|
|
|
16784
16787
|
}
|
|
16785
16788
|
},
|
|
16786
16789
|
rerank: {
|
|
16787
|
-
|
|
16790
|
+
enabled: false,
|
|
16788
16791
|
topN: 20,
|
|
16789
|
-
|
|
16790
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16791
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16792
|
-
}
|
|
16792
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16793
16793
|
},
|
|
16794
16794
|
ranking: {
|
|
16795
16795
|
enableIncomingLinkBoost: true,
|
|
@@ -16798,6 +16798,7 @@ function createDefaultConfig(projectId) {
|
|
|
16798
16798
|
aggregationCap: 5,
|
|
16799
16799
|
aggregationDecay: 0.5,
|
|
16800
16800
|
minChunkScoreRatio: 0.5,
|
|
16801
|
+
minScore: 0,
|
|
16801
16802
|
weights: {
|
|
16802
16803
|
incomingLinks: 0.05,
|
|
16803
16804
|
depth: 0.03,
|
|
@@ -16924,7 +16925,11 @@ ${issues}`
|
|
|
16924
16925
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16925
16926
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16926
16927
|
exclude: parsed.source.build.exclude ?? [],
|
|
16927
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16928
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16929
|
+
discover: parsed.source.build.discover ?? false,
|
|
16930
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16931
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16932
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16928
16933
|
} : void 0
|
|
16929
16934
|
},
|
|
16930
16935
|
extract: {
|
|
@@ -16953,11 +16958,7 @@ ${issues}`
|
|
|
16953
16958
|
},
|
|
16954
16959
|
rerank: {
|
|
16955
16960
|
...defaults.rerank,
|
|
16956
|
-
...parsed.rerank
|
|
16957
|
-
jina: {
|
|
16958
|
-
...defaults.rerank.jina,
|
|
16959
|
-
...parsed.rerank?.jina
|
|
16960
|
-
}
|
|
16961
|
+
...parsed.rerank
|
|
16961
16962
|
},
|
|
16962
16963
|
ranking: {
|
|
16963
16964
|
...defaults.ranking,
|
|
@@ -17004,7 +17005,11 @@ ${issues}`
|
|
|
17004
17005
|
outputDir: ".svelte-kit/output",
|
|
17005
17006
|
paramValues: {},
|
|
17006
17007
|
exclude: [],
|
|
17007
|
-
previewTimeout: 3e4
|
|
17008
|
+
previewTimeout: 3e4,
|
|
17009
|
+
discover: false,
|
|
17010
|
+
seedUrls: ["/"],
|
|
17011
|
+
maxPages: 200,
|
|
17012
|
+
maxDepth: 10
|
|
17008
17013
|
};
|
|
17009
17014
|
}
|
|
17010
17015
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17018,6 +17023,21 @@ ${issues}`
|
|
|
17018
17023
|
}
|
|
17019
17024
|
return merged;
|
|
17020
17025
|
}
|
|
17026
|
+
/**
 * Merge a raw config for use where nothing can be inferred from disk.
 *
 * Unlike the regular merge path, this variant insists that the caller
 * supplies `project.id` and `source.mode` explicitly, then delegates to
 * `mergeConfig` using the current working directory.
 *
 * @param rawConfig - User-supplied config object.
 * @returns Whatever `mergeConfig(process.cwd(), rawConfig)` returns.
 * @throws {SearchSocketError} With code "CONFIG_MISSING" when `project.id`
 *   or `source.mode` is absent or falsy.
 */
function mergeConfigServerless(rawConfig) {
  const projectId = rawConfig.project?.id;
  if (!projectId) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`project.id` is required for serverless config (cannot infer from package.json)."
    );
  }

  const sourceMode = rawConfig.source?.mode;
  if (!sourceMode) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`source.mode` is required for serverless config (cannot auto-detect from filesystem)."
    );
  }

  const workingDir = process.cwd();
  return mergeConfig(workingDir, rawConfig);
}
|
|
17021
17041
|
async function loadConfig(options = {}) {
|
|
17022
17042
|
const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
|
|
17023
17043
|
const configPath = path__default.default.resolve(cwd, options.configPath ?? "searchsocket.config.ts");
|
|
@@ -17040,6 +17060,11 @@ async function loadConfig(options = {}) {
|
|
|
17040
17060
|
return mergeConfig(cwd, raw);
|
|
17041
17061
|
}
|
|
17042
17062
|
|
|
17063
|
+
// src/core/serverless.ts
|
|
17064
|
+
/**
 * Report whether the process appears to run on a serverless host.
 *
 * The check is purely environment-based: it returns true when at least one
 * of the listed environment variables holds a non-empty value.
 *
 * @returns {boolean} true if any marker variable is set and non-empty.
 */
function isServerless() {
  const markerVariables = [
    "VERCEL",
    "NETLIFY",
    "AWS_LAMBDA_FUNCTION_NAME",
    "FUNCTIONS_WORKER",
    "CF_PAGES"
  ];
  return markerVariables.some((name) => Boolean(process.env[name]));
}
|
|
17067
|
+
|
|
17043
17068
|
// src/utils/text.ts
|
|
17044
17069
|
function normalizeText(input) {
|
|
17045
17070
|
return input.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim();
|
|
@@ -17117,10 +17142,11 @@ function sleep(ms) {
|
|
|
17117
17142
|
setTimeout(resolve, ms);
|
|
17118
17143
|
});
|
|
17119
17144
|
}
|
|
17120
|
-
var
|
|
17121
|
-
|
|
17145
|
+
var JinaEmbeddingsProvider = class {
|
|
17146
|
+
apiKey;
|
|
17122
17147
|
batchSize;
|
|
17123
17148
|
concurrency;
|
|
17149
|
+
defaultTask;
|
|
17124
17150
|
constructor(options) {
|
|
17125
17151
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17126
17152
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17128,11 +17154,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17128
17154
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17129
17155
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17130
17156
|
}
|
|
17131
|
-
this.
|
|
17132
|
-
apiKey: options.apiKey
|
|
17133
|
-
});
|
|
17157
|
+
this.apiKey = options.apiKey;
|
|
17134
17158
|
this.batchSize = options.batchSize;
|
|
17135
17159
|
this.concurrency = options.concurrency;
|
|
17160
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17136
17161
|
}
|
|
17137
17162
|
estimateTokens(text) {
|
|
17138
17163
|
const normalized = text.trim();
|
|
@@ -17146,7 +17171,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17146
17171
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17147
17172
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17148
17173
|
}
|
|
17149
|
-
async embedTexts(texts, modelId) {
|
|
17174
|
+
async embedTexts(texts, modelId, task) {
|
|
17150
17175
|
if (texts.length === 0) {
|
|
17151
17176
|
return [];
|
|
17152
17177
|
}
|
|
@@ -17158,37 +17183,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17158
17183
|
});
|
|
17159
17184
|
}
|
|
17160
17185
|
const outputs = new Array(batches.length);
|
|
17161
|
-
const limit =
|
|
17186
|
+
const limit = pLimit2__default.default(this.concurrency);
|
|
17162
17187
|
await Promise.all(
|
|
17163
17188
|
batches.map(
|
|
17164
17189
|
(batch, position) => limit(async () => {
|
|
17165
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17190
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17166
17191
|
})
|
|
17167
17192
|
)
|
|
17168
17193
|
);
|
|
17169
17194
|
return outputs.flat();
|
|
17170
17195
|
}
|
|
17171
|
-
async embedWithRetry(texts, modelId) {
|
|
17196
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17172
17197
|
const maxAttempts = 5;
|
|
17173
17198
|
let attempt = 0;
|
|
17174
17199
|
while (attempt < maxAttempts) {
|
|
17175
17200
|
attempt += 1;
|
|
17201
|
+
let response;
|
|
17176
17202
|
try {
|
|
17177
|
-
|
|
17178
|
-
|
|
17179
|
-
|
|
17180
|
-
|
|
17203
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17204
|
+
method: "POST",
|
|
17205
|
+
headers: {
|
|
17206
|
+
"content-type": "application/json",
|
|
17207
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17208
|
+
},
|
|
17209
|
+
body: JSON.stringify({
|
|
17210
|
+
model: modelId,
|
|
17211
|
+
input: texts,
|
|
17212
|
+
task
|
|
17213
|
+
})
|
|
17181
17214
|
});
|
|
17182
|
-
return response.data.map((entry) => entry.embedding);
|
|
17183
17215
|
} catch (error) {
|
|
17184
|
-
|
|
17185
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17186
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17216
|
+
if (attempt >= maxAttempts) {
|
|
17187
17217
|
throw error;
|
|
17188
17218
|
}
|
|
17189
|
-
|
|
17190
|
-
|
|
17219
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17220
|
+
continue;
|
|
17221
|
+
}
|
|
17222
|
+
if (!response.ok) {
|
|
17223
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17224
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17225
|
+
const errorBody = await response.text();
|
|
17226
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17227
|
+
}
|
|
17228
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17229
|
+
continue;
|
|
17230
|
+
}
|
|
17231
|
+
const payload = await response.json();
|
|
17232
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17233
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17191
17234
|
}
|
|
17235
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17192
17236
|
}
|
|
17193
17237
|
throw new Error("Unreachable retry state");
|
|
17194
17238
|
}
|
|
@@ -17196,20 +17240,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17196
17240
|
|
|
17197
17241
|
// src/embeddings/factory.ts
|
|
17198
17242
|
function createEmbeddingsProvider(config) {
|
|
17199
|
-
if (config.embeddings.provider !== "
|
|
17243
|
+
if (config.embeddings.provider !== "jina") {
|
|
17200
17244
|
throw new SearchSocketError(
|
|
17201
17245
|
"CONFIG_MISSING",
|
|
17202
17246
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17203
17247
|
);
|
|
17204
17248
|
}
|
|
17205
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17249
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17206
17250
|
if (!apiKey) {
|
|
17207
17251
|
throw new SearchSocketError(
|
|
17208
17252
|
"CONFIG_MISSING",
|
|
17209
|
-
`Missing embeddings API key env var
|
|
17253
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17210
17254
|
);
|
|
17211
17255
|
}
|
|
17212
|
-
return new
|
|
17256
|
+
return new JinaEmbeddingsProvider({
|
|
17213
17257
|
apiKey,
|
|
17214
17258
|
batchSize: config.embeddings.batchSize,
|
|
17215
17259
|
concurrency: config.embeddings.concurrency
|
|
@@ -17299,20 +17343,17 @@ var JinaReranker = class {
|
|
|
17299
17343
|
|
|
17300
17344
|
// src/rerank/factory.ts
|
|
17301
17345
|
function createReranker(config) {
|
|
17302
|
-
if (config.rerank.
|
|
17346
|
+
if (!config.rerank.enabled) {
|
|
17303
17347
|
return null;
|
|
17304
17348
|
}
|
|
17305
|
-
|
|
17306
|
-
|
|
17307
|
-
|
|
17308
|
-
return null;
|
|
17309
|
-
}
|
|
17310
|
-
return new JinaReranker({
|
|
17311
|
-
apiKey,
|
|
17312
|
-
model: config.rerank.jina.model
|
|
17313
|
-
});
|
|
17349
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17350
|
+
if (!apiKey) {
|
|
17351
|
+
return null;
|
|
17314
17352
|
}
|
|
17315
|
-
return
|
|
17353
|
+
return new JinaReranker({
|
|
17354
|
+
apiKey,
|
|
17355
|
+
model: config.rerank.model
|
|
17356
|
+
});
|
|
17316
17357
|
}
|
|
17317
17358
|
function ensureStateDirs(cwd, stateDir, scope) {
|
|
17318
17359
|
const statePath = path__default.default.resolve(cwd, stateDir);
|
|
@@ -17365,6 +17406,16 @@ var TursoVectorStore = class {
|
|
|
17365
17406
|
}
|
|
17366
17407
|
async ensureChunks(dim) {
|
|
17367
17408
|
if (this.chunksReady) return;
|
|
17409
|
+
const exists = await this.chunksTableExists();
|
|
17410
|
+
if (exists) {
|
|
17411
|
+
const currentDim = await this.getChunksDimension();
|
|
17412
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17413
|
+
await this.client.batch([
|
|
17414
|
+
"DROP INDEX IF EXISTS idx",
|
|
17415
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17416
|
+
]);
|
|
17417
|
+
}
|
|
17418
|
+
}
|
|
17368
17419
|
await this.client.batch([
|
|
17369
17420
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17370
17421
|
id TEXT PRIMARY KEY,
|
|
@@ -17376,12 +17427,16 @@ var TursoVectorStore = class {
|
|
|
17376
17427
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17377
17428
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17378
17429
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17430
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17431
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17379
17432
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17380
17433
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17381
17434
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
17382
17435
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17383
17436
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17384
17437
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17438
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17439
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17385
17440
|
embedding F32_BLOB(${dim})
|
|
17386
17441
|
)`,
|
|
17387
17442
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
@@ -17420,6 +17475,38 @@ var TursoVectorStore = class {
|
|
|
17420
17475
|
throw error;
|
|
17421
17476
|
}
|
|
17422
17477
|
}
|
|
17478
|
+
/**
|
|
17479
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17480
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17481
|
+
*/
|
|
17482
|
+
async getChunksDimension() {
|
|
17483
|
+
try {
|
|
17484
|
+
const rs = await this.client.execute(
|
|
17485
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17486
|
+
);
|
|
17487
|
+
if (rs.rows.length === 0) return null;
|
|
17488
|
+
const sql = rs.rows[0].sql;
|
|
17489
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17490
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17491
|
+
} catch {
|
|
17492
|
+
return null;
|
|
17493
|
+
}
|
|
17494
|
+
}
|
|
17495
|
+
/**
|
|
17496
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17497
|
+
* Used by `clean --remote` for a full reset.
|
|
17498
|
+
*/
|
|
17499
|
+
async dropAllTables() {
|
|
17500
|
+
await this.client.batch([
|
|
17501
|
+
"DROP INDEX IF EXISTS idx",
|
|
17502
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17503
|
+
"DROP TABLE IF EXISTS registry",
|
|
17504
|
+
"DROP TABLE IF EXISTS pages"
|
|
17505
|
+
]);
|
|
17506
|
+
this.chunksReady = false;
|
|
17507
|
+
this.registryReady = false;
|
|
17508
|
+
this.pagesReady = false;
|
|
17509
|
+
}
|
|
17423
17510
|
async upsert(records, _scope) {
|
|
17424
17511
|
if (records.length === 0) return;
|
|
17425
17512
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17430,9 +17517,9 @@ var TursoVectorStore = class {
|
|
|
17430
17517
|
const stmts = batch.map((r) => ({
|
|
17431
17518
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17432
17519
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17433
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17434
|
-
incoming_links, route_file, tags, embedding)
|
|
17435
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17520
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17521
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17522
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17436
17523
|
args: [
|
|
17437
17524
|
r.id,
|
|
17438
17525
|
r.metadata.projectId,
|
|
@@ -17443,12 +17530,16 @@ var TursoVectorStore = class {
|
|
|
17443
17530
|
r.metadata.sectionTitle,
|
|
17444
17531
|
JSON.stringify(r.metadata.headingPath),
|
|
17445
17532
|
r.metadata.snippet,
|
|
17533
|
+
r.metadata.chunkText,
|
|
17534
|
+
r.metadata.ordinal,
|
|
17446
17535
|
r.metadata.contentHash,
|
|
17447
17536
|
r.metadata.modelId,
|
|
17448
17537
|
r.metadata.depth,
|
|
17449
17538
|
r.metadata.incomingLinks,
|
|
17450
17539
|
r.metadata.routeFile,
|
|
17451
17540
|
JSON.stringify(r.metadata.tags),
|
|
17541
|
+
r.metadata.description ?? "",
|
|
17542
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17452
17543
|
JSON.stringify(r.vector)
|
|
17453
17544
|
]
|
|
17454
17545
|
}));
|
|
@@ -17461,8 +17552,10 @@ var TursoVectorStore = class {
|
|
|
17461
17552
|
const queryJson = JSON.stringify(queryVector);
|
|
17462
17553
|
const rs = await this.client.execute({
|
|
17463
17554
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17464
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17555
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17556
|
+
c.ordinal, c.content_hash,
|
|
17465
17557
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17558
|
+
c.description, c.keywords,
|
|
17466
17559
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17467
17560
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17468
17561
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17493,6 +17586,12 @@ var TursoVectorStore = class {
|
|
|
17493
17586
|
}
|
|
17494
17587
|
const distance = row.distance;
|
|
17495
17588
|
const score = 1 - distance;
|
|
17589
|
+
const description = row.description || void 0;
|
|
17590
|
+
const keywords = (() => {
|
|
17591
|
+
const raw = row.keywords || "[]";
|
|
17592
|
+
const parsed = JSON.parse(raw);
|
|
17593
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17594
|
+
})();
|
|
17496
17595
|
hits.push({
|
|
17497
17596
|
id: row.id,
|
|
17498
17597
|
score,
|
|
@@ -17505,12 +17604,16 @@ var TursoVectorStore = class {
|
|
|
17505
17604
|
sectionTitle: row.section_title,
|
|
17506
17605
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17507
17606
|
snippet: row.snippet,
|
|
17607
|
+
chunkText: row.chunk_text || "",
|
|
17608
|
+
ordinal: row.ordinal || 0,
|
|
17508
17609
|
contentHash: row.content_hash,
|
|
17509
17610
|
modelId: row.model_id,
|
|
17510
17611
|
depth: row.depth,
|
|
17511
17612
|
incomingLinks: row.incoming_links,
|
|
17512
17613
|
routeFile: row.route_file,
|
|
17513
|
-
tags
|
|
17614
|
+
tags,
|
|
17615
|
+
description,
|
|
17616
|
+
keywords
|
|
17514
17617
|
}
|
|
17515
17618
|
});
|
|
17516
17619
|
}
|
|
@@ -17700,10 +17803,10 @@ var TursoVectorStore = class {
|
|
|
17700
17803
|
// src/vector/factory.ts
|
|
17701
17804
|
async function createVectorStore(config, cwd) {
|
|
17702
17805
|
const turso = config.vector.turso;
|
|
17703
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17806
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17704
17807
|
if (remoteUrl) {
|
|
17705
17808
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17706
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17809
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17707
17810
|
const client2 = createClient2({
|
|
17708
17811
|
url: remoteUrl,
|
|
17709
17812
|
authToken
|
|
@@ -17713,6 +17816,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17713
17816
|
dimension: config.vector.dimension
|
|
17714
17817
|
});
|
|
17715
17818
|
}
|
|
17819
|
+
if (isServerless()) {
|
|
17820
|
+
throw new SearchSocketError(
|
|
17821
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17822
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17823
|
+
);
|
|
17824
|
+
}
|
|
17716
17825
|
const { createClient } = await import('@libsql/client');
|
|
17717
17826
|
const localPath = path__default.default.resolve(cwd, turso.localPath);
|
|
17718
17827
|
fs__default.default.mkdirSync(path__default.default.dirname(localPath), { recursive: true });
|
|
@@ -18043,7 +18152,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18043
18152
|
incomingLinks: page.incomingLinks,
|
|
18044
18153
|
routeFile: page.routeFile,
|
|
18045
18154
|
tags: page.tags,
|
|
18046
|
-
contentHash: ""
|
|
18155
|
+
contentHash: "",
|
|
18156
|
+
description: page.description,
|
|
18157
|
+
keywords: page.keywords
|
|
18047
18158
|
};
|
|
18048
18159
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18049
18160
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18070,7 +18181,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18070
18181
|
incomingLinks: page.incomingLinks,
|
|
18071
18182
|
routeFile: page.routeFile,
|
|
18072
18183
|
tags: page.tags,
|
|
18073
|
-
contentHash: ""
|
|
18184
|
+
contentHash: "",
|
|
18185
|
+
description: page.description,
|
|
18186
|
+
keywords: page.keywords
|
|
18074
18187
|
};
|
|
18075
18188
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18076
18189
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19151,14 +19264,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19151
19264
|
var Logger = class {
|
|
19152
19265
|
json;
|
|
19153
19266
|
verbose;
|
|
19267
|
+
quiet;
|
|
19154
19268
|
stderrOnly;
|
|
19155
19269
|
constructor(opts = {}) {
|
|
19156
19270
|
this.json = opts.json ?? false;
|
|
19157
19271
|
this.verbose = opts.verbose ?? false;
|
|
19272
|
+
this.quiet = opts.quiet ?? false;
|
|
19158
19273
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19159
19274
|
}
|
|
19160
19275
|
info(message) {
|
|
19161
|
-
if (this.json) {
|
|
19276
|
+
if (this.quiet || this.json) {
|
|
19162
19277
|
return;
|
|
19163
19278
|
}
|
|
19164
19279
|
this.writeOut(`${message}
|
|
@@ -19172,7 +19287,7 @@ var Logger = class {
|
|
|
19172
19287
|
this.logJson("debug", { message });
|
|
19173
19288
|
return;
|
|
19174
19289
|
}
|
|
19175
|
-
this.writeOut(
|
|
19290
|
+
this.writeOut(` ${message}
|
|
19176
19291
|
`);
|
|
19177
19292
|
}
|
|
19178
19293
|
warn(message) {
|
|
@@ -19199,7 +19314,7 @@ var Logger = class {
|
|
|
19199
19314
|
this.logJson(event, data);
|
|
19200
19315
|
return;
|
|
19201
19316
|
}
|
|
19202
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19317
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19203
19318
|
`);
|
|
19204
19319
|
}
|
|
19205
19320
|
writeOut(text) {
|
|
@@ -19384,11 +19499,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19384
19499
|
|
|
19385
19500
|
// src/indexing/sources/build/index.ts
|
|
19386
19501
|
var logger = new Logger();
|
|
19502
|
+
/**
 * Collect the same-origin link paths found in an HTML document.
 *
 * Every `<a href>` is resolved against `${baseOrigin}${pageUrl}`. Fragment,
 * `mailto:`, `tel:` and `javascript:` hrefs are ignored, as are links whose
 * origin differs from `baseOrigin` or whose protocol is not http(s). The
 * surviving pathnames are passed through `normalizeUrlPath`.
 *
 * @param html - Markup to scan.
 * @param pageUrl - Path of the page the markup came from (used as the base
 *   for relative hrefs).
 * @param baseOrigin - Origin that links must match to be kept.
 * @returns De-duplicated normalized paths in first-seen order.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
  const $ = cheerio.load(html);
  const ignoredPrefixes = ["#", "mailto:", "tel:", "javascript:"];
  const allowedProtocols = ["http:", "https:"];
  const uniquePaths = /* @__PURE__ */ new Set();

  for (const anchor of $("a[href]").toArray()) {
    const href = $(anchor).attr("href");
    if (!href || ignoredPrefixes.some((prefix) => href.startsWith(prefix))) {
      continue;
    }
    try {
      const target = new URL(href, `${baseOrigin}${pageUrl}`);
      const sameOrigin = target.origin === baseOrigin;
      if (sameOrigin && allowedProtocols.includes(target.protocol)) {
        uniquePaths.add(normalizeUrlPath(target.pathname));
      }
    } catch {
      // Hrefs that cannot be resolved or normalized are skipped on purpose.
    }
  }

  return [...uniquePaths];
}
|
|
19520
|
+
/**
 * Crawl the preview server from the configured seed URLs and collect the
 * HTML pages that were reachable.
 *
 * URLs are taken from the front of a queue in batches no larger than the
 * remaining page budget and fetched with a concurrency limit of 8. Links
 * found on a page are enqueued only while the page's depth is below
 * `maxDepth`, and only if they were not seen before and are not excluded.
 *
 * @param server - Running preview server; only `baseUrl` is read here.
 * @param buildConfig - Build source config (`seedUrls`, `maxDepth`,
 *   `exclude`, `maxPages` are read).
 * @param pipelineMaxPages - Optional extra cap; when it is a number it is
 *   floored, clamped to >= 0 and combined with `buildConfig.maxPages` by
 *   taking the smaller of the two.
 * @returns Array of `{ url, html, sourcePath, outgoingLinks }` records, at
 *   most the effective page limit long.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;

  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];

  const visited = /* @__PURE__ */ new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2__default.default(8);

  for (const seed of seedUrls) {
    const normalized = normalizeUrlPath(seed);
    if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
      visited.add(normalized);
      queue.push({ url: normalized, depth: 0 });
    }
  }

  while (queue.length > 0 && pages.length < effectiveMax) {
    // Never fetch more URLs in one round than there is budget left for.
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            const links = extractLinksFromHtml(html, item.url, baseOrigin);
            for (const link of links) {
              if (!visited.has(link) && !isExcluded(link, exclude)) {
                visited.add(link);
                queue.push({ url: link, depth: item.depth + 1 });
              }
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            outgoingLinks: []
          };
        })
      )
    );

    // allSettled keeps input order, so results[i] belongs to batch[i].
    results.forEach((result, index) => {
      if (result.status === "rejected") {
        // Previously a failed fetch vanished without a trace; surface it the
        // same way non-OK responses are reported above.
        const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
        logger.warn(`Skipping ${batch[index].url}: ${reason}`);
        return;
      }
      if (result.value) {
        pages.push(result.value);
      }
    });
  }

  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
|
|
19387
19591
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19388
19592
|
const buildConfig = config.source.build;
|
|
19389
19593
|
if (!buildConfig) {
|
|
19390
19594
|
throw new Error("build source config is missing");
|
|
19391
19595
|
}
|
|
19596
|
+
if (buildConfig.discover) {
|
|
19597
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19598
|
+
try {
|
|
19599
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
19600
|
+
} finally {
|
|
19601
|
+
await server2.shutdown();
|
|
19602
|
+
}
|
|
19603
|
+
}
|
|
19392
19604
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19393
19605
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19394
19606
|
logger.event("build_routes_discovered", {
|
|
@@ -19399,7 +19611,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19399
19611
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19400
19612
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19401
19613
|
try {
|
|
19402
|
-
const concurrencyLimit =
|
|
19614
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
19403
19615
|
const results = await Promise.allSettled(
|
|
19404
19616
|
selected.map(
|
|
19405
19617
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19568,7 +19780,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
19568
19780
|
const routes = await resolveRoutes(config);
|
|
19569
19781
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
19570
19782
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
19571
|
-
const concurrencyLimit =
|
|
19783
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
19572
19784
|
const results = await Promise.allSettled(
|
|
19573
19785
|
selected.map(
|
|
19574
19786
|
(route) => concurrencyLimit(async () => {
|
|
@@ -19630,9 +19842,7 @@ function hrTimeMs(start) {
|
|
|
19630
19842
|
|
|
19631
19843
|
// src/indexing/pipeline.ts
|
|
19632
19844
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19633
|
-
"
|
|
19634
|
-
"text-embedding-3-large": 13e-5,
|
|
19635
|
-
"text-embedding-ada-002": 1e-4
|
|
19845
|
+
"jina-embeddings-v3": 2e-5
|
|
19636
19846
|
};
|
|
19637
19847
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
19638
19848
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -19678,9 +19888,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19678
19888
|
};
|
|
19679
19889
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
19680
19890
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
19891
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
19892
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
19681
19893
|
if (options.force) {
|
|
19894
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
19682
19895
|
await cleanMirrorForScope(statePath, scope);
|
|
19683
19896
|
}
|
|
19897
|
+
if (options.dryRun) {
|
|
19898
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
19899
|
+
}
|
|
19684
19900
|
const manifestStart = stageStart();
|
|
19685
19901
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
19686
19902
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -19691,8 +19907,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19691
19907
|
);
|
|
19692
19908
|
}
|
|
19693
19909
|
stageEnd("manifest", manifestStart);
|
|
19910
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
19694
19911
|
const sourceStart = stageStart();
|
|
19695
|
-
|
|
19912
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
19696
19913
|
let sourcePages;
|
|
19697
19914
|
if (sourceMode === "static-output") {
|
|
19698
19915
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -19704,10 +19921,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19704
19921
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
19705
19922
|
}
|
|
19706
19923
|
stageEnd("source", sourceStart);
|
|
19924
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
19707
19925
|
const routeStart = stageStart();
|
|
19708
19926
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19709
19927
|
stageEnd("route_map", routeStart);
|
|
19928
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
19710
19929
|
const extractStart = stageStart();
|
|
19930
|
+
this.logger.info("Extracting content...");
|
|
19711
19931
|
const extractedPages = [];
|
|
19712
19932
|
for (const sourcePage of sourcePages) {
|
|
19713
19933
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -19736,6 +19956,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19736
19956
|
uniquePages.push(page);
|
|
19737
19957
|
}
|
|
19738
19958
|
stageEnd("extract", extractStart);
|
|
19959
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
19960
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19739
19961
|
const linkStart = stageStart();
|
|
19740
19962
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
19741
19963
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -19751,7 +19973,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19751
19973
|
}
|
|
19752
19974
|
}
|
|
19753
19975
|
stageEnd("links", linkStart);
|
|
19976
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
19754
19977
|
const mirrorStart = stageStart();
|
|
19978
|
+
this.logger.info("Writing mirror pages...");
|
|
19755
19979
|
const mirrorPages = [];
|
|
19756
19980
|
let routeExact = 0;
|
|
19757
19981
|
let routeBestEffort = 0;
|
|
@@ -19821,7 +20045,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19821
20045
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
19822
20046
|
}
|
|
19823
20047
|
stageEnd("mirror", mirrorStart);
|
|
20048
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
19824
20049
|
const chunkStart = stageStart();
|
|
20050
|
+
this.logger.info("Chunking pages...");
|
|
19825
20051
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
19826
20052
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
19827
20053
|
if (typeof maxChunks === "number") {
|
|
@@ -19834,6 +20060,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19834
20060
|
});
|
|
19835
20061
|
}
|
|
19836
20062
|
stageEnd("chunk", chunkStart);
|
|
20063
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
19837
20064
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
19838
20065
|
for (const chunk of chunks) {
|
|
19839
20066
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -19852,6 +20079,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19852
20079
|
return existingHash !== chunk.contentHash;
|
|
19853
20080
|
});
|
|
19854
20081
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20082
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
19855
20083
|
const embedStart = stageStart();
|
|
19856
20084
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
19857
20085
|
for (const chunk of changedChunks) {
|
|
@@ -19866,9 +20094,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19866
20094
|
let newEmbeddings = 0;
|
|
19867
20095
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
19868
20096
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20097
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
19869
20098
|
const embeddings = await this.embeddings.embedTexts(
|
|
19870
20099
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
19871
|
-
this.config.embeddings.model
|
|
20100
|
+
this.config.embeddings.model,
|
|
20101
|
+
"retrieval.passage"
|
|
19872
20102
|
);
|
|
19873
20103
|
if (embeddings.length !== changedChunks.length) {
|
|
19874
20104
|
throw new SearchSocketError(
|
|
@@ -19891,8 +20121,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19891
20121
|
}
|
|
19892
20122
|
}
|
|
19893
20123
|
stageEnd("embedding", embedStart);
|
|
20124
|
+
if (changedChunks.length > 0) {
|
|
20125
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20126
|
+
} else {
|
|
20127
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20128
|
+
}
|
|
19894
20129
|
const syncStart = stageStart();
|
|
19895
20130
|
if (!options.dryRun) {
|
|
20131
|
+
this.logger.info("Syncing vectors...");
|
|
19896
20132
|
const upserts = [];
|
|
19897
20133
|
for (const chunk of changedChunks) {
|
|
19898
20134
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -19911,12 +20147,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19911
20147
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
19912
20148
|
headingPath: chunk.headingPath,
|
|
19913
20149
|
snippet: chunk.snippet,
|
|
20150
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20151
|
+
ordinal: chunk.ordinal,
|
|
19914
20152
|
contentHash: chunk.contentHash,
|
|
19915
20153
|
modelId: this.config.embeddings.model,
|
|
19916
20154
|
depth: chunk.depth,
|
|
19917
20155
|
incomingLinks: chunk.incomingLinks,
|
|
19918
20156
|
routeFile: chunk.routeFile,
|
|
19919
|
-
tags: chunk.tags
|
|
20157
|
+
tags: chunk.tags,
|
|
20158
|
+
description: chunk.description,
|
|
20159
|
+
keywords: chunk.keywords
|
|
19920
20160
|
}
|
|
19921
20161
|
});
|
|
19922
20162
|
}
|
|
@@ -19930,6 +20170,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19930
20170
|
}
|
|
19931
20171
|
}
|
|
19932
20172
|
stageEnd("sync", syncStart);
|
|
20173
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
19933
20174
|
const finalizeStart = stageStart();
|
|
19934
20175
|
if (!options.dryRun) {
|
|
19935
20176
|
const scopeInfo = {
|
|
@@ -19949,6 +20190,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19949
20190
|
});
|
|
19950
20191
|
}
|
|
19951
20192
|
stageEnd("finalize", finalizeStart);
|
|
20193
|
+
this.logger.info("Done.");
|
|
19952
20194
|
return {
|
|
19953
20195
|
pagesProcessed: mirrorPages.length,
|
|
19954
20196
|
chunksTotal: chunks.length,
|
|
@@ -20109,7 +20351,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20109
20351
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20110
20352
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20111
20353
|
const embedStart = process.hrtime.bigint();
|
|
20112
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
20354
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20113
20355
|
const queryVector = queryEmbeddings[0];
|
|
20114
20356
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20115
20357
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -20137,13 +20379,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
20137
20379
|
usedRerank = true;
|
|
20138
20380
|
}
|
|
20139
20381
|
let results;
|
|
20382
|
+
const minScore = this.config.ranking.minScore;
|
|
20140
20383
|
if (groupByPage) {
|
|
20141
|
-
|
|
20384
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
20385
|
+
if (minScore > 0) {
|
|
20386
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20387
|
+
}
|
|
20142
20388
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20143
20389
|
results = pages.slice(0, topK).map((page) => {
|
|
20144
20390
|
const bestScore = page.bestChunk.finalScore;
|
|
20145
|
-
const
|
|
20146
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20391
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20392
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
20147
20393
|
return {
|
|
20148
20394
|
url: page.url,
|
|
20149
20395
|
title: page.title,
|
|
@@ -20160,6 +20406,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
20160
20406
|
};
|
|
20161
20407
|
});
|
|
20162
20408
|
} else {
|
|
20409
|
+
if (minScore > 0) {
|
|
20410
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20411
|
+
}
|
|
20163
20412
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20164
20413
|
url: hit.metadata.url,
|
|
20165
20414
|
title: hit.metadata.title,
|
|
@@ -20231,43 +20480,67 @@ var SearchEngine = class _SearchEngine {
|
|
|
20231
20480
|
}
|
|
20232
20481
|
}
|
|
20233
20482
|
async rerankHits(query, ranked, topK) {
|
|
20234
|
-
if (this.config.rerank.
|
|
20483
|
+
if (!this.config.rerank.enabled) {
|
|
20235
20484
|
throw new SearchSocketError(
|
|
20236
20485
|
"INVALID_REQUEST",
|
|
20237
|
-
"rerank=true requested but rerank.
|
|
20486
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
20238
20487
|
400
|
|
20239
20488
|
);
|
|
20240
20489
|
}
|
|
20241
20490
|
if (!this.reranker) {
|
|
20242
20491
|
throw new SearchSocketError(
|
|
20243
20492
|
"CONFIG_MISSING",
|
|
20244
|
-
`rerank=true requested but ${this.config.
|
|
20493
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
20245
20494
|
400
|
|
20246
20495
|
);
|
|
20247
20496
|
}
|
|
20248
|
-
const
|
|
20249
|
-
|
|
20250
|
-
|
|
20251
|
-
|
|
20497
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
20498
|
+
for (const entry of ranked) {
|
|
20499
|
+
const url = entry.hit.metadata.url;
|
|
20500
|
+
const group = pageGroups.get(url);
|
|
20501
|
+
if (group) group.push(entry);
|
|
20502
|
+
else pageGroups.set(url, [entry]);
|
|
20503
|
+
}
|
|
20504
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20505
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20506
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20507
|
+
const pageCandidates = [];
|
|
20508
|
+
for (const [url, chunks] of pageGroups) {
|
|
20509
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
20510
|
+
const bestScore = byScore[0].finalScore;
|
|
20511
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
20512
|
+
const selected = byScore.filter(
|
|
20513
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
20514
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
20515
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
20516
|
+
const first = selected[0].hit.metadata;
|
|
20517
|
+
const parts = [first.title];
|
|
20518
|
+
if (first.description) {
|
|
20519
|
+
parts.push(first.description);
|
|
20520
|
+
}
|
|
20521
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
20522
|
+
parts.push(first.keywords.join(", "));
|
|
20523
|
+
}
|
|
20524
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20525
|
+
parts.push(body);
|
|
20526
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
20527
|
+
}
|
|
20252
20528
|
const reranked = await this.reranker.rerank(
|
|
20253
20529
|
query,
|
|
20254
|
-
|
|
20530
|
+
pageCandidates,
|
|
20255
20531
|
Math.max(topK, this.config.rerank.topN)
|
|
20256
20532
|
);
|
|
20257
|
-
const
|
|
20533
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20258
20534
|
return ranked.map((entry) => {
|
|
20259
|
-
const
|
|
20260
|
-
const
|
|
20261
|
-
if (
|
|
20262
|
-
return {
|
|
20263
|
-
...entry,
|
|
20264
|
-
finalScore: safeBaseScore
|
|
20265
|
-
};
|
|
20535
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
20536
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
20537
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
20538
|
+
return { ...entry, finalScore: base };
|
|
20266
20539
|
}
|
|
20267
|
-
const
|
|
20540
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
20268
20541
|
return {
|
|
20269
20542
|
...entry,
|
|
20270
|
-
finalScore: Number.isFinite(
|
|
20543
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
20271
20544
|
};
|
|
20272
20545
|
}).sort((a, b) => {
|
|
20273
20546
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -20465,13 +20738,21 @@ function searchsocketHandle(options = {}) {
|
|
|
20465
20738
|
let rateLimiter = null;
|
|
20466
20739
|
const getConfig = async () => {
|
|
20467
20740
|
if (!configPromise) {
|
|
20468
|
-
|
|
20469
|
-
|
|
20470
|
-
|
|
20471
|
-
})
|
|
20741
|
+
let configP;
|
|
20742
|
+
if (options.config) {
|
|
20743
|
+
configP = Promise.resolve(options.config);
|
|
20744
|
+
} else if (options.rawConfig) {
|
|
20745
|
+
const cwd = options.cwd ?? process.cwd();
|
|
20746
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
20747
|
+
} else {
|
|
20748
|
+
configP = loadConfig({
|
|
20749
|
+
cwd: options.cwd,
|
|
20750
|
+
configPath: options.configPath
|
|
20751
|
+
});
|
|
20752
|
+
}
|
|
20472
20753
|
configPromise = configP.then((config) => {
|
|
20473
20754
|
apiPath = apiPath ?? config.api.path;
|
|
20474
|
-
if (config.api.rateLimit) {
|
|
20755
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
20475
20756
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
20476
20757
|
}
|
|
20477
20758
|
return config;
|
|
@@ -20481,10 +20762,9 @@ function searchsocketHandle(options = {}) {
|
|
|
20481
20762
|
};
|
|
20482
20763
|
const getEngine = async () => {
|
|
20483
20764
|
if (!enginePromise) {
|
|
20484
|
-
const config =
|
|
20765
|
+
const config = await getConfig();
|
|
20485
20766
|
enginePromise = SearchEngine.create({
|
|
20486
20767
|
cwd: options.cwd,
|
|
20487
|
-
configPath: options.configPath,
|
|
20488
20768
|
config
|
|
20489
20769
|
});
|
|
20490
20770
|
}
|
|
@@ -20757,8 +21037,10 @@ exports.createEmbeddingsProvider = createEmbeddingsProvider;
|
|
|
20757
21037
|
exports.createReranker = createReranker;
|
|
20758
21038
|
exports.createSearchClient = createSearchClient;
|
|
20759
21039
|
exports.createVectorStore = createVectorStore;
|
|
21040
|
+
exports.isServerless = isServerless;
|
|
20760
21041
|
exports.loadConfig = loadConfig;
|
|
20761
21042
|
exports.mergeConfig = mergeConfig;
|
|
21043
|
+
exports.mergeConfigServerless = mergeConfigServerless;
|
|
20762
21044
|
exports.resolveScope = resolveScope;
|
|
20763
21045
|
exports.runMcpServer = runMcpServer;
|
|
20764
21046
|
exports.searchsocketHandle = searchsocketHandle;
|