searchsocket 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +370 -115
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +391 -109
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +389 -108
- package/dist/sveltekit.cjs +374 -109
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +373 -107
- package/dist/{types-D1K46vwd.d.cts → types-BrG6XTUU.d.cts} +29 -13
- package/dist/{types-D1K46vwd.d.ts → types-BrG6XTUU.d.ts} +29 -13
- package/package.json +1 -2
package/dist/sveltekit.cjs
CHANGED
|
@@ -4,8 +4,7 @@ var fs = require('fs');
|
|
|
4
4
|
var path = require('path');
|
|
5
5
|
var jiti = require('jiti');
|
|
6
6
|
var zod = require('zod');
|
|
7
|
-
var
|
|
8
|
-
var pLimit = require('p-limit');
|
|
7
|
+
var pLimit2 = require('p-limit');
|
|
9
8
|
var child_process = require('child_process');
|
|
10
9
|
var crypto = require('crypto');
|
|
11
10
|
var cheerio = require('cheerio');
|
|
@@ -19,8 +18,7 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
|
19
18
|
|
|
20
19
|
var fs__default = /*#__PURE__*/_interopDefault(fs);
|
|
21
20
|
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
22
|
-
var
|
|
23
|
-
var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
|
|
21
|
+
var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
|
|
24
22
|
var matter__default = /*#__PURE__*/_interopDefault(matter);
|
|
25
23
|
var fs4__default = /*#__PURE__*/_interopDefault(fs4);
|
|
26
24
|
var fg__default = /*#__PURE__*/_interopDefault(fg);
|
|
@@ -16629,7 +16627,11 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16629
16627
|
outputDir: zod.z.string().min(1).optional(),
|
|
16630
16628
|
paramValues: zod.z.record(zod.z.string(), zod.z.array(zod.z.string())).optional(),
|
|
16631
16629
|
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16632
|
-
previewTimeout: zod.z.number().int().positive().optional()
|
|
16630
|
+
previewTimeout: zod.z.number().int().positive().optional(),
|
|
16631
|
+
discover: zod.z.boolean().optional(),
|
|
16632
|
+
seedUrls: zod.z.array(zod.z.string()).optional(),
|
|
16633
|
+
maxPages: zod.z.number().int().positive().optional(),
|
|
16634
|
+
maxDepth: zod.z.number().int().nonnegative().optional()
|
|
16633
16635
|
}).optional()
|
|
16634
16636
|
}).optional(),
|
|
16635
16637
|
extract: zod.z.object({
|
|
@@ -16656,8 +16658,9 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16656
16658
|
pageSummaryChunk: zod.z.boolean().optional()
|
|
16657
16659
|
}).optional(),
|
|
16658
16660
|
embeddings: zod.z.object({
|
|
16659
|
-
provider: zod.z.literal("
|
|
16661
|
+
provider: zod.z.literal("jina").optional(),
|
|
16660
16662
|
model: zod.z.string().min(1).optional(),
|
|
16663
|
+
apiKey: zod.z.string().min(1).optional(),
|
|
16661
16664
|
apiKeyEnv: zod.z.string().min(1).optional(),
|
|
16662
16665
|
batchSize: zod.z.number().int().positive().optional(),
|
|
16663
16666
|
concurrency: zod.z.number().int().positive().optional(),
|
|
@@ -16666,18 +16669,17 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16666
16669
|
vector: zod.z.object({
|
|
16667
16670
|
dimension: zod.z.number().int().positive().optional(),
|
|
16668
16671
|
turso: zod.z.object({
|
|
16672
|
+
url: zod.z.string().url().optional(),
|
|
16673
|
+
authToken: zod.z.string().min(1).optional(),
|
|
16669
16674
|
urlEnv: zod.z.string().optional(),
|
|
16670
16675
|
authTokenEnv: zod.z.string().optional(),
|
|
16671
16676
|
localPath: zod.z.string().optional()
|
|
16672
16677
|
}).optional()
|
|
16673
16678
|
}).optional(),
|
|
16674
16679
|
rerank: zod.z.object({
|
|
16675
|
-
|
|
16680
|
+
enabled: zod.z.boolean().optional(),
|
|
16676
16681
|
topN: zod.z.number().int().positive().optional(),
|
|
16677
|
-
|
|
16678
|
-
apiKeyEnv: zod.z.string().optional(),
|
|
16679
|
-
model: zod.z.string().optional()
|
|
16680
|
-
}).optional()
|
|
16682
|
+
model: zod.z.string().optional()
|
|
16681
16683
|
}).optional(),
|
|
16682
16684
|
ranking: zod.z.object({
|
|
16683
16685
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
@@ -16686,6 +16688,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16686
16688
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16687
16689
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16688
16690
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
16691
|
+
minScore: zod.z.number().min(0).max(1).optional(),
|
|
16689
16692
|
weights: zod.z.object({
|
|
16690
16693
|
incomingLinks: zod.z.number().optional(),
|
|
16691
16694
|
depth: zod.z.number().optional(),
|
|
@@ -16766,9 +16769,9 @@ function createDefaultConfig(projectId) {
|
|
|
16766
16769
|
pageSummaryChunk: true
|
|
16767
16770
|
},
|
|
16768
16771
|
embeddings: {
|
|
16769
|
-
provider: "
|
|
16770
|
-
model: "
|
|
16771
|
-
apiKeyEnv: "
|
|
16772
|
+
provider: "jina",
|
|
16773
|
+
model: "jina-embeddings-v3",
|
|
16774
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16772
16775
|
batchSize: 64,
|
|
16773
16776
|
concurrency: 4
|
|
16774
16777
|
},
|
|
@@ -16780,12 +16783,9 @@ function createDefaultConfig(projectId) {
|
|
|
16780
16783
|
}
|
|
16781
16784
|
},
|
|
16782
16785
|
rerank: {
|
|
16783
|
-
|
|
16786
|
+
enabled: false,
|
|
16784
16787
|
topN: 20,
|
|
16785
|
-
|
|
16786
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16787
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16788
|
-
}
|
|
16788
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16789
16789
|
},
|
|
16790
16790
|
ranking: {
|
|
16791
16791
|
enableIncomingLinkBoost: true,
|
|
@@ -16794,6 +16794,7 @@ function createDefaultConfig(projectId) {
|
|
|
16794
16794
|
aggregationCap: 5,
|
|
16795
16795
|
aggregationDecay: 0.5,
|
|
16796
16796
|
minChunkScoreRatio: 0.5,
|
|
16797
|
+
minScore: 0,
|
|
16797
16798
|
weights: {
|
|
16798
16799
|
incomingLinks: 0.05,
|
|
16799
16800
|
depth: 0.03,
|
|
@@ -16920,7 +16921,11 @@ ${issues}`
|
|
|
16920
16921
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16921
16922
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16922
16923
|
exclude: parsed.source.build.exclude ?? [],
|
|
16923
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16924
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16925
|
+
discover: parsed.source.build.discover ?? false,
|
|
16926
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16927
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16928
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16924
16929
|
} : void 0
|
|
16925
16930
|
},
|
|
16926
16931
|
extract: {
|
|
@@ -16949,11 +16954,7 @@ ${issues}`
|
|
|
16949
16954
|
},
|
|
16950
16955
|
rerank: {
|
|
16951
16956
|
...defaults.rerank,
|
|
16952
|
-
...parsed.rerank
|
|
16953
|
-
jina: {
|
|
16954
|
-
...defaults.rerank.jina,
|
|
16955
|
-
...parsed.rerank?.jina
|
|
16956
|
-
}
|
|
16957
|
+
...parsed.rerank
|
|
16957
16958
|
},
|
|
16958
16959
|
ranking: {
|
|
16959
16960
|
...defaults.ranking,
|
|
@@ -17000,7 +17001,11 @@ ${issues}`
|
|
|
17000
17001
|
outputDir: ".svelte-kit/output",
|
|
17001
17002
|
paramValues: {},
|
|
17002
17003
|
exclude: [],
|
|
17003
|
-
previewTimeout: 3e4
|
|
17004
|
+
previewTimeout: 3e4,
|
|
17005
|
+
discover: false,
|
|
17006
|
+
seedUrls: ["/"],
|
|
17007
|
+
maxPages: 200,
|
|
17008
|
+
maxDepth: 10
|
|
17004
17009
|
};
|
|
17005
17010
|
}
|
|
17006
17011
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17035,15 +17040,21 @@ async function loadConfig(options = {}) {
|
|
|
17035
17040
|
const raw = loaded.default ?? loaded;
|
|
17036
17041
|
return mergeConfig(cwd, raw);
|
|
17037
17042
|
}
|
|
17043
|
+
|
|
17044
|
+
// src/core/serverless.ts
|
|
17045
|
+
function isServerless() {
|
|
17046
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
17047
|
+
}
|
|
17038
17048
|
function sleep(ms) {
|
|
17039
17049
|
return new Promise((resolve) => {
|
|
17040
17050
|
setTimeout(resolve, ms);
|
|
17041
17051
|
});
|
|
17042
17052
|
}
|
|
17043
|
-
var
|
|
17044
|
-
|
|
17053
|
+
var JinaEmbeddingsProvider = class {
|
|
17054
|
+
apiKey;
|
|
17045
17055
|
batchSize;
|
|
17046
17056
|
concurrency;
|
|
17057
|
+
defaultTask;
|
|
17047
17058
|
constructor(options) {
|
|
17048
17059
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17049
17060
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17051,11 +17062,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17051
17062
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17052
17063
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17053
17064
|
}
|
|
17054
|
-
this.
|
|
17055
|
-
apiKey: options.apiKey
|
|
17056
|
-
});
|
|
17065
|
+
this.apiKey = options.apiKey;
|
|
17057
17066
|
this.batchSize = options.batchSize;
|
|
17058
17067
|
this.concurrency = options.concurrency;
|
|
17068
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17059
17069
|
}
|
|
17060
17070
|
estimateTokens(text) {
|
|
17061
17071
|
const normalized = text.trim();
|
|
@@ -17069,7 +17079,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17069
17079
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17070
17080
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17071
17081
|
}
|
|
17072
|
-
async embedTexts(texts, modelId) {
|
|
17082
|
+
async embedTexts(texts, modelId, task) {
|
|
17073
17083
|
if (texts.length === 0) {
|
|
17074
17084
|
return [];
|
|
17075
17085
|
}
|
|
@@ -17081,37 +17091,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17081
17091
|
});
|
|
17082
17092
|
}
|
|
17083
17093
|
const outputs = new Array(batches.length);
|
|
17084
|
-
const limit =
|
|
17094
|
+
const limit = pLimit2__default.default(this.concurrency);
|
|
17085
17095
|
await Promise.all(
|
|
17086
17096
|
batches.map(
|
|
17087
17097
|
(batch, position) => limit(async () => {
|
|
17088
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17098
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17089
17099
|
})
|
|
17090
17100
|
)
|
|
17091
17101
|
);
|
|
17092
17102
|
return outputs.flat();
|
|
17093
17103
|
}
|
|
17094
|
-
async embedWithRetry(texts, modelId) {
|
|
17104
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17095
17105
|
const maxAttempts = 5;
|
|
17096
17106
|
let attempt = 0;
|
|
17097
17107
|
while (attempt < maxAttempts) {
|
|
17098
17108
|
attempt += 1;
|
|
17109
|
+
let response;
|
|
17099
17110
|
try {
|
|
17100
|
-
|
|
17101
|
-
|
|
17102
|
-
|
|
17103
|
-
|
|
17111
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17112
|
+
method: "POST",
|
|
17113
|
+
headers: {
|
|
17114
|
+
"content-type": "application/json",
|
|
17115
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17116
|
+
},
|
|
17117
|
+
body: JSON.stringify({
|
|
17118
|
+
model: modelId,
|
|
17119
|
+
input: texts,
|
|
17120
|
+
task
|
|
17121
|
+
})
|
|
17104
17122
|
});
|
|
17105
|
-
return response.data.map((entry) => entry.embedding);
|
|
17106
17123
|
} catch (error) {
|
|
17107
|
-
|
|
17108
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17109
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17124
|
+
if (attempt >= maxAttempts) {
|
|
17110
17125
|
throw error;
|
|
17111
17126
|
}
|
|
17112
|
-
|
|
17113
|
-
|
|
17127
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17128
|
+
continue;
|
|
17114
17129
|
}
|
|
17130
|
+
if (!response.ok) {
|
|
17131
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17132
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17133
|
+
const errorBody = await response.text();
|
|
17134
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17135
|
+
}
|
|
17136
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17137
|
+
continue;
|
|
17138
|
+
}
|
|
17139
|
+
const payload = await response.json();
|
|
17140
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17141
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17142
|
+
}
|
|
17143
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17115
17144
|
}
|
|
17116
17145
|
throw new Error("Unreachable retry state");
|
|
17117
17146
|
}
|
|
@@ -17119,20 +17148,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17119
17148
|
|
|
17120
17149
|
// src/embeddings/factory.ts
|
|
17121
17150
|
function createEmbeddingsProvider(config) {
|
|
17122
|
-
if (config.embeddings.provider !== "
|
|
17151
|
+
if (config.embeddings.provider !== "jina") {
|
|
17123
17152
|
throw new SearchSocketError(
|
|
17124
17153
|
"CONFIG_MISSING",
|
|
17125
17154
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17126
17155
|
);
|
|
17127
17156
|
}
|
|
17128
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17157
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17129
17158
|
if (!apiKey) {
|
|
17130
17159
|
throw new SearchSocketError(
|
|
17131
17160
|
"CONFIG_MISSING",
|
|
17132
|
-
`Missing embeddings API key env var
|
|
17161
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17133
17162
|
);
|
|
17134
17163
|
}
|
|
17135
|
-
return new
|
|
17164
|
+
return new JinaEmbeddingsProvider({
|
|
17136
17165
|
apiKey,
|
|
17137
17166
|
batchSize: config.embeddings.batchSize,
|
|
17138
17167
|
concurrency: config.embeddings.concurrency
|
|
@@ -17295,20 +17324,17 @@ var JinaReranker = class {
|
|
|
17295
17324
|
|
|
17296
17325
|
// src/rerank/factory.ts
|
|
17297
17326
|
function createReranker(config) {
|
|
17298
|
-
if (config.rerank.
|
|
17327
|
+
if (!config.rerank.enabled) {
|
|
17299
17328
|
return null;
|
|
17300
17329
|
}
|
|
17301
|
-
|
|
17302
|
-
|
|
17303
|
-
|
|
17304
|
-
return null;
|
|
17305
|
-
}
|
|
17306
|
-
return new JinaReranker({
|
|
17307
|
-
apiKey,
|
|
17308
|
-
model: config.rerank.jina.model
|
|
17309
|
-
});
|
|
17330
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17331
|
+
if (!apiKey) {
|
|
17332
|
+
return null;
|
|
17310
17333
|
}
|
|
17311
|
-
return
|
|
17334
|
+
return new JinaReranker({
|
|
17335
|
+
apiKey,
|
|
17336
|
+
model: config.rerank.model
|
|
17337
|
+
});
|
|
17312
17338
|
}
|
|
17313
17339
|
|
|
17314
17340
|
// src/utils/time.ts
|
|
@@ -17413,6 +17439,16 @@ var TursoVectorStore = class {
|
|
|
17413
17439
|
}
|
|
17414
17440
|
async ensureChunks(dim) {
|
|
17415
17441
|
if (this.chunksReady) return;
|
|
17442
|
+
const exists = await this.chunksTableExists();
|
|
17443
|
+
if (exists) {
|
|
17444
|
+
const currentDim = await this.getChunksDimension();
|
|
17445
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17446
|
+
await this.client.batch([
|
|
17447
|
+
"DROP INDEX IF EXISTS idx",
|
|
17448
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17449
|
+
]);
|
|
17450
|
+
}
|
|
17451
|
+
}
|
|
17416
17452
|
await this.client.batch([
|
|
17417
17453
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17418
17454
|
id TEXT PRIMARY KEY,
|
|
@@ -17424,12 +17460,16 @@ var TursoVectorStore = class {
|
|
|
17424
17460
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17425
17461
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17426
17462
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17463
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17464
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17427
17465
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17428
17466
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17429
17467
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
17430
17468
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17431
17469
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17432
17470
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17471
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17472
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17433
17473
|
embedding F32_BLOB(${dim})
|
|
17434
17474
|
)`,
|
|
17435
17475
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
@@ -17468,6 +17508,38 @@ var TursoVectorStore = class {
|
|
|
17468
17508
|
throw error;
|
|
17469
17509
|
}
|
|
17470
17510
|
}
|
|
17511
|
+
/**
|
|
17512
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17513
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17514
|
+
*/
|
|
17515
|
+
async getChunksDimension() {
|
|
17516
|
+
try {
|
|
17517
|
+
const rs = await this.client.execute(
|
|
17518
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17519
|
+
);
|
|
17520
|
+
if (rs.rows.length === 0) return null;
|
|
17521
|
+
const sql = rs.rows[0].sql;
|
|
17522
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17523
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17524
|
+
} catch {
|
|
17525
|
+
return null;
|
|
17526
|
+
}
|
|
17527
|
+
}
|
|
17528
|
+
/**
|
|
17529
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17530
|
+
* Used by `clean --remote` for a full reset.
|
|
17531
|
+
*/
|
|
17532
|
+
async dropAllTables() {
|
|
17533
|
+
await this.client.batch([
|
|
17534
|
+
"DROP INDEX IF EXISTS idx",
|
|
17535
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17536
|
+
"DROP TABLE IF EXISTS registry",
|
|
17537
|
+
"DROP TABLE IF EXISTS pages"
|
|
17538
|
+
]);
|
|
17539
|
+
this.chunksReady = false;
|
|
17540
|
+
this.registryReady = false;
|
|
17541
|
+
this.pagesReady = false;
|
|
17542
|
+
}
|
|
17471
17543
|
async upsert(records, _scope) {
|
|
17472
17544
|
if (records.length === 0) return;
|
|
17473
17545
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17478,9 +17550,9 @@ var TursoVectorStore = class {
|
|
|
17478
17550
|
const stmts = batch.map((r) => ({
|
|
17479
17551
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17480
17552
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17481
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17482
|
-
incoming_links, route_file, tags, embedding)
|
|
17483
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17553
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17554
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17555
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17484
17556
|
args: [
|
|
17485
17557
|
r.id,
|
|
17486
17558
|
r.metadata.projectId,
|
|
@@ -17491,12 +17563,16 @@ var TursoVectorStore = class {
|
|
|
17491
17563
|
r.metadata.sectionTitle,
|
|
17492
17564
|
JSON.stringify(r.metadata.headingPath),
|
|
17493
17565
|
r.metadata.snippet,
|
|
17566
|
+
r.metadata.chunkText,
|
|
17567
|
+
r.metadata.ordinal,
|
|
17494
17568
|
r.metadata.contentHash,
|
|
17495
17569
|
r.metadata.modelId,
|
|
17496
17570
|
r.metadata.depth,
|
|
17497
17571
|
r.metadata.incomingLinks,
|
|
17498
17572
|
r.metadata.routeFile,
|
|
17499
17573
|
JSON.stringify(r.metadata.tags),
|
|
17574
|
+
r.metadata.description ?? "",
|
|
17575
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17500
17576
|
JSON.stringify(r.vector)
|
|
17501
17577
|
]
|
|
17502
17578
|
}));
|
|
@@ -17509,8 +17585,10 @@ var TursoVectorStore = class {
|
|
|
17509
17585
|
const queryJson = JSON.stringify(queryVector);
|
|
17510
17586
|
const rs = await this.client.execute({
|
|
17511
17587
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17512
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17588
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17589
|
+
c.ordinal, c.content_hash,
|
|
17513
17590
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17591
|
+
c.description, c.keywords,
|
|
17514
17592
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17515
17593
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17516
17594
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17541,6 +17619,12 @@ var TursoVectorStore = class {
|
|
|
17541
17619
|
}
|
|
17542
17620
|
const distance = row.distance;
|
|
17543
17621
|
const score = 1 - distance;
|
|
17622
|
+
const description = row.description || void 0;
|
|
17623
|
+
const keywords = (() => {
|
|
17624
|
+
const raw = row.keywords || "[]";
|
|
17625
|
+
const parsed = JSON.parse(raw);
|
|
17626
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17627
|
+
})();
|
|
17544
17628
|
hits.push({
|
|
17545
17629
|
id: row.id,
|
|
17546
17630
|
score,
|
|
@@ -17553,12 +17637,16 @@ var TursoVectorStore = class {
|
|
|
17553
17637
|
sectionTitle: row.section_title,
|
|
17554
17638
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17555
17639
|
snippet: row.snippet,
|
|
17640
|
+
chunkText: row.chunk_text || "",
|
|
17641
|
+
ordinal: row.ordinal || 0,
|
|
17556
17642
|
contentHash: row.content_hash,
|
|
17557
17643
|
modelId: row.model_id,
|
|
17558
17644
|
depth: row.depth,
|
|
17559
17645
|
incomingLinks: row.incoming_links,
|
|
17560
17646
|
routeFile: row.route_file,
|
|
17561
|
-
tags
|
|
17647
|
+
tags,
|
|
17648
|
+
description,
|
|
17649
|
+
keywords
|
|
17562
17650
|
}
|
|
17563
17651
|
});
|
|
17564
17652
|
}
|
|
@@ -17748,10 +17836,10 @@ var TursoVectorStore = class {
|
|
|
17748
17836
|
// src/vector/factory.ts
|
|
17749
17837
|
async function createVectorStore(config, cwd) {
|
|
17750
17838
|
const turso = config.vector.turso;
|
|
17751
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17839
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17752
17840
|
if (remoteUrl) {
|
|
17753
17841
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17754
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17842
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17755
17843
|
const client2 = createClient2({
|
|
17756
17844
|
url: remoteUrl,
|
|
17757
17845
|
authToken
|
|
@@ -17761,6 +17849,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17761
17849
|
dimension: config.vector.dimension
|
|
17762
17850
|
});
|
|
17763
17851
|
}
|
|
17852
|
+
if (isServerless()) {
|
|
17853
|
+
throw new SearchSocketError(
|
|
17854
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17855
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17856
|
+
);
|
|
17857
|
+
}
|
|
17764
17858
|
const { createClient } = await import('@libsql/client');
|
|
17765
17859
|
const localPath = path__default.default.resolve(cwd, turso.localPath);
|
|
17766
17860
|
fs__default.default.mkdirSync(path__default.default.dirname(localPath), { recursive: true });
|
|
@@ -17918,7 +18012,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
17918
18012
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17919
18013
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
17920
18014
|
const embedStart = process.hrtime.bigint();
|
|
17921
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
18015
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
17922
18016
|
const queryVector = queryEmbeddings[0];
|
|
17923
18017
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
17924
18018
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -17946,13 +18040,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
17946
18040
|
usedRerank = true;
|
|
17947
18041
|
}
|
|
17948
18042
|
let results;
|
|
18043
|
+
const minScore = this.config.ranking.minScore;
|
|
17949
18044
|
if (groupByPage) {
|
|
17950
|
-
|
|
18045
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
18046
|
+
if (minScore > 0) {
|
|
18047
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18048
|
+
}
|
|
17951
18049
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
17952
18050
|
results = pages.slice(0, topK).map((page) => {
|
|
17953
18051
|
const bestScore = page.bestChunk.finalScore;
|
|
17954
|
-
const
|
|
17955
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18052
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18053
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
17956
18054
|
return {
|
|
17957
18055
|
url: page.url,
|
|
17958
18056
|
title: page.title,
|
|
@@ -17969,6 +18067,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
17969
18067
|
};
|
|
17970
18068
|
});
|
|
17971
18069
|
} else {
|
|
18070
|
+
if (minScore > 0) {
|
|
18071
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18072
|
+
}
|
|
17972
18073
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
17973
18074
|
url: hit.metadata.url,
|
|
17974
18075
|
title: hit.metadata.title,
|
|
@@ -18040,43 +18141,67 @@ var SearchEngine = class _SearchEngine {
|
|
|
18040
18141
|
}
|
|
18041
18142
|
}
|
|
18042
18143
|
async rerankHits(query, ranked, topK) {
|
|
18043
|
-
if (this.config.rerank.
|
|
18144
|
+
if (!this.config.rerank.enabled) {
|
|
18044
18145
|
throw new SearchSocketError(
|
|
18045
18146
|
"INVALID_REQUEST",
|
|
18046
|
-
"rerank=true requested but rerank.
|
|
18147
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
18047
18148
|
400
|
|
18048
18149
|
);
|
|
18049
18150
|
}
|
|
18050
18151
|
if (!this.reranker) {
|
|
18051
18152
|
throw new SearchSocketError(
|
|
18052
18153
|
"CONFIG_MISSING",
|
|
18053
|
-
`rerank=true requested but ${this.config.
|
|
18154
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
18054
18155
|
400
|
|
18055
18156
|
);
|
|
18056
18157
|
}
|
|
18057
|
-
const
|
|
18058
|
-
|
|
18059
|
-
|
|
18060
|
-
|
|
18158
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
18159
|
+
for (const entry of ranked) {
|
|
18160
|
+
const url = entry.hit.metadata.url;
|
|
18161
|
+
const group = pageGroups.get(url);
|
|
18162
|
+
if (group) group.push(entry);
|
|
18163
|
+
else pageGroups.set(url, [entry]);
|
|
18164
|
+
}
|
|
18165
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18166
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18167
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18168
|
+
const pageCandidates = [];
|
|
18169
|
+
for (const [url, chunks] of pageGroups) {
|
|
18170
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
18171
|
+
const bestScore = byScore[0].finalScore;
|
|
18172
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
18173
|
+
const selected = byScore.filter(
|
|
18174
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
18175
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
18176
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
18177
|
+
const first = selected[0].hit.metadata;
|
|
18178
|
+
const parts = [first.title];
|
|
18179
|
+
if (first.description) {
|
|
18180
|
+
parts.push(first.description);
|
|
18181
|
+
}
|
|
18182
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
18183
|
+
parts.push(first.keywords.join(", "));
|
|
18184
|
+
}
|
|
18185
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18186
|
+
parts.push(body);
|
|
18187
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
18188
|
+
}
|
|
18061
18189
|
const reranked = await this.reranker.rerank(
|
|
18062
18190
|
query,
|
|
18063
|
-
|
|
18191
|
+
pageCandidates,
|
|
18064
18192
|
Math.max(topK, this.config.rerank.topN)
|
|
18065
18193
|
);
|
|
18066
|
-
const
|
|
18194
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18067
18195
|
return ranked.map((entry) => {
|
|
18068
|
-
const
|
|
18069
|
-
const
|
|
18070
|
-
if (
|
|
18071
|
-
return {
|
|
18072
|
-
...entry,
|
|
18073
|
-
finalScore: safeBaseScore
|
|
18074
|
-
};
|
|
18196
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
18197
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
18198
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
18199
|
+
return { ...entry, finalScore: base };
|
|
18075
18200
|
}
|
|
18076
|
-
const
|
|
18201
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
18077
18202
|
return {
|
|
18078
18203
|
...entry,
|
|
18079
|
-
finalScore: Number.isFinite(
|
|
18204
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
18080
18205
|
};
|
|
18081
18206
|
}).sort((a, b) => {
|
|
18082
18207
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -18116,13 +18241,21 @@ function searchsocketHandle(options = {}) {
|
|
|
18116
18241
|
let rateLimiter = null;
|
|
18117
18242
|
const getConfig = async () => {
|
|
18118
18243
|
if (!configPromise) {
|
|
18119
|
-
|
|
18120
|
-
|
|
18121
|
-
|
|
18122
|
-
})
|
|
18244
|
+
let configP;
|
|
18245
|
+
if (options.config) {
|
|
18246
|
+
configP = Promise.resolve(options.config);
|
|
18247
|
+
} else if (options.rawConfig) {
|
|
18248
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18249
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
18250
|
+
} else {
|
|
18251
|
+
configP = loadConfig({
|
|
18252
|
+
cwd: options.cwd,
|
|
18253
|
+
configPath: options.configPath
|
|
18254
|
+
});
|
|
18255
|
+
}
|
|
18123
18256
|
configPromise = configP.then((config) => {
|
|
18124
18257
|
apiPath = apiPath ?? config.api.path;
|
|
18125
|
-
if (config.api.rateLimit) {
|
|
18258
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
18126
18259
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
18127
18260
|
}
|
|
18128
18261
|
return config;
|
|
@@ -18132,10 +18265,9 @@ function searchsocketHandle(options = {}) {
|
|
|
18132
18265
|
};
|
|
18133
18266
|
const getEngine = async () => {
|
|
18134
18267
|
if (!enginePromise) {
|
|
18135
|
-
const config =
|
|
18268
|
+
const config = await getConfig();
|
|
18136
18269
|
enginePromise = SearchEngine.create({
|
|
18137
18270
|
cwd: options.cwd,
|
|
18138
|
-
configPath: options.configPath,
|
|
18139
18271
|
config
|
|
18140
18272
|
});
|
|
18141
18273
|
}
|
|
@@ -18562,7 +18694,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18562
18694
|
incomingLinks: page.incomingLinks,
|
|
18563
18695
|
routeFile: page.routeFile,
|
|
18564
18696
|
tags: page.tags,
|
|
18565
|
-
contentHash: ""
|
|
18697
|
+
contentHash: "",
|
|
18698
|
+
description: page.description,
|
|
18699
|
+
keywords: page.keywords
|
|
18566
18700
|
};
|
|
18567
18701
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18568
18702
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18589,7 +18723,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18589
18723
|
incomingLinks: page.incomingLinks,
|
|
18590
18724
|
routeFile: page.routeFile,
|
|
18591
18725
|
tags: page.tags,
|
|
18592
|
-
contentHash: ""
|
|
18726
|
+
contentHash: "",
|
|
18727
|
+
description: page.description,
|
|
18728
|
+
keywords: page.keywords
|
|
18593
18729
|
};
|
|
18594
18730
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18595
18731
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19670,14 +19806,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19670
19806
|
var Logger = class {
|
|
19671
19807
|
json;
|
|
19672
19808
|
verbose;
|
|
19809
|
+
quiet;
|
|
19673
19810
|
stderrOnly;
|
|
19674
19811
|
constructor(opts = {}) {
|
|
19675
19812
|
this.json = opts.json ?? false;
|
|
19676
19813
|
this.verbose = opts.verbose ?? false;
|
|
19814
|
+
this.quiet = opts.quiet ?? false;
|
|
19677
19815
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19678
19816
|
}
|
|
19679
19817
|
info(message) {
|
|
19680
|
-
if (this.json) {
|
|
19818
|
+
if (this.quiet || this.json) {
|
|
19681
19819
|
return;
|
|
19682
19820
|
}
|
|
19683
19821
|
this.writeOut(`${message}
|
|
@@ -19691,7 +19829,7 @@ var Logger = class {
|
|
|
19691
19829
|
this.logJson("debug", { message });
|
|
19692
19830
|
return;
|
|
19693
19831
|
}
|
|
19694
|
-
this.writeOut(
|
|
19832
|
+
this.writeOut(` ${message}
|
|
19695
19833
|
`);
|
|
19696
19834
|
}
|
|
19697
19835
|
warn(message) {
|
|
@@ -19718,7 +19856,7 @@ var Logger = class {
|
|
|
19718
19856
|
this.logJson(event, data);
|
|
19719
19857
|
return;
|
|
19720
19858
|
}
|
|
19721
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19859
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19722
19860
|
`);
|
|
19723
19861
|
}
|
|
19724
19862
|
writeOut(text) {
|
|
@@ -19903,11 +20041,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19903
20041
|
|
|
19904
20042
|
// src/indexing/sources/build/index.ts
|
|
19905
20043
|
var logger = new Logger();
|
|
20044
|
+
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
|
|
20045
|
+
const $ = cheerio.load(html);
|
|
20046
|
+
const links = [];
|
|
20047
|
+
$("a[href]").each((_i, el) => {
|
|
20048
|
+
const href = $(el).attr("href");
|
|
20049
|
+
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
20050
|
+
return;
|
|
20051
|
+
}
|
|
20052
|
+
try {
|
|
20053
|
+
const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
|
|
20054
|
+
if (resolved.origin !== baseOrigin) return;
|
|
20055
|
+
if (!["http:", "https:"].includes(resolved.protocol)) return;
|
|
20056
|
+
links.push(normalizeUrlPath(resolved.pathname));
|
|
20057
|
+
} catch {
|
|
20058
|
+
}
|
|
20059
|
+
});
|
|
20060
|
+
return [...new Set(links)];
|
|
20061
|
+
}
|
|
20062
|
+
async function discoverPages(server, buildConfig, pipelineMaxPages) {
|
|
20063
|
+
const { seedUrls, maxDepth, exclude } = buildConfig;
|
|
20064
|
+
const baseOrigin = new URL(server.baseUrl).origin;
|
|
20065
|
+
let effectiveMax = buildConfig.maxPages;
|
|
20066
|
+
if (typeof pipelineMaxPages === "number") {
|
|
20067
|
+
const floored = Math.max(0, Math.floor(pipelineMaxPages));
|
|
20068
|
+
effectiveMax = Math.min(effectiveMax, floored);
|
|
20069
|
+
}
|
|
20070
|
+
if (effectiveMax === 0) return [];
|
|
20071
|
+
const visited = /* @__PURE__ */ new Set();
|
|
20072
|
+
const pages = [];
|
|
20073
|
+
const queue = [];
|
|
20074
|
+
const limit = pLimit2__default.default(8);
|
|
20075
|
+
for (const seed of seedUrls) {
|
|
20076
|
+
const normalized = normalizeUrlPath(seed);
|
|
20077
|
+
if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
|
|
20078
|
+
visited.add(normalized);
|
|
20079
|
+
queue.push({ url: normalized, depth: 0 });
|
|
20080
|
+
}
|
|
20081
|
+
}
|
|
20082
|
+
while (queue.length > 0 && pages.length < effectiveMax) {
|
|
20083
|
+
const remaining = effectiveMax - pages.length;
|
|
20084
|
+
const batch = queue.splice(0, remaining);
|
|
20085
|
+
const results = await Promise.allSettled(
|
|
20086
|
+
batch.map(
|
|
20087
|
+
(item) => limit(async () => {
|
|
20088
|
+
const fullUrl = joinUrl(server.baseUrl, item.url);
|
|
20089
|
+
const response = await fetch(fullUrl);
|
|
20090
|
+
if (!response.ok) {
|
|
20091
|
+
logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
|
|
20092
|
+
return null;
|
|
20093
|
+
}
|
|
20094
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
20095
|
+
if (!contentType.includes("text/html")) {
|
|
20096
|
+
return null;
|
|
20097
|
+
}
|
|
20098
|
+
const html = await response.text();
|
|
20099
|
+
if (item.depth < maxDepth) {
|
|
20100
|
+
const links = extractLinksFromHtml(html, item.url, baseOrigin);
|
|
20101
|
+
for (const link of links) {
|
|
20102
|
+
if (!visited.has(link) && !isExcluded(link, exclude)) {
|
|
20103
|
+
visited.add(link);
|
|
20104
|
+
queue.push({ url: link, depth: item.depth + 1 });
|
|
20105
|
+
}
|
|
20106
|
+
}
|
|
20107
|
+
}
|
|
20108
|
+
return {
|
|
20109
|
+
url: item.url,
|
|
20110
|
+
html,
|
|
20111
|
+
sourcePath: fullUrl,
|
|
20112
|
+
outgoingLinks: []
|
|
20113
|
+
};
|
|
20114
|
+
})
|
|
20115
|
+
)
|
|
20116
|
+
);
|
|
20117
|
+
for (const result of results) {
|
|
20118
|
+
if (result.status === "fulfilled" && result.value) {
|
|
20119
|
+
pages.push(result.value);
|
|
20120
|
+
}
|
|
20121
|
+
}
|
|
20122
|
+
}
|
|
20123
|
+
if (pages.length >= effectiveMax && queue.length > 0) {
|
|
20124
|
+
logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
|
|
20125
|
+
}
|
|
20126
|
+
logger.event("build_discover_complete", {
|
|
20127
|
+
pagesFound: pages.length,
|
|
20128
|
+
urlsVisited: visited.size,
|
|
20129
|
+
urlsSkipped: queue.length
|
|
20130
|
+
});
|
|
20131
|
+
return pages;
|
|
20132
|
+
}
|
|
19906
20133
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19907
20134
|
const buildConfig = config.source.build;
|
|
19908
20135
|
if (!buildConfig) {
|
|
19909
20136
|
throw new Error("build source config is missing");
|
|
19910
20137
|
}
|
|
20138
|
+
if (buildConfig.discover) {
|
|
20139
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
20140
|
+
try {
|
|
20141
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
20142
|
+
} finally {
|
|
20143
|
+
await server2.shutdown();
|
|
20144
|
+
}
|
|
20145
|
+
}
|
|
19911
20146
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19912
20147
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19913
20148
|
logger.event("build_routes_discovered", {
|
|
@@ -19918,7 +20153,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19918
20153
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19919
20154
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19920
20155
|
try {
|
|
19921
|
-
const concurrencyLimit =
|
|
20156
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
19922
20157
|
const results = await Promise.allSettled(
|
|
19923
20158
|
selected.map(
|
|
19924
20159
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20087,7 +20322,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
20087
20322
|
const routes = await resolveRoutes(config);
|
|
20088
20323
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
20089
20324
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
20090
|
-
const concurrencyLimit =
|
|
20325
|
+
const concurrencyLimit = pLimit2__default.default(8);
|
|
20091
20326
|
const results = await Promise.allSettled(
|
|
20092
20327
|
selected.map(
|
|
20093
20328
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20141,9 +20376,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20141
20376
|
|
|
20142
20377
|
// src/indexing/pipeline.ts
|
|
20143
20378
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20144
|
-
"
|
|
20145
|
-
"text-embedding-3-large": 13e-5,
|
|
20146
|
-
"text-embedding-ada-002": 1e-4
|
|
20379
|
+
"jina-embeddings-v3": 2e-5
|
|
20147
20380
|
};
|
|
20148
20381
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
20149
20382
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -20189,9 +20422,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20189
20422
|
};
|
|
20190
20423
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20191
20424
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
20425
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20426
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
20192
20427
|
if (options.force) {
|
|
20428
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20193
20429
|
await cleanMirrorForScope(statePath, scope);
|
|
20194
20430
|
}
|
|
20431
|
+
if (options.dryRun) {
|
|
20432
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
20433
|
+
}
|
|
20195
20434
|
const manifestStart = stageStart();
|
|
20196
20435
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
20197
20436
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -20202,8 +20441,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20202
20441
|
);
|
|
20203
20442
|
}
|
|
20204
20443
|
stageEnd("manifest", manifestStart);
|
|
20444
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
20205
20445
|
const sourceStart = stageStart();
|
|
20206
|
-
|
|
20446
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20207
20447
|
let sourcePages;
|
|
20208
20448
|
if (sourceMode === "static-output") {
|
|
20209
20449
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -20215,10 +20455,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20215
20455
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
20216
20456
|
}
|
|
20217
20457
|
stageEnd("source", sourceStart);
|
|
20458
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20218
20459
|
const routeStart = stageStart();
|
|
20219
20460
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20220
20461
|
stageEnd("route_map", routeStart);
|
|
20462
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
20221
20463
|
const extractStart = stageStart();
|
|
20464
|
+
this.logger.info("Extracting content...");
|
|
20222
20465
|
const extractedPages = [];
|
|
20223
20466
|
for (const sourcePage of sourcePages) {
|
|
20224
20467
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -20247,6 +20490,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20247
20490
|
uniquePages.push(page);
|
|
20248
20491
|
}
|
|
20249
20492
|
stageEnd("extract", extractStart);
|
|
20493
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
20494
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20250
20495
|
const linkStart = stageStart();
|
|
20251
20496
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
20252
20497
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -20262,7 +20507,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20262
20507
|
}
|
|
20263
20508
|
}
|
|
20264
20509
|
stageEnd("links", linkStart);
|
|
20510
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
20265
20511
|
const mirrorStart = stageStart();
|
|
20512
|
+
this.logger.info("Writing mirror pages...");
|
|
20266
20513
|
const mirrorPages = [];
|
|
20267
20514
|
let routeExact = 0;
|
|
20268
20515
|
let routeBestEffort = 0;
|
|
@@ -20332,7 +20579,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20332
20579
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
20333
20580
|
}
|
|
20334
20581
|
stageEnd("mirror", mirrorStart);
|
|
20582
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
20335
20583
|
const chunkStart = stageStart();
|
|
20584
|
+
this.logger.info("Chunking pages...");
|
|
20336
20585
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
20337
20586
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
20338
20587
|
if (typeof maxChunks === "number") {
|
|
@@ -20345,6 +20594,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20345
20594
|
});
|
|
20346
20595
|
}
|
|
20347
20596
|
stageEnd("chunk", chunkStart);
|
|
20597
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
20348
20598
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
20349
20599
|
for (const chunk of chunks) {
|
|
20350
20600
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -20363,6 +20613,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20363
20613
|
return existingHash !== chunk.contentHash;
|
|
20364
20614
|
});
|
|
20365
20615
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20616
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20366
20617
|
const embedStart = stageStart();
|
|
20367
20618
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
20368
20619
|
for (const chunk of changedChunks) {
|
|
@@ -20377,9 +20628,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20377
20628
|
let newEmbeddings = 0;
|
|
20378
20629
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
20379
20630
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20631
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
20380
20632
|
const embeddings = await this.embeddings.embedTexts(
|
|
20381
20633
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
20382
|
-
this.config.embeddings.model
|
|
20634
|
+
this.config.embeddings.model,
|
|
20635
|
+
"retrieval.passage"
|
|
20383
20636
|
);
|
|
20384
20637
|
if (embeddings.length !== changedChunks.length) {
|
|
20385
20638
|
throw new SearchSocketError(
|
|
@@ -20402,8 +20655,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20402
20655
|
}
|
|
20403
20656
|
}
|
|
20404
20657
|
stageEnd("embedding", embedStart);
|
|
20658
|
+
if (changedChunks.length > 0) {
|
|
20659
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20660
|
+
} else {
|
|
20661
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20662
|
+
}
|
|
20405
20663
|
const syncStart = stageStart();
|
|
20406
20664
|
if (!options.dryRun) {
|
|
20665
|
+
this.logger.info("Syncing vectors...");
|
|
20407
20666
|
const upserts = [];
|
|
20408
20667
|
for (const chunk of changedChunks) {
|
|
20409
20668
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -20422,12 +20681,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20422
20681
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
20423
20682
|
headingPath: chunk.headingPath,
|
|
20424
20683
|
snippet: chunk.snippet,
|
|
20684
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20685
|
+
ordinal: chunk.ordinal,
|
|
20425
20686
|
contentHash: chunk.contentHash,
|
|
20426
20687
|
modelId: this.config.embeddings.model,
|
|
20427
20688
|
depth: chunk.depth,
|
|
20428
20689
|
incomingLinks: chunk.incomingLinks,
|
|
20429
20690
|
routeFile: chunk.routeFile,
|
|
20430
|
-
tags: chunk.tags
|
|
20691
|
+
tags: chunk.tags,
|
|
20692
|
+
description: chunk.description,
|
|
20693
|
+
keywords: chunk.keywords
|
|
20431
20694
|
}
|
|
20432
20695
|
});
|
|
20433
20696
|
}
|
|
@@ -20441,6 +20704,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20441
20704
|
}
|
|
20442
20705
|
}
|
|
20443
20706
|
stageEnd("sync", syncStart);
|
|
20707
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
20444
20708
|
const finalizeStart = stageStart();
|
|
20445
20709
|
if (!options.dryRun) {
|
|
20446
20710
|
const scopeInfo = {
|
|
@@ -20460,6 +20724,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20460
20724
|
});
|
|
20461
20725
|
}
|
|
20462
20726
|
stageEnd("finalize", finalizeStart);
|
|
20727
|
+
this.logger.info("Done.");
|
|
20463
20728
|
return {
|
|
20464
20729
|
pagesProcessed: mirrorPages.length,
|
|
20465
20730
|
chunksTotal: chunks.length,
|