searchsocket 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +370 -115
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +391 -109
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +389 -108
- package/dist/sveltekit.cjs +374 -109
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +373 -107
- package/dist/{types-D1K46vwd.d.cts → types-BrG6XTUU.d.cts} +29 -13
- package/dist/{types-D1K46vwd.d.ts → types-BrG6XTUU.d.ts} +29 -13
- package/package.json +1 -2
package/dist/sveltekit.js
CHANGED
|
@@ -2,8 +2,7 @@ import fs from 'fs';
|
|
|
2
2
|
import path from 'path';
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
|
-
import
|
|
6
|
-
import pLimit from 'p-limit';
|
|
5
|
+
import pLimit2 from 'p-limit';
|
|
7
6
|
import { execSync, spawn } from 'child_process';
|
|
8
7
|
import { createHash } from 'crypto';
|
|
9
8
|
import { load } from 'cheerio';
|
|
@@ -16616,7 +16615,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
16616
16615
|
outputDir: z.string().min(1).optional(),
|
|
16617
16616
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
16618
16617
|
exclude: z.array(z.string()).optional(),
|
|
16619
|
-
previewTimeout: z.number().int().positive().optional()
|
|
16618
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
16619
|
+
discover: z.boolean().optional(),
|
|
16620
|
+
seedUrls: z.array(z.string()).optional(),
|
|
16621
|
+
maxPages: z.number().int().positive().optional(),
|
|
16622
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
16620
16623
|
}).optional()
|
|
16621
16624
|
}).optional(),
|
|
16622
16625
|
extract: z.object({
|
|
@@ -16643,8 +16646,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
16643
16646
|
pageSummaryChunk: z.boolean().optional()
|
|
16644
16647
|
}).optional(),
|
|
16645
16648
|
embeddings: z.object({
|
|
16646
|
-
provider: z.literal("
|
|
16649
|
+
provider: z.literal("jina").optional(),
|
|
16647
16650
|
model: z.string().min(1).optional(),
|
|
16651
|
+
apiKey: z.string().min(1).optional(),
|
|
16648
16652
|
apiKeyEnv: z.string().min(1).optional(),
|
|
16649
16653
|
batchSize: z.number().int().positive().optional(),
|
|
16650
16654
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -16653,18 +16657,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
16653
16657
|
vector: z.object({
|
|
16654
16658
|
dimension: z.number().int().positive().optional(),
|
|
16655
16659
|
turso: z.object({
|
|
16660
|
+
url: z.string().url().optional(),
|
|
16661
|
+
authToken: z.string().min(1).optional(),
|
|
16656
16662
|
urlEnv: z.string().optional(),
|
|
16657
16663
|
authTokenEnv: z.string().optional(),
|
|
16658
16664
|
localPath: z.string().optional()
|
|
16659
16665
|
}).optional()
|
|
16660
16666
|
}).optional(),
|
|
16661
16667
|
rerank: z.object({
|
|
16662
|
-
|
|
16668
|
+
enabled: z.boolean().optional(),
|
|
16663
16669
|
topN: z.number().int().positive().optional(),
|
|
16664
|
-
|
|
16665
|
-
apiKeyEnv: z.string().optional(),
|
|
16666
|
-
model: z.string().optional()
|
|
16667
|
-
}).optional()
|
|
16670
|
+
model: z.string().optional()
|
|
16668
16671
|
}).optional(),
|
|
16669
16672
|
ranking: z.object({
|
|
16670
16673
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -16673,6 +16676,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16673
16676
|
aggregationCap: z.number().int().positive().optional(),
|
|
16674
16677
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16675
16678
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16679
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
16676
16680
|
weights: z.object({
|
|
16677
16681
|
incomingLinks: z.number().optional(),
|
|
16678
16682
|
depth: z.number().optional(),
|
|
@@ -16753,9 +16757,9 @@ function createDefaultConfig(projectId) {
|
|
|
16753
16757
|
pageSummaryChunk: true
|
|
16754
16758
|
},
|
|
16755
16759
|
embeddings: {
|
|
16756
|
-
provider: "
|
|
16757
|
-
model: "
|
|
16758
|
-
apiKeyEnv: "
|
|
16760
|
+
provider: "jina",
|
|
16761
|
+
model: "jina-embeddings-v3",
|
|
16762
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16759
16763
|
batchSize: 64,
|
|
16760
16764
|
concurrency: 4
|
|
16761
16765
|
},
|
|
@@ -16767,12 +16771,9 @@ function createDefaultConfig(projectId) {
|
|
|
16767
16771
|
}
|
|
16768
16772
|
},
|
|
16769
16773
|
rerank: {
|
|
16770
|
-
|
|
16774
|
+
enabled: false,
|
|
16771
16775
|
topN: 20,
|
|
16772
|
-
|
|
16773
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16774
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16775
|
-
}
|
|
16776
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16776
16777
|
},
|
|
16777
16778
|
ranking: {
|
|
16778
16779
|
enableIncomingLinkBoost: true,
|
|
@@ -16781,6 +16782,7 @@ function createDefaultConfig(projectId) {
|
|
|
16781
16782
|
aggregationCap: 5,
|
|
16782
16783
|
aggregationDecay: 0.5,
|
|
16783
16784
|
minChunkScoreRatio: 0.5,
|
|
16785
|
+
minScore: 0,
|
|
16784
16786
|
weights: {
|
|
16785
16787
|
incomingLinks: 0.05,
|
|
16786
16788
|
depth: 0.03,
|
|
@@ -16907,7 +16909,11 @@ ${issues}`
|
|
|
16907
16909
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16908
16910
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16909
16911
|
exclude: parsed.source.build.exclude ?? [],
|
|
16910
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16912
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16913
|
+
discover: parsed.source.build.discover ?? false,
|
|
16914
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16915
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16916
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16911
16917
|
} : void 0
|
|
16912
16918
|
},
|
|
16913
16919
|
extract: {
|
|
@@ -16936,11 +16942,7 @@ ${issues}`
|
|
|
16936
16942
|
},
|
|
16937
16943
|
rerank: {
|
|
16938
16944
|
...defaults.rerank,
|
|
16939
|
-
...parsed.rerank
|
|
16940
|
-
jina: {
|
|
16941
|
-
...defaults.rerank.jina,
|
|
16942
|
-
...parsed.rerank?.jina
|
|
16943
|
-
}
|
|
16945
|
+
...parsed.rerank
|
|
16944
16946
|
},
|
|
16945
16947
|
ranking: {
|
|
16946
16948
|
...defaults.ranking,
|
|
@@ -16987,7 +16989,11 @@ ${issues}`
|
|
|
16987
16989
|
outputDir: ".svelte-kit/output",
|
|
16988
16990
|
paramValues: {},
|
|
16989
16991
|
exclude: [],
|
|
16990
|
-
previewTimeout: 3e4
|
|
16992
|
+
previewTimeout: 3e4,
|
|
16993
|
+
discover: false,
|
|
16994
|
+
seedUrls: ["/"],
|
|
16995
|
+
maxPages: 200,
|
|
16996
|
+
maxDepth: 10
|
|
16991
16997
|
};
|
|
16992
16998
|
}
|
|
16993
16999
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17022,15 +17028,21 @@ async function loadConfig(options = {}) {
|
|
|
17022
17028
|
const raw = loaded.default ?? loaded;
|
|
17023
17029
|
return mergeConfig(cwd, raw);
|
|
17024
17030
|
}
|
|
17031
|
+
|
|
17032
|
+
// src/core/serverless.ts
|
|
17033
|
+
function isServerless() {
|
|
17034
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
17035
|
+
}
|
|
17025
17036
|
function sleep(ms) {
|
|
17026
17037
|
return new Promise((resolve) => {
|
|
17027
17038
|
setTimeout(resolve, ms);
|
|
17028
17039
|
});
|
|
17029
17040
|
}
|
|
17030
|
-
var
|
|
17031
|
-
|
|
17041
|
+
var JinaEmbeddingsProvider = class {
|
|
17042
|
+
apiKey;
|
|
17032
17043
|
batchSize;
|
|
17033
17044
|
concurrency;
|
|
17045
|
+
defaultTask;
|
|
17034
17046
|
constructor(options) {
|
|
17035
17047
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17036
17048
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17038,11 +17050,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17038
17050
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17039
17051
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17040
17052
|
}
|
|
17041
|
-
this.
|
|
17042
|
-
apiKey: options.apiKey
|
|
17043
|
-
});
|
|
17053
|
+
this.apiKey = options.apiKey;
|
|
17044
17054
|
this.batchSize = options.batchSize;
|
|
17045
17055
|
this.concurrency = options.concurrency;
|
|
17056
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17046
17057
|
}
|
|
17047
17058
|
estimateTokens(text) {
|
|
17048
17059
|
const normalized = text.trim();
|
|
@@ -17056,7 +17067,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17056
17067
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17057
17068
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17058
17069
|
}
|
|
17059
|
-
async embedTexts(texts, modelId) {
|
|
17070
|
+
async embedTexts(texts, modelId, task) {
|
|
17060
17071
|
if (texts.length === 0) {
|
|
17061
17072
|
return [];
|
|
17062
17073
|
}
|
|
@@ -17068,37 +17079,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17068
17079
|
});
|
|
17069
17080
|
}
|
|
17070
17081
|
const outputs = new Array(batches.length);
|
|
17071
|
-
const limit =
|
|
17082
|
+
const limit = pLimit2(this.concurrency);
|
|
17072
17083
|
await Promise.all(
|
|
17073
17084
|
batches.map(
|
|
17074
17085
|
(batch, position) => limit(async () => {
|
|
17075
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17086
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17076
17087
|
})
|
|
17077
17088
|
)
|
|
17078
17089
|
);
|
|
17079
17090
|
return outputs.flat();
|
|
17080
17091
|
}
|
|
17081
|
-
async embedWithRetry(texts, modelId) {
|
|
17092
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17082
17093
|
const maxAttempts = 5;
|
|
17083
17094
|
let attempt = 0;
|
|
17084
17095
|
while (attempt < maxAttempts) {
|
|
17085
17096
|
attempt += 1;
|
|
17097
|
+
let response;
|
|
17086
17098
|
try {
|
|
17087
|
-
|
|
17088
|
-
|
|
17089
|
-
|
|
17090
|
-
|
|
17099
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17100
|
+
method: "POST",
|
|
17101
|
+
headers: {
|
|
17102
|
+
"content-type": "application/json",
|
|
17103
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17104
|
+
},
|
|
17105
|
+
body: JSON.stringify({
|
|
17106
|
+
model: modelId,
|
|
17107
|
+
input: texts,
|
|
17108
|
+
task
|
|
17109
|
+
})
|
|
17091
17110
|
});
|
|
17092
|
-
return response.data.map((entry) => entry.embedding);
|
|
17093
17111
|
} catch (error) {
|
|
17094
|
-
|
|
17095
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17096
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17112
|
+
if (attempt >= maxAttempts) {
|
|
17097
17113
|
throw error;
|
|
17098
17114
|
}
|
|
17099
|
-
|
|
17100
|
-
|
|
17115
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17116
|
+
continue;
|
|
17101
17117
|
}
|
|
17118
|
+
if (!response.ok) {
|
|
17119
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17120
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17121
|
+
const errorBody = await response.text();
|
|
17122
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17123
|
+
}
|
|
17124
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17125
|
+
continue;
|
|
17126
|
+
}
|
|
17127
|
+
const payload = await response.json();
|
|
17128
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17129
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17130
|
+
}
|
|
17131
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17102
17132
|
}
|
|
17103
17133
|
throw new Error("Unreachable retry state");
|
|
17104
17134
|
}
|
|
@@ -17106,20 +17136,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17106
17136
|
|
|
17107
17137
|
// src/embeddings/factory.ts
|
|
17108
17138
|
function createEmbeddingsProvider(config) {
|
|
17109
|
-
if (config.embeddings.provider !== "
|
|
17139
|
+
if (config.embeddings.provider !== "jina") {
|
|
17110
17140
|
throw new SearchSocketError(
|
|
17111
17141
|
"CONFIG_MISSING",
|
|
17112
17142
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17113
17143
|
);
|
|
17114
17144
|
}
|
|
17115
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17145
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17116
17146
|
if (!apiKey) {
|
|
17117
17147
|
throw new SearchSocketError(
|
|
17118
17148
|
"CONFIG_MISSING",
|
|
17119
|
-
`Missing embeddings API key env var
|
|
17149
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17120
17150
|
);
|
|
17121
17151
|
}
|
|
17122
|
-
return new
|
|
17152
|
+
return new JinaEmbeddingsProvider({
|
|
17123
17153
|
apiKey,
|
|
17124
17154
|
batchSize: config.embeddings.batchSize,
|
|
17125
17155
|
concurrency: config.embeddings.concurrency
|
|
@@ -17282,20 +17312,17 @@ var JinaReranker = class {
|
|
|
17282
17312
|
|
|
17283
17313
|
// src/rerank/factory.ts
|
|
17284
17314
|
function createReranker(config) {
|
|
17285
|
-
if (config.rerank.
|
|
17315
|
+
if (!config.rerank.enabled) {
|
|
17286
17316
|
return null;
|
|
17287
17317
|
}
|
|
17288
|
-
|
|
17289
|
-
|
|
17290
|
-
|
|
17291
|
-
return null;
|
|
17292
|
-
}
|
|
17293
|
-
return new JinaReranker({
|
|
17294
|
-
apiKey,
|
|
17295
|
-
model: config.rerank.jina.model
|
|
17296
|
-
});
|
|
17318
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17319
|
+
if (!apiKey) {
|
|
17320
|
+
return null;
|
|
17297
17321
|
}
|
|
17298
|
-
return
|
|
17322
|
+
return new JinaReranker({
|
|
17323
|
+
apiKey,
|
|
17324
|
+
model: config.rerank.model
|
|
17325
|
+
});
|
|
17299
17326
|
}
|
|
17300
17327
|
|
|
17301
17328
|
// src/utils/time.ts
|
|
@@ -17400,6 +17427,16 @@ var TursoVectorStore = class {
|
|
|
17400
17427
|
}
|
|
17401
17428
|
async ensureChunks(dim) {
|
|
17402
17429
|
if (this.chunksReady) return;
|
|
17430
|
+
const exists = await this.chunksTableExists();
|
|
17431
|
+
if (exists) {
|
|
17432
|
+
const currentDim = await this.getChunksDimension();
|
|
17433
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17434
|
+
await this.client.batch([
|
|
17435
|
+
"DROP INDEX IF EXISTS idx",
|
|
17436
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17437
|
+
]);
|
|
17438
|
+
}
|
|
17439
|
+
}
|
|
17403
17440
|
await this.client.batch([
|
|
17404
17441
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17405
17442
|
id TEXT PRIMARY KEY,
|
|
@@ -17411,12 +17448,16 @@ var TursoVectorStore = class {
|
|
|
17411
17448
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17412
17449
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17413
17450
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17451
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17452
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17414
17453
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17415
17454
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17416
17455
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
17417
17456
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17418
17457
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17419
17458
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17459
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17460
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17420
17461
|
embedding F32_BLOB(${dim})
|
|
17421
17462
|
)`,
|
|
17422
17463
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
@@ -17455,6 +17496,38 @@ var TursoVectorStore = class {
|
|
|
17455
17496
|
throw error;
|
|
17456
17497
|
}
|
|
17457
17498
|
}
|
|
17499
|
+
/**
|
|
17500
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17501
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17502
|
+
*/
|
|
17503
|
+
async getChunksDimension() {
|
|
17504
|
+
try {
|
|
17505
|
+
const rs = await this.client.execute(
|
|
17506
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17507
|
+
);
|
|
17508
|
+
if (rs.rows.length === 0) return null;
|
|
17509
|
+
const sql = rs.rows[0].sql;
|
|
17510
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17511
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17512
|
+
} catch {
|
|
17513
|
+
return null;
|
|
17514
|
+
}
|
|
17515
|
+
}
|
|
17516
|
+
/**
|
|
17517
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17518
|
+
* Used by `clean --remote` for a full reset.
|
|
17519
|
+
*/
|
|
17520
|
+
async dropAllTables() {
|
|
17521
|
+
await this.client.batch([
|
|
17522
|
+
"DROP INDEX IF EXISTS idx",
|
|
17523
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17524
|
+
"DROP TABLE IF EXISTS registry",
|
|
17525
|
+
"DROP TABLE IF EXISTS pages"
|
|
17526
|
+
]);
|
|
17527
|
+
this.chunksReady = false;
|
|
17528
|
+
this.registryReady = false;
|
|
17529
|
+
this.pagesReady = false;
|
|
17530
|
+
}
|
|
17458
17531
|
async upsert(records, _scope) {
|
|
17459
17532
|
if (records.length === 0) return;
|
|
17460
17533
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17465,9 +17538,9 @@ var TursoVectorStore = class {
|
|
|
17465
17538
|
const stmts = batch.map((r) => ({
|
|
17466
17539
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17467
17540
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17468
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17469
|
-
incoming_links, route_file, tags, embedding)
|
|
17470
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17541
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17542
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17543
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17471
17544
|
args: [
|
|
17472
17545
|
r.id,
|
|
17473
17546
|
r.metadata.projectId,
|
|
@@ -17478,12 +17551,16 @@ var TursoVectorStore = class {
|
|
|
17478
17551
|
r.metadata.sectionTitle,
|
|
17479
17552
|
JSON.stringify(r.metadata.headingPath),
|
|
17480
17553
|
r.metadata.snippet,
|
|
17554
|
+
r.metadata.chunkText,
|
|
17555
|
+
r.metadata.ordinal,
|
|
17481
17556
|
r.metadata.contentHash,
|
|
17482
17557
|
r.metadata.modelId,
|
|
17483
17558
|
r.metadata.depth,
|
|
17484
17559
|
r.metadata.incomingLinks,
|
|
17485
17560
|
r.metadata.routeFile,
|
|
17486
17561
|
JSON.stringify(r.metadata.tags),
|
|
17562
|
+
r.metadata.description ?? "",
|
|
17563
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17487
17564
|
JSON.stringify(r.vector)
|
|
17488
17565
|
]
|
|
17489
17566
|
}));
|
|
@@ -17496,8 +17573,10 @@ var TursoVectorStore = class {
|
|
|
17496
17573
|
const queryJson = JSON.stringify(queryVector);
|
|
17497
17574
|
const rs = await this.client.execute({
|
|
17498
17575
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17499
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17576
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17577
|
+
c.ordinal, c.content_hash,
|
|
17500
17578
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17579
|
+
c.description, c.keywords,
|
|
17501
17580
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17502
17581
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17503
17582
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17528,6 +17607,12 @@ var TursoVectorStore = class {
|
|
|
17528
17607
|
}
|
|
17529
17608
|
const distance = row.distance;
|
|
17530
17609
|
const score = 1 - distance;
|
|
17610
|
+
const description = row.description || void 0;
|
|
17611
|
+
const keywords = (() => {
|
|
17612
|
+
const raw = row.keywords || "[]";
|
|
17613
|
+
const parsed = JSON.parse(raw);
|
|
17614
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17615
|
+
})();
|
|
17531
17616
|
hits.push({
|
|
17532
17617
|
id: row.id,
|
|
17533
17618
|
score,
|
|
@@ -17540,12 +17625,16 @@ var TursoVectorStore = class {
|
|
|
17540
17625
|
sectionTitle: row.section_title,
|
|
17541
17626
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17542
17627
|
snippet: row.snippet,
|
|
17628
|
+
chunkText: row.chunk_text || "",
|
|
17629
|
+
ordinal: row.ordinal || 0,
|
|
17543
17630
|
contentHash: row.content_hash,
|
|
17544
17631
|
modelId: row.model_id,
|
|
17545
17632
|
depth: row.depth,
|
|
17546
17633
|
incomingLinks: row.incoming_links,
|
|
17547
17634
|
routeFile: row.route_file,
|
|
17548
|
-
tags
|
|
17635
|
+
tags,
|
|
17636
|
+
description,
|
|
17637
|
+
keywords
|
|
17549
17638
|
}
|
|
17550
17639
|
});
|
|
17551
17640
|
}
|
|
@@ -17735,10 +17824,10 @@ var TursoVectorStore = class {
|
|
|
17735
17824
|
// src/vector/factory.ts
|
|
17736
17825
|
async function createVectorStore(config, cwd) {
|
|
17737
17826
|
const turso = config.vector.turso;
|
|
17738
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17827
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17739
17828
|
if (remoteUrl) {
|
|
17740
17829
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17741
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17830
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17742
17831
|
const client2 = createClient2({
|
|
17743
17832
|
url: remoteUrl,
|
|
17744
17833
|
authToken
|
|
@@ -17748,6 +17837,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17748
17837
|
dimension: config.vector.dimension
|
|
17749
17838
|
});
|
|
17750
17839
|
}
|
|
17840
|
+
if (isServerless()) {
|
|
17841
|
+
throw new SearchSocketError(
|
|
17842
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17843
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17844
|
+
);
|
|
17845
|
+
}
|
|
17751
17846
|
const { createClient } = await import('@libsql/client');
|
|
17752
17847
|
const localPath = path.resolve(cwd, turso.localPath);
|
|
17753
17848
|
fs.mkdirSync(path.dirname(localPath), { recursive: true });
|
|
@@ -17905,7 +18000,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
17905
18000
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17906
18001
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
17907
18002
|
const embedStart = process.hrtime.bigint();
|
|
17908
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
18003
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
17909
18004
|
const queryVector = queryEmbeddings[0];
|
|
17910
18005
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
17911
18006
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -17933,13 +18028,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
17933
18028
|
usedRerank = true;
|
|
17934
18029
|
}
|
|
17935
18030
|
let results;
|
|
18031
|
+
const minScore = this.config.ranking.minScore;
|
|
17936
18032
|
if (groupByPage) {
|
|
17937
|
-
|
|
18033
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
18034
|
+
if (minScore > 0) {
|
|
18035
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18036
|
+
}
|
|
17938
18037
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
17939
18038
|
results = pages.slice(0, topK).map((page) => {
|
|
17940
18039
|
const bestScore = page.bestChunk.finalScore;
|
|
17941
|
-
const
|
|
17942
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18040
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18041
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
17943
18042
|
return {
|
|
17944
18043
|
url: page.url,
|
|
17945
18044
|
title: page.title,
|
|
@@ -17956,6 +18055,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
17956
18055
|
};
|
|
17957
18056
|
});
|
|
17958
18057
|
} else {
|
|
18058
|
+
if (minScore > 0) {
|
|
18059
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18060
|
+
}
|
|
17959
18061
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
17960
18062
|
url: hit.metadata.url,
|
|
17961
18063
|
title: hit.metadata.title,
|
|
@@ -18027,43 +18129,67 @@ var SearchEngine = class _SearchEngine {
|
|
|
18027
18129
|
}
|
|
18028
18130
|
}
|
|
18029
18131
|
async rerankHits(query, ranked, topK) {
|
|
18030
|
-
if (this.config.rerank.
|
|
18132
|
+
if (!this.config.rerank.enabled) {
|
|
18031
18133
|
throw new SearchSocketError(
|
|
18032
18134
|
"INVALID_REQUEST",
|
|
18033
|
-
"rerank=true requested but rerank.
|
|
18135
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
18034
18136
|
400
|
|
18035
18137
|
);
|
|
18036
18138
|
}
|
|
18037
18139
|
if (!this.reranker) {
|
|
18038
18140
|
throw new SearchSocketError(
|
|
18039
18141
|
"CONFIG_MISSING",
|
|
18040
|
-
`rerank=true requested but ${this.config.
|
|
18142
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
18041
18143
|
400
|
|
18042
18144
|
);
|
|
18043
18145
|
}
|
|
18044
|
-
const
|
|
18045
|
-
|
|
18046
|
-
|
|
18047
|
-
|
|
18146
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
18147
|
+
for (const entry of ranked) {
|
|
18148
|
+
const url = entry.hit.metadata.url;
|
|
18149
|
+
const group = pageGroups.get(url);
|
|
18150
|
+
if (group) group.push(entry);
|
|
18151
|
+
else pageGroups.set(url, [entry]);
|
|
18152
|
+
}
|
|
18153
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18154
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18155
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18156
|
+
const pageCandidates = [];
|
|
18157
|
+
for (const [url, chunks] of pageGroups) {
|
|
18158
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
18159
|
+
const bestScore = byScore[0].finalScore;
|
|
18160
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
18161
|
+
const selected = byScore.filter(
|
|
18162
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
18163
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
18164
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
18165
|
+
const first = selected[0].hit.metadata;
|
|
18166
|
+
const parts = [first.title];
|
|
18167
|
+
if (first.description) {
|
|
18168
|
+
parts.push(first.description);
|
|
18169
|
+
}
|
|
18170
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
18171
|
+
parts.push(first.keywords.join(", "));
|
|
18172
|
+
}
|
|
18173
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18174
|
+
parts.push(body);
|
|
18175
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
18176
|
+
}
|
|
18048
18177
|
const reranked = await this.reranker.rerank(
|
|
18049
18178
|
query,
|
|
18050
|
-
|
|
18179
|
+
pageCandidates,
|
|
18051
18180
|
Math.max(topK, this.config.rerank.topN)
|
|
18052
18181
|
);
|
|
18053
|
-
const
|
|
18182
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18054
18183
|
return ranked.map((entry) => {
|
|
18055
|
-
const
|
|
18056
|
-
const
|
|
18057
|
-
if (
|
|
18058
|
-
return {
|
|
18059
|
-
...entry,
|
|
18060
|
-
finalScore: safeBaseScore
|
|
18061
|
-
};
|
|
18184
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
18185
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
18186
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
18187
|
+
return { ...entry, finalScore: base };
|
|
18062
18188
|
}
|
|
18063
|
-
const
|
|
18189
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
18064
18190
|
return {
|
|
18065
18191
|
...entry,
|
|
18066
|
-
finalScore: Number.isFinite(
|
|
18192
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
18067
18193
|
};
|
|
18068
18194
|
}).sort((a, b) => {
|
|
18069
18195
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -18103,13 +18229,21 @@ function searchsocketHandle(options = {}) {
|
|
|
18103
18229
|
let rateLimiter = null;
|
|
18104
18230
|
const getConfig = async () => {
|
|
18105
18231
|
if (!configPromise) {
|
|
18106
|
-
|
|
18107
|
-
|
|
18108
|
-
|
|
18109
|
-
})
|
|
18232
|
+
let configP;
|
|
18233
|
+
if (options.config) {
|
|
18234
|
+
configP = Promise.resolve(options.config);
|
|
18235
|
+
} else if (options.rawConfig) {
|
|
18236
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18237
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
18238
|
+
} else {
|
|
18239
|
+
configP = loadConfig({
|
|
18240
|
+
cwd: options.cwd,
|
|
18241
|
+
configPath: options.configPath
|
|
18242
|
+
});
|
|
18243
|
+
}
|
|
18110
18244
|
configPromise = configP.then((config) => {
|
|
18111
18245
|
apiPath = apiPath ?? config.api.path;
|
|
18112
|
-
if (config.api.rateLimit) {
|
|
18246
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
18113
18247
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
18114
18248
|
}
|
|
18115
18249
|
return config;
|
|
@@ -18119,10 +18253,9 @@ function searchsocketHandle(options = {}) {
|
|
|
18119
18253
|
};
|
|
18120
18254
|
const getEngine = async () => {
|
|
18121
18255
|
if (!enginePromise) {
|
|
18122
|
-
const config =
|
|
18256
|
+
const config = await getConfig();
|
|
18123
18257
|
enginePromise = SearchEngine.create({
|
|
18124
18258
|
cwd: options.cwd,
|
|
18125
|
-
configPath: options.configPath,
|
|
18126
18259
|
config
|
|
18127
18260
|
});
|
|
18128
18261
|
}
|
|
@@ -18549,7 +18682,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18549
18682
|
incomingLinks: page.incomingLinks,
|
|
18550
18683
|
routeFile: page.routeFile,
|
|
18551
18684
|
tags: page.tags,
|
|
18552
|
-
contentHash: ""
|
|
18685
|
+
contentHash: "",
|
|
18686
|
+
description: page.description,
|
|
18687
|
+
keywords: page.keywords
|
|
18553
18688
|
};
|
|
18554
18689
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18555
18690
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18576,7 +18711,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18576
18711
|
incomingLinks: page.incomingLinks,
|
|
18577
18712
|
routeFile: page.routeFile,
|
|
18578
18713
|
tags: page.tags,
|
|
18579
|
-
contentHash: ""
|
|
18714
|
+
contentHash: "",
|
|
18715
|
+
description: page.description,
|
|
18716
|
+
keywords: page.keywords
|
|
18580
18717
|
};
|
|
18581
18718
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18582
18719
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19657,14 +19794,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19657
19794
|
var Logger = class {
|
|
19658
19795
|
json;
|
|
19659
19796
|
verbose;
|
|
19797
|
+
quiet;
|
|
19660
19798
|
stderrOnly;
|
|
19661
19799
|
constructor(opts = {}) {
|
|
19662
19800
|
this.json = opts.json ?? false;
|
|
19663
19801
|
this.verbose = opts.verbose ?? false;
|
|
19802
|
+
this.quiet = opts.quiet ?? false;
|
|
19664
19803
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19665
19804
|
}
|
|
19666
19805
|
info(message) {
|
|
19667
|
-
if (this.json) {
|
|
19806
|
+
if (this.quiet || this.json) {
|
|
19668
19807
|
return;
|
|
19669
19808
|
}
|
|
19670
19809
|
this.writeOut(`${message}
|
|
@@ -19678,7 +19817,7 @@ var Logger = class {
|
|
|
19678
19817
|
this.logJson("debug", { message });
|
|
19679
19818
|
return;
|
|
19680
19819
|
}
|
|
19681
|
-
this.writeOut(
|
|
19820
|
+
this.writeOut(` ${message}
|
|
19682
19821
|
`);
|
|
19683
19822
|
}
|
|
19684
19823
|
warn(message) {
|
|
@@ -19705,7 +19844,7 @@ var Logger = class {
|
|
|
19705
19844
|
this.logJson(event, data);
|
|
19706
19845
|
return;
|
|
19707
19846
|
}
|
|
19708
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19847
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19709
19848
|
`);
|
|
19710
19849
|
}
|
|
19711
19850
|
writeOut(text) {
|
|
@@ -19890,11 +20029,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19890
20029
|
|
|
19891
20030
|
// src/indexing/sources/build/index.ts
|
|
19892
20031
|
var logger = new Logger();
|
|
20032
|
+
/**
 * Collect the same-origin paths linked from an HTML document.
 *
 * @param {string} html - Markup to scan for `<a href>` elements.
 * @param {string} pageUrl - Path of the page the markup came from; relative
 *   hrefs are resolved against `${baseOrigin}${pageUrl}`.
 * @param {string} baseOrigin - Origin a resolved link must match to be kept.
 * @returns {string[]} De-duplicated paths (each passed through
 *   normalizeUrlPath), in first-seen order.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
  // Fragment-only anchors and non-navigational schemes are never crawl targets.
  const ignoredPrefixes = ["#", "mailto:", "tel:", "javascript:"];
  const allowedProtocols = ["http:", "https:"];
  const $ = load(html);
  const documentUrl = `${baseOrigin}${pageUrl}`;
  const uniquePaths = /* @__PURE__ */ new Set();
  $("a[href]").each((_index, anchor) => {
    const href = $(anchor).attr("href");
    if (!href) {
      return;
    }
    if (ignoredPrefixes.some((prefix) => href.startsWith(prefix))) {
      return;
    }
    try {
      const target = new URL(href, documentUrl);
      const isSameOrigin = target.origin === baseOrigin;
      if (!isSameOrigin || !allowedProtocols.includes(target.protocol)) {
        return;
      }
      // Only the pathname is kept; query string and hash are dropped.
      uniquePaths.add(normalizeUrlPath(target.pathname));
    } catch {
      // Deliberate best-effort: an href that cannot be resolved is ignored.
    }
  });
  return [...uniquePaths];
}
|
|
20050
|
+
/**
 * Crawl the running preview server starting from the configured seed URLs and
 * return the HTML pages that were fetched successfully.
 *
 * The queue is consumed in FIFO batches; each batch is fetched with at most 8
 * requests in flight. Links found on a page are queued only while the page's
 * depth is below `buildConfig.maxDepth`, and only if they are not excluded and
 * have not been queued before.
 *
 * @param {{ baseUrl: string }} server - Preview server handle; only `baseUrl` is read here.
 * @param {object} buildConfig - Build source config; reads `seedUrls`, `maxDepth`,
 *   `exclude` and `maxPages`.
 * @param {number} [pipelineMaxPages] - Optional per-run cap; when numeric it is
 *   floored, clamped to >= 0 and combined with `buildConfig.maxPages` via min().
 * @returns {Promise<Array<{ url: string, html: string, sourcePath: string, outgoingLinks: string[] }>>}
 *   One entry per page that responded OK with a `text/html` content type.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;
  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];
  // `visited` holds every URL that has ever been queued, not only fetched ones.
  const visited = /* @__PURE__ */ new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2(8);
  for (const seed of seedUrls) {
    const normalized = normalizeUrlPath(seed);
    if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
      visited.add(normalized);
      queue.push({ url: normalized, depth: 0 });
    }
  }
  while (queue.length > 0 && pages.length < effectiveMax) {
    // Never fetch more URLs than there is room left for in `pages`.
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            const links = extractLinksFromHtml(html, item.url, baseOrigin);
            for (const link of links) {
              if (!visited.has(link) && !isExcluded(link, exclude)) {
                visited.add(link);
                queue.push({ url: link, depth: item.depth + 1 });
              }
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            outgoingLinks: []
          };
        })
      )
    );
    // allSettled preserves input order, so results[i] belongs to batch[i].
    results.forEach((result, index) => {
      if (result.status === "rejected") {
        // A thrown fetch (e.g. connection error) used to vanish silently;
        // report it the same way HTTP failures are reported.
        const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
        logger.warn(`Skipping ${batch[index].url}: ${reason}`);
        return;
      }
      if (result.value) {
        pages.push(result.value);
      }
    });
  }
  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    // NOTE(review): visited.size also counts URLs still left in the queue.
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
|
|
19893
20121
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19894
20122
|
const buildConfig = config.source.build;
|
|
19895
20123
|
if (!buildConfig) {
|
|
19896
20124
|
throw new Error("build source config is missing");
|
|
19897
20125
|
}
|
|
20126
|
+
if (buildConfig.discover) {
|
|
20127
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
20128
|
+
try {
|
|
20129
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
20130
|
+
} finally {
|
|
20131
|
+
await server2.shutdown();
|
|
20132
|
+
}
|
|
20133
|
+
}
|
|
19898
20134
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19899
20135
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19900
20136
|
logger.event("build_routes_discovered", {
|
|
@@ -19905,7 +20141,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19905
20141
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19906
20142
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19907
20143
|
try {
|
|
19908
|
-
const concurrencyLimit =
|
|
20144
|
+
const concurrencyLimit = pLimit2(8);
|
|
19909
20145
|
const results = await Promise.allSettled(
|
|
19910
20146
|
selected.map(
|
|
19911
20147
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20074,7 +20310,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
20074
20310
|
const routes = await resolveRoutes(config);
|
|
20075
20311
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
20076
20312
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
20077
|
-
const concurrencyLimit =
|
|
20313
|
+
const concurrencyLimit = pLimit2(8);
|
|
20078
20314
|
const results = await Promise.allSettled(
|
|
20079
20315
|
selected.map(
|
|
20080
20316
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20128,9 +20364,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20128
20364
|
|
|
20129
20365
|
// src/indexing/pipeline.ts
|
|
20130
20366
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20131
|
-
"
|
|
20132
|
-
"text-embedding-3-large": 13e-5,
|
|
20133
|
-
"text-embedding-ada-002": 1e-4
|
|
20367
|
+
"jina-embeddings-v3": 2e-5
|
|
20134
20368
|
};
|
|
20135
20369
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
20136
20370
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -20176,9 +20410,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20176
20410
|
};
|
|
20177
20411
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20178
20412
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
20413
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20414
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
20179
20415
|
if (options.force) {
|
|
20416
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20180
20417
|
await cleanMirrorForScope(statePath, scope);
|
|
20181
20418
|
}
|
|
20419
|
+
if (options.dryRun) {
|
|
20420
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
20421
|
+
}
|
|
20182
20422
|
const manifestStart = stageStart();
|
|
20183
20423
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
20184
20424
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -20189,8 +20429,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20189
20429
|
);
|
|
20190
20430
|
}
|
|
20191
20431
|
stageEnd("manifest", manifestStart);
|
|
20432
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
20192
20433
|
const sourceStart = stageStart();
|
|
20193
|
-
|
|
20434
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20194
20435
|
let sourcePages;
|
|
20195
20436
|
if (sourceMode === "static-output") {
|
|
20196
20437
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -20202,10 +20443,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20202
20443
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
20203
20444
|
}
|
|
20204
20445
|
stageEnd("source", sourceStart);
|
|
20446
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20205
20447
|
const routeStart = stageStart();
|
|
20206
20448
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20207
20449
|
stageEnd("route_map", routeStart);
|
|
20450
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
20208
20451
|
const extractStart = stageStart();
|
|
20452
|
+
this.logger.info("Extracting content...");
|
|
20209
20453
|
const extractedPages = [];
|
|
20210
20454
|
for (const sourcePage of sourcePages) {
|
|
20211
20455
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -20234,6 +20478,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20234
20478
|
uniquePages.push(page);
|
|
20235
20479
|
}
|
|
20236
20480
|
stageEnd("extract", extractStart);
|
|
20481
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
20482
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20237
20483
|
const linkStart = stageStart();
|
|
20238
20484
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
20239
20485
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -20249,7 +20495,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20249
20495
|
}
|
|
20250
20496
|
}
|
|
20251
20497
|
stageEnd("links", linkStart);
|
|
20498
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
20252
20499
|
const mirrorStart = stageStart();
|
|
20500
|
+
this.logger.info("Writing mirror pages...");
|
|
20253
20501
|
const mirrorPages = [];
|
|
20254
20502
|
let routeExact = 0;
|
|
20255
20503
|
let routeBestEffort = 0;
|
|
@@ -20319,7 +20567,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20319
20567
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
20320
20568
|
}
|
|
20321
20569
|
stageEnd("mirror", mirrorStart);
|
|
20570
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
20322
20571
|
const chunkStart = stageStart();
|
|
20572
|
+
this.logger.info("Chunking pages...");
|
|
20323
20573
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
20324
20574
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
20325
20575
|
if (typeof maxChunks === "number") {
|
|
@@ -20332,6 +20582,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20332
20582
|
});
|
|
20333
20583
|
}
|
|
20334
20584
|
stageEnd("chunk", chunkStart);
|
|
20585
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
20335
20586
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
20336
20587
|
for (const chunk of chunks) {
|
|
20337
20588
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -20350,6 +20601,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20350
20601
|
return existingHash !== chunk.contentHash;
|
|
20351
20602
|
});
|
|
20352
20603
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20604
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20353
20605
|
const embedStart = stageStart();
|
|
20354
20606
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
20355
20607
|
for (const chunk of changedChunks) {
|
|
@@ -20364,9 +20616,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20364
20616
|
let newEmbeddings = 0;
|
|
20365
20617
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
20366
20618
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20619
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
20367
20620
|
const embeddings = await this.embeddings.embedTexts(
|
|
20368
20621
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
20369
|
-
this.config.embeddings.model
|
|
20622
|
+
this.config.embeddings.model,
|
|
20623
|
+
"retrieval.passage"
|
|
20370
20624
|
);
|
|
20371
20625
|
if (embeddings.length !== changedChunks.length) {
|
|
20372
20626
|
throw new SearchSocketError(
|
|
@@ -20389,8 +20643,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20389
20643
|
}
|
|
20390
20644
|
}
|
|
20391
20645
|
stageEnd("embedding", embedStart);
|
|
20646
|
+
if (changedChunks.length > 0) {
|
|
20647
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20648
|
+
} else {
|
|
20649
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20650
|
+
}
|
|
20392
20651
|
const syncStart = stageStart();
|
|
20393
20652
|
if (!options.dryRun) {
|
|
20653
|
+
this.logger.info("Syncing vectors...");
|
|
20394
20654
|
const upserts = [];
|
|
20395
20655
|
for (const chunk of changedChunks) {
|
|
20396
20656
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -20409,12 +20669,16 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20409
20669
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
20410
20670
|
headingPath: chunk.headingPath,
|
|
20411
20671
|
snippet: chunk.snippet,
|
|
20672
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20673
|
+
ordinal: chunk.ordinal,
|
|
20412
20674
|
contentHash: chunk.contentHash,
|
|
20413
20675
|
modelId: this.config.embeddings.model,
|
|
20414
20676
|
depth: chunk.depth,
|
|
20415
20677
|
incomingLinks: chunk.incomingLinks,
|
|
20416
20678
|
routeFile: chunk.routeFile,
|
|
20417
|
-
tags: chunk.tags
|
|
20679
|
+
tags: chunk.tags,
|
|
20680
|
+
description: chunk.description,
|
|
20681
|
+
keywords: chunk.keywords
|
|
20418
20682
|
}
|
|
20419
20683
|
});
|
|
20420
20684
|
}
|
|
@@ -20428,6 +20692,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20428
20692
|
}
|
|
20429
20693
|
}
|
|
20430
20694
|
stageEnd("sync", syncStart);
|
|
20695
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
20431
20696
|
const finalizeStart = stageStart();
|
|
20432
20697
|
if (!options.dryRun) {
|
|
20433
20698
|
const scopeInfo = {
|
|
@@ -20447,6 +20712,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20447
20712
|
});
|
|
20448
20713
|
}
|
|
20449
20714
|
stageEnd("finalize", finalizeStart);
|
|
20715
|
+
this.logger.info("Done.");
|
|
20450
20716
|
return {
|
|
20451
20717
|
pagesProcessed: mirrorPages.length,
|
|
20452
20718
|
chunksTotal: chunks.length,
|