searchsocket 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +348 -111
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +3 -3
- package/dist/cli.js.map +0 -1
- package/dist/client.cjs.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/index.cjs.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/sveltekit.cjs.map +0 -1
- package/dist/sveltekit.js.map +0 -1
package/dist/sveltekit.js
CHANGED
|
@@ -2,8 +2,7 @@ import fs from 'fs';
|
|
|
2
2
|
import path from 'path';
|
|
3
3
|
import { createJiti } from 'jiti';
|
|
4
4
|
import { z } from 'zod';
|
|
5
|
-
import
|
|
6
|
-
import pLimit from 'p-limit';
|
|
5
|
+
import pLimit2 from 'p-limit';
|
|
7
6
|
import { execSync, spawn } from 'child_process';
|
|
8
7
|
import { createHash } from 'crypto';
|
|
9
8
|
import { load } from 'cheerio';
|
|
@@ -16616,7 +16615,11 @@ var searchSocketConfigSchema = z.object({
|
|
|
16616
16615
|
outputDir: z.string().min(1).optional(),
|
|
16617
16616
|
paramValues: z.record(z.string(), z.array(z.string())).optional(),
|
|
16618
16617
|
exclude: z.array(z.string()).optional(),
|
|
16619
|
-
previewTimeout: z.number().int().positive().optional()
|
|
16618
|
+
previewTimeout: z.number().int().positive().optional(),
|
|
16619
|
+
discover: z.boolean().optional(),
|
|
16620
|
+
seedUrls: z.array(z.string()).optional(),
|
|
16621
|
+
maxPages: z.number().int().positive().optional(),
|
|
16622
|
+
maxDepth: z.number().int().nonnegative().optional()
|
|
16620
16623
|
}).optional()
|
|
16621
16624
|
}).optional(),
|
|
16622
16625
|
extract: z.object({
|
|
@@ -16643,8 +16646,9 @@ var searchSocketConfigSchema = z.object({
|
|
|
16643
16646
|
pageSummaryChunk: z.boolean().optional()
|
|
16644
16647
|
}).optional(),
|
|
16645
16648
|
embeddings: z.object({
|
|
16646
|
-
provider: z.literal("
|
|
16649
|
+
provider: z.literal("jina").optional(),
|
|
16647
16650
|
model: z.string().min(1).optional(),
|
|
16651
|
+
apiKey: z.string().min(1).optional(),
|
|
16648
16652
|
apiKeyEnv: z.string().min(1).optional(),
|
|
16649
16653
|
batchSize: z.number().int().positive().optional(),
|
|
16650
16654
|
concurrency: z.number().int().positive().optional(),
|
|
@@ -16653,18 +16657,17 @@ var searchSocketConfigSchema = z.object({
|
|
|
16653
16657
|
vector: z.object({
|
|
16654
16658
|
dimension: z.number().int().positive().optional(),
|
|
16655
16659
|
turso: z.object({
|
|
16660
|
+
url: z.string().url().optional(),
|
|
16661
|
+
authToken: z.string().min(1).optional(),
|
|
16656
16662
|
urlEnv: z.string().optional(),
|
|
16657
16663
|
authTokenEnv: z.string().optional(),
|
|
16658
16664
|
localPath: z.string().optional()
|
|
16659
16665
|
}).optional()
|
|
16660
16666
|
}).optional(),
|
|
16661
16667
|
rerank: z.object({
|
|
16662
|
-
|
|
16668
|
+
enabled: z.boolean().optional(),
|
|
16663
16669
|
topN: z.number().int().positive().optional(),
|
|
16664
|
-
|
|
16665
|
-
apiKeyEnv: z.string().optional(),
|
|
16666
|
-
model: z.string().optional()
|
|
16667
|
-
}).optional()
|
|
16670
|
+
model: z.string().optional()
|
|
16668
16671
|
}).optional(),
|
|
16669
16672
|
ranking: z.object({
|
|
16670
16673
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
@@ -16673,6 +16676,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16673
16676
|
aggregationCap: z.number().int().positive().optional(),
|
|
16674
16677
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16675
16678
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
16679
|
+
minScore: z.number().min(0).max(1).optional(),
|
|
16676
16680
|
weights: z.object({
|
|
16677
16681
|
incomingLinks: z.number().optional(),
|
|
16678
16682
|
depth: z.number().optional(),
|
|
@@ -16753,9 +16757,9 @@ function createDefaultConfig(projectId) {
|
|
|
16753
16757
|
pageSummaryChunk: true
|
|
16754
16758
|
},
|
|
16755
16759
|
embeddings: {
|
|
16756
|
-
provider: "
|
|
16757
|
-
model: "
|
|
16758
|
-
apiKeyEnv: "
|
|
16760
|
+
provider: "jina",
|
|
16761
|
+
model: "jina-embeddings-v3",
|
|
16762
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
16759
16763
|
batchSize: 64,
|
|
16760
16764
|
concurrency: 4
|
|
16761
16765
|
},
|
|
@@ -16767,12 +16771,9 @@ function createDefaultConfig(projectId) {
|
|
|
16767
16771
|
}
|
|
16768
16772
|
},
|
|
16769
16773
|
rerank: {
|
|
16770
|
-
|
|
16774
|
+
enabled: false,
|
|
16771
16775
|
topN: 20,
|
|
16772
|
-
|
|
16773
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
16774
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
16775
|
-
}
|
|
16776
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
16776
16777
|
},
|
|
16777
16778
|
ranking: {
|
|
16778
16779
|
enableIncomingLinkBoost: true,
|
|
@@ -16781,6 +16782,7 @@ function createDefaultConfig(projectId) {
|
|
|
16781
16782
|
aggregationCap: 5,
|
|
16782
16783
|
aggregationDecay: 0.5,
|
|
16783
16784
|
minChunkScoreRatio: 0.5,
|
|
16785
|
+
minScore: 0,
|
|
16784
16786
|
weights: {
|
|
16785
16787
|
incomingLinks: 0.05,
|
|
16786
16788
|
depth: 0.03,
|
|
@@ -16907,7 +16909,11 @@ ${issues}`
|
|
|
16907
16909
|
outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
|
|
16908
16910
|
paramValues: parsed.source.build.paramValues ?? {},
|
|
16909
16911
|
exclude: parsed.source.build.exclude ?? [],
|
|
16910
|
-
previewTimeout: parsed.source.build.previewTimeout ?? 3e4
|
|
16912
|
+
previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
|
|
16913
|
+
discover: parsed.source.build.discover ?? false,
|
|
16914
|
+
seedUrls: parsed.source.build.seedUrls ?? ["/"],
|
|
16915
|
+
maxPages: parsed.source.build.maxPages ?? 200,
|
|
16916
|
+
maxDepth: parsed.source.build.maxDepth ?? 10
|
|
16911
16917
|
} : void 0
|
|
16912
16918
|
},
|
|
16913
16919
|
extract: {
|
|
@@ -16936,11 +16942,7 @@ ${issues}`
|
|
|
16936
16942
|
},
|
|
16937
16943
|
rerank: {
|
|
16938
16944
|
...defaults.rerank,
|
|
16939
|
-
...parsed.rerank
|
|
16940
|
-
jina: {
|
|
16941
|
-
...defaults.rerank.jina,
|
|
16942
|
-
...parsed.rerank?.jina
|
|
16943
|
-
}
|
|
16945
|
+
...parsed.rerank
|
|
16944
16946
|
},
|
|
16945
16947
|
ranking: {
|
|
16946
16948
|
...defaults.ranking,
|
|
@@ -16987,7 +16989,11 @@ ${issues}`
|
|
|
16987
16989
|
outputDir: ".svelte-kit/output",
|
|
16988
16990
|
paramValues: {},
|
|
16989
16991
|
exclude: [],
|
|
16990
|
-
previewTimeout: 3e4
|
|
16992
|
+
previewTimeout: 3e4,
|
|
16993
|
+
discover: false,
|
|
16994
|
+
seedUrls: ["/"],
|
|
16995
|
+
maxPages: 200,
|
|
16996
|
+
maxDepth: 10
|
|
16991
16997
|
};
|
|
16992
16998
|
}
|
|
16993
16999
|
if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
|
|
@@ -17022,15 +17028,21 @@ async function loadConfig(options = {}) {
|
|
|
17022
17028
|
const raw = loaded.default ?? loaded;
|
|
17023
17029
|
return mergeConfig(cwd, raw);
|
|
17024
17030
|
}
|
|
17031
|
+
|
|
17032
|
+
// src/core/serverless.ts
|
|
17033
|
+
function isServerless() {
|
|
17034
|
+
return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
|
|
17035
|
+
}
|
|
17025
17036
|
function sleep(ms) {
|
|
17026
17037
|
return new Promise((resolve) => {
|
|
17027
17038
|
setTimeout(resolve, ms);
|
|
17028
17039
|
});
|
|
17029
17040
|
}
|
|
17030
|
-
var
|
|
17031
|
-
|
|
17041
|
+
var JinaEmbeddingsProvider = class {
|
|
17042
|
+
apiKey;
|
|
17032
17043
|
batchSize;
|
|
17033
17044
|
concurrency;
|
|
17045
|
+
defaultTask;
|
|
17034
17046
|
constructor(options) {
|
|
17035
17047
|
if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
|
|
17036
17048
|
throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
|
|
@@ -17038,11 +17050,10 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17038
17050
|
if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
|
|
17039
17051
|
throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
|
|
17040
17052
|
}
|
|
17041
|
-
this.
|
|
17042
|
-
apiKey: options.apiKey
|
|
17043
|
-
});
|
|
17053
|
+
this.apiKey = options.apiKey;
|
|
17044
17054
|
this.batchSize = options.batchSize;
|
|
17045
17055
|
this.concurrency = options.concurrency;
|
|
17056
|
+
this.defaultTask = options.task ?? "retrieval.passage";
|
|
17046
17057
|
}
|
|
17047
17058
|
estimateTokens(text) {
|
|
17048
17059
|
const normalized = text.trim();
|
|
@@ -17056,7 +17067,7 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17056
17067
|
const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
|
|
17057
17068
|
return Math.max(1, Math.max(charEstimate, lexicalEstimate));
|
|
17058
17069
|
}
|
|
17059
|
-
async embedTexts(texts, modelId) {
|
|
17070
|
+
async embedTexts(texts, modelId, task) {
|
|
17060
17071
|
if (texts.length === 0) {
|
|
17061
17072
|
return [];
|
|
17062
17073
|
}
|
|
@@ -17068,37 +17079,56 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17068
17079
|
});
|
|
17069
17080
|
}
|
|
17070
17081
|
const outputs = new Array(batches.length);
|
|
17071
|
-
const limit =
|
|
17082
|
+
const limit = pLimit2(this.concurrency);
|
|
17072
17083
|
await Promise.all(
|
|
17073
17084
|
batches.map(
|
|
17074
17085
|
(batch, position) => limit(async () => {
|
|
17075
|
-
outputs[position] = await this.embedWithRetry(batch.values, modelId);
|
|
17086
|
+
outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
|
|
17076
17087
|
})
|
|
17077
17088
|
)
|
|
17078
17089
|
);
|
|
17079
17090
|
return outputs.flat();
|
|
17080
17091
|
}
|
|
17081
|
-
async embedWithRetry(texts, modelId) {
|
|
17092
|
+
async embedWithRetry(texts, modelId, task) {
|
|
17082
17093
|
const maxAttempts = 5;
|
|
17083
17094
|
let attempt = 0;
|
|
17084
17095
|
while (attempt < maxAttempts) {
|
|
17085
17096
|
attempt += 1;
|
|
17097
|
+
let response;
|
|
17086
17098
|
try {
|
|
17087
|
-
|
|
17088
|
-
|
|
17089
|
-
|
|
17090
|
-
|
|
17099
|
+
response = await fetch("https://api.jina.ai/v1/embeddings", {
|
|
17100
|
+
method: "POST",
|
|
17101
|
+
headers: {
|
|
17102
|
+
"content-type": "application/json",
|
|
17103
|
+
authorization: `Bearer ${this.apiKey}`
|
|
17104
|
+
},
|
|
17105
|
+
body: JSON.stringify({
|
|
17106
|
+
model: modelId,
|
|
17107
|
+
input: texts,
|
|
17108
|
+
task
|
|
17109
|
+
})
|
|
17091
17110
|
});
|
|
17092
|
-
return response.data.map((entry) => entry.embedding);
|
|
17093
17111
|
} catch (error) {
|
|
17094
|
-
|
|
17095
|
-
const retryable = status === 429 || typeof status === "number" && status >= 500;
|
|
17096
|
-
if (!retryable || attempt >= maxAttempts) {
|
|
17112
|
+
if (attempt >= maxAttempts) {
|
|
17097
17113
|
throw error;
|
|
17098
17114
|
}
|
|
17099
|
-
|
|
17100
|
-
|
|
17115
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17116
|
+
continue;
|
|
17117
|
+
}
|
|
17118
|
+
if (!response.ok) {
|
|
17119
|
+
const retryable = response.status === 429 || response.status >= 500;
|
|
17120
|
+
if (!retryable || attempt >= maxAttempts) {
|
|
17121
|
+
const errorBody = await response.text();
|
|
17122
|
+
throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
|
|
17123
|
+
}
|
|
17124
|
+
await sleep(Math.min(2 ** attempt * 300, 5e3));
|
|
17125
|
+
continue;
|
|
17101
17126
|
}
|
|
17127
|
+
const payload = await response.json();
|
|
17128
|
+
if (!payload.data || !Array.isArray(payload.data)) {
|
|
17129
|
+
throw new Error("Invalid Jina embeddings response format");
|
|
17130
|
+
}
|
|
17131
|
+
return payload.data.map((entry) => entry.embedding);
|
|
17102
17132
|
}
|
|
17103
17133
|
throw new Error("Unreachable retry state");
|
|
17104
17134
|
}
|
|
@@ -17106,20 +17136,20 @@ var OpenAIEmbeddingsProvider = class {
|
|
|
17106
17136
|
|
|
17107
17137
|
// src/embeddings/factory.ts
|
|
17108
17138
|
function createEmbeddingsProvider(config) {
|
|
17109
|
-
if (config.embeddings.provider !== "
|
|
17139
|
+
if (config.embeddings.provider !== "jina") {
|
|
17110
17140
|
throw new SearchSocketError(
|
|
17111
17141
|
"CONFIG_MISSING",
|
|
17112
17142
|
`Unsupported embeddings provider ${config.embeddings.provider}`
|
|
17113
17143
|
);
|
|
17114
17144
|
}
|
|
17115
|
-
const apiKey = process.env[config.embeddings.apiKeyEnv];
|
|
17145
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17116
17146
|
if (!apiKey) {
|
|
17117
17147
|
throw new SearchSocketError(
|
|
17118
17148
|
"CONFIG_MISSING",
|
|
17119
|
-
`Missing embeddings API key env var
|
|
17149
|
+
`Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
|
|
17120
17150
|
);
|
|
17121
17151
|
}
|
|
17122
|
-
return new
|
|
17152
|
+
return new JinaEmbeddingsProvider({
|
|
17123
17153
|
apiKey,
|
|
17124
17154
|
batchSize: config.embeddings.batchSize,
|
|
17125
17155
|
concurrency: config.embeddings.concurrency
|
|
@@ -17282,20 +17312,17 @@ var JinaReranker = class {
|
|
|
17282
17312
|
|
|
17283
17313
|
// src/rerank/factory.ts
|
|
17284
17314
|
function createReranker(config) {
|
|
17285
|
-
if (config.rerank.
|
|
17315
|
+
if (!config.rerank.enabled) {
|
|
17286
17316
|
return null;
|
|
17287
17317
|
}
|
|
17288
|
-
|
|
17289
|
-
|
|
17290
|
-
|
|
17291
|
-
return null;
|
|
17292
|
-
}
|
|
17293
|
-
return new JinaReranker({
|
|
17294
|
-
apiKey,
|
|
17295
|
-
model: config.rerank.jina.model
|
|
17296
|
-
});
|
|
17318
|
+
const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
|
|
17319
|
+
if (!apiKey) {
|
|
17320
|
+
return null;
|
|
17297
17321
|
}
|
|
17298
|
-
return
|
|
17322
|
+
return new JinaReranker({
|
|
17323
|
+
apiKey,
|
|
17324
|
+
model: config.rerank.model
|
|
17325
|
+
});
|
|
17299
17326
|
}
|
|
17300
17327
|
|
|
17301
17328
|
// src/utils/time.ts
|
|
@@ -17400,6 +17427,16 @@ var TursoVectorStore = class {
|
|
|
17400
17427
|
}
|
|
17401
17428
|
async ensureChunks(dim) {
|
|
17402
17429
|
if (this.chunksReady) return;
|
|
17430
|
+
const exists = await this.chunksTableExists();
|
|
17431
|
+
if (exists) {
|
|
17432
|
+
const currentDim = await this.getChunksDimension();
|
|
17433
|
+
if (currentDim !== null && currentDim !== dim) {
|
|
17434
|
+
await this.client.batch([
|
|
17435
|
+
"DROP INDEX IF EXISTS idx",
|
|
17436
|
+
"DROP TABLE IF EXISTS chunks"
|
|
17437
|
+
]);
|
|
17438
|
+
}
|
|
17439
|
+
}
|
|
17403
17440
|
await this.client.batch([
|
|
17404
17441
|
`CREATE TABLE IF NOT EXISTS chunks (
|
|
17405
17442
|
id TEXT PRIMARY KEY,
|
|
@@ -17411,6 +17448,8 @@ var TursoVectorStore = class {
|
|
|
17411
17448
|
section_title TEXT NOT NULL DEFAULT '',
|
|
17412
17449
|
heading_path TEXT NOT NULL DEFAULT '[]',
|
|
17413
17450
|
snippet TEXT NOT NULL DEFAULT '',
|
|
17451
|
+
chunk_text TEXT NOT NULL DEFAULT '',
|
|
17452
|
+
ordinal INTEGER NOT NULL DEFAULT 0,
|
|
17414
17453
|
content_hash TEXT NOT NULL DEFAULT '',
|
|
17415
17454
|
model_id TEXT NOT NULL DEFAULT '',
|
|
17416
17455
|
depth INTEGER NOT NULL DEFAULT 0,
|
|
@@ -17421,6 +17460,19 @@ var TursoVectorStore = class {
|
|
|
17421
17460
|
)`,
|
|
17422
17461
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17423
17462
|
]);
|
|
17463
|
+
const chunkMigrationCols = [
|
|
17464
|
+
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17465
|
+
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17466
|
+
];
|
|
17467
|
+
for (const col of chunkMigrationCols) {
|
|
17468
|
+
try {
|
|
17469
|
+
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17470
|
+
} catch (error) {
|
|
17471
|
+
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17472
|
+
throw error;
|
|
17473
|
+
}
|
|
17474
|
+
}
|
|
17475
|
+
}
|
|
17424
17476
|
this.chunksReady = true;
|
|
17425
17477
|
}
|
|
17426
17478
|
async ensurePages() {
|
|
@@ -17455,6 +17507,38 @@ var TursoVectorStore = class {
|
|
|
17455
17507
|
throw error;
|
|
17456
17508
|
}
|
|
17457
17509
|
}
|
|
17510
|
+
/**
|
|
17511
|
+
* Read the current F32_BLOB dimension from the chunks table schema.
|
|
17512
|
+
* Returns null if the table doesn't exist or the dimension can't be parsed.
|
|
17513
|
+
*/
|
|
17514
|
+
async getChunksDimension() {
|
|
17515
|
+
try {
|
|
17516
|
+
const rs = await this.client.execute(
|
|
17517
|
+
"SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
|
|
17518
|
+
);
|
|
17519
|
+
if (rs.rows.length === 0) return null;
|
|
17520
|
+
const sql = rs.rows[0].sql;
|
|
17521
|
+
const match = sql.match(/F32_BLOB\((\d+)\)/i);
|
|
17522
|
+
return match ? parseInt(match[1], 10) : null;
|
|
17523
|
+
} catch {
|
|
17524
|
+
return null;
|
|
17525
|
+
}
|
|
17526
|
+
}
|
|
17527
|
+
/**
|
|
17528
|
+
* Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
|
|
17529
|
+
* Used by `clean --remote` for a full reset.
|
|
17530
|
+
*/
|
|
17531
|
+
async dropAllTables() {
|
|
17532
|
+
await this.client.batch([
|
|
17533
|
+
"DROP INDEX IF EXISTS idx",
|
|
17534
|
+
"DROP TABLE IF EXISTS chunks",
|
|
17535
|
+
"DROP TABLE IF EXISTS registry",
|
|
17536
|
+
"DROP TABLE IF EXISTS pages"
|
|
17537
|
+
]);
|
|
17538
|
+
this.chunksReady = false;
|
|
17539
|
+
this.registryReady = false;
|
|
17540
|
+
this.pagesReady = false;
|
|
17541
|
+
}
|
|
17458
17542
|
async upsert(records, _scope) {
|
|
17459
17543
|
if (records.length === 0) return;
|
|
17460
17544
|
const dim = this.dimension ?? records[0].vector.length;
|
|
@@ -17465,9 +17549,9 @@ var TursoVectorStore = class {
|
|
|
17465
17549
|
const stmts = batch.map((r) => ({
|
|
17466
17550
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17467
17551
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17468
|
-
heading_path, snippet, content_hash, model_id, depth,
|
|
17552
|
+
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17469
17553
|
incoming_links, route_file, tags, embedding)
|
|
17470
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17554
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17471
17555
|
args: [
|
|
17472
17556
|
r.id,
|
|
17473
17557
|
r.metadata.projectId,
|
|
@@ -17478,6 +17562,8 @@ var TursoVectorStore = class {
|
|
|
17478
17562
|
r.metadata.sectionTitle,
|
|
17479
17563
|
JSON.stringify(r.metadata.headingPath),
|
|
17480
17564
|
r.metadata.snippet,
|
|
17565
|
+
r.metadata.chunkText,
|
|
17566
|
+
r.metadata.ordinal,
|
|
17481
17567
|
r.metadata.contentHash,
|
|
17482
17568
|
r.metadata.modelId,
|
|
17483
17569
|
r.metadata.depth,
|
|
@@ -17496,7 +17582,8 @@ var TursoVectorStore = class {
|
|
|
17496
17582
|
const queryJson = JSON.stringify(queryVector);
|
|
17497
17583
|
const rs = await this.client.execute({
|
|
17498
17584
|
sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
|
|
17499
|
-
c.section_title, c.heading_path, c.snippet, c.
|
|
17585
|
+
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17586
|
+
c.ordinal, c.content_hash,
|
|
17500
17587
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17501
17588
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17502
17589
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
@@ -17540,6 +17627,8 @@ var TursoVectorStore = class {
|
|
|
17540
17627
|
sectionTitle: row.section_title,
|
|
17541
17628
|
headingPath: JSON.parse(row.heading_path || "[]"),
|
|
17542
17629
|
snippet: row.snippet,
|
|
17630
|
+
chunkText: row.chunk_text || "",
|
|
17631
|
+
ordinal: row.ordinal || 0,
|
|
17543
17632
|
contentHash: row.content_hash,
|
|
17544
17633
|
modelId: row.model_id,
|
|
17545
17634
|
depth: row.depth,
|
|
@@ -17735,10 +17824,10 @@ var TursoVectorStore = class {
|
|
|
17735
17824
|
// src/vector/factory.ts
|
|
17736
17825
|
async function createVectorStore(config, cwd) {
|
|
17737
17826
|
const turso = config.vector.turso;
|
|
17738
|
-
const remoteUrl = process.env[turso.urlEnv];
|
|
17827
|
+
const remoteUrl = turso.url ?? process.env[turso.urlEnv];
|
|
17739
17828
|
if (remoteUrl) {
|
|
17740
17829
|
const { createClient: createClient2 } = await import('@libsql/client/http');
|
|
17741
|
-
const authToken = process.env[turso.authTokenEnv];
|
|
17830
|
+
const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
|
|
17742
17831
|
const client2 = createClient2({
|
|
17743
17832
|
url: remoteUrl,
|
|
17744
17833
|
authToken
|
|
@@ -17748,6 +17837,12 @@ async function createVectorStore(config, cwd) {
|
|
|
17748
17837
|
dimension: config.vector.dimension
|
|
17749
17838
|
});
|
|
17750
17839
|
}
|
|
17840
|
+
if (isServerless()) {
|
|
17841
|
+
throw new SearchSocketError(
|
|
17842
|
+
"VECTOR_BACKEND_UNAVAILABLE",
|
|
17843
|
+
`No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
|
|
17844
|
+
);
|
|
17845
|
+
}
|
|
17751
17846
|
const { createClient } = await import('@libsql/client');
|
|
17752
17847
|
const localPath = path.resolve(cwd, turso.localPath);
|
|
17753
17848
|
fs.mkdirSync(path.dirname(localPath), { recursive: true });
|
|
@@ -17905,7 +18000,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
17905
18000
|
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
17906
18001
|
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
17907
18002
|
const embedStart = process.hrtime.bigint();
|
|
17908
|
-
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
|
|
18003
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
17909
18004
|
const queryVector = queryEmbeddings[0];
|
|
17910
18005
|
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
17911
18006
|
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
@@ -17933,13 +18028,17 @@ var SearchEngine = class _SearchEngine {
|
|
|
17933
18028
|
usedRerank = true;
|
|
17934
18029
|
}
|
|
17935
18030
|
let results;
|
|
18031
|
+
const minScore = this.config.ranking.minScore;
|
|
17936
18032
|
if (groupByPage) {
|
|
17937
|
-
|
|
18033
|
+
let pages = aggregateByPage(ordered, this.config);
|
|
18034
|
+
if (minScore > 0) {
|
|
18035
|
+
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18036
|
+
}
|
|
17938
18037
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
17939
18038
|
results = pages.slice(0, topK).map((page) => {
|
|
17940
18039
|
const bestScore = page.bestChunk.finalScore;
|
|
17941
|
-
const
|
|
17942
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18040
|
+
const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18041
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
|
|
17943
18042
|
return {
|
|
17944
18043
|
url: page.url,
|
|
17945
18044
|
title: page.title,
|
|
@@ -17956,6 +18055,9 @@ var SearchEngine = class _SearchEngine {
|
|
|
17956
18055
|
};
|
|
17957
18056
|
});
|
|
17958
18057
|
} else {
|
|
18058
|
+
if (minScore > 0) {
|
|
18059
|
+
ordered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18060
|
+
}
|
|
17959
18061
|
results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
17960
18062
|
url: hit.metadata.url,
|
|
17961
18063
|
title: hit.metadata.title,
|
|
@@ -18027,43 +18129,54 @@ var SearchEngine = class _SearchEngine {
|
|
|
18027
18129
|
}
|
|
18028
18130
|
}
|
|
18029
18131
|
async rerankHits(query, ranked, topK) {
|
|
18030
|
-
if (this.config.rerank.
|
|
18132
|
+
if (!this.config.rerank.enabled) {
|
|
18031
18133
|
throw new SearchSocketError(
|
|
18032
18134
|
"INVALID_REQUEST",
|
|
18033
|
-
"rerank=true requested but rerank.
|
|
18135
|
+
"rerank=true requested but rerank.enabled is not set to true.",
|
|
18034
18136
|
400
|
|
18035
18137
|
);
|
|
18036
18138
|
}
|
|
18037
18139
|
if (!this.reranker) {
|
|
18038
18140
|
throw new SearchSocketError(
|
|
18039
18141
|
"CONFIG_MISSING",
|
|
18040
|
-
`rerank=true requested but ${this.config.
|
|
18142
|
+
`rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
|
|
18041
18143
|
400
|
|
18042
18144
|
);
|
|
18043
18145
|
}
|
|
18044
|
-
const
|
|
18045
|
-
|
|
18046
|
-
|
|
18047
|
-
|
|
18146
|
+
const pageGroups = /* @__PURE__ */ new Map();
|
|
18147
|
+
for (const entry of ranked) {
|
|
18148
|
+
const url = entry.hit.metadata.url;
|
|
18149
|
+
const group = pageGroups.get(url);
|
|
18150
|
+
if (group) group.push(entry);
|
|
18151
|
+
else pageGroups.set(url, [entry]);
|
|
18152
|
+
}
|
|
18153
|
+
const pageCandidates = [];
|
|
18154
|
+
for (const [url, chunks] of pageGroups) {
|
|
18155
|
+
const sorted = [...chunks].sort(
|
|
18156
|
+
(a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
|
|
18157
|
+
);
|
|
18158
|
+
const title = sorted[0].hit.metadata.title;
|
|
18159
|
+
const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18160
|
+
pageCandidates.push({ id: url, text: `${title}
|
|
18161
|
+
|
|
18162
|
+
${body}` });
|
|
18163
|
+
}
|
|
18048
18164
|
const reranked = await this.reranker.rerank(
|
|
18049
18165
|
query,
|
|
18050
|
-
|
|
18166
|
+
pageCandidates,
|
|
18051
18167
|
Math.max(topK, this.config.rerank.topN)
|
|
18052
18168
|
);
|
|
18053
|
-
const
|
|
18169
|
+
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18054
18170
|
return ranked.map((entry) => {
|
|
18055
|
-
const
|
|
18056
|
-
const
|
|
18057
|
-
if (
|
|
18058
|
-
return {
|
|
18059
|
-
...entry,
|
|
18060
|
-
finalScore: safeBaseScore
|
|
18061
|
-
};
|
|
18171
|
+
const pageScore = scoreByUrl.get(entry.hit.metadata.url);
|
|
18172
|
+
const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
|
|
18173
|
+
if (pageScore === void 0 || !Number.isFinite(pageScore)) {
|
|
18174
|
+
return { ...entry, finalScore: base };
|
|
18062
18175
|
}
|
|
18063
|
-
const
|
|
18176
|
+
const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
|
|
18064
18177
|
return {
|
|
18065
18178
|
...entry,
|
|
18066
|
-
finalScore: Number.isFinite(
|
|
18179
|
+
finalScore: Number.isFinite(combined) ? combined : base
|
|
18067
18180
|
};
|
|
18068
18181
|
}).sort((a, b) => {
|
|
18069
18182
|
const delta = b.finalScore - a.finalScore;
|
|
@@ -18103,13 +18216,21 @@ function searchsocketHandle(options = {}) {
|
|
|
18103
18216
|
let rateLimiter = null;
|
|
18104
18217
|
const getConfig = async () => {
|
|
18105
18218
|
if (!configPromise) {
|
|
18106
|
-
|
|
18107
|
-
|
|
18108
|
-
|
|
18109
|
-
})
|
|
18219
|
+
let configP;
|
|
18220
|
+
if (options.config) {
|
|
18221
|
+
configP = Promise.resolve(options.config);
|
|
18222
|
+
} else if (options.rawConfig) {
|
|
18223
|
+
const cwd = options.cwd ?? process.cwd();
|
|
18224
|
+
configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
|
|
18225
|
+
} else {
|
|
18226
|
+
configP = loadConfig({
|
|
18227
|
+
cwd: options.cwd,
|
|
18228
|
+
configPath: options.configPath
|
|
18229
|
+
});
|
|
18230
|
+
}
|
|
18110
18231
|
configPromise = configP.then((config) => {
|
|
18111
18232
|
apiPath = apiPath ?? config.api.path;
|
|
18112
|
-
if (config.api.rateLimit) {
|
|
18233
|
+
if (config.api.rateLimit && !isServerless()) {
|
|
18113
18234
|
rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
|
|
18114
18235
|
}
|
|
18115
18236
|
return config;
|
|
@@ -18119,10 +18240,9 @@ function searchsocketHandle(options = {}) {
|
|
|
18119
18240
|
};
|
|
18120
18241
|
const getEngine = async () => {
|
|
18121
18242
|
if (!enginePromise) {
|
|
18122
|
-
const config =
|
|
18243
|
+
const config = await getConfig();
|
|
18123
18244
|
enginePromise = SearchEngine.create({
|
|
18124
18245
|
cwd: options.cwd,
|
|
18125
|
-
configPath: options.configPath,
|
|
18126
18246
|
config
|
|
18127
18247
|
});
|
|
18128
18248
|
}
|
|
@@ -19657,14 +19777,16 @@ function mapUrlToRoute(urlPath, patterns) {
|
|
|
19657
19777
|
var Logger = class {
|
|
19658
19778
|
json;
|
|
19659
19779
|
verbose;
|
|
19780
|
+
quiet;
|
|
19660
19781
|
stderrOnly;
|
|
19661
19782
|
constructor(opts = {}) {
|
|
19662
19783
|
this.json = opts.json ?? false;
|
|
19663
19784
|
this.verbose = opts.verbose ?? false;
|
|
19785
|
+
this.quiet = opts.quiet ?? false;
|
|
19664
19786
|
this.stderrOnly = opts.stderrOnly ?? false;
|
|
19665
19787
|
}
|
|
19666
19788
|
info(message) {
|
|
19667
|
-
if (this.json) {
|
|
19789
|
+
if (this.quiet || this.json) {
|
|
19668
19790
|
return;
|
|
19669
19791
|
}
|
|
19670
19792
|
this.writeOut(`${message}
|
|
@@ -19678,7 +19800,7 @@ var Logger = class {
|
|
|
19678
19800
|
this.logJson("debug", { message });
|
|
19679
19801
|
return;
|
|
19680
19802
|
}
|
|
19681
|
-
this.writeOut(
|
|
19803
|
+
this.writeOut(` ${message}
|
|
19682
19804
|
`);
|
|
19683
19805
|
}
|
|
19684
19806
|
warn(message) {
|
|
@@ -19705,7 +19827,7 @@ var Logger = class {
|
|
|
19705
19827
|
this.logJson(event, data);
|
|
19706
19828
|
return;
|
|
19707
19829
|
}
|
|
19708
|
-
this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
|
|
19830
|
+
this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
|
|
19709
19831
|
`);
|
|
19710
19832
|
}
|
|
19711
19833
|
writeOut(text) {
|
|
@@ -19890,11 +20012,108 @@ async function startPreviewServer(cwd, options, logger3) {
|
|
|
19890
20012
|
|
|
19891
20013
|
// src/indexing/sources/build/index.ts
|
|
19892
20014
|
var logger = new Logger();
|
|
20015
|
+
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
|
|
20016
|
+
const $ = load(html);
|
|
20017
|
+
const links = [];
|
|
20018
|
+
$("a[href]").each((_i, el) => {
|
|
20019
|
+
const href = $(el).attr("href");
|
|
20020
|
+
if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
20021
|
+
return;
|
|
20022
|
+
}
|
|
20023
|
+
try {
|
|
20024
|
+
const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
|
|
20025
|
+
if (resolved.origin !== baseOrigin) return;
|
|
20026
|
+
if (!["http:", "https:"].includes(resolved.protocol)) return;
|
|
20027
|
+
links.push(normalizeUrlPath(resolved.pathname));
|
|
20028
|
+
} catch {
|
|
20029
|
+
}
|
|
20030
|
+
});
|
|
20031
|
+
return [...new Set(links)];
|
|
20032
|
+
}
|
|
20033
|
+
/**
 * Crawl the preview server breadth-first from the configured seed URLs and
 * return the HTML pages that were fetched successfully.
 *
 * Each pass takes up to the remaining page budget from the front of the queue
 * and fetches those URLs with at most 8 requests in flight. Links found on a
 * page are queued only while that page's depth is below `maxDepth`.
 *
 * @param {{ baseUrl: string }} server - Running preview server; only `baseUrl` is read here.
 * @param {object} buildConfig - Build source config; reads `seedUrls`, `maxDepth`,
 *   `exclude` and `maxPages`.
 * @param {number} [pipelineMaxPages] - Optional extra page cap; floored, clamped to
 *   >= 0, and combined with `buildConfig.maxPages` by taking the smaller value.
 * @returns {Promise<Array<{ url: string, html: string, sourcePath: string, outgoingLinks: string[] }>>}
 *   One entry per page that answered with an OK status and a `text/html` content type.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;
  // NOTE(review): assumes buildConfig.maxPages is always a number (presumably
  // defaulted by config validation). If it were undefined the loop below would
  // never run. Confirm against the schema.
  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];
  // `visited` holds every URL ever queued (fetched or not) so nothing is queued twice.
  const visited = new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2(8);
  // Release the connection behind a response whose body will not be read.
  // Best-effort: a failure to cancel must not turn a skipped page into an error.
  const discardBody = async (response) => {
    try {
      await response.body?.cancel();
    } catch {
      // Nothing useful to do if the stream is already locked or closed.
    }
  };
  for (const seed of seedUrls) {
    const normalized = normalizeUrlPath(seed);
    if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
      visited.add(normalized);
      queue.push({ url: normalized, depth: 0 });
    }
  }
  while (queue.length > 0 && pages.length < effectiveMax) {
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            await discardBody(response);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            await discardBody(response);
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            const links = extractLinksFromHtml(html, item.url, baseOrigin);
            for (const link of links) {
              if (!visited.has(link) && !isExcluded(link, exclude)) {
                visited.add(link);
                queue.push({ url: link, depth: item.depth + 1 });
              }
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            outgoingLinks: []
          };
        })
      )
    );
    // allSettled preserves input order, so results[i] belongs to batch[i].
    for (const [index, result] of results.entries()) {
      if (result.status === "fulfilled") {
        if (result.value) {
          pages.push(result.value);
        }
        continue;
      }
      // Report network/parse failures rather than dropping the page silently.
      const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
      logger.warn(`Skipping ${batch[index].url}: ${reason}`);
    }
  }
  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
|
|
19893
20104
|
async function loadBuildPages(cwd, config, maxPages) {
|
|
19894
20105
|
const buildConfig = config.source.build;
|
|
19895
20106
|
if (!buildConfig) {
|
|
19896
20107
|
throw new Error("build source config is missing");
|
|
19897
20108
|
}
|
|
20109
|
+
if (buildConfig.discover) {
|
|
20110
|
+
const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
20111
|
+
try {
|
|
20112
|
+
return await discoverPages(server2, buildConfig, maxPages);
|
|
20113
|
+
} finally {
|
|
20114
|
+
await server2.shutdown();
|
|
20115
|
+
}
|
|
20116
|
+
}
|
|
19898
20117
|
const routes = await parseManifest(cwd, buildConfig.outputDir);
|
|
19899
20118
|
const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
|
|
19900
20119
|
logger.event("build_routes_discovered", {
|
|
@@ -19905,7 +20124,7 @@ async function loadBuildPages(cwd, config, maxPages) {
|
|
|
19905
20124
|
const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
|
|
19906
20125
|
const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
|
|
19907
20126
|
try {
|
|
19908
|
-
const concurrencyLimit =
|
|
20127
|
+
const concurrencyLimit = pLimit2(8);
|
|
19909
20128
|
const results = await Promise.allSettled(
|
|
19910
20129
|
selected.map(
|
|
19911
20130
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20074,7 +20293,7 @@ async function loadCrawledPages(config, maxPages) {
|
|
|
20074
20293
|
const routes = await resolveRoutes(config);
|
|
20075
20294
|
const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
|
|
20076
20295
|
const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
|
|
20077
|
-
const concurrencyLimit =
|
|
20296
|
+
const concurrencyLimit = pLimit2(8);
|
|
20078
20297
|
const results = await Promise.allSettled(
|
|
20079
20298
|
selected.map(
|
|
20080
20299
|
(route) => concurrencyLimit(async () => {
|
|
@@ -20128,9 +20347,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20128
20347
|
|
|
20129
20348
|
// src/indexing/pipeline.ts
|
|
20130
20349
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20131
|
-
"
|
|
20132
|
-
"text-embedding-3-large": 13e-5,
|
|
20133
|
-
"text-embedding-ada-002": 1e-4
|
|
20350
|
+
"jina-embeddings-v3": 2e-5
|
|
20134
20351
|
};
|
|
20135
20352
|
var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
|
|
20136
20353
|
var IndexPipeline = class _IndexPipeline {
|
|
@@ -20176,9 +20393,15 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20176
20393
|
};
|
|
20177
20394
|
const scope = resolveScope(this.config, options.scopeOverride);
|
|
20178
20395
|
const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
|
|
20396
|
+
const sourceMode = options.sourceOverride ?? this.config.source.mode;
|
|
20397
|
+
this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
|
|
20179
20398
|
if (options.force) {
|
|
20399
|
+
this.logger.info("Force mode enabled \u2014 full rebuild");
|
|
20180
20400
|
await cleanMirrorForScope(statePath, scope);
|
|
20181
20401
|
}
|
|
20402
|
+
if (options.dryRun) {
|
|
20403
|
+
this.logger.info("Dry run \u2014 no writes will be performed");
|
|
20404
|
+
}
|
|
20182
20405
|
const manifestStart = stageStart();
|
|
20183
20406
|
const existingHashes = await this.vectorStore.getContentHashes(scope);
|
|
20184
20407
|
const existingModelId = await this.vectorStore.getScopeModelId(scope);
|
|
@@ -20189,8 +20412,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20189
20412
|
);
|
|
20190
20413
|
}
|
|
20191
20414
|
stageEnd("manifest", manifestStart);
|
|
20415
|
+
this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
|
|
20192
20416
|
const sourceStart = stageStart();
|
|
20193
|
-
|
|
20417
|
+
this.logger.info(`Loading pages (source: ${sourceMode})...`);
|
|
20194
20418
|
let sourcePages;
|
|
20195
20419
|
if (sourceMode === "static-output") {
|
|
20196
20420
|
sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
|
|
@@ -20202,10 +20426,13 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20202
20426
|
sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
|
|
20203
20427
|
}
|
|
20204
20428
|
stageEnd("source", sourceStart);
|
|
20429
|
+
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20205
20430
|
const routeStart = stageStart();
|
|
20206
20431
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20207
20432
|
stageEnd("route_map", routeStart);
|
|
20433
|
+
this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
|
|
20208
20434
|
const extractStart = stageStart();
|
|
20435
|
+
this.logger.info("Extracting content...");
|
|
20209
20436
|
const extractedPages = [];
|
|
20210
20437
|
for (const sourcePage of sourcePages) {
|
|
20211
20438
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
@@ -20234,6 +20461,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20234
20461
|
uniquePages.push(page);
|
|
20235
20462
|
}
|
|
20236
20463
|
stageEnd("extract", extractStart);
|
|
20464
|
+
const skippedPages = sourcePages.length - uniquePages.length;
|
|
20465
|
+
this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20237
20466
|
const linkStart = stageStart();
|
|
20238
20467
|
const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
|
|
20239
20468
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
@@ -20249,7 +20478,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20249
20478
|
}
|
|
20250
20479
|
}
|
|
20251
20480
|
stageEnd("links", linkStart);
|
|
20481
|
+
this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
|
|
20252
20482
|
const mirrorStart = stageStart();
|
|
20483
|
+
this.logger.info("Writing mirror pages...");
|
|
20253
20484
|
const mirrorPages = [];
|
|
20254
20485
|
let routeExact = 0;
|
|
20255
20486
|
let routeBestEffort = 0;
|
|
@@ -20319,7 +20550,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20319
20550
|
await this.vectorStore.upsertPages(pageRecords, scope);
|
|
20320
20551
|
}
|
|
20321
20552
|
stageEnd("mirror", mirrorStart);
|
|
20553
|
+
this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
|
|
20322
20554
|
const chunkStart = stageStart();
|
|
20555
|
+
this.logger.info("Chunking pages...");
|
|
20323
20556
|
let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
|
|
20324
20557
|
const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
|
|
20325
20558
|
if (typeof maxChunks === "number") {
|
|
@@ -20332,6 +20565,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20332
20565
|
});
|
|
20333
20566
|
}
|
|
20334
20567
|
stageEnd("chunk", chunkStart);
|
|
20568
|
+
this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
|
|
20335
20569
|
const currentChunkMap = /* @__PURE__ */ new Map();
|
|
20336
20570
|
for (const chunk of chunks) {
|
|
20337
20571
|
currentChunkMap.set(chunk.chunkKey, chunk);
|
|
@@ -20350,6 +20584,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20350
20584
|
return existingHash !== chunk.contentHash;
|
|
20351
20585
|
});
|
|
20352
20586
|
const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
|
|
20587
|
+
this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
|
|
20353
20588
|
const embedStart = stageStart();
|
|
20354
20589
|
const chunkTokenEstimates = /* @__PURE__ */ new Map();
|
|
20355
20590
|
for (const chunk of changedChunks) {
|
|
@@ -20364,9 +20599,11 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20364
20599
|
let newEmbeddings = 0;
|
|
20365
20600
|
const vectorsByChunk = /* @__PURE__ */ new Map();
|
|
20366
20601
|
if (!options.dryRun && changedChunks.length > 0) {
|
|
20602
|
+
this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
|
|
20367
20603
|
const embeddings = await this.embeddings.embedTexts(
|
|
20368
20604
|
changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
|
|
20369
|
-
this.config.embeddings.model
|
|
20605
|
+
this.config.embeddings.model,
|
|
20606
|
+
"retrieval.passage"
|
|
20370
20607
|
);
|
|
20371
20608
|
if (embeddings.length !== changedChunks.length) {
|
|
20372
20609
|
throw new SearchSocketError(
|
|
@@ -20389,8 +20626,14 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20389
20626
|
}
|
|
20390
20627
|
}
|
|
20391
20628
|
stageEnd("embedding", embedStart);
|
|
20629
|
+
if (changedChunks.length > 0) {
|
|
20630
|
+
this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
|
|
20631
|
+
} else {
|
|
20632
|
+
this.logger.info("No chunks to embed \u2014 all up to date");
|
|
20633
|
+
}
|
|
20392
20634
|
const syncStart = stageStart();
|
|
20393
20635
|
if (!options.dryRun) {
|
|
20636
|
+
this.logger.info("Syncing vectors...");
|
|
20394
20637
|
const upserts = [];
|
|
20395
20638
|
for (const chunk of changedChunks) {
|
|
20396
20639
|
const vector = vectorsByChunk.get(chunk.chunkKey);
|
|
@@ -20409,6 +20652,8 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20409
20652
|
sectionTitle: chunk.sectionTitle ?? "",
|
|
20410
20653
|
headingPath: chunk.headingPath,
|
|
20411
20654
|
snippet: chunk.snippet,
|
|
20655
|
+
chunkText: chunk.chunkText.slice(0, 4e3),
|
|
20656
|
+
ordinal: chunk.ordinal,
|
|
20412
20657
|
contentHash: chunk.contentHash,
|
|
20413
20658
|
modelId: this.config.embeddings.model,
|
|
20414
20659
|
depth: chunk.depth,
|
|
@@ -20428,6 +20673,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20428
20673
|
}
|
|
20429
20674
|
}
|
|
20430
20675
|
stageEnd("sync", syncStart);
|
|
20676
|
+
this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
|
|
20431
20677
|
const finalizeStart = stageStart();
|
|
20432
20678
|
if (!options.dryRun) {
|
|
20433
20679
|
const scopeInfo = {
|
|
@@ -20447,6 +20693,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20447
20693
|
});
|
|
20448
20694
|
}
|
|
20449
20695
|
stageEnd("finalize", finalizeStart);
|
|
20696
|
+
this.logger.info("Done.");
|
|
20450
20697
|
return {
|
|
20451
20698
|
pagesProcessed: mirrorPages.length,
|
|
20452
20699
|
chunksTotal: chunks.length,
|