searchsocket 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,8 +4,7 @@ var fs = require('fs');
4
4
  var path = require('path');
5
5
  var jiti = require('jiti');
6
6
  var zod = require('zod');
7
- var OpenAI = require('openai');
8
- var pLimit = require('p-limit');
7
+ var pLimit2 = require('p-limit');
9
8
  var child_process = require('child_process');
10
9
  var crypto = require('crypto');
11
10
  var cheerio = require('cheerio');
@@ -19,8 +18,7 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
19
18
 
20
19
  var fs__default = /*#__PURE__*/_interopDefault(fs);
21
20
  var path__default = /*#__PURE__*/_interopDefault(path);
22
- var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
23
- var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
21
+ var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
24
22
  var matter__default = /*#__PURE__*/_interopDefault(matter);
25
23
  var fs4__default = /*#__PURE__*/_interopDefault(fs4);
26
24
  var fg__default = /*#__PURE__*/_interopDefault(fg);
@@ -16629,7 +16627,11 @@ var searchSocketConfigSchema = zod.z.object({
16629
16627
  outputDir: zod.z.string().min(1).optional(),
16630
16628
  paramValues: zod.z.record(zod.z.string(), zod.z.array(zod.z.string())).optional(),
16631
16629
  exclude: zod.z.array(zod.z.string()).optional(),
16632
- previewTimeout: zod.z.number().int().positive().optional()
16630
+ previewTimeout: zod.z.number().int().positive().optional(),
16631
+ discover: zod.z.boolean().optional(),
16632
+ seedUrls: zod.z.array(zod.z.string()).optional(),
16633
+ maxPages: zod.z.number().int().positive().optional(),
16634
+ maxDepth: zod.z.number().int().nonnegative().optional()
16633
16635
  }).optional()
16634
16636
  }).optional(),
16635
16637
  extract: zod.z.object({
@@ -16656,8 +16658,9 @@ var searchSocketConfigSchema = zod.z.object({
16656
16658
  pageSummaryChunk: zod.z.boolean().optional()
16657
16659
  }).optional(),
16658
16660
  embeddings: zod.z.object({
16659
- provider: zod.z.literal("openai").optional(),
16661
+ provider: zod.z.literal("jina").optional(),
16660
16662
  model: zod.z.string().min(1).optional(),
16663
+ apiKey: zod.z.string().min(1).optional(),
16661
16664
  apiKeyEnv: zod.z.string().min(1).optional(),
16662
16665
  batchSize: zod.z.number().int().positive().optional(),
16663
16666
  concurrency: zod.z.number().int().positive().optional(),
@@ -16666,18 +16669,17 @@ var searchSocketConfigSchema = zod.z.object({
16666
16669
  vector: zod.z.object({
16667
16670
  dimension: zod.z.number().int().positive().optional(),
16668
16671
  turso: zod.z.object({
16672
+ url: zod.z.string().url().optional(),
16673
+ authToken: zod.z.string().min(1).optional(),
16669
16674
  urlEnv: zod.z.string().optional(),
16670
16675
  authTokenEnv: zod.z.string().optional(),
16671
16676
  localPath: zod.z.string().optional()
16672
16677
  }).optional()
16673
16678
  }).optional(),
16674
16679
  rerank: zod.z.object({
16675
- provider: zod.z.enum(["none", "jina"]).optional(),
16680
+ enabled: zod.z.boolean().optional(),
16676
16681
  topN: zod.z.number().int().positive().optional(),
16677
- jina: zod.z.object({
16678
- apiKeyEnv: zod.z.string().optional(),
16679
- model: zod.z.string().optional()
16680
- }).optional()
16682
+ model: zod.z.string().optional()
16681
16683
  }).optional(),
16682
16684
  ranking: zod.z.object({
16683
16685
  enableIncomingLinkBoost: zod.z.boolean().optional(),
@@ -16686,6 +16688,7 @@ var searchSocketConfigSchema = zod.z.object({
16686
16688
  aggregationCap: zod.z.number().int().positive().optional(),
16687
16689
  aggregationDecay: zod.z.number().min(0).max(1).optional(),
16688
16690
  minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
16691
+ minScore: zod.z.number().min(0).max(1).optional(),
16689
16692
  weights: zod.z.object({
16690
16693
  incomingLinks: zod.z.number().optional(),
16691
16694
  depth: zod.z.number().optional(),
@@ -16766,9 +16769,9 @@ function createDefaultConfig(projectId) {
16766
16769
  pageSummaryChunk: true
16767
16770
  },
16768
16771
  embeddings: {
16769
- provider: "openai",
16770
- model: "text-embedding-3-small",
16771
- apiKeyEnv: "OPENAI_API_KEY",
16772
+ provider: "jina",
16773
+ model: "jina-embeddings-v3",
16774
+ apiKeyEnv: "JINA_API_KEY",
16772
16775
  batchSize: 64,
16773
16776
  concurrency: 4
16774
16777
  },
@@ -16780,12 +16783,9 @@ function createDefaultConfig(projectId) {
16780
16783
  }
16781
16784
  },
16782
16785
  rerank: {
16783
- provider: "none",
16786
+ enabled: false,
16784
16787
  topN: 20,
16785
- jina: {
16786
- apiKeyEnv: "JINA_API_KEY",
16787
- model: "jina-reranker-v2-base-multilingual"
16788
- }
16788
+ model: "jina-reranker-v2-base-multilingual"
16789
16789
  },
16790
16790
  ranking: {
16791
16791
  enableIncomingLinkBoost: true,
@@ -16794,6 +16794,7 @@ function createDefaultConfig(projectId) {
16794
16794
  aggregationCap: 5,
16795
16795
  aggregationDecay: 0.5,
16796
16796
  minChunkScoreRatio: 0.5,
16797
+ minScore: 0,
16797
16798
  weights: {
16798
16799
  incomingLinks: 0.05,
16799
16800
  depth: 0.03,
@@ -16920,7 +16921,11 @@ ${issues}`
16920
16921
  outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
16921
16922
  paramValues: parsed.source.build.paramValues ?? {},
16922
16923
  exclude: parsed.source.build.exclude ?? [],
16923
- previewTimeout: parsed.source.build.previewTimeout ?? 3e4
16924
+ previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
16925
+ discover: parsed.source.build.discover ?? false,
16926
+ seedUrls: parsed.source.build.seedUrls ?? ["/"],
16927
+ maxPages: parsed.source.build.maxPages ?? 200,
16928
+ maxDepth: parsed.source.build.maxDepth ?? 10
16924
16929
  } : void 0
16925
16930
  },
16926
16931
  extract: {
@@ -16949,11 +16954,7 @@ ${issues}`
16949
16954
  },
16950
16955
  rerank: {
16951
16956
  ...defaults.rerank,
16952
- ...parsed.rerank,
16953
- jina: {
16954
- ...defaults.rerank.jina,
16955
- ...parsed.rerank?.jina
16956
- }
16957
+ ...parsed.rerank
16957
16958
  },
16958
16959
  ranking: {
16959
16960
  ...defaults.ranking,
@@ -17000,7 +17001,11 @@ ${issues}`
17000
17001
  outputDir: ".svelte-kit/output",
17001
17002
  paramValues: {},
17002
17003
  exclude: [],
17003
- previewTimeout: 3e4
17004
+ previewTimeout: 3e4,
17005
+ discover: false,
17006
+ seedUrls: ["/"],
17007
+ maxPages: 200,
17008
+ maxDepth: 10
17004
17009
  };
17005
17010
  }
17006
17011
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
@@ -17035,15 +17040,21 @@ async function loadConfig(options = {}) {
17035
17040
  const raw = loaded.default ?? loaded;
17036
17041
  return mergeConfig(cwd, raw);
17037
17042
  }
17043
+
17044
+ // src/core/serverless.ts
17045
+ function isServerless() {
17046
+ return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
17047
+ }
17038
17048
  function sleep(ms) {
17039
17049
  return new Promise((resolve) => {
17040
17050
  setTimeout(resolve, ms);
17041
17051
  });
17042
17052
  }
17043
- var OpenAIEmbeddingsProvider = class {
17044
- client;
17053
+ var JinaEmbeddingsProvider = class {
17054
+ apiKey;
17045
17055
  batchSize;
17046
17056
  concurrency;
17057
+ defaultTask;
17047
17058
  constructor(options) {
17048
17059
  if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
17049
17060
  throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
@@ -17051,11 +17062,10 @@ var OpenAIEmbeddingsProvider = class {
17051
17062
  if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
17052
17063
  throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
17053
17064
  }
17054
- this.client = new OpenAI__default.default({
17055
- apiKey: options.apiKey
17056
- });
17065
+ this.apiKey = options.apiKey;
17057
17066
  this.batchSize = options.batchSize;
17058
17067
  this.concurrency = options.concurrency;
17068
+ this.defaultTask = options.task ?? "retrieval.passage";
17059
17069
  }
17060
17070
  estimateTokens(text) {
17061
17071
  const normalized = text.trim();
@@ -17069,7 +17079,7 @@ var OpenAIEmbeddingsProvider = class {
17069
17079
  const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
17070
17080
  return Math.max(1, Math.max(charEstimate, lexicalEstimate));
17071
17081
  }
17072
- async embedTexts(texts, modelId) {
17082
+ async embedTexts(texts, modelId, task) {
17073
17083
  if (texts.length === 0) {
17074
17084
  return [];
17075
17085
  }
@@ -17081,37 +17091,56 @@ var OpenAIEmbeddingsProvider = class {
17081
17091
  });
17082
17092
  }
17083
17093
  const outputs = new Array(batches.length);
17084
- const limit = pLimit__default.default(this.concurrency);
17094
+ const limit = pLimit2__default.default(this.concurrency);
17085
17095
  await Promise.all(
17086
17096
  batches.map(
17087
17097
  (batch, position) => limit(async () => {
17088
- outputs[position] = await this.embedWithRetry(batch.values, modelId);
17098
+ outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
17089
17099
  })
17090
17100
  )
17091
17101
  );
17092
17102
  return outputs.flat();
17093
17103
  }
17094
- async embedWithRetry(texts, modelId) {
17104
+ async embedWithRetry(texts, modelId, task) {
17095
17105
  const maxAttempts = 5;
17096
17106
  let attempt = 0;
17097
17107
  while (attempt < maxAttempts) {
17098
17108
  attempt += 1;
17109
+ let response;
17099
17110
  try {
17100
- const response = await this.client.embeddings.create({
17101
- model: modelId,
17102
- input: texts,
17103
- encoding_format: "float"
17111
+ response = await fetch("https://api.jina.ai/v1/embeddings", {
17112
+ method: "POST",
17113
+ headers: {
17114
+ "content-type": "application/json",
17115
+ authorization: `Bearer ${this.apiKey}`
17116
+ },
17117
+ body: JSON.stringify({
17118
+ model: modelId,
17119
+ input: texts,
17120
+ task
17121
+ })
17104
17122
  });
17105
- return response.data.map((entry) => entry.embedding);
17106
17123
  } catch (error) {
17107
- const status = error.status;
17108
- const retryable = status === 429 || typeof status === "number" && status >= 500;
17109
- if (!retryable || attempt >= maxAttempts) {
17124
+ if (attempt >= maxAttempts) {
17110
17125
  throw error;
17111
17126
  }
17112
- const delay = Math.min(2 ** attempt * 300, 5e3);
17113
- await sleep(delay);
17127
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17128
+ continue;
17114
17129
  }
17130
+ if (!response.ok) {
17131
+ const retryable = response.status === 429 || response.status >= 500;
17132
+ if (!retryable || attempt >= maxAttempts) {
17133
+ const errorBody = await response.text();
17134
+ throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
17135
+ }
17136
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17137
+ continue;
17138
+ }
17139
+ const payload = await response.json();
17140
+ if (!payload.data || !Array.isArray(payload.data)) {
17141
+ throw new Error("Invalid Jina embeddings response format");
17142
+ }
17143
+ return payload.data.map((entry) => entry.embedding);
17115
17144
  }
17116
17145
  throw new Error("Unreachable retry state");
17117
17146
  }
@@ -17119,20 +17148,20 @@ var OpenAIEmbeddingsProvider = class {
17119
17148
 
17120
17149
  // src/embeddings/factory.ts
17121
17150
  function createEmbeddingsProvider(config) {
17122
- if (config.embeddings.provider !== "openai") {
17151
+ if (config.embeddings.provider !== "jina") {
17123
17152
  throw new SearchSocketError(
17124
17153
  "CONFIG_MISSING",
17125
17154
  `Unsupported embeddings provider ${config.embeddings.provider}`
17126
17155
  );
17127
17156
  }
17128
- const apiKey = process.env[config.embeddings.apiKeyEnv];
17157
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17129
17158
  if (!apiKey) {
17130
17159
  throw new SearchSocketError(
17131
17160
  "CONFIG_MISSING",
17132
- `Missing embeddings API key env var: ${config.embeddings.apiKeyEnv}`
17161
+ `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
17133
17162
  );
17134
17163
  }
17135
- return new OpenAIEmbeddingsProvider({
17164
+ return new JinaEmbeddingsProvider({
17136
17165
  apiKey,
17137
17166
  batchSize: config.embeddings.batchSize,
17138
17167
  concurrency: config.embeddings.concurrency
@@ -17295,20 +17324,17 @@ var JinaReranker = class {
17295
17324
 
17296
17325
  // src/rerank/factory.ts
17297
17326
  function createReranker(config) {
17298
- if (config.rerank.provider === "none") {
17327
+ if (!config.rerank.enabled) {
17299
17328
  return null;
17300
17329
  }
17301
- if (config.rerank.provider === "jina") {
17302
- const apiKey = process.env[config.rerank.jina.apiKeyEnv];
17303
- if (!apiKey) {
17304
- return null;
17305
- }
17306
- return new JinaReranker({
17307
- apiKey,
17308
- model: config.rerank.jina.model
17309
- });
17330
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17331
+ if (!apiKey) {
17332
+ return null;
17310
17333
  }
17311
- return null;
17334
+ return new JinaReranker({
17335
+ apiKey,
17336
+ model: config.rerank.model
17337
+ });
17312
17338
  }
17313
17339
 
17314
17340
  // src/utils/time.ts
@@ -17413,6 +17439,16 @@ var TursoVectorStore = class {
17413
17439
  }
17414
17440
  async ensureChunks(dim) {
17415
17441
  if (this.chunksReady) return;
17442
+ const exists = await this.chunksTableExists();
17443
+ if (exists) {
17444
+ const currentDim = await this.getChunksDimension();
17445
+ if (currentDim !== null && currentDim !== dim) {
17446
+ await this.client.batch([
17447
+ "DROP INDEX IF EXISTS idx",
17448
+ "DROP TABLE IF EXISTS chunks"
17449
+ ]);
17450
+ }
17451
+ }
17416
17452
  await this.client.batch([
17417
17453
  `CREATE TABLE IF NOT EXISTS chunks (
17418
17454
  id TEXT PRIMARY KEY,
@@ -17424,12 +17460,16 @@ var TursoVectorStore = class {
17424
17460
  section_title TEXT NOT NULL DEFAULT '',
17425
17461
  heading_path TEXT NOT NULL DEFAULT '[]',
17426
17462
  snippet TEXT NOT NULL DEFAULT '',
17463
+ chunk_text TEXT NOT NULL DEFAULT '',
17464
+ ordinal INTEGER NOT NULL DEFAULT 0,
17427
17465
  content_hash TEXT NOT NULL DEFAULT '',
17428
17466
  model_id TEXT NOT NULL DEFAULT '',
17429
17467
  depth INTEGER NOT NULL DEFAULT 0,
17430
17468
  incoming_links INTEGER NOT NULL DEFAULT 0,
17431
17469
  route_file TEXT NOT NULL DEFAULT '',
17432
17470
  tags TEXT NOT NULL DEFAULT '[]',
17471
+ description TEXT NOT NULL DEFAULT '',
17472
+ keywords TEXT NOT NULL DEFAULT '[]',
17433
17473
  embedding F32_BLOB(${dim})
17434
17474
  )`,
17435
17475
  `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
@@ -17468,6 +17508,38 @@ var TursoVectorStore = class {
17468
17508
  throw error;
17469
17509
  }
17470
17510
  }
17511
+ /**
17512
+ * Read the current F32_BLOB dimension from the chunks table schema.
17513
+ * Returns null if the table doesn't exist or the dimension can't be parsed.
17514
+ */
17515
+ async getChunksDimension() {
17516
+ try {
17517
+ const rs = await this.client.execute(
17518
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
17519
+ );
17520
+ if (rs.rows.length === 0) return null;
17521
+ const sql = rs.rows[0].sql;
17522
+ const match = sql.match(/F32_BLOB\((\d+)\)/i);
17523
+ return match ? parseInt(match[1], 10) : null;
17524
+ } catch {
17525
+ return null;
17526
+ }
17527
+ }
17528
+ /**
17529
+ * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
17530
+ * Used by `clean --remote` for a full reset.
17531
+ */
17532
+ async dropAllTables() {
17533
+ await this.client.batch([
17534
+ "DROP INDEX IF EXISTS idx",
17535
+ "DROP TABLE IF EXISTS chunks",
17536
+ "DROP TABLE IF EXISTS registry",
17537
+ "DROP TABLE IF EXISTS pages"
17538
+ ]);
17539
+ this.chunksReady = false;
17540
+ this.registryReady = false;
17541
+ this.pagesReady = false;
17542
+ }
17471
17543
  async upsert(records, _scope) {
17472
17544
  if (records.length === 0) return;
17473
17545
  const dim = this.dimension ?? records[0].vector.length;
@@ -17478,9 +17550,9 @@ var TursoVectorStore = class {
17478
17550
  const stmts = batch.map((r) => ({
17479
17551
  sql: `INSERT OR REPLACE INTO chunks
17480
17552
  (id, project_id, scope_name, url, path, title, section_title,
17481
- heading_path, snippet, content_hash, model_id, depth,
17482
- incoming_links, route_file, tags, embedding)
17483
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17553
+ heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
17554
+ incoming_links, route_file, tags, description, keywords, embedding)
17555
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17484
17556
  args: [
17485
17557
  r.id,
17486
17558
  r.metadata.projectId,
@@ -17491,12 +17563,16 @@ var TursoVectorStore = class {
17491
17563
  r.metadata.sectionTitle,
17492
17564
  JSON.stringify(r.metadata.headingPath),
17493
17565
  r.metadata.snippet,
17566
+ r.metadata.chunkText,
17567
+ r.metadata.ordinal,
17494
17568
  r.metadata.contentHash,
17495
17569
  r.metadata.modelId,
17496
17570
  r.metadata.depth,
17497
17571
  r.metadata.incomingLinks,
17498
17572
  r.metadata.routeFile,
17499
17573
  JSON.stringify(r.metadata.tags),
17574
+ r.metadata.description ?? "",
17575
+ JSON.stringify(r.metadata.keywords ?? []),
17500
17576
  JSON.stringify(r.vector)
17501
17577
  ]
17502
17578
  }));
@@ -17509,8 +17585,10 @@ var TursoVectorStore = class {
17509
17585
  const queryJson = JSON.stringify(queryVector);
17510
17586
  const rs = await this.client.execute({
17511
17587
  sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
17512
- c.section_title, c.heading_path, c.snippet, c.content_hash,
17588
+ c.section_title, c.heading_path, c.snippet, c.chunk_text,
17589
+ c.ordinal, c.content_hash,
17513
17590
  c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
17591
+ c.description, c.keywords,
17514
17592
  vector_distance_cos(c.embedding, vector(?)) AS distance
17515
17593
  FROM vector_top_k('idx', vector(?), ?) AS v
17516
17594
  JOIN chunks AS c ON c.rowid = v.id`,
@@ -17541,6 +17619,12 @@ var TursoVectorStore = class {
17541
17619
  }
17542
17620
  const distance = row.distance;
17543
17621
  const score = 1 - distance;
17622
+ const description = row.description || void 0;
17623
+ const keywords = (() => {
17624
+ const raw = row.keywords || "[]";
17625
+ const parsed = JSON.parse(raw);
17626
+ return parsed.length > 0 ? parsed : void 0;
17627
+ })();
17544
17628
  hits.push({
17545
17629
  id: row.id,
17546
17630
  score,
@@ -17553,12 +17637,16 @@ var TursoVectorStore = class {
17553
17637
  sectionTitle: row.section_title,
17554
17638
  headingPath: JSON.parse(row.heading_path || "[]"),
17555
17639
  snippet: row.snippet,
17640
+ chunkText: row.chunk_text || "",
17641
+ ordinal: row.ordinal || 0,
17556
17642
  contentHash: row.content_hash,
17557
17643
  modelId: row.model_id,
17558
17644
  depth: row.depth,
17559
17645
  incomingLinks: row.incoming_links,
17560
17646
  routeFile: row.route_file,
17561
- tags
17647
+ tags,
17648
+ description,
17649
+ keywords
17562
17650
  }
17563
17651
  });
17564
17652
  }
@@ -17748,10 +17836,10 @@ var TursoVectorStore = class {
17748
17836
  // src/vector/factory.ts
17749
17837
  async function createVectorStore(config, cwd) {
17750
17838
  const turso = config.vector.turso;
17751
- const remoteUrl = process.env[turso.urlEnv];
17839
+ const remoteUrl = turso.url ?? process.env[turso.urlEnv];
17752
17840
  if (remoteUrl) {
17753
17841
  const { createClient: createClient2 } = await import('@libsql/client/http');
17754
- const authToken = process.env[turso.authTokenEnv];
17842
+ const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
17755
17843
  const client2 = createClient2({
17756
17844
  url: remoteUrl,
17757
17845
  authToken
@@ -17761,6 +17849,12 @@ async function createVectorStore(config, cwd) {
17761
17849
  dimension: config.vector.dimension
17762
17850
  });
17763
17851
  }
17852
+ if (isServerless()) {
17853
+ throw new SearchSocketError(
17854
+ "VECTOR_BACKEND_UNAVAILABLE",
17855
+ `No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
17856
+ );
17857
+ }
17764
17858
  const { createClient } = await import('@libsql/client');
17765
17859
  const localPath = path__default.default.resolve(cwd, turso.localPath);
17766
17860
  fs__default.default.mkdirSync(path__default.default.dirname(localPath), { recursive: true });
@@ -17918,7 +18012,7 @@ var SearchEngine = class _SearchEngine {
17918
18012
  const groupByPage = (input.groupBy ?? "page") === "page";
17919
18013
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
17920
18014
  const embedStart = process.hrtime.bigint();
17921
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
18015
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
17922
18016
  const queryVector = queryEmbeddings[0];
17923
18017
  if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
17924
18018
  throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
@@ -17946,13 +18040,17 @@ var SearchEngine = class _SearchEngine {
17946
18040
  usedRerank = true;
17947
18041
  }
17948
18042
  let results;
18043
+ const minScore = this.config.ranking.minScore;
17949
18044
  if (groupByPage) {
17950
- const pages = aggregateByPage(ordered, this.config);
18045
+ let pages = aggregateByPage(ordered, this.config);
18046
+ if (minScore > 0) {
18047
+ pages = pages.filter((p) => p.pageScore >= minScore);
18048
+ }
17951
18049
  const minRatio = this.config.ranking.minChunkScoreRatio;
17952
18050
  results = pages.slice(0, topK).map((page) => {
17953
18051
  const bestScore = page.bestChunk.finalScore;
17954
- const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
17955
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
18052
+ const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18053
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
17956
18054
  return {
17957
18055
  url: page.url,
17958
18056
  title: page.title,
@@ -17969,6 +18067,9 @@ var SearchEngine = class _SearchEngine {
17969
18067
  };
17970
18068
  });
17971
18069
  } else {
18070
+ if (minScore > 0) {
18071
+ ordered = ordered.filter((entry) => entry.finalScore >= minScore);
18072
+ }
17972
18073
  results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
17973
18074
  url: hit.metadata.url,
17974
18075
  title: hit.metadata.title,
@@ -18040,43 +18141,67 @@ var SearchEngine = class _SearchEngine {
18040
18141
  }
18041
18142
  }
18042
18143
  async rerankHits(query, ranked, topK) {
18043
- if (this.config.rerank.provider !== "jina") {
18144
+ if (!this.config.rerank.enabled) {
18044
18145
  throw new SearchSocketError(
18045
18146
  "INVALID_REQUEST",
18046
- "rerank=true requested but rerank.provider is not configured as 'jina'.",
18147
+ "rerank=true requested but rerank.enabled is not set to true.",
18047
18148
  400
18048
18149
  );
18049
18150
  }
18050
18151
  if (!this.reranker) {
18051
18152
  throw new SearchSocketError(
18052
18153
  "CONFIG_MISSING",
18053
- `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
18154
+ `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
18054
18155
  400
18055
18156
  );
18056
18157
  }
18057
- const candidates = ranked.map(({ hit }) => ({
18058
- id: hit.id,
18059
- text: [hit.metadata.title, hit.metadata.sectionTitle, hit.metadata.snippet].filter(Boolean).join("\n")
18060
- }));
18158
+ const pageGroups = /* @__PURE__ */ new Map();
18159
+ for (const entry of ranked) {
18160
+ const url = entry.hit.metadata.url;
18161
+ const group = pageGroups.get(url);
18162
+ if (group) group.push(entry);
18163
+ else pageGroups.set(url, [entry]);
18164
+ }
18165
+ const MAX_CHUNKS_PER_PAGE = 5;
18166
+ const MIN_CHUNKS_PER_PAGE = 1;
18167
+ const MIN_CHUNK_SCORE_RATIO = 0.5;
18168
+ const pageCandidates = [];
18169
+ for (const [url, chunks] of pageGroups) {
18170
+ const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
18171
+ const bestScore = byScore[0].finalScore;
18172
+ const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
18173
+ const selected = byScore.filter(
18174
+ (c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
18175
+ ).slice(0, MAX_CHUNKS_PER_PAGE);
18176
+ selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
18177
+ const first = selected[0].hit.metadata;
18178
+ const parts = [first.title];
18179
+ if (first.description) {
18180
+ parts.push(first.description);
18181
+ }
18182
+ if (first.keywords && first.keywords.length > 0) {
18183
+ parts.push(first.keywords.join(", "));
18184
+ }
18185
+ const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
18186
+ parts.push(body);
18187
+ pageCandidates.push({ id: url, text: parts.join("\n\n") });
18188
+ }
18061
18189
  const reranked = await this.reranker.rerank(
18062
18190
  query,
18063
- candidates,
18191
+ pageCandidates,
18064
18192
  Math.max(topK, this.config.rerank.topN)
18065
18193
  );
18066
- const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));
18194
+ const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
18067
18195
  return ranked.map((entry) => {
18068
- const rerankScore = rerankScoreById.get(entry.hit.id);
18069
- const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
18070
- if (rerankScore === void 0 || !Number.isFinite(rerankScore)) {
18071
- return {
18072
- ...entry,
18073
- finalScore: safeBaseScore
18074
- };
18196
+ const pageScore = scoreByUrl.get(entry.hit.metadata.url);
18197
+ const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
18198
+ if (pageScore === void 0 || !Number.isFinite(pageScore)) {
18199
+ return { ...entry, finalScore: base };
18075
18200
  }
18076
- const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
18201
+ const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
18077
18202
  return {
18078
18203
  ...entry,
18079
- finalScore: Number.isFinite(combinedScore) ? combinedScore : safeBaseScore
18204
+ finalScore: Number.isFinite(combined) ? combined : base
18080
18205
  };
18081
18206
  }).sort((a, b) => {
18082
18207
  const delta = b.finalScore - a.finalScore;
@@ -18116,13 +18241,21 @@ function searchsocketHandle(options = {}) {
18116
18241
  let rateLimiter = null;
18117
18242
  const getConfig = async () => {
18118
18243
  if (!configPromise) {
18119
- const configP = options.config ? Promise.resolve(options.config) : loadConfig({
18120
- cwd: options.cwd,
18121
- configPath: options.configPath
18122
- });
18244
+ let configP;
18245
+ if (options.config) {
18246
+ configP = Promise.resolve(options.config);
18247
+ } else if (options.rawConfig) {
18248
+ const cwd = options.cwd ?? process.cwd();
18249
+ configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
18250
+ } else {
18251
+ configP = loadConfig({
18252
+ cwd: options.cwd,
18253
+ configPath: options.configPath
18254
+ });
18255
+ }
18123
18256
  configPromise = configP.then((config) => {
18124
18257
  apiPath = apiPath ?? config.api.path;
18125
- if (config.api.rateLimit) {
18258
+ if (config.api.rateLimit && !isServerless()) {
18126
18259
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
18127
18260
  }
18128
18261
  return config;
@@ -18132,10 +18265,9 @@ function searchsocketHandle(options = {}) {
18132
18265
  };
18133
18266
  const getEngine = async () => {
18134
18267
  if (!enginePromise) {
18135
- const config = options.config;
18268
+ const config = await getConfig();
18136
18269
  enginePromise = SearchEngine.create({
18137
18270
  cwd: options.cwd,
18138
- configPath: options.configPath,
18139
18271
  config
18140
18272
  });
18141
18273
  }
@@ -18562,7 +18694,9 @@ function chunkMirrorPage(page, config, scope) {
18562
18694
  incomingLinks: page.incomingLinks,
18563
18695
  routeFile: page.routeFile,
18564
18696
  tags: page.tags,
18565
- contentHash: ""
18697
+ contentHash: "",
18698
+ description: page.description,
18699
+ keywords: page.keywords
18566
18700
  };
18567
18701
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
18568
18702
  summaryChunk.contentHash = sha256(normalizeText(embeddingText));
@@ -18589,7 +18723,9 @@ function chunkMirrorPage(page, config, scope) {
18589
18723
  incomingLinks: page.incomingLinks,
18590
18724
  routeFile: page.routeFile,
18591
18725
  tags: page.tags,
18592
- contentHash: ""
18726
+ contentHash: "",
18727
+ description: page.description,
18728
+ keywords: page.keywords
18593
18729
  };
18594
18730
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
18595
18731
  chunk.contentHash = sha256(normalizeText(embeddingText));
@@ -19670,14 +19806,16 @@ function mapUrlToRoute(urlPath, patterns) {
19670
19806
  var Logger = class {
19671
19807
  json;
19672
19808
  verbose;
19809
+ quiet;
19673
19810
  stderrOnly;
19674
19811
  constructor(opts = {}) {
19675
19812
  this.json = opts.json ?? false;
19676
19813
  this.verbose = opts.verbose ?? false;
19814
+ this.quiet = opts.quiet ?? false;
19677
19815
  this.stderrOnly = opts.stderrOnly ?? false;
19678
19816
  }
19679
19817
  info(message) {
19680
- if (this.json) {
19818
+ if (this.quiet || this.json) {
19681
19819
  return;
19682
19820
  }
19683
19821
  this.writeOut(`${message}
@@ -19691,7 +19829,7 @@ var Logger = class {
19691
19829
  this.logJson("debug", { message });
19692
19830
  return;
19693
19831
  }
19694
- this.writeOut(`${message}
19832
+ this.writeOut(` ${message}
19695
19833
  `);
19696
19834
  }
19697
19835
  warn(message) {
@@ -19718,7 +19856,7 @@ var Logger = class {
19718
19856
  this.logJson(event, data);
19719
19857
  return;
19720
19858
  }
19721
- this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
19859
+ this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
19722
19860
  `);
19723
19861
  }
19724
19862
  writeOut(text) {
@@ -19903,11 +20041,108 @@ async function startPreviewServer(cwd, options, logger3) {
19903
20041
 
19904
20042
  // src/indexing/sources/build/index.ts
19905
20043
  var logger = new Logger();
20044
+ function extractLinksFromHtml(html, pageUrl, baseOrigin) {
20045
+ const $ = cheerio.load(html);
20046
+ const links = [];
20047
+ $("a[href]").each((_i, el) => {
20048
+ const href = $(el).attr("href");
20049
+ if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
20050
+ return;
20051
+ }
20052
+ try {
20053
+ const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
20054
+ if (resolved.origin !== baseOrigin) return;
20055
+ if (!["http:", "https:"].includes(resolved.protocol)) return;
20056
+ links.push(normalizeUrlPath(resolved.pathname));
20057
+ } catch {
20058
+ }
20059
+ });
20060
+ return [...new Set(links)];
20061
+ }
20062
+ async function discoverPages(server, buildConfig, pipelineMaxPages) {
20063
+ const { seedUrls, maxDepth, exclude } = buildConfig;
20064
+ const baseOrigin = new URL(server.baseUrl).origin;
20065
+ let effectiveMax = buildConfig.maxPages;
20066
+ if (typeof pipelineMaxPages === "number") {
20067
+ const floored = Math.max(0, Math.floor(pipelineMaxPages));
20068
+ effectiveMax = Math.min(effectiveMax, floored);
20069
+ }
20070
+ if (effectiveMax === 0) return [];
20071
+ const visited = /* @__PURE__ */ new Set();
20072
+ const pages = [];
20073
+ const queue = [];
20074
+ const limit = pLimit2__default.default(8);
20075
+ for (const seed of seedUrls) {
20076
+ const normalized = normalizeUrlPath(seed);
20077
+ if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
20078
+ visited.add(normalized);
20079
+ queue.push({ url: normalized, depth: 0 });
20080
+ }
20081
+ }
20082
+ while (queue.length > 0 && pages.length < effectiveMax) {
20083
+ const remaining = effectiveMax - pages.length;
20084
+ const batch = queue.splice(0, remaining);
20085
+ const results = await Promise.allSettled(
20086
+ batch.map(
20087
+ (item) => limit(async () => {
20088
+ const fullUrl = joinUrl(server.baseUrl, item.url);
20089
+ const response = await fetch(fullUrl);
20090
+ if (!response.ok) {
20091
+ logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
20092
+ return null;
20093
+ }
20094
+ const contentType = response.headers.get("content-type") ?? "";
20095
+ if (!contentType.includes("text/html")) {
20096
+ return null;
20097
+ }
20098
+ const html = await response.text();
20099
+ if (item.depth < maxDepth) {
20100
+ const links = extractLinksFromHtml(html, item.url, baseOrigin);
20101
+ for (const link of links) {
20102
+ if (!visited.has(link) && !isExcluded(link, exclude)) {
20103
+ visited.add(link);
20104
+ queue.push({ url: link, depth: item.depth + 1 });
20105
+ }
20106
+ }
20107
+ }
20108
+ return {
20109
+ url: item.url,
20110
+ html,
20111
+ sourcePath: fullUrl,
20112
+ outgoingLinks: []
20113
+ };
20114
+ })
20115
+ )
20116
+ );
20117
+ for (const result of results) {
20118
+ if (result.status === "fulfilled" && result.value) {
20119
+ pages.push(result.value);
20120
+ }
20121
+ }
20122
+ }
20123
+ if (pages.length >= effectiveMax && queue.length > 0) {
20124
+ logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
20125
+ }
20126
+ logger.event("build_discover_complete", {
20127
+ pagesFound: pages.length,
20128
+ urlsVisited: visited.size,
20129
+ urlsSkipped: queue.length
20130
+ });
20131
+ return pages;
20132
+ }
19906
20133
  async function loadBuildPages(cwd, config, maxPages) {
19907
20134
  const buildConfig = config.source.build;
19908
20135
  if (!buildConfig) {
19909
20136
  throw new Error("build source config is missing");
19910
20137
  }
20138
+ if (buildConfig.discover) {
20139
+ const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
20140
+ try {
20141
+ return await discoverPages(server2, buildConfig, maxPages);
20142
+ } finally {
20143
+ await server2.shutdown();
20144
+ }
20145
+ }
19911
20146
  const routes = await parseManifest(cwd, buildConfig.outputDir);
19912
20147
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
19913
20148
  logger.event("build_routes_discovered", {
@@ -19918,7 +20153,7 @@ async function loadBuildPages(cwd, config, maxPages) {
19918
20153
  const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
19919
20154
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
19920
20155
  try {
19921
- const concurrencyLimit = pLimit__default.default(8);
20156
+ const concurrencyLimit = pLimit2__default.default(8);
19922
20157
  const results = await Promise.allSettled(
19923
20158
  selected.map(
19924
20159
  (route) => concurrencyLimit(async () => {
@@ -20087,7 +20322,7 @@ async function loadCrawledPages(config, maxPages) {
20087
20322
  const routes = await resolveRoutes(config);
20088
20323
  const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
20089
20324
  const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
20090
- const concurrencyLimit = pLimit__default.default(8);
20325
+ const concurrencyLimit = pLimit2__default.default(8);
20091
20326
  const results = await Promise.allSettled(
20092
20327
  selected.map(
20093
20328
  (route) => concurrencyLimit(async () => {
@@ -20141,9 +20376,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
20141
20376
 
20142
20377
  // src/indexing/pipeline.ts
20143
20378
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
20144
- "text-embedding-3-small": 2e-5,
20145
- "text-embedding-3-large": 13e-5,
20146
- "text-embedding-ada-002": 1e-4
20379
+ "jina-embeddings-v3": 2e-5
20147
20380
  };
20148
20381
  var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20149
20382
  var IndexPipeline = class _IndexPipeline {
@@ -20189,9 +20422,15 @@ var IndexPipeline = class _IndexPipeline {
20189
20422
  };
20190
20423
  const scope = resolveScope(this.config, options.scopeOverride);
20191
20424
  const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
20425
+ const sourceMode = options.sourceOverride ?? this.config.source.mode;
20426
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
20192
20427
  if (options.force) {
20428
+ this.logger.info("Force mode enabled \u2014 full rebuild");
20193
20429
  await cleanMirrorForScope(statePath, scope);
20194
20430
  }
20431
+ if (options.dryRun) {
20432
+ this.logger.info("Dry run \u2014 no writes will be performed");
20433
+ }
20195
20434
  const manifestStart = stageStart();
20196
20435
  const existingHashes = await this.vectorStore.getContentHashes(scope);
20197
20436
  const existingModelId = await this.vectorStore.getScopeModelId(scope);
@@ -20202,8 +20441,9 @@ var IndexPipeline = class _IndexPipeline {
20202
20441
  );
20203
20442
  }
20204
20443
  stageEnd("manifest", manifestStart);
20444
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
20205
20445
  const sourceStart = stageStart();
20206
- const sourceMode = options.sourceOverride ?? this.config.source.mode;
20446
+ this.logger.info(`Loading pages (source: ${sourceMode})...`);
20207
20447
  let sourcePages;
20208
20448
  if (sourceMode === "static-output") {
20209
20449
  sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
@@ -20215,10 +20455,13 @@ var IndexPipeline = class _IndexPipeline {
20215
20455
  sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
20216
20456
  }
20217
20457
  stageEnd("source", sourceStart);
20458
+ this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20218
20459
  const routeStart = stageStart();
20219
20460
  const routePatterns = await buildRoutePatterns(this.cwd);
20220
20461
  stageEnd("route_map", routeStart);
20462
+ this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
20221
20463
  const extractStart = stageStart();
20464
+ this.logger.info("Extracting content...");
20222
20465
  const extractedPages = [];
20223
20466
  for (const sourcePage of sourcePages) {
20224
20467
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
@@ -20247,6 +20490,8 @@ var IndexPipeline = class _IndexPipeline {
20247
20490
  uniquePages.push(page);
20248
20491
  }
20249
20492
  stageEnd("extract", extractStart);
20493
+ const skippedPages = sourcePages.length - uniquePages.length;
20494
+ this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20250
20495
  const linkStart = stageStart();
20251
20496
  const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20252
20497
  const incomingLinkCount = /* @__PURE__ */ new Map();
@@ -20262,7 +20507,9 @@ var IndexPipeline = class _IndexPipeline {
20262
20507
  }
20263
20508
  }
20264
20509
  stageEnd("links", linkStart);
20510
+ this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
20265
20511
  const mirrorStart = stageStart();
20512
+ this.logger.info("Writing mirror pages...");
20266
20513
  const mirrorPages = [];
20267
20514
  let routeExact = 0;
20268
20515
  let routeBestEffort = 0;
@@ -20332,7 +20579,9 @@ var IndexPipeline = class _IndexPipeline {
20332
20579
  await this.vectorStore.upsertPages(pageRecords, scope);
20333
20580
  }
20334
20581
  stageEnd("mirror", mirrorStart);
20582
+ this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
20335
20583
  const chunkStart = stageStart();
20584
+ this.logger.info("Chunking pages...");
20336
20585
  let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
20337
20586
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
20338
20587
  if (typeof maxChunks === "number") {
@@ -20345,6 +20594,7 @@ var IndexPipeline = class _IndexPipeline {
20345
20594
  });
20346
20595
  }
20347
20596
  stageEnd("chunk", chunkStart);
20597
+ this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
20348
20598
  const currentChunkMap = /* @__PURE__ */ new Map();
20349
20599
  for (const chunk of chunks) {
20350
20600
  currentChunkMap.set(chunk.chunkKey, chunk);
@@ -20363,6 +20613,7 @@ var IndexPipeline = class _IndexPipeline {
20363
20613
  return existingHash !== chunk.contentHash;
20364
20614
  });
20365
20615
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
20616
+ this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
20366
20617
  const embedStart = stageStart();
20367
20618
  const chunkTokenEstimates = /* @__PURE__ */ new Map();
20368
20619
  for (const chunk of changedChunks) {
@@ -20377,9 +20628,11 @@ var IndexPipeline = class _IndexPipeline {
20377
20628
  let newEmbeddings = 0;
20378
20629
  const vectorsByChunk = /* @__PURE__ */ new Map();
20379
20630
  if (!options.dryRun && changedChunks.length > 0) {
20631
+ this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
20380
20632
  const embeddings = await this.embeddings.embedTexts(
20381
20633
  changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
20382
- this.config.embeddings.model
20634
+ this.config.embeddings.model,
20635
+ "retrieval.passage"
20383
20636
  );
20384
20637
  if (embeddings.length !== changedChunks.length) {
20385
20638
  throw new SearchSocketError(
@@ -20402,8 +20655,14 @@ var IndexPipeline = class _IndexPipeline {
20402
20655
  }
20403
20656
  }
20404
20657
  stageEnd("embedding", embedStart);
20658
+ if (changedChunks.length > 0) {
20659
+ this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
20660
+ } else {
20661
+ this.logger.info("No chunks to embed \u2014 all up to date");
20662
+ }
20405
20663
  const syncStart = stageStart();
20406
20664
  if (!options.dryRun) {
20665
+ this.logger.info("Syncing vectors...");
20407
20666
  const upserts = [];
20408
20667
  for (const chunk of changedChunks) {
20409
20668
  const vector = vectorsByChunk.get(chunk.chunkKey);
@@ -20422,12 +20681,16 @@ var IndexPipeline = class _IndexPipeline {
20422
20681
  sectionTitle: chunk.sectionTitle ?? "",
20423
20682
  headingPath: chunk.headingPath,
20424
20683
  snippet: chunk.snippet,
20684
+ chunkText: chunk.chunkText.slice(0, 4e3),
20685
+ ordinal: chunk.ordinal,
20425
20686
  contentHash: chunk.contentHash,
20426
20687
  modelId: this.config.embeddings.model,
20427
20688
  depth: chunk.depth,
20428
20689
  incomingLinks: chunk.incomingLinks,
20429
20690
  routeFile: chunk.routeFile,
20430
- tags: chunk.tags
20691
+ tags: chunk.tags,
20692
+ description: chunk.description,
20693
+ keywords: chunk.keywords
20431
20694
  }
20432
20695
  });
20433
20696
  }
@@ -20441,6 +20704,7 @@ var IndexPipeline = class _IndexPipeline {
20441
20704
  }
20442
20705
  }
20443
20706
  stageEnd("sync", syncStart);
20707
+ this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
20444
20708
  const finalizeStart = stageStart();
20445
20709
  if (!options.dryRun) {
20446
20710
  const scopeInfo = {
@@ -20460,6 +20724,7 @@ var IndexPipeline = class _IndexPipeline {
20460
20724
  });
20461
20725
  }
20462
20726
  stageEnd("finalize", finalizeStart);
20727
+ this.logger.info("Done.");
20463
20728
  return {
20464
20729
  pagesProcessed: mirrorPages.length,
20465
20730
  chunksTotal: chunks.length,