searchsocket 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5,8 +5,7 @@ var path = require('path');
5
5
  var jiti = require('jiti');
6
6
  var zod = require('zod');
7
7
  var child_process = require('child_process');
8
- var OpenAI = require('openai');
9
- var pLimit = require('p-limit');
8
+ var pLimit2 = require('p-limit');
10
9
  var crypto = require('crypto');
11
10
  var cheerio = require('cheerio');
12
11
  var matter = require('gray-matter');
@@ -23,8 +22,7 @@ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
23
22
 
24
23
  var fs__default = /*#__PURE__*/_interopDefault(fs);
25
24
  var path__default = /*#__PURE__*/_interopDefault(path);
26
- var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
27
- var pLimit__default = /*#__PURE__*/_interopDefault(pLimit);
25
+ var pLimit2__default = /*#__PURE__*/_interopDefault(pLimit2);
28
26
  var matter__default = /*#__PURE__*/_interopDefault(matter);
29
27
  var fs4__default = /*#__PURE__*/_interopDefault(fs4);
30
28
  var fg__default = /*#__PURE__*/_interopDefault(fg);
@@ -16633,7 +16631,11 @@ var searchSocketConfigSchema = zod.z.object({
16633
16631
  outputDir: zod.z.string().min(1).optional(),
16634
16632
  paramValues: zod.z.record(zod.z.string(), zod.z.array(zod.z.string())).optional(),
16635
16633
  exclude: zod.z.array(zod.z.string()).optional(),
16636
- previewTimeout: zod.z.number().int().positive().optional()
16634
+ previewTimeout: zod.z.number().int().positive().optional(),
16635
+ discover: zod.z.boolean().optional(),
16636
+ seedUrls: zod.z.array(zod.z.string()).optional(),
16637
+ maxPages: zod.z.number().int().positive().optional(),
16638
+ maxDepth: zod.z.number().int().nonnegative().optional()
16637
16639
  }).optional()
16638
16640
  }).optional(),
16639
16641
  extract: zod.z.object({
@@ -16660,8 +16662,9 @@ var searchSocketConfigSchema = zod.z.object({
16660
16662
  pageSummaryChunk: zod.z.boolean().optional()
16661
16663
  }).optional(),
16662
16664
  embeddings: zod.z.object({
16663
- provider: zod.z.literal("openai").optional(),
16665
+ provider: zod.z.literal("jina").optional(),
16664
16666
  model: zod.z.string().min(1).optional(),
16667
+ apiKey: zod.z.string().min(1).optional(),
16665
16668
  apiKeyEnv: zod.z.string().min(1).optional(),
16666
16669
  batchSize: zod.z.number().int().positive().optional(),
16667
16670
  concurrency: zod.z.number().int().positive().optional(),
@@ -16670,18 +16673,17 @@ var searchSocketConfigSchema = zod.z.object({
16670
16673
  vector: zod.z.object({
16671
16674
  dimension: zod.z.number().int().positive().optional(),
16672
16675
  turso: zod.z.object({
16676
+ url: zod.z.string().url().optional(),
16677
+ authToken: zod.z.string().min(1).optional(),
16673
16678
  urlEnv: zod.z.string().optional(),
16674
16679
  authTokenEnv: zod.z.string().optional(),
16675
16680
  localPath: zod.z.string().optional()
16676
16681
  }).optional()
16677
16682
  }).optional(),
16678
16683
  rerank: zod.z.object({
16679
- provider: zod.z.enum(["none", "jina"]).optional(),
16684
+ enabled: zod.z.boolean().optional(),
16680
16685
  topN: zod.z.number().int().positive().optional(),
16681
- jina: zod.z.object({
16682
- apiKeyEnv: zod.z.string().optional(),
16683
- model: zod.z.string().optional()
16684
- }).optional()
16686
+ model: zod.z.string().optional()
16685
16687
  }).optional(),
16686
16688
  ranking: zod.z.object({
16687
16689
  enableIncomingLinkBoost: zod.z.boolean().optional(),
@@ -16690,6 +16692,7 @@ var searchSocketConfigSchema = zod.z.object({
16690
16692
  aggregationCap: zod.z.number().int().positive().optional(),
16691
16693
  aggregationDecay: zod.z.number().min(0).max(1).optional(),
16692
16694
  minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
16695
+ minScore: zod.z.number().min(0).max(1).optional(),
16693
16696
  weights: zod.z.object({
16694
16697
  incomingLinks: zod.z.number().optional(),
16695
16698
  depth: zod.z.number().optional(),
@@ -16770,9 +16773,9 @@ function createDefaultConfig(projectId) {
16770
16773
  pageSummaryChunk: true
16771
16774
  },
16772
16775
  embeddings: {
16773
- provider: "openai",
16774
- model: "text-embedding-3-small",
16775
- apiKeyEnv: "OPENAI_API_KEY",
16776
+ provider: "jina",
16777
+ model: "jina-embeddings-v3",
16778
+ apiKeyEnv: "JINA_API_KEY",
16776
16779
  batchSize: 64,
16777
16780
  concurrency: 4
16778
16781
  },
@@ -16784,12 +16787,9 @@ function createDefaultConfig(projectId) {
16784
16787
  }
16785
16788
  },
16786
16789
  rerank: {
16787
- provider: "none",
16790
+ enabled: false,
16788
16791
  topN: 20,
16789
- jina: {
16790
- apiKeyEnv: "JINA_API_KEY",
16791
- model: "jina-reranker-v2-base-multilingual"
16792
- }
16792
+ model: "jina-reranker-v2-base-multilingual"
16793
16793
  },
16794
16794
  ranking: {
16795
16795
  enableIncomingLinkBoost: true,
@@ -16798,6 +16798,7 @@ function createDefaultConfig(projectId) {
16798
16798
  aggregationCap: 5,
16799
16799
  aggregationDecay: 0.5,
16800
16800
  minChunkScoreRatio: 0.5,
16801
+ minScore: 0,
16801
16802
  weights: {
16802
16803
  incomingLinks: 0.05,
16803
16804
  depth: 0.03,
@@ -16924,7 +16925,11 @@ ${issues}`
16924
16925
  outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
16925
16926
  paramValues: parsed.source.build.paramValues ?? {},
16926
16927
  exclude: parsed.source.build.exclude ?? [],
16927
- previewTimeout: parsed.source.build.previewTimeout ?? 3e4
16928
+ previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
16929
+ discover: parsed.source.build.discover ?? false,
16930
+ seedUrls: parsed.source.build.seedUrls ?? ["/"],
16931
+ maxPages: parsed.source.build.maxPages ?? 200,
16932
+ maxDepth: parsed.source.build.maxDepth ?? 10
16928
16933
  } : void 0
16929
16934
  },
16930
16935
  extract: {
@@ -16953,11 +16958,7 @@ ${issues}`
16953
16958
  },
16954
16959
  rerank: {
16955
16960
  ...defaults.rerank,
16956
- ...parsed.rerank,
16957
- jina: {
16958
- ...defaults.rerank.jina,
16959
- ...parsed.rerank?.jina
16960
- }
16961
+ ...parsed.rerank
16961
16962
  },
16962
16963
  ranking: {
16963
16964
  ...defaults.ranking,
@@ -17004,7 +17005,11 @@ ${issues}`
17004
17005
  outputDir: ".svelte-kit/output",
17005
17006
  paramValues: {},
17006
17007
  exclude: [],
17007
- previewTimeout: 3e4
17008
+ previewTimeout: 3e4,
17009
+ discover: false,
17010
+ seedUrls: ["/"],
17011
+ maxPages: 200,
17012
+ maxDepth: 10
17008
17013
  };
17009
17014
  }
17010
17015
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
@@ -17018,6 +17023,21 @@ ${issues}`
17018
17023
  }
17019
17024
  return merged;
17020
17025
  }
17026
+ function mergeConfigServerless(rawConfig) {
17027
+ if (!rawConfig.project?.id) {
17028
+ throw new SearchSocketError(
17029
+ "CONFIG_MISSING",
17030
+ "`project.id` is required for serverless config (cannot infer from package.json)."
17031
+ );
17032
+ }
17033
+ if (!rawConfig.source?.mode) {
17034
+ throw new SearchSocketError(
17035
+ "CONFIG_MISSING",
17036
+ "`source.mode` is required for serverless config (cannot auto-detect from filesystem)."
17037
+ );
17038
+ }
17039
+ return mergeConfig(process.cwd(), rawConfig);
17040
+ }
17021
17041
  async function loadConfig(options = {}) {
17022
17042
  const cwd = path__default.default.resolve(options.cwd ?? process.cwd());
17023
17043
  const configPath = path__default.default.resolve(cwd, options.configPath ?? "searchsocket.config.ts");
@@ -17040,6 +17060,11 @@ async function loadConfig(options = {}) {
17040
17060
  return mergeConfig(cwd, raw);
17041
17061
  }
17042
17062
 
17063
+ // src/core/serverless.ts
17064
+ function isServerless() {
17065
+ return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
17066
+ }
17067
+
17043
17068
  // src/utils/text.ts
17044
17069
  function normalizeText(input) {
17045
17070
  return input.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim();
@@ -17117,10 +17142,11 @@ function sleep(ms) {
17117
17142
  setTimeout(resolve, ms);
17118
17143
  });
17119
17144
  }
17120
- var OpenAIEmbeddingsProvider = class {
17121
- client;
17145
+ var JinaEmbeddingsProvider = class {
17146
+ apiKey;
17122
17147
  batchSize;
17123
17148
  concurrency;
17149
+ defaultTask;
17124
17150
  constructor(options) {
17125
17151
  if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
17126
17152
  throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
@@ -17128,11 +17154,10 @@ var OpenAIEmbeddingsProvider = class {
17128
17154
  if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
17129
17155
  throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
17130
17156
  }
17131
- this.client = new OpenAI__default.default({
17132
- apiKey: options.apiKey
17133
- });
17157
+ this.apiKey = options.apiKey;
17134
17158
  this.batchSize = options.batchSize;
17135
17159
  this.concurrency = options.concurrency;
17160
+ this.defaultTask = options.task ?? "retrieval.passage";
17136
17161
  }
17137
17162
  estimateTokens(text) {
17138
17163
  const normalized = text.trim();
@@ -17146,7 +17171,7 @@ var OpenAIEmbeddingsProvider = class {
17146
17171
  const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
17147
17172
  return Math.max(1, Math.max(charEstimate, lexicalEstimate));
17148
17173
  }
17149
- async embedTexts(texts, modelId) {
17174
+ async embedTexts(texts, modelId, task) {
17150
17175
  if (texts.length === 0) {
17151
17176
  return [];
17152
17177
  }
@@ -17158,37 +17183,56 @@ var OpenAIEmbeddingsProvider = class {
17158
17183
  });
17159
17184
  }
17160
17185
  const outputs = new Array(batches.length);
17161
- const limit = pLimit__default.default(this.concurrency);
17186
+ const limit = pLimit2__default.default(this.concurrency);
17162
17187
  await Promise.all(
17163
17188
  batches.map(
17164
17189
  (batch, position) => limit(async () => {
17165
- outputs[position] = await this.embedWithRetry(batch.values, modelId);
17190
+ outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
17166
17191
  })
17167
17192
  )
17168
17193
  );
17169
17194
  return outputs.flat();
17170
17195
  }
17171
- async embedWithRetry(texts, modelId) {
17196
+ async embedWithRetry(texts, modelId, task) {
17172
17197
  const maxAttempts = 5;
17173
17198
  let attempt = 0;
17174
17199
  while (attempt < maxAttempts) {
17175
17200
  attempt += 1;
17201
+ let response;
17176
17202
  try {
17177
- const response = await this.client.embeddings.create({
17178
- model: modelId,
17179
- input: texts,
17180
- encoding_format: "float"
17203
+ response = await fetch("https://api.jina.ai/v1/embeddings", {
17204
+ method: "POST",
17205
+ headers: {
17206
+ "content-type": "application/json",
17207
+ authorization: `Bearer ${this.apiKey}`
17208
+ },
17209
+ body: JSON.stringify({
17210
+ model: modelId,
17211
+ input: texts,
17212
+ task
17213
+ })
17181
17214
  });
17182
- return response.data.map((entry) => entry.embedding);
17183
17215
  } catch (error) {
17184
- const status = error.status;
17185
- const retryable = status === 429 || typeof status === "number" && status >= 500;
17186
- if (!retryable || attempt >= maxAttempts) {
17216
+ if (attempt >= maxAttempts) {
17187
17217
  throw error;
17188
17218
  }
17189
- const delay = Math.min(2 ** attempt * 300, 5e3);
17190
- await sleep(delay);
17219
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17220
+ continue;
17221
+ }
17222
+ if (!response.ok) {
17223
+ const retryable = response.status === 429 || response.status >= 500;
17224
+ if (!retryable || attempt >= maxAttempts) {
17225
+ const errorBody = await response.text();
17226
+ throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
17227
+ }
17228
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17229
+ continue;
17230
+ }
17231
+ const payload = await response.json();
17232
+ if (!payload.data || !Array.isArray(payload.data)) {
17233
+ throw new Error("Invalid Jina embeddings response format");
17191
17234
  }
17235
+ return payload.data.map((entry) => entry.embedding);
17192
17236
  }
17193
17237
  throw new Error("Unreachable retry state");
17194
17238
  }
@@ -17196,20 +17240,20 @@ var OpenAIEmbeddingsProvider = class {
17196
17240
 
17197
17241
  // src/embeddings/factory.ts
17198
17242
  function createEmbeddingsProvider(config) {
17199
- if (config.embeddings.provider !== "openai") {
17243
+ if (config.embeddings.provider !== "jina") {
17200
17244
  throw new SearchSocketError(
17201
17245
  "CONFIG_MISSING",
17202
17246
  `Unsupported embeddings provider ${config.embeddings.provider}`
17203
17247
  );
17204
17248
  }
17205
- const apiKey = process.env[config.embeddings.apiKeyEnv];
17249
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17206
17250
  if (!apiKey) {
17207
17251
  throw new SearchSocketError(
17208
17252
  "CONFIG_MISSING",
17209
- `Missing embeddings API key env var: ${config.embeddings.apiKeyEnv}`
17253
+ `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
17210
17254
  );
17211
17255
  }
17212
- return new OpenAIEmbeddingsProvider({
17256
+ return new JinaEmbeddingsProvider({
17213
17257
  apiKey,
17214
17258
  batchSize: config.embeddings.batchSize,
17215
17259
  concurrency: config.embeddings.concurrency
@@ -17299,20 +17343,17 @@ var JinaReranker = class {
17299
17343
 
17300
17344
  // src/rerank/factory.ts
17301
17345
  function createReranker(config) {
17302
- if (config.rerank.provider === "none") {
17346
+ if (!config.rerank.enabled) {
17303
17347
  return null;
17304
17348
  }
17305
- if (config.rerank.provider === "jina") {
17306
- const apiKey = process.env[config.rerank.jina.apiKeyEnv];
17307
- if (!apiKey) {
17308
- return null;
17309
- }
17310
- return new JinaReranker({
17311
- apiKey,
17312
- model: config.rerank.jina.model
17313
- });
17349
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17350
+ if (!apiKey) {
17351
+ return null;
17314
17352
  }
17315
- return null;
17353
+ return new JinaReranker({
17354
+ apiKey,
17355
+ model: config.rerank.model
17356
+ });
17316
17357
  }
17317
17358
  function ensureStateDirs(cwd, stateDir, scope) {
17318
17359
  const statePath = path__default.default.resolve(cwd, stateDir);
@@ -17365,6 +17406,16 @@ var TursoVectorStore = class {
17365
17406
  }
17366
17407
  async ensureChunks(dim) {
17367
17408
  if (this.chunksReady) return;
17409
+ const exists = await this.chunksTableExists();
17410
+ if (exists) {
17411
+ const currentDim = await this.getChunksDimension();
17412
+ if (currentDim !== null && currentDim !== dim) {
17413
+ await this.client.batch([
17414
+ "DROP INDEX IF EXISTS idx",
17415
+ "DROP TABLE IF EXISTS chunks"
17416
+ ]);
17417
+ }
17418
+ }
17368
17419
  await this.client.batch([
17369
17420
  `CREATE TABLE IF NOT EXISTS chunks (
17370
17421
  id TEXT PRIMARY KEY,
@@ -17376,12 +17427,16 @@ var TursoVectorStore = class {
17376
17427
  section_title TEXT NOT NULL DEFAULT '',
17377
17428
  heading_path TEXT NOT NULL DEFAULT '[]',
17378
17429
  snippet TEXT NOT NULL DEFAULT '',
17430
+ chunk_text TEXT NOT NULL DEFAULT '',
17431
+ ordinal INTEGER NOT NULL DEFAULT 0,
17379
17432
  content_hash TEXT NOT NULL DEFAULT '',
17380
17433
  model_id TEXT NOT NULL DEFAULT '',
17381
17434
  depth INTEGER NOT NULL DEFAULT 0,
17382
17435
  incoming_links INTEGER NOT NULL DEFAULT 0,
17383
17436
  route_file TEXT NOT NULL DEFAULT '',
17384
17437
  tags TEXT NOT NULL DEFAULT '[]',
17438
+ description TEXT NOT NULL DEFAULT '',
17439
+ keywords TEXT NOT NULL DEFAULT '[]',
17385
17440
  embedding F32_BLOB(${dim})
17386
17441
  )`,
17387
17442
  `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
@@ -17420,6 +17475,38 @@ var TursoVectorStore = class {
17420
17475
  throw error;
17421
17476
  }
17422
17477
  }
17478
+ /**
17479
+ * Read the current F32_BLOB dimension from the chunks table schema.
17480
+ * Returns null if the table doesn't exist or the dimension can't be parsed.
17481
+ */
17482
+ async getChunksDimension() {
17483
+ try {
17484
+ const rs = await this.client.execute(
17485
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
17486
+ );
17487
+ if (rs.rows.length === 0) return null;
17488
+ const sql = rs.rows[0].sql;
17489
+ const match = sql.match(/F32_BLOB\((\d+)\)/i);
17490
+ return match ? parseInt(match[1], 10) : null;
17491
+ } catch {
17492
+ return null;
17493
+ }
17494
+ }
17495
+ /**
17496
+ * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
17497
+ * Used by `clean --remote` for a full reset.
17498
+ */
17499
+ async dropAllTables() {
17500
+ await this.client.batch([
17501
+ "DROP INDEX IF EXISTS idx",
17502
+ "DROP TABLE IF EXISTS chunks",
17503
+ "DROP TABLE IF EXISTS registry",
17504
+ "DROP TABLE IF EXISTS pages"
17505
+ ]);
17506
+ this.chunksReady = false;
17507
+ this.registryReady = false;
17508
+ this.pagesReady = false;
17509
+ }
17423
17510
  async upsert(records, _scope) {
17424
17511
  if (records.length === 0) return;
17425
17512
  const dim = this.dimension ?? records[0].vector.length;
@@ -17430,9 +17517,9 @@ var TursoVectorStore = class {
17430
17517
  const stmts = batch.map((r) => ({
17431
17518
  sql: `INSERT OR REPLACE INTO chunks
17432
17519
  (id, project_id, scope_name, url, path, title, section_title,
17433
- heading_path, snippet, content_hash, model_id, depth,
17434
- incoming_links, route_file, tags, embedding)
17435
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17520
+ heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
17521
+ incoming_links, route_file, tags, description, keywords, embedding)
17522
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17436
17523
  args: [
17437
17524
  r.id,
17438
17525
  r.metadata.projectId,
@@ -17443,12 +17530,16 @@ var TursoVectorStore = class {
17443
17530
  r.metadata.sectionTitle,
17444
17531
  JSON.stringify(r.metadata.headingPath),
17445
17532
  r.metadata.snippet,
17533
+ r.metadata.chunkText,
17534
+ r.metadata.ordinal,
17446
17535
  r.metadata.contentHash,
17447
17536
  r.metadata.modelId,
17448
17537
  r.metadata.depth,
17449
17538
  r.metadata.incomingLinks,
17450
17539
  r.metadata.routeFile,
17451
17540
  JSON.stringify(r.metadata.tags),
17541
+ r.metadata.description ?? "",
17542
+ JSON.stringify(r.metadata.keywords ?? []),
17452
17543
  JSON.stringify(r.vector)
17453
17544
  ]
17454
17545
  }));
@@ -17461,8 +17552,10 @@ var TursoVectorStore = class {
17461
17552
  const queryJson = JSON.stringify(queryVector);
17462
17553
  const rs = await this.client.execute({
17463
17554
  sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
17464
- c.section_title, c.heading_path, c.snippet, c.content_hash,
17555
+ c.section_title, c.heading_path, c.snippet, c.chunk_text,
17556
+ c.ordinal, c.content_hash,
17465
17557
  c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
17558
+ c.description, c.keywords,
17466
17559
  vector_distance_cos(c.embedding, vector(?)) AS distance
17467
17560
  FROM vector_top_k('idx', vector(?), ?) AS v
17468
17561
  JOIN chunks AS c ON c.rowid = v.id`,
@@ -17493,6 +17586,12 @@ var TursoVectorStore = class {
17493
17586
  }
17494
17587
  const distance = row.distance;
17495
17588
  const score = 1 - distance;
17589
+ const description = row.description || void 0;
17590
+ const keywords = (() => {
17591
+ const raw = row.keywords || "[]";
17592
+ const parsed = JSON.parse(raw);
17593
+ return parsed.length > 0 ? parsed : void 0;
17594
+ })();
17496
17595
  hits.push({
17497
17596
  id: row.id,
17498
17597
  score,
@@ -17505,12 +17604,16 @@ var TursoVectorStore = class {
17505
17604
  sectionTitle: row.section_title,
17506
17605
  headingPath: JSON.parse(row.heading_path || "[]"),
17507
17606
  snippet: row.snippet,
17607
+ chunkText: row.chunk_text || "",
17608
+ ordinal: row.ordinal || 0,
17508
17609
  contentHash: row.content_hash,
17509
17610
  modelId: row.model_id,
17510
17611
  depth: row.depth,
17511
17612
  incomingLinks: row.incoming_links,
17512
17613
  routeFile: row.route_file,
17513
- tags
17614
+ tags,
17615
+ description,
17616
+ keywords
17514
17617
  }
17515
17618
  });
17516
17619
  }
@@ -17700,10 +17803,10 @@ var TursoVectorStore = class {
17700
17803
  // src/vector/factory.ts
17701
17804
  async function createVectorStore(config, cwd) {
17702
17805
  const turso = config.vector.turso;
17703
- const remoteUrl = process.env[turso.urlEnv];
17806
+ const remoteUrl = turso.url ?? process.env[turso.urlEnv];
17704
17807
  if (remoteUrl) {
17705
17808
  const { createClient: createClient2 } = await import('@libsql/client/http');
17706
- const authToken = process.env[turso.authTokenEnv];
17809
+ const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
17707
17810
  const client2 = createClient2({
17708
17811
  url: remoteUrl,
17709
17812
  authToken
@@ -17713,6 +17816,12 @@ async function createVectorStore(config, cwd) {
17713
17816
  dimension: config.vector.dimension
17714
17817
  });
17715
17818
  }
17819
+ if (isServerless()) {
17820
+ throw new SearchSocketError(
17821
+ "VECTOR_BACKEND_UNAVAILABLE",
17822
+ `No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
17823
+ );
17824
+ }
17716
17825
  const { createClient } = await import('@libsql/client');
17717
17826
  const localPath = path__default.default.resolve(cwd, turso.localPath);
17718
17827
  fs__default.default.mkdirSync(path__default.default.dirname(localPath), { recursive: true });
@@ -18043,7 +18152,9 @@ function chunkMirrorPage(page, config, scope) {
18043
18152
  incomingLinks: page.incomingLinks,
18044
18153
  routeFile: page.routeFile,
18045
18154
  tags: page.tags,
18046
- contentHash: ""
18155
+ contentHash: "",
18156
+ description: page.description,
18157
+ keywords: page.keywords
18047
18158
  };
18048
18159
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
18049
18160
  summaryChunk.contentHash = sha256(normalizeText(embeddingText));
@@ -18070,7 +18181,9 @@ function chunkMirrorPage(page, config, scope) {
18070
18181
  incomingLinks: page.incomingLinks,
18071
18182
  routeFile: page.routeFile,
18072
18183
  tags: page.tags,
18073
- contentHash: ""
18184
+ contentHash: "",
18185
+ description: page.description,
18186
+ keywords: page.keywords
18074
18187
  };
18075
18188
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
18076
18189
  chunk.contentHash = sha256(normalizeText(embeddingText));
@@ -19151,14 +19264,16 @@ function mapUrlToRoute(urlPath, patterns) {
19151
19264
  var Logger = class {
19152
19265
  json;
19153
19266
  verbose;
19267
+ quiet;
19154
19268
  stderrOnly;
19155
19269
  constructor(opts = {}) {
19156
19270
  this.json = opts.json ?? false;
19157
19271
  this.verbose = opts.verbose ?? false;
19272
+ this.quiet = opts.quiet ?? false;
19158
19273
  this.stderrOnly = opts.stderrOnly ?? false;
19159
19274
  }
19160
19275
  info(message) {
19161
- if (this.json) {
19276
+ if (this.quiet || this.json) {
19162
19277
  return;
19163
19278
  }
19164
19279
  this.writeOut(`${message}
@@ -19172,7 +19287,7 @@ var Logger = class {
19172
19287
  this.logJson("debug", { message });
19173
19288
  return;
19174
19289
  }
19175
- this.writeOut(`${message}
19290
+ this.writeOut(` ${message}
19176
19291
  `);
19177
19292
  }
19178
19293
  warn(message) {
@@ -19199,7 +19314,7 @@ var Logger = class {
19199
19314
  this.logJson(event, data);
19200
19315
  return;
19201
19316
  }
19202
- this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
19317
+ this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
19203
19318
  `);
19204
19319
  }
19205
19320
  writeOut(text) {
@@ -19384,11 +19499,108 @@ async function startPreviewServer(cwd, options, logger3) {
19384
19499
 
19385
19500
  // src/indexing/sources/build/index.ts
19386
19501
  var logger = new Logger();
19502
+ function extractLinksFromHtml(html, pageUrl, baseOrigin) {
19503
+ const $ = cheerio.load(html);
19504
+ const links = [];
19505
+ $("a[href]").each((_i, el) => {
19506
+ const href = $(el).attr("href");
19507
+ if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
19508
+ return;
19509
+ }
19510
+ try {
19511
+ const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
19512
+ if (resolved.origin !== baseOrigin) return;
19513
+ if (!["http:", "https:"].includes(resolved.protocol)) return;
19514
+ links.push(normalizeUrlPath(resolved.pathname));
19515
+ } catch {
19516
+ }
19517
+ });
19518
+ return [...new Set(links)];
19519
+ }
19520
+ async function discoverPages(server, buildConfig, pipelineMaxPages) {
19521
+ const { seedUrls, maxDepth, exclude } = buildConfig;
19522
+ const baseOrigin = new URL(server.baseUrl).origin;
19523
+ let effectiveMax = buildConfig.maxPages;
19524
+ if (typeof pipelineMaxPages === "number") {
19525
+ const floored = Math.max(0, Math.floor(pipelineMaxPages));
19526
+ effectiveMax = Math.min(effectiveMax, floored);
19527
+ }
19528
+ if (effectiveMax === 0) return [];
19529
+ const visited = /* @__PURE__ */ new Set();
19530
+ const pages = [];
19531
+ const queue = [];
19532
+ const limit = pLimit2__default.default(8);
19533
+ for (const seed of seedUrls) {
19534
+ const normalized = normalizeUrlPath(seed);
19535
+ if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
19536
+ visited.add(normalized);
19537
+ queue.push({ url: normalized, depth: 0 });
19538
+ }
19539
+ }
19540
+ while (queue.length > 0 && pages.length < effectiveMax) {
19541
+ const remaining = effectiveMax - pages.length;
19542
+ const batch = queue.splice(0, remaining);
19543
+ const results = await Promise.allSettled(
19544
+ batch.map(
19545
+ (item) => limit(async () => {
19546
+ const fullUrl = joinUrl(server.baseUrl, item.url);
19547
+ const response = await fetch(fullUrl);
19548
+ if (!response.ok) {
19549
+ logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
19550
+ return null;
19551
+ }
19552
+ const contentType = response.headers.get("content-type") ?? "";
19553
+ if (!contentType.includes("text/html")) {
19554
+ return null;
19555
+ }
19556
+ const html = await response.text();
19557
+ if (item.depth < maxDepth) {
19558
+ const links = extractLinksFromHtml(html, item.url, baseOrigin);
19559
+ for (const link of links) {
19560
+ if (!visited.has(link) && !isExcluded(link, exclude)) {
19561
+ visited.add(link);
19562
+ queue.push({ url: link, depth: item.depth + 1 });
19563
+ }
19564
+ }
19565
+ }
19566
+ return {
19567
+ url: item.url,
19568
+ html,
19569
+ sourcePath: fullUrl,
19570
+ outgoingLinks: []
19571
+ };
19572
+ })
19573
+ )
19574
+ );
19575
+ for (const result of results) {
19576
+ if (result.status === "fulfilled" && result.value) {
19577
+ pages.push(result.value);
19578
+ }
19579
+ }
19580
+ }
19581
+ if (pages.length >= effectiveMax && queue.length > 0) {
19582
+ logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
19583
+ }
19584
+ logger.event("build_discover_complete", {
19585
+ pagesFound: pages.length,
19586
+ urlsVisited: visited.size,
19587
+ urlsSkipped: queue.length
19588
+ });
19589
+ return pages;
19590
+ }
19387
19591
  async function loadBuildPages(cwd, config, maxPages) {
19388
19592
  const buildConfig = config.source.build;
19389
19593
  if (!buildConfig) {
19390
19594
  throw new Error("build source config is missing");
19391
19595
  }
19596
+ if (buildConfig.discover) {
19597
+ const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
19598
+ try {
19599
+ return await discoverPages(server2, buildConfig, maxPages);
19600
+ } finally {
19601
+ await server2.shutdown();
19602
+ }
19603
+ }
19392
19604
  const routes = await parseManifest(cwd, buildConfig.outputDir);
19393
19605
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
19394
19606
  logger.event("build_routes_discovered", {
@@ -19399,7 +19611,7 @@ async function loadBuildPages(cwd, config, maxPages) {
19399
19611
  const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
19400
19612
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
19401
19613
  try {
19402
- const concurrencyLimit = pLimit__default.default(8);
19614
+ const concurrencyLimit = pLimit2__default.default(8);
19403
19615
  const results = await Promise.allSettled(
19404
19616
  selected.map(
19405
19617
  (route) => concurrencyLimit(async () => {
@@ -19568,7 +19780,7 @@ async function loadCrawledPages(config, maxPages) {
19568
19780
  const routes = await resolveRoutes(config);
19569
19781
  const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
19570
19782
  const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
19571
- const concurrencyLimit = pLimit__default.default(8);
19783
+ const concurrencyLimit = pLimit2__default.default(8);
19572
19784
  const results = await Promise.allSettled(
19573
19785
  selected.map(
19574
19786
  (route) => concurrencyLimit(async () => {
@@ -19630,9 +19842,7 @@ function hrTimeMs(start) {
19630
19842
 
19631
19843
  // src/indexing/pipeline.ts
19632
19844
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
19633
- "text-embedding-3-small": 2e-5,
19634
- "text-embedding-3-large": 13e-5,
19635
- "text-embedding-ada-002": 1e-4
19845
+ "jina-embeddings-v3": 2e-5
19636
19846
  };
19637
19847
  var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
19638
19848
  var IndexPipeline = class _IndexPipeline {
@@ -19678,9 +19888,15 @@ var IndexPipeline = class _IndexPipeline {
19678
19888
  };
19679
19889
  const scope = resolveScope(this.config, options.scopeOverride);
19680
19890
  const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
19891
+ const sourceMode = options.sourceOverride ?? this.config.source.mode;
19892
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
19681
19893
  if (options.force) {
19894
+ this.logger.info("Force mode enabled \u2014 full rebuild");
19682
19895
  await cleanMirrorForScope(statePath, scope);
19683
19896
  }
19897
+ if (options.dryRun) {
19898
+ this.logger.info("Dry run \u2014 no writes will be performed");
19899
+ }
19684
19900
  const manifestStart = stageStart();
19685
19901
  const existingHashes = await this.vectorStore.getContentHashes(scope);
19686
19902
  const existingModelId = await this.vectorStore.getScopeModelId(scope);
@@ -19691,8 +19907,9 @@ var IndexPipeline = class _IndexPipeline {
19691
19907
  );
19692
19908
  }
19693
19909
  stageEnd("manifest", manifestStart);
19910
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
19694
19911
  const sourceStart = stageStart();
19695
- const sourceMode = options.sourceOverride ?? this.config.source.mode;
19912
+ this.logger.info(`Loading pages (source: ${sourceMode})...`);
19696
19913
  let sourcePages;
19697
19914
  if (sourceMode === "static-output") {
19698
19915
  sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
@@ -19704,10 +19921,13 @@ var IndexPipeline = class _IndexPipeline {
19704
19921
  sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
19705
19922
  }
19706
19923
  stageEnd("source", sourceStart);
19924
+ this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
19707
19925
  const routeStart = stageStart();
19708
19926
  const routePatterns = await buildRoutePatterns(this.cwd);
19709
19927
  stageEnd("route_map", routeStart);
19928
+ this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
19710
19929
  const extractStart = stageStart();
19930
+ this.logger.info("Extracting content...");
19711
19931
  const extractedPages = [];
19712
19932
  for (const sourcePage of sourcePages) {
19713
19933
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
@@ -19736,6 +19956,8 @@ var IndexPipeline = class _IndexPipeline {
19736
19956
  uniquePages.push(page);
19737
19957
  }
19738
19958
  stageEnd("extract", extractStart);
19959
+ const skippedPages = sourcePages.length - uniquePages.length;
19960
+ this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
19739
19961
  const linkStart = stageStart();
19740
19962
  const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
19741
19963
  const incomingLinkCount = /* @__PURE__ */ new Map();
@@ -19751,7 +19973,9 @@ var IndexPipeline = class _IndexPipeline {
19751
19973
  }
19752
19974
  }
19753
19975
  stageEnd("links", linkStart);
19976
+ this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
19754
19977
  const mirrorStart = stageStart();
19978
+ this.logger.info("Writing mirror pages...");
19755
19979
  const mirrorPages = [];
19756
19980
  let routeExact = 0;
19757
19981
  let routeBestEffort = 0;
@@ -19821,7 +20045,9 @@ var IndexPipeline = class _IndexPipeline {
19821
20045
  await this.vectorStore.upsertPages(pageRecords, scope);
19822
20046
  }
19823
20047
  stageEnd("mirror", mirrorStart);
20048
+ this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
19824
20049
  const chunkStart = stageStart();
20050
+ this.logger.info("Chunking pages...");
19825
20051
  let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
19826
20052
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
19827
20053
  if (typeof maxChunks === "number") {
@@ -19834,6 +20060,7 @@ var IndexPipeline = class _IndexPipeline {
19834
20060
  });
19835
20061
  }
19836
20062
  stageEnd("chunk", chunkStart);
20063
+ this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
19837
20064
  const currentChunkMap = /* @__PURE__ */ new Map();
19838
20065
  for (const chunk of chunks) {
19839
20066
  currentChunkMap.set(chunk.chunkKey, chunk);
@@ -19852,6 +20079,7 @@ var IndexPipeline = class _IndexPipeline {
19852
20079
  return existingHash !== chunk.contentHash;
19853
20080
  });
19854
20081
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
20082
+ this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
19855
20083
  const embedStart = stageStart();
19856
20084
  const chunkTokenEstimates = /* @__PURE__ */ new Map();
19857
20085
  for (const chunk of changedChunks) {
@@ -19866,9 +20094,11 @@ var IndexPipeline = class _IndexPipeline {
19866
20094
  let newEmbeddings = 0;
19867
20095
  const vectorsByChunk = /* @__PURE__ */ new Map();
19868
20096
  if (!options.dryRun && changedChunks.length > 0) {
20097
+ this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
19869
20098
  const embeddings = await this.embeddings.embedTexts(
19870
20099
  changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
19871
- this.config.embeddings.model
20100
+ this.config.embeddings.model,
20101
+ "retrieval.passage"
19872
20102
  );
19873
20103
  if (embeddings.length !== changedChunks.length) {
19874
20104
  throw new SearchSocketError(
@@ -19891,8 +20121,14 @@ var IndexPipeline = class _IndexPipeline {
19891
20121
  }
19892
20122
  }
19893
20123
  stageEnd("embedding", embedStart);
20124
+ if (changedChunks.length > 0) {
20125
+ this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
20126
+ } else {
20127
+ this.logger.info("No chunks to embed \u2014 all up to date");
20128
+ }
19894
20129
  const syncStart = stageStart();
19895
20130
  if (!options.dryRun) {
20131
+ this.logger.info("Syncing vectors...");
19896
20132
  const upserts = [];
19897
20133
  for (const chunk of changedChunks) {
19898
20134
  const vector = vectorsByChunk.get(chunk.chunkKey);
@@ -19911,12 +20147,16 @@ var IndexPipeline = class _IndexPipeline {
19911
20147
  sectionTitle: chunk.sectionTitle ?? "",
19912
20148
  headingPath: chunk.headingPath,
19913
20149
  snippet: chunk.snippet,
20150
+ chunkText: chunk.chunkText.slice(0, 4e3),
20151
+ ordinal: chunk.ordinal,
19914
20152
  contentHash: chunk.contentHash,
19915
20153
  modelId: this.config.embeddings.model,
19916
20154
  depth: chunk.depth,
19917
20155
  incomingLinks: chunk.incomingLinks,
19918
20156
  routeFile: chunk.routeFile,
19919
- tags: chunk.tags
20157
+ tags: chunk.tags,
20158
+ description: chunk.description,
20159
+ keywords: chunk.keywords
19920
20160
  }
19921
20161
  });
19922
20162
  }
@@ -19930,6 +20170,7 @@ var IndexPipeline = class _IndexPipeline {
19930
20170
  }
19931
20171
  }
19932
20172
  stageEnd("sync", syncStart);
20173
+ this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
19933
20174
  const finalizeStart = stageStart();
19934
20175
  if (!options.dryRun) {
19935
20176
  const scopeInfo = {
@@ -19949,6 +20190,7 @@ var IndexPipeline = class _IndexPipeline {
19949
20190
  });
19950
20191
  }
19951
20192
  stageEnd("finalize", finalizeStart);
20193
+ this.logger.info("Done.");
19952
20194
  return {
19953
20195
  pagesProcessed: mirrorPages.length,
19954
20196
  chunksTotal: chunks.length,
@@ -20109,7 +20351,7 @@ var SearchEngine = class _SearchEngine {
20109
20351
  const groupByPage = (input.groupBy ?? "page") === "page";
20110
20352
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
20111
20353
  const embedStart = process.hrtime.bigint();
20112
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
20354
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
20113
20355
  const queryVector = queryEmbeddings[0];
20114
20356
  if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
20115
20357
  throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
@@ -20137,13 +20379,17 @@ var SearchEngine = class _SearchEngine {
20137
20379
  usedRerank = true;
20138
20380
  }
20139
20381
  let results;
20382
+ const minScore = this.config.ranking.minScore;
20140
20383
  if (groupByPage) {
20141
- const pages = aggregateByPage(ordered, this.config);
20384
+ let pages = aggregateByPage(ordered, this.config);
20385
+ if (minScore > 0) {
20386
+ pages = pages.filter((p) => p.pageScore >= minScore);
20387
+ }
20142
20388
  const minRatio = this.config.ranking.minChunkScoreRatio;
20143
20389
  results = pages.slice(0, topK).map((page) => {
20144
20390
  const bestScore = page.bestChunk.finalScore;
20145
- const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20146
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
20391
+ const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20392
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
20147
20393
  return {
20148
20394
  url: page.url,
20149
20395
  title: page.title,
@@ -20160,6 +20406,9 @@ var SearchEngine = class _SearchEngine {
20160
20406
  };
20161
20407
  });
20162
20408
  } else {
20409
+ if (minScore > 0) {
20410
+ ordered = ordered.filter((entry) => entry.finalScore >= minScore);
20411
+ }
20163
20412
  results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
20164
20413
  url: hit.metadata.url,
20165
20414
  title: hit.metadata.title,
@@ -20231,43 +20480,67 @@ var SearchEngine = class _SearchEngine {
20231
20480
  }
20232
20481
  }
20233
20482
  async rerankHits(query, ranked, topK) {
20234
- if (this.config.rerank.provider !== "jina") {
20483
+ if (!this.config.rerank.enabled) {
20235
20484
  throw new SearchSocketError(
20236
20485
  "INVALID_REQUEST",
20237
- "rerank=true requested but rerank.provider is not configured as 'jina'.",
20486
+ "rerank=true requested but rerank.enabled is not set to true.",
20238
20487
  400
20239
20488
  );
20240
20489
  }
20241
20490
  if (!this.reranker) {
20242
20491
  throw new SearchSocketError(
20243
20492
  "CONFIG_MISSING",
20244
- `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
20493
+ `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
20245
20494
  400
20246
20495
  );
20247
20496
  }
20248
- const candidates = ranked.map(({ hit }) => ({
20249
- id: hit.id,
20250
- text: [hit.metadata.title, hit.metadata.sectionTitle, hit.metadata.snippet].filter(Boolean).join("\n")
20251
- }));
20497
+ const pageGroups = /* @__PURE__ */ new Map();
20498
+ for (const entry of ranked) {
20499
+ const url = entry.hit.metadata.url;
20500
+ const group = pageGroups.get(url);
20501
+ if (group) group.push(entry);
20502
+ else pageGroups.set(url, [entry]);
20503
+ }
20504
+ const MAX_CHUNKS_PER_PAGE = 5;
20505
+ const MIN_CHUNKS_PER_PAGE = 1;
20506
+ const MIN_CHUNK_SCORE_RATIO = 0.5;
20507
+ const pageCandidates = [];
20508
+ for (const [url, chunks] of pageGroups) {
20509
+ const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
20510
+ const bestScore = byScore[0].finalScore;
20511
+ const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
20512
+ const selected = byScore.filter(
20513
+ (c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
20514
+ ).slice(0, MAX_CHUNKS_PER_PAGE);
20515
+ selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
20516
+ const first = selected[0].hit.metadata;
20517
+ const parts = [first.title];
20518
+ if (first.description) {
20519
+ parts.push(first.description);
20520
+ }
20521
+ if (first.keywords && first.keywords.length > 0) {
20522
+ parts.push(first.keywords.join(", "));
20523
+ }
20524
+ const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
20525
+ parts.push(body);
20526
+ pageCandidates.push({ id: url, text: parts.join("\n\n") });
20527
+ }
20252
20528
  const reranked = await this.reranker.rerank(
20253
20529
  query,
20254
- candidates,
20530
+ pageCandidates,
20255
20531
  Math.max(topK, this.config.rerank.topN)
20256
20532
  );
20257
- const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));
20533
+ const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
20258
20534
  return ranked.map((entry) => {
20259
- const rerankScore = rerankScoreById.get(entry.hit.id);
20260
- const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
20261
- if (rerankScore === void 0 || !Number.isFinite(rerankScore)) {
20262
- return {
20263
- ...entry,
20264
- finalScore: safeBaseScore
20265
- };
20535
+ const pageScore = scoreByUrl.get(entry.hit.metadata.url);
20536
+ const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
20537
+ if (pageScore === void 0 || !Number.isFinite(pageScore)) {
20538
+ return { ...entry, finalScore: base };
20266
20539
  }
20267
- const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
20540
+ const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
20268
20541
  return {
20269
20542
  ...entry,
20270
- finalScore: Number.isFinite(combinedScore) ? combinedScore : safeBaseScore
20543
+ finalScore: Number.isFinite(combined) ? combined : base
20271
20544
  };
20272
20545
  }).sort((a, b) => {
20273
20546
  const delta = b.finalScore - a.finalScore;
@@ -20465,13 +20738,21 @@ function searchsocketHandle(options = {}) {
20465
20738
  let rateLimiter = null;
20466
20739
  const getConfig = async () => {
20467
20740
  if (!configPromise) {
20468
- const configP = options.config ? Promise.resolve(options.config) : loadConfig({
20469
- cwd: options.cwd,
20470
- configPath: options.configPath
20471
- });
20741
+ let configP;
20742
+ if (options.config) {
20743
+ configP = Promise.resolve(options.config);
20744
+ } else if (options.rawConfig) {
20745
+ const cwd = options.cwd ?? process.cwd();
20746
+ configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
20747
+ } else {
20748
+ configP = loadConfig({
20749
+ cwd: options.cwd,
20750
+ configPath: options.configPath
20751
+ });
20752
+ }
20472
20753
  configPromise = configP.then((config) => {
20473
20754
  apiPath = apiPath ?? config.api.path;
20474
- if (config.api.rateLimit) {
20755
+ if (config.api.rateLimit && !isServerless()) {
20475
20756
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
20476
20757
  }
20477
20758
  return config;
@@ -20481,10 +20762,9 @@ function searchsocketHandle(options = {}) {
20481
20762
  };
20482
20763
  const getEngine = async () => {
20483
20764
  if (!enginePromise) {
20484
- const config = options.config;
20765
+ const config = await getConfig();
20485
20766
  enginePromise = SearchEngine.create({
20486
20767
  cwd: options.cwd,
20487
- configPath: options.configPath,
20488
20768
  config
20489
20769
  });
20490
20770
  }
@@ -20757,8 +21037,10 @@ exports.createEmbeddingsProvider = createEmbeddingsProvider;
20757
21037
  exports.createReranker = createReranker;
20758
21038
  exports.createSearchClient = createSearchClient;
20759
21039
  exports.createVectorStore = createVectorStore;
21040
+ exports.isServerless = isServerless;
20760
21041
  exports.loadConfig = loadConfig;
20761
21042
  exports.mergeConfig = mergeConfig;
21043
+ exports.mergeConfigServerless = mergeConfigServerless;
20762
21044
  exports.resolveScope = resolveScope;
20763
21045
  exports.runMcpServer = runMcpServer;
20764
21046
  exports.searchsocketHandle = searchsocketHandle;