searchsocket 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3,8 +3,7 @@ import path from 'path';
3
3
  import { createJiti } from 'jiti';
4
4
  import { z } from 'zod';
5
5
  import { execSync, spawn } from 'child_process';
6
- import OpenAI from 'openai';
7
- import pLimit from 'p-limit';
6
+ import pLimit2 from 'p-limit';
8
7
  import { createHash } from 'crypto';
9
8
  import { load } from 'cheerio';
10
9
  import matter from 'gray-matter';
@@ -16620,7 +16619,11 @@ var searchSocketConfigSchema = z.object({
16620
16619
  outputDir: z.string().min(1).optional(),
16621
16620
  paramValues: z.record(z.string(), z.array(z.string())).optional(),
16622
16621
  exclude: z.array(z.string()).optional(),
16623
- previewTimeout: z.number().int().positive().optional()
16622
+ previewTimeout: z.number().int().positive().optional(),
16623
+ discover: z.boolean().optional(),
16624
+ seedUrls: z.array(z.string()).optional(),
16625
+ maxPages: z.number().int().positive().optional(),
16626
+ maxDepth: z.number().int().nonnegative().optional()
16624
16627
  }).optional()
16625
16628
  }).optional(),
16626
16629
  extract: z.object({
@@ -16647,8 +16650,9 @@ var searchSocketConfigSchema = z.object({
16647
16650
  pageSummaryChunk: z.boolean().optional()
16648
16651
  }).optional(),
16649
16652
  embeddings: z.object({
16650
- provider: z.literal("openai").optional(),
16653
+ provider: z.literal("jina").optional(),
16651
16654
  model: z.string().min(1).optional(),
16655
+ apiKey: z.string().min(1).optional(),
16652
16656
  apiKeyEnv: z.string().min(1).optional(),
16653
16657
  batchSize: z.number().int().positive().optional(),
16654
16658
  concurrency: z.number().int().positive().optional(),
@@ -16657,18 +16661,17 @@ var searchSocketConfigSchema = z.object({
16657
16661
  vector: z.object({
16658
16662
  dimension: z.number().int().positive().optional(),
16659
16663
  turso: z.object({
16664
+ url: z.string().url().optional(),
16665
+ authToken: z.string().min(1).optional(),
16660
16666
  urlEnv: z.string().optional(),
16661
16667
  authTokenEnv: z.string().optional(),
16662
16668
  localPath: z.string().optional()
16663
16669
  }).optional()
16664
16670
  }).optional(),
16665
16671
  rerank: z.object({
16666
- provider: z.enum(["none", "jina"]).optional(),
16672
+ enabled: z.boolean().optional(),
16667
16673
  topN: z.number().int().positive().optional(),
16668
- jina: z.object({
16669
- apiKeyEnv: z.string().optional(),
16670
- model: z.string().optional()
16671
- }).optional()
16674
+ model: z.string().optional()
16672
16675
  }).optional(),
16673
16676
  ranking: z.object({
16674
16677
  enableIncomingLinkBoost: z.boolean().optional(),
@@ -16677,6 +16680,7 @@ var searchSocketConfigSchema = z.object({
16677
16680
  aggregationCap: z.number().int().positive().optional(),
16678
16681
  aggregationDecay: z.number().min(0).max(1).optional(),
16679
16682
  minChunkScoreRatio: z.number().min(0).max(1).optional(),
16683
+ minScore: z.number().min(0).max(1).optional(),
16680
16684
  weights: z.object({
16681
16685
  incomingLinks: z.number().optional(),
16682
16686
  depth: z.number().optional(),
@@ -16757,9 +16761,9 @@ function createDefaultConfig(projectId) {
16757
16761
  pageSummaryChunk: true
16758
16762
  },
16759
16763
  embeddings: {
16760
- provider: "openai",
16761
- model: "text-embedding-3-small",
16762
- apiKeyEnv: "OPENAI_API_KEY",
16764
+ provider: "jina",
16765
+ model: "jina-embeddings-v3",
16766
+ apiKeyEnv: "JINA_API_KEY",
16763
16767
  batchSize: 64,
16764
16768
  concurrency: 4
16765
16769
  },
@@ -16771,12 +16775,9 @@ function createDefaultConfig(projectId) {
16771
16775
  }
16772
16776
  },
16773
16777
  rerank: {
16774
- provider: "none",
16778
+ enabled: false,
16775
16779
  topN: 20,
16776
- jina: {
16777
- apiKeyEnv: "JINA_API_KEY",
16778
- model: "jina-reranker-v2-base-multilingual"
16779
- }
16780
+ model: "jina-reranker-v2-base-multilingual"
16780
16781
  },
16781
16782
  ranking: {
16782
16783
  enableIncomingLinkBoost: true,
@@ -16785,6 +16786,7 @@ function createDefaultConfig(projectId) {
16785
16786
  aggregationCap: 5,
16786
16787
  aggregationDecay: 0.5,
16787
16788
  minChunkScoreRatio: 0.5,
16789
+ minScore: 0,
16788
16790
  weights: {
16789
16791
  incomingLinks: 0.05,
16790
16792
  depth: 0.03,
@@ -16911,7 +16913,11 @@ ${issues}`
16911
16913
  outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
16912
16914
  paramValues: parsed.source.build.paramValues ?? {},
16913
16915
  exclude: parsed.source.build.exclude ?? [],
16914
- previewTimeout: parsed.source.build.previewTimeout ?? 3e4
16916
+ previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
16917
+ discover: parsed.source.build.discover ?? false,
16918
+ seedUrls: parsed.source.build.seedUrls ?? ["/"],
16919
+ maxPages: parsed.source.build.maxPages ?? 200,
16920
+ maxDepth: parsed.source.build.maxDepth ?? 10
16915
16921
  } : void 0
16916
16922
  },
16917
16923
  extract: {
@@ -16940,11 +16946,7 @@ ${issues}`
16940
16946
  },
16941
16947
  rerank: {
16942
16948
  ...defaults.rerank,
16943
- ...parsed.rerank,
16944
- jina: {
16945
- ...defaults.rerank.jina,
16946
- ...parsed.rerank?.jina
16947
- }
16949
+ ...parsed.rerank
16948
16950
  },
16949
16951
  ranking: {
16950
16952
  ...defaults.ranking,
@@ -16991,7 +16993,11 @@ ${issues}`
16991
16993
  outputDir: ".svelte-kit/output",
16992
16994
  paramValues: {},
16993
16995
  exclude: [],
16994
- previewTimeout: 3e4
16996
+ previewTimeout: 3e4,
16997
+ discover: false,
16998
+ seedUrls: ["/"],
16999
+ maxPages: 200,
17000
+ maxDepth: 10
16995
17001
  };
16996
17002
  }
16997
17003
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
@@ -17005,6 +17011,21 @@ ${issues}`
17005
17011
  }
17006
17012
  return merged;
17007
17013
  }
17014
/**
 * Validate and merge a config object that is supplied directly in code
 * (serverless use), where `project.id` cannot be inferred from package.json
 * and `source.mode` cannot be auto-detected from the filesystem.
 *
 * @param {object} rawConfig - Raw config; must provide `project.id` and `source.mode`.
 * @returns The result of `mergeConfig(process.cwd(), rawConfig)`.
 * @throws {SearchSocketError} `CONFIG_MISSING` when `rawConfig` is absent or
 *   lacks `project.id` or `source.mode`.
 */
function mergeConfigServerless(rawConfig) {
  // `rawConfig?.` so a missing config object is reported as CONFIG_MISSING
  // rather than surfacing as a bare TypeError.
  if (!rawConfig?.project?.id) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`project.id` is required for serverless config (cannot infer from package.json)."
    );
  }
  if (!rawConfig.source?.mode) {
    throw new SearchSocketError(
      "CONFIG_MISSING",
      "`source.mode` is required for serverless config (cannot auto-detect from filesystem)."
    );
  }
  return mergeConfig(process.cwd(), rawConfig);
}
17008
17029
  async function loadConfig(options = {}) {
17009
17030
  const cwd = path.resolve(options.cwd ?? process.cwd());
17010
17031
  const configPath = path.resolve(cwd, options.configPath ?? "searchsocket.config.ts");
@@ -17027,6 +17048,11 @@ async function loadConfig(options = {}) {
17027
17048
  return mergeConfig(cwd, raw);
17028
17049
  }
17029
17050
 
17051
+ // src/core/serverless.ts
17052
/**
 * Report whether the process appears to be running on a serverless platform,
 * judged solely by the presence of platform marker environment variables.
 *
 * @returns {boolean} `true` when any marker variable is set to a non-empty value.
 */
function isServerless() {
  const markerVariables = [
    "VERCEL",
    "NETLIFY",
    "AWS_LAMBDA_FUNCTION_NAME",
    // Kept for backward compatibility with the previous check.
    "FUNCTIONS_WORKER",
    // NOTE(review): Azure Functions documents FUNCTIONS_WORKER_RUNTIME as the
    // worker setting; confirm this also matches the intended local-dev behavior.
    "FUNCTIONS_WORKER_RUNTIME",
    "CF_PAGES"
  ];
  return markerVariables.some((name) => Boolean(process.env[name]));
}
17055
+
17030
17056
  // src/utils/text.ts
17031
17057
  function normalizeText(input) {
17032
17058
  return input.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim();
@@ -17104,10 +17130,11 @@ function sleep(ms) {
17104
17130
  setTimeout(resolve, ms);
17105
17131
  });
17106
17132
  }
17107
- var OpenAIEmbeddingsProvider = class {
17108
- client;
17133
+ var JinaEmbeddingsProvider = class {
17134
+ apiKey;
17109
17135
  batchSize;
17110
17136
  concurrency;
17137
+ defaultTask;
17111
17138
  constructor(options) {
17112
17139
  if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
17113
17140
  throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
@@ -17115,11 +17142,10 @@ var OpenAIEmbeddingsProvider = class {
17115
17142
  if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
17116
17143
  throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
17117
17144
  }
17118
- this.client = new OpenAI({
17119
- apiKey: options.apiKey
17120
- });
17145
+ this.apiKey = options.apiKey;
17121
17146
  this.batchSize = options.batchSize;
17122
17147
  this.concurrency = options.concurrency;
17148
+ this.defaultTask = options.task ?? "retrieval.passage";
17123
17149
  }
17124
17150
  estimateTokens(text) {
17125
17151
  const normalized = text.trim();
@@ -17133,7 +17159,7 @@ var OpenAIEmbeddingsProvider = class {
17133
17159
  const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
17134
17160
  return Math.max(1, Math.max(charEstimate, lexicalEstimate));
17135
17161
  }
17136
- async embedTexts(texts, modelId) {
17162
+ async embedTexts(texts, modelId, task) {
17137
17163
  if (texts.length === 0) {
17138
17164
  return [];
17139
17165
  }
@@ -17145,37 +17171,56 @@ var OpenAIEmbeddingsProvider = class {
17145
17171
  });
17146
17172
  }
17147
17173
  const outputs = new Array(batches.length);
17148
- const limit = pLimit(this.concurrency);
17174
+ const limit = pLimit2(this.concurrency);
17149
17175
  await Promise.all(
17150
17176
  batches.map(
17151
17177
  (batch, position) => limit(async () => {
17152
- outputs[position] = await this.embedWithRetry(batch.values, modelId);
17178
+ outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
17153
17179
  })
17154
17180
  )
17155
17181
  );
17156
17182
  return outputs.flat();
17157
17183
  }
17158
- async embedWithRetry(texts, modelId) {
17184
+ async embedWithRetry(texts, modelId, task) {
17159
17185
  const maxAttempts = 5;
17160
17186
  let attempt = 0;
17161
17187
  while (attempt < maxAttempts) {
17162
17188
  attempt += 1;
17189
+ let response;
17163
17190
  try {
17164
- const response = await this.client.embeddings.create({
17165
- model: modelId,
17166
- input: texts,
17167
- encoding_format: "float"
17191
+ response = await fetch("https://api.jina.ai/v1/embeddings", {
17192
+ method: "POST",
17193
+ headers: {
17194
+ "content-type": "application/json",
17195
+ authorization: `Bearer ${this.apiKey}`
17196
+ },
17197
+ body: JSON.stringify({
17198
+ model: modelId,
17199
+ input: texts,
17200
+ task
17201
+ })
17168
17202
  });
17169
- return response.data.map((entry) => entry.embedding);
17170
17203
  } catch (error) {
17171
- const status = error.status;
17172
- const retryable = status === 429 || typeof status === "number" && status >= 500;
17173
- if (!retryable || attempt >= maxAttempts) {
17204
+ if (attempt >= maxAttempts) {
17174
17205
  throw error;
17175
17206
  }
17176
- const delay = Math.min(2 ** attempt * 300, 5e3);
17177
- await sleep(delay);
17207
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17208
+ continue;
17209
+ }
17210
+ if (!response.ok) {
17211
+ const retryable = response.status === 429 || response.status >= 500;
17212
+ if (!retryable || attempt >= maxAttempts) {
17213
+ const errorBody = await response.text();
17214
+ throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
17215
+ }
17216
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17217
+ continue;
17218
+ }
17219
+ const payload = await response.json();
17220
+ if (!payload.data || !Array.isArray(payload.data)) {
17221
+ throw new Error("Invalid Jina embeddings response format");
17178
17222
  }
17223
+ return payload.data.map((entry) => entry.embedding);
17179
17224
  }
17180
17225
  throw new Error("Unreachable retry state");
17181
17226
  }
@@ -17183,20 +17228,20 @@ var OpenAIEmbeddingsProvider = class {
17183
17228
 
17184
17229
  // src/embeddings/factory.ts
17185
17230
  function createEmbeddingsProvider(config) {
17186
- if (config.embeddings.provider !== "openai") {
17231
+ if (config.embeddings.provider !== "jina") {
17187
17232
  throw new SearchSocketError(
17188
17233
  "CONFIG_MISSING",
17189
17234
  `Unsupported embeddings provider ${config.embeddings.provider}`
17190
17235
  );
17191
17236
  }
17192
- const apiKey = process.env[config.embeddings.apiKeyEnv];
17237
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17193
17238
  if (!apiKey) {
17194
17239
  throw new SearchSocketError(
17195
17240
  "CONFIG_MISSING",
17196
- `Missing embeddings API key env var: ${config.embeddings.apiKeyEnv}`
17241
+ `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
17197
17242
  );
17198
17243
  }
17199
- return new OpenAIEmbeddingsProvider({
17244
+ return new JinaEmbeddingsProvider({
17200
17245
  apiKey,
17201
17246
  batchSize: config.embeddings.batchSize,
17202
17247
  concurrency: config.embeddings.concurrency
@@ -17286,20 +17331,17 @@ var JinaReranker = class {
17286
17331
 
17287
17332
  // src/rerank/factory.ts
17288
17333
  function createReranker(config) {
17289
- if (config.rerank.provider === "none") {
17334
+ if (!config.rerank.enabled) {
17290
17335
  return null;
17291
17336
  }
17292
- if (config.rerank.provider === "jina") {
17293
- const apiKey = process.env[config.rerank.jina.apiKeyEnv];
17294
- if (!apiKey) {
17295
- return null;
17296
- }
17297
- return new JinaReranker({
17298
- apiKey,
17299
- model: config.rerank.jina.model
17300
- });
17337
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17338
+ if (!apiKey) {
17339
+ return null;
17301
17340
  }
17302
- return null;
17341
+ return new JinaReranker({
17342
+ apiKey,
17343
+ model: config.rerank.model
17344
+ });
17303
17345
  }
17304
17346
  function ensureStateDirs(cwd, stateDir, scope) {
17305
17347
  const statePath = path.resolve(cwd, stateDir);
@@ -17352,6 +17394,16 @@ var TursoVectorStore = class {
17352
17394
  }
17353
17395
  async ensureChunks(dim) {
17354
17396
  if (this.chunksReady) return;
17397
+ const exists = await this.chunksTableExists();
17398
+ if (exists) {
17399
+ const currentDim = await this.getChunksDimension();
17400
+ if (currentDim !== null && currentDim !== dim) {
17401
+ await this.client.batch([
17402
+ "DROP INDEX IF EXISTS idx",
17403
+ "DROP TABLE IF EXISTS chunks"
17404
+ ]);
17405
+ }
17406
+ }
17355
17407
  await this.client.batch([
17356
17408
  `CREATE TABLE IF NOT EXISTS chunks (
17357
17409
  id TEXT PRIMARY KEY,
@@ -17363,12 +17415,16 @@ var TursoVectorStore = class {
17363
17415
  section_title TEXT NOT NULL DEFAULT '',
17364
17416
  heading_path TEXT NOT NULL DEFAULT '[]',
17365
17417
  snippet TEXT NOT NULL DEFAULT '',
17418
+ chunk_text TEXT NOT NULL DEFAULT '',
17419
+ ordinal INTEGER NOT NULL DEFAULT 0,
17366
17420
  content_hash TEXT NOT NULL DEFAULT '',
17367
17421
  model_id TEXT NOT NULL DEFAULT '',
17368
17422
  depth INTEGER NOT NULL DEFAULT 0,
17369
17423
  incoming_links INTEGER NOT NULL DEFAULT 0,
17370
17424
  route_file TEXT NOT NULL DEFAULT '',
17371
17425
  tags TEXT NOT NULL DEFAULT '[]',
17426
+ description TEXT NOT NULL DEFAULT '',
17427
+ keywords TEXT NOT NULL DEFAULT '[]',
17372
17428
  embedding F32_BLOB(${dim})
17373
17429
  )`,
17374
17430
  `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
@@ -17407,6 +17463,38 @@ var TursoVectorStore = class {
17407
17463
  throw error;
17408
17464
  }
17409
17465
  }
17466
+ /**
17467
+ * Read the current F32_BLOB dimension from the chunks table schema.
17468
+ * Returns null if the table doesn't exist or the dimension can't be parsed.
17469
+ */
17470
+ async getChunksDimension() {
17471
+ try {
17472
+ const rs = await this.client.execute(
17473
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
17474
+ );
17475
+ if (rs.rows.length === 0) return null;
17476
+ const sql = rs.rows[0].sql;
17477
+ const match = sql.match(/F32_BLOB\((\d+)\)/i);
17478
+ return match ? parseInt(match[1], 10) : null;
17479
+ } catch {
17480
+ return null;
17481
+ }
17482
+ }
17483
+ /**
17484
+ * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
17485
+ * Used by `clean --remote` for a full reset.
17486
+ */
17487
+ async dropAllTables() {
17488
+ await this.client.batch([
17489
+ "DROP INDEX IF EXISTS idx",
17490
+ "DROP TABLE IF EXISTS chunks",
17491
+ "DROP TABLE IF EXISTS registry",
17492
+ "DROP TABLE IF EXISTS pages"
17493
+ ]);
17494
+ this.chunksReady = false;
17495
+ this.registryReady = false;
17496
+ this.pagesReady = false;
17497
+ }
17410
17498
  async upsert(records, _scope) {
17411
17499
  if (records.length === 0) return;
17412
17500
  const dim = this.dimension ?? records[0].vector.length;
@@ -17417,9 +17505,9 @@ var TursoVectorStore = class {
17417
17505
  const stmts = batch.map((r) => ({
17418
17506
  sql: `INSERT OR REPLACE INTO chunks
17419
17507
  (id, project_id, scope_name, url, path, title, section_title,
17420
- heading_path, snippet, content_hash, model_id, depth,
17421
- incoming_links, route_file, tags, embedding)
17422
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17508
+ heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
17509
+ incoming_links, route_file, tags, description, keywords, embedding)
17510
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17423
17511
  args: [
17424
17512
  r.id,
17425
17513
  r.metadata.projectId,
@@ -17430,12 +17518,16 @@ var TursoVectorStore = class {
17430
17518
  r.metadata.sectionTitle,
17431
17519
  JSON.stringify(r.metadata.headingPath),
17432
17520
  r.metadata.snippet,
17521
+ r.metadata.chunkText,
17522
+ r.metadata.ordinal,
17433
17523
  r.metadata.contentHash,
17434
17524
  r.metadata.modelId,
17435
17525
  r.metadata.depth,
17436
17526
  r.metadata.incomingLinks,
17437
17527
  r.metadata.routeFile,
17438
17528
  JSON.stringify(r.metadata.tags),
17529
+ r.metadata.description ?? "",
17530
+ JSON.stringify(r.metadata.keywords ?? []),
17439
17531
  JSON.stringify(r.vector)
17440
17532
  ]
17441
17533
  }));
@@ -17448,8 +17540,10 @@ var TursoVectorStore = class {
17448
17540
  const queryJson = JSON.stringify(queryVector);
17449
17541
  const rs = await this.client.execute({
17450
17542
  sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
17451
- c.section_title, c.heading_path, c.snippet, c.content_hash,
17543
+ c.section_title, c.heading_path, c.snippet, c.chunk_text,
17544
+ c.ordinal, c.content_hash,
17452
17545
  c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
17546
+ c.description, c.keywords,
17453
17547
  vector_distance_cos(c.embedding, vector(?)) AS distance
17454
17548
  FROM vector_top_k('idx', vector(?), ?) AS v
17455
17549
  JOIN chunks AS c ON c.rowid = v.id`,
@@ -17480,6 +17574,12 @@ var TursoVectorStore = class {
17480
17574
  }
17481
17575
  const distance = row.distance;
17482
17576
  const score = 1 - distance;
17577
+ const description = row.description || void 0;
17578
+ const keywords = (() => {
17579
+ const raw = row.keywords || "[]";
17580
+ const parsed = JSON.parse(raw);
17581
+ return parsed.length > 0 ? parsed : void 0;
17582
+ })();
17483
17583
  hits.push({
17484
17584
  id: row.id,
17485
17585
  score,
@@ -17492,12 +17592,16 @@ var TursoVectorStore = class {
17492
17592
  sectionTitle: row.section_title,
17493
17593
  headingPath: JSON.parse(row.heading_path || "[]"),
17494
17594
  snippet: row.snippet,
17595
+ chunkText: row.chunk_text || "",
17596
+ ordinal: row.ordinal || 0,
17495
17597
  contentHash: row.content_hash,
17496
17598
  modelId: row.model_id,
17497
17599
  depth: row.depth,
17498
17600
  incomingLinks: row.incoming_links,
17499
17601
  routeFile: row.route_file,
17500
- tags
17602
+ tags,
17603
+ description,
17604
+ keywords
17501
17605
  }
17502
17606
  });
17503
17607
  }
@@ -17687,10 +17791,10 @@ var TursoVectorStore = class {
17687
17791
  // src/vector/factory.ts
17688
17792
  async function createVectorStore(config, cwd) {
17689
17793
  const turso = config.vector.turso;
17690
- const remoteUrl = process.env[turso.urlEnv];
17794
+ const remoteUrl = turso.url ?? process.env[turso.urlEnv];
17691
17795
  if (remoteUrl) {
17692
17796
  const { createClient: createClient2 } = await import('@libsql/client/http');
17693
- const authToken = process.env[turso.authTokenEnv];
17797
+ const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
17694
17798
  const client2 = createClient2({
17695
17799
  url: remoteUrl,
17696
17800
  authToken
@@ -17700,6 +17804,12 @@ async function createVectorStore(config, cwd) {
17700
17804
  dimension: config.vector.dimension
17701
17805
  });
17702
17806
  }
17807
+ if (isServerless()) {
17808
+ throw new SearchSocketError(
17809
+ "VECTOR_BACKEND_UNAVAILABLE",
17810
+ `No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
17811
+ );
17812
+ }
17703
17813
  const { createClient } = await import('@libsql/client');
17704
17814
  const localPath = path.resolve(cwd, turso.localPath);
17705
17815
  fs.mkdirSync(path.dirname(localPath), { recursive: true });
@@ -18030,7 +18140,9 @@ function chunkMirrorPage(page, config, scope) {
18030
18140
  incomingLinks: page.incomingLinks,
18031
18141
  routeFile: page.routeFile,
18032
18142
  tags: page.tags,
18033
- contentHash: ""
18143
+ contentHash: "",
18144
+ description: page.description,
18145
+ keywords: page.keywords
18034
18146
  };
18035
18147
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
18036
18148
  summaryChunk.contentHash = sha256(normalizeText(embeddingText));
@@ -18057,7 +18169,9 @@ function chunkMirrorPage(page, config, scope) {
18057
18169
  incomingLinks: page.incomingLinks,
18058
18170
  routeFile: page.routeFile,
18059
18171
  tags: page.tags,
18060
- contentHash: ""
18172
+ contentHash: "",
18173
+ description: page.description,
18174
+ keywords: page.keywords
18061
18175
  };
18062
18176
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
18063
18177
  chunk.contentHash = sha256(normalizeText(embeddingText));
@@ -19138,14 +19252,16 @@ function mapUrlToRoute(urlPath, patterns) {
19138
19252
  var Logger = class {
19139
19253
  json;
19140
19254
  verbose;
19255
+ quiet;
19141
19256
  stderrOnly;
19142
19257
  constructor(opts = {}) {
19143
19258
  this.json = opts.json ?? false;
19144
19259
  this.verbose = opts.verbose ?? false;
19260
+ this.quiet = opts.quiet ?? false;
19145
19261
  this.stderrOnly = opts.stderrOnly ?? false;
19146
19262
  }
19147
19263
  info(message) {
19148
- if (this.json) {
19264
+ if (this.quiet || this.json) {
19149
19265
  return;
19150
19266
  }
19151
19267
  this.writeOut(`${message}
@@ -19159,7 +19275,7 @@ var Logger = class {
19159
19275
  this.logJson("debug", { message });
19160
19276
  return;
19161
19277
  }
19162
- this.writeOut(`${message}
19278
+ this.writeOut(` ${message}
19163
19279
  `);
19164
19280
  }
19165
19281
  warn(message) {
@@ -19186,7 +19302,7 @@ var Logger = class {
19186
19302
  this.logJson(event, data);
19187
19303
  return;
19188
19304
  }
19189
- this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
19305
+ this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
19190
19306
  `);
19191
19307
  }
19192
19308
  writeOut(text) {
@@ -19371,11 +19487,108 @@ async function startPreviewServer(cwd, options, logger3) {
19371
19487
 
19372
19488
  // src/indexing/sources/build/index.ts
19373
19489
  var logger = new Logger();
19490
/**
 * Collect the distinct same-origin link paths referenced by a page.
 *
 * @param {string} html - Page markup to scan for `<a href>` elements.
 * @param {string} pageUrl - Path of the page; relative hrefs resolve against it.
 * @param {string} baseOrigin - Origin that links must match to be kept.
 * @returns {string[]} Normalized paths, de-duplicated in first-seen order.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin) {
  const ignoredPrefixes = ["#", "mailto:", "tel:", "javascript:"];
  const webProtocols = ["http:", "https:"];
  const uniquePaths = /* @__PURE__ */ new Set();
  const $ = load(html);
  $("a[href]").each((_index, anchor) => {
    const href = $(anchor).attr("href");
    if (!href || ignoredPrefixes.some((prefix) => href.startsWith(prefix))) {
      return;
    }
    try {
      const target = new URL(href, `${baseOrigin}${pageUrl}`);
      const sameSite = target.origin === baseOrigin && webProtocols.includes(target.protocol);
      if (sameSite) {
        uniquePaths.add(normalizeUrlPath(target.pathname));
      }
    } catch {
      // Hrefs that cannot be resolved are ignored.
    }
  });
  return [...uniquePaths];
}
19508
/**
 * Discover pages by crawling the preview server from the configured seed URLs,
 * following same-origin links through a FIFO queue.
 *
 * @param {{ baseUrl: string }} server - Running preview server.
 * @param {object} buildConfig - Reads `seedUrls`, `maxDepth`, `exclude`, `maxPages`.
 * @param {number} [pipelineMaxPages] - Optional extra cap; the lower of it and
 *   `buildConfig.maxPages` applies.
 * @returns {Promise<Array<{url: string, html: string, sourcePath: string, outgoingLinks: string[]}>>}
 *   Fetched HTML pages. Non-OK, non-HTML and failed requests are left out.
 */
async function discoverPages(server, buildConfig, pipelineMaxPages) {
  const { seedUrls, maxDepth, exclude } = buildConfig;
  const baseOrigin = new URL(server.baseUrl).origin;
  let effectiveMax = buildConfig.maxPages;
  if (typeof pipelineMaxPages === "number") {
    const floored = Math.max(0, Math.floor(pipelineMaxPages));
    effectiveMax = Math.min(effectiveMax, floored);
  }
  if (effectiveMax === 0) return [];
  const visited = /* @__PURE__ */ new Set();
  const pages = [];
  const queue = [];
  const limit = pLimit2(8);
  // Queue a URL once, unless it was already seen or matches an exclude rule.
  const enqueue = (url, depth) => {
    if (visited.has(url) || isExcluded(url, exclude)) return;
    visited.add(url);
    queue.push({ url, depth });
  };
  for (const seed of seedUrls) {
    enqueue(normalizeUrlPath(seed), 0);
  }
  while (queue.length > 0 && pages.length < effectiveMax) {
    // Never fetch more than can still be accepted. Skipped or failed URLs do
    // not count, so the loop continues while the queue has entries.
    const remaining = effectiveMax - pages.length;
    const batch = queue.splice(0, remaining);
    const results = await Promise.allSettled(
      batch.map(
        (item) => limit(async () => {
          const fullUrl = joinUrl(server.baseUrl, item.url);
          const response = await fetch(fullUrl);
          if (!response.ok) {
            logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
            return null;
          }
          const contentType = response.headers.get("content-type") ?? "";
          if (!contentType.includes("text/html")) {
            return null;
          }
          const html = await response.text();
          if (item.depth < maxDepth) {
            for (const link of extractLinksFromHtml(html, item.url, baseOrigin)) {
              enqueue(link, item.depth + 1);
            }
          }
          return {
            url: item.url,
            html,
            sourcePath: fullUrl,
            outgoingLinks: []
          };
        })
      )
    );
    results.forEach((result, index) => {
      if (result.status === "rejected") {
        // Report failed requests instead of dropping them silently.
        const reason = result.reason instanceof Error ? result.reason.message : String(result.reason);
        logger.warn(`Skipping ${batch[index].url}: ${reason}`);
        return;
      }
      if (result.value) {
        pages.push(result.value);
      }
    });
  }
  if (pages.length >= effectiveMax && queue.length > 0) {
    logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
  }
  logger.event("build_discover_complete", {
    pagesFound: pages.length,
    urlsVisited: visited.size,
    urlsSkipped: queue.length
  });
  return pages;
}
19374
19579
  async function loadBuildPages(cwd, config, maxPages) {
19375
19580
  const buildConfig = config.source.build;
19376
19581
  if (!buildConfig) {
19377
19582
  throw new Error("build source config is missing");
19378
19583
  }
19584
+ if (buildConfig.discover) {
19585
+ const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
19586
+ try {
19587
+ return await discoverPages(server2, buildConfig, maxPages);
19588
+ } finally {
19589
+ await server2.shutdown();
19590
+ }
19591
+ }
19379
19592
  const routes = await parseManifest(cwd, buildConfig.outputDir);
19380
19593
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
19381
19594
  logger.event("build_routes_discovered", {
@@ -19386,7 +19599,7 @@ async function loadBuildPages(cwd, config, maxPages) {
19386
19599
  const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
19387
19600
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
19388
19601
  try {
19389
- const concurrencyLimit = pLimit(8);
19602
+ const concurrencyLimit = pLimit2(8);
19390
19603
  const results = await Promise.allSettled(
19391
19604
  selected.map(
19392
19605
  (route) => concurrencyLimit(async () => {
@@ -19555,7 +19768,7 @@ async function loadCrawledPages(config, maxPages) {
19555
19768
  const routes = await resolveRoutes(config);
19556
19769
  const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
19557
19770
  const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
19558
- const concurrencyLimit = pLimit(8);
19771
+ const concurrencyLimit = pLimit2(8);
19559
19772
  const results = await Promise.allSettled(
19560
19773
  selected.map(
19561
19774
  (route) => concurrencyLimit(async () => {
@@ -19617,9 +19830,7 @@ function hrTimeMs(start) {
19617
19830
 
19618
19831
  // src/indexing/pipeline.ts
19619
19832
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
19620
- "text-embedding-3-small": 2e-5,
19621
- "text-embedding-3-large": 13e-5,
19622
- "text-embedding-ada-002": 1e-4
19833
+ "jina-embeddings-v3": 2e-5
19623
19834
  };
19624
19835
  var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
19625
19836
  var IndexPipeline = class _IndexPipeline {
@@ -19665,9 +19876,15 @@ var IndexPipeline = class _IndexPipeline {
19665
19876
  };
19666
19877
  const scope = resolveScope(this.config, options.scopeOverride);
19667
19878
  const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
19879
+ const sourceMode = options.sourceOverride ?? this.config.source.mode;
19880
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
19668
19881
  if (options.force) {
19882
+ this.logger.info("Force mode enabled \u2014 full rebuild");
19669
19883
  await cleanMirrorForScope(statePath, scope);
19670
19884
  }
19885
+ if (options.dryRun) {
19886
+ this.logger.info("Dry run \u2014 no writes will be performed");
19887
+ }
19671
19888
  const manifestStart = stageStart();
19672
19889
  const existingHashes = await this.vectorStore.getContentHashes(scope);
19673
19890
  const existingModelId = await this.vectorStore.getScopeModelId(scope);
@@ -19678,8 +19895,9 @@ var IndexPipeline = class _IndexPipeline {
19678
19895
  );
19679
19896
  }
19680
19897
  stageEnd("manifest", manifestStart);
19898
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
19681
19899
  const sourceStart = stageStart();
19682
- const sourceMode = options.sourceOverride ?? this.config.source.mode;
19900
+ this.logger.info(`Loading pages (source: ${sourceMode})...`);
19683
19901
  let sourcePages;
19684
19902
  if (sourceMode === "static-output") {
19685
19903
  sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
@@ -19691,10 +19909,13 @@ var IndexPipeline = class _IndexPipeline {
19691
19909
  sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
19692
19910
  }
19693
19911
  stageEnd("source", sourceStart);
19912
+ this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
19694
19913
  const routeStart = stageStart();
19695
19914
  const routePatterns = await buildRoutePatterns(this.cwd);
19696
19915
  stageEnd("route_map", routeStart);
19916
+ this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
19697
19917
  const extractStart = stageStart();
19918
+ this.logger.info("Extracting content...");
19698
19919
  const extractedPages = [];
19699
19920
  for (const sourcePage of sourcePages) {
19700
19921
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
@@ -19723,6 +19944,8 @@ var IndexPipeline = class _IndexPipeline {
19723
19944
  uniquePages.push(page);
19724
19945
  }
19725
19946
  stageEnd("extract", extractStart);
19947
+ const skippedPages = sourcePages.length - uniquePages.length;
19948
+ this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
19726
19949
  const linkStart = stageStart();
19727
19950
  const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
19728
19951
  const incomingLinkCount = /* @__PURE__ */ new Map();
@@ -19738,7 +19961,9 @@ var IndexPipeline = class _IndexPipeline {
19738
19961
  }
19739
19962
  }
19740
19963
  stageEnd("links", linkStart);
19964
+ this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
19741
19965
  const mirrorStart = stageStart();
19966
+ this.logger.info("Writing mirror pages...");
19742
19967
  const mirrorPages = [];
19743
19968
  let routeExact = 0;
19744
19969
  let routeBestEffort = 0;
@@ -19808,7 +20033,9 @@ var IndexPipeline = class _IndexPipeline {
19808
20033
  await this.vectorStore.upsertPages(pageRecords, scope);
19809
20034
  }
19810
20035
  stageEnd("mirror", mirrorStart);
20036
+ this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
19811
20037
  const chunkStart = stageStart();
20038
+ this.logger.info("Chunking pages...");
19812
20039
  let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
19813
20040
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
19814
20041
  if (typeof maxChunks === "number") {
@@ -19821,6 +20048,7 @@ var IndexPipeline = class _IndexPipeline {
19821
20048
  });
19822
20049
  }
19823
20050
  stageEnd("chunk", chunkStart);
20051
+ this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
19824
20052
  const currentChunkMap = /* @__PURE__ */ new Map();
19825
20053
  for (const chunk of chunks) {
19826
20054
  currentChunkMap.set(chunk.chunkKey, chunk);
@@ -19839,6 +20067,7 @@ var IndexPipeline = class _IndexPipeline {
19839
20067
  return existingHash !== chunk.contentHash;
19840
20068
  });
19841
20069
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
20070
+ this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
19842
20071
  const embedStart = stageStart();
19843
20072
  const chunkTokenEstimates = /* @__PURE__ */ new Map();
19844
20073
  for (const chunk of changedChunks) {
@@ -19853,9 +20082,11 @@ var IndexPipeline = class _IndexPipeline {
19853
20082
  let newEmbeddings = 0;
19854
20083
  const vectorsByChunk = /* @__PURE__ */ new Map();
19855
20084
  if (!options.dryRun && changedChunks.length > 0) {
20085
+ this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
19856
20086
  const embeddings = await this.embeddings.embedTexts(
19857
20087
  changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
19858
- this.config.embeddings.model
20088
+ this.config.embeddings.model,
20089
+ "retrieval.passage"
19859
20090
  );
19860
20091
  if (embeddings.length !== changedChunks.length) {
19861
20092
  throw new SearchSocketError(
@@ -19878,8 +20109,14 @@ var IndexPipeline = class _IndexPipeline {
19878
20109
  }
19879
20110
  }
19880
20111
  stageEnd("embedding", embedStart);
20112
+ if (changedChunks.length > 0) {
20113
+ this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
20114
+ } else {
20115
+ this.logger.info("No chunks to embed \u2014 all up to date");
20116
+ }
19881
20117
  const syncStart = stageStart();
19882
20118
  if (!options.dryRun) {
20119
+ this.logger.info("Syncing vectors...");
19883
20120
  const upserts = [];
19884
20121
  for (const chunk of changedChunks) {
19885
20122
  const vector = vectorsByChunk.get(chunk.chunkKey);
@@ -19898,12 +20135,16 @@ var IndexPipeline = class _IndexPipeline {
19898
20135
  sectionTitle: chunk.sectionTitle ?? "",
19899
20136
  headingPath: chunk.headingPath,
19900
20137
  snippet: chunk.snippet,
20138
+ chunkText: chunk.chunkText.slice(0, 4e3),
20139
+ ordinal: chunk.ordinal,
19901
20140
  contentHash: chunk.contentHash,
19902
20141
  modelId: this.config.embeddings.model,
19903
20142
  depth: chunk.depth,
19904
20143
  incomingLinks: chunk.incomingLinks,
19905
20144
  routeFile: chunk.routeFile,
19906
- tags: chunk.tags
20145
+ tags: chunk.tags,
20146
+ description: chunk.description,
20147
+ keywords: chunk.keywords
19907
20148
  }
19908
20149
  });
19909
20150
  }
@@ -19917,6 +20158,7 @@ var IndexPipeline = class _IndexPipeline {
19917
20158
  }
19918
20159
  }
19919
20160
  stageEnd("sync", syncStart);
20161
+ this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
19920
20162
  const finalizeStart = stageStart();
19921
20163
  if (!options.dryRun) {
19922
20164
  const scopeInfo = {
@@ -19936,6 +20178,7 @@ var IndexPipeline = class _IndexPipeline {
19936
20178
  });
19937
20179
  }
19938
20180
  stageEnd("finalize", finalizeStart);
20181
+ this.logger.info("Done.");
19939
20182
  return {
19940
20183
  pagesProcessed: mirrorPages.length,
19941
20184
  chunksTotal: chunks.length,
@@ -20096,7 +20339,7 @@ var SearchEngine = class _SearchEngine {
20096
20339
  const groupByPage = (input.groupBy ?? "page") === "page";
20097
20340
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
20098
20341
  const embedStart = process.hrtime.bigint();
20099
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
20342
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
20100
20343
  const queryVector = queryEmbeddings[0];
20101
20344
  if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
20102
20345
  throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
@@ -20124,13 +20367,17 @@ var SearchEngine = class _SearchEngine {
20124
20367
  usedRerank = true;
20125
20368
  }
20126
20369
  let results;
20370
+ const minScore = this.config.ranking.minScore;
20127
20371
  if (groupByPage) {
20128
- const pages = aggregateByPage(ordered, this.config);
20372
+ let pages = aggregateByPage(ordered, this.config);
20373
+ if (minScore > 0) {
20374
+ pages = pages.filter((p) => p.pageScore >= minScore);
20375
+ }
20129
20376
  const minRatio = this.config.ranking.minChunkScoreRatio;
20130
20377
  results = pages.slice(0, topK).map((page) => {
20131
20378
  const bestScore = page.bestChunk.finalScore;
20132
- const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20133
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
20379
+ const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
20380
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
20134
20381
  return {
20135
20382
  url: page.url,
20136
20383
  title: page.title,
@@ -20147,6 +20394,9 @@ var SearchEngine = class _SearchEngine {
20147
20394
  };
20148
20395
  });
20149
20396
  } else {
20397
+ if (minScore > 0) {
20398
+ ordered = ordered.filter((entry) => entry.finalScore >= minScore);
20399
+ }
20150
20400
  results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
20151
20401
  url: hit.metadata.url,
20152
20402
  title: hit.metadata.title,
@@ -20218,43 +20468,67 @@ var SearchEngine = class _SearchEngine {
20218
20468
  }
20219
20469
  }
20220
20470
  async rerankHits(query, ranked, topK) {
20221
- if (this.config.rerank.provider !== "jina") {
20471
+ if (!this.config.rerank.enabled) {
20222
20472
  throw new SearchSocketError(
20223
20473
  "INVALID_REQUEST",
20224
- "rerank=true requested but rerank.provider is not configured as 'jina'.",
20474
+ "rerank=true requested but rerank.enabled is not set to true.",
20225
20475
  400
20226
20476
  );
20227
20477
  }
20228
20478
  if (!this.reranker) {
20229
20479
  throw new SearchSocketError(
20230
20480
  "CONFIG_MISSING",
20231
- `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
20481
+ `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
20232
20482
  400
20233
20483
  );
20234
20484
  }
20235
- const candidates = ranked.map(({ hit }) => ({
20236
- id: hit.id,
20237
- text: [hit.metadata.title, hit.metadata.sectionTitle, hit.metadata.snippet].filter(Boolean).join("\n")
20238
- }));
20485
+ const pageGroups = /* @__PURE__ */ new Map();
20486
+ for (const entry of ranked) {
20487
+ const url = entry.hit.metadata.url;
20488
+ const group = pageGroups.get(url);
20489
+ if (group) group.push(entry);
20490
+ else pageGroups.set(url, [entry]);
20491
+ }
20492
+ const MAX_CHUNKS_PER_PAGE = 5;
20493
+ const MIN_CHUNKS_PER_PAGE = 1;
20494
+ const MIN_CHUNK_SCORE_RATIO = 0.5;
20495
+ const pageCandidates = [];
20496
+ for (const [url, chunks] of pageGroups) {
20497
+ const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
20498
+ const bestScore = byScore[0].finalScore;
20499
+ const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
20500
+ const selected = byScore.filter(
20501
+ (c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
20502
+ ).slice(0, MAX_CHUNKS_PER_PAGE);
20503
+ selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
20504
+ const first = selected[0].hit.metadata;
20505
+ const parts = [first.title];
20506
+ if (first.description) {
20507
+ parts.push(first.description);
20508
+ }
20509
+ if (first.keywords && first.keywords.length > 0) {
20510
+ parts.push(first.keywords.join(", "));
20511
+ }
20512
+ const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
20513
+ parts.push(body);
20514
+ pageCandidates.push({ id: url, text: parts.join("\n\n") });
20515
+ }
20239
20516
  const reranked = await this.reranker.rerank(
20240
20517
  query,
20241
- candidates,
20518
+ pageCandidates,
20242
20519
  Math.max(topK, this.config.rerank.topN)
20243
20520
  );
20244
- const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));
20521
+ const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
20245
20522
  return ranked.map((entry) => {
20246
- const rerankScore = rerankScoreById.get(entry.hit.id);
20247
- const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
20248
- if (rerankScore === void 0 || !Number.isFinite(rerankScore)) {
20249
- return {
20250
- ...entry,
20251
- finalScore: safeBaseScore
20252
- };
20523
+ const pageScore = scoreByUrl.get(entry.hit.metadata.url);
20524
+ const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
20525
+ if (pageScore === void 0 || !Number.isFinite(pageScore)) {
20526
+ return { ...entry, finalScore: base };
20253
20527
  }
20254
- const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
20528
+ const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
20255
20529
  return {
20256
20530
  ...entry,
20257
- finalScore: Number.isFinite(combinedScore) ? combinedScore : safeBaseScore
20531
+ finalScore: Number.isFinite(combined) ? combined : base
20258
20532
  };
20259
20533
  }).sort((a, b) => {
20260
20534
  const delta = b.finalScore - a.finalScore;
@@ -20452,13 +20726,21 @@ function searchsocketHandle(options = {}) {
20452
20726
  let rateLimiter = null;
20453
20727
  const getConfig = async () => {
20454
20728
  if (!configPromise) {
20455
- const configP = options.config ? Promise.resolve(options.config) : loadConfig({
20456
- cwd: options.cwd,
20457
- configPath: options.configPath
20458
- });
20729
+ let configP;
20730
+ if (options.config) {
20731
+ configP = Promise.resolve(options.config);
20732
+ } else if (options.rawConfig) {
20733
+ const cwd = options.cwd ?? process.cwd();
20734
+ configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
20735
+ } else {
20736
+ configP = loadConfig({
20737
+ cwd: options.cwd,
20738
+ configPath: options.configPath
20739
+ });
20740
+ }
20459
20741
  configPromise = configP.then((config) => {
20460
20742
  apiPath = apiPath ?? config.api.path;
20461
- if (config.api.rateLimit) {
20743
+ if (config.api.rateLimit && !isServerless()) {
20462
20744
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
20463
20745
  }
20464
20746
  return config;
@@ -20468,10 +20750,9 @@ function searchsocketHandle(options = {}) {
20468
20750
  };
20469
20751
  const getEngine = async () => {
20470
20752
  if (!enginePromise) {
20471
- const config = options.config;
20753
+ const config = await getConfig();
20472
20754
  enginePromise = SearchEngine.create({
20473
20755
  cwd: options.cwd,
20474
- configPath: options.configPath,
20475
20756
  config
20476
20757
  });
20477
20758
  }
@@ -20737,6 +21018,6 @@ function createSearchClient(options = {}) {
20737
21018
  *)
20738
21019
  */
20739
21020
 
20740
- export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, loadConfig, mergeConfig, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
21021
+ export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
20741
21022
  //# sourceMappingURL=index.js.map
20742
21023
  //# sourceMappingURL=index.js.map