searchsocket 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/sveltekit.js CHANGED
@@ -2,8 +2,7 @@ import fs from 'fs';
2
2
  import path from 'path';
3
3
  import { createJiti } from 'jiti';
4
4
  import { z } from 'zod';
5
- import OpenAI from 'openai';
6
- import pLimit from 'p-limit';
5
+ import pLimit2 from 'p-limit';
7
6
  import { execSync, spawn } from 'child_process';
8
7
  import { createHash } from 'crypto';
9
8
  import { load } from 'cheerio';
@@ -16616,7 +16615,11 @@ var searchSocketConfigSchema = z.object({
16616
16615
  outputDir: z.string().min(1).optional(),
16617
16616
  paramValues: z.record(z.string(), z.array(z.string())).optional(),
16618
16617
  exclude: z.array(z.string()).optional(),
16619
- previewTimeout: z.number().int().positive().optional()
16618
+ previewTimeout: z.number().int().positive().optional(),
16619
+ discover: z.boolean().optional(),
16620
+ seedUrls: z.array(z.string()).optional(),
16621
+ maxPages: z.number().int().positive().optional(),
16622
+ maxDepth: z.number().int().nonnegative().optional()
16620
16623
  }).optional()
16621
16624
  }).optional(),
16622
16625
  extract: z.object({
@@ -16643,8 +16646,9 @@ var searchSocketConfigSchema = z.object({
16643
16646
  pageSummaryChunk: z.boolean().optional()
16644
16647
  }).optional(),
16645
16648
  embeddings: z.object({
16646
- provider: z.literal("openai").optional(),
16649
+ provider: z.literal("jina").optional(),
16647
16650
  model: z.string().min(1).optional(),
16651
+ apiKey: z.string().min(1).optional(),
16648
16652
  apiKeyEnv: z.string().min(1).optional(),
16649
16653
  batchSize: z.number().int().positive().optional(),
16650
16654
  concurrency: z.number().int().positive().optional(),
@@ -16653,18 +16657,17 @@ var searchSocketConfigSchema = z.object({
16653
16657
  vector: z.object({
16654
16658
  dimension: z.number().int().positive().optional(),
16655
16659
  turso: z.object({
16660
+ url: z.string().url().optional(),
16661
+ authToken: z.string().min(1).optional(),
16656
16662
  urlEnv: z.string().optional(),
16657
16663
  authTokenEnv: z.string().optional(),
16658
16664
  localPath: z.string().optional()
16659
16665
  }).optional()
16660
16666
  }).optional(),
16661
16667
  rerank: z.object({
16662
- provider: z.enum(["none", "jina"]).optional(),
16668
+ enabled: z.boolean().optional(),
16663
16669
  topN: z.number().int().positive().optional(),
16664
- jina: z.object({
16665
- apiKeyEnv: z.string().optional(),
16666
- model: z.string().optional()
16667
- }).optional()
16670
+ model: z.string().optional()
16668
16671
  }).optional(),
16669
16672
  ranking: z.object({
16670
16673
  enableIncomingLinkBoost: z.boolean().optional(),
@@ -16673,6 +16676,7 @@ var searchSocketConfigSchema = z.object({
16673
16676
  aggregationCap: z.number().int().positive().optional(),
16674
16677
  aggregationDecay: z.number().min(0).max(1).optional(),
16675
16678
  minChunkScoreRatio: z.number().min(0).max(1).optional(),
16679
+ minScore: z.number().min(0).max(1).optional(),
16676
16680
  weights: z.object({
16677
16681
  incomingLinks: z.number().optional(),
16678
16682
  depth: z.number().optional(),
@@ -16753,9 +16757,9 @@ function createDefaultConfig(projectId) {
16753
16757
  pageSummaryChunk: true
16754
16758
  },
16755
16759
  embeddings: {
16756
- provider: "openai",
16757
- model: "text-embedding-3-small",
16758
- apiKeyEnv: "OPENAI_API_KEY",
16760
+ provider: "jina",
16761
+ model: "jina-embeddings-v3",
16762
+ apiKeyEnv: "JINA_API_KEY",
16759
16763
  batchSize: 64,
16760
16764
  concurrency: 4
16761
16765
  },
@@ -16767,12 +16771,9 @@ function createDefaultConfig(projectId) {
16767
16771
  }
16768
16772
  },
16769
16773
  rerank: {
16770
- provider: "none",
16774
+ enabled: false,
16771
16775
  topN: 20,
16772
- jina: {
16773
- apiKeyEnv: "JINA_API_KEY",
16774
- model: "jina-reranker-v2-base-multilingual"
16775
- }
16776
+ model: "jina-reranker-v2-base-multilingual"
16776
16777
  },
16777
16778
  ranking: {
16778
16779
  enableIncomingLinkBoost: true,
@@ -16781,6 +16782,7 @@ function createDefaultConfig(projectId) {
16781
16782
  aggregationCap: 5,
16782
16783
  aggregationDecay: 0.5,
16783
16784
  minChunkScoreRatio: 0.5,
16785
+ minScore: 0,
16784
16786
  weights: {
16785
16787
  incomingLinks: 0.05,
16786
16788
  depth: 0.03,
@@ -16907,7 +16909,11 @@ ${issues}`
16907
16909
  outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
16908
16910
  paramValues: parsed.source.build.paramValues ?? {},
16909
16911
  exclude: parsed.source.build.exclude ?? [],
16910
- previewTimeout: parsed.source.build.previewTimeout ?? 3e4
16912
+ previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
16913
+ discover: parsed.source.build.discover ?? false,
16914
+ seedUrls: parsed.source.build.seedUrls ?? ["/"],
16915
+ maxPages: parsed.source.build.maxPages ?? 200,
16916
+ maxDepth: parsed.source.build.maxDepth ?? 10
16911
16917
  } : void 0
16912
16918
  },
16913
16919
  extract: {
@@ -16936,11 +16942,7 @@ ${issues}`
16936
16942
  },
16937
16943
  rerank: {
16938
16944
  ...defaults.rerank,
16939
- ...parsed.rerank,
16940
- jina: {
16941
- ...defaults.rerank.jina,
16942
- ...parsed.rerank?.jina
16943
- }
16945
+ ...parsed.rerank
16944
16946
  },
16945
16947
  ranking: {
16946
16948
  ...defaults.ranking,
@@ -16987,7 +16989,11 @@ ${issues}`
16987
16989
  outputDir: ".svelte-kit/output",
16988
16990
  paramValues: {},
16989
16991
  exclude: [],
16990
- previewTimeout: 3e4
16992
+ previewTimeout: 3e4,
16993
+ discover: false,
16994
+ seedUrls: ["/"],
16995
+ maxPages: 200,
16996
+ maxDepth: 10
16991
16997
  };
16992
16998
  }
16993
16999
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
@@ -17022,15 +17028,21 @@ async function loadConfig(options = {}) {
17022
17028
  const raw = loaded.default ?? loaded;
17023
17029
  return mergeConfig(cwd, raw);
17024
17030
  }
17031
+
17032
+ // src/core/serverless.ts
17033
+ function isServerless() {
17034
+ return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
17035
+ }
17025
17036
  function sleep(ms) {
17026
17037
  return new Promise((resolve) => {
17027
17038
  setTimeout(resolve, ms);
17028
17039
  });
17029
17040
  }
17030
- var OpenAIEmbeddingsProvider = class {
17031
- client;
17041
+ var JinaEmbeddingsProvider = class {
17042
+ apiKey;
17032
17043
  batchSize;
17033
17044
  concurrency;
17045
+ defaultTask;
17034
17046
  constructor(options) {
17035
17047
  if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
17036
17048
  throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
@@ -17038,11 +17050,10 @@ var OpenAIEmbeddingsProvider = class {
17038
17050
  if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
17039
17051
  throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
17040
17052
  }
17041
- this.client = new OpenAI({
17042
- apiKey: options.apiKey
17043
- });
17053
+ this.apiKey = options.apiKey;
17044
17054
  this.batchSize = options.batchSize;
17045
17055
  this.concurrency = options.concurrency;
17056
+ this.defaultTask = options.task ?? "retrieval.passage";
17046
17057
  }
17047
17058
  estimateTokens(text) {
17048
17059
  const normalized = text.trim();
@@ -17056,7 +17067,7 @@ var OpenAIEmbeddingsProvider = class {
17056
17067
  const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
17057
17068
  return Math.max(1, Math.max(charEstimate, lexicalEstimate));
17058
17069
  }
17059
- async embedTexts(texts, modelId) {
17070
+ async embedTexts(texts, modelId, task) {
17060
17071
  if (texts.length === 0) {
17061
17072
  return [];
17062
17073
  }
@@ -17068,37 +17079,56 @@ var OpenAIEmbeddingsProvider = class {
17068
17079
  });
17069
17080
  }
17070
17081
  const outputs = new Array(batches.length);
17071
- const limit = pLimit(this.concurrency);
17082
+ const limit = pLimit2(this.concurrency);
17072
17083
  await Promise.all(
17073
17084
  batches.map(
17074
17085
  (batch, position) => limit(async () => {
17075
- outputs[position] = await this.embedWithRetry(batch.values, modelId);
17086
+ outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
17076
17087
  })
17077
17088
  )
17078
17089
  );
17079
17090
  return outputs.flat();
17080
17091
  }
17081
- async embedWithRetry(texts, modelId) {
17092
+ async embedWithRetry(texts, modelId, task) {
17082
17093
  const maxAttempts = 5;
17083
17094
  let attempt = 0;
17084
17095
  while (attempt < maxAttempts) {
17085
17096
  attempt += 1;
17097
+ let response;
17086
17098
  try {
17087
- const response = await this.client.embeddings.create({
17088
- model: modelId,
17089
- input: texts,
17090
- encoding_format: "float"
17099
+ response = await fetch("https://api.jina.ai/v1/embeddings", {
17100
+ method: "POST",
17101
+ headers: {
17102
+ "content-type": "application/json",
17103
+ authorization: `Bearer ${this.apiKey}`
17104
+ },
17105
+ body: JSON.stringify({
17106
+ model: modelId,
17107
+ input: texts,
17108
+ task
17109
+ })
17091
17110
  });
17092
- return response.data.map((entry) => entry.embedding);
17093
17111
  } catch (error) {
17094
- const status = error.status;
17095
- const retryable = status === 429 || typeof status === "number" && status >= 500;
17096
- if (!retryable || attempt >= maxAttempts) {
17112
+ if (attempt >= maxAttempts) {
17097
17113
  throw error;
17098
17114
  }
17099
- const delay = Math.min(2 ** attempt * 300, 5e3);
17100
- await sleep(delay);
17115
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17116
+ continue;
17117
+ }
17118
+ if (!response.ok) {
17119
+ const retryable = response.status === 429 || response.status >= 500;
17120
+ if (!retryable || attempt >= maxAttempts) {
17121
+ const errorBody = await response.text();
17122
+ throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
17123
+ }
17124
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
17125
+ continue;
17101
17126
  }
17127
+ const payload = await response.json();
17128
+ if (!payload.data || !Array.isArray(payload.data)) {
17129
+ throw new Error("Invalid Jina embeddings response format");
17130
+ }
17131
+ return payload.data.map((entry) => entry.embedding);
17102
17132
  }
17103
17133
  throw new Error("Unreachable retry state");
17104
17134
  }
@@ -17106,20 +17136,20 @@ var OpenAIEmbeddingsProvider = class {
17106
17136
 
17107
17137
  // src/embeddings/factory.ts
17108
17138
  function createEmbeddingsProvider(config) {
17109
- if (config.embeddings.provider !== "openai") {
17139
+ if (config.embeddings.provider !== "jina") {
17110
17140
  throw new SearchSocketError(
17111
17141
  "CONFIG_MISSING",
17112
17142
  `Unsupported embeddings provider ${config.embeddings.provider}`
17113
17143
  );
17114
17144
  }
17115
- const apiKey = process.env[config.embeddings.apiKeyEnv];
17145
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17116
17146
  if (!apiKey) {
17117
17147
  throw new SearchSocketError(
17118
17148
  "CONFIG_MISSING",
17119
- `Missing embeddings API key env var: ${config.embeddings.apiKeyEnv}`
17149
+ `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
17120
17150
  );
17121
17151
  }
17122
- return new OpenAIEmbeddingsProvider({
17152
+ return new JinaEmbeddingsProvider({
17123
17153
  apiKey,
17124
17154
  batchSize: config.embeddings.batchSize,
17125
17155
  concurrency: config.embeddings.concurrency
@@ -17282,20 +17312,17 @@ var JinaReranker = class {
17282
17312
 
17283
17313
  // src/rerank/factory.ts
17284
17314
  function createReranker(config) {
17285
- if (config.rerank.provider === "none") {
17315
+ if (!config.rerank.enabled) {
17286
17316
  return null;
17287
17317
  }
17288
- if (config.rerank.provider === "jina") {
17289
- const apiKey = process.env[config.rerank.jina.apiKeyEnv];
17290
- if (!apiKey) {
17291
- return null;
17292
- }
17293
- return new JinaReranker({
17294
- apiKey,
17295
- model: config.rerank.jina.model
17296
- });
17318
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
17319
+ if (!apiKey) {
17320
+ return null;
17297
17321
  }
17298
- return null;
17322
+ return new JinaReranker({
17323
+ apiKey,
17324
+ model: config.rerank.model
17325
+ });
17299
17326
  }
17300
17327
 
17301
17328
  // src/utils/time.ts
@@ -17400,6 +17427,16 @@ var TursoVectorStore = class {
17400
17427
  }
17401
17428
  async ensureChunks(dim) {
17402
17429
  if (this.chunksReady) return;
17430
+ const exists = await this.chunksTableExists();
17431
+ if (exists) {
17432
+ const currentDim = await this.getChunksDimension();
17433
+ if (currentDim !== null && currentDim !== dim) {
17434
+ await this.client.batch([
17435
+ "DROP INDEX IF EXISTS idx",
17436
+ "DROP TABLE IF EXISTS chunks"
17437
+ ]);
17438
+ }
17439
+ }
17403
17440
  await this.client.batch([
17404
17441
  `CREATE TABLE IF NOT EXISTS chunks (
17405
17442
  id TEXT PRIMARY KEY,
@@ -17411,6 +17448,8 @@ var TursoVectorStore = class {
17411
17448
  section_title TEXT NOT NULL DEFAULT '',
17412
17449
  heading_path TEXT NOT NULL DEFAULT '[]',
17413
17450
  snippet TEXT NOT NULL DEFAULT '',
17451
+ chunk_text TEXT NOT NULL DEFAULT '',
17452
+ ordinal INTEGER NOT NULL DEFAULT 0,
17414
17453
  content_hash TEXT NOT NULL DEFAULT '',
17415
17454
  model_id TEXT NOT NULL DEFAULT '',
17416
17455
  depth INTEGER NOT NULL DEFAULT 0,
@@ -17421,6 +17460,19 @@ var TursoVectorStore = class {
17421
17460
  )`,
17422
17461
  `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
17423
17462
  ]);
17463
+ const chunkMigrationCols = [
17464
+ { name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
17465
+ { name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
17466
+ ];
17467
+ for (const col of chunkMigrationCols) {
17468
+ try {
17469
+ await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
17470
+ } catch (error) {
17471
+ if (error instanceof Error && !error.message.includes("duplicate column")) {
17472
+ throw error;
17473
+ }
17474
+ }
17475
+ }
17424
17476
  this.chunksReady = true;
17425
17477
  }
17426
17478
  async ensurePages() {
@@ -17455,6 +17507,38 @@ var TursoVectorStore = class {
17455
17507
  throw error;
17456
17508
  }
17457
17509
  }
17510
+ /**
17511
+ * Read the current F32_BLOB dimension from the chunks table schema.
17512
+ * Returns null if the table doesn't exist or the dimension can't be parsed.
17513
+ */
17514
+ async getChunksDimension() {
17515
+ try {
17516
+ const rs = await this.client.execute(
17517
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
17518
+ );
17519
+ if (rs.rows.length === 0) return null;
17520
+ const sql = rs.rows[0].sql;
17521
+ const match = sql.match(/F32_BLOB\((\d+)\)/i);
17522
+ return match ? parseInt(match[1], 10) : null;
17523
+ } catch {
17524
+ return null;
17525
+ }
17526
+ }
17527
+ /**
17528
+ * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
17529
+ * Used by `clean --remote` for a full reset.
17530
+ */
17531
+ async dropAllTables() {
17532
+ await this.client.batch([
17533
+ "DROP INDEX IF EXISTS idx",
17534
+ "DROP TABLE IF EXISTS chunks",
17535
+ "DROP TABLE IF EXISTS registry",
17536
+ "DROP TABLE IF EXISTS pages"
17537
+ ]);
17538
+ this.chunksReady = false;
17539
+ this.registryReady = false;
17540
+ this.pagesReady = false;
17541
+ }
17458
17542
  async upsert(records, _scope) {
17459
17543
  if (records.length === 0) return;
17460
17544
  const dim = this.dimension ?? records[0].vector.length;
@@ -17465,9 +17549,9 @@ var TursoVectorStore = class {
17465
17549
  const stmts = batch.map((r) => ({
17466
17550
  sql: `INSERT OR REPLACE INTO chunks
17467
17551
  (id, project_id, scope_name, url, path, title, section_title,
17468
- heading_path, snippet, content_hash, model_id, depth,
17552
+ heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
17469
17553
  incoming_links, route_file, tags, embedding)
17470
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17554
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
17471
17555
  args: [
17472
17556
  r.id,
17473
17557
  r.metadata.projectId,
@@ -17478,6 +17562,8 @@ var TursoVectorStore = class {
17478
17562
  r.metadata.sectionTitle,
17479
17563
  JSON.stringify(r.metadata.headingPath),
17480
17564
  r.metadata.snippet,
17565
+ r.metadata.chunkText,
17566
+ r.metadata.ordinal,
17481
17567
  r.metadata.contentHash,
17482
17568
  r.metadata.modelId,
17483
17569
  r.metadata.depth,
@@ -17496,7 +17582,8 @@ var TursoVectorStore = class {
17496
17582
  const queryJson = JSON.stringify(queryVector);
17497
17583
  const rs = await this.client.execute({
17498
17584
  sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
17499
- c.section_title, c.heading_path, c.snippet, c.content_hash,
17585
+ c.section_title, c.heading_path, c.snippet, c.chunk_text,
17586
+ c.ordinal, c.content_hash,
17500
17587
  c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
17501
17588
  vector_distance_cos(c.embedding, vector(?)) AS distance
17502
17589
  FROM vector_top_k('idx', vector(?), ?) AS v
@@ -17540,6 +17627,8 @@ var TursoVectorStore = class {
17540
17627
  sectionTitle: row.section_title,
17541
17628
  headingPath: JSON.parse(row.heading_path || "[]"),
17542
17629
  snippet: row.snippet,
17630
+ chunkText: row.chunk_text || "",
17631
+ ordinal: row.ordinal || 0,
17543
17632
  contentHash: row.content_hash,
17544
17633
  modelId: row.model_id,
17545
17634
  depth: row.depth,
@@ -17735,10 +17824,10 @@ var TursoVectorStore = class {
17735
17824
  // src/vector/factory.ts
17736
17825
  async function createVectorStore(config, cwd) {
17737
17826
  const turso = config.vector.turso;
17738
- const remoteUrl = process.env[turso.urlEnv];
17827
+ const remoteUrl = turso.url ?? process.env[turso.urlEnv];
17739
17828
  if (remoteUrl) {
17740
17829
  const { createClient: createClient2 } = await import('@libsql/client/http');
17741
- const authToken = process.env[turso.authTokenEnv];
17830
+ const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
17742
17831
  const client2 = createClient2({
17743
17832
  url: remoteUrl,
17744
17833
  authToken
@@ -17748,6 +17837,12 @@ async function createVectorStore(config, cwd) {
17748
17837
  dimension: config.vector.dimension
17749
17838
  });
17750
17839
  }
17840
+ if (isServerless()) {
17841
+ throw new SearchSocketError(
17842
+ "VECTOR_BACKEND_UNAVAILABLE",
17843
+ `No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
17844
+ );
17845
+ }
17751
17846
  const { createClient } = await import('@libsql/client');
17752
17847
  const localPath = path.resolve(cwd, turso.localPath);
17753
17848
  fs.mkdirSync(path.dirname(localPath), { recursive: true });
@@ -17905,7 +18000,7 @@ var SearchEngine = class _SearchEngine {
17905
18000
  const groupByPage = (input.groupBy ?? "page") === "page";
17906
18001
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
17907
18002
  const embedStart = process.hrtime.bigint();
17908
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
18003
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
17909
18004
  const queryVector = queryEmbeddings[0];
17910
18005
  if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
17911
18006
  throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
@@ -17933,13 +18028,17 @@ var SearchEngine = class _SearchEngine {
17933
18028
  usedRerank = true;
17934
18029
  }
17935
18030
  let results;
18031
+ const minScore = this.config.ranking.minScore;
17936
18032
  if (groupByPage) {
17937
- const pages = aggregateByPage(ordered, this.config);
18033
+ let pages = aggregateByPage(ordered, this.config);
18034
+ if (minScore > 0) {
18035
+ pages = pages.filter((p) => p.pageScore >= minScore);
18036
+ }
17938
18037
  const minRatio = this.config.ranking.minChunkScoreRatio;
17939
18038
  results = pages.slice(0, topK).map((page) => {
17940
18039
  const bestScore = page.bestChunk.finalScore;
17941
- const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
17942
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
18040
+ const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18041
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
17943
18042
  return {
17944
18043
  url: page.url,
17945
18044
  title: page.title,
@@ -17956,6 +18055,9 @@ var SearchEngine = class _SearchEngine {
17956
18055
  };
17957
18056
  });
17958
18057
  } else {
18058
+ if (minScore > 0) {
18059
+ ordered = ordered.filter((entry) => entry.finalScore >= minScore);
18060
+ }
17959
18061
  results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
17960
18062
  url: hit.metadata.url,
17961
18063
  title: hit.metadata.title,
@@ -18027,43 +18129,54 @@ var SearchEngine = class _SearchEngine {
18027
18129
  }
18028
18130
  }
18029
18131
  async rerankHits(query, ranked, topK) {
18030
- if (this.config.rerank.provider !== "jina") {
18132
+ if (!this.config.rerank.enabled) {
18031
18133
  throw new SearchSocketError(
18032
18134
  "INVALID_REQUEST",
18033
- "rerank=true requested but rerank.provider is not configured as 'jina'.",
18135
+ "rerank=true requested but rerank.enabled is not set to true.",
18034
18136
  400
18035
18137
  );
18036
18138
  }
18037
18139
  if (!this.reranker) {
18038
18140
  throw new SearchSocketError(
18039
18141
  "CONFIG_MISSING",
18040
- `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
18142
+ `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
18041
18143
  400
18042
18144
  );
18043
18145
  }
18044
- const candidates = ranked.map(({ hit }) => ({
18045
- id: hit.id,
18046
- text: [hit.metadata.title, hit.metadata.sectionTitle, hit.metadata.snippet].filter(Boolean).join("\n")
18047
- }));
18146
+ const pageGroups = /* @__PURE__ */ new Map();
18147
+ for (const entry of ranked) {
18148
+ const url = entry.hit.metadata.url;
18149
+ const group = pageGroups.get(url);
18150
+ if (group) group.push(entry);
18151
+ else pageGroups.set(url, [entry]);
18152
+ }
18153
+ const pageCandidates = [];
18154
+ for (const [url, chunks] of pageGroups) {
18155
+ const sorted = [...chunks].sort(
18156
+ (a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
18157
+ );
18158
+ const title = sorted[0].hit.metadata.title;
18159
+ const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
18160
+ pageCandidates.push({ id: url, text: `${title}
18161
+
18162
+ ${body}` });
18163
+ }
18048
18164
  const reranked = await this.reranker.rerank(
18049
18165
  query,
18050
- candidates,
18166
+ pageCandidates,
18051
18167
  Math.max(topK, this.config.rerank.topN)
18052
18168
  );
18053
- const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));
18169
+ const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
18054
18170
  return ranked.map((entry) => {
18055
- const rerankScore = rerankScoreById.get(entry.hit.id);
18056
- const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
18057
- if (rerankScore === void 0 || !Number.isFinite(rerankScore)) {
18058
- return {
18059
- ...entry,
18060
- finalScore: safeBaseScore
18061
- };
18171
+ const pageScore = scoreByUrl.get(entry.hit.metadata.url);
18172
+ const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
18173
+ if (pageScore === void 0 || !Number.isFinite(pageScore)) {
18174
+ return { ...entry, finalScore: base };
18062
18175
  }
18063
- const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
18176
+ const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
18064
18177
  return {
18065
18178
  ...entry,
18066
- finalScore: Number.isFinite(combinedScore) ? combinedScore : safeBaseScore
18179
+ finalScore: Number.isFinite(combined) ? combined : base
18067
18180
  };
18068
18181
  }).sort((a, b) => {
18069
18182
  const delta = b.finalScore - a.finalScore;
@@ -18103,13 +18216,21 @@ function searchsocketHandle(options = {}) {
18103
18216
  let rateLimiter = null;
18104
18217
  const getConfig = async () => {
18105
18218
  if (!configPromise) {
18106
- const configP = options.config ? Promise.resolve(options.config) : loadConfig({
18107
- cwd: options.cwd,
18108
- configPath: options.configPath
18109
- });
18219
+ let configP;
18220
+ if (options.config) {
18221
+ configP = Promise.resolve(options.config);
18222
+ } else if (options.rawConfig) {
18223
+ const cwd = options.cwd ?? process.cwd();
18224
+ configP = Promise.resolve(mergeConfig(cwd, options.rawConfig));
18225
+ } else {
18226
+ configP = loadConfig({
18227
+ cwd: options.cwd,
18228
+ configPath: options.configPath
18229
+ });
18230
+ }
18110
18231
  configPromise = configP.then((config) => {
18111
18232
  apiPath = apiPath ?? config.api.path;
18112
- if (config.api.rateLimit) {
18233
+ if (config.api.rateLimit && !isServerless()) {
18113
18234
  rateLimiter = new InMemoryRateLimiter(config.api.rateLimit.windowMs, config.api.rateLimit.max);
18114
18235
  }
18115
18236
  return config;
@@ -18119,10 +18240,9 @@ function searchsocketHandle(options = {}) {
18119
18240
  };
18120
18241
  const getEngine = async () => {
18121
18242
  if (!enginePromise) {
18122
- const config = options.config;
18243
+ const config = await getConfig();
18123
18244
  enginePromise = SearchEngine.create({
18124
18245
  cwd: options.cwd,
18125
- configPath: options.configPath,
18126
18246
  config
18127
18247
  });
18128
18248
  }
@@ -19657,14 +19777,16 @@ function mapUrlToRoute(urlPath, patterns) {
19657
19777
  var Logger = class {
19658
19778
  json;
19659
19779
  verbose;
19780
+ quiet;
19660
19781
  stderrOnly;
19661
19782
  constructor(opts = {}) {
19662
19783
  this.json = opts.json ?? false;
19663
19784
  this.verbose = opts.verbose ?? false;
19785
+ this.quiet = opts.quiet ?? false;
19664
19786
  this.stderrOnly = opts.stderrOnly ?? false;
19665
19787
  }
19666
19788
  info(message) {
19667
- if (this.json) {
19789
+ if (this.quiet || this.json) {
19668
19790
  return;
19669
19791
  }
19670
19792
  this.writeOut(`${message}
@@ -19678,7 +19800,7 @@ var Logger = class {
19678
19800
  this.logJson("debug", { message });
19679
19801
  return;
19680
19802
  }
19681
- this.writeOut(`${message}
19803
+ this.writeOut(` ${message}
19682
19804
  `);
19683
19805
  }
19684
19806
  warn(message) {
@@ -19705,7 +19827,7 @@ var Logger = class {
19705
19827
  this.logJson(event, data);
19706
19828
  return;
19707
19829
  }
19708
- this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
19830
+ this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
19709
19831
  `);
19710
19832
  }
19711
19833
  writeOut(text) {
@@ -19890,11 +20012,108 @@ async function startPreviewServer(cwd, options, logger3) {
19890
20012
 
19891
20013
  // src/indexing/sources/build/index.ts
19892
20014
  var logger = new Logger();
20015
+ function extractLinksFromHtml(html, pageUrl, baseOrigin) {
20016
+ const $ = load(html);
20017
+ const links = [];
20018
+ $("a[href]").each((_i, el) => {
20019
+ const href = $(el).attr("href");
20020
+ if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
20021
+ return;
20022
+ }
20023
+ try {
20024
+ const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
20025
+ if (resolved.origin !== baseOrigin) return;
20026
+ if (!["http:", "https:"].includes(resolved.protocol)) return;
20027
+ links.push(normalizeUrlPath(resolved.pathname));
20028
+ } catch {
20029
+ }
20030
+ });
20031
+ return [...new Set(links)];
20032
+ }
20033
+ async function discoverPages(server, buildConfig, pipelineMaxPages) {
20034
+ const { seedUrls, maxDepth, exclude } = buildConfig;
20035
+ const baseOrigin = new URL(server.baseUrl).origin;
20036
+ let effectiveMax = buildConfig.maxPages;
20037
+ if (typeof pipelineMaxPages === "number") {
20038
+ const floored = Math.max(0, Math.floor(pipelineMaxPages));
20039
+ effectiveMax = Math.min(effectiveMax, floored);
20040
+ }
20041
+ if (effectiveMax === 0) return [];
20042
+ const visited = /* @__PURE__ */ new Set();
20043
+ const pages = [];
20044
+ const queue = [];
20045
+ const limit = pLimit2(8);
20046
+ for (const seed of seedUrls) {
20047
+ const normalized = normalizeUrlPath(seed);
20048
+ if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
20049
+ visited.add(normalized);
20050
+ queue.push({ url: normalized, depth: 0 });
20051
+ }
20052
+ }
20053
+ while (queue.length > 0 && pages.length < effectiveMax) {
20054
+ const remaining = effectiveMax - pages.length;
20055
+ const batch = queue.splice(0, remaining);
20056
+ const results = await Promise.allSettled(
20057
+ batch.map(
20058
+ (item) => limit(async () => {
20059
+ const fullUrl = joinUrl(server.baseUrl, item.url);
20060
+ const response = await fetch(fullUrl);
20061
+ if (!response.ok) {
20062
+ logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
20063
+ return null;
20064
+ }
20065
+ const contentType = response.headers.get("content-type") ?? "";
20066
+ if (!contentType.includes("text/html")) {
20067
+ return null;
20068
+ }
20069
+ const html = await response.text();
20070
+ if (item.depth < maxDepth) {
20071
+ const links = extractLinksFromHtml(html, item.url, baseOrigin);
20072
+ for (const link of links) {
20073
+ if (!visited.has(link) && !isExcluded(link, exclude)) {
20074
+ visited.add(link);
20075
+ queue.push({ url: link, depth: item.depth + 1 });
20076
+ }
20077
+ }
20078
+ }
20079
+ return {
20080
+ url: item.url,
20081
+ html,
20082
+ sourcePath: fullUrl,
20083
+ outgoingLinks: []
20084
+ };
20085
+ })
20086
+ )
20087
+ );
20088
+ for (const result of results) {
20089
+ if (result.status === "fulfilled" && result.value) {
20090
+ pages.push(result.value);
20091
+ }
20092
+ }
20093
+ }
20094
+ if (pages.length >= effectiveMax && queue.length > 0) {
20095
+ logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
20096
+ }
20097
+ logger.event("build_discover_complete", {
20098
+ pagesFound: pages.length,
20099
+ urlsVisited: visited.size,
20100
+ urlsSkipped: queue.length
20101
+ });
20102
+ return pages;
20103
+ }
19893
20104
  async function loadBuildPages(cwd, config, maxPages) {
19894
20105
  const buildConfig = config.source.build;
19895
20106
  if (!buildConfig) {
19896
20107
  throw new Error("build source config is missing");
19897
20108
  }
20109
+ if (buildConfig.discover) {
20110
+ const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
20111
+ try {
20112
+ return await discoverPages(server2, buildConfig, maxPages);
20113
+ } finally {
20114
+ await server2.shutdown();
20115
+ }
20116
+ }
19898
20117
  const routes = await parseManifest(cwd, buildConfig.outputDir);
19899
20118
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
19900
20119
  logger.event("build_routes_discovered", {
@@ -19905,7 +20124,7 @@ async function loadBuildPages(cwd, config, maxPages) {
19905
20124
  const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
19906
20125
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
19907
20126
  try {
19908
- const concurrencyLimit = pLimit(8);
20127
+ const concurrencyLimit = pLimit2(8);
19909
20128
  const results = await Promise.allSettled(
19910
20129
  selected.map(
19911
20130
  (route) => concurrencyLimit(async () => {
@@ -20074,7 +20293,7 @@ async function loadCrawledPages(config, maxPages) {
20074
20293
  const routes = await resolveRoutes(config);
20075
20294
  const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
20076
20295
  const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
20077
- const concurrencyLimit = pLimit(8);
20296
+ const concurrencyLimit = pLimit2(8);
20078
20297
  const results = await Promise.allSettled(
20079
20298
  selected.map(
20080
20299
  (route) => concurrencyLimit(async () => {
@@ -20128,9 +20347,7 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
20128
20347
 
20129
20348
  // src/indexing/pipeline.ts
20130
20349
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
20131
- "text-embedding-3-small": 2e-5,
20132
- "text-embedding-3-large": 13e-5,
20133
- "text-embedding-ada-002": 1e-4
20350
+ "jina-embeddings-v3": 2e-5
20134
20351
  };
20135
20352
  var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20136
20353
  var IndexPipeline = class _IndexPipeline {
@@ -20176,9 +20393,15 @@ var IndexPipeline = class _IndexPipeline {
20176
20393
  };
20177
20394
  const scope = resolveScope(this.config, options.scopeOverride);
20178
20395
  const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
20396
+ const sourceMode = options.sourceOverride ?? this.config.source.mode;
20397
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
20179
20398
  if (options.force) {
20399
+ this.logger.info("Force mode enabled \u2014 full rebuild");
20180
20400
  await cleanMirrorForScope(statePath, scope);
20181
20401
  }
20402
+ if (options.dryRun) {
20403
+ this.logger.info("Dry run \u2014 no writes will be performed");
20404
+ }
20182
20405
  const manifestStart = stageStart();
20183
20406
  const existingHashes = await this.vectorStore.getContentHashes(scope);
20184
20407
  const existingModelId = await this.vectorStore.getScopeModelId(scope);
@@ -20189,8 +20412,9 @@ var IndexPipeline = class _IndexPipeline {
20189
20412
  );
20190
20413
  }
20191
20414
  stageEnd("manifest", manifestStart);
20415
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
20192
20416
  const sourceStart = stageStart();
20193
- const sourceMode = options.sourceOverride ?? this.config.source.mode;
20417
+ this.logger.info(`Loading pages (source: ${sourceMode})...`);
20194
20418
  let sourcePages;
20195
20419
  if (sourceMode === "static-output") {
20196
20420
  sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
@@ -20202,10 +20426,13 @@ var IndexPipeline = class _IndexPipeline {
20202
20426
  sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
20203
20427
  }
20204
20428
  stageEnd("source", sourceStart);
20429
+ this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20205
20430
  const routeStart = stageStart();
20206
20431
  const routePatterns = await buildRoutePatterns(this.cwd);
20207
20432
  stageEnd("route_map", routeStart);
20433
+ this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
20208
20434
  const extractStart = stageStart();
20435
+ this.logger.info("Extracting content...");
20209
20436
  const extractedPages = [];
20210
20437
  for (const sourcePage of sourcePages) {
20211
20438
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
@@ -20234,6 +20461,8 @@ var IndexPipeline = class _IndexPipeline {
20234
20461
  uniquePages.push(page);
20235
20462
  }
20236
20463
  stageEnd("extract", extractStart);
20464
+ const skippedPages = sourcePages.length - uniquePages.length;
20465
+ this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20237
20466
  const linkStart = stageStart();
20238
20467
  const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20239
20468
  const incomingLinkCount = /* @__PURE__ */ new Map();
@@ -20249,7 +20478,9 @@ var IndexPipeline = class _IndexPipeline {
20249
20478
  }
20250
20479
  }
20251
20480
  stageEnd("links", linkStart);
20481
+ this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
20252
20482
  const mirrorStart = stageStart();
20483
+ this.logger.info("Writing mirror pages...");
20253
20484
  const mirrorPages = [];
20254
20485
  let routeExact = 0;
20255
20486
  let routeBestEffort = 0;
@@ -20319,7 +20550,9 @@ var IndexPipeline = class _IndexPipeline {
20319
20550
  await this.vectorStore.upsertPages(pageRecords, scope);
20320
20551
  }
20321
20552
  stageEnd("mirror", mirrorStart);
20553
+ this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
20322
20554
  const chunkStart = stageStart();
20555
+ this.logger.info("Chunking pages...");
20323
20556
  let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
20324
20557
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
20325
20558
  if (typeof maxChunks === "number") {
@@ -20332,6 +20565,7 @@ var IndexPipeline = class _IndexPipeline {
20332
20565
  });
20333
20566
  }
20334
20567
  stageEnd("chunk", chunkStart);
20568
+ this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
20335
20569
  const currentChunkMap = /* @__PURE__ */ new Map();
20336
20570
  for (const chunk of chunks) {
20337
20571
  currentChunkMap.set(chunk.chunkKey, chunk);
@@ -20350,6 +20584,7 @@ var IndexPipeline = class _IndexPipeline {
20350
20584
  return existingHash !== chunk.contentHash;
20351
20585
  });
20352
20586
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
20587
+ this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
20353
20588
  const embedStart = stageStart();
20354
20589
  const chunkTokenEstimates = /* @__PURE__ */ new Map();
20355
20590
  for (const chunk of changedChunks) {
@@ -20364,9 +20599,11 @@ var IndexPipeline = class _IndexPipeline {
20364
20599
  let newEmbeddings = 0;
20365
20600
  const vectorsByChunk = /* @__PURE__ */ new Map();
20366
20601
  if (!options.dryRun && changedChunks.length > 0) {
20602
+ this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
20367
20603
  const embeddings = await this.embeddings.embedTexts(
20368
20604
  changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
20369
- this.config.embeddings.model
20605
+ this.config.embeddings.model,
20606
+ "retrieval.passage"
20370
20607
  );
20371
20608
  if (embeddings.length !== changedChunks.length) {
20372
20609
  throw new SearchSocketError(
@@ -20389,8 +20626,14 @@ var IndexPipeline = class _IndexPipeline {
20389
20626
  }
20390
20627
  }
20391
20628
  stageEnd("embedding", embedStart);
20629
+ if (changedChunks.length > 0) {
20630
+ this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
20631
+ } else {
20632
+ this.logger.info("No chunks to embed \u2014 all up to date");
20633
+ }
20392
20634
  const syncStart = stageStart();
20393
20635
  if (!options.dryRun) {
20636
+ this.logger.info("Syncing vectors...");
20394
20637
  const upserts = [];
20395
20638
  for (const chunk of changedChunks) {
20396
20639
  const vector = vectorsByChunk.get(chunk.chunkKey);
@@ -20409,6 +20652,8 @@ var IndexPipeline = class _IndexPipeline {
20409
20652
  sectionTitle: chunk.sectionTitle ?? "",
20410
20653
  headingPath: chunk.headingPath,
20411
20654
  snippet: chunk.snippet,
20655
+ chunkText: chunk.chunkText.slice(0, 4e3),
20656
+ ordinal: chunk.ordinal,
20412
20657
  contentHash: chunk.contentHash,
20413
20658
  modelId: this.config.embeddings.model,
20414
20659
  depth: chunk.depth,
@@ -20428,6 +20673,7 @@ var IndexPipeline = class _IndexPipeline {
20428
20673
  }
20429
20674
  }
20430
20675
  stageEnd("sync", syncStart);
20676
+ this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
20431
20677
  const finalizeStart = stageStart();
20432
20678
  if (!options.dryRun) {
20433
20679
  const scopeInfo = {
@@ -20447,6 +20693,7 @@ var IndexPipeline = class _IndexPipeline {
20447
20693
  });
20448
20694
  }
20449
20695
  stageEnd("finalize", finalizeStart);
20696
+ this.logger.info("Done.");
20450
20697
  return {
20451
20698
  pagesProcessed: mirrorPages.length,
20452
20699
  chunksTotal: chunks.length,