searchsocket 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -12,7 +12,7 @@ import { Command } from "commander";
12
12
  // package.json
13
13
  var package_default = {
14
14
  name: "searchsocket",
15
- version: "0.2.1",
15
+ version: "0.3.1",
16
16
  description: "Semantic site search and MCP retrieval for SvelteKit static sites",
17
17
  license: "MIT",
18
18
  author: "Greg Priday <greg@siteorigin.com>",
@@ -82,7 +82,6 @@ var package_default = {
82
82
  "fast-glob": "^3.3.3",
83
83
  "gray-matter": "^4.0.3",
84
84
  jiti: "^2.6.1",
85
- openai: "^6.19.0",
86
85
  "p-limit": "^7.3.0",
87
86
  turndown: "^7.2.2",
88
87
  "turndown-plugin-gfm": "^1.0.2",
@@ -133,7 +132,11 @@ var searchSocketConfigSchema = z.object({
133
132
  outputDir: z.string().min(1).optional(),
134
133
  paramValues: z.record(z.string(), z.array(z.string())).optional(),
135
134
  exclude: z.array(z.string()).optional(),
136
- previewTimeout: z.number().int().positive().optional()
135
+ previewTimeout: z.number().int().positive().optional(),
136
+ discover: z.boolean().optional(),
137
+ seedUrls: z.array(z.string()).optional(),
138
+ maxPages: z.number().int().positive().optional(),
139
+ maxDepth: z.number().int().nonnegative().optional()
137
140
  }).optional()
138
141
  }).optional(),
139
142
  extract: z.object({
@@ -160,8 +163,9 @@ var searchSocketConfigSchema = z.object({
160
163
  pageSummaryChunk: z.boolean().optional()
161
164
  }).optional(),
162
165
  embeddings: z.object({
163
- provider: z.literal("openai").optional(),
166
+ provider: z.literal("jina").optional(),
164
167
  model: z.string().min(1).optional(),
168
+ apiKey: z.string().min(1).optional(),
165
169
  apiKeyEnv: z.string().min(1).optional(),
166
170
  batchSize: z.number().int().positive().optional(),
167
171
  concurrency: z.number().int().positive().optional(),
@@ -170,18 +174,17 @@ var searchSocketConfigSchema = z.object({
170
174
  vector: z.object({
171
175
  dimension: z.number().int().positive().optional(),
172
176
  turso: z.object({
177
+ url: z.string().url().optional(),
178
+ authToken: z.string().min(1).optional(),
173
179
  urlEnv: z.string().optional(),
174
180
  authTokenEnv: z.string().optional(),
175
181
  localPath: z.string().optional()
176
182
  }).optional()
177
183
  }).optional(),
178
184
  rerank: z.object({
179
- provider: z.enum(["none", "jina"]).optional(),
185
+ enabled: z.boolean().optional(),
180
186
  topN: z.number().int().positive().optional(),
181
- jina: z.object({
182
- apiKeyEnv: z.string().optional(),
183
- model: z.string().optional()
184
- }).optional()
187
+ model: z.string().optional()
185
188
  }).optional(),
186
189
  ranking: z.object({
187
190
  enableIncomingLinkBoost: z.boolean().optional(),
@@ -190,6 +193,7 @@ var searchSocketConfigSchema = z.object({
190
193
  aggregationCap: z.number().int().positive().optional(),
191
194
  aggregationDecay: z.number().min(0).max(1).optional(),
192
195
  minChunkScoreRatio: z.number().min(0).max(1).optional(),
196
+ minScore: z.number().min(0).max(1).optional(),
193
197
  weights: z.object({
194
198
  incomingLinks: z.number().optional(),
195
199
  depth: z.number().optional(),
@@ -270,9 +274,9 @@ function createDefaultConfig(projectId) {
270
274
  pageSummaryChunk: true
271
275
  },
272
276
  embeddings: {
273
- provider: "openai",
274
- model: "text-embedding-3-small",
275
- apiKeyEnv: "OPENAI_API_KEY",
277
+ provider: "jina",
278
+ model: "jina-embeddings-v3",
279
+ apiKeyEnv: "JINA_API_KEY",
276
280
  batchSize: 64,
277
281
  concurrency: 4
278
282
  },
@@ -284,12 +288,9 @@ function createDefaultConfig(projectId) {
284
288
  }
285
289
  },
286
290
  rerank: {
287
- provider: "none",
291
+ enabled: false,
288
292
  topN: 20,
289
- jina: {
290
- apiKeyEnv: "JINA_API_KEY",
291
- model: "jina-reranker-v2-base-multilingual"
292
- }
293
+ model: "jina-reranker-v2-base-multilingual"
293
294
  },
294
295
  ranking: {
295
296
  enableIncomingLinkBoost: true,
@@ -298,6 +299,7 @@ function createDefaultConfig(projectId) {
298
299
  aggregationCap: 5,
299
300
  aggregationDecay: 0.5,
300
301
  minChunkScoreRatio: 0.5,
302
+ minScore: 0,
301
303
  weights: {
302
304
  incomingLinks: 0.05,
303
305
  depth: 0.03,
@@ -408,7 +410,11 @@ ${issues}`
408
410
  outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
409
411
  paramValues: parsed.source.build.paramValues ?? {},
410
412
  exclude: parsed.source.build.exclude ?? [],
411
- previewTimeout: parsed.source.build.previewTimeout ?? 3e4
413
+ previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
414
+ discover: parsed.source.build.discover ?? false,
415
+ seedUrls: parsed.source.build.seedUrls ?? ["/"],
416
+ maxPages: parsed.source.build.maxPages ?? 200,
417
+ maxDepth: parsed.source.build.maxDepth ?? 10
412
418
  } : void 0
413
419
  },
414
420
  extract: {
@@ -437,11 +443,7 @@ ${issues}`
437
443
  },
438
444
  rerank: {
439
445
  ...defaults.rerank,
440
- ...parsed.rerank,
441
- jina: {
442
- ...defaults.rerank.jina,
443
- ...parsed.rerank?.jina
444
- }
446
+ ...parsed.rerank
445
447
  },
446
448
  ranking: {
447
449
  ...defaults.ranking,
@@ -488,7 +490,11 @@ ${issues}`
488
490
  outputDir: ".svelte-kit/output",
489
491
  paramValues: {},
490
492
  exclude: [],
491
- previewTimeout: 3e4
493
+ previewTimeout: 3e4,
494
+ discover: false,
495
+ seedUrls: ["/"],
496
+ maxPages: 200,
497
+ maxDepth: 10
492
498
  };
493
499
  }
494
500
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
@@ -529,7 +535,7 @@ function writeMinimalConfig(cwd) {
529
535
  return target;
530
536
  }
531
537
  const content = `export default {
532
- embeddings: { apiKeyEnv: "OPENAI_API_KEY" }
538
+ embeddings: { apiKeyEnv: "JINA_API_KEY" }
533
539
  };
534
540
  `;
535
541
  fs.writeFileSync(target, content, "utf8");
@@ -540,14 +546,16 @@ function writeMinimalConfig(cwd) {
540
546
  var Logger = class {
541
547
  json;
542
548
  verbose;
549
+ quiet;
543
550
  stderrOnly;
544
551
  constructor(opts = {}) {
545
552
  this.json = opts.json ?? false;
546
553
  this.verbose = opts.verbose ?? false;
554
+ this.quiet = opts.quiet ?? false;
547
555
  this.stderrOnly = opts.stderrOnly ?? false;
548
556
  }
549
557
  info(message) {
550
- if (this.json) {
558
+ if (this.quiet || this.json) {
551
559
  return;
552
560
  }
553
561
  this.writeOut(`${message}
@@ -561,7 +569,7 @@ var Logger = class {
561
569
  this.logJson("debug", { message });
562
570
  return;
563
571
  }
564
- this.writeOut(`${message}
572
+ this.writeOut(` ${message}
565
573
  `);
566
574
  }
567
575
  warn(message) {
@@ -588,7 +596,7 @@ var Logger = class {
588
596
  this.logJson(event, data);
589
597
  return;
590
598
  }
591
- this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
599
+ this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
592
600
  `);
593
601
  }
594
602
  writeOut(text) {
@@ -695,18 +703,18 @@ function ensureStateDirs(cwd, stateDir, scope) {
695
703
  return { statePath, pagesPath };
696
704
  }
697
705
 
698
- // src/embeddings/openai.ts
699
- import OpenAI from "openai";
706
+ // src/embeddings/jina.ts
700
707
  import pLimit from "p-limit";
701
708
  function sleep(ms) {
702
709
  return new Promise((resolve) => {
703
710
  setTimeout(resolve, ms);
704
711
  });
705
712
  }
706
- var OpenAIEmbeddingsProvider = class {
707
- client;
713
+ var JinaEmbeddingsProvider = class {
714
+ apiKey;
708
715
  batchSize;
709
716
  concurrency;
717
+ defaultTask;
710
718
  constructor(options) {
711
719
  if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
712
720
  throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
@@ -714,11 +722,10 @@ var OpenAIEmbeddingsProvider = class {
714
722
  if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
715
723
  throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
716
724
  }
717
- this.client = new OpenAI({
718
- apiKey: options.apiKey
719
- });
725
+ this.apiKey = options.apiKey;
720
726
  this.batchSize = options.batchSize;
721
727
  this.concurrency = options.concurrency;
728
+ this.defaultTask = options.task ?? "retrieval.passage";
722
729
  }
723
730
  estimateTokens(text) {
724
731
  const normalized = text.trim();
@@ -732,7 +739,7 @@ var OpenAIEmbeddingsProvider = class {
732
739
  const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
733
740
  return Math.max(1, Math.max(charEstimate, lexicalEstimate));
734
741
  }
735
- async embedTexts(texts, modelId) {
742
+ async embedTexts(texts, modelId, task) {
736
743
  if (texts.length === 0) {
737
744
  return [];
738
745
  }
@@ -748,33 +755,52 @@ var OpenAIEmbeddingsProvider = class {
748
755
  await Promise.all(
749
756
  batches.map(
750
757
  (batch, position) => limit(async () => {
751
- outputs[position] = await this.embedWithRetry(batch.values, modelId);
758
+ outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
752
759
  })
753
760
  )
754
761
  );
755
762
  return outputs.flat();
756
763
  }
757
- async embedWithRetry(texts, modelId) {
764
+ async embedWithRetry(texts, modelId, task) {
758
765
  const maxAttempts = 5;
759
766
  let attempt = 0;
760
767
  while (attempt < maxAttempts) {
761
768
  attempt += 1;
769
+ let response;
762
770
  try {
763
- const response = await this.client.embeddings.create({
764
- model: modelId,
765
- input: texts,
766
- encoding_format: "float"
771
+ response = await fetch("https://api.jina.ai/v1/embeddings", {
772
+ method: "POST",
773
+ headers: {
774
+ "content-type": "application/json",
775
+ authorization: `Bearer ${this.apiKey}`
776
+ },
777
+ body: JSON.stringify({
778
+ model: modelId,
779
+ input: texts,
780
+ task
781
+ })
767
782
  });
768
- return response.data.map((entry) => entry.embedding);
769
783
  } catch (error) {
770
- const status = error.status;
771
- const retryable = status === 429 || typeof status === "number" && status >= 500;
772
- if (!retryable || attempt >= maxAttempts) {
784
+ if (attempt >= maxAttempts) {
773
785
  throw error;
774
786
  }
775
- const delay = Math.min(2 ** attempt * 300, 5e3);
776
- await sleep(delay);
787
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
788
+ continue;
789
+ }
790
+ if (!response.ok) {
791
+ const retryable = response.status === 429 || response.status >= 500;
792
+ if (!retryable || attempt >= maxAttempts) {
793
+ const errorBody = await response.text();
794
+ throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
795
+ }
796
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
797
+ continue;
798
+ }
799
+ const payload = await response.json();
800
+ if (!payload.data || !Array.isArray(payload.data)) {
801
+ throw new Error("Invalid Jina embeddings response format");
777
802
  }
803
+ return payload.data.map((entry) => entry.embedding);
778
804
  }
779
805
  throw new Error("Unreachable retry state");
780
806
  }
@@ -782,20 +808,20 @@ var OpenAIEmbeddingsProvider = class {
782
808
 
783
809
  // src/embeddings/factory.ts
784
810
  function createEmbeddingsProvider(config) {
785
- if (config.embeddings.provider !== "openai") {
811
+ if (config.embeddings.provider !== "jina") {
786
812
  throw new SearchSocketError(
787
813
  "CONFIG_MISSING",
788
814
  `Unsupported embeddings provider ${config.embeddings.provider}`
789
815
  );
790
816
  }
791
- const apiKey = process.env[config.embeddings.apiKeyEnv];
817
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
792
818
  if (!apiKey) {
793
819
  throw new SearchSocketError(
794
820
  "CONFIG_MISSING",
795
- `Missing embeddings API key env var: ${config.embeddings.apiKeyEnv}`
821
+ `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
796
822
  );
797
823
  }
798
- return new OpenAIEmbeddingsProvider({
824
+ return new JinaEmbeddingsProvider({
799
825
  apiKey,
800
826
  batchSize: config.embeddings.batchSize,
801
827
  concurrency: config.embeddings.concurrency
@@ -809,6 +835,11 @@ import path11 from "path";
809
835
  import fs3 from "fs";
810
836
  import path3 from "path";
811
837
 
838
+ // src/core/serverless.ts
839
+ function isServerless() {
840
+ return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
841
+ }
842
+
812
843
  // src/vector/turso.ts
813
844
  var TursoVectorStore = class {
814
845
  client;
@@ -853,6 +884,16 @@ var TursoVectorStore = class {
853
884
  }
854
885
  async ensureChunks(dim) {
855
886
  if (this.chunksReady) return;
887
+ const exists = await this.chunksTableExists();
888
+ if (exists) {
889
+ const currentDim = await this.getChunksDimension();
890
+ if (currentDim !== null && currentDim !== dim) {
891
+ await this.client.batch([
892
+ "DROP INDEX IF EXISTS idx",
893
+ "DROP TABLE IF EXISTS chunks"
894
+ ]);
895
+ }
896
+ }
856
897
  await this.client.batch([
857
898
  `CREATE TABLE IF NOT EXISTS chunks (
858
899
  id TEXT PRIMARY KEY,
@@ -864,12 +905,16 @@ var TursoVectorStore = class {
864
905
  section_title TEXT NOT NULL DEFAULT '',
865
906
  heading_path TEXT NOT NULL DEFAULT '[]',
866
907
  snippet TEXT NOT NULL DEFAULT '',
908
+ chunk_text TEXT NOT NULL DEFAULT '',
909
+ ordinal INTEGER NOT NULL DEFAULT 0,
867
910
  content_hash TEXT NOT NULL DEFAULT '',
868
911
  model_id TEXT NOT NULL DEFAULT '',
869
912
  depth INTEGER NOT NULL DEFAULT 0,
870
913
  incoming_links INTEGER NOT NULL DEFAULT 0,
871
914
  route_file TEXT NOT NULL DEFAULT '',
872
915
  tags TEXT NOT NULL DEFAULT '[]',
916
+ description TEXT NOT NULL DEFAULT '',
917
+ keywords TEXT NOT NULL DEFAULT '[]',
873
918
  embedding F32_BLOB(${dim})
874
919
  )`,
875
920
  `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
@@ -908,6 +953,38 @@ var TursoVectorStore = class {
908
953
  throw error;
909
954
  }
910
955
  }
956
+ /**
957
+ * Read the current F32_BLOB dimension from the chunks table schema.
958
+ * Returns null if the table doesn't exist or the dimension can't be parsed.
959
+ */
960
+ async getChunksDimension() {
961
+ try {
962
+ const rs = await this.client.execute(
963
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
964
+ );
965
+ if (rs.rows.length === 0) return null;
966
+ const sql = rs.rows[0].sql;
967
+ const match = sql.match(/F32_BLOB\((\d+)\)/i);
968
+ return match ? parseInt(match[1], 10) : null;
969
+ } catch {
970
+ return null;
971
+ }
972
+ }
973
+ /**
974
+ * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
975
+ * Used by `clean --remote` for a full reset.
976
+ */
977
+ async dropAllTables() {
978
+ await this.client.batch([
979
+ "DROP INDEX IF EXISTS idx",
980
+ "DROP TABLE IF EXISTS chunks",
981
+ "DROP TABLE IF EXISTS registry",
982
+ "DROP TABLE IF EXISTS pages"
983
+ ]);
984
+ this.chunksReady = false;
985
+ this.registryReady = false;
986
+ this.pagesReady = false;
987
+ }
911
988
  async upsert(records, _scope) {
912
989
  if (records.length === 0) return;
913
990
  const dim = this.dimension ?? records[0].vector.length;
@@ -918,9 +995,9 @@ var TursoVectorStore = class {
918
995
  const stmts = batch.map((r) => ({
919
996
  sql: `INSERT OR REPLACE INTO chunks
920
997
  (id, project_id, scope_name, url, path, title, section_title,
921
- heading_path, snippet, content_hash, model_id, depth,
922
- incoming_links, route_file, tags, embedding)
923
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
998
+ heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
999
+ incoming_links, route_file, tags, description, keywords, embedding)
1000
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
924
1001
  args: [
925
1002
  r.id,
926
1003
  r.metadata.projectId,
@@ -931,12 +1008,16 @@ var TursoVectorStore = class {
931
1008
  r.metadata.sectionTitle,
932
1009
  JSON.stringify(r.metadata.headingPath),
933
1010
  r.metadata.snippet,
1011
+ r.metadata.chunkText,
1012
+ r.metadata.ordinal,
934
1013
  r.metadata.contentHash,
935
1014
  r.metadata.modelId,
936
1015
  r.metadata.depth,
937
1016
  r.metadata.incomingLinks,
938
1017
  r.metadata.routeFile,
939
1018
  JSON.stringify(r.metadata.tags),
1019
+ r.metadata.description ?? "",
1020
+ JSON.stringify(r.metadata.keywords ?? []),
940
1021
  JSON.stringify(r.vector)
941
1022
  ]
942
1023
  }));
@@ -949,8 +1030,10 @@ var TursoVectorStore = class {
949
1030
  const queryJson = JSON.stringify(queryVector);
950
1031
  const rs = await this.client.execute({
951
1032
  sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
952
- c.section_title, c.heading_path, c.snippet, c.content_hash,
1033
+ c.section_title, c.heading_path, c.snippet, c.chunk_text,
1034
+ c.ordinal, c.content_hash,
953
1035
  c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
1036
+ c.description, c.keywords,
954
1037
  vector_distance_cos(c.embedding, vector(?)) AS distance
955
1038
  FROM vector_top_k('idx', vector(?), ?) AS v
956
1039
  JOIN chunks AS c ON c.rowid = v.id`,
@@ -981,6 +1064,12 @@ var TursoVectorStore = class {
981
1064
  }
982
1065
  const distance = row.distance;
983
1066
  const score = 1 - distance;
1067
+ const description = row.description || void 0;
1068
+ const keywords = (() => {
1069
+ const raw = row.keywords || "[]";
1070
+ const parsed = JSON.parse(raw);
1071
+ return parsed.length > 0 ? parsed : void 0;
1072
+ })();
984
1073
  hits.push({
985
1074
  id: row.id,
986
1075
  score,
@@ -993,12 +1082,16 @@ var TursoVectorStore = class {
993
1082
  sectionTitle: row.section_title,
994
1083
  headingPath: JSON.parse(row.heading_path || "[]"),
995
1084
  snippet: row.snippet,
1085
+ chunkText: row.chunk_text || "",
1086
+ ordinal: row.ordinal || 0,
996
1087
  contentHash: row.content_hash,
997
1088
  modelId: row.model_id,
998
1089
  depth: row.depth,
999
1090
  incomingLinks: row.incoming_links,
1000
1091
  routeFile: row.route_file,
1001
- tags
1092
+ tags,
1093
+ description,
1094
+ keywords
1002
1095
  }
1003
1096
  });
1004
1097
  }
@@ -1188,10 +1281,10 @@ var TursoVectorStore = class {
1188
1281
  // src/vector/factory.ts
1189
1282
  async function createVectorStore(config, cwd) {
1190
1283
  const turso = config.vector.turso;
1191
- const remoteUrl = process.env[turso.urlEnv];
1284
+ const remoteUrl = turso.url ?? process.env[turso.urlEnv];
1192
1285
  if (remoteUrl) {
1193
1286
  const { createClient: createClient2 } = await import("@libsql/client/http");
1194
- const authToken = process.env[turso.authTokenEnv];
1287
+ const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
1195
1288
  const client2 = createClient2({
1196
1289
  url: remoteUrl,
1197
1290
  authToken
@@ -1201,6 +1294,12 @@ async function createVectorStore(config, cwd) {
1201
1294
  dimension: config.vector.dimension
1202
1295
  });
1203
1296
  }
1297
+ if (isServerless()) {
1298
+ throw new SearchSocketError(
1299
+ "VECTOR_BACKEND_UNAVAILABLE",
1300
+ `No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
1301
+ );
1302
+ }
1204
1303
  const { createClient } = await import("@libsql/client");
1205
1304
  const localPath = path3.resolve(cwd, turso.localPath);
1206
1305
  fs3.mkdirSync(path3.dirname(localPath), { recursive: true });
@@ -1537,7 +1636,9 @@ function chunkMirrorPage(page, config, scope) {
1537
1636
  incomingLinks: page.incomingLinks,
1538
1637
  routeFile: page.routeFile,
1539
1638
  tags: page.tags,
1540
- contentHash: ""
1639
+ contentHash: "",
1640
+ description: page.description,
1641
+ keywords: page.keywords
1541
1642
  };
1542
1643
  const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
1543
1644
  summaryChunk.contentHash = sha256(normalizeText(embeddingText));
@@ -1564,7 +1665,9 @@ function chunkMirrorPage(page, config, scope) {
1564
1665
  incomingLinks: page.incomingLinks,
1565
1666
  routeFile: page.routeFile,
1566
1667
  tags: page.tags,
1567
- contentHash: ""
1668
+ contentHash: "",
1669
+ description: page.description,
1670
+ keywords: page.keywords
1568
1671
  };
1569
1672
  const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
1570
1673
  chunk.contentHash = sha256(normalizeText(embeddingText));
@@ -1828,6 +1931,7 @@ function mapUrlToRoute(urlPath, patterns) {
1828
1931
  }
1829
1932
 
1830
1933
  // src/indexing/sources/build/index.ts
1934
+ import { load as cheerioLoad } from "cheerio";
1831
1935
  import pLimit2 from "p-limit";
1832
1936
 
1833
1937
  // src/indexing/sources/build/manifest-parser.ts
@@ -2004,11 +2108,108 @@ async function startPreviewServer(cwd, options, logger3) {
2004
2108
 
2005
2109
  // src/indexing/sources/build/index.ts
2006
2110
  var logger = new Logger();
2111
+ function extractLinksFromHtml(html, pageUrl, baseOrigin) {
2112
+ const $ = cheerioLoad(html);
2113
+ const links = [];
2114
+ $("a[href]").each((_i, el) => {
2115
+ const href = $(el).attr("href");
2116
+ if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
2117
+ return;
2118
+ }
2119
+ try {
2120
+ const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
2121
+ if (resolved.origin !== baseOrigin) return;
2122
+ if (!["http:", "https:"].includes(resolved.protocol)) return;
2123
+ links.push(normalizeUrlPath(resolved.pathname));
2124
+ } catch {
2125
+ }
2126
+ });
2127
+ return [...new Set(links)];
2128
+ }
2129
+ async function discoverPages(server, buildConfig, pipelineMaxPages) {
2130
+ const { seedUrls, maxDepth, exclude } = buildConfig;
2131
+ const baseOrigin = new URL(server.baseUrl).origin;
2132
+ let effectiveMax = buildConfig.maxPages;
2133
+ if (typeof pipelineMaxPages === "number") {
2134
+ const floored = Math.max(0, Math.floor(pipelineMaxPages));
2135
+ effectiveMax = Math.min(effectiveMax, floored);
2136
+ }
2137
+ if (effectiveMax === 0) return [];
2138
+ const visited = /* @__PURE__ */ new Set();
2139
+ const pages = [];
2140
+ const queue = [];
2141
+ const limit = pLimit2(8);
2142
+ for (const seed of seedUrls) {
2143
+ const normalized = normalizeUrlPath(seed);
2144
+ if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
2145
+ visited.add(normalized);
2146
+ queue.push({ url: normalized, depth: 0 });
2147
+ }
2148
+ }
2149
+ while (queue.length > 0 && pages.length < effectiveMax) {
2150
+ const remaining = effectiveMax - pages.length;
2151
+ const batch = queue.splice(0, remaining);
2152
+ const results = await Promise.allSettled(
2153
+ batch.map(
2154
+ (item) => limit(async () => {
2155
+ const fullUrl = joinUrl(server.baseUrl, item.url);
2156
+ const response = await fetch(fullUrl);
2157
+ if (!response.ok) {
2158
+ logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
2159
+ return null;
2160
+ }
2161
+ const contentType = response.headers.get("content-type") ?? "";
2162
+ if (!contentType.includes("text/html")) {
2163
+ return null;
2164
+ }
2165
+ const html = await response.text();
2166
+ if (item.depth < maxDepth) {
2167
+ const links = extractLinksFromHtml(html, item.url, baseOrigin);
2168
+ for (const link of links) {
2169
+ if (!visited.has(link) && !isExcluded(link, exclude)) {
2170
+ visited.add(link);
2171
+ queue.push({ url: link, depth: item.depth + 1 });
2172
+ }
2173
+ }
2174
+ }
2175
+ return {
2176
+ url: item.url,
2177
+ html,
2178
+ sourcePath: fullUrl,
2179
+ outgoingLinks: []
2180
+ };
2181
+ })
2182
+ )
2183
+ );
2184
+ for (const result of results) {
2185
+ if (result.status === "fulfilled" && result.value) {
2186
+ pages.push(result.value);
2187
+ }
2188
+ }
2189
+ }
2190
+ if (pages.length >= effectiveMax && queue.length > 0) {
2191
+ logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
2192
+ }
2193
+ logger.event("build_discover_complete", {
2194
+ pagesFound: pages.length,
2195
+ urlsVisited: visited.size,
2196
+ urlsSkipped: queue.length
2197
+ });
2198
+ return pages;
2199
+ }
2007
2200
  async function loadBuildPages(cwd, config, maxPages) {
2008
2201
  const buildConfig = config.source.build;
2009
2202
  if (!buildConfig) {
2010
2203
  throw new Error("build source config is missing");
2011
2204
  }
2205
+ if (buildConfig.discover) {
2206
+ const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
2207
+ try {
2208
+ return await discoverPages(server2, buildConfig, maxPages);
2209
+ } finally {
2210
+ await server2.shutdown();
2211
+ }
2212
+ }
2012
2213
  const routes = await parseManifest(cwd, buildConfig.outputDir);
2013
2214
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
2014
2215
  logger.event("build_routes_discovered", {
@@ -2112,11 +2313,11 @@ async function loadContentFilesPages(cwd, config, maxPages) {
2112
2313
 
2113
2314
  // src/indexing/sources/crawl.ts
2114
2315
  import { gunzipSync } from "zlib";
2115
- import { load as cheerioLoad } from "cheerio";
2316
+ import { load as cheerioLoad2 } from "cheerio";
2116
2317
  import pLimit3 from "p-limit";
2117
2318
  var logger2 = new Logger();
2118
2319
  function extractLocs(xml) {
2119
- const $ = cheerioLoad(xml, { xmlMode: true });
2320
+ const $ = cheerioLoad2(xml, { xmlMode: true });
2120
2321
  const locs = [];
2121
2322
  $("loc").each((_i, el) => {
2122
2323
  const text = $(el).text().trim();
@@ -2127,7 +2328,7 @@ function extractLocs(xml) {
2127
2328
  return locs;
2128
2329
  }
2129
2330
  function isSitemapIndex(xml) {
2130
- const $ = cheerioLoad(xml, { xmlMode: true });
2331
+ const $ = cheerioLoad2(xml, { xmlMode: true });
2131
2332
  return $("sitemapindex").length > 0;
2132
2333
  }
2133
2334
  async function fetchSitemapXml(url) {
@@ -2265,9 +2466,7 @@ function hrTimeMs(start) {
2265
2466
 
2266
2467
  // src/indexing/pipeline.ts
2267
2468
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
2268
- "text-embedding-3-small": 2e-5,
2269
- "text-embedding-3-large": 13e-5,
2270
- "text-embedding-ada-002": 1e-4
2469
+ "jina-embeddings-v3": 2e-5
2271
2470
  };
2272
2471
  var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
2273
2472
  var IndexPipeline = class _IndexPipeline {
@@ -2313,9 +2512,15 @@ var IndexPipeline = class _IndexPipeline {
2313
2512
  };
2314
2513
  const scope = resolveScope(this.config, options.scopeOverride);
2315
2514
  const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
2515
+ const sourceMode = options.sourceOverride ?? this.config.source.mode;
2516
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
2316
2517
  if (options.force) {
2518
+ this.logger.info("Force mode enabled \u2014 full rebuild");
2317
2519
  await cleanMirrorForScope(statePath, scope);
2318
2520
  }
2521
+ if (options.dryRun) {
2522
+ this.logger.info("Dry run \u2014 no writes will be performed");
2523
+ }
2319
2524
  const manifestStart = stageStart();
2320
2525
  const existingHashes = await this.vectorStore.getContentHashes(scope);
2321
2526
  const existingModelId = await this.vectorStore.getScopeModelId(scope);
@@ -2326,8 +2531,9 @@ var IndexPipeline = class _IndexPipeline {
2326
2531
  );
2327
2532
  }
2328
2533
  stageEnd("manifest", manifestStart);
2534
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
2329
2535
  const sourceStart = stageStart();
2330
- const sourceMode = options.sourceOverride ?? this.config.source.mode;
2536
+ this.logger.info(`Loading pages (source: ${sourceMode})...`);
2331
2537
  let sourcePages;
2332
2538
  if (sourceMode === "static-output") {
2333
2539
  sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
@@ -2339,10 +2545,13 @@ var IndexPipeline = class _IndexPipeline {
2339
2545
  sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
2340
2546
  }
2341
2547
  stageEnd("source", sourceStart);
2548
+ this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
2342
2549
  const routeStart = stageStart();
2343
2550
  const routePatterns = await buildRoutePatterns(this.cwd);
2344
2551
  stageEnd("route_map", routeStart);
2552
+ this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
2345
2553
  const extractStart = stageStart();
2554
+ this.logger.info("Extracting content...");
2346
2555
  const extractedPages = [];
2347
2556
  for (const sourcePage of sourcePages) {
2348
2557
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
@@ -2371,6 +2580,8 @@ var IndexPipeline = class _IndexPipeline {
2371
2580
  uniquePages.push(page);
2372
2581
  }
2373
2582
  stageEnd("extract", extractStart);
2583
+ const skippedPages = sourcePages.length - uniquePages.length;
2584
+ this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
2374
2585
  const linkStart = stageStart();
2375
2586
  const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
2376
2587
  const incomingLinkCount = /* @__PURE__ */ new Map();
@@ -2386,7 +2597,9 @@ var IndexPipeline = class _IndexPipeline {
2386
2597
  }
2387
2598
  }
2388
2599
  stageEnd("links", linkStart);
2600
+ this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
2389
2601
  const mirrorStart = stageStart();
2602
+ this.logger.info("Writing mirror pages...");
2390
2603
  const mirrorPages = [];
2391
2604
  let routeExact = 0;
2392
2605
  let routeBestEffort = 0;
@@ -2456,7 +2669,9 @@ var IndexPipeline = class _IndexPipeline {
2456
2669
  await this.vectorStore.upsertPages(pageRecords, scope);
2457
2670
  }
2458
2671
  stageEnd("mirror", mirrorStart);
2672
+ this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
2459
2673
  const chunkStart = stageStart();
2674
+ this.logger.info("Chunking pages...");
2460
2675
  let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
2461
2676
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
2462
2677
  if (typeof maxChunks === "number") {
@@ -2469,6 +2684,7 @@ var IndexPipeline = class _IndexPipeline {
2469
2684
  });
2470
2685
  }
2471
2686
  stageEnd("chunk", chunkStart);
2687
+ this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
2472
2688
  const currentChunkMap = /* @__PURE__ */ new Map();
2473
2689
  for (const chunk of chunks) {
2474
2690
  currentChunkMap.set(chunk.chunkKey, chunk);
@@ -2487,6 +2703,7 @@ var IndexPipeline = class _IndexPipeline {
2487
2703
  return existingHash !== chunk.contentHash;
2488
2704
  });
2489
2705
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
2706
+ this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
2490
2707
  const embedStart = stageStart();
2491
2708
  const chunkTokenEstimates = /* @__PURE__ */ new Map();
2492
2709
  for (const chunk of changedChunks) {
@@ -2501,9 +2718,11 @@ var IndexPipeline = class _IndexPipeline {
2501
2718
  let newEmbeddings = 0;
2502
2719
  const vectorsByChunk = /* @__PURE__ */ new Map();
2503
2720
  if (!options.dryRun && changedChunks.length > 0) {
2721
+ this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
2504
2722
  const embeddings = await this.embeddings.embedTexts(
2505
2723
  changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
2506
- this.config.embeddings.model
2724
+ this.config.embeddings.model,
2725
+ "retrieval.passage"
2507
2726
  );
2508
2727
  if (embeddings.length !== changedChunks.length) {
2509
2728
  throw new SearchSocketError(
@@ -2526,8 +2745,14 @@ var IndexPipeline = class _IndexPipeline {
2526
2745
  }
2527
2746
  }
2528
2747
  stageEnd("embedding", embedStart);
2748
+ if (changedChunks.length > 0) {
2749
+ this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
2750
+ } else {
2751
+ this.logger.info("No chunks to embed \u2014 all up to date");
2752
+ }
2529
2753
  const syncStart = stageStart();
2530
2754
  if (!options.dryRun) {
2755
+ this.logger.info("Syncing vectors...");
2531
2756
  const upserts = [];
2532
2757
  for (const chunk of changedChunks) {
2533
2758
  const vector = vectorsByChunk.get(chunk.chunkKey);
@@ -2546,12 +2771,16 @@ var IndexPipeline = class _IndexPipeline {
2546
2771
  sectionTitle: chunk.sectionTitle ?? "",
2547
2772
  headingPath: chunk.headingPath,
2548
2773
  snippet: chunk.snippet,
2774
+ chunkText: chunk.chunkText.slice(0, 4e3),
2775
+ ordinal: chunk.ordinal,
2549
2776
  contentHash: chunk.contentHash,
2550
2777
  modelId: this.config.embeddings.model,
2551
2778
  depth: chunk.depth,
2552
2779
  incomingLinks: chunk.incomingLinks,
2553
2780
  routeFile: chunk.routeFile,
2554
- tags: chunk.tags
2781
+ tags: chunk.tags,
2782
+ description: chunk.description,
2783
+ keywords: chunk.keywords
2555
2784
  }
2556
2785
  });
2557
2786
  }
@@ -2565,6 +2794,7 @@ var IndexPipeline = class _IndexPipeline {
2565
2794
  }
2566
2795
  }
2567
2796
  stageEnd("sync", syncStart);
2797
+ this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
2568
2798
  const finalizeStart = stageStart();
2569
2799
  if (!options.dryRun) {
2570
2800
  const scopeInfo = {
@@ -2584,6 +2814,7 @@ var IndexPipeline = class _IndexPipeline {
2584
2814
  });
2585
2815
  }
2586
2816
  stageEnd("finalize", finalizeStart);
2817
+ this.logger.info("Done.");
2587
2818
  return {
2588
2819
  pagesProcessed: mirrorPages.length,
2589
2820
  chunksTotal: chunks.length,
@@ -2693,20 +2924,17 @@ var JinaReranker = class {
2693
2924
 
2694
2925
  // src/rerank/factory.ts
2695
2926
  function createReranker(config) {
2696
- if (config.rerank.provider === "none") {
2927
+ if (!config.rerank.enabled) {
2697
2928
  return null;
2698
2929
  }
2699
- if (config.rerank.provider === "jina") {
2700
- const apiKey = process.env[config.rerank.jina.apiKeyEnv];
2701
- if (!apiKey) {
2702
- return null;
2703
- }
2704
- return new JinaReranker({
2705
- apiKey,
2706
- model: config.rerank.jina.model
2707
- });
2930
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
2931
+ if (!apiKey) {
2932
+ return null;
2708
2933
  }
2709
- return null;
2934
+ return new JinaReranker({
2935
+ apiKey,
2936
+ model: config.rerank.model
2937
+ });
2710
2938
  }
2711
2939
 
2712
2940
  // src/search/ranking.ts
@@ -2854,7 +3082,7 @@ var SearchEngine = class _SearchEngine {
2854
3082
  const groupByPage = (input.groupBy ?? "page") === "page";
2855
3083
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
2856
3084
  const embedStart = process.hrtime.bigint();
2857
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
3085
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
2858
3086
  const queryVector = queryEmbeddings[0];
2859
3087
  if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
2860
3088
  throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
@@ -2882,13 +3110,17 @@ var SearchEngine = class _SearchEngine {
2882
3110
  usedRerank = true;
2883
3111
  }
2884
3112
  let results;
3113
+ const minScore = this.config.ranking.minScore;
2885
3114
  if (groupByPage) {
2886
- const pages = aggregateByPage(ordered, this.config);
3115
+ let pages = aggregateByPage(ordered, this.config);
3116
+ if (minScore > 0) {
3117
+ pages = pages.filter((p) => p.pageScore >= minScore);
3118
+ }
2887
3119
  const minRatio = this.config.ranking.minChunkScoreRatio;
2888
3120
  results = pages.slice(0, topK).map((page) => {
2889
3121
  const bestScore = page.bestChunk.finalScore;
2890
- const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
2891
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
3122
+ const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
3123
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
2892
3124
  return {
2893
3125
  url: page.url,
2894
3126
  title: page.title,
@@ -2905,6 +3137,9 @@ var SearchEngine = class _SearchEngine {
2905
3137
  };
2906
3138
  });
2907
3139
  } else {
3140
+ if (minScore > 0) {
3141
+ ordered = ordered.filter((entry) => entry.finalScore >= minScore);
3142
+ }
2908
3143
  results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
2909
3144
  url: hit.metadata.url,
2910
3145
  title: hit.metadata.title,
@@ -2976,43 +3211,67 @@ var SearchEngine = class _SearchEngine {
2976
3211
  }
2977
3212
  }
2978
3213
  async rerankHits(query, ranked, topK) {
2979
- if (this.config.rerank.provider !== "jina") {
3214
+ if (!this.config.rerank.enabled) {
2980
3215
  throw new SearchSocketError(
2981
3216
  "INVALID_REQUEST",
2982
- "rerank=true requested but rerank.provider is not configured as 'jina'.",
3217
+ "rerank=true requested but rerank.enabled is not set to true.",
2983
3218
  400
2984
3219
  );
2985
3220
  }
2986
3221
  if (!this.reranker) {
2987
3222
  throw new SearchSocketError(
2988
3223
  "CONFIG_MISSING",
2989
- `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
3224
+ `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
2990
3225
  400
2991
3226
  );
2992
3227
  }
2993
- const candidates = ranked.map(({ hit }) => ({
2994
- id: hit.id,
2995
- text: [hit.metadata.title, hit.metadata.sectionTitle, hit.metadata.snippet].filter(Boolean).join("\n")
2996
- }));
3228
+ const pageGroups = /* @__PURE__ */ new Map();
3229
+ for (const entry of ranked) {
3230
+ const url = entry.hit.metadata.url;
3231
+ const group = pageGroups.get(url);
3232
+ if (group) group.push(entry);
3233
+ else pageGroups.set(url, [entry]);
3234
+ }
3235
+ const MAX_CHUNKS_PER_PAGE = 5;
3236
+ const MIN_CHUNKS_PER_PAGE = 1;
3237
+ const MIN_CHUNK_SCORE_RATIO = 0.5;
3238
+ const pageCandidates = [];
3239
+ for (const [url, chunks] of pageGroups) {
3240
+ const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
3241
+ const bestScore = byScore[0].finalScore;
3242
+ const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
3243
+ const selected = byScore.filter(
3244
+ (c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
3245
+ ).slice(0, MAX_CHUNKS_PER_PAGE);
3246
+ selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
3247
+ const first = selected[0].hit.metadata;
3248
+ const parts = [first.title];
3249
+ if (first.description) {
3250
+ parts.push(first.description);
3251
+ }
3252
+ if (first.keywords && first.keywords.length > 0) {
3253
+ parts.push(first.keywords.join(", "));
3254
+ }
3255
+ const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
3256
+ parts.push(body);
3257
+ pageCandidates.push({ id: url, text: parts.join("\n\n") });
3258
+ }
2997
3259
  const reranked = await this.reranker.rerank(
2998
3260
  query,
2999
- candidates,
3261
+ pageCandidates,
3000
3262
  Math.max(topK, this.config.rerank.topN)
3001
3263
  );
3002
- const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));
3264
+ const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
3003
3265
  return ranked.map((entry) => {
3004
- const rerankScore = rerankScoreById.get(entry.hit.id);
3005
- const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
3006
- if (rerankScore === void 0 || !Number.isFinite(rerankScore)) {
3007
- return {
3008
- ...entry,
3009
- finalScore: safeBaseScore
3010
- };
3266
+ const pageScore = scoreByUrl.get(entry.hit.metadata.url);
3267
+ const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
3268
+ if (pageScore === void 0 || !Number.isFinite(pageScore)) {
3269
+ return { ...entry, finalScore: base };
3011
3270
  }
3012
- const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
3271
+ const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
3013
3272
  return {
3014
3273
  ...entry,
3015
- finalScore: Number.isFinite(combinedScore) ? combinedScore : safeBaseScore
3274
+ finalScore: Number.isFinite(combined) ? combined : base
3016
3275
  };
3017
3276
  }).sort((a, b) => {
3018
3277
  const delta = b.finalScore - a.finalScore;
@@ -3332,6 +3591,7 @@ function getRootOptions(command) {
3332
3591
  }
3333
3592
  async function runIndexCommand(opts) {
3334
3593
  const logger3 = new Logger({
3594
+ quiet: opts.quiet,
3335
3595
  verbose: opts.verbose,
3336
3596
  json: opts.json
3337
3597
  });
@@ -3355,7 +3615,9 @@ async function runIndexCommand(opts) {
3355
3615
  `);
3356
3616
  return;
3357
3617
  }
3358
- printIndexSummary(stats);
3618
+ if (!opts.quiet) {
3619
+ printIndexSummary(stats);
3620
+ }
3359
3621
  }
3360
3622
  var program = new Command();
3361
3623
  program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
@@ -3379,7 +3641,7 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
3379
3641
  process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
3380
3642
  process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
3381
3643
  });
3382
- program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
3644
+ program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
3383
3645
  const rootOpts = getRootOptions(command);
3384
3646
  const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3385
3647
  await runIndexCommand({
@@ -3392,6 +3654,7 @@ program.command("index").description("Index site content into markdown mirror +
3392
3654
  source: opts.source,
3393
3655
  maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
3394
3656
  maxChunks: opts.maxChunks ? parsePositiveInt(opts.maxChunks, "--max-chunks") : void 0,
3657
+ quiet: opts.quiet,
3395
3658
  verbose: opts.verbose,
3396
3659
  json: opts.json
3397
3660
  });
@@ -3554,8 +3817,8 @@ program.command("clean").description("Delete local state and optionally delete r
3554
3817
  `);
3555
3818
  if (opts.remote) {
3556
3819
  const vectorStore = await createVectorStore(config, cwd);
3557
- await vectorStore.deleteScope(scope);
3558
- process.stdout.write(`deleted remote vectors for scope ${scope.scopeName}
3820
+ await vectorStore.dropAllTables();
3821
+ process.stdout.write(`dropped all remote tables (chunks, registry, pages)
3559
3822
  `);
3560
3823
  }
3561
3824
  });
@@ -3680,14 +3943,6 @@ program.command("doctor").description("Validate config, env vars, provider conne
3680
3943
  details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
3681
3944
  });
3682
3945
  }
3683
- if (config.rerank.provider === "jina") {
3684
- const jinaKey = process.env[config.rerank.jina.apiKeyEnv];
3685
- checks.push({
3686
- name: `env ${config.rerank.jina.apiKeyEnv}`,
3687
- ok: Boolean(jinaKey),
3688
- details: jinaKey ? void 0 : "missing"
3689
- });
3690
- }
3691
3946
  if (config.source.mode === "static-output") {
3692
3947
  const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
3693
3948
  const exists = fs9.existsSync(outputDir);