searchsocket 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -12,13 +12,13 @@ import { Command } from "commander";
12
12
  // package.json
13
13
  var package_default = {
14
14
  name: "searchsocket",
15
- version: "0.2.0",
15
+ version: "0.3.0",
16
16
  description: "Semantic site search and MCP retrieval for SvelteKit static sites",
17
17
  license: "MIT",
18
18
  author: "Greg Priday <greg@siteorigin.com>",
19
19
  repository: {
20
20
  type: "git",
21
- url: "https://github.com/gregpriday/searchsocket.git"
21
+ url: "git+https://github.com/gregpriday/searchsocket.git"
22
22
  },
23
23
  homepage: "https://github.com/gregpriday/searchsocket",
24
24
  bugs: {
@@ -37,6 +37,7 @@ var package_default = {
37
37
  type: "module",
38
38
  files: [
39
39
  "dist",
40
+ "!dist/**/*.map",
40
41
  "README.md"
41
42
  ],
42
43
  bin: {
@@ -81,7 +82,6 @@ var package_default = {
81
82
  "fast-glob": "^3.3.3",
82
83
  "gray-matter": "^4.0.3",
83
84
  jiti: "^2.6.1",
84
- openai: "^6.19.0",
85
85
  "p-limit": "^7.3.0",
86
86
  turndown: "^7.2.2",
87
87
  "turndown-plugin-gfm": "^1.0.2",
@@ -132,7 +132,11 @@ var searchSocketConfigSchema = z.object({
132
132
  outputDir: z.string().min(1).optional(),
133
133
  paramValues: z.record(z.string(), z.array(z.string())).optional(),
134
134
  exclude: z.array(z.string()).optional(),
135
- previewTimeout: z.number().int().positive().optional()
135
+ previewTimeout: z.number().int().positive().optional(),
136
+ discover: z.boolean().optional(),
137
+ seedUrls: z.array(z.string()).optional(),
138
+ maxPages: z.number().int().positive().optional(),
139
+ maxDepth: z.number().int().nonnegative().optional()
136
140
  }).optional()
137
141
  }).optional(),
138
142
  extract: z.object({
@@ -159,8 +163,9 @@ var searchSocketConfigSchema = z.object({
159
163
  pageSummaryChunk: z.boolean().optional()
160
164
  }).optional(),
161
165
  embeddings: z.object({
162
- provider: z.literal("openai").optional(),
166
+ provider: z.literal("jina").optional(),
163
167
  model: z.string().min(1).optional(),
168
+ apiKey: z.string().min(1).optional(),
164
169
  apiKeyEnv: z.string().min(1).optional(),
165
170
  batchSize: z.number().int().positive().optional(),
166
171
  concurrency: z.number().int().positive().optional(),
@@ -169,18 +174,17 @@ var searchSocketConfigSchema = z.object({
169
174
  vector: z.object({
170
175
  dimension: z.number().int().positive().optional(),
171
176
  turso: z.object({
177
+ url: z.string().url().optional(),
178
+ authToken: z.string().min(1).optional(),
172
179
  urlEnv: z.string().optional(),
173
180
  authTokenEnv: z.string().optional(),
174
181
  localPath: z.string().optional()
175
182
  }).optional()
176
183
  }).optional(),
177
184
  rerank: z.object({
178
- provider: z.enum(["none", "jina"]).optional(),
185
+ enabled: z.boolean().optional(),
179
186
  topN: z.number().int().positive().optional(),
180
- jina: z.object({
181
- apiKeyEnv: z.string().optional(),
182
- model: z.string().optional()
183
- }).optional()
187
+ model: z.string().optional()
184
188
  }).optional(),
185
189
  ranking: z.object({
186
190
  enableIncomingLinkBoost: z.boolean().optional(),
@@ -189,6 +193,7 @@ var searchSocketConfigSchema = z.object({
189
193
  aggregationCap: z.number().int().positive().optional(),
190
194
  aggregationDecay: z.number().min(0).max(1).optional(),
191
195
  minChunkScoreRatio: z.number().min(0).max(1).optional(),
196
+ minScore: z.number().min(0).max(1).optional(),
192
197
  weights: z.object({
193
198
  incomingLinks: z.number().optional(),
194
199
  depth: z.number().optional(),
@@ -269,9 +274,9 @@ function createDefaultConfig(projectId) {
269
274
  pageSummaryChunk: true
270
275
  },
271
276
  embeddings: {
272
- provider: "openai",
273
- model: "text-embedding-3-small",
274
- apiKeyEnv: "OPENAI_API_KEY",
277
+ provider: "jina",
278
+ model: "jina-embeddings-v3",
279
+ apiKeyEnv: "JINA_API_KEY",
275
280
  batchSize: 64,
276
281
  concurrency: 4
277
282
  },
@@ -283,12 +288,9 @@ function createDefaultConfig(projectId) {
283
288
  }
284
289
  },
285
290
  rerank: {
286
- provider: "none",
291
+ enabled: false,
287
292
  topN: 20,
288
- jina: {
289
- apiKeyEnv: "JINA_API_KEY",
290
- model: "jina-reranker-v2-base-multilingual"
291
- }
293
+ model: "jina-reranker-v2-base-multilingual"
292
294
  },
293
295
  ranking: {
294
296
  enableIncomingLinkBoost: true,
@@ -297,6 +299,7 @@ function createDefaultConfig(projectId) {
297
299
  aggregationCap: 5,
298
300
  aggregationDecay: 0.5,
299
301
  minChunkScoreRatio: 0.5,
302
+ minScore: 0,
300
303
  weights: {
301
304
  incomingLinks: 0.05,
302
305
  depth: 0.03,
@@ -407,7 +410,11 @@ ${issues}`
407
410
  outputDir: parsed.source.build.outputDir ?? ".svelte-kit/output",
408
411
  paramValues: parsed.source.build.paramValues ?? {},
409
412
  exclude: parsed.source.build.exclude ?? [],
410
- previewTimeout: parsed.source.build.previewTimeout ?? 3e4
413
+ previewTimeout: parsed.source.build.previewTimeout ?? 3e4,
414
+ discover: parsed.source.build.discover ?? false,
415
+ seedUrls: parsed.source.build.seedUrls ?? ["/"],
416
+ maxPages: parsed.source.build.maxPages ?? 200,
417
+ maxDepth: parsed.source.build.maxDepth ?? 10
411
418
  } : void 0
412
419
  },
413
420
  extract: {
@@ -436,11 +443,7 @@ ${issues}`
436
443
  },
437
444
  rerank: {
438
445
  ...defaults.rerank,
439
- ...parsed.rerank,
440
- jina: {
441
- ...defaults.rerank.jina,
442
- ...parsed.rerank?.jina
443
- }
446
+ ...parsed.rerank
444
447
  },
445
448
  ranking: {
446
449
  ...defaults.ranking,
@@ -487,7 +490,11 @@ ${issues}`
487
490
  outputDir: ".svelte-kit/output",
488
491
  paramValues: {},
489
492
  exclude: [],
490
- previewTimeout: 3e4
493
+ previewTimeout: 3e4,
494
+ discover: false,
495
+ seedUrls: ["/"],
496
+ maxPages: 200,
497
+ maxDepth: 10
491
498
  };
492
499
  }
493
500
  if (merged.source.mode === "crawl" && !merged.source.crawl?.baseUrl) {
@@ -528,7 +535,7 @@ function writeMinimalConfig(cwd) {
528
535
  return target;
529
536
  }
530
537
  const content = `export default {
531
- embeddings: { apiKeyEnv: "OPENAI_API_KEY" }
538
+ embeddings: { apiKeyEnv: "JINA_API_KEY" }
532
539
  };
533
540
  `;
534
541
  fs.writeFileSync(target, content, "utf8");
@@ -539,14 +546,16 @@ function writeMinimalConfig(cwd) {
539
546
  var Logger = class {
540
547
  json;
541
548
  verbose;
549
+ quiet;
542
550
  stderrOnly;
543
551
  constructor(opts = {}) {
544
552
  this.json = opts.json ?? false;
545
553
  this.verbose = opts.verbose ?? false;
554
+ this.quiet = opts.quiet ?? false;
546
555
  this.stderrOnly = opts.stderrOnly ?? false;
547
556
  }
548
557
  info(message) {
549
- if (this.json) {
558
+ if (this.quiet || this.json) {
550
559
  return;
551
560
  }
552
561
  this.writeOut(`${message}
@@ -560,7 +569,7 @@ var Logger = class {
560
569
  this.logJson("debug", { message });
561
570
  return;
562
571
  }
563
- this.writeOut(`${message}
572
+ this.writeOut(` ${message}
564
573
  `);
565
574
  }
566
575
  warn(message) {
@@ -587,7 +596,7 @@ var Logger = class {
587
596
  this.logJson(event, data);
588
597
  return;
589
598
  }
590
- this.writeOut(`[${event}] ${data ? JSON.stringify(data) : ""}
599
+ this.writeOut(` [${event}] ${data ? JSON.stringify(data) : ""}
591
600
  `);
592
601
  }
593
602
  writeOut(text) {
@@ -694,18 +703,18 @@ function ensureStateDirs(cwd, stateDir, scope) {
694
703
  return { statePath, pagesPath };
695
704
  }
696
705
 
697
- // src/embeddings/openai.ts
698
- import OpenAI from "openai";
706
+ // src/embeddings/jina.ts
699
707
  import pLimit from "p-limit";
700
708
  function sleep(ms) {
701
709
  return new Promise((resolve) => {
702
710
  setTimeout(resolve, ms);
703
711
  });
704
712
  }
705
- var OpenAIEmbeddingsProvider = class {
706
- client;
713
+ var JinaEmbeddingsProvider = class {
714
+ apiKey;
707
715
  batchSize;
708
716
  concurrency;
717
+ defaultTask;
709
718
  constructor(options) {
710
719
  if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
711
720
  throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
@@ -713,11 +722,10 @@ var OpenAIEmbeddingsProvider = class {
713
722
  if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
714
723
  throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
715
724
  }
716
- this.client = new OpenAI({
717
- apiKey: options.apiKey
718
- });
725
+ this.apiKey = options.apiKey;
719
726
  this.batchSize = options.batchSize;
720
727
  this.concurrency = options.concurrency;
728
+ this.defaultTask = options.task ?? "retrieval.passage";
721
729
  }
722
730
  estimateTokens(text) {
723
731
  const normalized = text.trim();
@@ -731,7 +739,7 @@ var OpenAIEmbeddingsProvider = class {
731
739
  const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
732
740
  return Math.max(1, Math.max(charEstimate, lexicalEstimate));
733
741
  }
734
- async embedTexts(texts, modelId) {
742
+ async embedTexts(texts, modelId, task) {
735
743
  if (texts.length === 0) {
736
744
  return [];
737
745
  }
@@ -747,33 +755,52 @@ var OpenAIEmbeddingsProvider = class {
747
755
  await Promise.all(
748
756
  batches.map(
749
757
  (batch, position) => limit(async () => {
750
- outputs[position] = await this.embedWithRetry(batch.values, modelId);
758
+ outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
751
759
  })
752
760
  )
753
761
  );
754
762
  return outputs.flat();
755
763
  }
756
- async embedWithRetry(texts, modelId) {
764
+ async embedWithRetry(texts, modelId, task) {
757
765
  const maxAttempts = 5;
758
766
  let attempt = 0;
759
767
  while (attempt < maxAttempts) {
760
768
  attempt += 1;
769
+ let response;
761
770
  try {
762
- const response = await this.client.embeddings.create({
763
- model: modelId,
764
- input: texts,
765
- encoding_format: "float"
771
+ response = await fetch("https://api.jina.ai/v1/embeddings", {
772
+ method: "POST",
773
+ headers: {
774
+ "content-type": "application/json",
775
+ authorization: `Bearer ${this.apiKey}`
776
+ },
777
+ body: JSON.stringify({
778
+ model: modelId,
779
+ input: texts,
780
+ task
781
+ })
766
782
  });
767
- return response.data.map((entry) => entry.embedding);
768
783
  } catch (error) {
769
- const status = error.status;
770
- const retryable = status === 429 || typeof status === "number" && status >= 500;
771
- if (!retryable || attempt >= maxAttempts) {
784
+ if (attempt >= maxAttempts) {
772
785
  throw error;
773
786
  }
774
- const delay = Math.min(2 ** attempt * 300, 5e3);
775
- await sleep(delay);
787
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
788
+ continue;
776
789
  }
790
+ if (!response.ok) {
791
+ const retryable = response.status === 429 || response.status >= 500;
792
+ if (!retryable || attempt >= maxAttempts) {
793
+ const errorBody = await response.text();
794
+ throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
795
+ }
796
+ await sleep(Math.min(2 ** attempt * 300, 5e3));
797
+ continue;
798
+ }
799
+ const payload = await response.json();
800
+ if (!payload.data || !Array.isArray(payload.data)) {
801
+ throw new Error("Invalid Jina embeddings response format");
802
+ }
803
+ return payload.data.map((entry) => entry.embedding);
777
804
  }
778
805
  throw new Error("Unreachable retry state");
779
806
  }
@@ -781,20 +808,20 @@ var OpenAIEmbeddingsProvider = class {
781
808
 
782
809
  // src/embeddings/factory.ts
783
810
  function createEmbeddingsProvider(config) {
784
- if (config.embeddings.provider !== "openai") {
811
+ if (config.embeddings.provider !== "jina") {
785
812
  throw new SearchSocketError(
786
813
  "CONFIG_MISSING",
787
814
  `Unsupported embeddings provider ${config.embeddings.provider}`
788
815
  );
789
816
  }
790
- const apiKey = process.env[config.embeddings.apiKeyEnv];
817
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
791
818
  if (!apiKey) {
792
819
  throw new SearchSocketError(
793
820
  "CONFIG_MISSING",
794
- `Missing embeddings API key env var: ${config.embeddings.apiKeyEnv}`
821
+ `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
795
822
  );
796
823
  }
797
- return new OpenAIEmbeddingsProvider({
824
+ return new JinaEmbeddingsProvider({
798
825
  apiKey,
799
826
  batchSize: config.embeddings.batchSize,
800
827
  concurrency: config.embeddings.concurrency
@@ -808,6 +835,11 @@ import path11 from "path";
808
835
  import fs3 from "fs";
809
836
  import path3 from "path";
810
837
 
838
+ // src/core/serverless.ts
839
+ function isServerless() {
840
+ return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
841
+ }
842
+
811
843
  // src/vector/turso.ts
812
844
  var TursoVectorStore = class {
813
845
  client;
@@ -852,6 +884,16 @@ var TursoVectorStore = class {
852
884
  }
853
885
  async ensureChunks(dim) {
854
886
  if (this.chunksReady) return;
887
+ const exists = await this.chunksTableExists();
888
+ if (exists) {
889
+ const currentDim = await this.getChunksDimension();
890
+ if (currentDim !== null && currentDim !== dim) {
891
+ await this.client.batch([
892
+ "DROP INDEX IF EXISTS idx",
893
+ "DROP TABLE IF EXISTS chunks"
894
+ ]);
895
+ }
896
+ }
855
897
  await this.client.batch([
856
898
  `CREATE TABLE IF NOT EXISTS chunks (
857
899
  id TEXT PRIMARY KEY,
@@ -863,6 +905,8 @@ var TursoVectorStore = class {
863
905
  section_title TEXT NOT NULL DEFAULT '',
864
906
  heading_path TEXT NOT NULL DEFAULT '[]',
865
907
  snippet TEXT NOT NULL DEFAULT '',
908
+ chunk_text TEXT NOT NULL DEFAULT '',
909
+ ordinal INTEGER NOT NULL DEFAULT 0,
866
910
  content_hash TEXT NOT NULL DEFAULT '',
867
911
  model_id TEXT NOT NULL DEFAULT '',
868
912
  depth INTEGER NOT NULL DEFAULT 0,
@@ -873,6 +917,19 @@ var TursoVectorStore = class {
873
917
  )`,
874
918
  `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
875
919
  ]);
920
+ const chunkMigrationCols = [
921
+ { name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
922
+ { name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
923
+ ];
924
+ for (const col of chunkMigrationCols) {
925
+ try {
926
+ await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
927
+ } catch (error) {
928
+ if (error instanceof Error && !error.message.includes("duplicate column")) {
929
+ throw error;
930
+ }
931
+ }
932
+ }
876
933
  this.chunksReady = true;
877
934
  }
878
935
  async ensurePages() {
@@ -907,6 +964,38 @@ var TursoVectorStore = class {
907
964
  throw error;
908
965
  }
909
966
  }
967
+ /**
968
+ * Read the current F32_BLOB dimension from the chunks table schema.
969
+ * Returns null if the table doesn't exist or the dimension can't be parsed.
970
+ */
971
+ async getChunksDimension() {
972
+ try {
973
+ const rs = await this.client.execute(
974
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
975
+ );
976
+ if (rs.rows.length === 0) return null;
977
+ const sql = rs.rows[0].sql;
978
+ const match = sql.match(/F32_BLOB\((\d+)\)/i);
979
+ return match ? parseInt(match[1], 10) : null;
980
+ } catch {
981
+ return null;
982
+ }
983
+ }
984
+ /**
985
+ * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
986
+ * Used by `clean --remote` for a full reset.
987
+ */
988
+ async dropAllTables() {
989
+ await this.client.batch([
990
+ "DROP INDEX IF EXISTS idx",
991
+ "DROP TABLE IF EXISTS chunks",
992
+ "DROP TABLE IF EXISTS registry",
993
+ "DROP TABLE IF EXISTS pages"
994
+ ]);
995
+ this.chunksReady = false;
996
+ this.registryReady = false;
997
+ this.pagesReady = false;
998
+ }
910
999
  async upsert(records, _scope) {
911
1000
  if (records.length === 0) return;
912
1001
  const dim = this.dimension ?? records[0].vector.length;
@@ -917,9 +1006,9 @@ var TursoVectorStore = class {
917
1006
  const stmts = batch.map((r) => ({
918
1007
  sql: `INSERT OR REPLACE INTO chunks
919
1008
  (id, project_id, scope_name, url, path, title, section_title,
920
- heading_path, snippet, content_hash, model_id, depth,
1009
+ heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
921
1010
  incoming_links, route_file, tags, embedding)
922
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
1011
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
923
1012
  args: [
924
1013
  r.id,
925
1014
  r.metadata.projectId,
@@ -930,6 +1019,8 @@ var TursoVectorStore = class {
930
1019
  r.metadata.sectionTitle,
931
1020
  JSON.stringify(r.metadata.headingPath),
932
1021
  r.metadata.snippet,
1022
+ r.metadata.chunkText,
1023
+ r.metadata.ordinal,
933
1024
  r.metadata.contentHash,
934
1025
  r.metadata.modelId,
935
1026
  r.metadata.depth,
@@ -948,7 +1039,8 @@ var TursoVectorStore = class {
948
1039
  const queryJson = JSON.stringify(queryVector);
949
1040
  const rs = await this.client.execute({
950
1041
  sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
951
- c.section_title, c.heading_path, c.snippet, c.content_hash,
1042
+ c.section_title, c.heading_path, c.snippet, c.chunk_text,
1043
+ c.ordinal, c.content_hash,
952
1044
  c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
953
1045
  vector_distance_cos(c.embedding, vector(?)) AS distance
954
1046
  FROM vector_top_k('idx', vector(?), ?) AS v
@@ -992,6 +1084,8 @@ var TursoVectorStore = class {
992
1084
  sectionTitle: row.section_title,
993
1085
  headingPath: JSON.parse(row.heading_path || "[]"),
994
1086
  snippet: row.snippet,
1087
+ chunkText: row.chunk_text || "",
1088
+ ordinal: row.ordinal || 0,
995
1089
  contentHash: row.content_hash,
996
1090
  modelId: row.model_id,
997
1091
  depth: row.depth,
@@ -1187,10 +1281,10 @@ var TursoVectorStore = class {
1187
1281
  // src/vector/factory.ts
1188
1282
  async function createVectorStore(config, cwd) {
1189
1283
  const turso = config.vector.turso;
1190
- const remoteUrl = process.env[turso.urlEnv];
1284
+ const remoteUrl = turso.url ?? process.env[turso.urlEnv];
1191
1285
  if (remoteUrl) {
1192
1286
  const { createClient: createClient2 } = await import("@libsql/client/http");
1193
- const authToken = process.env[turso.authTokenEnv];
1287
+ const authToken = turso.authToken ?? process.env[turso.authTokenEnv];
1194
1288
  const client2 = createClient2({
1195
1289
  url: remoteUrl,
1196
1290
  authToken
@@ -1200,6 +1294,12 @@ async function createVectorStore(config, cwd) {
1200
1294
  dimension: config.vector.dimension
1201
1295
  });
1202
1296
  }
1297
+ if (isServerless()) {
1298
+ throw new SearchSocketError(
1299
+ "VECTOR_BACKEND_UNAVAILABLE",
1300
+ `No remote vector database URL found (checked vector.turso.url and env var "${turso.urlEnv}"). Local SQLite storage is not available in serverless environments. Set ${turso.urlEnv} or pass vector.turso.url directly.`
1301
+ );
1302
+ }
1203
1303
  const { createClient } = await import("@libsql/client");
1204
1304
  const localPath = path3.resolve(cwd, turso.localPath);
1205
1305
  fs3.mkdirSync(path3.dirname(localPath), { recursive: true });
@@ -1827,6 +1927,7 @@ function mapUrlToRoute(urlPath, patterns) {
1827
1927
  }
1828
1928
 
1829
1929
  // src/indexing/sources/build/index.ts
1930
+ import { load as cheerioLoad } from "cheerio";
1830
1931
  import pLimit2 from "p-limit";
1831
1932
 
1832
1933
  // src/indexing/sources/build/manifest-parser.ts
@@ -2003,11 +2104,108 @@ async function startPreviewServer(cwd, options, logger3) {
2003
2104
 
2004
2105
  // src/indexing/sources/build/index.ts
2005
2106
  var logger = new Logger();
2107
+ function extractLinksFromHtml(html, pageUrl, baseOrigin) {
2108
+ const $ = cheerioLoad(html);
2109
+ const links = [];
2110
+ $("a[href]").each((_i, el) => {
2111
+ const href = $(el).attr("href");
2112
+ if (!href || href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
2113
+ return;
2114
+ }
2115
+ try {
2116
+ const resolved = new URL(href, `${baseOrigin}${pageUrl}`);
2117
+ if (resolved.origin !== baseOrigin) return;
2118
+ if (!["http:", "https:"].includes(resolved.protocol)) return;
2119
+ links.push(normalizeUrlPath(resolved.pathname));
2120
+ } catch {
2121
+ }
2122
+ });
2123
+ return [...new Set(links)];
2124
+ }
2125
+ async function discoverPages(server, buildConfig, pipelineMaxPages) {
2126
+ const { seedUrls, maxDepth, exclude } = buildConfig;
2127
+ const baseOrigin = new URL(server.baseUrl).origin;
2128
+ let effectiveMax = buildConfig.maxPages;
2129
+ if (typeof pipelineMaxPages === "number") {
2130
+ const floored = Math.max(0, Math.floor(pipelineMaxPages));
2131
+ effectiveMax = Math.min(effectiveMax, floored);
2132
+ }
2133
+ if (effectiveMax === 0) return [];
2134
+ const visited = /* @__PURE__ */ new Set();
2135
+ const pages = [];
2136
+ const queue = [];
2137
+ const limit = pLimit2(8);
2138
+ for (const seed of seedUrls) {
2139
+ const normalized = normalizeUrlPath(seed);
2140
+ if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
2141
+ visited.add(normalized);
2142
+ queue.push({ url: normalized, depth: 0 });
2143
+ }
2144
+ }
2145
+ while (queue.length > 0 && pages.length < effectiveMax) {
2146
+ const remaining = effectiveMax - pages.length;
2147
+ const batch = queue.splice(0, remaining);
2148
+ const results = await Promise.allSettled(
2149
+ batch.map(
2150
+ (item) => limit(async () => {
2151
+ const fullUrl = joinUrl(server.baseUrl, item.url);
2152
+ const response = await fetch(fullUrl);
2153
+ if (!response.ok) {
2154
+ logger.warn(`Skipping ${item.url}: ${response.status} ${response.statusText}`);
2155
+ return null;
2156
+ }
2157
+ const contentType = response.headers.get("content-type") ?? "";
2158
+ if (!contentType.includes("text/html")) {
2159
+ return null;
2160
+ }
2161
+ const html = await response.text();
2162
+ if (item.depth < maxDepth) {
2163
+ const links = extractLinksFromHtml(html, item.url, baseOrigin);
2164
+ for (const link of links) {
2165
+ if (!visited.has(link) && !isExcluded(link, exclude)) {
2166
+ visited.add(link);
2167
+ queue.push({ url: link, depth: item.depth + 1 });
2168
+ }
2169
+ }
2170
+ }
2171
+ return {
2172
+ url: item.url,
2173
+ html,
2174
+ sourcePath: fullUrl,
2175
+ outgoingLinks: []
2176
+ };
2177
+ })
2178
+ )
2179
+ );
2180
+ for (const result of results) {
2181
+ if (result.status === "fulfilled" && result.value) {
2182
+ pages.push(result.value);
2183
+ }
2184
+ }
2185
+ }
2186
+ if (pages.length >= effectiveMax && queue.length > 0) {
2187
+ logger.warn(`Discovery crawl reached maxPages limit (${effectiveMax}), ${queue.length} URLs not visited.`);
2188
+ }
2189
+ logger.event("build_discover_complete", {
2190
+ pagesFound: pages.length,
2191
+ urlsVisited: visited.size,
2192
+ urlsSkipped: queue.length
2193
+ });
2194
+ return pages;
2195
+ }
2006
2196
  async function loadBuildPages(cwd, config, maxPages) {
2007
2197
  const buildConfig = config.source.build;
2008
2198
  if (!buildConfig) {
2009
2199
  throw new Error("build source config is missing");
2010
2200
  }
2201
+ if (buildConfig.discover) {
2202
+ const server2 = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
2203
+ try {
2204
+ return await discoverPages(server2, buildConfig, maxPages);
2205
+ } finally {
2206
+ await server2.shutdown();
2207
+ }
2208
+ }
2011
2209
  const routes = await parseManifest(cwd, buildConfig.outputDir);
2012
2210
  const expanded = expandRoutes(routes, buildConfig.paramValues, buildConfig.exclude, logger);
2013
2211
  logger.event("build_routes_discovered", {
@@ -2111,11 +2309,11 @@ async function loadContentFilesPages(cwd, config, maxPages) {
2111
2309
 
2112
2310
  // src/indexing/sources/crawl.ts
2113
2311
  import { gunzipSync } from "zlib";
2114
- import { load as cheerioLoad } from "cheerio";
2312
+ import { load as cheerioLoad2 } from "cheerio";
2115
2313
  import pLimit3 from "p-limit";
2116
2314
  var logger2 = new Logger();
2117
2315
  function extractLocs(xml) {
2118
- const $ = cheerioLoad(xml, { xmlMode: true });
2316
+ const $ = cheerioLoad2(xml, { xmlMode: true });
2119
2317
  const locs = [];
2120
2318
  $("loc").each((_i, el) => {
2121
2319
  const text = $(el).text().trim();
@@ -2126,7 +2324,7 @@ function extractLocs(xml) {
2126
2324
  return locs;
2127
2325
  }
2128
2326
  function isSitemapIndex(xml) {
2129
- const $ = cheerioLoad(xml, { xmlMode: true });
2327
+ const $ = cheerioLoad2(xml, { xmlMode: true });
2130
2328
  return $("sitemapindex").length > 0;
2131
2329
  }
2132
2330
  async function fetchSitemapXml(url) {
@@ -2264,9 +2462,7 @@ function hrTimeMs(start) {
2264
2462
 
2265
2463
  // src/indexing/pipeline.ts
2266
2464
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
2267
- "text-embedding-3-small": 2e-5,
2268
- "text-embedding-3-large": 13e-5,
2269
- "text-embedding-ada-002": 1e-4
2465
+ "jina-embeddings-v3": 2e-5
2270
2466
  };
2271
2467
  var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
2272
2468
  var IndexPipeline = class _IndexPipeline {
@@ -2312,9 +2508,15 @@ var IndexPipeline = class _IndexPipeline {
2312
2508
  };
2313
2509
  const scope = resolveScope(this.config, options.scopeOverride);
2314
2510
  const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
2511
+ const sourceMode = options.sourceOverride ?? this.config.source.mode;
2512
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
2315
2513
  if (options.force) {
2514
+ this.logger.info("Force mode enabled \u2014 full rebuild");
2316
2515
  await cleanMirrorForScope(statePath, scope);
2317
2516
  }
2517
+ if (options.dryRun) {
2518
+ this.logger.info("Dry run \u2014 no writes will be performed");
2519
+ }
2318
2520
  const manifestStart = stageStart();
2319
2521
  const existingHashes = await this.vectorStore.getContentHashes(scope);
2320
2522
  const existingModelId = await this.vectorStore.getScopeModelId(scope);
@@ -2325,8 +2527,9 @@ var IndexPipeline = class _IndexPipeline {
2325
2527
  );
2326
2528
  }
2327
2529
  stageEnd("manifest", manifestStart);
2530
+ this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
2328
2531
  const sourceStart = stageStart();
2329
- const sourceMode = options.sourceOverride ?? this.config.source.mode;
2532
+ this.logger.info(`Loading pages (source: ${sourceMode})...`);
2330
2533
  let sourcePages;
2331
2534
  if (sourceMode === "static-output") {
2332
2535
  sourcePages = await loadStaticOutputPages(this.cwd, this.config, options.maxPages);
@@ -2338,10 +2541,13 @@ var IndexPipeline = class _IndexPipeline {
2338
2541
  sourcePages = await loadContentFilesPages(this.cwd, this.config, options.maxPages);
2339
2542
  }
2340
2543
  stageEnd("source", sourceStart);
2544
+ this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
2341
2545
  const routeStart = stageStart();
2342
2546
  const routePatterns = await buildRoutePatterns(this.cwd);
2343
2547
  stageEnd("route_map", routeStart);
2548
+ this.logger.debug(`Route mapping: ${routePatterns.length} pattern${routePatterns.length === 1 ? "" : "s"} discovered (${stageTimingsMs["route_map"]}ms)`);
2344
2549
  const extractStart = stageStart();
2550
+ this.logger.info("Extracting content...");
2345
2551
  const extractedPages = [];
2346
2552
  for (const sourcePage of sourcePages) {
2347
2553
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
@@ -2370,6 +2576,8 @@ var IndexPipeline = class _IndexPipeline {
2370
2576
  uniquePages.push(page);
2371
2577
  }
2372
2578
  stageEnd("extract", extractStart);
2579
+ const skippedPages = sourcePages.length - uniquePages.length;
2580
+ this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
2373
2581
  const linkStart = stageStart();
2374
2582
  const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
2375
2583
  const incomingLinkCount = /* @__PURE__ */ new Map();
@@ -2385,7 +2593,9 @@ var IndexPipeline = class _IndexPipeline {
2385
2593
  }
2386
2594
  }
2387
2595
  stageEnd("links", linkStart);
2596
+ this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
2388
2597
  const mirrorStart = stageStart();
2598
+ this.logger.info("Writing mirror pages...");
2389
2599
  const mirrorPages = [];
2390
2600
  let routeExact = 0;
2391
2601
  let routeBestEffort = 0;
@@ -2455,7 +2665,9 @@ var IndexPipeline = class _IndexPipeline {
2455
2665
  await this.vectorStore.upsertPages(pageRecords, scope);
2456
2666
  }
2457
2667
  stageEnd("mirror", mirrorStart);
2668
+ this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
2458
2669
  const chunkStart = stageStart();
2670
+ this.logger.info("Chunking pages...");
2459
2671
  let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
2460
2672
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
2461
2673
  if (typeof maxChunks === "number") {
@@ -2468,6 +2680,7 @@ var IndexPipeline = class _IndexPipeline {
2468
2680
  });
2469
2681
  }
2470
2682
  stageEnd("chunk", chunkStart);
2683
+ this.logger.info(`Chunked into ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} (${stageTimingsMs["chunk"]}ms)`);
2471
2684
  const currentChunkMap = /* @__PURE__ */ new Map();
2472
2685
  for (const chunk of chunks) {
2473
2686
  currentChunkMap.set(chunk.chunkKey, chunk);
@@ -2486,6 +2699,7 @@ var IndexPipeline = class _IndexPipeline {
2486
2699
  return existingHash !== chunk.contentHash;
2487
2700
  });
2488
2701
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
2702
+ this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
2489
2703
  const embedStart = stageStart();
2490
2704
  const chunkTokenEstimates = /* @__PURE__ */ new Map();
2491
2705
  for (const chunk of changedChunks) {
@@ -2500,9 +2714,11 @@ var IndexPipeline = class _IndexPipeline {
2500
2714
  let newEmbeddings = 0;
2501
2715
  const vectorsByChunk = /* @__PURE__ */ new Map();
2502
2716
  if (!options.dryRun && changedChunks.length > 0) {
2717
+ this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
2503
2718
  const embeddings = await this.embeddings.embedTexts(
2504
2719
  changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
2505
- this.config.embeddings.model
2720
+ this.config.embeddings.model,
2721
+ "retrieval.passage"
2506
2722
  );
2507
2723
  if (embeddings.length !== changedChunks.length) {
2508
2724
  throw new SearchSocketError(
@@ -2525,8 +2741,14 @@ var IndexPipeline = class _IndexPipeline {
2525
2741
  }
2526
2742
  }
2527
2743
  stageEnd("embedding", embedStart);
2744
+ if (changedChunks.length > 0) {
2745
+ this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
2746
+ } else {
2747
+ this.logger.info("No chunks to embed \u2014 all up to date");
2748
+ }
2528
2749
  const syncStart = stageStart();
2529
2750
  if (!options.dryRun) {
2751
+ this.logger.info("Syncing vectors...");
2530
2752
  const upserts = [];
2531
2753
  for (const chunk of changedChunks) {
2532
2754
  const vector = vectorsByChunk.get(chunk.chunkKey);
@@ -2545,6 +2767,8 @@ var IndexPipeline = class _IndexPipeline {
2545
2767
  sectionTitle: chunk.sectionTitle ?? "",
2546
2768
  headingPath: chunk.headingPath,
2547
2769
  snippet: chunk.snippet,
2770
+ chunkText: chunk.chunkText.slice(0, 4e3),
2771
+ ordinal: chunk.ordinal,
2548
2772
  contentHash: chunk.contentHash,
2549
2773
  modelId: this.config.embeddings.model,
2550
2774
  depth: chunk.depth,
@@ -2564,6 +2788,7 @@ var IndexPipeline = class _IndexPipeline {
2564
2788
  }
2565
2789
  }
2566
2790
  stageEnd("sync", syncStart);
2791
+ this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
2567
2792
  const finalizeStart = stageStart();
2568
2793
  if (!options.dryRun) {
2569
2794
  const scopeInfo = {
@@ -2583,6 +2808,7 @@ var IndexPipeline = class _IndexPipeline {
2583
2808
  });
2584
2809
  }
2585
2810
  stageEnd("finalize", finalizeStart);
2811
+ this.logger.info("Done.");
2586
2812
  return {
2587
2813
  pagesProcessed: mirrorPages.length,
2588
2814
  chunksTotal: chunks.length,
@@ -2692,20 +2918,17 @@ var JinaReranker = class {
2692
2918
 
2693
2919
  // src/rerank/factory.ts
2694
2920
  function createReranker(config) {
2695
- if (config.rerank.provider === "none") {
2921
+ if (!config.rerank.enabled) {
2696
2922
  return null;
2697
2923
  }
2698
- if (config.rerank.provider === "jina") {
2699
- const apiKey = process.env[config.rerank.jina.apiKeyEnv];
2700
- if (!apiKey) {
2701
- return null;
2702
- }
2703
- return new JinaReranker({
2704
- apiKey,
2705
- model: config.rerank.jina.model
2706
- });
2924
+ const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
2925
+ if (!apiKey) {
2926
+ return null;
2707
2927
  }
2708
- return null;
2928
+ return new JinaReranker({
2929
+ apiKey,
2930
+ model: config.rerank.model
2931
+ });
2709
2932
  }
2710
2933
 
2711
2934
  // src/search/ranking.ts
@@ -2853,7 +3076,7 @@ var SearchEngine = class _SearchEngine {
2853
3076
  const groupByPage = (input.groupBy ?? "page") === "page";
2854
3077
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
2855
3078
  const embedStart = process.hrtime.bigint();
2856
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model);
3079
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
2857
3080
  const queryVector = queryEmbeddings[0];
2858
3081
  if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
2859
3082
  throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
@@ -2881,13 +3104,17 @@ var SearchEngine = class _SearchEngine {
2881
3104
  usedRerank = true;
2882
3105
  }
2883
3106
  let results;
3107
+ const minScore = this.config.ranking.minScore;
2884
3108
  if (groupByPage) {
2885
- const pages = aggregateByPage(ordered, this.config);
3109
+ let pages = aggregateByPage(ordered, this.config);
3110
+ if (minScore > 0) {
3111
+ pages = pages.filter((p) => p.pageScore >= minScore);
3112
+ }
2886
3113
  const minRatio = this.config.ranking.minChunkScoreRatio;
2887
3114
  results = pages.slice(0, topK).map((page) => {
2888
3115
  const bestScore = page.bestChunk.finalScore;
2889
- const minScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
2890
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore).slice(0, 5);
3116
+ const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
3117
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
2891
3118
  return {
2892
3119
  url: page.url,
2893
3120
  title: page.title,
@@ -2904,6 +3131,9 @@ var SearchEngine = class _SearchEngine {
2904
3131
  };
2905
3132
  });
2906
3133
  } else {
3134
+ if (minScore > 0) {
3135
+ ordered = ordered.filter((entry) => entry.finalScore >= minScore);
3136
+ }
2907
3137
  results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
2908
3138
  url: hit.metadata.url,
2909
3139
  title: hit.metadata.title,
@@ -2975,43 +3205,54 @@ var SearchEngine = class _SearchEngine {
2975
3205
  }
2976
3206
  }
2977
3207
  async rerankHits(query, ranked, topK) {
2978
- if (this.config.rerank.provider !== "jina") {
3208
+ if (!this.config.rerank.enabled) {
2979
3209
  throw new SearchSocketError(
2980
3210
  "INVALID_REQUEST",
2981
- "rerank=true requested but rerank.provider is not configured as 'jina'.",
3211
+ "rerank=true requested but rerank.enabled is not set to true.",
2982
3212
  400
2983
3213
  );
2984
3214
  }
2985
3215
  if (!this.reranker) {
2986
3216
  throw new SearchSocketError(
2987
3217
  "CONFIG_MISSING",
2988
- `rerank=true requested but ${this.config.rerank.jina.apiKeyEnv} is not set.`,
3218
+ `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
2989
3219
  400
2990
3220
  );
2991
3221
  }
2992
- const candidates = ranked.map(({ hit }) => ({
2993
- id: hit.id,
2994
- text: [hit.metadata.title, hit.metadata.sectionTitle, hit.metadata.snippet].filter(Boolean).join("\n")
2995
- }));
3222
+ const pageGroups = /* @__PURE__ */ new Map();
3223
+ for (const entry of ranked) {
3224
+ const url = entry.hit.metadata.url;
3225
+ const group = pageGroups.get(url);
3226
+ if (group) group.push(entry);
3227
+ else pageGroups.set(url, [entry]);
3228
+ }
3229
+ const pageCandidates = [];
3230
+ for (const [url, chunks] of pageGroups) {
3231
+ const sorted = [...chunks].sort(
3232
+ (a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0)
3233
+ );
3234
+ const title = sorted[0].hit.metadata.title;
3235
+ const body = sorted.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
3236
+ pageCandidates.push({ id: url, text: `${title}
3237
+
3238
+ ${body}` });
3239
+ }
2996
3240
  const reranked = await this.reranker.rerank(
2997
3241
  query,
2998
- candidates,
3242
+ pageCandidates,
2999
3243
  Math.max(topK, this.config.rerank.topN)
3000
3244
  );
3001
- const rerankScoreById = new Map(reranked.map((entry) => [entry.id, entry.score]));
3245
+ const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
3002
3246
  return ranked.map((entry) => {
3003
- const rerankScore = rerankScoreById.get(entry.hit.id);
3004
- const safeBaseScore = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
3005
- if (rerankScore === void 0 || !Number.isFinite(rerankScore)) {
3006
- return {
3007
- ...entry,
3008
- finalScore: safeBaseScore
3009
- };
3247
+ const pageScore = scoreByUrl.get(entry.hit.metadata.url);
3248
+ const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
3249
+ if (pageScore === void 0 || !Number.isFinite(pageScore)) {
3250
+ return { ...entry, finalScore: base };
3010
3251
  }
3011
- const combinedScore = rerankScore * this.config.ranking.weights.rerank + safeBaseScore * 1e-3;
3252
+ const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
3012
3253
  return {
3013
3254
  ...entry,
3014
- finalScore: Number.isFinite(combinedScore) ? combinedScore : safeBaseScore
3255
+ finalScore: Number.isFinite(combined) ? combined : base
3015
3256
  };
3016
3257
  }).sort((a, b) => {
3017
3258
  const delta = b.finalScore - a.finalScore;
@@ -3331,6 +3572,7 @@ function getRootOptions(command) {
3331
3572
  }
3332
3573
  async function runIndexCommand(opts) {
3333
3574
  const logger3 = new Logger({
3575
+ quiet: opts.quiet,
3334
3576
  verbose: opts.verbose,
3335
3577
  json: opts.json
3336
3578
  });
@@ -3354,7 +3596,9 @@ async function runIndexCommand(opts) {
3354
3596
  `);
3355
3597
  return;
3356
3598
  }
3357
- printIndexSummary(stats);
3599
+ if (!opts.quiet) {
3600
+ printIndexSummary(stats);
3601
+ }
3358
3602
  }
3359
3603
  var program = new Command();
3360
3604
  program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
@@ -3378,7 +3622,7 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
3378
3622
  process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
3379
3623
  process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
3380
3624
  });
3381
- program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
3625
+ program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
3382
3626
  const rootOpts = getRootOptions(command);
3383
3627
  const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3384
3628
  await runIndexCommand({
@@ -3391,6 +3635,7 @@ program.command("index").description("Index site content into markdown mirror +
3391
3635
  source: opts.source,
3392
3636
  maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
3393
3637
  maxChunks: opts.maxChunks ? parsePositiveInt(opts.maxChunks, "--max-chunks") : void 0,
3638
+ quiet: opts.quiet,
3394
3639
  verbose: opts.verbose,
3395
3640
  json: opts.json
3396
3641
  });
@@ -3553,8 +3798,8 @@ program.command("clean").description("Delete local state and optionally delete r
3553
3798
  `);
3554
3799
  if (opts.remote) {
3555
3800
  const vectorStore = await createVectorStore(config, cwd);
3556
- await vectorStore.deleteScope(scope);
3557
- process.stdout.write(`deleted remote vectors for scope ${scope.scopeName}
3801
+ await vectorStore.dropAllTables();
3802
+ process.stdout.write(`dropped all remote tables (chunks, registry, pages)
3558
3803
  `);
3559
3804
  }
3560
3805
  });
@@ -3679,14 +3924,6 @@ program.command("doctor").description("Validate config, env vars, provider conne
3679
3924
  details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
3680
3925
  });
3681
3926
  }
3682
- if (config.rerank.provider === "jina") {
3683
- const jinaKey = process.env[config.rerank.jina.apiKeyEnv];
3684
- checks.push({
3685
- name: `env ${config.rerank.jina.apiKeyEnv}`,
3686
- ok: Boolean(jinaKey),
3687
- details: jinaKey ? void 0 : "missing"
3688
- });
3689
- }
3690
3927
  if (config.source.mode === "static-output") {
3691
3928
  const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
3692
3929
  const exists = fs9.existsSync(outputDir);