searchsocket 0.3.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,9 +1,9 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/cli.ts
4
- import fs9 from "fs";
4
+ import fs8 from "fs";
5
5
  import fsp from "fs/promises";
6
- import path13 from "path";
6
+ import path12 from "path";
7
7
  import { execSync as execSync2 } from "child_process";
8
8
  import { config as dotenvConfig } from "dotenv";
9
9
  import chokidar from "chokidar";
@@ -12,7 +12,7 @@ import { Command } from "commander";
12
12
  // package.json
13
13
  var package_default = {
14
14
  name: "searchsocket",
15
- version: "0.3.3",
15
+ version: "0.5.0",
16
16
  description: "Semantic site search and MCP retrieval for SvelteKit static sites",
17
17
  license: "MIT",
18
18
  author: "Greg Priday <greg@siteorigin.com>",
@@ -58,6 +58,11 @@ var package_default = {
58
58
  types: "./dist/client.d.ts",
59
59
  import: "./dist/client.js",
60
60
  require: "./dist/client.cjs"
61
+ },
62
+ "./scroll": {
63
+ types: "./dist/scroll.d.ts",
64
+ import: "./dist/scroll.js",
65
+ require: "./dist/scroll.cjs"
61
66
  }
62
67
  },
63
68
  scripts: {
@@ -65,15 +70,16 @@ var package_default = {
65
70
  clean: "rm -rf dist",
66
71
  typecheck: "tsc --noEmit",
67
72
  test: "vitest run",
68
- "test:watch": "vitest"
73
+ "test:watch": "vitest",
74
+ "test:quality": "SEARCHSOCKET_QUALITY_TESTS=1 vitest run tests/quality.test.ts"
69
75
  },
70
76
  engines: {
71
77
  node: ">=20"
72
78
  },
73
79
  packageManager: "pnpm@10.29.2",
74
80
  dependencies: {
75
- "@libsql/client": "^0.17.0",
76
81
  "@modelcontextprotocol/sdk": "^1.26.0",
82
+ "@upstash/search": "^0.1.7",
77
83
  cheerio: "^1.2.0",
78
84
  chokidar: "^5.0.0",
79
85
  commander: "^14.0.3",
@@ -91,6 +97,7 @@ var package_default = {
91
97
  "@types/express": "^5.0.6",
92
98
  "@types/node": "^25.2.2",
93
99
  "@types/turndown": "^5.0.6",
100
+ jsdom: "^28.1.0",
94
101
  tsup: "^8.5.1",
95
102
  typescript: "^5.9.3",
96
103
  vitest: "^4.0.18"
@@ -115,6 +122,8 @@ var searchSocketConfigSchema = z.object({
115
122
  envVar: z.string().min(1).optional(),
116
123
  sanitize: z.boolean().optional()
117
124
  }).optional(),
125
+ exclude: z.array(z.string()).optional(),
126
+ respectRobotsTxt: z.boolean().optional(),
118
127
  source: z.object({
119
128
  mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
120
129
  staticOutputDir: z.string().min(1).optional(),
@@ -162,29 +171,18 @@ var searchSocketConfigSchema = z.object({
162
171
  prependTitle: z.boolean().optional(),
163
172
  pageSummaryChunk: z.boolean().optional()
164
173
  }).optional(),
165
- embeddings: z.object({
166
- provider: z.literal("jina").optional(),
167
- model: z.string().min(1).optional(),
168
- apiKey: z.string().min(1).optional(),
169
- apiKeyEnv: z.string().min(1).optional(),
170
- batchSize: z.number().int().positive().optional(),
171
- concurrency: z.number().int().positive().optional(),
172
- pricePer1kTokens: z.number().positive().optional()
173
- }).optional(),
174
- vector: z.object({
175
- dimension: z.number().int().positive().optional(),
176
- turso: z.object({
177
- url: z.string().url().optional(),
178
- authToken: z.string().min(1).optional(),
179
- urlEnv: z.string().optional(),
180
- authTokenEnv: z.string().optional(),
181
- localPath: z.string().optional()
182
- }).optional()
174
+ upstash: z.object({
175
+ url: z.string().url().optional(),
176
+ token: z.string().min(1).optional(),
177
+ urlEnv: z.string().min(1).optional(),
178
+ tokenEnv: z.string().min(1).optional()
183
179
  }).optional(),
184
- rerank: z.object({
185
- enabled: z.boolean().optional(),
186
- topN: z.number().int().positive().optional(),
187
- model: z.string().optional()
180
+ search: z.object({
181
+ semanticWeight: z.number().min(0).max(1).optional(),
182
+ inputEnrichment: z.boolean().optional(),
183
+ reranking: z.boolean().optional(),
184
+ dualSearch: z.boolean().optional(),
185
+ pageSearchWeight: z.number().min(0).max(1).optional()
188
186
  }).optional(),
189
187
  ranking: z.object({
190
188
  enableIncomingLinkBoost: z.boolean().optional(),
@@ -194,11 +192,12 @@ var searchSocketConfigSchema = z.object({
194
192
  aggregationDecay: z.number().min(0).max(1).optional(),
195
193
  minChunkScoreRatio: z.number().min(0).max(1).optional(),
196
194
  minScore: z.number().min(0).max(1).optional(),
195
+ scoreGapThreshold: z.number().min(0).max(1).optional(),
197
196
  weights: z.object({
198
197
  incomingLinks: z.number().optional(),
199
198
  depth: z.number().optional(),
200
- rerank: z.number().optional(),
201
- aggregation: z.number().optional()
199
+ aggregation: z.number().optional(),
200
+ titleMatch: z.number().optional()
202
201
  }).optional()
203
202
  }).optional(),
204
203
  api: z.object({
@@ -220,8 +219,7 @@ var searchSocketConfigSchema = z.object({
220
219
  }).optional()
221
220
  }).optional(),
222
221
  state: z.object({
223
- dir: z.string().optional(),
224
- writeMirror: z.boolean().optional()
222
+ dir: z.string().optional()
225
223
  }).optional()
226
224
  });
227
225
 
@@ -245,6 +243,8 @@ function createDefaultConfig(projectId) {
245
243
  envVar: "SEARCHSOCKET_SCOPE",
246
244
  sanitize: true
247
245
  },
246
+ exclude: [],
247
+ respectRobotsTxt: true,
248
248
  source: {
249
249
  mode: "static-output",
250
250
  staticOutputDir: "build",
@@ -273,24 +273,16 @@ function createDefaultConfig(projectId) {
273
273
  prependTitle: true,
274
274
  pageSummaryChunk: true
275
275
  },
276
- embeddings: {
277
- provider: "jina",
278
- model: "jina-embeddings-v3",
279
- apiKeyEnv: "JINA_API_KEY",
280
- batchSize: 64,
281
- concurrency: 4
276
+ upstash: {
277
+ urlEnv: "UPSTASH_SEARCH_REST_URL",
278
+ tokenEnv: "UPSTASH_SEARCH_REST_TOKEN"
282
279
  },
283
- vector: {
284
- turso: {
285
- urlEnv: "TURSO_DATABASE_URL",
286
- authTokenEnv: "TURSO_AUTH_TOKEN",
287
- localPath: ".searchsocket/vectors.db"
288
- }
289
- },
290
- rerank: {
291
- enabled: false,
292
- topN: 20,
293
- model: "jina-reranker-v2-base-multilingual"
280
+ search: {
281
+ semanticWeight: 0.75,
282
+ inputEnrichment: true,
283
+ reranking: true,
284
+ dualSearch: true,
285
+ pageSearchWeight: 0.3
294
286
  },
295
287
  ranking: {
296
288
  enableIncomingLinkBoost: true,
@@ -299,12 +291,13 @@ function createDefaultConfig(projectId) {
299
291
  aggregationCap: 5,
300
292
  aggregationDecay: 0.5,
301
293
  minChunkScoreRatio: 0.5,
302
- minScore: 0,
294
+ minScore: 0.3,
295
+ scoreGapThreshold: 0.4,
303
296
  weights: {
304
297
  incomingLinks: 0.05,
305
298
  depth: 0.03,
306
- rerank: 1,
307
- aggregation: 0.1
299
+ aggregation: 0.1,
300
+ titleMatch: 0.15
308
301
  }
309
302
  },
310
303
  api: {
@@ -322,8 +315,7 @@ function createDefaultConfig(projectId) {
322
315
  }
323
316
  },
324
317
  state: {
325
- dir: ".searchsocket",
326
- writeMirror: false
318
+ dir: ".searchsocket"
327
319
  }
328
320
  };
329
321
  }
@@ -393,6 +385,8 @@ ${issues}`
393
385
  ...defaults.scope,
394
386
  ...parsed.scope
395
387
  },
388
+ exclude: parsed.exclude ?? defaults.exclude,
389
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
396
390
  source: {
397
391
  ...defaults.source,
398
392
  ...parsed.source,
@@ -429,21 +423,13 @@ ${issues}`
429
423
  ...defaults.chunking,
430
424
  ...parsed.chunking
431
425
  },
432
- embeddings: {
433
- ...defaults.embeddings,
434
- ...parsed.embeddings
435
- },
436
- vector: {
437
- ...defaults.vector,
438
- ...parsed.vector,
439
- turso: {
440
- ...defaults.vector.turso,
441
- ...parsed.vector?.turso
442
- }
426
+ upstash: {
427
+ ...defaults.upstash,
428
+ ...parsed.upstash
443
429
  },
444
- rerank: {
445
- ...defaults.rerank,
446
- ...parsed.rerank
430
+ search: {
431
+ ...defaults.search,
432
+ ...parsed.search
447
433
  },
448
434
  ranking: {
449
435
  ...defaults.ranking,
@@ -535,7 +521,8 @@ function writeMinimalConfig(cwd) {
535
521
  return target;
536
522
  }
537
523
  const content = `export default {
538
- embeddings: { apiKeyEnv: "JINA_API_KEY" }
524
+ // Upstash Search credentials (set via env vars or directly here)
525
+ // upstash: { urlEnv: "UPSTASH_SEARCH_REST_URL", tokenEnv: "UPSTASH_SEARCH_REST_TOKEN" }
539
526
  };
540
527
  `;
541
528
  fs.writeFileSync(target, content, "utf8");
@@ -698,576 +685,246 @@ import fs2 from "fs";
698
685
  import path2 from "path";
699
686
  function ensureStateDirs(cwd, stateDir, scope) {
700
687
  const statePath = path2.resolve(cwd, stateDir);
701
- const pagesPath = path2.join(statePath, "pages", scope.scopeName);
702
- fs2.mkdirSync(pagesPath, { recursive: true });
703
- return { statePath, pagesPath };
704
- }
705
-
706
- // src/embeddings/jina.ts
707
- import pLimit from "p-limit";
708
- function sleep(ms) {
709
- return new Promise((resolve) => {
710
- setTimeout(resolve, ms);
711
- });
712
- }
713
- var JinaEmbeddingsProvider = class {
714
- apiKey;
715
- batchSize;
716
- concurrency;
717
- defaultTask;
718
- constructor(options) {
719
- if (!Number.isInteger(options.batchSize) || options.batchSize <= 0) {
720
- throw new Error(`Invalid batchSize: ${options.batchSize}. batchSize must be a positive integer.`);
721
- }
722
- if (!Number.isInteger(options.concurrency) || options.concurrency <= 0) {
723
- throw new Error(`Invalid concurrency: ${options.concurrency}. concurrency must be a positive integer.`);
724
- }
725
- this.apiKey = options.apiKey;
726
- this.batchSize = options.batchSize;
727
- this.concurrency = options.concurrency;
728
- this.defaultTask = options.task ?? "retrieval.passage";
729
- }
730
- estimateTokens(text) {
731
- const normalized = text.trim();
732
- if (!normalized) {
733
- return 0;
734
- }
735
- const wordCount = normalized.match(/[A-Za-z0-9_]+/g)?.length ?? 0;
736
- const punctuationCount = normalized.match(/[^\s\w]/g)?.length ?? 0;
737
- const cjkCount = normalized.match(/[\u3400-\u9fff]/g)?.length ?? 0;
738
- const charEstimate = Math.ceil(normalized.length / 4);
739
- const lexicalEstimate = Math.ceil(wordCount * 1.25 + punctuationCount * 0.45 + cjkCount * 1.6);
740
- return Math.max(1, Math.max(charEstimate, lexicalEstimate));
741
- }
742
- async embedTexts(texts, modelId, task) {
743
- if (texts.length === 0) {
744
- return [];
745
- }
746
- const batches = [];
747
- for (let i = 0; i < texts.length; i += this.batchSize) {
748
- batches.push({
749
- index: i,
750
- values: texts.slice(i, i + this.batchSize)
751
- });
752
- }
753
- const outputs = new Array(batches.length);
754
- const limit = pLimit(this.concurrency);
755
- await Promise.all(
756
- batches.map(
757
- (batch, position) => limit(async () => {
758
- outputs[position] = await this.embedWithRetry(batch.values, modelId, task ?? this.defaultTask);
759
- })
760
- )
761
- );
762
- return outputs.flat();
763
- }
764
- async embedWithRetry(texts, modelId, task) {
765
- const maxAttempts = 5;
766
- let attempt = 0;
767
- while (attempt < maxAttempts) {
768
- attempt += 1;
769
- let response;
770
- try {
771
- response = await fetch("https://api.jina.ai/v1/embeddings", {
772
- method: "POST",
773
- headers: {
774
- "content-type": "application/json",
775
- authorization: `Bearer ${this.apiKey}`
776
- },
777
- body: JSON.stringify({
778
- model: modelId,
779
- input: texts,
780
- task
781
- })
782
- });
783
- } catch (error) {
784
- if (attempt >= maxAttempts) {
785
- throw error;
786
- }
787
- await sleep(Math.min(2 ** attempt * 300, 5e3));
788
- continue;
789
- }
790
- if (!response.ok) {
791
- const retryable = response.status === 429 || response.status >= 500;
792
- if (!retryable || attempt >= maxAttempts) {
793
- const errorBody = await response.text();
794
- throw new Error(`Jina embeddings failed (${response.status}): ${errorBody}`);
795
- }
796
- await sleep(Math.min(2 ** attempt * 300, 5e3));
797
- continue;
798
- }
799
- const payload = await response.json();
800
- if (!payload.data || !Array.isArray(payload.data)) {
801
- throw new Error("Invalid Jina embeddings response format");
802
- }
803
- return payload.data.map((entry) => entry.embedding);
804
- }
805
- throw new Error("Unreachable retry state");
806
- }
807
- };
808
-
809
- // src/embeddings/factory.ts
810
- function createEmbeddingsProvider(config) {
811
- if (config.embeddings.provider !== "jina") {
812
- throw new SearchSocketError(
813
- "CONFIG_MISSING",
814
- `Unsupported embeddings provider ${config.embeddings.provider}`
815
- );
816
- }
817
- const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
818
- if (!apiKey) {
819
- throw new SearchSocketError(
820
- "CONFIG_MISSING",
821
- `Missing embeddings API key: provide embeddings.apiKey or set env var ${config.embeddings.apiKeyEnv}`
822
- );
823
- }
824
- return new JinaEmbeddingsProvider({
825
- apiKey,
826
- batchSize: config.embeddings.batchSize,
827
- concurrency: config.embeddings.concurrency
828
- });
688
+ fs2.mkdirSync(statePath, { recursive: true });
689
+ return { statePath };
829
690
  }
830
691
 
831
692
  // src/indexing/pipeline.ts
832
- import path11 from "path";
833
-
834
- // src/vector/factory.ts
835
- import fs3 from "fs";
836
- import path3 from "path";
693
+ import path10 from "path";
837
694
 
838
- // src/core/serverless.ts
839
- function isServerless() {
840
- return !!(process.env.VERCEL || process.env.NETLIFY || process.env.AWS_LAMBDA_FUNCTION_NAME || process.env.FUNCTIONS_WORKER || process.env.CF_PAGES);
695
+ // src/vector/upstash.ts
696
+ function chunkIndexName(scope) {
697
+ return `${scope.projectId}--${scope.scopeName}`;
841
698
  }
842
-
843
- // src/vector/turso.ts
844
- var TursoVectorStore = class {
699
+ function pageIndexName(scope) {
700
+ return `${scope.projectId}--${scope.scopeName}--pages`;
701
+ }
702
+ var UpstashSearchStore = class {
845
703
  client;
846
- dimension;
847
- chunksReady = false;
848
- registryReady = false;
849
- pagesReady = false;
850
704
  constructor(opts) {
851
705
  this.client = opts.client;
852
- this.dimension = opts.dimension;
853
- }
854
- async ensureRegistry() {
855
- if (this.registryReady) return;
856
- await this.client.execute(`
857
- CREATE TABLE IF NOT EXISTS registry (
858
- scope_key TEXT PRIMARY KEY,
859
- project_id TEXT NOT NULL,
860
- scope_name TEXT NOT NULL,
861
- model_id TEXT NOT NULL,
862
- last_indexed_at TEXT NOT NULL,
863
- vector_count INTEGER,
864
- last_estimate_tokens INTEGER,
865
- last_estimate_cost_usd REAL,
866
- last_estimate_changed_chunks INTEGER
867
- )
868
- `);
869
- const estimateCols = [
870
- { name: "last_estimate_tokens", def: "INTEGER" },
871
- { name: "last_estimate_cost_usd", def: "REAL" },
872
- { name: "last_estimate_changed_chunks", def: "INTEGER" }
873
- ];
874
- for (const col of estimateCols) {
875
- try {
876
- await this.client.execute(`ALTER TABLE registry ADD COLUMN ${col.name} ${col.def}`);
877
- } catch (error) {
878
- if (error instanceof Error && !error.message.includes("duplicate column")) {
879
- throw error;
880
- }
881
- }
882
- }
883
- this.registryReady = true;
884
- }
885
- async ensureChunks(dim) {
886
- if (this.chunksReady) return;
887
- const exists = await this.chunksTableExists();
888
- if (exists) {
889
- const currentDim = await this.getChunksDimension();
890
- if (currentDim !== null && currentDim !== dim) {
891
- await this.client.batch([
892
- "DROP INDEX IF EXISTS idx",
893
- "DROP TABLE IF EXISTS chunks"
894
- ]);
895
- }
896
- }
897
- await this.client.batch([
898
- `CREATE TABLE IF NOT EXISTS chunks (
899
- id TEXT PRIMARY KEY,
900
- project_id TEXT NOT NULL,
901
- scope_name TEXT NOT NULL,
902
- url TEXT NOT NULL,
903
- path TEXT NOT NULL,
904
- title TEXT NOT NULL,
905
- section_title TEXT NOT NULL DEFAULT '',
906
- heading_path TEXT NOT NULL DEFAULT '[]',
907
- snippet TEXT NOT NULL DEFAULT '',
908
- chunk_text TEXT NOT NULL DEFAULT '',
909
- ordinal INTEGER NOT NULL DEFAULT 0,
910
- content_hash TEXT NOT NULL DEFAULT '',
911
- model_id TEXT NOT NULL DEFAULT '',
912
- depth INTEGER NOT NULL DEFAULT 0,
913
- incoming_links INTEGER NOT NULL DEFAULT 0,
914
- route_file TEXT NOT NULL DEFAULT '',
915
- tags TEXT NOT NULL DEFAULT '[]',
916
- description TEXT NOT NULL DEFAULT '',
917
- keywords TEXT NOT NULL DEFAULT '[]',
918
- embedding F32_BLOB(${dim})
919
- )`,
920
- `CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
921
- ]);
922
- this.chunksReady = true;
923
- }
924
- async ensurePages() {
925
- if (this.pagesReady) return;
926
- await this.client.execute(`
927
- CREATE TABLE IF NOT EXISTS pages (
928
- project_id TEXT NOT NULL,
929
- scope_name TEXT NOT NULL,
930
- url TEXT NOT NULL,
931
- title TEXT NOT NULL,
932
- markdown TEXT NOT NULL,
933
- route_file TEXT NOT NULL DEFAULT '',
934
- route_resolution TEXT NOT NULL DEFAULT 'exact',
935
- incoming_links INTEGER NOT NULL DEFAULT 0,
936
- outgoing_links INTEGER NOT NULL DEFAULT 0,
937
- depth INTEGER NOT NULL DEFAULT 0,
938
- tags TEXT NOT NULL DEFAULT '[]',
939
- indexed_at TEXT NOT NULL,
940
- PRIMARY KEY (project_id, scope_name, url)
941
- )
942
- `);
943
- this.pagesReady = true;
944
706
  }
945
- async chunksTableExists() {
946
- try {
947
- await this.client.execute("SELECT 1 FROM chunks LIMIT 0");
948
- return true;
949
- } catch (error) {
950
- if (error instanceof Error && error.message.includes("no such table")) {
951
- return false;
952
- }
953
- throw error;
954
- }
707
+ chunkIndex(scope) {
708
+ return this.client.index(chunkIndexName(scope));
955
709
  }
956
- /**
957
- * Read the current F32_BLOB dimension from the chunks table schema.
958
- * Returns null if the table doesn't exist or the dimension can't be parsed.
959
- */
960
- async getChunksDimension() {
961
- try {
962
- const rs = await this.client.execute(
963
- "SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'"
964
- );
965
- if (rs.rows.length === 0) return null;
966
- const sql = rs.rows[0].sql;
967
- const match = sql.match(/F32_BLOB\((\d+)\)/i);
968
- return match ? parseInt(match[1], 10) : null;
969
- } catch {
970
- return null;
971
- }
710
+ pageIndex(scope) {
711
+ return this.client.index(pageIndexName(scope));
972
712
  }
973
- /**
974
- * Drop all SearchSocket tables (chunks, registry, pages) and their indexes.
975
- * Used by `clean --remote` for a full reset.
976
- */
977
- async dropAllTables() {
978
- await this.client.batch([
979
- "DROP INDEX IF EXISTS idx",
980
- "DROP TABLE IF EXISTS chunks",
981
- "DROP TABLE IF EXISTS registry",
982
- "DROP TABLE IF EXISTS pages"
983
- ]);
984
- this.chunksReady = false;
985
- this.registryReady = false;
986
- this.pagesReady = false;
987
- }
988
- async upsert(records, _scope) {
989
- if (records.length === 0) return;
990
- const dim = this.dimension ?? records[0].vector.length;
991
- await this.ensureChunks(dim);
713
+ async upsertChunks(chunks, scope) {
714
+ if (chunks.length === 0) return;
715
+ const index = this.chunkIndex(scope);
992
716
  const BATCH_SIZE = 100;
993
- for (let i = 0; i < records.length; i += BATCH_SIZE) {
994
- const batch = records.slice(i, i + BATCH_SIZE);
995
- const stmts = batch.map((r) => ({
996
- sql: `INSERT OR REPLACE INTO chunks
997
- (id, project_id, scope_name, url, path, title, section_title,
998
- heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
999
- incoming_links, route_file, tags, description, keywords, embedding)
1000
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
1001
- args: [
1002
- r.id,
1003
- r.metadata.projectId,
1004
- r.metadata.scopeName,
1005
- r.metadata.url,
1006
- r.metadata.path,
1007
- r.metadata.title,
1008
- r.metadata.sectionTitle,
1009
- JSON.stringify(r.metadata.headingPath),
1010
- r.metadata.snippet,
1011
- r.metadata.chunkText,
1012
- r.metadata.ordinal,
1013
- r.metadata.contentHash,
1014
- r.metadata.modelId,
1015
- r.metadata.depth,
1016
- r.metadata.incomingLinks,
1017
- r.metadata.routeFile,
1018
- JSON.stringify(r.metadata.tags),
1019
- r.metadata.description ?? "",
1020
- JSON.stringify(r.metadata.keywords ?? []),
1021
- JSON.stringify(r.vector)
1022
- ]
1023
- }));
1024
- await this.client.batch(stmts);
1025
- }
1026
- }
1027
- async query(queryVector, opts, scope) {
1028
- const dim = this.dimension ?? queryVector.length;
1029
- await this.ensureChunks(dim);
1030
- const queryJson = JSON.stringify(queryVector);
1031
- const rs = await this.client.execute({
1032
- sql: `SELECT c.id, c.project_id, c.scope_name, c.url, c.path, c.title,
1033
- c.section_title, c.heading_path, c.snippet, c.chunk_text,
1034
- c.ordinal, c.content_hash,
1035
- c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
1036
- c.description, c.keywords,
1037
- vector_distance_cos(c.embedding, vector(?)) AS distance
1038
- FROM vector_top_k('idx', vector(?), ?) AS v
1039
- JOIN chunks AS c ON c.rowid = v.id`,
1040
- args: [queryJson, queryJson, opts.topK]
717
+ for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
718
+ const batch = chunks.slice(i, i + BATCH_SIZE);
719
+ await index.upsert(batch);
720
+ }
721
+ }
722
+ async search(query, opts, scope) {
723
+ const index = this.chunkIndex(scope);
724
+ const results = await index.search({
725
+ query,
726
+ limit: opts.limit,
727
+ semanticWeight: opts.semanticWeight,
728
+ inputEnrichment: opts.inputEnrichment,
729
+ reranking: opts.reranking,
730
+ filter: opts.filter
1041
731
  });
1042
- let hits = [];
1043
- for (const row of rs.rows) {
1044
- const projectId = row.project_id;
1045
- const scopeName = row.scope_name;
1046
- if (projectId !== scope.projectId || scopeName !== scope.scopeName) {
1047
- continue;
1048
- }
1049
- const rowPath = row.path;
1050
- if (opts.pathPrefix) {
1051
- const rawPrefix = opts.pathPrefix.startsWith("/") ? opts.pathPrefix : `/${opts.pathPrefix}`;
1052
- const prefix = rawPrefix.endsWith("/") ? rawPrefix : `${rawPrefix}/`;
1053
- const normalizedPath = rowPath.replace(/\/$/, "");
1054
- const normalizedPrefix = rawPrefix.replace(/\/$/, "");
1055
- if (normalizedPath !== normalizedPrefix && !rowPath.startsWith(prefix)) {
1056
- continue;
1057
- }
1058
- }
1059
- const tags = JSON.parse(row.tags || "[]");
1060
- if (opts.tags && opts.tags.length > 0) {
1061
- if (!opts.tags.every((t) => tags.includes(t))) {
1062
- continue;
1063
- }
732
+ return results.map((doc) => ({
733
+ id: doc.id,
734
+ score: doc.score,
735
+ metadata: {
736
+ projectId: doc.metadata?.projectId ?? "",
737
+ scopeName: doc.metadata?.scopeName ?? "",
738
+ url: doc.content.url,
739
+ path: doc.metadata?.path ?? "",
740
+ title: doc.content.title,
741
+ sectionTitle: doc.content.sectionTitle,
742
+ headingPath: doc.content.headingPath ? doc.content.headingPath.split(" > ").filter(Boolean) : [],
743
+ snippet: doc.metadata?.snippet ?? "",
744
+ chunkText: doc.content.text,
745
+ ordinal: doc.metadata?.ordinal ?? 0,
746
+ contentHash: doc.metadata?.contentHash ?? "",
747
+ depth: doc.metadata?.depth ?? 0,
748
+ incomingLinks: doc.metadata?.incomingLinks ?? 0,
749
+ routeFile: doc.metadata?.routeFile ?? "",
750
+ tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
751
+ description: doc.metadata?.description || void 0,
752
+ keywords: doc.metadata?.keywords ? doc.metadata.keywords.split(",").filter(Boolean) : void 0
1064
753
  }
1065
- const distance = row.distance;
1066
- const score = 1 - distance;
1067
- const description = row.description || void 0;
1068
- const keywords = (() => {
1069
- const raw = row.keywords || "[]";
1070
- const parsed = JSON.parse(raw);
1071
- return parsed.length > 0 ? parsed : void 0;
1072
- })();
1073
- hits.push({
1074
- id: row.id,
1075
- score,
1076
- metadata: {
1077
- projectId,
1078
- scopeName,
1079
- url: row.url,
1080
- path: rowPath,
1081
- title: row.title,
1082
- sectionTitle: row.section_title,
1083
- headingPath: JSON.parse(row.heading_path || "[]"),
1084
- snippet: row.snippet,
1085
- chunkText: row.chunk_text || "",
1086
- ordinal: row.ordinal || 0,
1087
- contentHash: row.content_hash,
1088
- modelId: row.model_id,
1089
- depth: row.depth,
1090
- incomingLinks: row.incoming_links,
1091
- routeFile: row.route_file,
1092
- tags,
1093
- description,
1094
- keywords
1095
- }
754
+ }));
755
+ }
756
+ async searchPages(query, opts, scope) {
757
+ const index = this.pageIndex(scope);
758
+ let results;
759
+ try {
760
+ results = await index.search({
761
+ query,
762
+ limit: opts.limit,
763
+ semanticWeight: opts.semanticWeight,
764
+ inputEnrichment: opts.inputEnrichment,
765
+ reranking: true,
766
+ filter: opts.filter
1096
767
  });
768
+ } catch {
769
+ return [];
1097
770
  }
1098
- hits.sort((a, b) => b.score - a.score);
1099
- return hits;
771
+ return results.map((doc) => ({
772
+ id: doc.id,
773
+ score: doc.score,
774
+ title: doc.content.title,
775
+ url: doc.content.url,
776
+ description: doc.content.description ?? "",
777
+ tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
778
+ depth: doc.metadata?.depth ?? 0,
779
+ incomingLinks: doc.metadata?.incomingLinks ?? 0,
780
+ routeFile: doc.metadata?.routeFile ?? ""
781
+ }));
1100
782
  }
1101
783
  async deleteByIds(ids, scope) {
1102
784
  if (ids.length === 0) return;
785
+ const index = this.chunkIndex(scope);
1103
786
  const BATCH_SIZE = 500;
1104
787
  for (let i = 0; i < ids.length; i += BATCH_SIZE) {
1105
788
  const batch = ids.slice(i, i + BATCH_SIZE);
1106
- const placeholders = batch.map(() => "?").join(", ");
1107
- await this.client.execute({
1108
- sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ? AND id IN (${placeholders})`,
1109
- args: [scope.projectId, scope.scopeName, ...batch]
1110
- });
789
+ await index.delete(batch);
1111
790
  }
1112
791
  }
1113
792
  async deleteScope(scope) {
1114
- await this.ensureRegistry();
1115
793
  try {
1116
- await this.client.execute({
1117
- sql: `DELETE FROM chunks WHERE project_id = ? AND scope_name = ?`,
1118
- args: [scope.projectId, scope.scopeName]
1119
- });
1120
- } catch (error) {
1121
- if (error instanceof Error && !error.message.includes("no such table")) {
1122
- throw error;
1123
- }
794
+ const chunkIdx = this.chunkIndex(scope);
795
+ await chunkIdx.deleteIndex();
796
+ } catch {
1124
797
  }
1125
798
  try {
1126
- await this.client.execute({
1127
- sql: `DELETE FROM pages WHERE project_id = ? AND scope_name = ?`,
1128
- args: [scope.projectId, scope.scopeName]
1129
- });
1130
- } catch (error) {
1131
- if (error instanceof Error && !error.message.includes("no such table")) {
1132
- throw error;
1133
- }
799
+ const pageIdx = this.pageIndex(scope);
800
+ await pageIdx.deleteIndex();
801
+ } catch {
1134
802
  }
1135
- await this.client.execute({
1136
- sql: `DELETE FROM registry WHERE project_id = ? AND scope_name = ?`,
1137
- args: [scope.projectId, scope.scopeName]
1138
- });
1139
- }
1140
- async listScopes(scopeProjectId) {
1141
- await this.ensureRegistry();
1142
- const rs = await this.client.execute({
1143
- sql: `SELECT project_id, scope_name, model_id, last_indexed_at, vector_count,
1144
- last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks
1145
- FROM registry WHERE project_id = ?`,
1146
- args: [scopeProjectId]
1147
- });
1148
- return rs.rows.map((row) => ({
1149
- projectId: row.project_id,
1150
- scopeName: row.scope_name,
1151
- modelId: row.model_id,
1152
- lastIndexedAt: row.last_indexed_at,
1153
- vectorCount: row.vector_count,
1154
- lastEstimateTokens: row.last_estimate_tokens,
1155
- lastEstimateCostUSD: row.last_estimate_cost_usd,
1156
- lastEstimateChangedChunks: row.last_estimate_changed_chunks
1157
- }));
1158
803
  }
1159
- async recordScope(info) {
1160
- await this.ensureRegistry();
1161
- const key = `${info.projectId}:${info.scopeName}`;
1162
- await this.client.execute({
1163
- sql: `INSERT OR REPLACE INTO registry
1164
- (scope_key, project_id, scope_name, model_id, last_indexed_at, vector_count,
1165
- last_estimate_tokens, last_estimate_cost_usd, last_estimate_changed_chunks)
1166
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
1167
- args: [
1168
- key,
1169
- info.projectId,
1170
- info.scopeName,
1171
- info.modelId,
1172
- info.lastIndexedAt,
1173
- info.vectorCount ?? null,
1174
- info.lastEstimateTokens ?? null,
1175
- info.lastEstimateCostUSD ?? null,
1176
- info.lastEstimateChangedChunks ?? null
1177
- ]
1178
- });
804
+ async listScopes(projectId) {
805
+ const allIndexes = await this.client.listIndexes();
806
+ const prefix = `${projectId}--`;
807
+ const scopeNames = /* @__PURE__ */ new Set();
808
+ for (const name of allIndexes) {
809
+ if (name.startsWith(prefix) && !name.endsWith("--pages")) {
810
+ const scopeName = name.slice(prefix.length);
811
+ scopeNames.add(scopeName);
812
+ }
813
+ }
814
+ const scopes = [];
815
+ for (const scopeName of scopeNames) {
816
+ const scope = {
817
+ projectId,
818
+ scopeName,
819
+ scopeId: `${projectId}:${scopeName}`
820
+ };
821
+ try {
822
+ const info = await this.chunkIndex(scope).info();
823
+ scopes.push({
824
+ projectId,
825
+ scopeName,
826
+ lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
827
+ documentCount: info.documentCount
828
+ });
829
+ } catch {
830
+ scopes.push({
831
+ projectId,
832
+ scopeName,
833
+ lastIndexedAt: "unknown",
834
+ documentCount: 0
835
+ });
836
+ }
837
+ }
838
+ return scopes;
1179
839
  }
1180
840
  async getContentHashes(scope) {
1181
- const exists = await this.chunksTableExists();
1182
- if (!exists) return /* @__PURE__ */ new Map();
1183
- const rs = await this.client.execute({
1184
- sql: `SELECT id, content_hash FROM chunks WHERE project_id = ? AND scope_name = ?`,
1185
- args: [scope.projectId, scope.scopeName]
1186
- });
1187
841
  const map = /* @__PURE__ */ new Map();
1188
- for (const row of rs.rows) {
1189
- map.set(row.id, row.content_hash);
842
+ const index = this.chunkIndex(scope);
843
+ let cursor = "0";
844
+ try {
845
+ for (; ; ) {
846
+ const result = await index.range({ cursor, limit: 100 });
847
+ for (const doc of result.documents) {
848
+ if (doc.metadata?.contentHash) {
849
+ map.set(doc.id, doc.metadata.contentHash);
850
+ }
851
+ }
852
+ if (!result.nextCursor || result.nextCursor === "0") break;
853
+ cursor = result.nextCursor;
854
+ }
855
+ } catch {
1190
856
  }
1191
857
  return map;
1192
858
  }
1193
859
  async upsertPages(pages, scope) {
1194
860
  if (pages.length === 0) return;
1195
- await this.ensurePages();
1196
- for (const page of pages) {
1197
- if (page.projectId !== scope.projectId || page.scopeName !== scope.scopeName) {
1198
- throw new Error(
1199
- `Page scope mismatch: page has ${page.projectId}:${page.scopeName} but scope is ${scope.projectId}:${scope.scopeName}`
1200
- );
1201
- }
1202
- }
1203
- const BATCH_SIZE = 100;
861
+ const index = this.pageIndex(scope);
862
+ const BATCH_SIZE = 50;
1204
863
  for (let i = 0; i < pages.length; i += BATCH_SIZE) {
1205
864
  const batch = pages.slice(i, i + BATCH_SIZE);
1206
- const stmts = batch.map((p) => ({
1207
- sql: `INSERT OR REPLACE INTO pages
1208
- (project_id, scope_name, url, title, markdown, route_file,
1209
- route_resolution, incoming_links, outgoing_links, depth, tags, indexed_at)
1210
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
1211
- args: [
1212
- p.projectId,
1213
- p.scopeName,
1214
- p.url,
1215
- p.title,
1216
- p.markdown,
1217
- p.routeFile,
1218
- p.routeResolution,
1219
- p.incomingLinks,
1220
- p.outgoingLinks,
1221
- p.depth,
1222
- JSON.stringify(p.tags),
1223
- p.indexedAt
1224
- ]
865
+ const docs = batch.map((p) => ({
866
+ id: p.url,
867
+ content: {
868
+ title: p.title,
869
+ url: p.url,
870
+ type: "page",
871
+ description: p.description ?? "",
872
+ keywords: (p.keywords ?? []).join(","),
873
+ summary: p.summary ?? "",
874
+ tags: p.tags.join(",")
875
+ },
876
+ metadata: {
877
+ markdown: p.markdown,
878
+ projectId: p.projectId,
879
+ scopeName: p.scopeName,
880
+ routeFile: p.routeFile,
881
+ routeResolution: p.routeResolution,
882
+ incomingLinks: p.incomingLinks,
883
+ outgoingLinks: p.outgoingLinks,
884
+ depth: p.depth,
885
+ indexedAt: p.indexedAt
886
+ }
1225
887
  }));
1226
- await this.client.batch(stmts);
888
+ await index.upsert(docs);
1227
889
  }
1228
890
  }
1229
891
  async getPage(url, scope) {
1230
- await this.ensurePages();
1231
- const rs = await this.client.execute({
1232
- sql: `SELECT * FROM pages WHERE project_id = ? AND scope_name = ? AND url = ?`,
1233
- args: [scope.projectId, scope.scopeName, url]
1234
- });
1235
- if (rs.rows.length === 0) return null;
1236
- const row = rs.rows[0];
1237
- return {
1238
- url: row.url,
1239
- title: row.title,
1240
- markdown: row.markdown,
1241
- projectId: row.project_id,
1242
- scopeName: row.scope_name,
1243
- routeFile: row.route_file,
1244
- routeResolution: row.route_resolution,
1245
- incomingLinks: row.incoming_links,
1246
- outgoingLinks: row.outgoing_links,
1247
- depth: row.depth,
1248
- tags: JSON.parse(row.tags || "[]"),
1249
- indexedAt: row.indexed_at
1250
- };
892
+ const index = this.pageIndex(scope);
893
+ try {
894
+ const results = await index.fetch([url]);
895
+ const doc = results[0];
896
+ if (!doc) return null;
897
+ return {
898
+ url: doc.content.url,
899
+ title: doc.content.title,
900
+ markdown: doc.metadata.markdown,
901
+ projectId: doc.metadata.projectId,
902
+ scopeName: doc.metadata.scopeName,
903
+ routeFile: doc.metadata.routeFile,
904
+ routeResolution: doc.metadata.routeResolution,
905
+ incomingLinks: doc.metadata.incomingLinks,
906
+ outgoingLinks: doc.metadata.outgoingLinks,
907
+ depth: doc.metadata.depth,
908
+ tags: doc.content.tags ? doc.content.tags.split(",").filter(Boolean) : [],
909
+ indexedAt: doc.metadata.indexedAt,
910
+ summary: doc.content.summary || void 0,
911
+ description: doc.content.description || void 0,
912
+ keywords: doc.content.keywords ? doc.content.keywords.split(",").filter(Boolean) : void 0
913
+ };
914
+ } catch {
915
+ return null;
916
+ }
1251
917
  }
1252
918
  async deletePages(scope) {
1253
- await this.ensurePages();
1254
- await this.client.execute({
1255
- sql: `DELETE FROM pages WHERE project_id = ? AND scope_name = ?`,
1256
- args: [scope.projectId, scope.scopeName]
1257
- });
1258
- }
1259
- async getScopeModelId(scope) {
1260
- await this.ensureRegistry();
1261
- const rs = await this.client.execute({
1262
- sql: `SELECT model_id FROM registry WHERE project_id = ? AND scope_name = ?`,
1263
- args: [scope.projectId, scope.scopeName]
1264
- });
1265
- if (rs.rows.length === 0) return null;
1266
- return rs.rows[0].model_id;
919
+ try {
920
+ const index = this.pageIndex(scope);
921
+ await index.reset();
922
+ } catch {
923
+ }
1267
924
  }
1268
925
  async health() {
1269
926
  try {
1270
- await this.client.execute("SELECT 1");
927
+ await this.client.info();
1271
928
  return { ok: true };
1272
929
  } catch (error) {
1273
930
  return {
@@ -1276,40 +933,34 @@ var TursoVectorStore = class {
1276
933
  };
1277
934
  }
1278
935
  }
936
+ async dropAllIndexes(projectId) {
937
+ const allIndexes = await this.client.listIndexes();
938
+ const prefix = `${projectId}--`;
939
+ for (const name of allIndexes) {
940
+ if (name.startsWith(prefix)) {
941
+ try {
942
+ const index = this.client.index(name);
943
+ await index.deleteIndex();
944
+ } catch {
945
+ }
946
+ }
947
+ }
948
+ }
1279
949
  };
1280
950
 
1281
951
  // src/vector/factory.ts
1282
/**
 * Builds an `UpstashSearchStore` from the `upstash` section of the config.
 *
 * The URL and token are taken from `config.upstash.url` / `config.upstash.token`
 * and fall back to the environment variables named by `urlEnv` / `tokenEnv`.
 * The `@upstash/search` module is imported lazily, only once credentials exist.
 *
 * @param config Resolved configuration object (reads `config.upstash`).
 * @returns {Promise<UpstashSearchStore>} Store wrapping a new `Search` client.
 * @throws {SearchSocketError} `VECTOR_BACKEND_UNAVAILABLE` when the URL or token is missing.
 */
async function createUpstashStore(config) {
  const { upstash } = config;
  const url = upstash.url ?? process.env[upstash.urlEnv];
  const token = upstash.token ?? process.env[upstash.tokenEnv];
  const hasCredentials = Boolean(url && token);
  if (!hasCredentials) {
    throw new SearchSocketError(
      "VECTOR_BACKEND_UNAVAILABLE",
      `Missing Upstash Search credentials. Set ${upstash.urlEnv} and ${upstash.tokenEnv} environment variables, or pass upstash.url and upstash.token in your config.`
    );
  }
  const upstashSearch = await import("@upstash/search");
  const client = new upstashSearch.Search({ url, token });
  return new UpstashSearchStore({ client });
}
1314
965
 
1315
966
  // src/utils/hash.ts
@@ -1322,7 +973,7 @@ function sha256(input) {
1322
973
  }
1323
974
 
1324
975
  // src/utils/path.ts
1325
- import path4 from "path";
976
+ import path3 from "path";
1326
977
  function normalizeUrlPath(rawPath) {
1327
978
  let out = rawPath.trim();
1328
979
  if (!out.startsWith("/")) {
@@ -1334,15 +985,8 @@ function normalizeUrlPath(rawPath) {
1334
985
  }
1335
986
  return out;
1336
987
  }
1337
- function urlPathToMirrorRelative(urlPath) {
1338
- const normalized = normalizeUrlPath(urlPath);
1339
- if (normalized === "/") {
1340
- return "index.md";
1341
- }
1342
- return `${normalized.slice(1)}.md`;
1343
- }
1344
988
  function staticHtmlFileToUrl(filePath, rootDir) {
1345
- const relative = path4.relative(rootDir, filePath).replace(/\\/g, "/");
989
+ const relative = path3.relative(rootDir, filePath).replace(/\\/g, "/");
1346
990
  if (relative === "index.html") {
1347
991
  return "/";
1348
992
  }
@@ -1615,7 +1259,7 @@ function buildEmbeddingText(chunk, prependTitle) {
1615
1259
 
1616
1260
  ${chunk.chunkText}`;
1617
1261
  }
1618
- function chunkMirrorPage(page, config, scope) {
1262
+ function chunkPage(page, config, scope) {
1619
1263
  const sections = parseHeadingSections(page.markdown, config.chunking.headingPathDepth);
1620
1264
  const rawChunks = sections.flatMap((section) => splitSection(section, config.chunking));
1621
1265
  const chunks = [];
@@ -1710,6 +1354,17 @@ function extractFromHtml(url, html, config) {
1710
1354
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
1711
1355
  return null;
1712
1356
  }
1357
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
1358
+ let weight;
1359
+ if (weightRaw !== void 0) {
1360
+ const parsed = Number(weightRaw);
1361
+ if (Number.isFinite(parsed) && parsed >= 0) {
1362
+ weight = parsed;
1363
+ }
1364
+ }
1365
+ if (weight === 0) {
1366
+ return null;
1367
+ }
1713
1368
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
1714
1369
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
1715
1370
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -1765,7 +1420,8 @@ function extractFromHtml(url, html, config) {
1765
1420
  noindex: false,
1766
1421
  tags,
1767
1422
  description,
1768
- keywords
1423
+ keywords,
1424
+ weight
1769
1425
  };
1770
1426
  }
1771
1427
  function extractFromMarkdown(url, markdown, title) {
@@ -1778,6 +1434,14 @@ function extractFromMarkdown(url, markdown, title) {
1778
1434
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
1779
1435
  return null;
1780
1436
  }
1437
+ let mdWeight;
1438
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
1439
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
1440
+ mdWeight = rawWeight;
1441
+ }
1442
+ if (mdWeight === 0) {
1443
+ return null;
1444
+ }
1781
1445
  const content = parsed.content;
1782
1446
  const normalized = normalizeMarkdown(content);
1783
1447
  if (!normalizeText(normalized)) {
@@ -1800,63 +1464,13 @@ function extractFromMarkdown(url, markdown, title) {
1800
1464
  noindex: false,
1801
1465
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
1802
1466
  description: fmDescription,
1803
- keywords: fmKeywords
1467
+ keywords: fmKeywords,
1468
+ weight: mdWeight
1804
1469
  };
1805
1470
  }
1806
1471
 
1807
- // src/indexing/mirror.ts
1808
- import fs4 from "fs/promises";
1809
- import path5 from "path";
1810
- function yamlString(value) {
1811
- return JSON.stringify(value);
1812
- }
1813
- function yamlArray(values) {
1814
- return `[${values.map((v) => JSON.stringify(v)).join(", ")}]`;
1815
- }
1816
- function buildMirrorMarkdown(page) {
1817
- const frontmatterLines = [
1818
- "---",
1819
- `url: ${yamlString(page.url)}`,
1820
- `title: ${yamlString(page.title)}`,
1821
- `scope: ${yamlString(page.scope)}`,
1822
- `routeFile: ${yamlString(page.routeFile)}`,
1823
- `routeResolution: ${yamlString(page.routeResolution)}`,
1824
- `generatedAt: ${yamlString(page.generatedAt)}`,
1825
- `incomingLinks: ${page.incomingLinks}`,
1826
- `outgoingLinks: ${page.outgoingLinks}`,
1827
- `depth: ${page.depth}`,
1828
- `tags: ${yamlArray(page.tags)}`,
1829
- "---",
1830
- ""
1831
- ];
1832
- return `${frontmatterLines.join("\n")}${normalizeMarkdown(page.markdown)}`;
1833
- }
1834
- function stripGeneratedAt(content) {
1835
- return content.replace(/^generatedAt: .*$/m, "");
1836
- }
1837
- async function writeMirrorPage(statePath, scope, page) {
1838
- const relative = urlPathToMirrorRelative(page.url);
1839
- const outputPath = path5.join(statePath, "pages", scope.scopeName, relative);
1840
- await fs4.mkdir(path5.dirname(outputPath), { recursive: true });
1841
- const newContent = buildMirrorMarkdown(page);
1842
- try {
1843
- const existing = await fs4.readFile(outputPath, "utf8");
1844
- if (stripGeneratedAt(existing) === stripGeneratedAt(newContent)) {
1845
- return outputPath;
1846
- }
1847
- } catch {
1848
- }
1849
- await fs4.writeFile(outputPath, newContent, "utf8");
1850
- return outputPath;
1851
- }
1852
- async function cleanMirrorForScope(statePath, scope) {
1853
- const target = path5.join(statePath, "pages", scope.scopeName);
1854
- await fs4.rm(target, { recursive: true, force: true });
1855
- await fs4.mkdir(target, { recursive: true });
1856
- }
1857
-
1858
1472
  // src/indexing/route-mapper.ts
1859
- import path6 from "path";
1473
+ import path4 from "path";
1860
1474
  import fg from "fast-glob";
1861
1475
  function segmentToRegex(segment) {
1862
1476
  if (segment.startsWith("(") && segment.endsWith(")")) {
@@ -1877,7 +1491,7 @@ function segmentToRegex(segment) {
1877
1491
  return { regex: `/${segment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`, score: 10 };
1878
1492
  }
1879
1493
  function routeFileToPattern(routeFile, cwd) {
1880
- const relative = path6.relative(cwd, routeFile).replace(/\\/g, "/");
1494
+ const relative = path4.relative(cwd, routeFile).replace(/\\/g, "/");
1881
1495
  const withoutPrefix = relative.replace(/^src\/routes\/?/, "");
1882
1496
  const withoutPage = withoutPrefix.replace(/\/\+page\.[^/]+$/, "");
1883
1497
  const segments = withoutPage.split("/").filter(Boolean);
@@ -1932,11 +1546,43 @@ function mapUrlToRoute(urlPath, patterns) {
1932
1546
 
1933
1547
  // src/indexing/sources/build/index.ts
1934
1548
  import { load as cheerioLoad } from "cheerio";
1935
- import pLimit2 from "p-limit";
1549
+ import pLimit from "p-limit";
1550
+
1551
+ // src/indexing/sources/build/manifest-parser.ts
1552
+ import fs3 from "fs/promises";
1553
+ import path5 from "path";
1554
+
1555
+ // src/utils/pattern.ts
1556
/**
 * Tests a URL path against a single pattern.
 *
 * A single trailing slash is ignored on both arguments (except for the bare "/").
 * Supported pattern forms:
 *   - "<prefix>/**"  matches the prefix itself and anything below it ("/**" matches everything);
 *   - "<prefix>/*"   matches exactly one extra path segment below the prefix;
 *   - anything else  must equal the URL exactly.
 *
 * @param {string} url URL path to test.
 * @param {string} pattern Pattern to test against.
 * @returns {boolean} Whether the URL matches.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => value !== "/" && value.endsWith("/") ? value.slice(0, -1) : value;
  const target = stripTrailingSlash(url);
  const rule = stripTrailingSlash(pattern);

  const isDeepWildcard = rule.endsWith("/**");
  const isSingleWildcard = !isDeepWildcard && rule.endsWith("/*");
  if (!isDeepWildcard && !isSingleWildcard) {
    return target === rule;
  }

  const base = rule.slice(0, isDeepWildcard ? -3 : -2);
  if (isDeepWildcard) {
    return base === "" || target === base || target.startsWith(base + "/");
  }

  // Single-segment wildcard directly under the root.
  if (base === "") {
    return target !== "/" && !target.slice(1).includes("/");
  }
  if (!target.startsWith(base + "/")) {
    return false;
  }
  const remainder = target.slice(base.length + 1);
  return remainder.length > 0 && !remainder.includes("/");
}
1578
/**
 * Reports whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url URL path to test.
 * @param {Iterable<string>} patterns Patterns understood by `matchUrlPattern`.
 * @returns {boolean} True on the first match, false when nothing matches.
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
1936
1584
 
1937
1585
  // src/indexing/sources/build/manifest-parser.ts
1938
- import fs5 from "fs/promises";
1939
- import path7 from "path";
1940
1586
  function routeIdToFile(routeId) {
1941
1587
  if (routeId === "/") {
1942
1588
  return "src/routes/+page.svelte";
@@ -1948,10 +1594,10 @@ function routeIdToUrl(routeId) {
1948
1594
  return routeId.split("/").filter((seg) => !(seg.startsWith("(") && seg.endsWith(")"))).join("/") || "/";
1949
1595
  }
1950
1596
  async function parseManifest(cwd, outputDir) {
1951
- const manifestPath = path7.resolve(cwd, outputDir, "server", "manifest-full.js");
1597
+ const manifestPath = path5.resolve(cwd, outputDir, "server", "manifest-full.js");
1952
1598
  let content;
1953
1599
  try {
1954
- content = await fs5.readFile(manifestPath, "utf8");
1600
+ content = await fs3.readFile(manifestPath, "utf8");
1955
1601
  } catch {
1956
1602
  throw new SearchSocketError(
1957
1603
  "BUILD_MANIFEST_NOT_FOUND",
@@ -2010,21 +1656,13 @@ function expandDynamicUrl(url, value) {
2010
1656
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
2011
1657
  }
2012
1658
  function isExcluded(url, patterns) {
2013
- for (const pattern of patterns) {
2014
- if (pattern.endsWith("/*")) {
2015
- const prefix = pattern.slice(0, -1);
2016
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
2017
- } else if (url === pattern) {
2018
- return true;
2019
- }
2020
- }
2021
- return false;
1659
+ return matchUrlPatterns(url, patterns);
2022
1660
  }
2023
1661
 
2024
1662
  // src/indexing/sources/build/preview-server.ts
2025
1663
  import net from "net";
2026
- import path8 from "path";
2027
- import fs6 from "fs";
1664
+ import path6 from "path";
1665
+ import fs4 from "fs";
2028
1666
  import { spawn } from "child_process";
2029
1667
  function findFreePort() {
2030
1668
  return new Promise((resolve, reject) => {
@@ -2063,8 +1701,8 @@ async function waitForReady(url, timeout, child) {
2063
1701
  );
2064
1702
  }
2065
1703
  async function startPreviewServer(cwd, options, logger3) {
2066
- const viteBin = path8.join(cwd, "node_modules", ".bin", "vite");
2067
- if (!fs6.existsSync(viteBin)) {
1704
+ const viteBin = path6.join(cwd, "node_modules", ".bin", "vite");
1705
+ if (!fs4.existsSync(viteBin)) {
2068
1706
  throw new SearchSocketError(
2069
1707
  "BUILD_SERVER_FAILED",
2070
1708
  `vite binary not found at ${viteBin}. Ensure vite is installed.`
@@ -2138,7 +1776,7 @@ async function discoverPages(server, buildConfig, pipelineMaxPages) {
2138
1776
  const visited = /* @__PURE__ */ new Set();
2139
1777
  const pages = [];
2140
1778
  const queue = [];
2141
- const limit = pLimit2(8);
1779
+ const limit = pLimit(8);
2142
1780
  for (const seed of seedUrls) {
2143
1781
  const normalized = normalizeUrlPath(seed);
2144
1782
  if (!visited.has(normalized) && !isExcluded(normalized, exclude)) {
@@ -2220,7 +1858,7 @@ async function loadBuildPages(cwd, config, maxPages) {
2220
1858
  const selected = typeof maxCount === "number" ? expanded.slice(0, maxCount) : expanded;
2221
1859
  const server = await startPreviewServer(cwd, { previewTimeout: buildConfig.previewTimeout }, logger);
2222
1860
  try {
2223
- const concurrencyLimit = pLimit2(8);
1861
+ const concurrencyLimit = pLimit(8);
2224
1862
  const results = await Promise.allSettled(
2225
1863
  selected.map(
2226
1864
  (route) => concurrencyLimit(async () => {
@@ -2260,11 +1898,11 @@ async function loadBuildPages(cwd, config, maxPages) {
2260
1898
  }
2261
1899
 
2262
1900
  // src/indexing/sources/content-files.ts
2263
- import fs7 from "fs/promises";
2264
- import path9 from "path";
1901
+ import fs5 from "fs/promises";
1902
+ import path7 from "path";
2265
1903
  import fg2 from "fast-glob";
2266
1904
  function filePathToUrl(filePath, baseDir) {
2267
- const relative = path9.relative(baseDir, filePath).replace(/\\/g, "/");
1905
+ const relative = path7.relative(baseDir, filePath).replace(/\\/g, "/");
2268
1906
  const segments = relative.split("/").filter(Boolean);
2269
1907
  if (/(^|\/)\+page\.svelte$/.test(relative)) {
2270
1908
  const routeSegments = segments.slice();
@@ -2289,7 +1927,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
2289
1927
  if (!contentConfig) {
2290
1928
  throw new Error("content-files config is missing");
2291
1929
  }
2292
- const baseDir = path9.resolve(cwd, contentConfig.baseDir);
1930
+ const baseDir = path7.resolve(cwd, contentConfig.baseDir);
2293
1931
  const files = await fg2(contentConfig.globs, {
2294
1932
  cwd: baseDir,
2295
1933
  absolute: true,
@@ -2299,12 +1937,12 @@ async function loadContentFilesPages(cwd, config, maxPages) {
2299
1937
  const selected = typeof limit === "number" ? files.slice(0, limit) : files;
2300
1938
  const pages = [];
2301
1939
  for (const filePath of selected) {
2302
- const raw = await fs7.readFile(filePath, "utf8");
1940
+ const raw = await fs5.readFile(filePath, "utf8");
2303
1941
  const markdown = filePath.endsWith(".md") ? raw : normalizeSvelteToMarkdown(raw);
2304
1942
  pages.push({
2305
1943
  url: filePathToUrl(filePath, baseDir),
2306
1944
  markdown,
2307
- sourcePath: path9.relative(cwd, filePath).replace(/\\/g, "/"),
1945
+ sourcePath: path7.relative(cwd, filePath).replace(/\\/g, "/"),
2308
1946
  outgoingLinks: []
2309
1947
  });
2310
1948
  }
@@ -2314,7 +1952,7 @@ async function loadContentFilesPages(cwd, config, maxPages) {
2314
1952
  // src/indexing/sources/crawl.ts
2315
1953
  import { gunzipSync } from "zlib";
2316
1954
  import { load as cheerioLoad2 } from "cheerio";
2317
- import pLimit3 from "p-limit";
1955
+ import pLimit2 from "p-limit";
2318
1956
  var logger2 = new Logger();
2319
1957
  function extractLocs(xml) {
2320
1958
  const $ = cheerioLoad2(xml, { xmlMode: true });
@@ -2399,7 +2037,7 @@ async function loadCrawledPages(config, maxPages) {
2399
2037
  const routes = await resolveRoutes(config);
2400
2038
  const maxCount = typeof maxPages === "number" ? Math.max(0, Math.floor(maxPages)) : void 0;
2401
2039
  const selected = typeof maxCount === "number" ? routes.slice(0, maxCount) : routes;
2402
- const concurrencyLimit = pLimit3(8);
2040
+ const concurrencyLimit = pLimit2(8);
2403
2041
  const results = await Promise.allSettled(
2404
2042
  selected.map(
2405
2043
  (route) => concurrencyLimit(async () => {
@@ -2432,11 +2070,11 @@ async function loadCrawledPages(config, maxPages) {
2432
2070
  }
2433
2071
 
2434
2072
  // src/indexing/sources/static-output.ts
2435
- import fs8 from "fs/promises";
2436
- import path10 from "path";
2073
+ import fs6 from "fs/promises";
2074
+ import path8 from "path";
2437
2075
  import fg3 from "fast-glob";
2438
2076
  async function loadStaticOutputPages(cwd, config, maxPages) {
2439
- const outputDir = path10.resolve(cwd, config.source.staticOutputDir);
2077
+ const outputDir = path8.resolve(cwd, config.source.staticOutputDir);
2440
2078
  const htmlFiles = await fg3(["**/*.html"], {
2441
2079
  cwd: outputDir,
2442
2080
  absolute: true
@@ -2445,55 +2083,309 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
2445
2083
  const selected = typeof limit === "number" ? htmlFiles.slice(0, limit) : htmlFiles;
2446
2084
  const pages = [];
2447
2085
  for (const filePath of selected) {
2448
- const html = await fs8.readFile(filePath, "utf8");
2086
+ const html = await fs6.readFile(filePath, "utf8");
2449
2087
  pages.push({
2450
2088
  url: staticHtmlFileToUrl(filePath, outputDir),
2451
2089
  html,
2452
- sourcePath: path10.relative(cwd, filePath).replace(/\\/g, "/"),
2090
+ sourcePath: path8.relative(cwd, filePath).replace(/\\/g, "/"),
2453
2091
  outgoingLinks: []
2454
2092
  });
2455
2093
  }
2456
2094
  return pages;
2457
2095
  }
2458
2096
 
2459
- // src/utils/time.ts
2460
- function nowIso() {
2461
- return (/* @__PURE__ */ new Date()).toISOString();
2462
- }
2463
- function hrTimeMs(start) {
2464
- return Number(process.hrtime.bigint() - start) / 1e6;
2097
+ // src/indexing/robots.ts
2098
+ import fs7 from "fs/promises";
2099
+ import path9 from "path";
2100
/**
 * Parses robots.txt text and returns the rule set that applies to `userAgent`.
 *
 * Grouping follows the Robots Exclusion Protocol: one or more consecutive
 * `User-agent` lines open a group, and the `Allow` / `Disallow` lines after them
 * belong to every agent of that group. A `User-agent` line that appears after
 * rule lines starts a NEW group. Directives other than user-agent / allow /
 * disallow (Sitemap, Crawl-delay, ...) are ignored and do not end the group.
 * Rules from several groups naming the same agent are merged.
 *
 * Selection: the group whose name equals `userAgent` (case-insensitive, exact
 * match) is used when it has at least one rule; otherwise the "*" group is
 * used; otherwise an empty rule set is returned.
 *
 * @param {string} content Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] Agent name to look up.
 * @returns {{disallow: string[], allow: string[]}} Path patterns for the selected group.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = new Map();
  let currentAgents = [];
  // True once the current group has seen an allow/disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // An empty value still closes the user-agent list but adds no rule.
      groupHasRules = true;
      if (value) {
        for (const agent of currentAgents) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is ignored on purpose: it must not interfere with
    // the grouping of the rules above.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
2466
-
2467
- // src/indexing/pipeline.ts
2468
- var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
2469
- "jina-embeddings-v3": 2e-5
2470
- };
2471
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
2472
- var IndexPipeline = class _IndexPipeline {
2473
- cwd;
2474
- config;
2475
- embeddings;
2476
- vectorStore;
2477
- logger;
2478
- constructor(options) {
2479
- this.cwd = options.cwd;
2480
- this.config = options.config;
2481
- this.embeddings = options.embeddings;
2482
- this.vectorStore = options.vectorStore;
2483
- this.logger = options.logger;
2135
/**
 * Decides whether `urlPath` is blocked by a parsed robots.txt rule set.
 *
 * The longest matching Disallow pattern is compared with the longest matching
 * Allow pattern (length measured on the pattern text); the path is blocked only
 * when a Disallow pattern matches and no Allow pattern of equal or greater
 * length matches. Patterns are path prefixes and may use the two special
 * characters of the Robots Exclusion Protocol: `*` (any run of characters) and
 * a trailing `$` (end of path). Patterns without those characters behave as a
 * plain prefix test.
 *
 * @param {string} urlPath URL path to test (e.g. "/docs/page").
 * @param {{disallow: string[], allow: string[]}} rules Rule set, as returned by `parseRobotsTxt`.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules) {
  // Tests one pattern against urlPath, translating `*` / trailing `$` to a regex.
  const matches = (pattern) => {
    const anchored = pattern.endsWith("$");
    if (!anchored && !pattern.includes("*")) {
      return urlPath.startsWith(pattern);
    }
    const body = anchored ? pattern.slice(0, -1) : pattern;
    const source = body
      .split("*")
      .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
      .join(".*");
    return new RegExp(`^${source}${anchored ? "$" : ""}`, "s").test(urlPath);
  };

  // Length of the longest matching pattern, or 0 when none matches.
  const longestMatchLength = (patterns) => {
    let longest = 0;
    for (const pattern of patterns) {
      if (pattern.length > longest && matches(pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };

  const disallowLength = longestMatchLength(rules.disallow);
  if (disallowLength === 0) return false;
  // On a tie the Allow rule wins, so only a strictly shorter Allow leaves the path blocked.
  return longestMatchLength(rules.allow) < disallowLength;
}
2151
/**
 * Reads `<dir>/robots.txt` and parses it with `parseRobotsTxt` (default agent).
 *
 * @param {string} dir Directory expected to contain robots.txt.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed rules,
 *   or null when the file cannot be read or parsing throws.
 */
async function loadRobotsTxtFromDir(dir) {
  try {
    const robotsPath = path9.join(dir, "robots.txt");
    return parseRobotsTxt(await fs7.readFile(robotsPath, "utf8"));
  } catch {
    // Missing or unreadable file: treated as "no robots rules".
    return null;
  }
}
2159
/**
 * Downloads `/robots.txt` from the origin of `baseUrl` and parses it with
 * `parseRobotsTxt` (default agent).
 *
 * @param {string} baseUrl Base URL used to resolve "/robots.txt".
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed rules,
 *   or null on a non-OK response, an invalid URL, or any fetch/parse error.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const response = await fetch(robotsUrl.href);
    return response.ok ? parseRobotsTxt(await response.text()) : null;
  } catch {
    // Network or URL errors are treated as "no robots rules".
    return null;
  }
}
2170
+
2171
+ // src/search/ranking.ts
2172
/**
 * Clamps a value to the range [0, +inf); anything that is not a finite number
 * (NaN, +/-Infinity, non-number types) becomes 0.
 *
 * @param {*} value Candidate number.
 * @returns {number} The value when finite and non-negative, otherwise 0.
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
2178
/**
 * Normalizes text for title/query comparison: Unicode-normalizes (NFKC),
 * lowercases, drops everything that is not a letter, combining mark, digit or
 * whitespace, collapses whitespace runs to one space, and trims.
 *
 * Letters and digits are matched with Unicode property escapes so that
 * non-ASCII text (accented Latin, Cyrillic, CJK, ...) is kept instead of being
 * stripped, which previously turned such titles into truncated or empty keys.
 * ASCII input yields the same result as before.
 *
 * @param {string} text Raw title or query.
 * @returns {string} Normalized comparison key (may be empty).
 */
function normalizeForTitleMatch(text) {
  return text
    .normalize("NFKC")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, "")
    .replace(/\s+/g, " ")
    .trim();
}
2181
/**
 * Re-scores search hits with the configured ranking boosts and sorts them by
 * the resulting score, highest first.
 *
 * Starting from `hit.score` (non-finite scores count as -Infinity), the
 * function adds, when enabled in `config.ranking`:
 *   - log(1 + incomingLinks) * weights.incomingLinks,
 *   - (1 / (1 + depth)) * weights.depth,
 *   - weights.titleMatch when the normalized query contains, or is contained
 *     in, the normalized hit title (only if a query is given and the weight is > 0).
 *
 * @param {Array} hits Hits exposing `score` and `metadata` (incomingLinks, depth, title).
 * @param config Configuration; reads `config.ranking`.
 * @param {string} [query] Original query text used for the title boost.
 * @returns {Array<{hit: object, finalScore: number}>} New array sorted by `finalScore` descending.
 */
function rankHits(hits, config, query) {
  const queryKey = query ? normalizeForTitleMatch(query) : "";
  const titleBonus = config.ranking.weights.titleMatch;

  const titleMatches = (title) => {
    const titleKey = normalizeForTitleMatch(title);
    if (titleKey.length === 0) {
      return false;
    }
    return titleKey.includes(queryKey) || queryKey.includes(titleKey);
  };

  const scoreHit = (hit) => {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (config.ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * config.ranking.weights.incomingLinks;
    }
    if (config.ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * config.ranking.weights.depth;
    }
    if (queryKey && titleBonus > 0 && titleMatches(hit.metadata.title)) {
      total += titleBonus;
    }
    return { hit, finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY };
  };

  // Subtracting two -Infinity scores yields NaN; treat such pairs as equal.
  const byFinalScoreDesc = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  };

  return hits.map(scoreHit).sort(byFinalScoreDesc);
}
2209
/**
 * Trims a score-ordered page result list using two optional heuristics from
 * `config.ranking`:
 *
 *   1. `minScore` (> 0): when the median `pageScore` is below it, the whole
 *      list is discarded and an empty array is returned.
 *   2. `scoreGapThreshold` (> 0): walking the list in order, the list is cut
 *      just before the first entry whose relative drop from the previous entry,
 *      (prev - current) / prev, reaches the threshold. Only positive `prev`
 *      scores are considered.
 *
 * @param {Array<{pageScore: number}>} results Page results in ranking order.
 * @param config Configuration; reads `config.ranking.scoreGapThreshold` and `config.ranking.minScore`.
 * @returns {Array} The same array when nothing is trimmed, otherwise a shorter copy (or `[]`).
 */
function trimByScoreGap(results, config) {
  if (results.length === 0) {
    return results;
  }
  const { scoreGapThreshold, minScore } = config.ranking;

  if (minScore > 0) {
    const ascending = results.map((entry) => entry.pageScore).sort((x, y) => x - y);
    const middle = Math.floor(ascending.length / 2);
    const hasEvenCount = ascending.length % 2 === 0;
    const median = hasEvenCount ? (ascending[middle - 1] + ascending[middle]) / 2 : ascending[middle];
    if (median < minScore) {
      return [];
    }
  }

  if (!(scoreGapThreshold > 0)) {
    return results;
  }

  const cutIndex = results.findIndex((entry, position) => {
    if (position === 0) {
      return false;
    }
    const previousScore = results[position - 1].pageScore;
    return previousScore > 0 && (previousScore - entry.pageScore) / previousScore >= scoreGapThreshold;
  });
  return cutIndex === -1 ? results : results.slice(0, cutIndex);
}
2233
/**
 * Looks up the ranking weight configured for a URL.
 *
 * Every pattern key of `pageWeights` is tested with `matchUrlPattern`; among
 * the matching patterns the longest one wins (the first one on equal length).
 *
 * @param {string} url URL path of the page.
 * @param {Object<string, number>} pageWeights Map of URL pattern to weight.
 * @returns {number} Weight of the most specific matching pattern, or 1 when none matches.
 */
function findPageWeight(url, pageWeights) {
  const noMatch = ["", 1];
  const [, weight] = Object.entries(pageWeights).reduce(
    (best, candidate) => matchUrlPattern(url, candidate[0]) && candidate[0].length > best[0].length ? candidate : best,
    noMatch
  );
  return weight;
}
2244
/**
 * Groups ranked chunk hits by page URL and computes one score per page.
 *
 * For each page the chunks are sorted by `finalScore` (descending). The page
 * score is the best chunk score plus a decayed bonus from the next chunks:
 * for the i-th chunk (i >= 1) among the first `aggregationCap` chunks, the
 * bonus adds `score_i * aggregationDecay^i`; the summed bonus is multiplied by
 * `weights.aggregation`. The result is then multiplied by the page weight from
 * `findPageWeight`; pages whose weight is exactly 0 are dropped.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked Ranked chunk hits.
 * @param config Configuration; reads `config.ranking`.
 * @returns {Array<object>} Page entries (url, title, routeFile, pageScore,
 *   bestChunk, matchingChunks) sorted by `pageScore` descending.
 */
function aggregateByPage(ranked, config) {
  // NaN differences (e.g. -Infinity vs -Infinity) are treated as ties.
  const descendingBy = (key) => (left, right) => {
    const diff = right[key] - left[key];
    return Number.isNaN(diff) ? 0 : diff;
  };
  const finiteOr = (value, fallback) => Number.isFinite(value) ? value : fallback;

  const chunksByUrl = new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    let bucket = chunksByUrl.get(pageUrl);
    if (!bucket) {
      bucket = [];
      chunksByUrl.set(pageUrl, bucket);
    }
    bucket.push(entry);
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  for (const [url, chunks] of chunksByUrl) {
    chunks.sort(descendingBy("finalScore"));
    const [best] = chunks;
    const bestScore = finiteOr(best.finalScore, Number.NEGATIVE_INFINITY);

    // Chunks after the best one contribute a geometrically decayed bonus.
    const bonus = chunks
      .slice(0, aggregationCap)
      .slice(1)
      .reduce((sum, chunk, offset) => sum + finiteOr(chunk.finalScore, 0) * Math.pow(aggregationDecay, offset + 1), 0);

    let pageScore = bestScore + bonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      continue;
    }
    if (pageWeight !== 1) {
      pageScore *= pageWeight;
    }

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: finiteOr(pageScore, Number.NEGATIVE_INFINITY),
      bestChunk: best,
      matchingChunks: chunks
    });
  }
  return pages.sort(descendingBy("pageScore"));
}
2287
/**
 * Combines page-level search hits with ranked chunk hits.
 *
 * With `w = config.search.pageSearchWeight`:
 *   - a chunk whose URL also has a page hit gets the blended score
 *     `(1 - w) * chunkScore + w * pageScore` (the chunk score is kept when the
 *     blend is not finite);
 *   - a page hit with no chunk becomes a synthetic chunk-shaped hit
 *     (id `page:<url>`) scored `pageScore * w` (0 when not finite);
 *   - other chunks pass through unchanged.
 * When several page hits share a URL, the last one is used.
 *
 * @param {Array<object>} pageHits Page hits (url, score, title, description, depth, incomingLinks, routeFile, tags).
 * @param {Array<{hit: object, finalScore: number}>} rankedChunks Ranked chunk hits.
 * @param config Configuration; reads `config.search.pageSearchWeight`.
 * @returns {Array<{hit: object, finalScore: number}>} `rankedChunks` itself when there are no
 *   page hits, otherwise a new array sorted by `finalScore` descending.
 */
function mergePageAndChunkResults(pageHits, rankedChunks, config) {
  if (pageHits.length === 0) {
    return rankedChunks;
  }
  const pageWeight = config.search.pageSearchWeight;
  const pagesByUrl = new Map([...pageHits].map((pageHit) => [pageHit.url, pageHit]));
  const coveredUrls = new Set();

  const combined = rankedChunks.map((chunk) => {
    const chunkUrl = chunk.hit.metadata.url;
    const pageHit = pagesByUrl.get(chunkUrl);
    if (!pageHit) {
      return chunk;
    }
    coveredUrls.add(chunkUrl);
    const blendedScore = (1 - pageWeight) * chunk.finalScore + pageWeight * pageHit.score;
    return {
      hit: chunk.hit,
      finalScore: Number.isFinite(blendedScore) ? blendedScore : chunk.finalScore
    };
  });

  // Pages that matched only at page level get a stand-in chunk built from page fields.
  for (const pageHit of pagesByUrl.values()) {
    if (coveredUrls.has(pageHit.url)) {
      continue;
    }
    const fallbackText = pageHit.description || pageHit.title;
    const pageOnlyScore = pageHit.score * pageWeight;
    combined.push({
      hit: {
        id: `page:${pageHit.url}`,
        score: pageHit.score,
        metadata: {
          projectId: "",
          scopeName: "",
          url: pageHit.url,
          path: pageHit.url,
          title: pageHit.title,
          sectionTitle: "",
          headingPath: [],
          snippet: fallbackText,
          chunkText: fallbackText,
          ordinal: 0,
          contentHash: "",
          depth: pageHit.depth,
          incomingLinks: pageHit.incomingLinks,
          routeFile: pageHit.routeFile,
          tags: pageHit.tags
        }
      },
      finalScore: Number.isFinite(pageOnlyScore) ? pageOnlyScore : 0
    });
  }

  return combined.sort((left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  });
}
2342
+
2343
+ // src/utils/time.ts
2344
/**
 * Returns the current time as an ISO 8601 string (UTC, via `Date#toISOString`).
 *
 * @returns {string} Timestamp such as "2024-01-31T12:34:56.789Z".
 */
function nowIso() {
  const now = new Date();
  return now.toISOString();
}
2347
/**
 * Milliseconds elapsed since `start`, measured with `process.hrtime.bigint()`.
 *
 * @param {bigint} start Earlier reading of `process.hrtime.bigint()` (nanoseconds).
 * @returns {number} Elapsed time in milliseconds (fractional).
 */
function hrTimeMs(start) {
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs) / 1e6;
}
2350
+
2351
+ // src/indexing/pipeline.ts
2352
/**
 * Builds a plain-text summary of a page: title, then (when present) the
 * description, the comma-joined keywords and a markdown-stripped body, joined
 * by blank lines and cut to at most `maxChars` characters.
 *
 * Body stripping removes fenced code blocks, unwraps inline code and
 * links/images to their text, drops heading markers, replaces the characters
 * `> * _ | ~ -` with spaces, and collapses whitespace.
 *
 * @param page Page with `title`, `markdown` and optional `description` / `keywords`.
 * @param {number} [maxChars=3500] Maximum length of the returned text.
 * @returns {string} Summary text; trimmed again after truncation.
 */
function buildPageSummary(page, maxChars = 3500) {
  const { title, description, keywords, markdown } = page;

  // Ordered [pattern, replacement] steps; order matters (code fences first).
  const stripSteps = [
    [/```[\s\S]*?```/g, " "],
    [/`([^`]+)`/g, "$1"],
    [/!?\[([^\]]*)\]\([^)]*\)/g, "$1"],
    [/^#{1,6}\s+/gm, ""],
    [/[>*_|~\-]/g, " "],
    [/\s+/g, " "]
  ];
  const plainBody = stripSteps
    .reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), markdown)
    .trim();

  const sections = [
    title,
    ...(description ? [description] : []),
    ...(keywords && keywords.length > 0 ? [keywords.join(", ")] : []),
    ...(plainBody ? [plainBody] : [])
  ];
  const summary = sections.join("\n\n");
  return summary.length <= maxChars ? summary : summary.slice(0, maxChars).trim();
}
2368
+ var IndexPipeline = class _IndexPipeline {
2369
+ cwd;
2370
+ config;
2371
+ store;
2372
+ logger;
2373
+ constructor(options) {
2374
+ this.cwd = options.cwd;
2375
+ this.config = options.config;
2376
+ this.store = options.store;
2377
+ this.logger = options.logger;
2378
+ }
2379
+ static async create(options = {}) {
2380
+ const cwd = path10.resolve(options.cwd ?? process.cwd());
2381
+ const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
2382
+ const store = options.store ?? await createUpstashStore(config);
2383
+ return new _IndexPipeline({
2384
+ cwd,
2385
+ config,
2386
+ store,
2387
+ logger: options.logger ?? new Logger()
2388
+ });
2497
2389
  }
2498
2390
  getConfig() {
2499
2391
  return this.config;
@@ -2511,25 +2403,17 @@ var IndexPipeline = class _IndexPipeline {
2511
2403
  stageTimingsMs[name] = Math.round(hrTimeMs(start));
2512
2404
  };
2513
2405
  const scope = resolveScope(this.config, options.scopeOverride);
2514
- const { statePath } = ensureStateDirs(this.cwd, this.config.state.dir, scope);
2406
+ ensureStateDirs(this.cwd, this.config.state.dir, scope);
2515
2407
  const sourceMode = options.sourceOverride ?? this.config.source.mode;
2516
- this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, model: ${this.config.embeddings.model})`);
2408
+ this.logger.info(`Indexing scope "${scope.scopeName}" (source: ${sourceMode}, backend: upstash-search)`);
2517
2409
  if (options.force) {
2518
2410
  this.logger.info("Force mode enabled \u2014 full rebuild");
2519
- await cleanMirrorForScope(statePath, scope);
2520
2411
  }
2521
2412
  if (options.dryRun) {
2522
2413
  this.logger.info("Dry run \u2014 no writes will be performed");
2523
2414
  }
2524
2415
  const manifestStart = stageStart();
2525
- const existingHashes = await this.vectorStore.getContentHashes(scope);
2526
- const existingModelId = await this.vectorStore.getScopeModelId(scope);
2527
- if (existingModelId && existingModelId !== this.config.embeddings.model && !options.force) {
2528
- throw new SearchSocketError(
2529
- "EMBEDDING_MODEL_MISMATCH",
2530
- `Scope ${scope.scopeName} uses model ${existingModelId}. Re-run with --force to migrate.`
2531
- );
2532
- }
2416
+ const existingHashes = options.force ? /* @__PURE__ */ new Map() : await this.store.getContentHashes(scope);
2533
2417
  stageEnd("manifest", manifestStart);
2534
2418
  this.logger.debug(`Manifest: ${existingHashes.size} existing chunk hashes loaded`);
2535
2419
  const sourceStart = stageStart();
@@ -2546,6 +2430,53 @@ var IndexPipeline = class _IndexPipeline {
2546
2430
  }
2547
2431
  stageEnd("source", sourceStart);
2548
2432
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
2433
+ const filterStart = stageStart();
2434
+ let filteredSourcePages = sourcePages;
2435
+ if (this.config.exclude.length > 0) {
2436
+ const beforeExclude = filteredSourcePages.length;
2437
+ filteredSourcePages = filteredSourcePages.filter((p) => {
2438
+ const url = normalizeUrlPath(p.url);
2439
+ if (matchUrlPatterns(url, this.config.exclude)) {
2440
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
2441
+ return false;
2442
+ }
2443
+ return true;
2444
+ });
2445
+ const excludedCount = beforeExclude - filteredSourcePages.length;
2446
+ if (excludedCount > 0) {
2447
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
2448
+ }
2449
+ }
2450
+ if (this.config.respectRobotsTxt) {
2451
+ let robotsRules = null;
2452
+ if (sourceMode === "static-output") {
2453
+ robotsRules = await loadRobotsTxtFromDir(
2454
+ path10.resolve(this.cwd, this.config.source.staticOutputDir)
2455
+ );
2456
+ } else if (sourceMode === "build" && this.config.source.build) {
2457
+ robotsRules = await loadRobotsTxtFromDir(
2458
+ path10.resolve(this.cwd, this.config.source.build.outputDir)
2459
+ );
2460
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
2461
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
2462
+ }
2463
+ if (robotsRules) {
2464
+ const beforeRobots = filteredSourcePages.length;
2465
+ filteredSourcePages = filteredSourcePages.filter((p) => {
2466
+ const url = normalizeUrlPath(p.url);
2467
+ if (isBlockedByRobots(url, robotsRules)) {
2468
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
2469
+ return false;
2470
+ }
2471
+ return true;
2472
+ });
2473
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
2474
+ if (robotsExcluded > 0) {
2475
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
2476
+ }
2477
+ }
2478
+ }
2479
+ stageEnd("filter", filterStart);
2549
2480
  const routeStart = stageStart();
2550
2481
  const routePatterns = await buildRoutePatterns(this.cwd);
2551
2482
  stageEnd("route_map", routeStart);
@@ -2553,7 +2484,7 @@ var IndexPipeline = class _IndexPipeline {
2553
2484
  const extractStart = stageStart();
2554
2485
  this.logger.info("Extracting content...");
2555
2486
  const extractedPages = [];
2556
- for (const sourcePage of sourcePages) {
2487
+ for (const sourcePage of filteredSourcePages) {
2557
2488
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
2558
2489
  if (!extracted) {
2559
2490
  this.logger.warn(
@@ -2579,16 +2510,29 @@ var IndexPipeline = class _IndexPipeline {
2579
2510
  seenUrls.add(page.url);
2580
2511
  uniquePages.push(page);
2581
2512
  }
2513
+ const indexablePages = [];
2514
+ for (const page of uniquePages) {
2515
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
2516
+ if (effectiveWeight === 0) {
2517
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
2518
+ continue;
2519
+ }
2520
+ indexablePages.push(page);
2521
+ }
2522
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
2523
+ if (zeroWeightCount > 0) {
2524
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
2525
+ }
2582
2526
  stageEnd("extract", extractStart);
2583
- const skippedPages = sourcePages.length - uniquePages.length;
2584
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
2527
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
2528
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
2585
2529
  const linkStart = stageStart();
2586
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
2530
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
2587
2531
  const incomingLinkCount = /* @__PURE__ */ new Map();
2588
- for (const page of uniquePages) {
2532
+ for (const page of indexablePages) {
2589
2533
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
2590
2534
  }
2591
- for (const page of uniquePages) {
2535
+ for (const page of indexablePages) {
2592
2536
  for (const outgoing of page.outgoingLinks) {
2593
2537
  if (!pageSet.has(outgoing)) {
2594
2538
  continue;
@@ -2598,9 +2542,9 @@ var IndexPipeline = class _IndexPipeline {
2598
2542
  }
2599
2543
  stageEnd("links", linkStart);
2600
2544
  this.logger.debug(`Link analysis: computed incoming links for ${incomingLinkCount.size} pages (${stageTimingsMs["links"]}ms)`);
2601
- const mirrorStart = stageStart();
2602
- this.logger.info("Writing mirror pages...");
2603
- const mirrorPages = [];
2545
+ const pagesStart = stageStart();
2546
+ this.logger.info("Building indexed pages...");
2547
+ const pages = [];
2604
2548
  let routeExact = 0;
2605
2549
  let routeBestEffort = 0;
2606
2550
  const precomputedRoutes = /* @__PURE__ */ new Map();
@@ -2612,7 +2556,7 @@ var IndexPipeline = class _IndexPipeline {
2612
2556
  });
2613
2557
  }
2614
2558
  }
2615
- for (const page of uniquePages) {
2559
+ for (const page of indexablePages) {
2616
2560
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
2617
2561
  if (routeMatch.routeResolution === "best-effort") {
2618
2562
  if (this.config.source.strictRouteMapping) {
@@ -2629,7 +2573,7 @@ var IndexPipeline = class _IndexPipeline {
2629
2573
  } else {
2630
2574
  routeExact += 1;
2631
2575
  }
2632
- const mirror = {
2576
+ const indexedPage = {
2633
2577
  url: page.url,
2634
2578
  title: page.title,
2635
2579
  scope: scope.scopeName,
@@ -2644,35 +2588,38 @@ var IndexPipeline = class _IndexPipeline {
2644
2588
  description: page.description,
2645
2589
  keywords: page.keywords
2646
2590
  };
2647
- mirrorPages.push(mirror);
2648
- if (this.config.state.writeMirror) {
2649
- await writeMirrorPage(statePath, scope, mirror);
2650
- }
2651
- this.logger.event("markdown_written", { url: page.url });
2591
+ pages.push(indexedPage);
2592
+ this.logger.event("page_indexed", { url: page.url });
2652
2593
  }
2653
2594
  if (!options.dryRun) {
2654
- const pageRecords = mirrorPages.map((mp) => ({
2655
- url: mp.url,
2656
- title: mp.title,
2657
- markdown: mp.markdown,
2658
- projectId: scope.projectId,
2659
- scopeName: scope.scopeName,
2660
- routeFile: mp.routeFile,
2661
- routeResolution: mp.routeResolution,
2662
- incomingLinks: mp.incomingLinks,
2663
- outgoingLinks: mp.outgoingLinks,
2664
- depth: mp.depth,
2665
- tags: mp.tags,
2666
- indexedAt: mp.generatedAt
2667
- }));
2668
- await this.vectorStore.deletePages(scope);
2669
- await this.vectorStore.upsertPages(pageRecords, scope);
2595
+ const pageRecords = pages.map((p) => {
2596
+ const summary = buildPageSummary(p);
2597
+ return {
2598
+ url: p.url,
2599
+ title: p.title,
2600
+ markdown: p.markdown,
2601
+ projectId: scope.projectId,
2602
+ scopeName: scope.scopeName,
2603
+ routeFile: p.routeFile,
2604
+ routeResolution: p.routeResolution,
2605
+ incomingLinks: p.incomingLinks,
2606
+ outgoingLinks: p.outgoingLinks,
2607
+ depth: p.depth,
2608
+ tags: p.tags,
2609
+ indexedAt: p.generatedAt,
2610
+ summary,
2611
+ description: p.description,
2612
+ keywords: p.keywords
2613
+ };
2614
+ });
2615
+ await this.store.deletePages(scope);
2616
+ await this.store.upsertPages(pageRecords, scope);
2670
2617
  }
2671
- stageEnd("mirror", mirrorStart);
2672
- this.logger.info(`Mirrored ${mirrorPages.length} page${mirrorPages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["mirror"]}ms)`);
2618
+ stageEnd("pages", pagesStart);
2619
+ this.logger.info(`Indexed ${pages.length} page${pages.length === 1 ? "" : "s"} (${routeExact} exact, ${routeBestEffort} best-effort) (${stageTimingsMs["pages"]}ms)`);
2673
2620
  const chunkStart = stageStart();
2674
2621
  this.logger.info("Chunking pages...");
2675
- let chunks = mirrorPages.flatMap((page) => chunkMirrorPage(page, this.config, scope));
2622
+ let chunks = pages.flatMap((page) => chunkPage(page, this.config, scope));
2676
2623
  const maxChunks = typeof options.maxChunks === "number" ? Math.max(0, Math.floor(options.maxChunks)) : void 0;
2677
2624
  if (typeof maxChunks === "number") {
2678
2625
  chunks = chunks.slice(0, maxChunks);
@@ -2704,125 +2651,61 @@ var IndexPipeline = class _IndexPipeline {
2704
2651
  });
2705
2652
  const deletes = [...existingHashes.keys()].filter((chunkKey) => !currentChunkMap.has(chunkKey));
2706
2653
  this.logger.info(`Changes detected: ${changedChunks.length} changed, ${deletes.length} deleted, ${chunks.length - changedChunks.length} unchanged`);
2707
- const embedStart = stageStart();
2708
- const chunkTokenEstimates = /* @__PURE__ */ new Map();
2709
- for (const chunk of changedChunks) {
2710
- chunkTokenEstimates.set(chunk.chunkKey, this.embeddings.estimateTokens(buildEmbeddingText(chunk, this.config.chunking.prependTitle)));
2711
- }
2712
- const estimatedTokens = changedChunks.reduce(
2713
- (sum, chunk) => sum + (chunkTokenEstimates.get(chunk.chunkKey) ?? 0),
2714
- 0
2715
- );
2716
- const pricePer1k = this.config.embeddings.pricePer1kTokens ?? EMBEDDING_PRICE_PER_1K_TOKENS_USD[this.config.embeddings.model] ?? DEFAULT_EMBEDDING_PRICE_PER_1K;
2717
- const estimatedCostUSD = estimatedTokens / 1e3 * pricePer1k;
2718
- let newEmbeddings = 0;
2719
- const vectorsByChunk = /* @__PURE__ */ new Map();
2654
+ const upsertStart = stageStart();
2655
+ let documentsUpserted = 0;
2720
2656
  if (!options.dryRun && changedChunks.length > 0) {
2721
- this.logger.info(`Embedding ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} (~${estimatedTokens.toLocaleString()} tokens, ~$${estimatedCostUSD.toFixed(6)})...`);
2722
- const embeddings = await this.embeddings.embedTexts(
2723
- changedChunks.map((chunk) => buildEmbeddingText(chunk, this.config.chunking.prependTitle)),
2724
- this.config.embeddings.model,
2725
- "retrieval.passage"
2726
- );
2727
- if (embeddings.length !== changedChunks.length) {
2728
- throw new SearchSocketError(
2729
- "VECTOR_BACKEND_UNAVAILABLE",
2730
- `Embedding provider returned ${embeddings.length} vectors for ${changedChunks.length} chunks.`
2731
- );
2732
- }
2733
- for (let i = 0; i < changedChunks.length; i += 1) {
2734
- const chunk = changedChunks[i];
2735
- const embedding = embeddings[i];
2736
- if (!chunk || !embedding || embedding.length === 0 || embedding.some((value) => !Number.isFinite(value))) {
2737
- throw new SearchSocketError(
2738
- "VECTOR_BACKEND_UNAVAILABLE",
2739
- `Embedding provider returned an invalid vector for chunk index ${i}.`
2740
- );
2741
- }
2742
- vectorsByChunk.set(chunk.chunkKey, embedding);
2743
- newEmbeddings += 1;
2744
- this.logger.event("embedded_new", { chunkKey: chunk.chunkKey });
2745
- }
2746
- }
2747
- stageEnd("embedding", embedStart);
2748
- if (changedChunks.length > 0) {
2749
- this.logger.info(`Embedded ${newEmbeddings} chunk${newEmbeddings === 1 ? "" : "s"} (${stageTimingsMs["embedding"]}ms)`);
2750
- } else {
2751
- this.logger.info("No chunks to embed \u2014 all up to date");
2752
- }
2753
- const syncStart = stageStart();
2754
- if (!options.dryRun) {
2755
- this.logger.info("Syncing vectors...");
2756
- const upserts = [];
2757
- for (const chunk of changedChunks) {
2758
- const vector = vectorsByChunk.get(chunk.chunkKey);
2759
- if (!vector) {
2760
- continue;
2761
- }
2762
- upserts.push({
2657
+ this.logger.info(`Upserting ${changedChunks.length} chunk${changedChunks.length === 1 ? "" : "s"} to Upstash Search...`);
2658
+ const UPSTASH_CONTENT_LIMIT = 4096;
2659
+ const FIELD_OVERHEAD = 200;
2660
+ const MAX_TEXT_CHARS = UPSTASH_CONTENT_LIMIT - FIELD_OVERHEAD;
2661
+ const docs = changedChunks.map((chunk) => {
2662
+ const title = chunk.title;
2663
+ const sectionTitle = chunk.sectionTitle ?? "";
2664
+ const url = chunk.url;
2665
+ const tags = chunk.tags.join(",");
2666
+ const headingPath = chunk.headingPath.join(" > ");
2667
+ const otherFieldsLen = title.length + sectionTitle.length + url.length + tags.length + headingPath.length;
2668
+ const textBudget = Math.max(500, UPSTASH_CONTENT_LIMIT - otherFieldsLen - 50);
2669
+ const text = buildEmbeddingText(chunk, this.config.chunking.prependTitle).slice(0, textBudget);
2670
+ return {
2763
2671
  id: chunk.chunkKey,
2764
- vector,
2672
+ content: { title, sectionTitle, text, url, tags, headingPath },
2765
2673
  metadata: {
2766
2674
  projectId: scope.projectId,
2767
2675
  scopeName: scope.scopeName,
2768
- url: chunk.url,
2769
2676
  path: chunk.path,
2770
- title: chunk.title,
2771
- sectionTitle: chunk.sectionTitle ?? "",
2772
- headingPath: chunk.headingPath,
2773
2677
  snippet: chunk.snippet,
2774
- chunkText: chunk.chunkText.slice(0, 4e3),
2775
2678
  ordinal: chunk.ordinal,
2776
2679
  contentHash: chunk.contentHash,
2777
- modelId: this.config.embeddings.model,
2778
2680
  depth: chunk.depth,
2779
2681
  incomingLinks: chunk.incomingLinks,
2780
2682
  routeFile: chunk.routeFile,
2781
- tags: chunk.tags,
2782
- description: chunk.description,
2783
- keywords: chunk.keywords
2683
+ description: chunk.description ?? "",
2684
+ keywords: (chunk.keywords ?? []).join(",")
2784
2685
  }
2785
- });
2786
- }
2787
- if (upserts.length > 0) {
2788
- await this.vectorStore.upsert(upserts, scope);
2789
- this.logger.event("upserted", { count: upserts.length });
2790
- }
2791
- if (deletes.length > 0) {
2792
- await this.vectorStore.deleteByIds(deletes, scope);
2793
- this.logger.event("deleted", { count: deletes.length });
2794
- }
2795
- }
2796
- stageEnd("sync", syncStart);
2797
- this.logger.debug(`Sync complete (${stageTimingsMs["sync"]}ms)`);
2798
- const finalizeStart = stageStart();
2799
- if (!options.dryRun) {
2800
- const scopeInfo = {
2801
- projectId: scope.projectId,
2802
- scopeName: scope.scopeName,
2803
- modelId: this.config.embeddings.model,
2804
- lastIndexedAt: nowIso(),
2805
- vectorCount: chunks.length,
2806
- lastEstimateTokens: estimatedTokens,
2807
- lastEstimateCostUSD: Number(estimatedCostUSD.toFixed(8)),
2808
- lastEstimateChangedChunks: changedChunks.length
2809
- };
2810
- await this.vectorStore.recordScope(scopeInfo);
2811
- this.logger.event("registry_updated", {
2812
- scope: scope.scopeName,
2813
- vectorCount: chunks.length
2686
+ };
2814
2687
  });
2688
+ await this.store.upsertChunks(docs, scope);
2689
+ documentsUpserted = docs.length;
2690
+ this.logger.event("upserted", { count: docs.length });
2691
+ }
2692
+ if (!options.dryRun && deletes.length > 0) {
2693
+ await this.store.deleteByIds(deletes, scope);
2694
+ this.logger.event("deleted", { count: deletes.length });
2695
+ }
2696
+ stageEnd("upsert", upsertStart);
2697
+ if (changedChunks.length > 0) {
2698
+ this.logger.info(`Upserted ${documentsUpserted} document${documentsUpserted === 1 ? "" : "s"} (${stageTimingsMs["upsert"]}ms)`);
2699
+ } else {
2700
+ this.logger.info("No chunks to upsert \u2014 all up to date");
2815
2701
  }
2816
- stageEnd("finalize", finalizeStart);
2817
2702
  this.logger.info("Done.");
2818
2703
  return {
2819
- pagesProcessed: mirrorPages.length,
2704
+ pagesProcessed: pages.length,
2820
2705
  chunksTotal: chunks.length,
2821
2706
  chunksChanged: changedChunks.length,
2822
- newEmbeddings,
2707
+ documentsUpserted,
2823
2708
  deletes: deletes.length,
2824
- estimatedTokens,
2825
- estimatedCostUSD: Number(estimatedCostUSD.toFixed(8)),
2826
2709
  routeExact,
2827
2710
  routeBestEffort,
2828
2711
  stageTimingsMs
@@ -2838,233 +2721,33 @@ import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js
2838
2721
  import { z as z3 } from "zod";
2839
2722
 
2840
2723
  // src/search/engine.ts
2841
- import path12 from "path";
2724
+ import path11 from "path";
2842
2725
  import { z as z2 } from "zod";
2843
-
2844
- // src/rerank/jina.ts
2845
- function sleep2(ms) {
2846
- return new Promise((resolve) => {
2847
- setTimeout(resolve, ms);
2848
- });
2849
- }
2850
- var JinaReranker = class {
2851
- apiKey;
2852
- model;
2853
- maxRetries;
2854
- constructor(options) {
2855
- this.apiKey = options.apiKey;
2856
- this.model = options.model;
2857
- this.maxRetries = options.maxRetries ?? 2;
2858
- }
2859
- async rerank(query, candidates, topN) {
2860
- if (candidates.length === 0) {
2861
- return [];
2862
- }
2863
- const body = {
2864
- model: this.model,
2865
- query,
2866
- documents: candidates.map((candidate) => candidate.text),
2867
- top_n: topN ?? candidates.length,
2868
- return_documents: false
2869
- };
2870
- let attempt = 0;
2871
- while (attempt <= this.maxRetries) {
2872
- attempt += 1;
2873
- let response;
2874
- try {
2875
- response = await fetch("https://api.jina.ai/v1/rerank", {
2876
- method: "POST",
2877
- headers: {
2878
- "content-type": "application/json",
2879
- authorization: `Bearer ${this.apiKey}`
2880
- },
2881
- body: JSON.stringify(body)
2882
- });
2883
- } catch (error) {
2884
- if (attempt <= this.maxRetries) {
2885
- await sleep2(Math.min(300 * 2 ** attempt, 4e3));
2886
- continue;
2887
- }
2888
- throw error;
2889
- }
2890
- if (!response.ok) {
2891
- const retryable = response.status === 429 || response.status >= 500;
2892
- if (retryable && attempt <= this.maxRetries) {
2893
- await sleep2(Math.min(300 * 2 ** attempt, 4e3));
2894
- continue;
2895
- }
2896
- const errorBody = await response.text();
2897
- throw new Error(`Jina rerank failed (${response.status}): ${errorBody}`);
2898
- }
2899
- const payload = await response.json();
2900
- const rawResults = payload.results ?? payload.data ?? [];
2901
- if (!Array.isArray(rawResults)) {
2902
- throw new Error("Invalid Jina rerank response format");
2903
- }
2904
- return rawResults.flatMap((item) => {
2905
- const index = item.index;
2906
- if (typeof index !== "number" || index < 0 || index >= candidates.length) {
2907
- return [];
2908
- }
2909
- const candidate = candidates[index];
2910
- if (!candidate) {
2911
- return [];
2912
- }
2913
- const score = typeof item.relevance_score === "number" ? item.relevance_score : item.score ?? 0;
2914
- return [
2915
- {
2916
- id: candidate.id,
2917
- score
2918
- }
2919
- ];
2920
- }).sort((a, b) => b.score - a.score);
2921
- }
2922
- throw new Error("Jina rerank request failed after retries");
2923
- }
2924
- };
2925
-
2926
- // src/rerank/factory.ts
2927
- function createReranker(config) {
2928
- if (!config.rerank.enabled) {
2929
- return null;
2930
- }
2931
- const apiKey = config.embeddings.apiKey ?? process.env[config.embeddings.apiKeyEnv];
2932
- if (!apiKey) {
2933
- return null;
2934
- }
2935
- return new JinaReranker({
2936
- apiKey,
2937
- model: config.rerank.model
2938
- });
2939
- }
2940
-
2941
- // src/search/ranking.ts
2942
- function nonNegativeOrZero(value) {
2943
- if (!Number.isFinite(value)) {
2944
- return 0;
2945
- }
2946
- return Math.max(0, value);
2947
- }
2948
- function rankHits(hits, config) {
2949
- return hits.map((hit) => {
2950
- let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
2951
- if (config.ranking.enableIncomingLinkBoost) {
2952
- const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
2953
- score += incomingBoost * config.ranking.weights.incomingLinks;
2954
- }
2955
- if (config.ranking.enableDepthBoost) {
2956
- const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
2957
- score += depthBoost * config.ranking.weights.depth;
2958
- }
2959
- return {
2960
- hit,
2961
- finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
2962
- };
2963
- }).sort((a, b) => {
2964
- const delta = b.finalScore - a.finalScore;
2965
- return Number.isNaN(delta) ? 0 : delta;
2966
- });
2967
- }
2968
- function findPageWeight(url, pageWeights) {
2969
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
2970
- const normalizedUrl = norm(url);
2971
- for (const [pattern, weight] of Object.entries(pageWeights)) {
2972
- if (norm(pattern) === normalizedUrl) {
2973
- return weight;
2974
- }
2975
- }
2976
- let bestPrefix = "";
2977
- let bestWeight = 1;
2978
- for (const [pattern, weight] of Object.entries(pageWeights)) {
2979
- const normalizedPattern = norm(pattern);
2980
- if (normalizedPattern === "/") continue;
2981
- const prefix = `${normalizedPattern}/`;
2982
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
2983
- bestPrefix = prefix;
2984
- bestWeight = weight;
2985
- }
2986
- }
2987
- return bestWeight;
2988
- }
2989
- function aggregateByPage(ranked, config) {
2990
- const groups = /* @__PURE__ */ new Map();
2991
- for (const hit of ranked) {
2992
- const url = hit.hit.metadata.url;
2993
- const group = groups.get(url);
2994
- if (group) group.push(hit);
2995
- else groups.set(url, [hit]);
2996
- }
2997
- const { aggregationCap, aggregationDecay } = config.ranking;
2998
- const pages = [];
2999
- for (const [url, chunks] of groups) {
3000
- chunks.sort((a, b) => {
3001
- const delta = b.finalScore - a.finalScore;
3002
- return Number.isNaN(delta) ? 0 : delta;
3003
- });
3004
- const best = chunks[0];
3005
- const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
3006
- const topChunks = chunks.slice(0, aggregationCap);
3007
- let aggregationBonus = 0;
3008
- for (let i = 1; i < topChunks.length; i++) {
3009
- const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
3010
- aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
3011
- }
3012
- let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
3013
- const pageWeight = findPageWeight(url, config.ranking.pageWeights);
3014
- if (pageWeight === 0) continue;
3015
- if (pageWeight !== 1) {
3016
- pageScore *= pageWeight;
3017
- }
3018
- pages.push({
3019
- url,
3020
- title: best.hit.metadata.title,
3021
- routeFile: best.hit.metadata.routeFile,
3022
- pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
3023
- bestChunk: best,
3024
- matchingChunks: chunks
3025
- });
3026
- }
3027
- return pages.sort((a, b) => {
3028
- const delta = b.pageScore - a.pageScore;
3029
- return Number.isNaN(delta) ? 0 : delta;
3030
- });
3031
- }
3032
-
3033
- // src/search/engine.ts
3034
2726
  var requestSchema = z2.object({
3035
2727
  q: z2.string().trim().min(1),
3036
2728
  topK: z2.number().int().positive().max(100).optional(),
3037
2729
  scope: z2.string().optional(),
3038
2730
  pathPrefix: z2.string().optional(),
3039
2731
  tags: z2.array(z2.string()).optional(),
3040
- rerank: z2.boolean().optional(),
3041
2732
  groupBy: z2.enum(["page", "chunk"]).optional()
3042
2733
  });
3043
2734
  var SearchEngine = class _SearchEngine {
3044
2735
  cwd;
3045
2736
  config;
3046
- embeddings;
3047
- vectorStore;
3048
- reranker;
2737
+ store;
3049
2738
  constructor(options) {
3050
2739
  this.cwd = options.cwd;
3051
2740
  this.config = options.config;
3052
- this.embeddings = options.embeddings;
3053
- this.vectorStore = options.vectorStore;
3054
- this.reranker = options.reranker;
2741
+ this.store = options.store;
3055
2742
  }
3056
2743
  static async create(options = {}) {
3057
- const cwd = path12.resolve(options.cwd ?? process.cwd());
2744
+ const cwd = path11.resolve(options.cwd ?? process.cwd());
3058
2745
  const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
3059
- const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
3060
- const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
3061
- const reranker = options.reranker === void 0 ? createReranker(config) : options.reranker;
2746
+ const store = options.store ?? await createUpstashStore(config);
3062
2747
  return new _SearchEngine({
3063
2748
  cwd,
3064
2749
  config,
3065
- embeddings,
3066
- vectorStore,
3067
- reranker
2750
+ store
3068
2751
  });
3069
2752
  }
3070
2753
  getConfig() {
@@ -3078,99 +2761,130 @@ var SearchEngine = class _SearchEngine {
3078
2761
  const input = parsed.data;
3079
2762
  const totalStart = process.hrtime.bigint();
3080
2763
  const resolvedScope = resolveScope(this.config, input.scope);
3081
- await this.assertModelCompatibility(resolvedScope);
3082
2764
  const topK = input.topK ?? 10;
3083
- const wantsRerank = Boolean(input.rerank);
3084
2765
  const groupByPage = (input.groupBy ?? "page") === "page";
3085
2766
  const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
3086
- const embedStart = process.hrtime.bigint();
3087
- const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
3088
- const queryVector = queryEmbeddings[0];
3089
- if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
3090
- throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
3091
- }
3092
- const embedMs = hrTimeMs(embedStart);
3093
- const vectorStart = process.hrtime.bigint();
3094
- const hits = await this.vectorStore.query(
3095
- queryVector,
3096
- {
3097
- topK: candidateK,
3098
- pathPrefix: input.pathPrefix,
3099
- tags: input.tags
3100
- },
3101
- resolvedScope
3102
- );
3103
- const vectorMs = hrTimeMs(vectorStart);
3104
- const ranked = rankHits(hits, this.config);
3105
- let usedRerank = false;
3106
- let rerankMs = 0;
3107
- let ordered = ranked;
3108
- if (wantsRerank) {
3109
- const rerankStart = process.hrtime.bigint();
3110
- ordered = await this.rerankHits(input.q, ranked, topK);
3111
- rerankMs = hrTimeMs(rerankStart);
3112
- usedRerank = true;
2767
+ const filterParts = [];
2768
+ if (input.pathPrefix) {
2769
+ const prefix = input.pathPrefix.startsWith("/") ? input.pathPrefix : `/${input.pathPrefix}`;
2770
+ filterParts.push(`url GLOB '${prefix}*'`);
2771
+ }
2772
+ if (input.tags && input.tags.length > 0) {
2773
+ for (const tag of input.tags) {
2774
+ filterParts.push(`tags GLOB '*${tag}*'`);
2775
+ }
3113
2776
  }
3114
- let results;
3115
- const minScore = this.config.ranking.minScore;
2777
+ const filter = filterParts.length > 0 ? filterParts.join(" AND ") : void 0;
2778
+ const useDualSearch = this.config.search.dualSearch && groupByPage;
2779
+ const searchStart = process.hrtime.bigint();
2780
+ let ranked;
2781
+ if (useDualSearch) {
2782
+ const chunkLimit = Math.max(topK * 10, 100);
2783
+ const pageLimit = 20;
2784
+ const [pageHits, chunkHits] = await Promise.all([
2785
+ this.store.searchPages(
2786
+ input.q,
2787
+ {
2788
+ limit: pageLimit,
2789
+ semanticWeight: this.config.search.semanticWeight,
2790
+ inputEnrichment: this.config.search.inputEnrichment,
2791
+ filter
2792
+ },
2793
+ resolvedScope
2794
+ ),
2795
+ this.store.search(
2796
+ input.q,
2797
+ {
2798
+ limit: chunkLimit,
2799
+ semanticWeight: this.config.search.semanticWeight,
2800
+ inputEnrichment: this.config.search.inputEnrichment,
2801
+ reranking: false,
2802
+ filter
2803
+ },
2804
+ resolvedScope
2805
+ )
2806
+ ]);
2807
+ const rankedChunks = rankHits(chunkHits, this.config, input.q);
2808
+ ranked = mergePageAndChunkResults(pageHits, rankedChunks, this.config);
2809
+ } else {
2810
+ const hits = await this.store.search(
2811
+ input.q,
2812
+ {
2813
+ limit: candidateK,
2814
+ semanticWeight: this.config.search.semanticWeight,
2815
+ inputEnrichment: this.config.search.inputEnrichment,
2816
+ reranking: this.config.search.reranking,
2817
+ filter
2818
+ },
2819
+ resolvedScope
2820
+ );
2821
+ ranked = rankHits(hits, this.config, input.q);
2822
+ }
2823
+ const searchMs = hrTimeMs(searchStart);
2824
+ const results = this.buildResults(ranked, topK, groupByPage, input.q);
2825
+ return {
2826
+ q: input.q,
2827
+ scope: resolvedScope.scopeName,
2828
+ results,
2829
+ meta: {
2830
+ timingsMs: {
2831
+ search: Math.round(searchMs),
2832
+ total: Math.round(hrTimeMs(totalStart))
2833
+ }
2834
+ }
2835
+ };
2836
+ }
2837
+ ensureSnippet(hit) {
2838
+ const snippet = hit.hit.metadata.snippet;
2839
+ if (snippet && snippet.length >= 30) return snippet;
2840
+ const chunkText = hit.hit.metadata.chunkText;
2841
+ if (chunkText) return toSnippet(chunkText);
2842
+ return snippet || "";
2843
+ }
2844
+ buildResults(ordered, topK, groupByPage, _query) {
3116
2845
  if (groupByPage) {
3117
2846
  let pages = aggregateByPage(ordered, this.config);
3118
- if (minScore > 0) {
3119
- pages = pages.filter((p) => p.pageScore >= minScore);
3120
- }
2847
+ pages = trimByScoreGap(pages, this.config);
3121
2848
  const minRatio = this.config.ranking.minChunkScoreRatio;
3122
- results = pages.slice(0, topK).map((page) => {
2849
+ return pages.slice(0, topK).map((page) => {
3123
2850
  const bestScore = page.bestChunk.finalScore;
3124
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
3125
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
2851
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
2852
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
3126
2853
  return {
3127
2854
  url: page.url,
3128
2855
  title: page.title,
3129
2856
  sectionTitle: page.bestChunk.hit.metadata.sectionTitle || void 0,
3130
- snippet: page.bestChunk.hit.metadata.snippet,
2857
+ snippet: this.ensureSnippet(page.bestChunk),
3131
2858
  score: Number(page.pageScore.toFixed(6)),
3132
2859
  routeFile: page.routeFile,
3133
2860
  chunks: meaningful.length > 1 ? meaningful.map((c) => ({
3134
2861
  sectionTitle: c.hit.metadata.sectionTitle || void 0,
3135
- snippet: c.hit.metadata.snippet,
2862
+ snippet: this.ensureSnippet(c),
3136
2863
  headingPath: c.hit.metadata.headingPath,
3137
2864
  score: Number(c.finalScore.toFixed(6))
3138
2865
  })) : void 0
3139
2866
  };
3140
2867
  });
3141
2868
  } else {
2869
+ let filtered = ordered;
2870
+ const minScore = this.config.ranking.minScore;
3142
2871
  if (minScore > 0) {
3143
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
2872
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
3144
2873
  }
3145
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
2874
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
3146
2875
  url: hit.metadata.url,
3147
2876
  title: hit.metadata.title,
3148
2877
  sectionTitle: hit.metadata.sectionTitle || void 0,
3149
- snippet: hit.metadata.snippet,
2878
+ snippet: this.ensureSnippet({ hit, finalScore }),
3150
2879
  score: Number(finalScore.toFixed(6)),
3151
2880
  routeFile: hit.metadata.routeFile
3152
2881
  }));
3153
2882
  }
3154
- return {
3155
- q: input.q,
3156
- scope: resolvedScope.scopeName,
3157
- results,
3158
- meta: {
3159
- timingsMs: {
3160
- embed: Math.round(embedMs),
3161
- vector: Math.round(vectorMs),
3162
- rerank: Math.round(rerankMs),
3163
- total: Math.round(hrTimeMs(totalStart))
3164
- },
3165
- usedRerank,
3166
- modelId: this.config.embeddings.model
3167
- }
3168
- };
3169
2883
  }
3170
2884
  async getPage(pathOrUrl, scope) {
3171
2885
  const resolvedScope = resolveScope(this.config, scope);
3172
2886
  const urlPath = this.resolveInputPath(pathOrUrl);
3173
- const page = await this.vectorStore.getPage(urlPath, resolvedScope);
2887
+ const page = await this.store.getPage(urlPath, resolvedScope);
3174
2888
  if (!page) {
3175
2889
  throw new SearchSocketError("INVALID_REQUEST", `Indexed page not found for ${urlPath}`, 404);
3176
2890
  }
@@ -3191,7 +2905,7 @@ var SearchEngine = class _SearchEngine {
3191
2905
  };
3192
2906
  }
3193
2907
  async health() {
3194
- return this.vectorStore.health();
2908
+ return this.store.health();
3195
2909
  }
3196
2910
  resolveInputPath(pathOrUrl) {
3197
2911
  try {
@@ -3203,90 +2917,6 @@ var SearchEngine = class _SearchEngine {
3203
2917
  const withoutQueryOrHash = pathOrUrl.split(/[?#]/)[0] ?? pathOrUrl;
3204
2918
  return normalizeUrlPath(withoutQueryOrHash);
3205
2919
  }
3206
- async assertModelCompatibility(scope) {
3207
- const modelId = await this.vectorStore.getScopeModelId(scope);
3208
- if (modelId && modelId !== this.config.embeddings.model) {
3209
- throw new SearchSocketError(
3210
- "EMBEDDING_MODEL_MISMATCH",
3211
- `Scope ${scope.scopeName} was indexed with ${modelId}. Current config uses ${this.config.embeddings.model}. Re-index with --force.`
3212
- );
3213
- }
3214
- }
3215
- async rerankHits(query, ranked, topK) {
3216
- if (!this.config.rerank.enabled) {
3217
- throw new SearchSocketError(
3218
- "INVALID_REQUEST",
3219
- "rerank=true requested but rerank.enabled is not set to true.",
3220
- 400
3221
- );
3222
- }
3223
- if (!this.reranker) {
3224
- throw new SearchSocketError(
3225
- "CONFIG_MISSING",
3226
- `rerank=true requested but ${this.config.embeddings.apiKeyEnv} is not set.`,
3227
- 400
3228
- );
3229
- }
3230
- const pageGroups = /* @__PURE__ */ new Map();
3231
- for (const entry of ranked) {
3232
- const url = entry.hit.metadata.url;
3233
- const group = pageGroups.get(url);
3234
- if (group) group.push(entry);
3235
- else pageGroups.set(url, [entry]);
3236
- }
3237
- const MAX_CHUNKS_PER_PAGE = 5;
3238
- const MIN_CHUNKS_PER_PAGE = 1;
3239
- const MIN_CHUNK_SCORE_RATIO = 0.5;
3240
- const MAX_DOC_CHARS = 2e3;
3241
- const pageCandidates = [];
3242
- for (const [url, chunks] of pageGroups) {
3243
- const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
3244
- const bestScore = byScore[0].finalScore;
3245
- const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
3246
- const selected = byScore.filter(
3247
- (c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
3248
- ).slice(0, MAX_CHUNKS_PER_PAGE);
3249
- selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
3250
- const first = selected[0].hit.metadata;
3251
- const parts = [first.title];
3252
- if (first.description) {
3253
- parts.push(first.description);
3254
- }
3255
- if (first.keywords && first.keywords.length > 0) {
3256
- parts.push(first.keywords.join(", "));
3257
- }
3258
- const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
3259
- parts.push(body);
3260
- let text = parts.join("\n\n");
3261
- if (text.length > MAX_DOC_CHARS) {
3262
- text = text.slice(0, MAX_DOC_CHARS);
3263
- }
3264
- pageCandidates.push({ id: url, text });
3265
- }
3266
- const maxCandidates = Math.max(topK, this.config.rerank.topN);
3267
- const cappedCandidates = pageCandidates.slice(0, maxCandidates);
3268
- const reranked = await this.reranker.rerank(
3269
- query,
3270
- cappedCandidates,
3271
- maxCandidates
3272
- );
3273
- const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
3274
- return ranked.map((entry) => {
3275
- const pageScore = scoreByUrl.get(entry.hit.metadata.url);
3276
- const base = Number.isFinite(entry.finalScore) ? entry.finalScore : Number.NEGATIVE_INFINITY;
3277
- if (pageScore === void 0 || !Number.isFinite(pageScore)) {
3278
- return { ...entry, finalScore: base };
3279
- }
3280
- const combined = pageScore * this.config.ranking.weights.rerank + base * 1e-3;
3281
- return {
3282
- ...entry,
3283
- finalScore: Number.isFinite(combined) ? combined : base
3284
- };
3285
- }).sort((a, b) => {
3286
- const delta = b.finalScore - a.finalScore;
3287
- return Number.isNaN(delta) ? 0 : delta;
3288
- });
3289
- }
3290
2920
  };
3291
2921
 
3292
2922
  // src/mcp/server.ts
@@ -3298,7 +2928,7 @@ function createServer(engine) {
3298
2928
  server.registerTool(
3299
2929
  "search",
3300
2930
  {
3301
- description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and topK.",
2931
+ description: "Semantic site search powered by Upstash Search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and groupBy.",
3302
2932
  inputSchema: {
3303
2933
  query: z3.string().min(1),
3304
2934
  scope: z3.string().optional(),
@@ -3485,9 +3115,6 @@ function parseDurationMs(value) {
3485
3115
  throw new SearchSocketError("INVALID_REQUEST", `Unsupported duration unit: ${unit}`, 400);
3486
3116
  }
3487
3117
  }
3488
- function formatUsd(value) {
3489
- return `$${value.toFixed(6)}`;
3490
- }
3491
3118
  function printIndexSummary(stats) {
3492
3119
  process.stdout.write(`pages processed: ${stats.pagesProcessed}
3493
3120
  `);
@@ -3495,13 +3122,9 @@ function printIndexSummary(stats) {
3495
3122
  `);
3496
3123
  process.stdout.write(`chunks changed: ${stats.chunksChanged}
3497
3124
  `);
3498
- process.stdout.write(`embeddings created: ${stats.newEmbeddings}
3125
+ process.stdout.write(`documents upserted: ${stats.documentsUpserted}
3499
3126
  `);
3500
3127
  process.stdout.write(`deletes: ${stats.deletes}
3501
- `);
3502
- process.stdout.write(`estimated tokens: ${stats.estimatedTokens}
3503
- `);
3504
- process.stdout.write(`estimated cost (USD): ${formatUsd(stats.estimatedCostUSD)}
3505
3128
  `);
3506
3129
  process.stdout.write(`route mapping: ${stats.routeExact} exact, ${stats.routeBestEffort} best-effort
3507
3130
  `);
@@ -3515,7 +3138,7 @@ function collectWatchPaths(config, cwd) {
3515
3138
  const paths = ["src/routes/**"];
3516
3139
  if (config.source.mode === "content-files" && config.source.contentFiles) {
3517
3140
  for (const pattern of config.source.contentFiles.globs) {
3518
- paths.push(path13.join(config.source.contentFiles.baseDir, pattern));
3141
+ paths.push(path12.join(config.source.contentFiles.baseDir, pattern));
3519
3142
  }
3520
3143
  }
3521
3144
  if (config.source.mode === "static-output") {
@@ -3528,25 +3151,22 @@ function collectWatchPaths(config, cwd) {
3528
3151
  paths.push("searchsocket.config.ts");
3529
3152
  paths.push(config.source.build.outputDir);
3530
3153
  }
3531
- return paths.map((value) => path13.resolve(cwd, value));
3154
+ return paths.map((value) => path12.resolve(cwd, value));
3532
3155
  }
3533
3156
  function ensureStateDir(cwd) {
3534
- const target = path13.join(cwd, ".searchsocket");
3535
- fs9.mkdirSync(target, { recursive: true });
3157
+ const target = path12.join(cwd, ".searchsocket");
3158
+ fs8.mkdirSync(target, { recursive: true });
3536
3159
  return target;
3537
3160
  }
3538
3161
  function ensureGitignore(cwd) {
3539
- const gitignorePath = path13.join(cwd, ".gitignore");
3162
+ const gitignorePath = path12.join(cwd, ".gitignore");
3540
3163
  const entries = [
3541
- ".searchsocket/vectors.db",
3542
- ".searchsocket/vectors.db-shm",
3543
- ".searchsocket/vectors.db-wal",
3544
3164
  ".searchsocket/manifest.json",
3545
3165
  ".searchsocket/registry.json"
3546
3166
  ];
3547
3167
  let content = "";
3548
- if (fs9.existsSync(gitignorePath)) {
3549
- content = fs9.readFileSync(gitignorePath, "utf8");
3168
+ if (fs8.existsSync(gitignorePath)) {
3169
+ content = fs8.readFileSync(gitignorePath, "utf8");
3550
3170
  }
3551
3171
  const lines = content.split("\n");
3552
3172
  const missing = entries.filter((entry) => !lines.some((line) => line.trim() === entry));
@@ -3557,10 +3177,10 @@ function ensureGitignore(cwd) {
3557
3177
  # SearchSocket local state
3558
3178
  ${missing.join("\n")}
3559
3179
  `;
3560
- fs9.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
3180
+ fs8.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
3561
3181
  }
3562
3182
  function readScopesFromFile(filePath) {
3563
- const raw = fs9.readFileSync(filePath, "utf8");
3183
+ const raw = fs8.readFileSync(filePath, "utf8");
3564
3184
  return new Set(
3565
3185
  raw.split(/\r?\n/).map((line) => line.trim()).filter(Boolean)
3566
3186
  );
@@ -3584,8 +3204,8 @@ function readRemoteGitBranches(cwd) {
3584
3204
  }
3585
3205
  }
3586
3206
  async function loadResolvedConfigForDev(cwd, configPath) {
3587
- const resolvedConfigPath = path13.resolve(cwd, configPath ?? "searchsocket.config.ts");
3588
- if (fs9.existsSync(resolvedConfigPath)) {
3207
+ const resolvedConfigPath = path12.resolve(cwd, configPath ?? "searchsocket.config.ts");
3208
+ if (fs8.existsSync(resolvedConfigPath)) {
3589
3209
  return loadConfig({ cwd, configPath });
3590
3210
  }
3591
3211
  return mergeConfig(cwd, {});
@@ -3632,7 +3252,7 @@ var program = new Command();
3632
3252
  program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
3633
3253
  program.command("init").description("Create searchsocket.config.ts and .searchsocket state directory").action(async (_opts, command) => {
3634
3254
  const root = getRootOptions(command).cwd ?? process.cwd();
3635
- const cwd = path13.resolve(root);
3255
+ const cwd = path12.resolve(root);
3636
3256
  const configPath = writeMinimalConfig(cwd);
3637
3257
  const stateDir = ensureStateDir(cwd);
3638
3258
  ensureGitignore(cwd);
@@ -3650,15 +3270,15 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
3650
3270
  process.stdout.write("// searchsocketVitePlugin({ enabled: true, changedOnly: true })\n");
3651
3271
  process.stdout.write("// or env-driven: SEARCHSOCKET_AUTO_INDEX=1 pnpm build\n");
3652
3272
  });
3653
- program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
3273
+ program.command("index").description("Index site content into Upstash Search").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full rebuild", false).option("--dry-run", "compute plan, no writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
3654
3274
  const rootOpts = getRootOptions(command);
3655
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3275
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
3656
3276
  await runIndexCommand({
3657
3277
  cwd,
3658
3278
  configPath: rootOpts?.config,
3659
3279
  scope: opts.scope,
3660
3280
  changedOnly: opts.changedOnly,
3661
- force: opts.force,
3281
+ force: opts.force || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
3662
3282
  dryRun: opts.dryRun,
3663
3283
  source: opts.source,
3664
3284
  maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
@@ -3668,16 +3288,16 @@ program.command("index").description("Index site content into markdown mirror +
3668
3288
  json: opts.json
3669
3289
  });
3670
3290
  });
3671
- program.command("status").description("Show scope, indexing state, backend health, and recent cost estimate").option("--scope <name>", "scope override").action(async (opts, command) => {
3291
+ program.command("status").description("Show scope, indexing state, and backend health").option("--scope <name>", "scope override").action(async (opts, command) => {
3672
3292
  const rootOpts = getRootOptions(command);
3673
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3293
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
3674
3294
  const config = await loadConfig({ cwd, configPath: rootOpts?.config });
3675
3295
  const scope = resolveScope(config, opts.scope);
3676
- let vectorStore;
3296
+ let store;
3677
3297
  let health = { ok: false, details: "not checked" };
3678
3298
  try {
3679
- vectorStore = await createVectorStore(config, cwd);
3680
- health = await vectorStore.health();
3299
+ store = await createUpstashStore(config);
3300
+ health = await store.health();
3681
3301
  } catch (error) {
3682
3302
  health = {
3683
3303
  ok: false,
@@ -3685,24 +3305,22 @@ program.command("status").description("Show scope, indexing state, backend healt
3685
3305
  };
3686
3306
  process.stdout.write(`project: ${config.project.id}
3687
3307
  `);
3688
- process.stdout.write(`vector health: error (${health.details})
3308
+ process.stdout.write(`backend health: error (${health.details})
3689
3309
  `);
3690
3310
  process.exitCode = 1;
3691
3311
  return;
3692
3312
  }
3693
3313
  let scopeRegistry = [];
3694
3314
  let scopeInfo;
3695
- let hashes = /* @__PURE__ */ new Map();
3696
3315
  try {
3697
- scopeRegistry = await vectorStore.listScopes(config.project.id);
3316
+ scopeRegistry = await store.listScopes(config.project.id);
3698
3317
  scopeInfo = scopeRegistry.find((entry) => entry.scopeName === scope.scopeName);
3699
- hashes = await vectorStore.getContentHashes(scope);
3700
3318
  } catch (error) {
3701
3319
  process.stdout.write(`project: ${config.project.id}
3702
3320
  `);
3703
3321
  process.stdout.write(`resolved scope: ${scope.scopeName}
3704
3322
  `);
3705
- process.stdout.write(`vector health: error (${error instanceof Error ? error.message : "unknown error"})
3323
+ process.stdout.write(`backend health: error (${error instanceof Error ? error.message : "unknown error"})
3706
3324
  `);
3707
3325
  process.exitCode = 1;
3708
3326
  return;
@@ -3711,25 +3329,15 @@ program.command("status").description("Show scope, indexing state, backend healt
3711
3329
  `);
3712
3330
  process.stdout.write(`resolved scope: ${scope.scopeName}
3713
3331
  `);
3714
- process.stdout.write(`embedding model: ${config.embeddings.model}
3715
- `);
3716
- const tursoUrl = process.env[config.vector.turso.urlEnv];
3717
- const vectorMode = tursoUrl ? `remote (${tursoUrl})` : `local (${config.vector.turso.localPath})`;
3718
- process.stdout.write(`vector backend: turso/libsql (${vectorMode})
3332
+ process.stdout.write(`backend: upstash-search
3719
3333
  `);
3720
- process.stdout.write(`vector health: ${health.ok ? "ok" : `error (${health.details ?? "n/a"})`}
3334
+ process.stdout.write(`backend health: ${health.ok ? "ok" : `error (${health.details ?? "n/a"})`}
3721
3335
  `);
3722
3336
  if (scopeInfo) {
3723
3337
  process.stdout.write(`last indexed (${scope.scopeName}): ${scopeInfo.lastIndexedAt ?? "never"}
3724
3338
  `);
3725
- process.stdout.write(`tracked chunks: ${hashes.size}
3726
- `);
3727
- if (scopeInfo.lastEstimateTokens != null) {
3728
- process.stdout.write(`last estimated tokens: ${scopeInfo.lastEstimateTokens}
3729
- `);
3730
- }
3731
- if (scopeInfo.lastEstimateCostUSD != null) {
3732
- process.stdout.write(`last estimated cost: ${formatUsd(scopeInfo.lastEstimateCostUSD)}
3339
+ if (scopeInfo.documentCount != null) {
3340
+ process.stdout.write(`documents: ${scopeInfo.documentCount}
3733
3341
  `);
3734
3342
  }
3735
3343
  } else {
@@ -3740,7 +3348,7 @@ program.command("status").description("Show scope, indexing state, backend healt
3740
3348
  process.stdout.write("\nregistry scopes:\n");
3741
3349
  for (const item of scopeRegistry) {
3742
3350
  process.stdout.write(
3743
- ` - ${item.scopeName} model=${item.modelId} lastIndexedAt=${item.lastIndexedAt} vectors=${item.vectorCount ?? "unknown"}
3351
+ ` - ${item.scopeName} lastIndexedAt=${item.lastIndexedAt} documents=${item.documentCount ?? "unknown"}
3744
3352
  `
3745
3353
  );
3746
3354
  }
@@ -3748,7 +3356,7 @@ program.command("status").description("Show scope, indexing state, backend healt
3748
3356
  });
3749
3357
  program.command("dev").description("Watch content files/routes and incrementally reindex on changes").option("--scope <name>", "scope override").option("--mcp", "start MCP server (http transport) alongside watcher", false).option("--mcp-port <n>", "MCP HTTP port", "3338").option("--mcp-path <path>", "MCP HTTP path", "/mcp").option("--verbose", "verbose logs", false).action(async (opts, command) => {
3750
3358
  const rootOpts = getRootOptions(command);
3751
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3359
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
3752
3360
  const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
3753
3361
  const watchPaths = collectWatchPaths(config, cwd);
3754
3362
  process.stdout.write("starting searchsocket dev watcher...\n");
@@ -3815,45 +3423,44 @@ ${watchPaths.map((entry) => ` - ${entry}`).join("\n")}
3815
3423
  });
3816
3424
  });
3817
3425
  });
3818
- program.command("clean").description("Delete local state and optionally delete remote vectors for a scope").option("--scope <name>", "scope override").option("--remote", "delete remote scope vectors", false).action(async (opts, command) => {
3426
+ program.command("clean").description("Delete local state and optionally delete remote indexes for a scope").option("--scope <name>", "scope override").option("--remote", "delete remote scope indexes", false).action(async (opts, command) => {
3819
3427
  const rootOpts = getRootOptions(command);
3820
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3428
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
3821
3429
  const config = await loadConfig({ cwd, configPath: rootOpts?.config });
3822
- const scope = resolveScope(config, opts.scope);
3823
- const statePath = path13.join(cwd, config.state.dir);
3430
+ const statePath = path12.join(cwd, config.state.dir);
3824
3431
  await fsp.rm(statePath, { recursive: true, force: true });
3825
3432
  process.stdout.write(`deleted local state directory: ${statePath}
3826
3433
  `);
3827
3434
  if (opts.remote) {
3828
- const vectorStore = await createVectorStore(config, cwd);
3829
- await vectorStore.dropAllTables();
3830
- process.stdout.write(`dropped all remote tables (chunks, registry, pages)
3435
+ const store = await createUpstashStore(config);
3436
+ await store.dropAllIndexes(config.project.id);
3437
+ process.stdout.write(`dropped all remote indexes for project ${config.project.id}
3831
3438
  `);
3832
3439
  }
3833
3440
  });
3834
3441
  program.command("prune").description("List/delete stale scopes (dry-run by default)").option("--apply", "apply deletions", false).option("--scopes-file <path>", "file containing active scopes").option("--older-than <duration>", "ttl cutoff like 30d").action(async (opts, command) => {
3835
3442
  const rootOpts = getRootOptions(command);
3836
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3443
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
3837
3444
  const config = await loadConfig({ cwd, configPath: rootOpts?.config });
3838
3445
  const baseScope = resolveScope(config);
3839
- let vectorStore;
3446
+ let store;
3840
3447
  let scopes;
3841
3448
  try {
3842
- vectorStore = await createVectorStore(config, cwd);
3843
- scopes = await vectorStore.listScopes(config.project.id);
3449
+ store = await createUpstashStore(config);
3450
+ scopes = await store.listScopes(config.project.id);
3844
3451
  } catch (error) {
3845
3452
  process.stderr.write(
3846
- `error: failed to access Turso vector store: ${error instanceof Error ? error.message : String(error)}
3453
+ `error: failed to access Upstash Search: ${error instanceof Error ? error.message : String(error)}
3847
3454
  `
3848
3455
  );
3849
3456
  process.exitCode = 1;
3850
3457
  return;
3851
3458
  }
3852
- process.stdout.write(`using remote registry
3459
+ process.stdout.write(`using Upstash Search
3853
3460
  `);
3854
3461
  let keepScopes = /* @__PURE__ */ new Set();
3855
3462
  if (opts.scopesFile) {
3856
- keepScopes = readScopesFromFile(path13.resolve(cwd, opts.scopesFile));
3463
+ keepScopes = readScopesFromFile(path12.resolve(cwd, opts.scopesFile));
3857
3464
  } else {
3858
3465
  keepScopes = readRemoteGitBranches(cwd);
3859
3466
  }
@@ -3871,7 +3478,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
3871
3478
  staleByList = !keepScopes.has(entry.scopeName);
3872
3479
  }
3873
3480
  let staleByTtl = false;
3874
- if (olderThanMs) {
3481
+ if (olderThanMs && entry.lastIndexedAt !== "unknown") {
3875
3482
  staleByTtl = now - Date.parse(entry.lastIndexedAt) > olderThanMs;
3876
3483
  }
3877
3484
  if (keepScopes.size > 0 && olderThanMs) {
@@ -3907,7 +3514,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
3907
3514
  scopeId: `${config.project.id}:${entry.scopeName}`
3908
3515
  };
3909
3516
  try {
3910
- await vectorStore.deleteScope(scope);
3517
+ await store.deleteScope(scope);
3911
3518
  deleted += 1;
3912
3519
  } catch (error) {
3913
3520
  process.stdout.write(
@@ -3924,7 +3531,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
3924
3531
  });
3925
3532
  program.command("doctor").description("Validate config, env vars, provider connectivity, and local write access").action(async (_opts, command) => {
3926
3533
  const rootOpts = getRootOptions(command);
3927
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3534
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
3928
3535
  const checks = [];
3929
3536
  let config = null;
3930
3537
  try {
@@ -3938,23 +3545,21 @@ program.command("doctor").description("Validate config, env vars, provider conne
3938
3545
  });
3939
3546
  }
3940
3547
  if (config) {
3941
- const embKey = process.env[config.embeddings.apiKeyEnv];
3548
+ const upstashUrl = config.upstash.url ?? process.env[config.upstash.urlEnv];
3549
+ const upstashToken = config.upstash.token ?? process.env[config.upstash.tokenEnv];
3942
3550
  checks.push({
3943
- name: `env ${config.embeddings.apiKeyEnv}`,
3944
- ok: Boolean(embKey),
3945
- details: embKey ? void 0 : "missing"
3551
+ name: `env ${config.upstash.urlEnv}`,
3552
+ ok: Boolean(upstashUrl),
3553
+ details: upstashUrl ? void 0 : "missing"
3554
+ });
3555
+ checks.push({
3556
+ name: `env ${config.upstash.tokenEnv}`,
3557
+ ok: Boolean(upstashToken),
3558
+ details: upstashToken ? void 0 : "missing"
3946
3559
  });
3947
- {
3948
- const tursoUrl = process.env[config.vector.turso.urlEnv];
3949
- checks.push({
3950
- name: "turso/libsql",
3951
- ok: true,
3952
- details: tursoUrl ? `remote: ${tursoUrl}` : `local file: ${config.vector.turso.localPath}`
3953
- });
3954
- }
3955
3560
  if (config.source.mode === "static-output") {
3956
- const outputDir = path13.resolve(cwd, config.source.staticOutputDir);
3957
- const exists = fs9.existsSync(outputDir);
3561
+ const outputDir = path12.resolve(cwd, config.source.staticOutputDir);
3562
+ const exists = fs8.existsSync(outputDir);
3958
3563
  checks.push({
3959
3564
  name: "source: static output dir",
3960
3565
  ok: exists,
@@ -3963,15 +3568,15 @@ program.command("doctor").description("Validate config, env vars, provider conne
3963
3568
  } else if (config.source.mode === "build") {
3964
3569
  const buildConfig = config.source.build;
3965
3570
  if (buildConfig) {
3966
- const manifestPath = path13.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
3967
- const manifestExists = fs9.existsSync(manifestPath);
3571
+ const manifestPath = path12.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
3572
+ const manifestExists = fs8.existsSync(manifestPath);
3968
3573
  checks.push({
3969
3574
  name: "source: build manifest",
3970
3575
  ok: manifestExists,
3971
3576
  details: manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
3972
3577
  });
3973
- const viteBin = path13.resolve(cwd, "node_modules", ".bin", "vite");
3974
- const viteExists = fs9.existsSync(viteBin);
3578
+ const viteBin = path12.resolve(cwd, "node_modules", ".bin", "vite");
3579
+ const viteExists = fs8.existsSync(viteBin);
3975
3580
  checks.push({
3976
3581
  name: "source: vite binary",
3977
3582
  ok: viteExists,
@@ -3988,7 +3593,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
3988
3593
  const contentConfig = config.source.contentFiles;
3989
3594
  if (contentConfig) {
3990
3595
  const fg4 = await import("fast-glob");
3991
- const baseDir = path13.resolve(cwd, contentConfig.baseDir);
3596
+ const baseDir = path12.resolve(cwd, contentConfig.baseDir);
3992
3597
  const files = await fg4.default(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
3993
3598
  checks.push({
3994
3599
  name: "source: content files",
@@ -4003,61 +3608,26 @@ program.command("doctor").description("Validate config, env vars, provider conne
4003
3608
  });
4004
3609
  }
4005
3610
  }
4006
- try {
4007
- const provider = createEmbeddingsProvider(config);
4008
- await provider.embedTexts(["searchsocket doctor ping"], config.embeddings.model);
4009
- checks.push({ name: "embedding provider connectivity", ok: true });
4010
- } catch (error) {
4011
- checks.push({
4012
- name: "embedding provider connectivity",
4013
- ok: false,
4014
- details: error instanceof Error ? error.message : "unknown error"
4015
- });
4016
- }
4017
3611
  let store = null;
4018
3612
  try {
4019
- store = await createVectorStore(config, cwd);
3613
+ store = await createUpstashStore(config);
4020
3614
  const health = await store.health();
4021
3615
  checks.push({
4022
- name: "vector backend connectivity",
3616
+ name: "upstash search connectivity",
4023
3617
  ok: health.ok,
4024
3618
  details: health.details
4025
3619
  });
4026
3620
  } catch (error) {
4027
3621
  checks.push({
4028
- name: "vector backend connectivity",
3622
+ name: "upstash search connectivity",
4029
3623
  ok: false,
4030
3624
  details: error instanceof Error ? error.message : "unknown error"
4031
3625
  });
4032
3626
  }
4033
- if (store) {
4034
- try {
4035
- const testScope = {
4036
- projectId: config.project.id,
4037
- scopeName: "_searchsocket_doctor_probe",
4038
- scopeId: `${config.project.id}:_searchsocket_doctor_probe`
4039
- };
4040
- await store.recordScope({
4041
- projectId: testScope.projectId,
4042
- scopeName: testScope.scopeName,
4043
- modelId: config.embeddings.model,
4044
- lastIndexedAt: (/* @__PURE__ */ new Date()).toISOString(),
4045
- vectorCount: 0
4046
- });
4047
- await store.deleteScope(testScope);
4048
- checks.push({ name: "vector backend write permission", ok: true });
4049
- } catch (error) {
4050
- checks.push({
4051
- name: "vector backend write permission",
4052
- ok: false,
4053
- details: error instanceof Error ? error.message : "write test failed"
4054
- });
4055
- }
4056
- }
4057
3627
  try {
4058
3628
  const scope = resolveScope(config);
4059
3629
  const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
4060
- const testPath = path13.join(statePath, ".write-test");
3630
+ const testPath = path12.join(statePath, ".write-test");
4061
3631
  await fsp.writeFile(testPath, "ok\n", "utf8");
4062
3632
  await fsp.rm(testPath, { force: true });
4063
3633
  checks.push({ name: "state directory writable", ok: true });
@@ -4086,7 +3656,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
4086
3656
  });
4087
3657
  program.command("mcp").description("Run SearchSocket MCP server").option("--transport <transport>", "stdio|http", "stdio").option("--port <n>", "HTTP port", "3338").option("--path <path>", "HTTP path", "/mcp").action(async (opts, command) => {
4088
3658
  const rootOpts = getRootOptions(command);
4089
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3659
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
4090
3660
  await runMcpServer({
4091
3661
  cwd,
4092
3662
  configPath: rootOpts?.config,
@@ -4095,9 +3665,9 @@ program.command("mcp").description("Run SearchSocket MCP server").option("--tran
4095
3665
  httpPath: opts.path
4096
3666
  });
4097
3667
  });
4098
- program.command("search").description("Quick local CLI search against indexed vectors").requiredOption("--q <query>", "search query").option("--scope <name>", "scope override").option("--top-k <n>", "top K results", "10").option("--path-prefix <prefix>", "path prefix filter").option("--rerank", "enable configured reranker", false).action(async (opts, command) => {
3668
+ program.command("search").description("Quick CLI search against Upstash Search").requiredOption("--q <query>", "search query").option("--scope <name>", "scope override").option("--top-k <n>", "top K results", "10").option("--path-prefix <prefix>", "path prefix filter").action(async (opts, command) => {
4099
3669
  const rootOpts = getRootOptions(command);
4100
- const cwd = path13.resolve(rootOpts?.cwd ?? process.cwd());
3670
+ const cwd = path12.resolve(rootOpts?.cwd ?? process.cwd());
4101
3671
  const engine = await SearchEngine.create({
4102
3672
  cwd,
4103
3673
  configPath: rootOpts?.config
@@ -4106,14 +3676,13 @@ program.command("search").description("Quick local CLI search against indexed ve
4106
3676
  q: opts.q,
4107
3677
  scope: opts.scope,
4108
3678
  topK: parsePositiveInt(opts.topK, "--top-k"),
4109
- pathPrefix: opts.pathPrefix,
4110
- rerank: opts.rerank
3679
+ pathPrefix: opts.pathPrefix
4111
3680
  });
4112
3681
  process.stdout.write(`${JSON.stringify(result, null, 2)}
4113
3682
  `);
4114
3683
  });
4115
3684
  async function main() {
4116
- dotenvConfig({ path: path13.resolve(process.cwd(), ".env") });
3685
+ dotenvConfig({ path: path12.resolve(process.cwd(), ".env") });
4117
3686
  await program.parseAsync(process.argv);
4118
3687
  }
4119
3688
  main().catch((error) => {