@romiluz/clawmongo 0.1.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +3 -0
  3. package/dist/cli/boundary-contract-smoke.js +108 -0
  4. package/dist/cli/embedding-policy-smoke.js +66 -0
  5. package/dist/cli/embedding-provider-live-smoke.js +94 -0
  6. package/dist/cli/embedding-provider-smoke.js +81 -0
  7. package/dist/cli/embedding-provider-voyage-batch-smoke.js +129 -0
  8. package/dist/cli/gateway-smoke.js +65 -0
  9. package/dist/cli/health.js +17 -0
  10. package/dist/cli/index-budget-smoke.js +14 -0
  11. package/dist/cli/key-schema-smoke.js +118 -0
  12. package/dist/cli/orchestrator-smoke.js +75 -0
  13. package/dist/cli/provider-adapter-smoke.js +61 -0
  14. package/dist/cli/replica-track-check.js +108 -0
  15. package/dist/cli/retrieval-compat-check.js +196 -0
  16. package/dist/cli/retrieval-contract-smoke.js +72 -0
  17. package/dist/cli/retrieval-eval.js +226 -0
  18. package/dist/cli/retrieval-provider-smoke.js +52 -0
  19. package/dist/cli/retrieval-seed-reembed-smoke.js +54 -0
  20. package/dist/cli/retrieval-seed.js +312 -0
  21. package/dist/cli/runtime-contract-smoke.js +201 -0
  22. package/dist/cli/session-key-smoke.js +62 -0
  23. package/dist/cli/sprint-checks.js +129 -0
  24. package/dist/cli/tool-runtime-smoke.js +68 -0
  25. package/dist/config/deployment-profiles.js +41 -0
  26. package/dist/config/env.js +49 -0
  27. package/dist/contracts/v1.js +1 -0
  28. package/dist/contracts/validators.js +153 -0
  29. package/dist/identity/key-schema.js +31 -0
  30. package/dist/main.js +97 -0
  31. package/dist/modules/eventing/index.js +58 -0
  32. package/dist/modules/eventing/service.js +139 -0
  33. package/dist/modules/gateway/index.js +44 -0
  34. package/dist/modules/gateway/service.js +118 -0
  35. package/dist/modules/ingestion/index.js +46 -0
  36. package/dist/modules/ingestion/service.js +56 -0
  37. package/dist/modules/mongo-store/index.js +21 -0
  38. package/dist/modules/observability/index.js +6 -0
  39. package/dist/modules/orchestrator/index.js +49 -0
  40. package/dist/modules/orchestrator/service.js +220 -0
  41. package/dist/modules/policy-engine/index.js +34 -0
  42. package/dist/modules/policy-engine/service.js +42 -0
  43. package/dist/modules/provider-adapter/index.js +37 -0
  44. package/dist/modules/provider-adapter/service.js +98 -0
  45. package/dist/modules/retrieval/index.js +64 -0
  46. package/dist/modules/stub.js +17 -0
  47. package/dist/modules/tool-runtime/index.js +30 -0
  48. package/dist/modules/tool-runtime/service.js +84 -0
  49. package/dist/retrieval/contracts.js +1 -0
  50. package/dist/retrieval/embeddings/policy.js +42 -0
  51. package/dist/retrieval/embeddings/provider.js +424 -0
  52. package/dist/retrieval/embeddings/query-vector.js +34 -0
  53. package/dist/retrieval/embeddings/voyage-remote-batch.js +312 -0
  54. package/dist/retrieval/engine.js +130 -0
  55. package/dist/retrieval/fixtures.js +123 -0
  56. package/dist/retrieval/providers/fusion.js +390 -0
  57. package/dist/retrieval/providers/lexical.js +267 -0
  58. package/dist/retrieval/providers/shared.js +88 -0
  59. package/dist/retrieval/providers/vector.js +274 -0
  60. package/dist/retrieval/reembed.js +116 -0
  61. package/dist/runtime/bootstrap.js +65 -0
  62. package/dist/runtime/types.js +1 -0
  63. package/dist/session/session-key.js +128 -0
  64. package/dist/store/mongo/bootstrap.js +129 -0
  65. package/dist/store/mongo/indexes.js +110 -0
  66. package/dist/store/mongo/validators.js +238 -0
  67. package/package.json +81 -0
@@ -0,0 +1,226 @@
1
+ import { performance } from "node:perf_hooks";
2
+ import { loadRuntimeConfig } from "../config/env.js";
3
+ import { RetrievalEngine } from "../retrieval/engine.js";
4
+ import { loadRetrievalFixtureBundle } from "../retrieval/fixtures.js";
5
+ import { AdaptiveFusionRetriever } from "../retrieval/providers/fusion.js";
6
+ import { MongoLexicalRetriever } from "../retrieval/providers/lexical.js";
7
+ import { MongoVectorRetriever } from "../retrieval/providers/vector.js";
8
+ import { DEPLOYMENT_PROFILES } from "../config/deployment-profiles.js";
9
/**
 * Parse CLI arguments for the retrieval evaluation run.
 *
 * Supported flags: `--topK <n>` (positive integer cutoff),
 * `--variant <lexical|vector|hybrid|all>`, and `--help` (prints usage and
 * exits the process). Any other token is rejected.
 *
 * @param {string[]} argv - process arguments after the script path.
 * @returns {{ topK: number, variant: string }} parsed options
 *   (defaults: topK=5, variant="all").
 * @throws {Error} on an unknown flag or an invalid flag value.
 */
function parseArgs(argv) {
    const parsed = { topK: 5, variant: "all" };
    const KNOWN_VARIANTS = ["lexical", "vector", "hybrid", "all"];
    let cursor = 0;
    while (cursor < argv.length) {
        const flag = argv[cursor];
        if (flag === "--topK") {
            const raw = argv[cursor + 1];
            const candidate = Number(raw);
            if (!Number.isInteger(candidate) || candidate <= 0) {
                throw new Error(`Invalid --topK value: ${raw}`);
            }
            parsed.topK = candidate;
            cursor += 2; // consume flag + value
        }
        else if (flag === "--variant") {
            const candidate = argv[cursor + 1];
            if (!KNOWN_VARIANTS.includes(candidate)) {
                throw new Error(`Invalid --variant value: ${candidate}`);
            }
            parsed.variant = candidate;
            cursor += 2; // consume flag + value
        }
        else if (flag === "--help") {
            const usageLines = [
                "Usage: npm run retrieval:eval -- [--topK <n>] [--variant lexical|vector|hybrid|all]",
                "",
                "Options:",
                "  --topK     retrieval cutoff for metrics (default: 5)",
                "  --variant  run one variant or all (default: all)"
            ];
            process.stdout.write(usageLines.join("\n") + "\n");
            process.exit(0);
        }
        else {
            throw new Error(`Unknown argument: ${flag}`);
        }
    }
    return { topK: parsed.topK, variant: parsed.variant };
}
46
/**
 * Linearly interpolated percentile of a numeric sample, rounded to 3 decimals.
 *
 * @param {number[]} values - sample values (not mutated; sorted on a copy).
 * @param {number} percentileValue - percentile in [0, 100].
 * @returns {number} interpolated percentile, or 0 for an empty sample.
 */
function percentile(values, percentileValue) {
    if (values.length === 0) {
        return 0;
    }
    const ascending = values.slice().sort((a, b) => a - b);
    // Fractional rank into the sorted sample (0-based, inclusive of both ends).
    const position = (percentileValue / 100) * (ascending.length - 1);
    const floorIndex = Math.floor(position);
    const ceilIndex = Math.ceil(position);
    if (floorIndex === ceilIndex) {
        // Exact hit on a sample point — no interpolation needed.
        return Number(ascending[floorIndex].toFixed(3));
    }
    const fraction = position - floorIndex;
    const blended = ascending[floorIndex] * (1 - fraction) + ascending[ceilIndex] * fraction;
    return Number(blended.toFixed(3));
}
61
/**
 * Recall@K: fraction of ground-truth relevant ids present in the retrieved set.
 *
 * @param {string[]} relevantIds - ground-truth relevant ids.
 * @param {string[]} retrievedIds - retrieved ids (already cut to K by caller).
 * @returns {number} recall in [0, 1]; 0 when there are no relevant ids.
 */
function recallAtK(relevantIds, retrievedIds) {
    if (relevantIds.length === 0) {
        return 0;
    }
    const retrieved = new Set(retrievedIds);
    let hits = 0;
    for (const id of relevantIds) {
        if (retrieved.has(id)) {
            hits += 1;
        }
    }
    return hits / relevantIds.length;
}
69
/**
 * Reciprocal rank for a single query: 1/(1-based rank of the first relevant
 * hit in retrievedIds), or 0 when no retrieved id is relevant.
 *
 * @param {string[]} relevantIds - ground-truth relevant ids.
 * @param {string[]} retrievedIds - ranked retrieval results, best first.
 * @returns {number} reciprocal rank in (0, 1], or 0 on a miss.
 */
function mrrAtK(relevantIds, retrievedIds) {
    // Set membership keeps the scan O(R + K) instead of O(R * K) from
    // Array.includes inside the loop, and matches recallAtK's approach.
    const relevantSet = new Set(relevantIds);
    for (let index = 0; index < retrievedIds.length; index += 1) {
        if (relevantSet.has(retrievedIds[index])) {
            return 1 / (index + 1);
        }
    }
    return 0;
}
77
/**
 * NDCG@K with binary relevance: DCG of the retrieved ranking divided by the
 * ideal DCG (all relevant ids placed first, capped at the retrieved length).
 *
 * @param {string[]} relevantIds - ground-truth relevant ids.
 * @param {string[]} retrievedIds - ranked retrieval results, best first.
 * @returns {number} NDCG in [0, 1]; 0 when the ideal DCG is 0.
 */
function ndcgAtK(relevantIds, retrievedIds) {
    const relevant = new Set(relevantIds);
    let dcg = 0;
    for (let rank = 0; rank < retrievedIds.length; rank += 1) {
        if (relevant.has(retrievedIds[rank])) {
            // Binary gain with the standard log2(rank + 2) position discount.
            dcg += 1 / Math.log2(rank + 2);
        }
    }
    let idcg = 0;
    const idealHits = Math.min(relevantIds.length, retrievedIds.length);
    for (let rank = 0; rank < idealHits; rank += 1) {
        idcg += 1 / Math.log2(rank + 2);
    }
    return idcg === 0 ? 0 : dcg / idcg;
}
93
/**
 * Arithmetic mean of a numeric sample; 0 for an empty sample.
 *
 * @param {number[]} values - sample values.
 * @returns {number} the mean, or 0 when values is empty.
 */
function mean(values) {
    if (values.length === 0) {
        return 0;
    }
    let total = 0;
    for (const value of values) {
        total += value;
    }
    return total / values.length;
}
99
/**
 * Build the scope-filter document that constrains retrieval to one tenant and
 * workspace (and, when present on the entry, one channel/thread), pinned to a
 * single embedding space.
 *
 * Falsy `channel` / `thread_key` values are omitted entirely rather than sent
 * as null, matching how the evaluation queries carry optional scope fields.
 *
 * @param {object} entry - fixture query carrying tenant_id, workspace_id and
 *   optional channel / thread_key fields.
 * @param {{ embeddingProvider?: string, embeddingModel?: string }} [embedding]
 *   optional embedding-space override; defaults preserve the previous
 *   hard-coded "voyage" / "voyage-4-large" behavior for existing callers.
 * @returns {object} filter fields for the retrieval providers.
 */
function buildScopeFilters(entry, { embeddingProvider = "voyage", embeddingModel = "voyage-4-large" } = {}) {
    return {
        tenant_id: entry.tenant_id,
        workspace_id: entry.workspace_id,
        ...(entry.channel ? { channel: entry.channel } : {}),
        ...(entry.thread_key ? { thread_key: entry.thread_key } : {}),
        embedding_provider: embeddingProvider,
        embedding_model: embeddingModel
    };
}
109
// Entry point for the retrieval evaluation CLI.
//
// Loads runtime config, constructs the lexical/vector/fusion retrievers against
// the configured MongoDB deployment, replays every fixture query through the
// requested variant(s), and writes a JSON report (per-variant quality metrics,
// latency percentiles, and up to 3 sample rows) to stdout.
// Throws when CLAWMONGO_MONGODB_URI is not configured.
async function run() {
    const options = parseArgs(process.argv.slice(2));
    const config = loadRuntimeConfig();
    if (!config.mongo.uri) {
        throw new Error("CLAWMONGO_MONGODB_URI is required for retrieval evaluation runs.");
    }
    const fixtureBundle = await loadRetrievalFixtureBundle();
    const lexical = new MongoLexicalRetriever({
        deploymentProfile: config.deploymentProfile,
        uri: config.mongo.uri,
        dbName: config.mongo.dbName,
        connectTimeoutMs: config.mongo.connectTimeoutMs
    });
    const vector = new MongoVectorRetriever({
        deploymentProfile: config.deploymentProfile,
        uri: config.mongo.uri,
        dbName: config.mongo.dbName,
        connectTimeoutMs: config.mongo.connectTimeoutMs
    });
    const fusion = new AdaptiveFusionRetriever({
        // Native fusion support is a property of the deployment profile.
        supportsNativeFusion: DEPLOYMENT_PROFILES[config.deploymentProfile].search.native_fusion,
        uri: config.mongo.uri,
        dbName: config.mongo.dbName,
        connectTimeoutMs: config.mongo.connectTimeoutMs
    });
    const engine = new RetrievalEngine(lexical, vector, fusion);
    // "all" fans out to every variant; otherwise run only the requested one.
    const variants = options.variant === "all" ? ["lexical", "vector", "hybrid"] : [options.variant];
    const perVariantRows = {
        lexical: [],
        vector: [],
        hybrid: []
    };
    for (const variant of variants) {
        for (const query of fixtureBundle.queries.queries) {
            const scopeFilters = buildScopeFilters(query);
            const startedAt = performance.now();
            let retrievedIds = [];
            if (variant === "vector") {
                // The vector variant bypasses the engine and hits the vector
                // retriever directly so its stage is measured in isolation.
                const vectorStage = await vector.searchWithTelemetry({
                    query: query.query,
                    scopeFilters,
                    topK: options.topK,
                    profile: config.deploymentProfile,
                    preferNativeFusion: true
                });
                retrievedIds = vectorStage.hits.slice(0, options.topK).map((hit) => hit.id);
            }
            else {
                // Lexical and hybrid variants go through the engine; native
                // fusion is requested only for the hybrid variant.
                const response = await engine.retrieve({
                    query: query.query,
                    scopeFilters,
                    topK: options.topK,
                    profile: config.deploymentProfile,
                    preferNativeFusion: variant === "hybrid"
                });
                retrievedIds = response.hits.slice(0, options.topK).map((hit) => hit.id);
            }
            const latencyMs = Number((performance.now() - startedAt).toFixed(3));
            const relevantIds = query.expected_chunk_ids;
            const row = {
                queryId: query.query_id,
                variant,
                relevant: relevantIds.length,
                retrieved: retrievedIds,
                recallAtK: Number(recallAtK(relevantIds, retrievedIds).toFixed(6)),
                mrrAtK: Number(mrrAtK(relevantIds, retrievedIds).toFixed(6)),
                ndcgAtK: Number(ndcgAtK(relevantIds, retrievedIds).toFixed(6)),
                // Only queries flagged critical_exact carry a hit flag; null = N/A.
                exactTokenHit: query.critical_exact
                    ? relevantIds.some((id) => retrievedIds.includes(id))
                    : null,
                latencyMs
            };
            perVariantRows[variant].push(row);
        }
    }
    // Aggregate per-query rows into per-variant means and latency percentiles.
    const metrics = Object.fromEntries(variants.map((variant) => {
        const rows = perVariantRows[variant];
        const latencies = rows.map((row) => row.latencyMs);
        // Hit rate is computed only over rows where exactTokenHit applies.
        const exactRows = rows.filter((row) => row.exactTokenHit !== null);
        const exactTokenHitRate = exactRows.length === 0
            ? null
            : Number((exactRows.filter((row) => row.exactTokenHit === true).length / exactRows.length).toFixed(6));
        return [
            variant,
            {
                queryCount: rows.length,
                recallAtK: Number(mean(rows.map((row) => row.recallAtK)).toFixed(6)),
                mrrAtK: Number(mean(rows.map((row) => row.mrrAtK)).toFixed(6)),
                ndcgAtK: Number(mean(rows.map((row) => row.ndcgAtK)).toFixed(6)),
                exactTokenHitRateAtK: exactTokenHitRate,
                latencyMs: {
                    p50: percentile(latencies, 50),
                    p95: percentile(latencies, 95),
                    p99: percentile(latencies, 99)
                }
            }
        ];
    }));
    process.stdout.write(`${JSON.stringify({
        status: "PASS",
        profile: config.deploymentProfile,
        topK: options.topK,
        dataset: {
            datasetId: fixtureBundle.corpus.dataset_id,
            version: fixtureBundle.corpus.version,
            fingerprint: fixtureBundle.fingerprint,
            queryCount: fixtureBundle.queries.queries.length
        },
        variants,
        metrics,
        sampleRows: Object.fromEntries(variants.map((variant) => [variant, perVariantRows[variant].slice(0, 3)]))
    }, null, 2)}\n`);
}
222
// Kick off the CLI; surface failures on stderr and set a non-zero exit code
// instead of letting the rejection escape unhandled.
function reportFailure(error) {
    const reason = error instanceof Error ? error.message : String(error);
    process.stderr.write(`retrieval-eval failed: ${reason}\n`);
    process.exitCode = 1;
}
void run().catch(reportFailure);
@@ -0,0 +1,52 @@
1
+ import assert from "node:assert/strict";
2
+ import { loadRuntimeConfig } from "../config/env.js";
3
+ import { MongoLexicalRetriever } from "../retrieval/providers/lexical.js";
4
+ import { MongoVectorRetriever } from "../retrieval/providers/vector.js";
5
/**
 * Smoke-check the lexical and vector retrieval providers against the
 * configured MongoDB deployment: issue one scoped query through each,
 * assert the telemetry stage labels and result counts are consistent with
 * the returned hits, then print a JSON PASS report to stdout.
 */
const run = async () => {
    const config = loadRuntimeConfig();
    const lexical = new MongoLexicalRetriever({
        deploymentProfile: config.deploymentProfile,
        uri: config.mongo.uri,
        dbName: config.mongo.dbName,
        connectTimeoutMs: config.mongo.connectTimeoutMs
    });
    const vector = new MongoVectorRetriever({
        deploymentProfile: config.deploymentProfile,
        uri: config.mongo.uri,
        dbName: config.mongo.dbName,
        connectTimeoutMs: config.mongo.connectTimeoutMs
    });
    // The mixed-case Slack ID in the query text exercises exact-token matching.
    const query = {
        query: "Slack IDs must preserve case D0ADSDZ345P",
        scopeFilters: {
            tenant_id: "tenant-demo",
            workspace_id: "workspace-alpha",
            embedding_provider: "voyage",
            embedding_model: "voyage-4-large"
        },
        topK: 3,
        profile: config.deploymentProfile,
        preferNativeFusion: true
    };
    const lexicalResult = await lexical.searchWithTelemetry(query);
    const vectorResult = await vector.searchWithTelemetry(query);
    assert.equal(lexicalResult.telemetry.stage, "lexical");
    assert.equal(vectorResult.telemetry.stage, "vector");
    assert.equal(lexicalResult.telemetry.resultCount, lexicalResult.hits.length);
    assert.equal(vectorResult.telemetry.resultCount, vectorResult.hits.length);
    process.stdout.write(`${JSON.stringify({
        status: "PASS",
        profile: config.deploymentProfile,
        lexical: {
            hitCount: lexicalResult.hits.length,
            topHit: lexicalResult.hits[0]?.id ?? null,
            telemetry: lexicalResult.telemetry
        },
        vector: {
            hitCount: vectorResult.hits.length,
            topHit: vectorResult.hits[0]?.id ?? null,
            telemetry: vectorResult.telemetry
        }
    }, null, 2)}\n`);
};
// FIX: this script previously ended with a bare `void run();`, so any failure
// (bad config, connection error, failed assertion) surfaced as an unhandled
// promise rejection rather than the structured "<name> failed: ..." stderr
// line + process.exitCode = 1 that every sibling CLI smoke emits. Handle the
// rejection the same way the other smokes do.
void run().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    process.stderr.write(`retrieval-provider-smoke failed: ${message}\n`);
    process.exitCode = 1;
});
@@ -0,0 +1,54 @@
1
+ import assert from "node:assert/strict";
2
+ import { loadRetrievalFixtureBundle } from "../retrieval/fixtures.js";
3
+ import { reembedFixtureChunks } from "../retrieval/reembed.js";
4
// Smoke-check the fixture re-embedding path in its offline (live: false)
// document-input mode: re-embed the fixture corpus, assert the summary
// contract and per-chunk identity stability, then print a JSON PASS report.
async function run() {
    const bundle = await loadRetrievalFixtureBundle();
    const result = await reembedFixtureChunks(bundle.corpus.chunks, {
        live: false
    });
    // Summary contract: offline document re-embedding must be enabled, must
    // not request a remote batch, and must cover every input chunk.
    assert.equal(result.summary.enabled, true);
    assert.equal(result.summary.inputType, "document");
    assert.equal(result.summary.live, false);
    assert.equal(result.summary.remoteBatchRequested, false);
    assert.equal(result.summary.chunkCount, bundle.corpus.chunks.length);
    assert.ok(result.summary.groups > 0);
    assert.ok(Object.keys(result.summary.providers).length > 0);
    assert.ok(Object.keys(result.summary.models).length > 0);
    // Offline mode should route at least some chunks through the
    // deterministic backend (?? 0 guards a missing counter key).
    assert.ok((result.summary.backendSources.deterministic ?? 0) > 0);
    assert.ok(result.summary.dimensions.every((value) => Number.isInteger(value) && value > 0));
    // Identity stability: re-embedding must not rewrite chunk identity fields.
    const firstChunk = result.chunks[0];
    const originalFirst = bundle.corpus.chunks[0];
    assert.ok(firstChunk, "Expected at least one fixture chunk after re-embedding.");
    assert.ok(originalFirst, "Expected at least one original fixture chunk.");
    assert.equal(firstChunk.embedding_provider.length > 0, true);
    assert.equal(firstChunk.embedding_model.length > 0, true);
    assert.equal(firstChunk.embedding.length > 0, true);
    assert.equal(firstChunk.doc_id, originalFirst.doc_id);
    assert.equal(firstChunk.chunk_id, originalFirst.chunk_id);
    process.stdout.write(`${JSON.stringify({
        status: "PASS",
        checks: [
            {
                id: "reembed_document_input_contract",
                ok: true,
                summary: result.summary
            },
            {
                id: "reembed_chunk_identity_stability",
                ok: true,
                firstChunk: {
                    doc_id: firstChunk.doc_id,
                    chunk_id: firstChunk.chunk_id,
                    embedding_provider: firstChunk.embedding_provider,
                    embedding_model: firstChunk.embedding_model,
                    dimensions: firstChunk.embedding.length
                }
            }
        ]
    }, null, 2)}\n`);
}
50
// Kick off the smoke; report failures on stderr and mark the process as
// failed instead of letting the rejection escape unhandled.
function reportFailure(error) {
    const reason = error instanceof Error ? error.message : String(error);
    process.stderr.write(`retrieval-seed-reembed-smoke failed: ${reason}\n`);
    process.exitCode = 1;
}
void run().catch(reportFailure);
@@ -0,0 +1,312 @@
1
+ import { MongoClient } from "mongodb";
2
+ import { loadRuntimeConfig } from "../config/env.js";
3
+ import { loadRetrievalFixtureBundle, summarizeRetrievalFixture } from "../retrieval/fixtures.js";
4
+ import { reembedFixtureChunks } from "../retrieval/reembed.js";
5
// Parse CLI arguments for the retrieval seeding tool.
//
// Boolean flags toggle fields on the options object; value flags consume the
// following token (numeric flags coerce it with Number — NaN is passed through
// and left for downstream validation). `--help` prints usage and exits.
// Unknown tokens throw. Returns the populated options object.
function parseArgs(argv) {
    // Defaults: dry-run, no reset, no re-embedding; remote-batch waits by default.
    const options = {
        apply: false,
        reset: false,
        reembed: false,
        reembedLive: false,
        reembedVoyageRemoteBatch: false,
        reembedVoyageRemoteBatchWait: true
    };
    for (let index = 0; index < argv.length; index += 1) {
        const token = argv[index];
        if (token === "--apply") {
            options.apply = true;
            continue;
        }
        if (token === "--reset") {
            options.reset = true;
            continue;
        }
        if (token === "--corpus") {
            // Value flag: consume the next token as the override path.
            options.corpusPath = argv[index + 1];
            index += 1;
            continue;
        }
        if (token === "--queries") {
            options.queriesPath = argv[index + 1];
            index += 1;
            continue;
        }
        if (token === "--reembed") {
            options.reembed = true;
            continue;
        }
        if (token === "--reembed-provider") {
            options.reembedProvider = argv[index + 1];
            index += 1;
            continue;
        }
        if (token === "--reembed-model") {
            options.reembedModel = argv[index + 1];
            index += 1;
            continue;
        }
        if (token === "--reembed-dimensions") {
            // NOTE(review): Number(undefined or non-numeric) yields NaN here;
            // validateOptions only checks `typeof === "number"`, so NaN flows
            // downstream — confirm reembedFixtureChunks rejects it.
            options.reembedDimensions = Number(argv[index + 1]);
            index += 1;
            continue;
        }
        if (token === "--reembed-live") {
            options.reembedLive = true;
            continue;
        }
        if (token === "--reembed-voyage-remote-batch") {
            options.reembedVoyageRemoteBatch = true;
            continue;
        }
        if (token === "--reembed-voyage-remote-batch-min-texts") {
            options.reembedVoyageRemoteBatchMinTexts = Number(argv[index + 1]);
            index += 1;
            continue;
        }
        if (token === "--reembed-voyage-remote-batch-no-wait") {
            // Inverts the wait default: submit the batch and return immediately.
            options.reembedVoyageRemoteBatchWait = false;
            continue;
        }
        if (token === "--reembed-voyage-remote-batch-poll-ms") {
            options.reembedVoyageRemoteBatchPollMs = Number(argv[index + 1]);
            index += 1;
            continue;
        }
        if (token === "--reembed-voyage-remote-batch-timeout-ms") {
            options.reembedVoyageRemoteBatchTimeoutMs = Number(argv[index + 1]);
            index += 1;
            continue;
        }
        if (token === "--reembed-voyage-remote-batch-concurrency") {
            options.reembedVoyageRemoteBatchConcurrency = Number(argv[index + 1]);
            index += 1;
            continue;
        }
        if (token === "--help") {
            process.stdout.write([
                "Usage: npm run retrieval:seed -- [--apply] [--reset] [--corpus <path>] [--queries <path>]",
                "",
                "Options:",
                "  --apply    write fixture corpus + queries into MongoDB",
                "  --reset    delete current dataset rows before writing",
                "  --corpus   override fixture corpus path",
                "  --queries  override fixture queries path",
                "  --reembed  regenerate chunk embeddings using provider abstraction (inputType=document)",
                "  --reembed-provider override embedding provider for all chunks (requires --reembed-model)",
                "  --reembed-model override embedding model for all chunks",
                "  --reembed-dimensions override embedding dimensions for all chunks",
                "  --reembed-live use live provider adapters for re-embedding",
                "  --reembed-voyage-remote-batch use Voyage /files + /batches mode for document embeddings (live only)",
                "  --reembed-voyage-remote-batch-min-texts <n> minimum texts to trigger remote batch mode",
                "  --reembed-voyage-remote-batch-no-wait submit batch without waiting for completion",
                "  --reembed-voyage-remote-batch-poll-ms <n> poll interval for batch completion",
                "  --reembed-voyage-remote-batch-timeout-ms <n> timeout for batch completion",
                "  --reembed-voyage-remote-batch-concurrency <n> concurrent remote batch groups"
            ].join("\n") + "\n");
            process.exit(0);
        }
        throw new Error(`Unknown argument: ${token}`);
    }
    return options;
}
112
/**
 * Convert a fixture datetime string into a Date, rejecting unparseable input.
 *
 * @param {string} value - datetime string from fixture data.
 * @returns {Date} the parsed timestamp.
 * @throws {Error} when the value does not parse to a valid date.
 */
function toDate(value) {
    const candidate = new Date(value);
    if (Number.isNaN(candidate.valueOf())) {
        throw new Error(`Invalid fixture datetime: '${value}'`);
    }
    return candidate;
}
119
/**
 * Enforce flag-combination invariants for the seeding CLI.
 *
 * Rules: a provider override requires a model override; any Voyage
 * remote-batch flag (main toggle, --no-wait, or a numeric tuning flag)
 * requires --reembed and --reembed-live, and the provider override (if any)
 * must be voyage (case-insensitive, trimmed).
 *
 * @param {object} options - options produced by parseArgs.
 * @throws {Error} when an invalid combination of flags was supplied.
 */
function validateOptions(options) {
    if (options.reembedProvider && !options.reembedModel) {
        throw new Error("--reembed-provider requires --reembed-model so provider/model overrides stay explicit.");
    }
    // Any numeric tuning value (or an explicit --no-wait) counts as touching
    // the remote-batch feature, even without the main toggle.
    const tuningValues = [
        options.reembedVoyageRemoteBatchMinTexts,
        options.reembedVoyageRemoteBatchPollMs,
        options.reembedVoyageRemoteBatchTimeoutMs,
        options.reembedVoyageRemoteBatchConcurrency
    ];
    const remoteBatchTouched = options.reembedVoyageRemoteBatch ||
        options.reembedVoyageRemoteBatchWait === false ||
        tuningValues.some((value) => typeof value === "number");
    if (!remoteBatchTouched) {
        return;
    }
    if (!options.reembed) {
        throw new Error("Voyage remote-batch flags require --reembed because they only apply to re-embedding.");
    }
    if (!options.reembedLive) {
        throw new Error("Voyage remote-batch requires --reembed-live because remote /files + /batches is a live-provider path.");
    }
    const providerOverride = options.reembedProvider;
    if (providerOverride && providerOverride.trim().toLowerCase() !== "voyage") {
        throw new Error("Voyage remote-batch flags can only be used when re-embedding provider is voyage.");
    }
}
141
// Entry point for the retrieval seeding CLI.
//
// Loads the fixture bundle (optionally re-embedding its chunks), then either
// prints a DRY_RUN report (default) or — with --apply — upserts the corpus
// chunks into `memory_chunks` and the evaluation queries into
// `retrieval_eval_queries`, optionally deleting the dataset's existing rows
// first (--reset). Prints an APPLIED report with write counts on success.
async function run() {
    const options = parseArgs(process.argv.slice(2));
    validateOptions(options);
    const config = loadRuntimeConfig();
    const fixturePaths = {
        corpusPath: options.corpusPath,
        queriesPath: options.queriesPath
    };
    const bundle = await loadRetrievalFixtureBundle(fixturePaths);
    const summary = summarizeRetrievalFixture(bundle);
    let reembedSummary = null;
    let corpusChunks = bundle.corpus.chunks;
    if (options.reembed) {
        // Re-embedding replaces the chunks that get written; undefined
        // overrides fall through to reembedFixtureChunks defaults.
        const reembedded = await reembedFixtureChunks(bundle.corpus.chunks, {
            provider: options.reembedProvider,
            model: options.reembedModel,
            dimensions: options.reembedDimensions,
            live: options.reembedLive,
            remoteBatch: options.reembedVoyageRemoteBatch,
            remoteBatchMinTexts: options.reembedVoyageRemoteBatchMinTexts,
            remoteBatchWait: options.reembedVoyageRemoteBatchWait,
            remoteBatchPollIntervalMs: options.reembedVoyageRemoteBatchPollMs,
            remoteBatchTimeoutMs: options.reembedVoyageRemoteBatchTimeoutMs,
            remoteBatchConcurrency: options.reembedVoyageRemoteBatchConcurrency
        });
        corpusChunks = reembedded.chunks;
        reembedSummary = reembedded.summary;
    }
    if (!options.apply) {
        // Dry run: report what would be written, touch nothing, and exit.
        process.stdout.write(`${JSON.stringify({
            status: "DRY_RUN",
            apply: false,
            reset: options.reset,
            dataset: summary,
            reembed: reembedSummary,
            files: {
                corpusPath: bundle.corpusPath,
                queriesPath: bundle.queriesPath
            }
        }, null, 2)}\n`);
        return;
    }
    if (!config.mongo.uri) {
        throw new Error("CLAWMONGO_MONGODB_URI is required when --apply is set.");
    }
    const client = new MongoClient(config.mongo.uri, {
        appName: "clawmongo-retrieval-seed",
        serverSelectionTimeoutMS: config.mongo.connectTimeoutMs
    });
    await client.connect();
    try {
        const db = client.db(config.mongo.dbName);
        const memoryChunks = db.collection("memory_chunks");
        const retrievalQueries = db.collection("retrieval_eval_queries");
        const resetResult = {
            memoryChunksDeleted: 0,
            retrievalQueriesDeleted: 0
        };
        if (options.reset) {
            // Reset only removes rows tagged with this fixture dataset id, so
            // non-fixture data in the same collections is left untouched.
            const [chunksDelete, queriesDelete] = await Promise.all([
                memoryChunks.deleteMany({ fixture_dataset: summary.datasetId }),
                retrievalQueries.deleteMany({ fixture_dataset: summary.datasetId })
            ]);
            resetResult.memoryChunksDeleted = chunksDelete.deletedCount;
            resetResult.retrievalQueriesDeleted = queriesDelete.deletedCount;
        }
        // Upsert each chunk keyed by (tenant_id, workspace_id, chunk_id);
        // created_at is only set on insert, updated_at tracks the event time.
        const chunkWrites = await memoryChunks.bulkWrite(corpusChunks.map((chunk) => {
            const eventTs = toDate(chunk.timestamp);
            const payload = {
                tenant_id: chunk.tenant_id,
                workspace_id: chunk.workspace_id,
                chunk_id: chunk.chunk_id,
                text: chunk.text,
                embedding: chunk.embedding,
                embedding_provider: chunk.embedding_provider,
                embedding_model: chunk.embedding_model,
                event_ts: eventTs,
                schema_version: chunk.schema_version,
                updated_at: eventTs,
                doc_id: chunk.doc_id,
                source_type: chunk.source_type,
                channel: chunk.channel,
                thread_key: chunk.thread_key,
                language: chunk.language,
                fixture_dataset: summary.datasetId,
                fixture_fingerprint: summary.fingerprint
            };
            return {
                updateOne: {
                    filter: {
                        tenant_id: chunk.tenant_id,
                        workspace_id: chunk.workspace_id,
                        chunk_id: chunk.chunk_id
                    },
                    update: {
                        $set: payload,
                        $setOnInsert: {
                            created_at: eventTs
                        }
                    },
                    upsert: true
                }
            };
        }), { ordered: false });
        // Queries share one timestamp: the corpus generation time.
        const queryTimestamp = toDate(bundle.corpus.generated_at);
        // Upsert each evaluation query keyed by (fixture_dataset, query_id);
        // optional scope fields are normalized to null (?? null) for storage.
        const queryWrites = await retrievalQueries.bulkWrite(bundle.queries.queries.map((query) => {
            const payload = {
                fixture_dataset: summary.datasetId,
                fixture_fingerprint: summary.fingerprint,
                query_id: query.query_id,
                query_class: query.query_class,
                query: query.query,
                tenant_id: query.tenant_id,
                workspace_id: query.workspace_id,
                channel: query.channel ?? null,
                thread_key: query.thread_key ?? null,
                time_range: query.time_range ?? null,
                expected_chunk_ids: query.expected_chunk_ids,
                critical_exact: query.critical_exact,
                scope_valid: query.scope_valid,
                updated_at: queryTimestamp
            };
            return {
                updateOne: {
                    filter: {
                        fixture_dataset: summary.datasetId,
                        query_id: query.query_id
                    },
                    update: {
                        $set: payload,
                        $setOnInsert: {
                            created_at: queryTimestamp
                        }
                    },
                    upsert: true
                }
            };
        }), { ordered: false });
        process.stdout.write(`${JSON.stringify({
            status: "APPLIED",
            apply: true,
            reset: options.reset,
            dataset: summary,
            reembed: reembedSummary,
            files: {
                corpusPath: bundle.corpusPath,
                queriesPath: bundle.queriesPath
            },
            resetResult,
            writes: {
                memory_chunks: {
                    matchedCount: chunkWrites.matchedCount,
                    modifiedCount: chunkWrites.modifiedCount,
                    upsertedCount: chunkWrites.upsertedCount
                },
                retrieval_eval_queries: {
                    matchedCount: queryWrites.matchedCount,
                    modifiedCount: queryWrites.modifiedCount,
                    upsertedCount: queryWrites.upsertedCount
                }
            }
        }, null, 2)}\n`);
    }
    finally {
        // Always release the Mongo connection, even on write failures.
        await client.close();
    }
}
308
// Kick off the seeding CLI; report failures on stderr and mark the process
// as failed instead of letting the rejection escape unhandled.
function reportFailure(error) {
    const reason = error instanceof Error ? error.message : String(error);
    process.stderr.write(`retrieval-seed failed: ${reason}\n`);
    process.exitCode = 1;
}
void run().catch(reportFailure);