nodebench-mcp 2.33.0 → 2.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/dist/benchmarks/ambientBench.d.ts +27 -0
  2. package/dist/benchmarks/ambientBench.js +900 -0
  3. package/dist/benchmarks/ambientBench.js.map +1 -0
  4. package/dist/benchmarks/index.d.ts +1 -0
  5. package/dist/benchmarks/index.js +2 -0
  6. package/dist/benchmarks/index.js.map +1 -0
  7. package/dist/dashboard/operatingDashboardHtml.d.ts +23 -0
  8. package/dist/dashboard/operatingDashboardHtml.js +2036 -0
  9. package/dist/dashboard/operatingDashboardHtml.js.map +1 -0
  10. package/dist/dashboard/operatingServer.d.ts +23 -0
  11. package/dist/dashboard/operatingServer.js +704 -0
  12. package/dist/dashboard/operatingServer.js.map +1 -0
  13. package/dist/index.js +13 -4
  14. package/dist/index.js.map +1 -1
  15. package/dist/providers/index.d.ts +9 -0
  16. package/dist/providers/index.js +12 -0
  17. package/dist/providers/index.js.map +1 -0
  18. package/dist/providers/localMemoryProvider.d.ts +35 -0
  19. package/dist/providers/localMemoryProvider.js +370 -0
  20. package/dist/providers/localMemoryProvider.js.map +1 -0
  21. package/dist/providers/memoryProvider.d.ts +187 -0
  22. package/dist/providers/memoryProvider.js +15 -0
  23. package/dist/providers/memoryProvider.js.map +1 -0
  24. package/dist/providers/memoryProviderRegistry.d.ts +61 -0
  25. package/dist/providers/memoryProviderRegistry.js +107 -0
  26. package/dist/providers/memoryProviderRegistry.js.map +1 -0
  27. package/dist/providers/supermemoryProvider.d.ts +58 -0
  28. package/dist/providers/supermemoryProvider.js +432 -0
  29. package/dist/providers/supermemoryProvider.js.map +1 -0
  30. package/dist/tools/causalMemoryTools.d.ts +11 -0
  31. package/dist/tools/causalMemoryTools.js +639 -0
  32. package/dist/tools/causalMemoryTools.js.map +1 -0
  33. package/dist/tools/dogfoodJudgeTools.d.ts +13 -0
  34. package/dist/tools/dogfoodJudgeTools.js +809 -0
  35. package/dist/tools/dogfoodJudgeTools.js.map +1 -0
  36. package/dist/tools/localDashboardTools.js +36 -0
  37. package/dist/tools/localDashboardTools.js.map +1 -1
  38. package/dist/tools/progressiveDiscoveryTools.js +1 -1
  39. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  40. package/dist/tools/toolRegistry.js +292 -0
  41. package/dist/tools/toolRegistry.js.map +1 -1
  42. package/dist/toolsetRegistry.js +4 -1
  43. package/dist/toolsetRegistry.js.map +1 -1
  44. package/package.json +1 -1
@@ -0,0 +1,900 @@
1
+ /**
2
+ * Ambient Intelligence Benchmark Suite
3
+ *
4
+ * 5 benchmarks that measure NodeBench's structural ambient-intelligence
5
+ * capabilities: packet reuse, contradiction detection, company profiling,
6
+ * action provenance, and multi-provider continuity.
7
+ *
8
+ * All scoring is deterministic and heuristic-based (no LLM calls).
9
+ * Uses the shared SQLite via getDb(). Each benchmark is self-contained
10
+ * with its own setup/teardown using unique IDs to avoid collision.
11
+ */
12
+ import { getDb } from "../db.js";
13
+ import crypto from "node:crypto";
14
+ /* ================================================================== */
15
+ /* Schema bootstrap (idempotent) */
16
+ /* ================================================================== */
17
+ let _benchSchemaReady = false;
18
/**
 * Create the benchmark tables and their benchRunId indexes on the shared
 * database obtained from getDb().
 *
 * Every statement uses IF NOT EXISTS, and the module-level
 * `_benchSchemaReady` flag is set once the DDL has executed, so later calls
 * in the same process return without touching the database.
 */
function ensureBenchSchema() {
    if (!_benchSchemaReady) {
        const schemaSql = `
    CREATE TABLE IF NOT EXISTS amb_ingestion_items (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      sessionIndex INTEGER NOT NULL,
      itemType TEXT NOT NULL,
      content TEXT NOT NULL,
      sourceProvider TEXT,
      entityRefs TEXT,
      timestampMs INTEGER NOT NULL,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_ingestion_benchrun ON amb_ingestion_items(benchRunId);

    CREATE TABLE IF NOT EXISTS amb_packets (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      packetType TEXT NOT NULL,
      title TEXT NOT NULL,
      content TEXT NOT NULL,
      sourceItemIds TEXT NOT NULL,
      entityRefs TEXT,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_packets_benchrun ON amb_packets(benchRunId);

    CREATE TABLE IF NOT EXISTS amb_contradictions (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      itemIdA TEXT NOT NULL,
      itemIdB TEXT NOT NULL,
      explanation TEXT NOT NULL,
      confidence REAL NOT NULL,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_contradictions_benchrun ON amb_contradictions(benchRunId);

    CREATE TABLE IF NOT EXISTS amb_entity_profiles (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      entityType TEXT NOT NULL,
      entityId TEXT NOT NULL,
      thesis TEXT,
      wedge TEXT,
      initiatives TEXT,
      competitors TEXT,
      rawSourceIds TEXT NOT NULL,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_entity_profiles_benchrun ON amb_entity_profiles(benchRunId);

    CREATE TABLE IF NOT EXISTS amb_provenance_chain (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      stepIndex INTEGER NOT NULL,
      actionDescription TEXT NOT NULL,
      causedByStepIndex INTEGER,
      entityId TEXT NOT NULL,
      stateSnapshot TEXT NOT NULL,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_provenance_benchrun ON amb_provenance_chain(benchRunId);

    CREATE TABLE IF NOT EXISTS amb_provider_facts (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      provider TEXT NOT NULL,
      factKey TEXT NOT NULL,
      factValue TEXT NOT NULL,
      entityId TEXT,
      category TEXT NOT NULL,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_provider_facts_benchrun ON amb_provider_facts(benchRunId);

    CREATE TABLE IF NOT EXISTS amb_merged_state (
      id TEXT PRIMARY KEY,
      benchRunId TEXT NOT NULL,
      factKey TEXT NOT NULL,
      resolvedValue TEXT NOT NULL,
      sourceProviders TEXT NOT NULL,
      conflictDetected INTEGER NOT NULL DEFAULT 0,
      createdAt TEXT NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_amb_merged_benchrun ON amb_merged_state(benchRunId);
  `;
        getDb().exec(schemaSql);
        // Only mark the schema ready after exec() returned without throwing.
        _benchSchemaReady = true;
    }
}
110
+ /* ================================================================== */
111
+ /* Helpers */
112
+ /* ================================================================== */
113
/**
 * Build a short random identifier: the first 16 hexadecimal characters of a
 * freshly generated UUID with its dashes removed.
 *
 * @returns {string} 16-character hex string.
 */
function uid() {
    const hex = crypto.randomUUID().split("-").join("");
    return hex.substring(0, 16);
}
116
/**
 * Current wall-clock time.
 *
 * @returns {number} Milliseconds since the Unix epoch.
 */
function now() {
    const epochMs = Date.now();
    return epochMs;
}
119
/**
 * Current time formatted as an ISO-8601 UTC timestamp.
 *
 * @returns {string} e.g. "2026-01-31T12:34:56.789Z".
 */
function isoNow() {
    const stamp = new Date();
    return stamp.toISOString();
}
122
/**
 * Word-overlap similarity between two strings: the Jaccard index of their
 * lowercased, whitespace-separated token sets.
 *
 * @param {string} a First text.
 * @param {string} b Second text.
 * @returns {number} 1 when both texts have no tokens, 0 when exactly one has
 *   none, otherwise |A ∩ B| / |A ∪ B|.
 */
function textSimilarity(a, b) {
    const tokenize = (text) => new Set(text.toLowerCase().split(/\s+/).filter(Boolean));
    const left = tokenize(a);
    const right = tokenize(b);
    if (left.size === 0 || right.size === 0) {
        // Two empty token sets count as identical; one empty set shares nothing.
        return left.size === right.size ? 1 : 0;
    }
    const shared = [...left].filter((token) => right.has(token)).length;
    const unionSize = left.size + right.size - shared;
    return shared / unionSize;
}
136
/**
 * Keyword heuristic that flags two statements as contradictory.
 *
 * Returns true when one statement matches the first pattern of a pair and
 * the other statement matches the second pattern of the same pair. Matching
 * is case-insensitive (both inputs are lowercased) and no subject comparison
 * is performed.
 *
 * The affirmative side of the negation pairs ("is", "will", "should") uses a
 * negative lookahead so that the keyword inside its own negated phrase
 * ("is not", ...) does not count as an affirmation. Without it, two negated
 * statements — even two identical ones — would be reported as contradicting
 * each other.
 *
 * @param {string} a First statement.
 * @param {string} b Second statement.
 * @returns {boolean} true if any opposing keyword pair is split across a and b.
 */
function areContradictory(a, b) {
    const la = a.toLowerCase();
    const lb = b.toLowerCase();
    // None of these regexes carry the g/y flag, so .test() is stateless.
    const opposingPairs = [
        // Affirmative vs. negated auxiliary.
        [/\bis\b(?! not\b)/, /\bis not\b/],
        [/\bwill\b(?! not\b)/, /\bwill not\b/],
        [/\bshould\b(?! not\b)/, /\bshould not\b/],
        // Antonym pairs.
        [/\bincreased\b/, /\bdecreased\b/],
        [/\bgrowing\b/, /\bshrinking\b/],
        [/\bprofitable\b/, /\bunprofitable\b/],
        [/\bstrong\b/, /\bweak\b/],
        [/\bhigh\b/, /\blow\b/],
        [/\bexpanding\b/, /\bcontracting\b/],
        [/\babove\b/, /\bbelow\b/],
    ];
    return opposingPairs.some(([pat1, pat2]) => (pat1.test(la) && pat2.test(lb)) || (pat2.test(la) && pat1.test(lb)));
}
160
+ /* ================================================================== */
161
+ /* Benchmark 1: Packet Reuse */
162
+ /* ================================================================== */
163
/**
 * Benchmark 1 — Packet Reuse.
 *
 * setup() stores five scripted sessions of chat/decision/signal items under a
 * unique run id. run() groups the stored rows by item type, collapses rows
 * whose word-overlap similarity exceeds 0.5 (the later row wins), writes one
 * packet per type and scores the packets on completeness, coherence,
 * reusability and freshness. cleanup() deletes every row created by the run.
 */
class PacketReuseBench {
    name = "PacketReuse";
    description = "Given N sessions of raw input, how well does the system produce a reusable artifact packet?";
    runId = `bench_pr_${uid()}`;
    sessions = [
        // Session 1: Company identity
        [
            { type: "chat", content: "Our company Acme AI builds trust infrastructure for agent systems" },
            { type: "decision", content: "Decided to focus on B2B SaaS for Series A" },
            { type: "signal", content: "Competitor TrustLayer raised $12M in seed funding" },
        ],
        // Session 2: Market analysis
        [
            { type: "signal", content: "Enterprise agent adoption grew 340% in Q1 2026" },
            { type: "chat", content: "Our wedge is the trust scoring API — no one else has deterministic trust" },
            { type: "decision", content: "Pricing set at $29/mo for pro tier" },
        ],
        // Session 3: Initiative updates
        [
            { type: "chat", content: "Initiative Alpha: Build trust scoring API — 60% complete" },
            { type: "signal", content: "Customer Globex Corp requested audit trail feature" },
            { type: "decision", content: "Prioritize audit trail over dashboard redesign" },
        ],
        // Session 4: Contradictions and updates
        [
            { type: "chat", content: "Acme AI is pivoting from B2B SaaS to developer tools platform" },
            { type: "signal", content: "TrustLayer acquired by Oracle — no longer a direct competitor" },
            { type: "decision", content: "Revised pricing to $49/mo for pro tier" },
        ],
        // Session 5: Strategy refinement
        [
            { type: "chat", content: "Our thesis is that every agent needs a trust score before acting" },
            { type: "signal", content: "Three new enterprise leads from ProductHunt launch" },
            { type: "decision", content: "Initiative Beta: Open-source the scoring SDK" },
        ],
    ];

    /** Insert every scripted session item, spaced one second apart, under this run id. */
    async setup() {
        ensureBenchSchema();
        const db = getDb();
        const insertItem = db.prepare(`INSERT INTO amb_ingestion_items (id, benchRunId, sessionIndex, itemType, content, sourceProvider, entityRefs, timestampMs, createdAt)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`);
        let timestampMs = now() - 100000;
        this.sessions.forEach((session, sessionIndex) => {
            session.forEach(({ type, content }) => {
                insertItem.run(`ii_${uid()}`, this.runId, sessionIndex, type, content, "primary", null, timestampMs, isoNow());
                timestampMs += 1000;
            });
        });
    }

    /**
     * Build one packet per item type and score the result.
     *
     * @returns {Promise<object>} benchmarkName, scores, passed, thresholds,
     *   details and runDurationMs.
     */
    async run() {
        const startedAt = now();
        const db = getDb();
        // Simulate ingestion -> canonicalization -> packet pipeline
        const rows = db.prepare(`SELECT id, sessionIndex, itemType, content FROM amb_ingestion_items WHERE benchRunId = ? ORDER BY timestampMs ASC`).all(this.runId);

        // Bucket rows by item type, preserving timestamp order inside each bucket.
        const rowsByType = {};
        for (const row of rows) {
            if (rowsByType[row.itemType] == null) {
                rowsByType[row.itemType] = [];
            }
            rowsByType[row.itemType].push(row);
        }

        const insertPacket = db.prepare(`INSERT INTO amb_packets (id, benchRunId, packetType, title, content, sourceItemIds, entityRefs, createdAt)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
        const packets = [];
        Object.keys(rowsByType).forEach((type) => {
            // De-duplicate: a row similar to an already kept row replaces it
            // in place, so the newest wording survives at the old position.
            const kept = [];
            rowsByType[type].forEach((row) => {
                const duplicateAt = kept.findIndex((earlier) => textSimilarity(earlier.content, row.content) > 0.5);
                if (duplicateAt === -1) {
                    kept.push(row);
                }
                else {
                    kept[duplicateAt] = row;
                }
            });
            const content = kept.map((row) => row.content).join("\n");
            const sourceIds = kept.map((row) => row.id);
            packets.push({ type, content, sourceIds });
            insertPacket.run(`pkt_${uid()}`, this.runId, type, `${type} packet`, content, JSON.stringify(sourceIds), null, isoNow());
        });

        // A text counts as represented when more than half of its words longer
        // than three characters occur as substrings of a single packet.
        const isRepresented = (text) => {
            const keywords = text.toLowerCase().split(/\s+/).filter((w) => w.length > 3);
            return packets.some((packet) => {
                const haystack = packet.content.toLowerCase();
                const hits = keywords.filter((w) => haystack.includes(w)).length;
                return hits / Math.max(keywords.length, 1) > 0.5;
            });
        };

        // Score 1: packetCompleteness — % of source facts captured
        const facts = rows.map((row) => row.content);
        const capturedCount = facts.filter((fact) => isRepresented(fact)).length;
        const packetCompleteness = facts.length > 0 ? capturedCount / facts.length : 0;

        // Score 2: packetCoherence — no contradictory statements within same packet
        let contradictoryPairs = 0;
        let comparedPairs = 0;
        for (const packet of packets) {
            const statements = packet.content.split("\n").filter(Boolean);
            statements.forEach((first, i) => {
                for (const second of statements.slice(i + 1)) {
                    comparedPairs += 1;
                    if (areContradictory(first, second)) {
                        contradictoryPairs += 1;
                    }
                }
            });
        }
        // Coherence: 1 if no contradictions, else deduct per contradiction
        const packetCoherence = comparedPairs > 0
            ? Math.max(0, 1 - contradictoryPairs / comparedPairs)
            : 1;

        // Score 3: packetReusability — packet has content, source refs and at least one fact line
        const reusableCount = packets.filter((packet) => {
            const hasContent = packet.content.length > 10;
            const hasSources = packet.sourceIds.length > 0;
            const hasFactLine = packet.content.split("\n").filter(Boolean).length >= 1;
            return hasContent && hasSources && hasFactLine;
        }).length;
        const packetReusability = packets.length > 0 ? reusableCount / packets.length : 0;

        // Score 4: packetFreshness — is the latest session's content represented
        const latestSession = this.sessions[this.sessions.length - 1];
        const freshCount = latestSession.filter((item) => isRepresented(item.content)).length;
        const packetFreshness = latestSession.length > 0 ? freshCount / latestSession.length : 0;

        const scores = { packetCompleteness, packetCoherence, packetReusability, packetFreshness };
        const thresholds = { packetCompleteness: 0.8, packetCoherence: 1.0, packetReusability: 0.8, packetFreshness: 0.9 };
        const passed = Object.keys(thresholds).every((metric) => (scores[metric] ?? 0) >= thresholds[metric]);
        return {
            benchmarkName: this.name,
            scores,
            passed,
            thresholds,
            details: `Processed ${rows.length} items across ${this.sessions.length} sessions into ${packets.length} packets. Completeness=${(packetCompleteness * 100).toFixed(1)}%, Coherence=${(packetCoherence * 100).toFixed(1)}%, Reusability=${(packetReusability * 100).toFixed(1)}%, Freshness=${(packetFreshness * 100).toFixed(1)}%`,
            runDurationMs: now() - startedAt,
        };
    }

    /** Delete the ingestion items and packets written under this run id. */
    async cleanup() {
        const db = getDb();
        const deletions = [
            "DELETE FROM amb_ingestion_items WHERE benchRunId = ?",
            "DELETE FROM amb_packets WHERE benchRunId = ?",
        ];
        for (const sql of deletions) {
            db.prepare(sql).run(this.runId);
        }
    }
}
321
+ /* ================================================================== */
322
+ /* Benchmark 2: Contradiction Detection */
323
+ /* ================================================================== */
324
/**
 * Benchmark 2 — Contradiction Detection.
 *
 * setup() stores 20 labelled statement pairs (10 contradictory, 10 merely
 * similar), using the pair index as sessionIndex. run() applies the
 * areContradictory() heuristic to each stored pair, records every flagged
 * pair in amb_contradictions and reports precision, recall, F1 and
 * false-positive rate against the labels. cleanup() removes the run's rows.
 */
class ContradictionDetectionBench {
    name = "ContradictionDetection";
    description = "Given conflicting statements across sessions, how accurately does the system flag them?";
    runId = `bench_cd_${uid()}`;
    // 10 genuinely contradictory pairs + 10 similar-but-not-contradictory pairs
    pairs = [
        // Genuinely contradictory (10)
        { a: "Revenue is growing at 40% quarter-over-quarter", b: "Revenue is shrinking due to churn", isContradiction: true },
        { a: "The company is profitable with strong margins", b: "The company is unprofitable and burning cash", isContradiction: true },
        { a: "Market share is above 30% in the segment", b: "Market share is below 5% in the segment", isContradiction: true },
        { a: "The team will expand to 50 engineers by Q3", b: "The team will not expand beyond current headcount", isContradiction: true },
        { a: "Customer retention rate is high at 95%", b: "Customer retention rate is low at 40%", isContradiction: true },
        { a: "The product should target enterprise customers", b: "The product should not target enterprise", isContradiction: true },
        { a: "Infrastructure costs are decreasing year over year", b: "Infrastructure costs are increasing rapidly", isContradiction: true },
        { a: "User engagement is strong across all segments", b: "User engagement is weak in key demographics", isContradiction: true },
        { a: "The platform is expanding into European markets", b: "The platform is contracting and exiting Europe", isContradiction: true },
        { a: "Series B funding is above target at $25M", b: "Series B funding is below target at $8M", isContradiction: true },
        // Similar but NOT contradictory (10)
        { a: "Revenue grew 40% in Q1", b: "Revenue grew 35% in Q2", isContradiction: false },
        { a: "Hired 3 engineers in January", b: "Hired 5 engineers in March", isContradiction: false },
        { a: "Customer NPS score is 72", b: "Customer satisfaction rate is 88%", isContradiction: false },
        { a: "Launched in US market first", b: "Planning UK expansion for Q4", isContradiction: false },
        { a: "Using AWS for cloud infrastructure", b: "Evaluating GCP for ML workloads", isContradiction: false },
        { a: "Series A raised $12M from Sequoia", b: "Total funding to date is $15M including seed", isContradiction: false },
        { a: "Focus on B2B SaaS for initial launch", b: "Exploring B2B2C channel for growth", isContradiction: false },
        { a: "Mobile app has 50K downloads", b: "Web app has 200K monthly active users", isContradiction: false },
        { a: "Churn rate is 3% monthly", b: "Annual retention is 65%", isContradiction: false },
        { a: "Product roadmap includes API v2", b: "Engineering team prioritizing SDK improvements", isContradiction: false },
    ];

    /** Insert statement a then statement b of every pair, 500 ms apart, keyed by pair index. */
    async setup() {
        ensureBenchSchema();
        const db = getDb();
        const insertStatement = db.prepare(`INSERT INTO amb_ingestion_items (id, benchRunId, sessionIndex, itemType, content, sourceProvider, entityRefs, timestampMs, createdAt)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`);
        let timestampMs = now() - 50000;
        this.pairs.forEach((pair, pairIndex) => {
            for (const side of ["a", "b"]) {
                insertStatement.run(`cd_${side}_${pairIndex}_${uid()}`, this.runId, pairIndex, "statement", pair[side], "primary", null, timestampMs, isoNow());
                timestampMs += 500;
            }
        });
    }

    /**
     * Flag contradictions pair by pair and compare the flags with the labels.
     *
     * @returns {Promise<object>} benchmarkName, scores, passed, thresholds,
     *   details and runDurationMs.
     */
    async run() {
        const startedAt = now();
        const db = getDb();
        const rows = db.prepare(`SELECT id, sessionIndex, content FROM amb_ingestion_items WHERE benchRunId = ? ORDER BY timestampMs ASC`).all(this.runId);

        // Group by pair (sessionIndex); rows arrive in timestamp order, so
        // statement a precedes statement b inside each group.
        const rowsByPair = {};
        for (const row of rows) {
            if (rowsByPair[row.sessionIndex] == null) {
                rowsByPair[row.sessionIndex] = [];
            }
            rowsByPair[row.sessionIndex].push(row);
        }

        // Detect contradictions using heuristic
        const insertContradiction = db.prepare(`INSERT INTO amb_contradictions (id, benchRunId, itemIdA, itemIdB, explanation, confidence, createdAt)
       VALUES (?, ?, ?, ?, ?, ?, ?)`);
        const predictions = [];
        for (let pairIndex = 0; pairIndex < this.pairs.length; pairIndex += 1) {
            const stored = rowsByPair[pairIndex];
            // A pair with fewer than two stored statements is treated as "not flagged".
            const flagged = Boolean(stored) && stored.length >= 2
                ? areContradictory(stored[0].content, stored[1].content)
                : false;
            predictions.push(flagged);
            if (flagged) {
                insertContradiction.run(`contr_${uid()}`, this.runId, stored[0].id, stored[1].id, `Contradiction detected between statements in pair ${pairIndex}`, 0.85, isoNow());
            }
        }

        // Confusion matrix against the labelled pairs
        const tally = { tp: 0, fp: 0, fn: 0, tn: 0 };
        this.pairs.forEach((pair, pairIndex) => {
            const predicted = predictions[pairIndex] ?? false;
            let bucket;
            if (pair.isContradiction) {
                bucket = predicted ? "tp" : "fn";
            }
            else {
                bucket = predicted ? "fp" : "tn";
            }
            tally[bucket] += 1;
        });
        const { tp, fp, fn, tn } = tally;

        // Compute precision, recall, f1, falsePositiveRate (0 when a denominator is 0)
        const ratio = (numerator, denominator) => (denominator > 0 ? numerator / denominator : 0);
        const precision = ratio(tp, tp + fp);
        const recall = ratio(tp, tp + fn);
        const f1 = ratio(2 * precision * recall, precision + recall);
        const realContradictions = this.pairs.filter((p) => p.isContradiction).length;
        const totalNonContradictions = this.pairs.filter((p) => !p.isContradiction).length;
        const falsePositiveRate = ratio(fp, totalNonContradictions);

        const scores = { precision, recall, f1, falsePositiveRate };
        const thresholds = { precision: 0.8, recall: 0.7, f1: 0.75, falsePositiveRate: 0.2 };
        // falsePositiveRate is a ceiling; the other three thresholds are floors.
        const meetsFloors = ["precision", "recall", "f1"].every((metric) => scores[metric] >= thresholds[metric]);
        const passed = meetsFloors && falsePositiveRate <= thresholds.falsePositiveRate;
        return {
            benchmarkName: this.name,
            scores,
            passed,
            thresholds,
            details: `Evaluated ${this.pairs.length} pairs (${realContradictions} real contradictions). TP=${tp} FP=${fp} FN=${fn} TN=${tn}. Precision=${(precision * 100).toFixed(1)}%, Recall=${(recall * 100).toFixed(1)}%, F1=${(f1 * 100).toFixed(1)}%, FPR=${(falsePositiveRate * 100).toFixed(1)}%`,
            runDurationMs: now() - startedAt,
        };
    }

    /** Delete the statements and contradiction records written under this run id. */
    async cleanup() {
        const db = getDb();
        const deletions = [
            "DELETE FROM amb_ingestion_items WHERE benchRunId = ?",
            "DELETE FROM amb_contradictions WHERE benchRunId = ?",
        ];
        for (const sql of deletions) {
            db.prepare(sql).run(this.runId);
        }
    }
}
434
+ /* ================================================================== */
435
+ /* Benchmark 3: Company Profiling */
436
+ /* ================================================================== */
437
/**
 * Benchmark 3 — Company Profiling.
 *
 * setup() stores a 30-item corpus (company facts, initiatives, competitors,
 * market data and noise), using each item's category as its itemType. run()
 * derives a company profile from the stored rows (thesis, wedge, initiative
 * names, competitor names), persists it to amb_entity_profiles and scores it
 * against `groundTruth`. cleanup() removes the rows created by the run.
 */
class CompanyProfilingBench {
    name = "CompanyProfiling";
    description = "Given a corpus of mixed business content, how accurately does the system extract company thesis, wedge, initiatives, and competitors?";
    runId = `bench_cp_${uid()}`;
    groundTruth = {
        thesis: "Every AI agent needs a trust verification layer before it can act autonomously in production",
        wedge: "Deterministic trust scoring API that gives agents a pass/fail gate before executing actions",
        initiatives: [
            "Trust Scoring API v2",
            "Audit Trail Dashboard",
            "Open Source SDK",
            "Enterprise SSO Integration",
            "Multi-tenant Agent Isolation",
        ],
        competitors: [
            "TrustLayer",
            "AgentShield",
            "VerifyAI",
            "GuardRails.io",
        ],
    };
    corpus = [
        // Company info (8 items)
        { content: "Acme AI believes every AI agent needs a trust verification layer before it can act autonomously in production", category: "thesis" },
        { content: "Our core product is the deterministic trust scoring API that gives agents a pass fail gate before executing actions", category: "wedge" },
        { content: "Founded in 2025 by two ex-Stripe engineers with deep API design experience", category: "company" },
        { content: "Headquarters in San Francisco with a remote-first engineering team of 12", category: "company" },
        { content: "Currently serving 45 enterprise customers across fintech and healthtech", category: "company" },
        { content: "Annual recurring revenue of $2.3M with 140% net retention", category: "company" },
        { content: "Trust scoring works by evaluating agent intent, capability scope, and action risk in under 50ms", category: "wedge" },
        { content: "Our thesis is that agent autonomy without trust verification is a liability, not a feature", category: "thesis" },
        // Initiative updates (5 items)
        { content: "Initiative: Trust Scoring API v2 is 75% complete with new batch scoring endpoint", category: "initiative" },
        { content: "Initiative: Audit Trail Dashboard launching in Q2 with full action replay", category: "initiative" },
        { content: "Initiative: Open Source SDK — releasing the TypeScript client under MIT license", category: "initiative" },
        { content: "Initiative: Enterprise SSO Integration with Okta and Auth0 support", category: "initiative" },
        { content: "Initiative: Multi-tenant Agent Isolation for shared infrastructure deployments", category: "initiative" },
        // Competitor mentions (5 items)
        { content: "TrustLayer recently raised $12M and is targeting the same enterprise segment", category: "competitor" },
        { content: "AgentShield launched a competing product but focuses on monitoring not gating", category: "competitor" },
        { content: "VerifyAI announced partnership with AWS for agent verification marketplace", category: "competitor" },
        { content: "GuardRails.io pivoted from code security to agent guardrails last quarter", category: "competitor" },
        { content: "None of the competitors offer deterministic scoring — they all use probabilistic models", category: "competitor" },
        // Market data (5 items)
        { content: "The agent trust market is projected to reach $4.2B by 2028", category: "market" },
        { content: "Enterprise AI governance budgets increased 280% year over year", category: "market" },
        { content: "Regulatory frameworks requiring agent audit trails passed in EU and California", category: "market" },
        { content: "85% of enterprise AI teams plan to implement agent guardrails by 2027", category: "market" },
        { content: "Gartner added Agent Trust to the 2026 Hype Cycle for AI Governance", category: "market" },
        // Noise / unrelated (7 items)
        { content: "The office coffee machine was replaced with a new espresso model", category: "noise" },
        { content: "Team building event scheduled for next Friday at the bowling alley", category: "noise" },
        { content: "Updated the employee handbook with new PTO policy", category: "noise" },
        { content: "MacBook Pro M4 laptops ordered for the engineering team", category: "noise" },
        { content: "Switched from Slack to Discord for internal communication", category: "noise" },
        { content: "Annual company photos scheduled for Tuesday morning", category: "noise" },
        { content: "New snack options added to the kitchen based on team vote", category: "noise" },
    ];

    /**
     * Extract the initiative name from an "Initiative: <name>" line.
     *
     * The name ends at the first dash (hyphen, en dash or em dash) that has
     * whitespace on both sides, or at the end of the text. A hyphen inside a
     * word ("Multi-tenant") is part of the name; treating every hyphen as a
     * separator would truncate such names to their first fragment.
     *
     * @param {string} content Raw corpus line.
     * @returns {string|null} Trimmed name, or null when the line has no
     *   "Initiative:" marker or the name is empty.
     */
    static parseInitiativeName(content) {
        const match = content.match(/Initiative:\s*([\s\S]+?)(?:\s+[—–-]\s+|$)/i);
        if (!match) {
            return null;
        }
        const initiativeName = match[1].trim();
        return initiativeName.length > 0 ? initiativeName : null;
    }

    /** Insert every corpus item, 500 ms apart, with its category stored as itemType. */
    async setup() {
        ensureBenchSchema();
        const db = getDb();
        const insert = db.prepare(`INSERT INTO amb_ingestion_items (id, benchRunId, sessionIndex, itemType, content, sourceProvider, entityRefs, timestampMs, createdAt)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`);
        let ts = now() - 80000;
        for (let i = 0; i < this.corpus.length; i++) {
            insert.run(`cp_${i}_${uid()}`, this.runId, 0, this.corpus[i].category, this.corpus[i].content, "primary", null, ts, isoNow());
            ts += 500;
        }
    }

    /**
     * Extract a company profile from the stored corpus and score it.
     *
     * @returns {Promise<object>} benchmarkName, scores, passed, thresholds,
     *   details and runDurationMs.
     */
    async run() {
        const start = now();
        const db = getDb();
        const items = db.prepare(`SELECT id, itemType, content FROM amb_ingestion_items WHERE benchRunId = ? ORDER BY timestampMs ASC`).all(this.runId);
        // Simulate canonicalization: classify each item and extract structured data
        const thesisItems = items.filter((i) => i.itemType === "thesis");
        const wedgeItems = items.filter((i) => i.itemType === "wedge");
        const initiativeItems = items.filter((i) => i.itemType === "initiative");
        const competitorItems = items.filter((i) => i.itemType === "competitor");
        const noiseItems = items.filter((i) => i.itemType === "noise");
        const nonNoiseItems = items.filter((i) => i.itemType !== "noise");
        // Thesis and wedge: keep the longest statement of each kind (first one wins ties).
        const longestContent = (rows) => rows.length > 0
            ? rows.reduce((best, row) => (row.content.length > best.content.length ? row : best)).content
            : "";
        const extractedThesis = longestContent(thesisItems);
        const extractedWedge = longestContent(wedgeItems);
        // Extract initiatives (parse "Initiative: X" pattern)
        const extractedInitiatives = [];
        for (const item of initiativeItems) {
            const initiativeName = CompanyProfilingBench.parseInitiativeName(item.content);
            if (initiativeName !== null) {
                extractedInitiatives.push(initiativeName);
            }
        }
        // Extract competitors: the run of capitalised words at the start of the line.
        const extractedCompetitors = [];
        for (const item of competitorItems) {
            const match = item.content.match(/^([A-Z][A-Za-z.]+(?:\s[A-Z][A-Za-z.]*)*)/);
            // "None of the competitors ..." starts with a capital but names nobody.
            if (match && match[1] !== "None") {
                extractedCompetitors.push(match[1].trim());
            }
        }
        // Store extracted profile
        db.prepare(`INSERT INTO amb_entity_profiles (id, benchRunId, entityType, entityId, thesis, wedge, initiatives, competitors, rawSourceIds, createdAt)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(`prof_${uid()}`, this.runId, "company", "acme-ai", extractedThesis, extractedWedge, JSON.stringify(extractedInitiatives), JSON.stringify(extractedCompetitors), JSON.stringify(nonNoiseItems.map((i) => i.id)), isoNow());
        // Score 1: thesisAccuracy — word-overlap similarity to ground truth
        const thesisAccuracy = textSimilarity(extractedThesis, this.groundTruth.thesis);
        // Score 2: wedgeAccuracy
        const wedgeAccuracy = textSimilarity(extractedWedge, this.groundTruth.wedge);
        // Score 3: initiativeRecall — a ground-truth initiative is found when at
        // least half of its words appear in one extracted initiative.
        let initFound = 0;
        for (const gt of this.groundTruth.initiatives) {
            const gtWords = gt.toLowerCase().split(/\s+/);
            const found = extractedInitiatives.some((ei) => {
                const eiLower = ei.toLowerCase();
                return gtWords.filter((w) => eiLower.includes(w)).length >= Math.ceil(gtWords.length * 0.5);
            });
            if (found)
                initFound++;
        }
        const initiativeRecall = this.groundTruth.initiatives.length > 0
            ? initFound / this.groundTruth.initiatives.length
            : 0;
        // Score 4: competitorRecall — case-insensitive containment in either direction
        let compFound = 0;
        for (const gt of this.groundTruth.competitors) {
            const found = extractedCompetitors.some((ec) => ec.toLowerCase().includes(gt.toLowerCase()) || gt.toLowerCase().includes(ec.toLowerCase()));
            if (found)
                compFound++;
        }
        const competitorRecall = this.groundTruth.competitors.length > 0
            ? compFound / this.groundTruth.competitors.length
            : 0;
        // Score 5: noiseRejection — a noise item is rejected when fewer than 30%
        // of its words longer than four characters occur in the extracted profile.
        const profileContent = [extractedThesis, extractedWedge, ...extractedInitiatives, ...extractedCompetitors]
            .join(" ")
            .toLowerCase();
        let noiseRejected = 0;
        for (const ni of noiseItems) {
            const noiseWords = ni.content.toLowerCase().split(/\s+/).filter((w) => w.length > 4);
            const leaked = noiseWords.filter((w) => profileContent.includes(w)).length;
            if (leaked / Math.max(noiseWords.length, 1) < 0.3) {
                noiseRejected++;
            }
        }
        const noiseRejection = noiseItems.length > 0 ? noiseRejected / noiseItems.length : 1;
        const scores = { thesisAccuracy, wedgeAccuracy, initiativeRecall, competitorRecall, noiseRejection };
        const thresholds = { thesisAccuracy: 0.5, wedgeAccuracy: 0.5, initiativeRecall: 0.8, competitorRecall: 0.75, noiseRejection: 0.9 };
        const passed = Object.entries(thresholds).every(([k, t]) => (scores[k] ?? 0) >= t);
        return {
            benchmarkName: this.name,
            scores,
            passed,
            thresholds,
            details: `Processed ${items.length} corpus items. Extracted thesis (sim=${(thesisAccuracy * 100).toFixed(1)}%), wedge (sim=${(wedgeAccuracy * 100).toFixed(1)}%), ${extractedInitiatives.length}/${this.groundTruth.initiatives.length} initiatives, ${extractedCompetitors.length}/${this.groundTruth.competitors.length} competitors, noise rejection=${(noiseRejection * 100).toFixed(1)}%`,
            runDurationMs: now() - start,
        };
    }

    /** Delete the corpus items and the stored profile written under this run id. */
    async cleanup() {
        const db = getDb();
        db.prepare("DELETE FROM amb_ingestion_items WHERE benchRunId = ?").run(this.runId);
        db.prepare("DELETE FROM amb_entity_profiles WHERE benchRunId = ?").run(this.runId);
    }
}
604
+ /* ================================================================== */
605
+ /* Benchmark 4: Action Provenance */
606
+ /* ================================================================== */
607
/**
 * Benchmark 4: Action Provenance.
 *
 * Stores a linear 15-step causal chain in `amb_provenance_chain`, then walks
 * the `causedByStepIndex` links backwards from the last stored step and scores
 * how much of the original chain was recovered and in what order.
 */
class ActionProvenanceBench {
    name = "ActionProvenance";
    description = "Given a chain of actions and decisions, can the system explain why any given state exists?";
    runId = `bench_ap_${uid()}`;
    // 15-step causal chain: each step is caused by the previous
    chain = [
        { action: "Customer reported slow API response times", entity: "support-ticket-001" },
        { action: "Engineering triaged ticket and identified database bottleneck", entity: "investigation-001" },
        { action: "Profiling revealed N+1 query in trust score computation", entity: "investigation-001" },
        { action: "Created task to optimize trust score query with batch loading", entity: "task-optimize-001" },
        { action: "Developer implemented batch query with 3x throughput improvement", entity: "task-optimize-001" },
        { action: "Code review approved with minor naming suggestions", entity: "pr-142" },
        { action: "Deployed optimization to staging environment", entity: "deploy-staging-001" },
        { action: "Load test confirmed 3.2x improvement in p99 latency", entity: "loadtest-001" },
        { action: "Deployed to production with feature flag", entity: "deploy-prod-001" },
        { action: "Feature flag enabled for 10% of traffic", entity: "flag-rollout-001" },
        { action: "Monitoring confirmed no error rate increase at 10%", entity: "monitoring-001" },
        { action: "Feature flag rolled to 50% of traffic", entity: "flag-rollout-001" },
        { action: "Customer confirmed response times improved", entity: "support-ticket-001" },
        { action: "Feature flag rolled to 100% of traffic", entity: "flag-rollout-001" },
        { action: "Support ticket closed as resolved", entity: "support-ticket-001" },
    ];

    /**
     * Insert one row per chain step. Step `i` records `i - 1` as its cause;
     * step 0 has a `null` cause. Only the final step's snapshot has status
     * "resolved"; every other step is "in_progress".
     */
    async setup() {
        ensureBenchSchema();
        const db = getDb();
        const insert = db.prepare(`INSERT INTO amb_provenance_chain (id, benchRunId, stepIndex, actionDescription, causedByStepIndex, entityId, stateSnapshot, createdAt)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
        const lastIndex = this.chain.length - 1;
        for (let i = 0; i < this.chain.length; i++) {
            const { action, entity } = this.chain[i];
            const state = {
                step: i,
                action,
                entity,
                status: i === lastIndex ? "resolved" : "in_progress",
            };
            insert.run(`prov_${i}_${uid()}`, this.runId, i, action, i > 0 ? i - 1 : null, entity, JSON.stringify(state), isoNow());
        }
    }

    /**
     * Trace the stored chain backwards from its last step and score the trace.
     *
     * @returns {Promise<object>} Result with `benchmarkName`, `scores`
     *   (chainCompleteness, chainAccuracy, rootCauseIdentification,
     *   pathReconstruction), `passed`, `thresholds`, `details` and
     *   `runDurationMs`.
     */
    async run() {
        const start = now();
        const db = getDb();
        // Query the full chain from the final state backwards
        const allSteps = db.prepare(`SELECT stepIndex, actionDescription, causedByStepIndex, entityId, stateSnapshot
            FROM amb_provenance_chain WHERE benchRunId = ? ORDER BY stepIndex ASC`).all(this.runId);
        // Index rows by stepIndex once so each hop of the trace is a constant-time
        // lookup instead of a linear scan. The first row for an index wins, which
        // is what a left-to-right search over `allSteps` would return.
        const stepByIndex = new Map();
        for (const step of allSteps) {
            if (!stepByIndex.has(step.stepIndex)) {
                stepByIndex.set(step.stepIndex, step);
            }
        }
        // Trace backwards from the final step; `visited` guards against cycles.
        const recovered = [];
        const visited = new Set();
        let cursor = allSteps.length > 0 ? allSteps[allSteps.length - 1].stepIndex : null;
        while (cursor !== null && !visited.has(cursor)) {
            visited.add(cursor);
            recovered.push(cursor);
            cursor = stepByIndex.get(cursor)?.causedByStepIndex ?? null;
        }
        // Steps were collected newest-first; flip to root-first order.
        recovered.reverse();
        // Score 1: chainCompleteness — % of original chain recovered
        const chainCompleteness = this.chain.length > 0 ? recovered.length / this.chain.length : 0;
        // Score 2: chainAccuracy — recovered entries sitting at their original position
        const correctLinks = recovered.filter((stepIndex, position) => stepIndex === position).length;
        const chainAccuracy = recovered.length > 0 ? correctLinks / recovered.length : 0;
        // Score 3: rootCauseIdentification — can it name step 0 as the trigger
        const rootCauseIdentification = recovered.length > 0 && recovered[0] === 0 ? 1 : 0;
        // Score 4: pathReconstruction — full path matches original order exactly
        const pathCorrect = recovered.length === this.chain.length
            && recovered.every((stepIndex, position) => stepIndex === position);
        const pathReconstruction = pathCorrect ? 1 : 0;
        const scores = {
            chainCompleteness,
            chainAccuracy,
            rootCauseIdentification,
            pathReconstruction,
        };
        const thresholds = {
            chainCompleteness: 0.9,
            chainAccuracy: 0.9,
            rootCauseIdentification: 1.0,
            pathReconstruction: 1.0,
        };
        const passed = Object.entries(thresholds).every(([k, t]) => (scores[k] ?? 0) >= t);
        return {
            benchmarkName: this.name,
            scores,
            passed,
            thresholds,
            details: `Traced ${recovered.length}/${this.chain.length} steps. Root cause ${rootCauseIdentification ? "identified" : "missed"}. Full path ${pathReconstruction ? "reconstructed" : "incomplete"}. Accuracy=${(chainAccuracy * 100).toFixed(1)}%`,
            runDurationMs: now() - start,
        };
    }

    /** Delete the rows this run inserted into `amb_provenance_chain`. */
    async cleanup() {
        const db = getDb();
        db.prepare("DELETE FROM amb_provenance_chain WHERE benchRunId = ?").run(this.runId);
    }
}
709
+ /* ================================================================== */
710
+ /* Benchmark 5: Multi-Provider Continuity */
711
+ /* ================================================================== */
712
/**
 * Benchmark 5: Multi-Provider Continuity.
 *
 * Loads facts about one entity from three providers, merges them per fact key
 * into `amb_merged_state`, and scores how well duplicates were collapsed,
 * conflicts were flagged, and the merged state stayed coherent and complete.
 */
class MultiProviderContinuityBench {
    name = "MultiProviderContinuity";
    description = "Given context split across 3+ providers, how well does the system maintain coherent truth?";
    runId = `bench_mpc_${uid()}`;
    // Provider A: company identity + thesis
    providerA = [
        { key: "company_name", value: "Acme AI", category: "identity" },
        { key: "company_thesis", value: "Trust verification layer for autonomous agents", category: "identity" },
        { key: "founding_year", value: "2025", category: "identity" },
        { key: "hq_location", value: "San Francisco", category: "identity" },
        { key: "team_size", value: "12", category: "identity" }, // repeated by Provider B, contradicted by Provider C
        { key: "target_market", value: "Enterprise fintech and healthtech", category: "identity" },
        { key: "arr", value: "$2.3M", category: "identity" }, // repeated by Providers B and C
        { key: "funding_stage", value: "Series A", category: "identity" }, // CONFLICTS with Provider B
    ];
    // Provider B: competitor signals + market data
    providerB = [
        { key: "competitor_1", value: "TrustLayer", category: "competitor" },
        { key: "competitor_2", value: "AgentShield", category: "competitor" },
        { key: "market_size_2028", value: "$4.2B", category: "market" },
        { key: "market_growth_yoy", value: "280%", category: "market" },
        { key: "enterprise_adoption", value: "85% plan guardrails by 2027", category: "market" },
        { key: "arr", value: "$2.3M", category: "identity" }, // OVERLAPS with Provider A (same)
        { key: "funding_stage", value: "Series B", category: "identity" }, // CONFLICTS with Provider A
        { key: "team_size", value: "12", category: "identity" }, // OVERLAPS with Provider A (same)
    ];
    // Provider C: agent activity + initiative updates
    providerC = [
        { key: "initiative_1", value: "Trust Scoring API v2 — 75% complete", category: "initiative" },
        { key: "initiative_2", value: "Audit Trail Dashboard — Q2 launch", category: "initiative" },
        { key: "initiative_3", value: "Open Source SDK — MIT license", category: "initiative" },
        { key: "agent_task_count", value: "142 tasks completed this week", category: "activity" },
        { key: "agent_error_rate", value: "0.3%", category: "activity" },
        { key: "arr", value: "$2.3M", category: "identity" }, // OVERLAPS with Provider A (same)
        { key: "team_size", value: "15", category: "identity" }, // CONFLICTS with Providers A and B (different number)
    ];
    // Ground truth: overlaps and conflicts
    expectedOverlaps = 3; // redundant facts: arr from B, arr from C, team_size from B (each repeats A)
    expectedConflicts = 2; // keys with disagreeing values: funding_stage (A vs B), team_size (A/B vs C)

    /** Insert every provider's facts into `amb_provider_facts` for entity "acme-ai". */
    async setup() {
        ensureBenchSchema();
        const db = getDb();
        const insert = db.prepare(`INSERT INTO amb_provider_facts (id, benchRunId, provider, factKey, factValue, entityId, category, createdAt)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)`);
        for (const fact of this.providerA) {
            insert.run(`pf_a_${uid()}`, this.runId, "providerA", fact.key, fact.value, "acme-ai", fact.category, isoNow());
        }
        for (const fact of this.providerB) {
            insert.run(`pf_b_${uid()}`, this.runId, "providerB", fact.key, fact.value, "acme-ai", fact.category, isoNow());
        }
        for (const fact of this.providerC) {
            insert.run(`pf_c_${uid()}`, this.runId, "providerC", fact.key, fact.value, "acme-ai", fact.category, isoNow());
        }
    }

    /**
     * Merge the stored facts per key and score the merge.
     *
     * An overlap is counted once for every additional provider that repeats a
     * (key, value) pair some other provider already supplied, so three providers
     * agreeing on one value count as two overlaps. This is the unit used by
     * `expectedOverlaps`, and it is counted even when the same key also carries
     * a conflicting value. A conflict is counted once per key that has more
     * than one distinct value across more than one provider; it resolves to the
     * value backed by the most providers, with ties going to the value seen
     * first.
     *
     * @returns {Promise<object>} Result with `benchmarkName`, `scores`
     *   (deduplication, conflictDetection, coherenceScore, coverageScore),
     *   `passed`, `thresholds`, `details` and `runDurationMs`.
     */
    async run() {
        const start = now();
        const db = getDb();
        const facts = db.prepare(`SELECT id, provider, factKey, factValue, category FROM amb_provider_facts WHERE benchRunId = ?`).all(this.runId);
        const totalInputFacts = facts.length;
        // Group facts by key, then by value -> providers asserting that value.
        // Maps keep first-seen order and cannot collide with Object.prototype keys.
        const byKey = new Map();
        for (const f of facts) {
            let group = byKey.get(f.factKey);
            if (!group) {
                group = { providers: new Set(), byValue: new Map() };
                byKey.set(f.factKey, group);
            }
            group.providers.add(f.provider);
            let supporters = group.byValue.get(f.factValue);
            if (!supporters) {
                supporters = new Set();
                group.byValue.set(f.factValue, supporters);
            }
            supporters.add(f.provider);
        }
        const mergeInsert = db.prepare(`INSERT INTO amb_merged_state (id, benchRunId, factKey, resolvedValue, sourceProviders, conflictDetected, createdAt)
            VALUES (?, ?, ?, ?, ?, ?, ?)`);
        let detectedOverlaps = 0;
        let detectedConflicts = 0;
        let mergedFactCount = 0;
        let internalContradictions = 0;
        for (const [key, group] of byKey) {
            const providers = [...group.providers];
            let resolvedValue;
            let bestSupport = 0;
            for (const [value, supporters] of group.byValue) {
                // Every provider beyond the first that repeats this value is a duplicate.
                detectedOverlaps += supporters.size - 1;
                // Strict ">" keeps the first-seen value when support is tied.
                if (supporters.size > bestSupport) {
                    bestSupport = supporters.size;
                    resolvedValue = value;
                }
            }
            const conflictDetected = providers.length > 1 && group.byValue.size > 1;
            if (conflictDetected) {
                detectedConflicts++;
            }
            mergeInsert.run(`ms_${uid()}`, this.runId, key, resolvedValue, JSON.stringify(providers), conflictDetected ? 1 : 0, isoNow());
            mergedFactCount++;
        }
        // Check merged state for internal contradictions
        const mergedFacts = db.prepare(`SELECT factKey, resolvedValue FROM amb_merged_state WHERE benchRunId = ?`).all(this.runId);
        // Simple pairwise contradiction check on merged state
        for (let i = 0; i < mergedFacts.length; i++) {
            for (let j = i + 1; j < mergedFacts.length; j++) {
                if (areContradictory(mergedFacts[i].resolvedValue, mergedFacts[j].resolvedValue)) {
                    internalContradictions++;
                }
            }
        }
        // Score 1: deduplication — overlapping facts merged correctly
        const deduplication = this.expectedOverlaps > 0
            ? Math.min(1, detectedOverlaps / this.expectedOverlaps)
            : 1;
        // Score 2: conflictDetection
        const conflictDetection = this.expectedConflicts > 0
            ? Math.min(1, detectedConflicts / this.expectedConflicts)
            : 1;
        // Score 3: coherenceScore — no internal contradictions in merged state
        const coherenceScore = mergedFactCount > 0
            ? Math.max(0, 1 - internalContradictions / mergedFactCount)
            : 1;
        // Score 4: coverageScore — all unique fact keys represented in merged state
        const uniqueKeys = byKey.size;
        const coverageScore = uniqueKeys > 0 ? mergedFactCount / uniqueKeys : 0;
        const scores = { deduplication, conflictDetection, coherenceScore, coverageScore };
        const thresholds = { deduplication: 0.8, conflictDetection: 0.8, coherenceScore: 0.9, coverageScore: 0.95 };
        const passed = Object.entries(thresholds).every(([k, t]) => (scores[k] ?? 0) >= t);
        return {
            benchmarkName: this.name,
            scores,
            passed,
            thresholds,
            details: `Merged ${totalInputFacts} facts from 3 providers into ${mergedFactCount} unique keys. Overlaps detected: ${detectedOverlaps}/${this.expectedOverlaps}. Conflicts detected: ${detectedConflicts}/${this.expectedConflicts}. Internal contradictions: ${internalContradictions}. Coverage: ${(coverageScore * 100).toFixed(1)}%`,
            runDurationMs: now() - start,
        };
    }

    /** Delete this run's rows from `amb_provider_facts` and `amb_merged_state`. */
    async cleanup() {
        const db = getDb();
        db.prepare("DELETE FROM amb_provider_facts WHERE benchRunId = ?").run(this.runId);
        db.prepare("DELETE FROM amb_merged_state WHERE benchRunId = ?").run(this.runId);
    }
}
852
+ /* ================================================================== */
853
+ /* Suite runner */
854
+ /* ================================================================== */
855
/**
 * Run every ambient benchmark in sequence: setup, run, then cleanup.
 *
 * A benchmark that throws during setup or run is recorded as a failed result
 * whose `details` carries the error message and whose `runDurationMs` is the
 * wall-clock time spent before the failure. Cleanup always runs and never
 * throws out of the suite; if it fails, a note is appended to that benchmark's
 * `details` so leftover rows do not go unnoticed.
 *
 * @returns {Promise<{results: object[], overallPassRate: number, totalDurationMs: number, passedCount: number, failedCount: number}>}
 */
export async function runAmbientBenchSuite() {
    const benchmarks = [
        new PacketReuseBench(),
        new ContradictionDetectionBench(),
        new CompanyProfilingBench(),
        new ActionProvenanceBench(),
        new MultiProviderContinuityBench(),
    ];
    const describeError = (err) => (err instanceof Error ? err.message : String(err));
    const results = [];
    const suiteStart = Date.now();
    for (const bench of benchmarks) {
        const benchStart = Date.now();
        try {
            await bench.setup();
            const result = await bench.run();
            results.push(result);
        }
        catch (err) {
            results.push({
                benchmarkName: bench.name,
                scores: {},
                passed: false,
                thresholds: {},
                details: `ERROR: ${describeError(err)}`,
                // Report the time actually spent so it agrees with totalDurationMs.
                runDurationMs: Date.now() - benchStart,
            });
        }
        finally {
            try {
                await bench.cleanup();
            }
            catch (cleanupErr) {
                // Best-effort cleanup: never fail the suite, but surface the problem.
                // Exactly one result was pushed for this benchmark above, so it is last.
                const last = results[results.length - 1];
                results[results.length - 1] = {
                    ...last,
                    details: `${last.details} [cleanup failed: ${describeError(cleanupErr)}]`,
                };
            }
        }
    }
    const passedCount = results.filter((r) => r.passed).length;
    const failedCount = results.length - passedCount;
    return {
        results,
        overallPassRate: results.length > 0 ? passedCount / results.length : 0,
        totalDurationMs: Date.now() - suiteStart,
        passedCount,
        failedCount,
    };
}
900
+ //# sourceMappingURL=ambientBench.js.map