@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,415 @@
1
+ /**
2
+ * Temporal Aggregator
3
+ *
4
+ * Aggregates opinions over time with coherence checking.
5
+ *
6
+ * Research context:
7
+ * - "Towards Dynamic Theory of Mind: Evaluating LLM Adaptation to Temporal Evolution of Human States"
8
+ * (arXiv:2505.17663) - DynToM benchmark, temporal progression of mental states
9
+ * * We use temporal aggregation concepts (loosely related)
10
+ * * We do NOT implement the DynToM benchmark or specific methods
11
+ * - "The Other Mind: How Language Models Exhibit Human Temporal Cognition" (arXiv:2507.15851)
12
+ * * Paper discusses Weber-Fechner law and logarithmic compression
13
+ * * We use EXPONENTIAL decay (Math.pow), NOT logarithmic compression
14
+ * * We do NOT implement temporal reference points from the research
15
+ * - Temporal aggregation and opinion propagation research
16
+ * - Coherence analysis in temporal sequences
17
+ *
18
+ * IMPORTANT: This implementation uses EXPONENTIAL decay (decayFactor^age), NOT the
19
+ * logarithmic compression (Weber-Fechner law) described in arXiv:2507.15851. We cite
20
+ * the papers for temporal awareness concepts, but do NOT implement their specific
21
+ * findings (logarithmic compression, temporal reference points).
22
+ */
23
+
24
+ /**
25
+ * Aggregate notes temporally with coherence analysis
26
+ *
27
+ * @param {import('./index.mjs').TemporalNote[]} notes - Array of temporal notes
28
+ * @param {{
29
+ * windowSize?: number;
30
+ * decayFactor?: number;
31
+ * coherenceThreshold?: number;
32
+ * }} [options={}] - Aggregation options
33
+ * @returns {import('./index.mjs').AggregatedTemporalNotes} Aggregated temporal notes with windows and coherence
34
+ */
35
+ import { TEMPORAL_CONSTANTS } from './constants.mjs';
36
+
37
/**
 * Aggregate notes into fixed-size temporal windows, score each window with
 * exponentially decayed weights, and run coherence/conflict analysis on the
 * resulting window summaries.
 *
 * A note's position is its elapsed time: `note.elapsed` when provided
 * (an explicit 0 is honored), otherwise `note.timestamp` minus the earliest
 * timestamp found among the notes. Notes whose position does not resolve to a
 * finite window index are not placed in any window.
 *
 * @param {import('./index.mjs').TemporalNote[]} notes - Array of temporal notes
 * @param {{
 *   windowSize?: number;
 *   decayFactor?: number;
 *   coherenceThreshold?: number;
 * }} [options={}] - Aggregation options. `coherenceThreshold` is accepted for
 *   compatibility but is not used by this function.
 * @returns {import('./index.mjs').AggregatedTemporalNotes} Object with
 *   `windows` (dense, ordered by window index; each entry carries its own
 *   `window` index), `summary`, `coherence`, `conflicts`, `totalNotes` and
 *   `timeSpan`. When no note passes the filter, only `windows`, `summary`,
 *   `coherence` and `conflicts` are returned.
 */
export function aggregateTemporalNotes(notes, options = {}) {
  const {
    windowSize = TEMPORAL_CONSTANTS.DEFAULT_WINDOW_SIZE_MS,
    decayFactor = TEMPORAL_CONSTANTS.DEFAULT_DECAY_FACTOR
  } = options ?? {};

  // Keep any note that carries a timestamp or an elapsed value, ordered by
  // timestamp. filter() returns a new array, so the caller's array is not
  // reordered by sort().
  const sortedNotes = (Array.isArray(notes) ? notes : [])
    .filter(n => n && (n.timestamp || n.elapsed !== undefined))
    .sort((a, b) => (a.timestamp || 0) - (b.timestamp || 0));

  if (sortedNotes.length === 0) {
    return {
      windows: [],
      summary: 'No gameplay notes available',
      coherence: 1.0,
      conflicts: []
    };
  }

  // Reference point for timestamp-only notes: the earliest real timestamp.
  // Elapsed-only notes sort first (their sort key is 0), so the first array
  // element cannot be trusted to carry a timestamp.
  const startTime = sortedNotes.find(n => n.timestamp)?.timestamp ?? Date.now();

  // Keyed by window index so gaps between populated windows do not create
  // empty array slots, and negative/NaN indices are never used as array keys.
  const bucketsByIndex = new Map();
  let minElapsed = Infinity;
  let maxElapsed = -Infinity;

  for (const note of sortedNotes) {
    // `??` (not `||`) so that an explicit elapsed of 0 is respected.
    const elapsed = note.elapsed ?? (note.timestamp - startTime);
    const windowIndex = Math.floor(elapsed / windowSize);
    if (!Number.isFinite(windowIndex)) {
      continue; // position cannot be resolved; leave the note out of all windows
    }

    minElapsed = Math.min(minElapsed, elapsed);
    maxElapsed = Math.max(maxElapsed, elapsed);

    let bucket = bucketsByIndex.get(windowIndex);
    if (!bucket) {
      bucket = {
        index: windowIndex,
        startTime: startTime + (windowIndex * windowSize),
        endTime: startTime + ((windowIndex + 1) * windowSize),
        notes: [],
        weightedScore: 0,
        totalWeight: 0
      };
      bucketsByIndex.set(windowIndex, bucket);
    }

    // Exponential decay: weight = decayFactor ^ (elapsed / windowSize)
    const weight = Math.pow(decayFactor, elapsed / windowSize);
    bucket.notes.push({ ...note, weight });

    // gameState.score takes precedence over note.score; `??` keeps a
    // legitimate score of 0 from falling through to the next candidate.
    const score = note.gameState?.score ?? note.score ?? 0;

    // weightedScore / totalWeight yields the window's weighted average.
    bucket.weightedScore += score * weight;
    bucket.totalWeight += weight;
  }

  const windowSummaries = [...bucketsByIndex.values()]
    .sort((a, b) => a.index - b.index)
    .map(bucket => {
      const weightedAvg = bucket.totalWeight > 0
        ? bucket.weightedScore / bucket.totalWeight
        : 0;
      const fromSeconds = Math.round((bucket.startTime - startTime) / 1000);
      const toSeconds = Math.round((bucket.endTime - startTime) / 1000);

      return {
        window: bucket.index,
        timeRange: `${fromSeconds}s-${toSeconds}s`,
        noteCount: bucket.notes.length,
        avgScore: Math.round(weightedAvg),
        observations: bucket.notes.map(n => n.observation || n.assessment || '').join('; '),
        weightedAvg
      };
    });

  // Coherence analysis, conflict detection and the human-readable summary
  // are delegated to the module-private helpers.
  const coherence = calculateCoherence(windowSummaries);
  const conflicts = detectConflicts(windowSummaries);
  const summary = generateSummary(windowSummaries, coherence, conflicts);

  // Span between the earliest and latest resolved positions, so that
  // timestamp-only notes contribute as well as notes with explicit elapsed.
  const timeSpan = maxElapsed >= minElapsed ? maxElapsed - minElapsed : 0;

  return {
    windows: windowSummaries,
    summary,
    coherence,
    conflicts,
    totalNotes: sortedNotes.length,
    timeSpan
  };
}
146
+
147
/**
 * Calculate coherence score (0-1)
 *
 * Coherence measures how consistent temporal notes are over time. Higher coherence
 * indicates stable, predictable patterns. Lower coherence indicates erratic behavior.
 *
 * The score is a weighted combination of:
 * - direction consistency (0.35): how rarely the score trend flips up/down
 * - stability (0.25): currently the same ratio as direction consistency
 * - variance coherence (0.25): score spread, further penalized by trend flips
 * - observation consistency (0.15): keyword overlap between adjacent windows
 *
 * Entries that are missing (empty slots, null/undefined) and scores that are
 * not finite numbers are ignored. Fewer than two usable scores yields 1.0.
 *
 * @param {Array} windows - Temporal window summaries with avgScore
 * @returns {number} Coherence score 0-1 (1 = perfectly consistent, 0 = erratic)
 */
function calculateCoherence(windows) {
  if (!Array.isArray(windows) || windows.length < 2) return 1.0;

  const clamp01 = value => Math.max(0, Math.min(1, value));

  // Only finite numeric scores take part in the trend/variance metrics.
  const scores = windows
    .map(w => w?.avgScore)
    .filter(s => Number.isFinite(s));

  if (scores.length < 2) return 1.0;

  // Direction (+1 / -1) of each step between consecutive scores.
  const trends = [];
  for (let i = 1; i < scores.length; i++) {
    trends.push(scores[i] - scores[i - 1] >= 0 ? 1 : -1);
  }

  // Metric 1: Direction consistency.
  // Count how often the direction flips (up→down or down→up); more flips
  // means more erratic behavior.
  let directionChanges = 0;
  for (let i = 1; i < trends.length; i++) {
    if (trends[i] !== trends[i - 1]) {
      directionChanges++;
    }
  }
  const directionChangeRatio = directionChanges / Math.max(1, trends.length);
  const directionConsistency = clamp01(1.0 - directionChangeRatio);

  // Metric 2: Stability.
  // NOTE(review): this uses the same denominator (number of trend steps) as
  // direction consistency, so both metrics are always equal and direction
  // effectively carries a combined weight of 0.60. Kept as-is to preserve the
  // existing scoring; confirm whether a different denominator was intended.
  const stability = directionConsistency;

  // Metric 3: Score variance, normalized against the largest of:
  // (range / 2)^2, (50% of mean)^2, and a floor of 10 that avoids dividing
  // by tiny numbers.
  const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length;
  const variance = scores.reduce((sum, score) => sum + Math.pow(score - meanScore, 2), 0) / scores.length;

  // Loop instead of Math.max(...scores) so very long inputs cannot exceed
  // the engine's argument limit.
  let minScore = Infinity;
  let maxScore = -Infinity;
  for (const score of scores) {
    if (score < minScore) minScore = score;
    if (score > maxScore) maxScore = score;
  }
  const scoreRange = maxScore - minScore;
  const maxVariance = Math.max(
    Math.pow(scoreRange / 2, 2),
    Math.pow(meanScore * 0.5, 2),
    10
  );
  const varianceCoherence = clamp01(1.0 - (variance / maxVariance));

  // Direction changes reduce variance coherence by up to 70%.
  const adjustedVarianceCoherence = clamp01(varianceCoherence * (1.0 - directionChangeRatio * 0.7));

  // Metric 4: Observation consistency (keyword-based, least reliable).
  const observationConsistency = calculateObservationConsistency(windows);

  // Final coherence: weighted combination of all metrics.
  // Don't change these weights without testing against known erratic vs.
  // stable patterns and measuring the impact on conflict detection.
  const coherence = (
    directionConsistency * 0.35 +
    stability * 0.25 +
    adjustedVarianceCoherence * 0.25 +
    observationConsistency * 0.15
  );

  // Fall back to a neutral 0.5 if the combination is NaN/Infinity.
  return Number.isFinite(coherence) ? clamp01(coherence) : 0.5;
}

/**
 * Average Jaccard overlap of observation keywords (whitespace-separated words
 * longer than 3 characters, lowercased) between adjacent windows.
 *
 * Pairs where either window has no keywords contribute 0 to the sum but still
 * count toward the number of pairs.
 *
 * @param {Array} windows - Temporal window summaries with `observations`
 * @returns {number} Overlap score 0-1
 */
function calculateObservationConsistency(windows) {
  const keywordSets = windows.map(w => {
    const text = (w?.observations || '').toLowerCase();
    return new Set(text.split(/\s+/).filter(word => word.length > 3));
  });

  let overlapSum = 0;
  for (let i = 1; i < keywordSets.length; i++) {
    const prev = keywordSets[i - 1];
    const curr = keywordSets[i];
    // prev/curr are undefined for empty slots of a sparse input array.
    if (prev && curr && prev.size > 0 && curr.size > 0) {
      const shared = [...prev].filter(word => curr.has(word)).length;
      const combined = new Set([...prev, ...curr]).size;
      overlapSum += combined > 0 ? shared / combined : 0;
    }
  }

  return Math.max(0, Math.min(1, overlapSum / Math.max(1, keywordSets.length - 1)));
}
301
+
302
/**
 * Detect conflicting opinions across temporal windows.
 *
 * Two kinds of conflict are reported, in this order:
 * - `mixed_sentiment`: a window's observations contain both a positive and a
 *   negative sentiment word.
 * - `score_decrease`: a window's avgScore is lower than the avgScore of the
 *   entry immediately before it in the array.
 *
 * Sentiment words are matched at the start of a word, so "clearly" and
 * "slowly" count, while "unclear" or "unresponsive" are not mistaken for the
 * positive words "clear" / "responsive".
 *
 * @param {Array} windows - Temporal window summaries (`window`, `avgScore`, `observations`)
 * @returns {Array<Object>} Detected conflicts; empty when none are found
 */
function detectConflicts(windows) {
  const conflicts = [];
  if (!Array.isArray(windows)) return conflicts;

  const positiveWords = ['good', 'great', 'excellent', 'smooth', 'responsive', 'clear'];
  const negativeWords = ['bad', 'poor', 'slow', 'laggy', 'unclear', 'confusing'];

  // \b anchors each word to a word start; plain substring matching would let
  // "unclear" satisfy both the positive ("clear") and negative lists.
  const positivePattern = new RegExp(`\\b(?:${positiveWords.join('|')})`);
  const negativePattern = new RegExp(`\\b(?:${negativeWords.join('|')})`);

  for (const entry of windows) {
    if (!entry) continue; // empty slot or null entry
    const text = (entry.observations || '').toLowerCase();
    if (positivePattern.test(text) && negativePattern.test(text)) {
      conflicts.push({
        window: entry.window,
        type: 'mixed_sentiment',
        observation: entry.observations
      });
    }
  }

  // Check for score inconsistencies between adjacent entries.
  for (let i = 1; i < windows.length; i++) {
    const previous = windows[i - 1];
    const current = windows[i];
    if (!previous || !current) continue;
    if (previous.avgScore === undefined || current.avgScore === undefined) continue;

    if (current.avgScore < previous.avgScore) {
      conflicts.push({
        window: current.window,
        type: 'score_decrease',
        previousScore: previous.avgScore,
        currentScore: current.avgScore
      });
    }
  }

  return conflicts;
}
343
+
344
/**
 * Generate human-readable summary
 *
 * Produces a single-line summary with the number of populated windows, the
 * score progression from the first to the last populated window, the
 * coherence percentage with a high/moderate/low label (thresholds 0.7 and
 * 0.4), and the types of any detected conflicts.
 *
 * Empty slots and null/undefined entries in `windows` are ignored, so a
 * sparse window array does not inflate the window count or make the
 * progression start from a missing window.
 *
 * @param {Array} windows - Temporal window summaries with avgScore
 * @param {number} coherence - Coherence score 0-1
 * @param {Array<{type: string}>} conflicts - Detected conflicts
 * @returns {string} Summary sentences joined by single spaces
 */
function generateSummary(windows, coherence, conflicts) {
  // filter() drops empty slots as well as null/undefined entries.
  const populated = Array.isArray(windows) ? windows.filter(Boolean) : [];
  const conflictList = Array.isArray(conflicts) ? conflicts : [];

  const parts = [`Aggregated ${populated.length} temporal windows from gameplay notes.`];

  if (populated.length > 0) {
    const firstScore = populated[0].avgScore ?? 0;
    const lastScore = populated[populated.length - 1].avgScore ?? 0;
    const delta = lastScore - firstScore;
    // Negative deltas already carry their own sign.
    parts.push(`Score progression: ${firstScore} → ${lastScore} (${delta > 0 ? '+' : ''}${delta}).`);
  }

  let coherenceLabel = '(low)';
  if (coherence > 0.7) {
    coherenceLabel = '(high)';
  } else if (coherence > 0.4) {
    coherenceLabel = '(moderate)';
  }
  parts.push(`Temporal coherence: ${(coherence * 100).toFixed(0)}% ${coherenceLabel}.`);

  if (conflictList.length > 0) {
    const plural = conflictList.length > 1 ? 's' : '';
    parts.push(`Detected ${conflictList.length} potential conflict${plural}: ${conflictList.map(c => c.type).join(', ')}.`);
  }

  return parts.join(' ');
}
368
+
369
/**
 * Format aggregated temporal notes for prompt inclusion
 *
 * Emits a header, the aggregate summary, one line per populated window
 * (plus its observations, truncated to 100 characters), one line per
 * conflict (serialized as JSON), and the overall coherence percentage.
 *
 * Missing window entries (empty slots or null, e.g. after a JSON round-trip
 * of a sparse array) are skipped. Observations are truncated by Unicode code
 * point so a surrogate pair is never cut in half.
 *
 * @param {import('./index.mjs').AggregatedTemporalNotes} aggregated - Aggregated temporal notes
 * @returns {string} Formatted string for prompt inclusion
 */
export function formatNotesForPrompt(aggregated) {
  const maxObservationChars = 100;
  const populatedWindows = (aggregated.windows ?? []).filter(Boolean);
  const conflicts = aggregated.conflicts ?? [];

  const parts = ['TEMPORAL AGGREGATION ANALYSIS:', aggregated.summary, ''];

  if (populatedWindows.length > 0) {
    parts.push('Temporal Windows:');
    for (const entry of populatedWindows) {
      parts.push(` [${entry.timeRange}] Score: ${entry.avgScore}, Notes: ${entry.noteCount}`);
      if (entry.observations) {
        // Spreading a string iterates code points, not UTF-16 code units.
        const codePoints = [...entry.observations];
        const shown = codePoints.length > maxObservationChars
          ? `${codePoints.slice(0, maxObservationChars).join('')}...`
          : entry.observations;
        parts.push(` Observations: ${shown}`);
      }
    }
    parts.push('');
  }

  if (conflicts.length > 0) {
    parts.push('Coherence Issues:');
    for (const conflict of conflicts) {
      parts.push(` - ${conflict.type}: ${JSON.stringify(conflict)}`);
    }
    parts.push('');
  }

  parts.push(`Overall Coherence: ${(aggregated.coherence * 100).toFixed(0)}%`);

  return parts.join('\n');
}
405
+
406
/**
 * Public entry point for the module-private coherence calculation.
 *
 * Forwards `windows` unchanged to `calculateCoherence` and returns its result.
 *
 * @param {import('./index.mjs').TemporalWindow[]} windows - Array of temporal windows
 * @returns {number} Coherence score (0-1)
 */
export function calculateCoherenceExported(windows) {
  const coherenceScore = calculateCoherence(windows);
  return coherenceScore;
}
415
+