@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
package/types.ts DELETED
@@ -1,165 +0,0 @@
1
- /**
2
- * Evaluation Types
3
- *
4
- * Types for the eval dashboard components.
5
- */
6
-
7
- export interface EvalProfile {
8
- name: string;
9
- description: string;
10
- egoProvider?: string;
11
- egoModel?: string;
12
- superegoProvider?: string;
13
- superegoModel?: string;
14
- dialogueEnabled?: boolean;
15
- maxRounds?: number;
16
- }
17
-
18
- export interface EvalScenario {
19
- id: string;
20
- name: string;
21
- description?: string;
22
- category?: string;
23
- turnCount?: number;
24
- isMultiTurn?: boolean;
25
- }
26
-
27
- export interface EvalRun {
28
- id: string;
29
- description?: string;
30
- totalTests?: number;
31
- totalScenarios?: number;
32
- totalConfigurations?: number;
33
- status: 'running' | 'completed' | 'failed';
34
- createdAt: string;
35
- completedAt?: string;
36
- runType?: 'quick' | 'batch' | 'matrix' | 'compare' | 'interaction';
37
- profiles?: string[];
38
- }
39
-
40
- export type EvalDimensionScore = number | { score: number; reasoning?: string; quote?: string } | null;
41
-
42
- export interface EvalDimensionScores {
43
- relevance: EvalDimensionScore;
44
- specificity: EvalDimensionScore;
45
- pedagogical: EvalDimensionScore;
46
- personalization: EvalDimensionScore;
47
- actionability: EvalDimensionScore;
48
- tone: EvalDimensionScore;
49
- }
50
-
51
- export interface EvalSuggestion {
52
- type: string;
53
- title: string;
54
- message: string;
55
- actionTarget?: string;
56
- headline?: string;
57
- body?: string;
58
- priority?: 'high' | 'medium' | 'low';
59
- }
60
-
61
- export interface EvalValidation {
62
- passesRequired: boolean;
63
- passesForbidden: boolean;
64
- requiredMissing: string[];
65
- forbiddenFound: string[];
66
- }
67
-
68
- export interface EvalQuickTestResult {
69
- scenarioId: string;
70
- scenarioName: string;
71
- profile: string;
72
- provider?: string;
73
- model?: string;
74
- passed: boolean;
75
- overallScore: number | null;
76
- latencyMs: number;
77
- scores?: EvalDimensionScores;
78
- validation?: EvalValidation;
79
- suggestions?: EvalSuggestion[];
80
- inputTokens?: number;
81
- outputTokens?: number;
82
- totalTokens?: number;
83
- apiCalls?: number;
84
- dialogueRounds?: number;
85
- evaluationReasoning?: string;
86
- evaluatorModel?: string;
87
- scenarioContext?: {
88
- description: string;
89
- expectedBehavior?: string;
90
- learnerContext?: Record<string, string | undefined>;
91
- };
92
- }
93
-
94
- // Agent role types for dialogue system
95
- export type AgentRole = 'user' | 'ego' | 'superego';
96
- export type DialogueDirection = 'input' | 'request' | 'response';
97
-
98
- export interface EvalDialogueEntry {
99
- timestamp: string;
100
- agent: AgentRole;
101
- action?: string;
102
- model?: string;
103
- provider?: string;
104
- latencyMs?: number;
105
- inputTokens?: number;
106
- outputTokens?: number;
107
- suggestions?: Array<{ type: string; title: string; message: string; priority?: string }>;
108
- verdict?: { approved: boolean; confidence?: number; feedback?: string };
109
- preAnalysis?: {
110
- isPreAnalysis: boolean;
111
- reinterpretations?: unknown[];
112
- overallCaution?: string;
113
- };
114
- from?: AgentRole;
115
- to?: AgentRole;
116
- direction?: DialogueDirection;
117
- rawContext?: string;
118
- contextData?: {
119
- courseId?: string;
120
- courseTitle?: string;
121
- lectureId?: string;
122
- lectureTitle?: string;
123
- recentActivity?: string[];
124
- };
125
- output?: unknown;
126
- cost?: number;
127
- }
128
-
129
- export interface EvalDialogue {
130
- dialogueId: string;
131
- startTime: string;
132
- endTime: string;
133
- entryCount: number;
134
- entries?: EvalDialogueEntry[];
135
- summary?: {
136
- egoCount: number;
137
- superegoCount: number;
138
- totalSuggestions: number;
139
- approvedCount: number;
140
- revisedCount: number;
141
- totalLatencyMs: number;
142
- totalInputTokens?: number;
143
- totalOutputTokens?: number;
144
- };
145
- }
146
-
147
- export interface EvalTrendPoint {
148
- runId: string;
149
- createdAt: string;
150
- description?: string;
151
- runType?: 'quick' | 'eval' | 'matrix' | 'compare' | 'auto';
152
- profiles?: string[];
153
- scenarioCount?: number;
154
- testCount: number;
155
- overallScore: number | null;
156
- dimensions: EvalDimensionScores;
157
- }
158
-
159
- export interface EvalDoc {
160
- name: string;
161
- filename: string;
162
- title: string;
163
- size: number;
164
- modified: string;
165
- }
package/utils/haptics.ts DELETED
@@ -1,45 +0,0 @@
1
- /**
2
- * Haptic Feedback Utilities
3
- *
4
- * Provides consistent vibration patterns for mobile interactions.
5
- * Falls back gracefully when vibration API is not available.
6
- */
7
-
8
- type VibrationPattern = number | number[];
9
-
10
- const vibrate = (pattern: VibrationPattern): void => {
11
- if (typeof navigator !== 'undefined' && navigator.vibrate) {
12
- navigator.vibrate(pattern);
13
- }
14
- };
15
-
16
- export const haptics = {
17
- /** Light tap - tab changes, selections */
18
- light: () => vibrate(5),
19
-
20
- /** Medium tap - pull-to-refresh trigger, confirmations */
21
- medium: () => vibrate(10),
22
-
23
- /** Heavy tap - errors, warnings */
24
- heavy: () => vibrate(20),
25
-
26
- /** Success pattern - test passed, action completed */
27
- success: () => vibrate([10, 50, 10]),
28
-
29
- /** Error pattern - test failed, error occurred */
30
- error: () => vibrate([20, 100, 20, 100, 20]),
31
-
32
- /** Back online notification */
33
- online: () => vibrate([100, 50, 100]),
34
-
35
- /** Went offline notification */
36
- offline: () => vibrate(200),
37
-
38
- /** Copy to clipboard */
39
- copy: () => vibrate(30),
40
-
41
- /** Button press feedback */
42
- button: () => vibrate(8)
43
- };
44
-
45
- export default haptics;