@machinespirits/eval 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +14 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
  110. package/types.ts +0 -165
  111. package/utils/haptics.ts +0 -45
@@ -1,124 +0,0 @@
1
- /**
2
- * RecognitionTypeChart Component
3
- *
4
- * Horizontal stacked bar showing distribution of recognition types:
5
- * - Pedagogical (green) - Learning-focused recognition moments
6
- * - Metacognitive (blue) - Reflection on learning process
7
- * - Existential (purple) - Deep identity/meaning moments
8
- */
9
-
10
- import React from 'react';
11
-
12
- interface RecognitionTypeCounts {
13
- pedagogical: number;
14
- metacognitive: number;
15
- existential: number;
16
- }
17
-
18
- interface RecognitionTypeChartProps {
19
- counts: RecognitionTypeCounts;
20
- showLegend?: boolean;
21
- }
22
-
23
- export const RecognitionTypeChart: React.FC<RecognitionTypeChartProps> = ({
24
- counts,
25
- showLegend = true,
26
- }) => {
27
- const total = counts.pedagogical + counts.metacognitive + counts.existential;
28
-
29
- if (total === 0) {
30
- return (
31
- <div className="bg-gray-900/60 backdrop-blur-sm border border-white/5 rounded-xl p-4">
32
- <div className="text-xs text-gray-400 mb-3">Recognition Types</div>
33
- <div className="text-sm text-gray-500 text-center py-4">
34
- No recognition moments recorded
35
- </div>
36
- </div>
37
- );
38
- }
39
-
40
- const percentages = {
41
- pedagogical: (counts.pedagogical / total) * 100,
42
- metacognitive: (counts.metacognitive / total) * 100,
43
- existential: (counts.existential / total) * 100,
44
- };
45
-
46
- const types = [
47
- {
48
- key: 'pedagogical',
49
- label: 'Pedagogical',
50
- count: counts.pedagogical,
51
- percentage: percentages.pedagogical,
52
- color: 'bg-green-500',
53
- textColor: 'text-green-400',
54
- description: 'Learning-focused',
55
- },
56
- {
57
- key: 'metacognitive',
58
- label: 'Metacognitive',
59
- count: counts.metacognitive,
60
- percentage: percentages.metacognitive,
61
- color: 'bg-blue-500',
62
- textColor: 'text-blue-400',
63
- description: 'Process reflection',
64
- },
65
- {
66
- key: 'existential',
67
- label: 'Existential',
68
- count: counts.existential,
69
- percentage: percentages.existential,
70
- color: 'bg-purple-500',
71
- textColor: 'text-purple-400',
72
- description: 'Identity/meaning',
73
- },
74
- ];
75
-
76
- return (
77
- <div className="bg-gray-900/60 backdrop-blur-sm border border-white/5 rounded-xl p-4">
78
- <div className="flex items-center justify-between mb-3">
79
- <div className="text-xs text-gray-400">Recognition Types</div>
80
- <div className="text-xs text-gray-500">{total} total</div>
81
- </div>
82
-
83
- {/* Stacked bar */}
84
- <div className="h-6 rounded-full overflow-hidden flex bg-gray-800">
85
- {types.map(
86
- (type) =>
87
- type.percentage > 0 && (
88
- <div
89
- key={type.key}
90
- className={`${type.color} transition-all duration-500`}
91
- style={{ width: `${type.percentage}%` }}
92
- title={`${type.label}: ${type.count} (${type.percentage.toFixed(1)}%)`}
93
- />
94
- )
95
- )}
96
- </div>
97
-
98
- {/* Legend */}
99
- {showLegend && (
100
- <div className="mt-4 space-y-2">
101
- {types.map((type) => (
102
- <div key={type.key} className="flex items-center justify-between">
103
- <div className="flex items-center gap-2">
104
- <div className={`w-3 h-3 rounded-full ${type.color}`} />
105
- <span className="text-xs text-gray-300">{type.label}</span>
106
- <span className="text-xs text-gray-600">{type.description}</span>
107
- </div>
108
- <div className="flex items-center gap-2">
109
- <span className={`text-xs ${type.textColor} font-medium`}>
110
- {type.count}
111
- </span>
112
- <span className="text-xs text-gray-500">
113
- ({type.percentage.toFixed(0)}%)
114
- </span>
115
- </div>
116
- </div>
117
- ))}
118
- </div>
119
- )}
120
- </div>
121
- );
122
- };
123
-
124
- export default RecognitionTypeChart;