npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.0 - Mend

@machinespirits/eval 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/docs/research/build.sh +74 -0
package/docs/research/figures/figure1.png +0 -0
package/docs/research/figures/figure2.png +0 -0
package/docs/research/figures/figure3.png +0 -0
package/docs/research/figures/figure4.png +0 -0
package/docs/research/figures/figure5.png +0 -0
package/docs/research/figures/figure6.png +0 -0
package/docs/research/header.tex +4 -0
package/docs/research/paper-full.md +1909 -0
package/docs/research/paper-short.md +805 -0
package/docs/research/references.bib +1011 -0
package/index.js +15 -6
package/package.json +14 -21
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/docs/research/ADVANCED-EVAL-ANALYSIS.md DELETED Viewed

@@ -1,60 +0,0 @@
-# Advanced Evaluation Analysis
-**Generated:** 2026-01-14T11:07:14.890Z
-## Extended Recognition Scenarios
-These scenarios test recognition quality across multiple conversation turns, where learner responses are contingent on tutor suggestions.
-### Results Summary
-| Scenario | Turns | Baseline | Recognition | Diff | Cohen's d | Sig |
-|----------|-------|----------|-------------|------|-----------|-----|
-| Sustained Dialogue | 8 | 46.3 | 61.0 | +14.7 | 3.60 | * |
-| Breakdown Recovery | 6 | 57.5 | 71.3 | +13.8 | 2.23 | * |
-| Productive Struggle | 5 | 46.5 | 73.2 | +26.7 | 3.32 | * |
-| Mutual Transformation | 5 | 45.1 | 64.3 | +19.1 | 2.89 | * |
-**Aggregate Statistics:**
-- Average improvement: +18.6 points
-- Average effect size: d = 3.01
-- Significant effects: 4/4
-## Contingent Learner Analysis
-Multi-turn scenarios simulate realistic interactions where learner behavior depends on tutor suggestions. Recognition-enhanced tutoring maintains its quality advantage even as:
-- Learners follow or reject suggestions
-- Conversations extend over multiple turns
-- Learners express frustration or confusion
-- Repair cycles become necessary
-## Bilateral Measurement Framework
-### Tutor Evaluation Dimensions
-1. **Mutual Recognition**: Acknowledges learner as autonomous subject
-2. **Dialectical Responsiveness**: Shaped by learner's specific input
-3. **Transformative Potential**: Enables genuine growth
-### Learner Evaluation Dimensions (Simulated)
-1. **Authenticity**: Genuine perspective contribution
-2. **Responsiveness**: Engagement with tutor suggestions
-3. **Development**: Growth across turns
-### Bilateral Metric
-> "Does engagement produce genuine mutual development?"
-## Integration with Statistical Findings
-The extended scenario results align with our factorial ANOVA findings:
-1. **Recognition Effect Persists**: The large recognition effect (η² = .422) is maintained across extended interactions, suggesting recognition-oriented prompting produces robust improvements.
-2. **Architecture Effect Context-Dependent**: The marginal architecture effect (η² = .034) may become more important in complex multi-turn scenarios requiring repair cycles.
-3. **Additive Benefits Confirmed**: The absence of interaction effects suggests that recognition benefits transfer across different scenario types and lengths.
-## Implications
-1. **Scalability**: Recognition-oriented design scales to longer interactions
-2. **Robustness**: Benefits persist even with contingent learner responses
-3. **Cost-Effectiveness**: Free-tier models achieve recognition quality with proper prompting

package/docs/research/ANOVA-RESULTS-2026-01-14.md DELETED Viewed

@@ -1,257 +0,0 @@
-# Two-Way ANOVA Results
-**Generated:** 2026-01-14T10:22:17.071Z
-**Data Source:** evaluations.db (factorial evaluation runs)
-```
-======================================================================
-TWO-WAY ANOVA RESULTS: 2×2 Factorial Design
-======================================================================
-EXPERIMENTAL DESIGN
-----------------------------------------------------------------------
-Factor A: Architecture (Single-Agent vs Multi-Agent)
-Factor B: Recognition (Standard vs Recognition-Enhanced Prompts)
-Total N: 76
-Grand Mean: 60.87
-CELL STATISTICS
-----------------------------------------------------------------------
-Cell                      N     Mean      SD      95% CI
-----------------------------------------------------------------------
-Single + Standard          8    39.99     9.12   [33.5, 46.4]
-Single + Recognition       6    70.36    26.15   [49.0, 91.7]
-Multi + Standard          31    50.16    15.00   [44.8, 55.5]
-Multi + Recognition       31    75.14    14.68   [69.9, 80.4]
-MARGINAL MEANS
-----------------------------------------------------------------------
-Architecture:  Single = 53.00,  Multi = 62.65
-Recognition:   Standard = 48.07,  Recognition = 74.37
-ANOVA TABLE
-----------------------------------------------------------------------
-Source              SS        df       MS         F        p       Sig
-----------------------------------------------------------------------
-Architecture (A)     1063.08       1     1063.08      4.445     0.050  *
-Recognition (B)     13123.82       1    13123.82     54.877     0.001  ***
-A × B                 124.13       1      124.13      0.519     0.250
-Error               17218.77      72      239.15
-----------------------------------------------------------------------
-Total               31115.95      75
-Significance: *** p < .05, * p < .10
-EFFECT SIZES
-----------------------------------------------------------------------
-Source              η²      Partial η²    Cohen's d    Interpretation
-----------------------------------------------------------------------
-Architecture (A)     0.034       0.058         0.62       Small
-Recognition (B)      0.422       0.433         1.70       Large
-A × B                0.004       0.007        N/A         Negligible
-MAIN EFFECTS (Raw Differences)
-----------------------------------------------------------------------
-Architecture Effect: Multi - Single = +9.65 points
-Recognition Effect:  Recognition - Standard = +26.29 points
-ASSUMPTION CHECKS
-----------------------------------------------------------------------
-Normality (Shapiro-Wilk approx): PASSED ✓
-Homogeneity of Variance (Levene): F = 3.31, p = 0.010 - VIOLATED ✗
-INTERPRETATION
-----------------------------------------------------------------------
-✓ Recognition prompts have a SIGNIFICANT effect (F = 54.88, p < .05)
-  Effect size: large (η² = 0.422)
-✗ Architecture effect is NOT significant (F = 4.45, p = 0.050)
-✗ No significant interaction (F = 0.52, p = 0.250)
-======================================================================
-```
-## JSON Results
-```json
-{
-  "grandMean": 60.87384654818866,
-  "N": 76,
-  "cellStats": {
-    "a0b0": {
-      "n": 8,
-      "mean": 39.98579545454547,
-      "std": 9.11985690330198,
-      "values": [
-        38.44696969696971,
-        38.63636363636365,
-        43.18181818181819,
-        34.848484848484865,
-        52.272727272727295,
-        37.500000000000014,
-        23.86363636363637,
-        51.136363636363654
-      ]
-    },
-    "a0b1": {
-      "n": 6,
-      "mean": 70.3598484848485,
-      "std": 26.148369552557515,
-      "values": [
-        50.3787878787879,
-        76.13636363636364,
-        100.00000000000003,
-        58.14393939393941,
-        37.50000000000002,
-        100.00000000000003
-      ]
-    },
-    "a1b0": {
-      "n": 31,
-      "mean": 50.16175580691711,
-      "std": 14.99861019348702,
-      "values": [
-        42.99242424242427,
-        32.95454545454547,
-        48.86363636363639,
-        49.05303030303031,
-        45.45454545454547,
-        59.09090909090911,
-        63.47402597402599,
-        50.974025974026,
-        58.11688311688312,
-        44.94949494949497,
-        52.02020202020204,
-        41.919191919191945,
-        38.63636363636365,
-        57.00757575757578,
-        48.4848484848485,
-        43.37121212121213,
-        43.181818181818194,
-        50.00000000000002,
-        45.90909090909093,
-        47.72727272727274,
-        53.863636363636395,
-        48.63636363636365,
-        42.80303030303031,
-        51.59090909090911,
-        43.93939393939396,
-        31.818181818181834,
-        18.181818181818194,
-        45.45454545454547,
-        87.5,
-        100,
-        67.04545454545458
-      ]
-    },
-    "a1b1": {
-      "n": 31,
-      "mean": 75.14040171298237,
-      "std": 14.684605144018311,
-      "values": [
-        75.18939393939395,
-        67.04545454545458,
-        100.00000000000003,
-        65.15151515151518,
-        56.818181818181834,
-        95.45454545454548,
-        78.08441558441562,
-        66.23376623376626,
-        69.64285714285717,
-        58.0808080808081,
-        62.12121212121214,
-        62.878787878787904,
-        60.03787878787881,
-        72.9166666666667,
-        82.00757575757578,
-        66.47727272727275,
-        63.636363636363654,
-        52.840909090909115,
-        67.72727272727275,
-        75.68181818181822,
-        60.68181818181819,
-        71.5909090909091,
-        84.09090909090911,
-        68.1818181818182,
-        74.05303030303033,
-        79.54545454545456,
-        93.18181818181822,
-        100.00000000000003,
-        100,
-        100,
-        100
-      ]
-    }
-  },
-  "marginalMeans": {
-    "architecture": {
-      "single": 53.003246753246756,
-      "multi": 62.65107875994973
-    },
-    "recognition": {
-      "standard": 48.07437932437934,
-      "recognition": 74.36517686517688
-    }
-  },
-  "anovaTable": {
-    "architecture": {
-      "SS": 1063.0791445902653,
-      "df": 1,
-      "MS": 1063.0791445902653,
-      "F": 4.445248575427072,
-      "p": 0.05,
-      "sig": false
-    },
-    "recognition": {
-      "SS": 13123.81985503855,
-      "df": 1,
-      "MS": 13123.81985503855,
-      "F": 54.87704449065894,
-      "p": 0.001,
-      "sig": true
-    },
-    "interaction": {
-      "SS": 124.12853011664384,
-      "df": 1,
-      "MS": 124.12853011664384,
-      "F": 0.5190414791586724,
-      "p": 0.25,
-      "sig": false
-    },
-    "error": {
-      "SS": 17218.76675999957,
-      "df": 72,
-      "MS": 239.14953833332734
-    },
-    "total": {
-      "SS": 31115.94559827812,
-      "df": 75
-    }
-  },
-  "effectSizes": {
-    "architecture": {
-      "etaSq": 0.03416509201793609,
-      "partialEtaSq": 0.05814944235600241,
-      "cohenD": 0.6238712311493667
-    },
-    "recognition": {
-      "etaSq": 0.42177152590743666,
-      "partialEtaSq": 0.4325214597404903,
-      "cohenD": 1.7000785480386178
-    },
-    "interaction": {
-      "etaSq": 0.003989225708233428,
-      "partialEtaSq": 0.007157313011477686
-    }
-  },
-  "mainEffects": {
-    "architecture": 9.647832006702977,
-    "recognition": 26.290797540797534
-  },
-  "assumptions": {
-    "normality": true,
-    "homogeneity": false,
-    "leveneF": 3.3126010309809213,
-    "leveneP": 0.01
-  }
-}
-```