npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102)

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/types.ts DELETED

@@ -1,165 +0,0 @@
-/**
- * Evaluation Types
- *
- * Types for the eval dashboard components.
- */
-export interface EvalProfile {
-  name: string;
-  description: string;
-  egoProvider?: string;
-  egoModel?: string;
-  superegoProvider?: string;
-  superegoModel?: string;
-  dialogueEnabled?: boolean;
-  maxRounds?: number;
-}
-export interface EvalScenario {
-  id: string;
-  name: string;
-  description?: string;
-  category?: string;
-  turnCount?: number;
-  isMultiTurn?: boolean;
-}
-export interface EvalRun {
-  id: string;
-  description?: string;
-  totalTests?: number;
-  totalScenarios?: number;
-  totalConfigurations?: number;
-  status: 'running' | 'completed' | 'failed';
-  createdAt: string;
-  completedAt?: string;
-  runType?: 'quick' | 'batch' | 'matrix' | 'compare' | 'interaction';
-  profiles?: string[];
-}
-export type EvalDimensionScore = number | { score: number; reasoning?: string; quote?: string } | null;
-export interface EvalDimensionScores {
-  relevance: EvalDimensionScore;
-  specificity: EvalDimensionScore;
-  pedagogical: EvalDimensionScore;
-  personalization: EvalDimensionScore;
-  actionability: EvalDimensionScore;
-  tone: EvalDimensionScore;
-}
-export interface EvalSuggestion {
-  type: string;
-  title: string;
-  message: string;
-  actionTarget?: string;
-  headline?: string;
-  body?: string;
-  priority?: 'high' | 'medium' | 'low';
-}
-export interface EvalValidation {
-  passesRequired: boolean;
-  passesForbidden: boolean;
-  requiredMissing: string[];
-  forbiddenFound: string[];
-}
-export interface EvalQuickTestResult {
-  scenarioId: string;
-  scenarioName: string;
-  profile: string;
-  provider?: string;
-  model?: string;
-  passed: boolean;
-  overallScore: number | null;
-  latencyMs: number;
-  scores?: EvalDimensionScores;
-  validation?: EvalValidation;
-  suggestions?: EvalSuggestion[];
-  inputTokens?: number;
-  outputTokens?: number;
-  totalTokens?: number;
-  apiCalls?: number;
-  dialogueRounds?: number;
-  evaluationReasoning?: string;
-  evaluatorModel?: string;
-  scenarioContext?: {
-    description: string;
-    expectedBehavior?: string;
-    learnerContext?: Record<string, string | undefined>;
-  };
-}
-// Agent role types for dialogue system
-export type AgentRole = 'user' | 'ego' | 'superego';
-export type DialogueDirection = 'input' | 'request' | 'response';
-export interface EvalDialogueEntry {
-  timestamp: string;
-  agent: AgentRole;
-  action?: string;
-  model?: string;
-  provider?: string;
-  latencyMs?: number;
-  inputTokens?: number;
-  outputTokens?: number;
-  suggestions?: Array<{ type: string; title: string; message: string; priority?: string }>;
-  verdict?: { approved: boolean; confidence?: number; feedback?: string };
-  preAnalysis?: {
-    isPreAnalysis: boolean;
-    reinterpretations?: unknown[];
-    overallCaution?: string;
-  };
-  from?: AgentRole;
-  to?: AgentRole;
-  direction?: DialogueDirection;
-  rawContext?: string;
-  contextData?: {
-    courseId?: string;
-    courseTitle?: string;
-    lectureId?: string;
-    lectureTitle?: string;
-    recentActivity?: string[];
-  };
-  output?: unknown;
-  cost?: number;
-}
-export interface EvalDialogue {
-  dialogueId: string;
-  startTime: string;
-  endTime: string;
-  entryCount: number;
-  entries?: EvalDialogueEntry[];
-  summary?: {
-    egoCount: number;
-    superegoCount: number;
-    totalSuggestions: number;
-    approvedCount: number;
-    revisedCount: number;
-    totalLatencyMs: number;
-    totalInputTokens?: number;
-    totalOutputTokens?: number;
-  };
-}
-export interface EvalTrendPoint {
-  runId: string;
-  createdAt: string;
-  description?: string;
-  runType?: 'quick' | 'eval' | 'matrix' | 'compare' | 'auto';
-  profiles?: string[];
-  scenarioCount?: number;
-  testCount: number;
-  overallScore: number | null;
-  dimensions: EvalDimensionScores;
-}
-export interface EvalDoc {
-  name: string;
-  filename: string;
-  title: string;
-  size: number;
-  modified: string;
-}

package/utils/haptics.ts DELETED

@@ -1,45 +0,0 @@
-/**
- * Haptic Feedback Utilities
- *
- * Provides consistent vibration patterns for mobile interactions.
- * Falls back gracefully when vibration API is not available.
- */
-type VibrationPattern = number | number[];
-const vibrate = (pattern: VibrationPattern): void => {
-  if (typeof navigator !== 'undefined' && navigator.vibrate) {
-    navigator.vibrate(pattern);
-  }
-};
-export const haptics = {
-  /** Light tap - tab changes, selections */
-  light: () => vibrate(5),
-  /** Medium tap - pull-to-refresh trigger, confirmations */
-  medium: () => vibrate(10),
-  /** Heavy tap - errors, warnings */
-  heavy: () => vibrate(20),
-  /** Success pattern - test passed, action completed */
-  success: () => vibrate([10, 50, 10]),
-  /** Error pattern - test failed, error occurred */
-  error: () => vibrate([20, 100, 20, 100, 20]),
-  /** Back online notification */
-  online: () => vibrate([100, 50, 100]),
-  /** Went offline notification */
-  offline: () => vibrate(200),
-  /** Copy to clipboard */
-  copy: () => vibrate(30),
-  /** Button press feedback */
-  button: () => vibrate(8)
-};
-export default haptics;