ppef 1.3.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ppef.schema.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "$id": "https://ppef.dev/schemas/v1.3.0/ppef.schema.json",
3
+ "$id": "https://ppef.dev/schemas/v1.4.0/ppef.schema.json",
4
4
  "title": "ExperimentConfig",
5
5
  "description": "PPEF experiment configuration",
6
6
  "type": "object",
@@ -439,165 +439,2600 @@
439
439
  }
440
440
  ],
441
441
  "$defs": {
442
+ "AggregatedResult": {
443
+ "title": "AggregatedResult",
444
+ "description": "Aggregated result for a SUT",
445
+ "type": "object",
446
+ "properties": {
447
+ "caseClass": {
448
+ "description": "Case class (if grouped)",
449
+ "type": "string"
450
+ },
451
+ "comparisons": {
452
+ "description": "Comparisons with baselines",
453
+ "type": "object",
454
+ "additionalProperties": {
455
+ "title": "ComparisonMetrics",
456
+ "description": "Comparison metrics between primary and baseline SUTs",
457
+ "type": "object",
458
+ "properties": {
459
+ "betterRate": {
460
+ "description": "Win rate (% of cases where primary beats baseline)",
461
+ "type": "number"
462
+ },
463
+ "deltas": {
464
+ "description": "Absolute deltas (primary - baseline)",
465
+ "type": "object",
466
+ "additionalProperties": {
467
+ "type": "number"
468
+ },
469
+ "propertyNames": {
470
+ "type": "string"
471
+ }
472
+ },
473
+ "effectSize": {
474
+ "description": "Effect size (Cohen's d)",
475
+ "type": "number"
476
+ },
477
+ "pValue": {
478
+ "description": "Statistical significance (p-value)",
479
+ "type": "number"
480
+ },
481
+ "ratios": {
482
+ "description": "Ratios (primary / baseline)",
483
+ "type": "object",
484
+ "additionalProperties": {
485
+ "type": "number"
486
+ },
487
+ "propertyNames": {
488
+ "type": "string"
489
+ }
490
+ },
491
+ "uStatistic": {
492
+ "description": "Mann-Whitney U statistic",
493
+ "type": "number"
494
+ }
495
+ },
496
+ "required": [
497
+ "deltas",
498
+ "ratios"
499
+ ],
500
+ "additionalProperties": false
501
+ },
502
+ "propertyNames": {
503
+ "type": "string"
504
+ }
505
+ },
506
+ "correctness": {
507
+ "type": "object",
508
+ "properties": {
509
+ "failureBreakdown": {
510
+ "description": "Breakdown of failure types",
511
+ "type": "object",
512
+ "additionalProperties": {
513
+ "type": "number"
514
+ },
515
+ "propertyNames": {
516
+ "type": "string"
517
+ }
518
+ },
519
+ "matchesExpectedRate": {
520
+ "description": "Fraction of runs matching expected",
521
+ "type": "number"
522
+ },
523
+ "producedOutputRate": {
524
+ "description": "Fraction of runs that produced any output",
525
+ "type": "number"
526
+ },
527
+ "validRate": {
528
+ "description": "Fraction of runs that produced valid output",
529
+ "type": "number"
530
+ }
531
+ },
532
+ "required": [
533
+ "producedOutputRate",
534
+ "validRate"
535
+ ],
536
+ "additionalProperties": false
537
+ },
538
+ "coverage": {
539
+ "title": "CoverageMetrics",
540
+ "description": "Coverage information",
541
+ "type": "object",
542
+ "properties": {
543
+ "caseCoverage": {
544
+ "description": "Fraction of cases covered",
545
+ "type": "number"
546
+ },
547
+ "metricCoverage": {
548
+ "description": "Metric availability (metric name -> coverage fraction)",
549
+ "type": "object",
550
+ "additionalProperties": {
551
+ "type": "number"
552
+ },
553
+ "propertyNames": {
554
+ "type": "string"
555
+ }
556
+ },
557
+ "missingCases": {
558
+ "description": "Missing case IDs",
559
+ "type": "array",
560
+ "items": {
561
+ "type": "string"
562
+ }
563
+ }
564
+ },
565
+ "required": [
566
+ "caseCoverage",
567
+ "metricCoverage"
568
+ ],
569
+ "additionalProperties": false
570
+ },
571
+ "group": {
572
+ "type": "object",
573
+ "properties": {
574
+ "caseCount": {
575
+ "description": "Number of unique cases",
576
+ "type": "integer",
577
+ "minimum": -9007199254740991,
578
+ "maximum": 2147483647
579
+ },
580
+ "configHash": {
581
+ "description": "Hash of configuration",
582
+ "type": "string"
583
+ },
584
+ "runCount": {
585
+ "description": "Number of runs in this aggregate",
586
+ "type": "integer",
587
+ "minimum": -9007199254740991,
588
+ "maximum": 2147483647
589
+ }
590
+ },
591
+ "required": [
592
+ "caseCount",
593
+ "runCount"
594
+ ],
595
+ "additionalProperties": false
596
+ },
597
+ "metadata": {
598
+ "description": "Additional metadata",
599
+ "type": "object",
600
+ "additionalProperties": {
601
+ "anyOf": [
602
+ {
603
+ "type": "string"
604
+ },
605
+ {
606
+ "type": "number"
607
+ },
608
+ {
609
+ "type": "boolean"
610
+ },
611
+ {
612
+ "type": "null"
613
+ }
614
+ ]
615
+ },
616
+ "propertyNames": {
617
+ "type": "string"
618
+ }
619
+ },
620
+ "metrics": {
621
+ "description": "Aggregated metrics (metric name -> summary stats)",
622
+ "type": "object",
623
+ "additionalProperties": {
624
+ "title": "SummaryStats",
625
+ "description": "Summary statistics for a numeric metric",
626
+ "type": "object",
627
+ "properties": {
628
+ "confidence95": {
629
+ "description": "95% confidence interval [lower, upper]",
630
+ "type": "array",
631
+ "prefixItems": [
632
+ {
633
+ "type": "number"
634
+ },
635
+ {
636
+ "type": "number"
637
+ }
638
+ ]
639
+ },
640
+ "max": {
641
+ "description": "Maximum value",
642
+ "type": "number"
643
+ },
644
+ "mean": {
645
+ "description": "Arithmetic mean",
646
+ "type": "number"
647
+ },
648
+ "median": {
649
+ "description": "Median (50th percentile)",
650
+ "type": "number"
651
+ },
652
+ "min": {
653
+ "description": "Minimum value",
654
+ "type": "number"
655
+ },
656
+ "n": {
657
+ "description": "Number of observations",
658
+ "type": "integer",
659
+ "minimum": -9007199254740991,
660
+ "maximum": 2147483647
661
+ },
662
+ "p25": {
663
+ "description": "25th percentile",
664
+ "type": "number"
665
+ },
666
+ "p75": {
667
+ "description": "75th percentile",
668
+ "type": "number"
669
+ },
670
+ "std": {
671
+ "description": "Standard deviation (sample)",
672
+ "type": "number"
673
+ },
674
+ "sum": {
675
+ "description": "Sum of all values",
676
+ "type": "number"
677
+ }
678
+ },
679
+ "required": [
680
+ "max",
681
+ "mean",
682
+ "median",
683
+ "min",
684
+ "n"
685
+ ],
686
+ "additionalProperties": false
687
+ },
688
+ "propertyNames": {
689
+ "type": "string"
690
+ }
691
+ },
692
+ "sut": {
693
+ "description": "SUT identifier",
694
+ "type": "string"
695
+ },
696
+ "sutRole": {
697
+ "description": "Role of the SUT in evaluation",
698
+ "type": "string",
699
+ "oneOf": [
700
+ {
701
+ "description": "The system being evaluated; the novel algorithm or implementation",
702
+ "const": "primary"
703
+ },
704
+ {
705
+ "description": "A reference implementation for comparison",
706
+ "const": "baseline"
707
+ },
708
+ {
709
+ "description": "Ground truth provider; defines correct answers",
710
+ "const": "oracle"
711
+ }
712
+ ]
713
+ }
714
+ },
715
+ "required": [
716
+ "correctness",
717
+ "group",
718
+ "metrics",
719
+ "sut",
720
+ "sutRole"
721
+ ],
722
+ "additionalProperties": false
723
+ },
724
+ "AggregationOutput": {
725
+ "title": "AggregationOutput",
726
+ "description": "Complete aggregation output",
727
+ "type": "object",
728
+ "properties": {
729
+ "aggregates": {
730
+ "description": "Aggregated results",
731
+ "type": "array",
732
+ "items": {
733
+ "title": "AggregatedResult",
734
+ "description": "Aggregated result for a SUT",
735
+ "type": "object",
736
+ "properties": {
737
+ "caseClass": {
738
+ "description": "Case class (if grouped)",
739
+ "type": "string"
740
+ },
741
+ "comparisons": {
742
+ "description": "Comparisons with baselines",
743
+ "type": "object",
744
+ "additionalProperties": {
745
+ "title": "ComparisonMetrics",
746
+ "description": "Comparison metrics between primary and baseline SUTs",
747
+ "type": "object",
748
+ "properties": {
749
+ "betterRate": {
750
+ "description": "Win rate (% of cases where primary beats baseline)",
751
+ "type": "number"
752
+ },
753
+ "deltas": {
754
+ "description": "Absolute deltas (primary - baseline)",
755
+ "type": "object",
756
+ "additionalProperties": {
757
+ "type": "number"
758
+ },
759
+ "propertyNames": {
760
+ "type": "string"
761
+ }
762
+ },
763
+ "effectSize": {
764
+ "description": "Effect size (Cohen's d)",
765
+ "type": "number"
766
+ },
767
+ "pValue": {
768
+ "description": "Statistical significance (p-value)",
769
+ "type": "number"
770
+ },
771
+ "ratios": {
772
+ "description": "Ratios (primary / baseline)",
773
+ "type": "object",
774
+ "additionalProperties": {
775
+ "type": "number"
776
+ },
777
+ "propertyNames": {
778
+ "type": "string"
779
+ }
780
+ },
781
+ "uStatistic": {
782
+ "description": "Mann-Whitney U statistic",
783
+ "type": "number"
784
+ }
785
+ },
786
+ "required": [
787
+ "deltas",
788
+ "ratios"
789
+ ],
790
+ "additionalProperties": false
791
+ },
792
+ "propertyNames": {
793
+ "type": "string"
794
+ }
795
+ },
796
+ "correctness": {
797
+ "type": "object",
798
+ "properties": {
799
+ "failureBreakdown": {
800
+ "description": "Breakdown of failure types",
801
+ "type": "object",
802
+ "additionalProperties": {
803
+ "type": "number"
804
+ },
805
+ "propertyNames": {
806
+ "type": "string"
807
+ }
808
+ },
809
+ "matchesExpectedRate": {
810
+ "description": "Fraction of runs matching expected",
811
+ "type": "number"
812
+ },
813
+ "producedOutputRate": {
814
+ "description": "Fraction of runs that produced any output",
815
+ "type": "number"
816
+ },
817
+ "validRate": {
818
+ "description": "Fraction of runs that produced valid output",
819
+ "type": "number"
820
+ }
821
+ },
822
+ "required": [
823
+ "producedOutputRate",
824
+ "validRate"
825
+ ],
826
+ "additionalProperties": false
827
+ },
828
+ "coverage": {
829
+ "title": "CoverageMetrics",
830
+ "description": "Coverage information",
831
+ "type": "object",
832
+ "properties": {
833
+ "caseCoverage": {
834
+ "description": "Fraction of cases covered",
835
+ "type": "number"
836
+ },
837
+ "metricCoverage": {
838
+ "description": "Metric availability (metric name -> coverage fraction)",
839
+ "type": "object",
840
+ "additionalProperties": {
841
+ "type": "number"
842
+ },
843
+ "propertyNames": {
844
+ "type": "string"
845
+ }
846
+ },
847
+ "missingCases": {
848
+ "description": "Missing case IDs",
849
+ "type": "array",
850
+ "items": {
851
+ "type": "string"
852
+ }
853
+ }
854
+ },
855
+ "required": [
856
+ "caseCoverage",
857
+ "metricCoverage"
858
+ ],
859
+ "additionalProperties": false
860
+ },
861
+ "group": {
862
+ "type": "object",
863
+ "properties": {
864
+ "caseCount": {
865
+ "description": "Number of unique cases",
866
+ "type": "integer",
867
+ "minimum": -9007199254740991,
868
+ "maximum": 2147483647
869
+ },
870
+ "configHash": {
871
+ "description": "Hash of configuration",
872
+ "type": "string"
873
+ },
874
+ "runCount": {
875
+ "description": "Number of runs in this aggregate",
876
+ "type": "integer",
877
+ "minimum": -9007199254740991,
878
+ "maximum": 2147483647
879
+ }
880
+ },
881
+ "required": [
882
+ "caseCount",
883
+ "runCount"
884
+ ],
885
+ "additionalProperties": false
886
+ },
887
+ "metadata": {
888
+ "description": "Additional metadata",
889
+ "type": "object",
890
+ "additionalProperties": {
891
+ "anyOf": [
892
+ {
893
+ "type": "string"
894
+ },
895
+ {
896
+ "type": "number"
897
+ },
898
+ {
899
+ "type": "boolean"
900
+ },
901
+ {
902
+ "type": "null"
903
+ }
904
+ ]
905
+ },
906
+ "propertyNames": {
907
+ "type": "string"
908
+ }
909
+ },
910
+ "metrics": {
911
+ "description": "Aggregated metrics (metric name -> summary stats)",
912
+ "type": "object",
913
+ "additionalProperties": {
914
+ "title": "SummaryStats",
915
+ "description": "Summary statistics for a numeric metric",
916
+ "type": "object",
917
+ "properties": {
918
+ "confidence95": {
919
+ "description": "95% confidence interval [lower, upper]",
920
+ "type": "array",
921
+ "prefixItems": [
922
+ {
923
+ "type": "number"
924
+ },
925
+ {
926
+ "type": "number"
927
+ }
928
+ ]
929
+ },
930
+ "max": {
931
+ "description": "Maximum value",
932
+ "type": "number"
933
+ },
934
+ "mean": {
935
+ "description": "Arithmetic mean",
936
+ "type": "number"
937
+ },
938
+ "median": {
939
+ "description": "Median (50th percentile)",
940
+ "type": "number"
941
+ },
942
+ "min": {
943
+ "description": "Minimum value",
944
+ "type": "number"
945
+ },
946
+ "n": {
947
+ "description": "Number of observations",
948
+ "type": "integer",
949
+ "minimum": -9007199254740991,
950
+ "maximum": 2147483647
951
+ },
952
+ "p25": {
953
+ "description": "25th percentile",
954
+ "type": "number"
955
+ },
956
+ "p75": {
957
+ "description": "75th percentile",
958
+ "type": "number"
959
+ },
960
+ "std": {
961
+ "description": "Standard deviation (sample)",
962
+ "type": "number"
963
+ },
964
+ "sum": {
965
+ "description": "Sum of all values",
966
+ "type": "number"
967
+ }
968
+ },
969
+ "required": [
970
+ "max",
971
+ "mean",
972
+ "median",
973
+ "min",
974
+ "n"
975
+ ],
976
+ "additionalProperties": false
977
+ },
978
+ "propertyNames": {
979
+ "type": "string"
980
+ }
981
+ },
982
+ "sut": {
983
+ "description": "SUT identifier",
984
+ "type": "string"
985
+ },
986
+ "sutRole": {
987
+ "description": "Role of the SUT in evaluation",
988
+ "type": "string",
989
+ "oneOf": [
990
+ {
991
+ "description": "The system being evaluated; the novel algorithm or implementation",
992
+ "const": "primary"
993
+ },
994
+ {
995
+ "description": "A reference implementation for comparison",
996
+ "const": "baseline"
997
+ },
998
+ {
999
+ "description": "Ground truth provider; defines correct answers",
1000
+ "const": "oracle"
1001
+ }
1002
+ ]
1003
+ }
1004
+ },
1005
+ "required": [
1006
+ "correctness",
1007
+ "group",
1008
+ "metrics",
1009
+ "sut",
1010
+ "sutRole"
1011
+ ],
1012
+ "additionalProperties": false
1013
+ }
1014
+ },
1015
+ "metadata": {
1016
+ "description": "Global metadata",
1017
+ "type": "object",
1018
+ "properties": {
1019
+ "caseClassesIncluded": {
1020
+ "description": "Case classes included",
1021
+ "type": "array",
1022
+ "items": {
1023
+ "type": "string"
1024
+ }
1025
+ },
1026
+ "sutsIncluded": {
1027
+ "description": "SUTs included",
1028
+ "type": "array",
1029
+ "items": {
1030
+ "type": "string"
1031
+ }
1032
+ },
1033
+ "totalCases": {
1034
+ "description": "Total unique cases",
1035
+ "type": "integer",
1036
+ "minimum": -9007199254740991,
1037
+ "maximum": 2147483647
1038
+ },
1039
+ "totalRuns": {
1040
+ "description": "Total runs processed",
1041
+ "type": "integer",
1042
+ "minimum": -9007199254740991,
1043
+ "maximum": 2147483647
1044
+ }
1045
+ },
1046
+ "required": [
1047
+ "sutsIncluded",
1048
+ "totalCases",
1049
+ "totalRuns"
1050
+ ],
1051
+ "additionalProperties": false
1052
+ },
1053
+ "timestamp": {
1054
+ "description": "Generation timestamp",
1055
+ "type": "string"
1056
+ },
1057
+ "version": {
1058
+ "description": "Schema version",
1059
+ "type": "string"
1060
+ }
1061
+ },
1062
+ "required": [
1063
+ "aggregates",
1064
+ "timestamp",
1065
+ "version"
1066
+ ],
1067
+ "additionalProperties": false
1068
+ },
1069
+ "ClaimEvaluationSummary": {
1070
+ "title": "ClaimEvaluationSummary",
1071
+ "description": "Summary of all claim evaluations",
1072
+ "type": "object",
1073
+ "properties": {
1074
+ "evaluations": {
1075
+ "description": "Individual claim evaluations",
1076
+ "type": "array",
1077
+ "items": {
1078
+ "title": "ClaimEvaluation",
1079
+ "description": "Result of evaluating a single claim",
1080
+ "type": "object",
1081
+ "properties": {
1082
+ "claim": {
1083
+ "title": "EvaluationClaimOutput",
1084
+ "description": "The claim being evaluated",
1085
+ "type": "object",
1086
+ "properties": {
1087
+ "baseline": {
1088
+ "description": "Baseline SUT for comparison",
1089
+ "type": "string"
1090
+ },
1091
+ "citation": {
1092
+ "description": "Citation/reference for the claim",
1093
+ "type": "string"
1094
+ },
1095
+ "claimId": {
1096
+ "description": "Unique identifier for this claim",
1097
+ "type": "string"
1098
+ },
1099
+ "description": {
1100
+ "description": "Human-readable description",
1101
+ "type": "string"
1102
+ },
1103
+ "direction": {
1104
+ "description": "Expected direction of difference",
1105
+ "type": "string",
1106
+ "oneOf": [
1107
+ {
1108
+ "description": "Primary SUT metric should be greater than baseline",
1109
+ "const": "greater"
1110
+ },
1111
+ {
1112
+ "description": "Primary SUT metric should be less than baseline",
1113
+ "const": "less"
1114
+ },
1115
+ {
1116
+ "description": "Primary SUT metric should be equal to baseline",
1117
+ "const": "equal"
1118
+ }
1119
+ ]
1120
+ },
1121
+ "metric": {
1122
+ "description": "Metric being compared",
1123
+ "type": "string"
1124
+ },
1125
+ "minEffectSize": {
1126
+ "description": "Minimum effect size",
1127
+ "type": "number"
1128
+ },
1129
+ "scope": {
1130
+ "description": "Scope of claim validity",
1131
+ "type": "string",
1132
+ "oneOf": [
1133
+ {
1134
+ "description": "Claim applies across all cases and conditions",
1135
+ "const": "global"
1136
+ },
1137
+ {
1138
+ "description": "Claim applies within a specific case class",
1139
+ "const": "caseClass"
1140
+ },
1141
+ {
1142
+ "description": "Claim applies within a parameter range",
1143
+ "const": "parameterRange"
1144
+ },
1145
+ {
1146
+ "description": "Claim applies to local structural properties",
1147
+ "const": "localStructure"
1148
+ }
1149
+ ]
1150
+ },
1151
+ "scopeConstraints": {
1152
+ "description": "Scope constraints",
1153
+ "type": "object",
1154
+ "additionalProperties": {
1155
+ "anyOf": [
1156
+ {
1157
+ "anyOf": [
1158
+ {
1159
+ "type": "string"
1160
+ },
1161
+ {
1162
+ "type": "number"
1163
+ },
1164
+ {
1165
+ "type": "boolean"
1166
+ },
1167
+ {
1168
+ "type": "null"
1169
+ }
1170
+ ]
1171
+ },
1172
+ {
1173
+ "type": "array",
1174
+ "items": {
1175
+ "anyOf": [
1176
+ {
1177
+ "type": "string"
1178
+ },
1179
+ {
1180
+ "type": "number"
1181
+ },
1182
+ {
1183
+ "type": "boolean"
1184
+ },
1185
+ {
1186
+ "type": "null"
1187
+ }
1188
+ ]
1189
+ }
1190
+ }
1191
+ ]
1192
+ },
1193
+ "propertyNames": {
1194
+ "type": "string"
1195
+ }
1196
+ },
1197
+ "significanceLevel": {
1198
+ "description": "Required significance level",
1199
+ "type": "number"
1200
+ },
1201
+ "sut": {
1202
+ "description": "Primary SUT being evaluated",
1203
+ "type": "string"
1204
+ },
1205
+ "tags": {
1206
+ "description": "Tags for filtering",
1207
+ "type": "array",
1208
+ "items": {
1209
+ "type": "string"
1210
+ }
1211
+ },
1212
+ "threshold": {
1213
+ "description": "Optional threshold for the difference",
1214
+ "type": "number"
1215
+ }
1216
+ },
1217
+ "required": [
1218
+ "baseline",
1219
+ "claimId",
1220
+ "description",
1221
+ "direction",
1222
+ "metric",
1223
+ "scope",
1224
+ "sut"
1225
+ ],
1226
+ "additionalProperties": false
1227
+ },
1228
+ "evidence": {
1229
+ "title": "ClaimEvidence",
1230
+ "description": "Supporting evidence",
1231
+ "type": "object",
1232
+ "properties": {
1233
+ "baselineValue": {
1234
+ "description": "Baseline SUT metric value",
1235
+ "type": "number"
1236
+ },
1237
+ "delta": {
1238
+ "description": "Absolute delta (primary - baseline)",
1239
+ "type": "number"
1240
+ },
1241
+ "deltaCI95": {
1242
+ "description": "95% confidence interval for delta",
1243
+ "type": "array",
1244
+ "prefixItems": [
1245
+ {
1246
+ "type": "number"
1247
+ },
1248
+ {
1249
+ "type": "number"
1250
+ }
1251
+ ]
1252
+ },
1253
+ "effectSize": {
1254
+ "description": "Effect size (Cohen's d)",
1255
+ "type": "number"
1256
+ },
1257
+ "n": {
1258
+ "description": "Number of observations",
1259
+ "type": "integer",
1260
+ "minimum": -9007199254740991,
1261
+ "maximum": 2147483647
1262
+ },
1263
+ "primaryValue": {
1264
+ "description": "Primary SUT metric value",
1265
+ "type": "number"
1266
+ },
1267
+ "pValue": {
1268
+ "description": "P-value from statistical test",
1269
+ "type": "number"
1270
+ },
1271
+ "ratio": {
1272
+ "description": "Ratio (primary / baseline)",
1273
+ "type": "number"
1274
+ }
1275
+ },
1276
+ "required": [
1277
+ "baselineValue",
1278
+ "delta",
1279
+ "primaryValue",
1280
+ "ratio"
1281
+ ],
1282
+ "additionalProperties": false
1283
+ },
1284
+ "inconclusiveReason": {
1285
+ "description": "Reason for inconclusive status",
1286
+ "type": "string"
1287
+ },
1288
+ "notes": {
1289
+ "description": "Additional notes",
1290
+ "type": "array",
1291
+ "items": {
1292
+ "type": "string"
1293
+ }
1294
+ },
1295
+ "status": {
1296
+ "description": "Status of a claim evaluation",
1297
+ "type": "string",
1298
+ "enum": [
1299
+ "satisfied",
1300
+ "violated",
1301
+ "inconclusive"
1302
+ ]
1303
+ }
1304
+ },
1305
+ "required": [
1306
+ "claim",
1307
+ "evidence",
1308
+ "status"
1309
+ ],
1310
+ "additionalProperties": false
1311
+ }
1312
+ },
1313
+ "summary": {
1314
+ "type": "object",
1315
+ "properties": {
1316
+ "inconclusive": {
1317
+ "description": "Claims inconclusive",
1318
+ "type": "integer",
1319
+ "minimum": -9007199254740991,
1320
+ "maximum": 2147483647
1321
+ },
1322
+ "satisfactionRate": {
1323
+ "description": "Satisfaction rate (satisfied / (satisfied + violated))",
1324
+ "type": "number"
1325
+ },
1326
+ "satisfied": {
1327
+ "description": "Claims satisfied",
1328
+ "type": "integer",
1329
+ "minimum": -9007199254740991,
1330
+ "maximum": 2147483647
1331
+ },
1332
+ "total": {
1333
+ "description": "Total claims evaluated",
1334
+ "type": "integer",
1335
+ "minimum": -9007199254740991,
1336
+ "maximum": 2147483647
1337
+ },
1338
+ "violated": {
1339
+ "description": "Claims violated",
1340
+ "type": "integer",
1341
+ "minimum": -9007199254740991,
1342
+ "maximum": 2147483647
1343
+ }
1344
+ },
1345
+ "required": [
1346
+ "inconclusive",
1347
+ "satisfactionRate",
1348
+ "satisfied",
1349
+ "total",
1350
+ "violated"
1351
+ ],
1352
+ "additionalProperties": false
1353
+ },
1354
+ "timestamp": {
1355
+ "description": "Generation timestamp",
1356
+ "type": "string"
1357
+ },
1358
+ "version": {
1359
+ "description": "Schema version",
1360
+ "type": "string"
1361
+ }
1362
+ },
1363
+ "required": [
1364
+ "evaluations",
1365
+ "summary",
1366
+ "timestamp",
1367
+ "version"
1368
+ ],
1369
+ "additionalProperties": false
1370
+ },
442
1371
  "ClaimsEvaluatorConfig": {
443
1372
  "title": "ClaimsEvaluatorConfig",
444
1373
  "description": "Configuration for the claims evaluator",
445
1374
  "type": "object",
446
1375
  "properties": {
447
- "claims": {
448
- "description": "Claims to evaluate",
1376
+ "claims": {
1377
+ "description": "Claims to evaluate",
1378
+ "type": "array",
1379
+ "items": {
1380
+ "title": "EvaluationClaim",
1381
+ "description": "An evaluation claim (hypothesis)",
1382
+ "type": "object",
1383
+ "properties": {
1384
+ "baseline": {
1385
+ "description": "Baseline SUT for comparison",
1386
+ "type": "string",
1387
+ "minLength": 1
1388
+ },
1389
+ "citation": {
1390
+ "description": "Citation/reference for the claim",
1391
+ "type": "string"
1392
+ },
1393
+ "claimId": {
1394
+ "description": "Unique claim identifier",
1395
+ "type": "string",
1396
+ "minLength": 1
1397
+ },
1398
+ "description": {
1399
+ "description": "Human-readable claim description",
1400
+ "type": "string",
1401
+ "minLength": 1
1402
+ },
1403
+ "direction": {
1404
+ "description": "Expected direction of difference",
1405
+ "type": "string",
1406
+ "oneOf": [
1407
+ {
1408
+ "description": "Primary SUT metric should be greater than baseline",
1409
+ "const": "greater"
1410
+ },
1411
+ {
1412
+ "description": "Primary SUT metric should be less than baseline",
1413
+ "const": "less"
1414
+ },
1415
+ {
1416
+ "description": "Primary SUT metric should be equal to baseline",
1417
+ "const": "equal"
1418
+ }
1419
+ ]
1420
+ },
1421
+ "metric": {
1422
+ "description": "Metric being compared",
1423
+ "type": "string",
1424
+ "minLength": 1
1425
+ },
1426
+ "minEffectSize": {
1427
+ "description": "Minimum effect size (Cohen's d)",
1428
+ "type": "number",
1429
+ "minimum": 0
1430
+ },
1431
+ "scope": {
1432
+ "description": "Scope of claim validity",
1433
+ "type": "string",
1434
+ "oneOf": [
1435
+ {
1436
+ "description": "Claim applies across all cases and conditions",
1437
+ "const": "global"
1438
+ },
1439
+ {
1440
+ "description": "Claim applies within a specific case class",
1441
+ "const": "caseClass"
1442
+ },
1443
+ {
1444
+ "description": "Claim applies within a parameter range",
1445
+ "const": "parameterRange"
1446
+ },
1447
+ {
1448
+ "description": "Claim applies to local structural properties",
1449
+ "const": "localStructure"
1450
+ }
1451
+ ]
1452
+ },
1453
+ "scopeConstraints": {
1454
+ "description": "Scope constraints",
1455
+ "type": "object",
1456
+ "additionalProperties": {
1457
+ "anyOf": [
1458
+ {
1459
+ "anyOf": [
1460
+ {
1461
+ "type": "string"
1462
+ },
1463
+ {
1464
+ "type": "number"
1465
+ },
1466
+ {
1467
+ "type": "boolean"
1468
+ },
1469
+ {
1470
+ "type": "null"
1471
+ }
1472
+ ]
1473
+ },
1474
+ {
1475
+ "type": "array",
1476
+ "items": {
1477
+ "anyOf": [
1478
+ {
1479
+ "type": "string"
1480
+ },
1481
+ {
1482
+ "type": "number"
1483
+ },
1484
+ {
1485
+ "type": "boolean"
1486
+ },
1487
+ {
1488
+ "type": "null"
1489
+ }
1490
+ ]
1491
+ }
1492
+ }
1493
+ ]
1494
+ },
1495
+ "propertyNames": {
1496
+ "type": "string"
1497
+ }
1498
+ },
1499
+ "significanceLevel": {
1500
+ "description": "Required significance level (default: 0.05)",
1501
+ "type": "number",
1502
+ "minimum": 0,
1503
+ "maximum": 1
1504
+ },
1505
+ "sut": {
1506
+ "description": "Primary SUT being evaluated",
1507
+ "type": "string",
1508
+ "minLength": 1
1509
+ },
1510
+ "tags": {
1511
+ "description": "Tags for filtering",
1512
+ "type": "array",
1513
+ "items": {
1514
+ "type": "string"
1515
+ }
1516
+ },
1517
+ "threshold": {
1518
+ "description": "Optional threshold for the difference",
1519
+ "type": "number"
1520
+ }
1521
+ },
1522
+ "required": [
1523
+ "baseline",
1524
+ "claimId",
1525
+ "description",
1526
+ "direction",
1527
+ "metric",
1528
+ "scope",
1529
+ "sut"
1530
+ ],
1531
+ "additionalProperties": false
1532
+ },
1533
+ "minItems": 1
1534
+ },
1535
+ "description": {
1536
+ "description": "Evaluator description",
1537
+ "type": "string"
1538
+ },
1539
+ "minEffectSize": {
1540
+ "description": "Global minimum effect size override",
1541
+ "type": "number",
1542
+ "minimum": 0
1543
+ },
1544
+ "name": {
1545
+ "description": "Human-readable evaluator name",
1546
+ "type": "string"
1547
+ },
1548
+ "options": {
1549
+ "description": "Additional evaluator-specific options",
1550
+ "type": "object",
1551
+ "additionalProperties": {},
1552
+ "propertyNames": {
1553
+ "type": "string"
1554
+ }
1555
+ },
1556
+ "significanceLevel": {
1557
+ "description": "Global significance level override",
1558
+ "type": "number",
1559
+ "minimum": 0,
1560
+ "maximum": 1
1561
+ }
1562
+ },
1563
+ "required": [
1564
+ "claims"
1565
+ ],
1566
+ "additionalProperties": false,
1567
+ "examples": [
1568
+ {
1569
+ "claims": [
1570
+ {
1571
+ "description": "Built-in .length reports greater length than spread operator on emoji strings",
1572
+ "baseline": "spread-length",
1573
+ "claimId": "C001",
1574
+ "direction": "greater",
1575
+ "metric": "length",
1576
+ "scope": "global",
1577
+ "sut": "builtin-length"
1578
+ }
1579
+ ],
1580
+ "significanceLevel": 0.05
1581
+ }
1582
+ ]
1583
+ },
1584
+ "CorrectnessResult": {
1585
+ "title": "CorrectnessResult",
1586
+ "description": "Correctness assessment",
1587
+ "type": "object",
1588
+ "properties": {
1589
+ "expectedExists": {
1590
+ "description": "Whether expected output exists (oracle available)",
1591
+ "type": "boolean"
1592
+ },
1593
+ "failureType": {
1594
+ "description": "Failure classification if applicable",
1595
+ "type": "string",
1596
+ "enum": [
1597
+ "no_output",
1598
+ "invalid_structure",
1599
+ "constraint_violation",
1600
+ "exception",
1601
+ "oracle_mismatch",
1602
+ "timeout"
1603
+ ]
1604
+ },
1605
+ "matchesExpected": {
1606
+ "description": "Whether output matches expected (null if no oracle)",
1607
+ "anyOf": [
1608
+ {
1609
+ "type": "boolean"
1610
+ },
1611
+ {
1612
+ "type": "null"
1613
+ }
1614
+ ]
1615
+ },
1616
+ "notes": {
1617
+ "description": "Human-readable failure notes",
1618
+ "type": "array",
1619
+ "items": {
1620
+ "type": "string"
1621
+ }
1622
+ },
1623
+ "producedOutput": {
1624
+ "description": "Whether the SUT produced any output",
1625
+ "type": "boolean"
1626
+ },
1627
+ "valid": {
1628
+ "description": "Whether output is structurally valid",
1629
+ "type": "boolean"
1630
+ }
1631
+ },
1632
+ "required": [
1633
+ "expectedExists",
1634
+ "matchesExpected",
1635
+ "producedOutput",
1636
+ "valid"
1637
+ ],
1638
+ "additionalProperties": false
1639
+ },
1640
+ "CustomEvaluatorConfig": {
1641
+ "title": "CustomEvaluatorConfig",
1642
+ "description": "Configuration for a custom evaluator",
1643
+ "type": "object",
1644
+ "properties": {
1645
+ "customType": {
1646
+ "description": "Custom evaluator type name",
1647
+ "type": "string",
1648
+ "minLength": 1
1649
+ },
1650
+ "description": {
1651
+ "description": "Evaluator description",
1652
+ "type": "string"
1653
+ },
1654
+ "name": {
1655
+ "description": "Human-readable evaluator name",
1656
+ "type": "string"
1657
+ },
1658
+ "options": {
1659
+ "description": "Additional evaluator-specific options",
1660
+ "type": "object",
1661
+ "additionalProperties": {},
1662
+ "propertyNames": {
1663
+ "type": "string"
1664
+ }
1665
+ }
1666
+ },
1667
+ "required": [
1668
+ "customType"
1669
+ ],
1670
+ "additionalProperties": {}
1671
+ },
1672
+ "EvaluationResult": {
1673
+ "title": "EvaluationResult",
1674
+ "description": "Complete evaluation result",
1675
+ "type": "object",
1676
+ "properties": {
1677
+ "correctness": {
1678
+ "title": "CorrectnessResult",
1679
+ "description": "Correctness assessment",
1680
+ "type": "object",
1681
+ "properties": {
1682
+ "expectedExists": {
1683
+ "description": "Whether expected output exists (oracle available)",
1684
+ "type": "boolean"
1685
+ },
1686
+ "failureType": {
1687
+ "description": "Failure classification if applicable",
1688
+ "type": "string",
1689
+ "enum": [
1690
+ "no_output",
1691
+ "invalid_structure",
1692
+ "constraint_violation",
1693
+ "exception",
1694
+ "oracle_mismatch",
1695
+ "timeout"
1696
+ ]
1697
+ },
1698
+ "matchesExpected": {
1699
+ "description": "Whether output matches expected (null if no oracle)",
1700
+ "anyOf": [
1701
+ {
1702
+ "type": "boolean"
1703
+ },
1704
+ {
1705
+ "type": "null"
1706
+ }
1707
+ ]
1708
+ },
1709
+ "notes": {
1710
+ "description": "Human-readable failure notes",
1711
+ "type": "array",
1712
+ "items": {
1713
+ "type": "string"
1714
+ }
1715
+ },
1716
+ "producedOutput": {
1717
+ "description": "Whether the SUT produced any output",
1718
+ "type": "boolean"
1719
+ },
1720
+ "valid": {
1721
+ "description": "Whether output is structurally valid",
1722
+ "type": "boolean"
1723
+ }
1724
+ },
1725
+ "required": [
1726
+ "expectedExists",
1727
+ "matchesExpected",
1728
+ "producedOutput",
1729
+ "valid"
1730
+ ],
1731
+ "additionalProperties": false
1732
+ },
1733
+ "error": {
1734
+ "description": "Error message if the run failed",
1735
+ "type": "string"
1736
+ },
1737
+ "metrics": {
1738
+ "title": "ResultMetrics",
1739
+ "description": "Numeric metrics",
1740
+ "type": "object",
1741
+ "properties": {
1742
+ "extra": {
1743
+ "description": "Additional metrics (overflow)",
1744
+ "type": "object",
1745
+ "additionalProperties": {
1746
+ "type": "number"
1747
+ },
1748
+ "propertyNames": {
1749
+ "type": "string"
1750
+ }
1751
+ },
1752
+ "numeric": {
1753
+ "description": "Primary numeric metrics",
1754
+ "type": "object",
1755
+ "additionalProperties": {
1756
+ "type": "number"
1757
+ },
1758
+ "propertyNames": {
1759
+ "type": "string"
1760
+ }
1761
+ }
1762
+ },
1763
+ "required": [
1764
+ "numeric"
1765
+ ],
1766
+ "additionalProperties": {
1767
+ "anyOf": [
1768
+ {
1769
+ "type": "number"
1770
+ },
1771
+ {
1772
+ "type": "object",
1773
+ "additionalProperties": {
1774
+ "type": "number"
1775
+ },
1776
+ "propertyNames": {
1777
+ "type": "string"
1778
+ }
1779
+ }
1780
+ ]
1781
+ }
1782
+ },
1783
+ "outputs": {
1784
+ "title": "ResultOutputs",
1785
+ "description": "Output artefacts and summaries",
1786
+ "type": "object",
1787
+ "properties": {
1788
+ "artefacts": {
1789
+ "description": "References to generated artefacts",
1790
+ "type": "array",
1791
+ "items": {
1792
+ "title": "ArtefactReference",
1793
+ "description": "Reference to an external artefact",
1794
+ "type": "object",
1795
+ "properties": {
1796
+ "hash": {
1797
+ "type": "string"
1798
+ },
1799
+ "metadata": {
1800
+ "type": "object",
1801
+ "additionalProperties": {
1802
+ "anyOf": [
1803
+ {
1804
+ "type": "string"
1805
+ },
1806
+ {
1807
+ "type": "number"
1808
+ },
1809
+ {
1810
+ "type": "boolean"
1811
+ },
1812
+ {
1813
+ "type": "null"
1814
+ }
1815
+ ]
1816
+ },
1817
+ "propertyNames": {
1818
+ "type": "string"
1819
+ }
1820
+ },
1821
+ "type": {
1822
+ "type": "string",
1823
+ "enum": [
1824
+ "graph",
1825
+ "path-set",
1826
+ "subgraph",
1827
+ "embedding",
1828
+ "other"
1829
+ ]
1830
+ },
1831
+ "uri": {
1832
+ "type": "string"
1833
+ }
1834
+ },
1835
+ "required": [
1836
+ "type",
1837
+ "uri"
1838
+ ],
1839
+ "additionalProperties": false
1840
+ }
1841
+ },
1842
+ "extra": {
1843
+ "description": "Additional untyped outputs",
1844
+ "type": "object",
1845
+ "additionalProperties": {},
1846
+ "propertyNames": {
1847
+ "type": "string"
1848
+ }
1849
+ },
1850
+ "labels": {
1851
+ "description": "Classification labels",
1852
+ "type": "object",
1853
+ "additionalProperties": {
1854
+ "anyOf": [
1855
+ {
1856
+ "type": "string"
1857
+ },
1858
+ {
1859
+ "type": "number"
1860
+ },
1861
+ {
1862
+ "type": "boolean"
1863
+ },
1864
+ {
1865
+ "type": "null"
1866
+ }
1867
+ ]
1868
+ },
1869
+ "propertyNames": {
1870
+ "type": "string"
1871
+ }
1872
+ },
1873
+ "ranking": {
1874
+ "description": "Ranking results",
1875
+ "type": "array",
1876
+ "items": {
1877
+ "title": "RankedItem",
1878
+ "description": "A ranked item for ranking tasks",
1879
+ "type": "object",
1880
+ "properties": {
1881
+ "itemId": {
1882
+ "description": "Item identifier",
1883
+ "type": "string"
1884
+ },
1885
+ "metadata": {
1886
+ "description": "Optional additional metadata",
1887
+ "type": "object",
1888
+ "additionalProperties": {
1889
+ "anyOf": [
1890
+ {
1891
+ "type": "string"
1892
+ },
1893
+ {
1894
+ "type": "number"
1895
+ },
1896
+ {
1897
+ "type": "boolean"
1898
+ },
1899
+ {
1900
+ "type": "null"
1901
+ }
1902
+ ]
1903
+ },
1904
+ "propertyNames": {
1905
+ "type": "string"
1906
+ }
1907
+ },
1908
+ "score": {
1909
+ "description": "Score or rank value",
1910
+ "type": "number"
1911
+ }
1912
+ },
1913
+ "required": [
1914
+ "itemId",
1915
+ "score"
1916
+ ],
1917
+ "additionalProperties": false
1918
+ }
1919
+ },
1920
+ "summary": {
1921
+ "description": "Scalar summary values",
1922
+ "type": "object",
1923
+ "additionalProperties": {
1924
+ "anyOf": [
1925
+ {
1926
+ "anyOf": [
1927
+ {
1928
+ "type": "string"
1929
+ },
1930
+ {
1931
+ "type": "number"
1932
+ },
1933
+ {
1934
+ "type": "boolean"
1935
+ },
1936
+ {
1937
+ "type": "null"
1938
+ }
1939
+ ]
1940
+ },
1941
+ {
1942
+ "type": "array",
1943
+ "items": {
1944
+ "anyOf": [
1945
+ {
1946
+ "type": "string"
1947
+ },
1948
+ {
1949
+ "type": "number"
1950
+ },
1951
+ {
1952
+ "type": "boolean"
1953
+ },
1954
+ {
1955
+ "type": "null"
1956
+ }
1957
+ ]
1958
+ }
1959
+ }
1960
+ ]
1961
+ },
1962
+ "propertyNames": {
1963
+ "type": "string"
1964
+ }
1965
+ }
1966
+ },
1967
+ "additionalProperties": false
1968
+ },
1969
+ "provenance": {
1970
+ "title": "Provenance",
1971
+ "description": "Provenance for reproducibility",
1972
+ "type": "object",
1973
+ "properties": {
1974
+ "dependencyLockHash": {
1975
+ "description": "Hash of package-lock.json for dependency pinning",
1976
+ "type": "string"
1977
+ },
1978
+ "dirty": {
1979
+ "description": "Whether working directory had uncommitted changes",
1980
+ "type": "boolean"
1981
+ },
1982
+ "executionTimeMs": {
1983
+ "description": "Wall-clock execution time in milliseconds",
1984
+ "type": "number"
1985
+ },
1986
+ "finalMemoryBytes": {
1987
+ "description": "Memory usage at completion (bytes)",
1988
+ "type": "number"
1989
+ },
1990
+ "gitCommit": {
1991
+ "description": "Git commit hash",
1992
+ "type": "string"
1993
+ },
1994
+ "parentRunIds": {
1995
+ "description": "Parent run IDs (for derived results)",
1996
+ "type": "array",
1997
+ "items": {
1998
+ "type": "string"
1999
+ }
2000
+ },
2001
+ "peakMemoryBytes": {
2002
+ "description": "Peak memory usage during execution (bytes)",
2003
+ "type": "number"
2004
+ },
2005
+ "runtime": {
2006
+ "description": "Execution environment (platform and arch required; additional fields are language-specific)",
2007
+ "type": "object",
2008
+ "properties": {
2009
+ "arch": {
2010
+ "description": "CPU architecture",
2011
+ "type": "string"
2012
+ },
2013
+ "platform": {
2014
+ "description": "Operating system platform",
2015
+ "type": "string"
2016
+ }
2017
+ },
2018
+ "required": [
2019
+ "arch",
2020
+ "platform"
2021
+ ],
2022
+ "additionalProperties": {
2023
+ "type": "string"
2024
+ }
2025
+ },
2026
+ "timestamp": {
2027
+ "description": "Execution timestamp",
2028
+ "type": "string"
2029
+ }
2030
+ },
2031
+ "required": [
2032
+ "runtime"
2033
+ ],
2034
+ "additionalProperties": false
2035
+ },
2036
+ "run": {
2037
+ "title": "RunContext",
2038
+ "description": "Run identity and context",
2039
+ "type": "object",
2040
+ "properties": {
2041
+ "caseClass": {
2042
+ "description": "Case class for grouping",
2043
+ "type": "string"
2044
+ },
2045
+ "caseId": {
2046
+ "description": "Case identifier",
2047
+ "type": "string"
2048
+ },
2049
+ "config": {
2050
+ "description": "Configuration overrides for this run",
2051
+ "type": "object",
2052
+ "additionalProperties": {
2053
+ "anyOf": [
2054
+ {
2055
+ "type": "string"
2056
+ },
2057
+ {
2058
+ "type": "number"
2059
+ },
2060
+ {
2061
+ "type": "boolean"
2062
+ },
2063
+ {
2064
+ "type": "null"
2065
+ }
2066
+ ]
2067
+ },
2068
+ "propertyNames": {
2069
+ "type": "string"
2070
+ }
2071
+ },
2072
+ "repetition": {
2073
+ "description": "Repetition number for statistical runs",
2074
+ "type": "integer",
2075
+ "minimum": -9007199254740991,
2076
+ "maximum": 2147483647
2077
+ },
2078
+ "runId": {
2079
+ "description": "Deterministic run ID (hash of inputs)",
2080
+ "type": "string"
2081
+ },
2082
+ "seed": {
2083
+ "description": "Random seed if applicable",
2084
+ "type": "number"
2085
+ },
2086
+ "sut": {
2087
+ "description": "SUT identifier",
2088
+ "type": "string"
2089
+ },
2090
+ "sutRole": {
2091
+ "description": "Role of the SUT in evaluation",
2092
+ "type": "string",
2093
+ "oneOf": [
2094
+ {
2095
+ "description": "The system being evaluated; the novel algorithm or implementation",
2096
+ "const": "primary"
2097
+ },
2098
+ {
2099
+ "description": "A reference implementation for comparison",
2100
+ "const": "baseline"
2101
+ },
2102
+ {
2103
+ "description": "Ground truth provider; defines correct answers",
2104
+ "const": "oracle"
2105
+ }
2106
+ ]
2107
+ },
2108
+ "sutVersion": {
2109
+ "description": "SUT version for reproducibility",
2110
+ "type": "string"
2111
+ }
2112
+ },
2113
+ "required": [
2114
+ "caseId",
2115
+ "runId",
2116
+ "sut",
2117
+ "sutRole"
2118
+ ],
2119
+ "additionalProperties": false
2120
+ }
2121
+ },
2122
+ "required": [
2123
+ "correctness",
2124
+ "metrics",
2125
+ "outputs",
2126
+ "provenance",
2127
+ "run"
2128
+ ],
2129
+ "additionalProperties": false
2130
+ },
2131
+ "ExploratoryEvaluationSummary": {
2132
+ "title": "ExploratoryEvaluationSummary",
2133
+ "description": "Summary of exploratory evaluation results",
2134
+ "type": "object",
2135
+ "properties": {
2136
+ "caseClassEffects": {
2137
+ "description": "Case-class effects",
2138
+ "type": "array",
2139
+ "items": {
2140
+ "title": "CaseClassEffect",
2141
+ "description": "Effect of a case class on SUT performance",
2142
+ "type": "object",
2143
+ "properties": {
2144
+ "caseClass": {
2145
+ "type": "string"
2146
+ },
2147
+ "deviationFromMean": {
2148
+ "type": "number"
2149
+ },
2150
+ "metric": {
2151
+ "type": "string"
2152
+ },
2153
+ "percentageDeviation": {
2154
+ "type": "number"
2155
+ },
2156
+ "significant": {
2157
+ "type": "boolean"
2158
+ },
2159
+ "sut": {
2160
+ "type": "string"
2161
+ }
2162
+ },
2163
+ "required": [
2164
+ "caseClass",
2165
+ "deviationFromMean",
2166
+ "metric",
2167
+ "significant",
2168
+ "sut"
2169
+ ],
2170
+ "additionalProperties": false
2171
+ }
2172
+ },
2173
+ "metricCorrelations": {
2174
+ "description": "Metric correlations",
2175
+ "type": "array",
2176
+ "items": {
2177
+ "title": "MetricCorrelation",
2178
+ "description": "Correlation between two metrics",
2179
+ "type": "object",
2180
+ "properties": {
2181
+ "interpretation": {
2182
+ "type": "string"
2183
+ },
2184
+ "metricA": {
2185
+ "type": "string"
2186
+ },
2187
+ "metricB": {
2188
+ "type": "string"
2189
+ },
2190
+ "pearsonR": {
2191
+ "type": "number"
2192
+ },
2193
+ "spearmanRho": {
2194
+ "type": "number"
2195
+ }
2196
+ },
2197
+ "required": [
2198
+ "interpretation",
2199
+ "metricA",
2200
+ "metricB",
2201
+ "pearsonR"
2202
+ ],
2203
+ "additionalProperties": false
2204
+ }
2205
+ },
2206
+ "pairwiseComparisons": {
2207
+ "description": "Pairwise comparisons between SUTs",
2208
+ "type": "array",
2209
+ "items": {
2210
+ "title": "PairwiseComparison",
2211
+ "description": "Pairwise comparison between two SUTs",
2212
+ "type": "object",
2213
+ "properties": {
2214
+ "delta": {
2215
+ "type": "number"
2216
+ },
2217
+ "effectSize": {
2218
+ "type": "number"
2219
+ },
2220
+ "metric": {
2221
+ "type": "string"
2222
+ },
2223
+ "pValue": {
2224
+ "type": "number"
2225
+ },
2226
+ "ratio": {
2227
+ "type": "number"
2228
+ },
2229
+ "significant": {
2230
+ "type": "boolean"
2231
+ },
2232
+ "sutA": {
2233
+ "type": "string"
2234
+ },
2235
+ "sutB": {
2236
+ "type": "string"
2237
+ }
2238
+ },
2239
+ "required": [
2240
+ "delta",
2241
+ "metric",
2242
+ "ratio",
2243
+ "significant",
2244
+ "sutA",
2245
+ "sutB"
2246
+ ],
2247
+ "additionalProperties": false
2248
+ }
2249
+ },
2250
+ "rankings": {
2251
+ "description": "SUT rankings per metric",
2252
+ "type": "object",
2253
+ "additionalProperties": {
2254
+ "type": "array",
2255
+ "items": {
2256
+ "title": "SutMetricRanking",
2257
+ "description": "Ranking of a SUT for a specific metric",
2258
+ "type": "object",
2259
+ "properties": {
2260
+ "mean": {
2261
+ "type": "number"
2262
+ },
2263
+ "median": {
2264
+ "type": "number"
2265
+ },
2266
+ "n": {
2267
+ "type": "integer",
2268
+ "minimum": -9007199254740991,
2269
+ "maximum": 2147483647
2270
+ },
2271
+ "rank": {
2272
+ "type": "integer",
2273
+ "minimum": -9007199254740991,
2274
+ "maximum": 2147483647
2275
+ },
2276
+ "std": {
2277
+ "type": "number"
2278
+ },
2279
+ "sut": {
2280
+ "type": "string"
2281
+ }
2282
+ },
2283
+ "required": [
2284
+ "mean",
2285
+ "median",
2286
+ "n",
2287
+ "rank",
2288
+ "sut"
2289
+ ],
2290
+ "additionalProperties": false
2291
+ }
2292
+ },
2293
+ "propertyNames": {
2294
+ "type": "string"
2295
+ }
2296
+ },
2297
+ "summary": {
2298
+ "type": "object",
2299
+ "properties": {
2300
+ "bestSutPerMetric": {
2301
+ "type": "object",
2302
+ "additionalProperties": {
2303
+ "type": "string"
2304
+ },
2305
+ "propertyNames": {
2306
+ "type": "string"
2307
+ }
2308
+ },
2309
+ "caseClassesAnalyzed": {
2310
+ "type": "integer",
2311
+ "minimum": -9007199254740991,
2312
+ "maximum": 2147483647
2313
+ },
2314
+ "metricsAnalyzed": {
2315
+ "type": "integer",
2316
+ "minimum": -9007199254740991,
2317
+ "maximum": 2147483647
2318
+ },
2319
+ "pairwiseComparisonsCount": {
2320
+ "type": "integer",
2321
+ "minimum": -9007199254740991,
2322
+ "maximum": 2147483647
2323
+ },
2324
+ "significantDifferences": {
2325
+ "type": "integer",
2326
+ "minimum": -9007199254740991,
2327
+ "maximum": 2147483647
2328
+ },
2329
+ "sutsAnalyzed": {
2330
+ "type": "integer",
2331
+ "minimum": -9007199254740991,
2332
+ "maximum": 2147483647
2333
+ }
2334
+ },
2335
+ "required": [
2336
+ "bestSutPerMetric",
2337
+ "metricsAnalyzed",
2338
+ "pairwiseComparisonsCount",
2339
+ "significantDifferences",
2340
+ "sutsAnalyzed"
2341
+ ],
2342
+ "additionalProperties": false
2343
+ },
2344
+ "timestamp": {
2345
+ "description": "Generation timestamp",
2346
+ "type": "string"
2347
+ },
2348
+ "version": {
2349
+ "description": "Schema version",
2350
+ "type": "string"
2351
+ }
2352
+ },
2353
+ "required": [
2354
+ "pairwiseComparisons",
2355
+ "rankings",
2356
+ "summary",
2357
+ "timestamp",
2358
+ "version"
2359
+ ],
2360
+ "additionalProperties": false
2361
+ },
2362
+ "ExploratoryEvaluatorConfig": {
2363
+ "title": "ExploratoryEvaluatorConfig",
2364
+ "description": "Configuration for the exploratory evaluator",
2365
+ "type": "object",
2366
+ "properties": {
2367
+ "analyzeCaseClassEffects": {
2368
+ "description": "Whether to analyze case-class effects",
2369
+ "type": "boolean"
2370
+ },
2371
+ "computeCorrelations": {
2372
+ "description": "Whether to compute metric correlations",
2373
+ "type": "boolean"
2374
+ },
2375
+ "description": {
2376
+ "description": "Evaluator description",
2377
+ "type": "string"
2378
+ },
2379
+ "metricDirections": {
2380
+ "description": "Metric directions for ranking interpretation",
2381
+ "type": "object",
2382
+ "additionalProperties": {
2383
+ "description": "Metric direction for ranking",
2384
+ "type": "string",
2385
+ "oneOf": [
2386
+ {
2387
+ "description": "Higher values indicate better performance",
2388
+ "const": "higher-better"
2389
+ },
2390
+ {
2391
+ "description": "Lower values indicate better performance",
2392
+ "const": "lower-better"
2393
+ }
2394
+ ]
2395
+ },
2396
+ "propertyNames": {
2397
+ "type": "string"
2398
+ }
2399
+ },
2400
+ "metrics": {
2401
+ "description": "Metrics to analyze (all if not specified)",
2402
+ "type": "array",
2403
+ "items": {
2404
+ "type": "string",
2405
+ "minLength": 1
2406
+ }
2407
+ },
2408
+ "minEffectSize": {
2409
+ "description": "Minimum effect size to consider meaningful",
2410
+ "type": "number",
2411
+ "minimum": 0
2412
+ },
2413
+ "name": {
2414
+ "description": "Human-readable evaluator name",
2415
+ "type": "string"
2416
+ },
2417
+ "options": {
2418
+ "description": "Additional evaluator-specific options",
2419
+ "type": "object",
2420
+ "additionalProperties": {},
2421
+ "propertyNames": {
2422
+ "type": "string"
2423
+ }
2424
+ },
2425
+ "significanceLevel": {
2426
+ "description": "Significance level for statistical tests (default: 0.05)",
2427
+ "type": "number",
2428
+ "minimum": 0,
2429
+ "maximum": 1
2430
+ },
2431
+ "suts": {
2432
+ "description": "SUTs to include (all if not specified)",
2433
+ "type": "array",
2434
+ "items": {
2435
+ "type": "string",
2436
+ "minLength": 1
2437
+ }
2438
+ }
2439
+ },
2440
+ "additionalProperties": false,
2441
+ "examples": [
2442
+ {
2443
+ "analyzeCaseClassEffects": true,
2444
+ "computeCorrelations": false,
2445
+ "metricDirections": {
2446
+ "length": "higher-better"
2447
+ },
2448
+ "metrics": [
2449
+ "length"
2450
+ ]
2451
+ }
2452
+ ]
2453
+ },
2454
+ "MetricsEvaluationSummary": {
2455
+ "title": "MetricsEvaluationSummary",
2456
+ "description": "Summary of metrics evaluation",
2457
+ "type": "object",
2458
+ "properties": {
2459
+ "results": {
2460
+ "description": "Individual criterion results",
2461
+ "type": "array",
2462
+ "items": {
2463
+ "title": "MetricsCriterionResult",
2464
+ "description": "Result of evaluating a single metrics criterion",
2465
+ "type": "object",
2466
+ "properties": {
2467
+ "criterion": {
2468
+ "title": "MetricsCriterionOutput",
2469
+ "description": "A metrics evaluation criterion",
2470
+ "type": "object",
2471
+ "properties": {
2472
+ "baseline": {
2473
+ "type": "object",
2474
+ "properties": {
2475
+ "operator": {
2476
+ "description": "Comparison operator",
2477
+ "type": "string",
2478
+ "oneOf": [
2479
+ {
2480
+ "description": "Greater than",
2481
+ "const": "gt"
2482
+ },
2483
+ {
2484
+ "description": "Greater than or equal to",
2485
+ "const": "gte"
2486
+ },
2487
+ {
2488
+ "description": "Less than",
2489
+ "const": "lt"
2490
+ },
2491
+ {
2492
+ "description": "Less than or equal to",
2493
+ "const": "lte"
2494
+ },
2495
+ {
2496
+ "description": "Equal to",
2497
+ "const": "eq"
2498
+ }
2499
+ ]
2500
+ },
2501
+ "sut": {
2502
+ "type": "string"
2503
+ }
2504
+ },
2505
+ "required": [
2506
+ "operator",
2507
+ "sut"
2508
+ ],
2509
+ "additionalProperties": false
2510
+ },
2511
+ "criterionId": {
2512
+ "description": "Unique identifier",
2513
+ "type": "string"
2514
+ },
2515
+ "description": {
2516
+ "description": "Human-readable description",
2517
+ "type": "string"
2518
+ },
2519
+ "metric": {
2520
+ "description": "Metric to evaluate",
2521
+ "type": "string"
2522
+ },
2523
+ "scopeConstraints": {
2524
+ "type": "object",
2525
+ "properties": {
2526
+ "caseClass": {
2527
+ "anyOf": [
2528
+ {
2529
+ "type": "string"
2530
+ },
2531
+ {
2532
+ "type": "array",
2533
+ "items": {
2534
+ "type": "string"
2535
+ }
2536
+ }
2537
+ ]
2538
+ }
2539
+ },
2540
+ "additionalProperties": false
2541
+ },
2542
+ "sut": {
2543
+ "description": "SUT to evaluate (or \"*\" for all SUTs)",
2544
+ "type": "string"
2545
+ },
2546
+ "tags": {
2547
+ "type": "array",
2548
+ "items": {
2549
+ "type": "string"
2550
+ }
2551
+ },
2552
+ "targetRange": {
2553
+ "type": "object",
2554
+ "properties": {
2555
+ "max": {
2556
+ "type": "number"
2557
+ },
2558
+ "maxInclusive": {
2559
+ "type": "boolean"
2560
+ },
2561
+ "min": {
2562
+ "type": "number"
2563
+ },
2564
+ "minInclusive": {
2565
+ "type": "boolean"
2566
+ }
2567
+ },
2568
+ "additionalProperties": false
2569
+ },
2570
+ "threshold": {
2571
+ "type": "object",
2572
+ "properties": {
2573
+ "operator": {
2574
+ "description": "Comparison operator",
2575
+ "type": "string",
2576
+ "oneOf": [
2577
+ {
2578
+ "description": "Greater than",
2579
+ "const": "gt"
2580
+ },
2581
+ {
2582
+ "description": "Greater than or equal to",
2583
+ "const": "gte"
2584
+ },
2585
+ {
2586
+ "description": "Less than",
2587
+ "const": "lt"
2588
+ },
2589
+ {
2590
+ "description": "Less than or equal to",
2591
+ "const": "lte"
2592
+ },
2593
+ {
2594
+ "description": "Equal to",
2595
+ "const": "eq"
2596
+ }
2597
+ ]
2598
+ },
2599
+ "value": {
2600
+ "type": "number"
2601
+ }
2602
+ },
2603
+ "required": [
2604
+ "operator",
2605
+ "value"
2606
+ ],
2607
+ "additionalProperties": false
2608
+ },
2609
+ "type": {
2610
+ "description": "Type of metrics criterion",
2611
+ "type": "string",
2612
+ "oneOf": [
2613
+ {
2614
+ "description": "Compare a metric against a fixed threshold value",
2615
+ "const": "threshold"
2616
+ },
2617
+ {
2618
+ "description": "Compare a metric against a baseline SUT",
2619
+ "const": "baseline"
2620
+ },
2621
+ {
2622
+ "description": "Check that a metric falls within a target range",
2623
+ "const": "target-range"
2624
+ }
2625
+ ]
2626
+ }
2627
+ },
2628
+ "required": [
2629
+ "criterionId",
2630
+ "description",
2631
+ "metric",
2632
+ "sut",
2633
+ "type"
2634
+ ],
2635
+ "additionalProperties": false
2636
+ },
2637
+ "expected": {
2638
+ "type": "object",
2639
+ "properties": {
2640
+ "baselineValue": {
2641
+ "type": "number"
2642
+ },
2643
+ "targetRange": {
2644
+ "type": "object",
2645
+ "properties": {
2646
+ "max": {
2647
+ "type": "number"
2648
+ },
2649
+ "min": {
2650
+ "type": "number"
2651
+ }
2652
+ },
2653
+ "additionalProperties": false
2654
+ },
2655
+ "threshold": {
2656
+ "type": "number"
2657
+ },
2658
+ "type": {
2659
+ "description": "Type of metrics criterion",
2660
+ "type": "string",
2661
+ "oneOf": [
2662
+ {
2663
+ "description": "Compare a metric against a fixed threshold value",
2664
+ "const": "threshold"
2665
+ },
2666
+ {
2667
+ "description": "Compare a metric against a baseline SUT",
2668
+ "const": "baseline"
2669
+ },
2670
+ {
2671
+ "description": "Check that a metric falls within a target range",
2672
+ "const": "target-range"
2673
+ }
2674
+ ]
2675
+ }
2676
+ },
2677
+ "required": [
2678
+ "type"
2679
+ ],
2680
+ "additionalProperties": false
2681
+ },
2682
+ "inconclusiveReason": {
2683
+ "type": "string"
2684
+ },
2685
+ "observed": {
2686
+ "type": "array",
2687
+ "items": {
2688
+ "type": "object",
2689
+ "properties": {
2690
+ "sut": {
2691
+ "type": "string"
2692
+ },
2693
+ "value": {
2694
+ "type": "number"
2695
+ }
2696
+ },
2697
+ "required": [
2698
+ "sut",
2699
+ "value"
2700
+ ],
2701
+ "additionalProperties": false
2702
+ }
2703
+ },
2704
+ "status": {
2705
+ "type": "string",
2706
+ "enum": [
2707
+ "pass",
2708
+ "fail",
2709
+ "inconclusive"
2710
+ ]
2711
+ }
2712
+ },
2713
+ "required": [
2714
+ "criterion",
2715
+ "expected",
2716
+ "observed",
2717
+ "status"
2718
+ ],
2719
+ "additionalProperties": false
2720
+ }
2721
+ },
2722
+ "summary": {
2723
+ "type": "object",
2724
+ "properties": {
2725
+ "failed": {
2726
+ "description": "Criteria failed",
2727
+ "type": "integer",
2728
+ "minimum": -9007199254740991,
2729
+ "maximum": 2147483647
2730
+ },
2731
+ "inconclusive": {
2732
+ "description": "Criteria inconclusive",
2733
+ "type": "integer",
2734
+ "minimum": -9007199254740991,
2735
+ "maximum": 2147483647
2736
+ },
2737
+ "passed": {
2738
+ "description": "Criteria passed",
2739
+ "type": "integer",
2740
+ "minimum": -9007199254740991,
2741
+ "maximum": 2147483647
2742
+ },
2743
+ "passRate": {
2744
+ "description": "Overall pass rate",
2745
+ "type": "number"
2746
+ },
2747
+ "passRateBySut": {
2748
+ "description": "Pass rate by SUT",
2749
+ "type": "object",
2750
+ "additionalProperties": {
2751
+ "type": "number"
2752
+ },
2753
+ "propertyNames": {
2754
+ "type": "string"
2755
+ }
2756
+ },
2757
+ "total": {
2758
+ "description": "Total criteria evaluated",
2759
+ "type": "integer",
2760
+ "minimum": -9007199254740991,
2761
+ "maximum": 2147483647
2762
+ }
2763
+ },
2764
+ "required": [
2765
+ "failed",
2766
+ "inconclusive",
2767
+ "passed",
2768
+ "passRate",
2769
+ "passRateBySut",
2770
+ "total"
2771
+ ],
2772
+ "additionalProperties": false
2773
+ },
2774
+ "timestamp": {
2775
+ "description": "Generation timestamp",
2776
+ "type": "string"
2777
+ },
2778
+ "version": {
2779
+ "description": "Schema version",
2780
+ "type": "string"
2781
+ }
2782
+ },
2783
+ "required": [
2784
+ "results",
2785
+ "summary",
2786
+ "timestamp",
2787
+ "version"
2788
+ ],
2789
+ "additionalProperties": false
2790
+ },
2791
+ "MetricsEvaluatorConfig": {
2792
+ "title": "MetricsEvaluatorConfig",
2793
+ "description": "Configuration for the metrics evaluator",
2794
+ "type": "object",
2795
+ "properties": {
2796
+ "criteria": {
2797
+ "description": "Criteria to evaluate",
449
2798
  "type": "array",
450
2799
  "items": {
451
- "title": "EvaluationClaim",
452
- "description": "An evaluation claim (hypothesis)",
2800
+ "title": "MetricsCriterion",
2801
+ "description": "A metrics evaluation criterion",
453
2802
  "type": "object",
2803
+ "allOf": [
2804
+ {
2805
+ "if": {
2806
+ "properties": {
2807
+ "type": {
2808
+ "const": "threshold"
2809
+ }
2810
+ },
2811
+ "required": [
2812
+ "type"
2813
+ ]
2814
+ },
2815
+ "then": {
2816
+ "required": [
2817
+ "threshold"
2818
+ ]
2819
+ }
2820
+ },
2821
+ {
2822
+ "if": {
2823
+ "properties": {
2824
+ "type": {
2825
+ "const": "baseline"
2826
+ }
2827
+ },
2828
+ "required": [
2829
+ "type"
2830
+ ]
2831
+ },
2832
+ "then": {
2833
+ "required": [
2834
+ "baseline"
2835
+ ]
2836
+ }
2837
+ },
2838
+ {
2839
+ "if": {
2840
+ "properties": {
2841
+ "type": {
2842
+ "const": "target-range"
2843
+ }
2844
+ },
2845
+ "required": [
2846
+ "type"
2847
+ ]
2848
+ },
2849
+ "then": {
2850
+ "required": [
2851
+ "targetRange"
2852
+ ]
2853
+ }
2854
+ }
2855
+ ],
454
2856
  "properties": {
455
2857
  "baseline": {
456
- "description": "Baseline SUT for comparison",
457
- "type": "string",
458
- "minLength": 1
459
- },
460
- "citation": {
461
- "description": "Citation/reference for the claim",
462
- "type": "string"
2858
+ "description": "Baseline comparison (required when type is baseline)",
2859
+ "type": "object",
2860
+ "properties": {
2861
+ "operator": {
2862
+ "description": "Comparison operator",
2863
+ "type": "string",
2864
+ "oneOf": [
2865
+ {
2866
+ "description": "Greater than",
2867
+ "const": "gt"
2868
+ },
2869
+ {
2870
+ "description": "Greater than or equal to",
2871
+ "const": "gte"
2872
+ },
2873
+ {
2874
+ "description": "Less than",
2875
+ "const": "lt"
2876
+ },
2877
+ {
2878
+ "description": "Less than or equal to",
2879
+ "const": "lte"
2880
+ },
2881
+ {
2882
+ "description": "Equal to",
2883
+ "const": "eq"
2884
+ }
2885
+ ]
2886
+ },
2887
+ "sut": {
2888
+ "description": "Baseline SUT identifier",
2889
+ "type": "string",
2890
+ "minLength": 1
2891
+ }
2892
+ },
2893
+ "required": [
2894
+ "operator",
2895
+ "sut"
2896
+ ],
2897
+ "additionalProperties": false
463
2898
  },
464
- "claimId": {
465
- "description": "Unique claim identifier",
2899
+ "criterionId": {
2900
+ "description": "Unique criterion identifier",
466
2901
  "type": "string",
467
2902
  "minLength": 1
468
2903
  },
469
2904
  "description": {
470
- "description": "Human-readable claim description",
2905
+ "description": "Human-readable description",
471
2906
  "type": "string",
472
2907
  "minLength": 1
473
2908
  },
474
- "direction": {
475
- "description": "Expected direction of difference",
2909
+ "metric": {
2910
+ "description": "Metric to evaluate",
476
2911
  "type": "string",
477
- "oneOf": [
478
- {
479
- "description": "Primary SUT metric should be greater than baseline",
480
- "const": "greater"
481
- },
482
- {
483
- "description": "Primary SUT metric should be less than baseline",
484
- "const": "less"
485
- },
486
- {
487
- "description": "Primary SUT metric should be equal to baseline",
488
- "const": "equal"
2912
+ "minLength": 1
2913
+ },
2914
+ "scopeConstraints": {
2915
+ "description": "Optional scope constraints",
2916
+ "type": "object",
2917
+ "properties": {
2918
+ "caseClass": {
2919
+ "description": "Case class filter",
2920
+ "anyOf": [
2921
+ {
2922
+ "type": "string"
2923
+ },
2924
+ {
2925
+ "type": "array",
2926
+ "items": {
2927
+ "type": "string"
2928
+ }
2929
+ }
2930
+ ]
489
2931
  }
490
- ]
2932
+ },
2933
+ "additionalProperties": false
491
2934
  },
492
- "metric": {
493
- "description": "Metric being compared",
2935
+ "sut": {
2936
+ "description": "SUT to evaluate (or \"*\" for all SUTs)",
494
2937
  "type": "string",
495
2938
  "minLength": 1
496
2939
  },
497
- "minEffectSize": {
498
- "description": "Minimum effect size (Cohen's d)",
499
- "type": "number",
500
- "minimum": 0
2940
+ "tags": {
2941
+ "description": "Tags for filtering",
2942
+ "type": "array",
2943
+ "items": {
2944
+ "type": "string"
2945
+ }
501
2946
  },
502
- "scope": {
503
- "description": "Scope of claim validity",
504
- "type": "string",
505
- "oneOf": [
506
- {
507
- "description": "Claim applies across all cases and conditions",
508
- "const": "global"
2947
+ "targetRange": {
2948
+ "description": "Target range (required when type is target-range)",
2949
+ "type": "object",
2950
+ "properties": {
2951
+ "max": {
2952
+ "description": "Maximum value",
2953
+ "type": "number"
509
2954
  },
510
- {
511
- "description": "Claim applies within a specific case class",
512
- "const": "caseClass"
2955
+ "maxInclusive": {
2956
+ "description": "Whether max is inclusive",
2957
+ "type": "boolean"
513
2958
  },
514
- {
515
- "description": "Claim applies within a parameter range",
516
- "const": "parameterRange"
2959
+ "min": {
2960
+ "description": "Minimum value",
2961
+ "type": "number"
517
2962
  },
518
- {
519
- "description": "Claim applies to local structural properties",
520
- "const": "localStructure"
2963
+ "minInclusive": {
2964
+ "description": "Whether min is inclusive",
2965
+ "type": "boolean"
521
2966
  }
522
- ]
2967
+ },
2968
+ "additionalProperties": false
523
2969
  },
524
- "scopeConstraints": {
525
- "description": "Scope constraints",
2970
+ "threshold": {
2971
+ "description": "Threshold operator and value (required when type is threshold)",
526
2972
  "type": "object",
527
- "additionalProperties": {
528
- "anyOf": [
529
- {
530
- "anyOf": [
531
- {
532
- "type": "string"
533
- },
534
- {
535
- "type": "number"
536
- },
537
- {
538
- "type": "boolean"
539
- },
540
- {
541
- "type": "null"
542
- }
543
- ]
544
- },
545
- {
546
- "type": "array",
547
- "items": {
548
- "anyOf": [
549
- {
550
- "type": "string"
551
- },
552
- {
553
- "type": "number"
554
- },
555
- {
556
- "type": "boolean"
557
- },
558
- {
559
- "type": "null"
560
- }
561
- ]
2973
+ "properties": {
2974
+ "operator": {
2975
+ "description": "Comparison operator",
2976
+ "type": "string",
2977
+ "oneOf": [
2978
+ {
2979
+ "description": "Greater than",
2980
+ "const": "gt"
2981
+ },
2982
+ {
2983
+ "description": "Greater than or equal to",
2984
+ "const": "gte"
2985
+ },
2986
+ {
2987
+ "description": "Less than",
2988
+ "const": "lt"
2989
+ },
2990
+ {
2991
+ "description": "Less than or equal to",
2992
+ "const": "lte"
2993
+ },
2994
+ {
2995
+ "description": "Equal to",
2996
+ "const": "eq"
562
2997
  }
563
- }
564
- ]
2998
+ ]
2999
+ },
3000
+ "value": {
3001
+ "description": "Threshold value",
3002
+ "type": "number"
3003
+ }
565
3004
  },
566
- "propertyNames": {
567
- "type": "string"
568
- }
569
- },
570
- "significanceLevel": {
571
- "description": "Required significance level (default: 0.05)",
572
- "type": "number",
573
- "minimum": 0,
574
- "maximum": 1
3005
+ "required": [
3006
+ "operator",
3007
+ "value"
3008
+ ],
3009
+ "additionalProperties": false
575
3010
  },
576
- "sut": {
577
- "description": "Primary SUT being evaluated",
3011
+ "type": {
3012
+ "description": "Type of metrics criterion",
578
3013
  "type": "string",
579
- "minLength": 1
580
- },
581
- "tags": {
582
- "description": "Tags for filtering",
583
- "type": "array",
584
- "items": {
585
- "type": "string"
586
- }
587
- },
588
- "threshold": {
589
- "description": "Optional threshold for the difference",
590
- "type": "number"
3014
+ "oneOf": [
3015
+ {
3016
+ "description": "Compare a metric against a fixed threshold value",
3017
+ "const": "threshold"
3018
+ },
3019
+ {
3020
+ "description": "Compare a metric against a baseline SUT",
3021
+ "const": "baseline"
3022
+ },
3023
+ {
3024
+ "description": "Check that a metric falls within a target range",
3025
+ "const": "target-range"
3026
+ }
3027
+ ]
591
3028
  }
592
3029
  },
593
3030
  "required": [
594
- "baseline",
595
- "claimId",
3031
+ "criterionId",
596
3032
  "description",
597
- "direction",
598
3033
  "metric",
599
- "scope",
600
- "sut"
3034
+ "sut",
3035
+ "type"
601
3036
  ],
602
3037
  "additionalProperties": false
603
3038
  },
@@ -607,11 +3042,6 @@
607
3042
  "description": "Evaluator description",
608
3043
  "type": "string"
609
3044
  },
610
- "minEffectSize": {
611
- "description": "Global minimum effect size override",
612
- "type": "number",
613
- "minimum": 0
614
- },
615
3045
  "name": {
616
3046
  "description": "Human-readable evaluator name",
617
3047
  "type": "string"
@@ -623,473 +3053,779 @@
623
3053
  "propertyNames": {
624
3054
  "type": "string"
625
3055
  }
626
- },
627
- "significanceLevel": {
628
- "description": "Global significance level override",
629
- "type": "number",
630
- "minimum": 0,
631
- "maximum": 1
632
3056
  }
633
3057
  },
634
3058
  "required": [
635
- "claims"
3059
+ "criteria"
636
3060
  ],
637
3061
  "additionalProperties": false,
638
3062
  "examples": [
639
3063
  {
640
- "claims": [
3064
+ "description": "Evaluate length metric against threshold, baseline, and target-range criteria",
3065
+ "criteria": [
641
3066
  {
642
- "description": "Built-in .length reports greater length than spread operator on emoji strings",
643
- "baseline": "spread-length",
644
- "claimId": "C001",
645
- "direction": "greater",
3067
+ "description": "Measured length should be greater than zero",
3068
+ "type": "threshold",
3069
+ "criterionId": "length-threshold",
3070
+ "metric": "length",
3071
+ "sut": "*",
3072
+ "threshold": {
3073
+ "operator": "gt",
3074
+ "value": 0
3075
+ }
3076
+ },
3077
+ {
3078
+ "description": "Built-in .length should be at least as large as spread operator",
3079
+ "type": "baseline",
3080
+ "baseline": {
3081
+ "operator": "gte",
3082
+ "sut": "spread-length"
3083
+ },
3084
+ "criterionId": "length-baseline",
646
3085
  "metric": "length",
647
- "scope": "global",
648
3086
  "sut": "builtin-length"
3087
+ },
3088
+ {
3089
+ "description": "Length should be in reasonable range [1, 100]",
3090
+ "type": "target-range",
3091
+ "criterionId": "length-target-range",
3092
+ "metric": "length",
3093
+ "sut": "*",
3094
+ "targetRange": {
3095
+ "max": 100,
3096
+ "maxInclusive": true,
3097
+ "min": 1,
3098
+ "minInclusive": true
3099
+ }
649
3100
  }
650
3101
  ],
651
- "significanceLevel": 0.05
3102
+ "name": "Metrics-Only Evaluation"
652
3103
  }
653
3104
  ]
654
3105
  },
655
- "CustomEvaluatorConfig": {
656
- "title": "CustomEvaluatorConfig",
657
- "description": "Configuration for a custom evaluator",
3106
+ "Provenance": {
3107
+ "title": "Provenance",
3108
+ "description": "Provenance information for reproducibility",
658
3109
  "type": "object",
659
3110
  "properties": {
660
- "customType": {
661
- "description": "Custom evaluator type name",
662
- "type": "string",
663
- "minLength": 1
664
- },
665
- "description": {
666
- "description": "Evaluator description",
667
- "type": "string"
668
- },
669
- "name": {
670
- "description": "Human-readable evaluator name",
3111
+ "dependencyLockHash": {
3112
+ "description": "Hash of package-lock.json for dependency pinning",
671
3113
  "type": "string"
672
3114
  },
673
- "options": {
674
- "description": "Additional evaluator-specific options",
675
- "type": "object",
676
- "additionalProperties": {},
677
- "propertyNames": {
678
- "type": "string"
679
- }
680
- }
681
- },
682
- "required": [
683
- "customType"
684
- ],
685
- "additionalProperties": {}
686
- },
687
- "ExploratoryEvaluatorConfig": {
688
- "title": "ExploratoryEvaluatorConfig",
689
- "description": "Configuration for the exploratory evaluator",
690
- "type": "object",
691
- "properties": {
692
- "analyzeCaseClassEffects": {
693
- "description": "Whether to analyze case-class effects",
3115
+ "dirty": {
3116
+ "description": "Whether working directory had uncommitted changes",
694
3117
  "type": "boolean"
695
3118
  },
696
- "computeCorrelations": {
697
- "description": "Whether to compute metric correlations",
698
- "type": "boolean"
3119
+ "executionTimeMs": {
3120
+ "description": "Wall-clock execution time in milliseconds",
3121
+ "type": "number"
699
3122
  },
700
- "description": {
701
- "description": "Evaluator description",
702
- "type": "string"
3123
+ "finalMemoryBytes": {
3124
+ "description": "Memory usage at completion (bytes)",
3125
+ "type": "number"
703
3126
  },
704
- "metricDirections": {
705
- "description": "Metric directions for ranking interpretation",
706
- "type": "object",
707
- "additionalProperties": {
708
- "description": "Metric direction for ranking",
709
- "type": "string",
710
- "oneOf": [
711
- {
712
- "description": "Higher values indicate better performance",
713
- "const": "higher-better"
714
- },
715
- {
716
- "description": "Lower values indicate better performance",
717
- "const": "lower-better"
718
- }
719
- ]
720
- },
721
- "propertyNames": {
722
- "type": "string"
723
- }
3127
+ "gitCommit": {
3128
+ "description": "Git commit hash",
3129
+ "type": "string"
724
3130
  },
725
- "metrics": {
726
- "description": "Metrics to analyze (all if not specified)",
3131
+ "parentRunIds": {
3132
+ "description": "Parent run IDs (for derived results)",
727
3133
  "type": "array",
728
3134
  "items": {
729
- "type": "string",
730
- "minLength": 1
3135
+ "type": "string"
731
3136
  }
732
3137
  },
733
- "minEffectSize": {
734
- "description": "Minimum effect size to consider meaningful",
735
- "type": "number",
736
- "minimum": 0
737
- },
738
- "name": {
739
- "description": "Human-readable evaluator name",
740
- "type": "string"
3138
+ "peakMemoryBytes": {
3139
+ "description": "Peak memory usage during execution (bytes)",
3140
+ "type": "number"
741
3141
  },
742
- "options": {
743
- "description": "Additional evaluator-specific options",
3142
+ "runtime": {
3143
+ "description": "Execution environment (platform and arch required; additional fields are language-specific)",
744
3144
  "type": "object",
745
- "additionalProperties": {},
746
- "propertyNames": {
3145
+ "properties": {
3146
+ "arch": {
3147
+ "description": "CPU architecture",
3148
+ "type": "string"
3149
+ },
3150
+ "platform": {
3151
+ "description": "Operating system platform",
3152
+ "type": "string"
3153
+ }
3154
+ },
3155
+ "required": [
3156
+ "arch",
3157
+ "platform"
3158
+ ],
3159
+ "additionalProperties": {
747
3160
  "type": "string"
748
3161
  }
749
3162
  },
750
- "significanceLevel": {
751
- "description": "Significance level for statistical tests (default: 0.05)",
752
- "type": "number",
753
- "minimum": 0,
754
- "maximum": 1
755
- },
756
- "suts": {
757
- "description": "SUTs to include (all if not specified)",
758
- "type": "array",
759
- "items": {
760
- "type": "string",
761
- "minLength": 1
762
- }
3163
+ "timestamp": {
3164
+ "description": "Execution timestamp",
3165
+ "type": "string"
763
3166
  }
764
3167
  },
765
- "additionalProperties": false,
766
- "examples": [
767
- {
768
- "analyzeCaseClassEffects": true,
769
- "computeCorrelations": false,
770
- "metricDirections": {
771
- "length": "higher-better"
772
- },
773
- "metrics": [
774
- "length"
775
- ]
776
- }
777
- ]
3168
+ "required": [
3169
+ "runtime"
3170
+ ],
3171
+ "additionalProperties": false
778
3172
  },
779
- "MetricsEvaluatorConfig": {
780
- "title": "MetricsEvaluatorConfig",
781
- "description": "Configuration for the metrics evaluator",
3173
+ "ResultBatch": {
3174
+ "title": "ResultBatch",
3175
+ "description": "Batch of evaluation results",
782
3176
  "type": "object",
783
3177
  "properties": {
784
- "criteria": {
785
- "description": "Criteria to evaluate",
786
- "type": "array",
787
- "items": {
788
- "title": "MetricsCriterion",
789
- "description": "A metrics evaluation criterion",
790
- "type": "object",
791
- "allOf": [
3178
+ "metadata": {
3179
+ "description": "Optional batch-level metadata",
3180
+ "type": "object",
3181
+ "additionalProperties": {
3182
+ "anyOf": [
3183
+ {
3184
+ "type": "string"
3185
+ },
792
3186
  {
793
- "if": {
794
- "properties": {
795
- "type": {
796
- "const": "threshold"
797
- }
798
- },
799
- "required": [
800
- "type"
801
- ]
802
- },
803
- "then": {
804
- "required": [
805
- "threshold"
806
- ]
807
- }
3187
+ "type": "number"
808
3188
  },
809
3189
  {
810
- "if": {
811
- "properties": {
812
- "type": {
813
- "const": "baseline"
814
- }
815
- },
816
- "required": [
817
- "type"
818
- ]
819
- },
820
- "then": {
821
- "required": [
822
- "baseline"
823
- ]
824
- }
3190
+ "type": "boolean"
825
3191
  },
826
3192
  {
827
- "if": {
828
- "properties": {
829
- "type": {
830
- "const": "target-range"
831
- }
832
- },
833
- "required": [
834
- "type"
835
- ]
836
- },
837
- "then": {
838
- "required": [
839
- "targetRange"
840
- ]
841
- }
3193
+ "type": "null"
842
3194
  }
843
- ],
3195
+ ]
3196
+ },
3197
+ "propertyNames": {
3198
+ "type": "string"
3199
+ }
3200
+ },
3201
+ "results": {
3202
+ "description": "All results in this batch",
3203
+ "type": "array",
3204
+ "items": {
3205
+ "title": "EvaluationResult",
3206
+ "description": "Complete evaluation result",
3207
+ "type": "object",
844
3208
  "properties": {
845
- "baseline": {
846
- "description": "Baseline comparison (required when type is baseline)",
3209
+ "correctness": {
3210
+ "title": "CorrectnessResult",
3211
+ "description": "Correctness assessment",
847
3212
  "type": "object",
848
3213
  "properties": {
849
- "operator": {
850
- "description": "Comparison operator",
3214
+ "expectedExists": {
3215
+ "description": "Whether expected output exists (oracle available)",
3216
+ "type": "boolean"
3217
+ },
3218
+ "failureType": {
3219
+ "description": "Failure classification if applicable",
851
3220
  "type": "string",
852
- "oneOf": [
853
- {
854
- "description": "Greater than",
855
- "const": "gt"
856
- },
857
- {
858
- "description": "Greater than or equal to",
859
- "const": "gte"
860
- },
861
- {
862
- "description": "Less than",
863
- "const": "lt"
864
- },
3221
+ "enum": [
3222
+ "no_output",
3223
+ "invalid_structure",
3224
+ "constraint_violation",
3225
+ "exception",
3226
+ "oracle_mismatch",
3227
+ "timeout"
3228
+ ]
3229
+ },
3230
+ "matchesExpected": {
3231
+ "description": "Whether output matches expected (null if no oracle)",
3232
+ "anyOf": [
865
3233
  {
866
- "description": "Less than or equal to",
867
- "const": "lte"
3234
+ "type": "boolean"
868
3235
  },
869
3236
  {
870
- "description": "Equal to",
871
- "const": "eq"
3237
+ "type": "null"
872
3238
  }
873
3239
  ]
874
3240
  },
875
- "sut": {
876
- "description": "Baseline SUT identifier",
877
- "type": "string",
878
- "minLength": 1
3241
+ "notes": {
3242
+ "description": "Human-readable failure notes",
3243
+ "type": "array",
3244
+ "items": {
3245
+ "type": "string"
3246
+ }
3247
+ },
3248
+ "producedOutput": {
3249
+ "description": "Whether the SUT produced any output",
3250
+ "type": "boolean"
3251
+ },
3252
+ "valid": {
3253
+ "description": "Whether output is structurally valid",
3254
+ "type": "boolean"
879
3255
  }
880
3256
  },
881
3257
  "required": [
882
- "operator",
883
- "sut"
3258
+ "expectedExists",
3259
+ "matchesExpected",
3260
+ "producedOutput",
3261
+ "valid"
884
3262
  ],
885
3263
  "additionalProperties": false
886
3264
  },
887
- "criterionId": {
888
- "description": "Unique criterion identifier",
889
- "type": "string",
890
- "minLength": 1
891
- },
892
- "description": {
893
- "description": "Human-readable description",
894
- "type": "string",
895
- "minLength": 1
896
- },
897
- "metric": {
898
- "description": "Metric to evaluate",
899
- "type": "string",
900
- "minLength": 1
3265
+ "error": {
3266
+ "description": "Error message if the run failed",
3267
+ "type": "string"
901
3268
  },
902
- "scopeConstraints": {
903
- "description": "Optional scope constraints",
3269
+ "metrics": {
3270
+ "title": "ResultMetrics",
3271
+ "description": "Numeric metrics",
904
3272
  "type": "object",
905
3273
  "properties": {
906
- "caseClass": {
907
- "description": "Case class filter",
908
- "anyOf": [
909
- {
3274
+ "extra": {
3275
+ "description": "Additional metrics (overflow)",
3276
+ "type": "object",
3277
+ "additionalProperties": {
3278
+ "type": "number"
3279
+ },
3280
+ "propertyNames": {
3281
+ "type": "string"
3282
+ }
3283
+ },
3284
+ "numeric": {
3285
+ "description": "Primary numeric metrics",
3286
+ "type": "object",
3287
+ "additionalProperties": {
3288
+ "type": "number"
3289
+ },
3290
+ "propertyNames": {
3291
+ "type": "string"
3292
+ }
3293
+ }
3294
+ },
3295
+ "required": [
3296
+ "numeric"
3297
+ ],
3298
+ "additionalProperties": {
3299
+ "anyOf": [
3300
+ {
3301
+ "type": "number"
3302
+ },
3303
+ {
3304
+ "type": "object",
3305
+ "additionalProperties": {
3306
+ "type": "number"
3307
+ },
3308
+ "propertyNames": {
910
3309
  "type": "string"
3310
+ }
3311
+ }
3312
+ ]
3313
+ }
3314
+ },
3315
+ "outputs": {
3316
+ "title": "ResultOutputs",
3317
+ "description": "Output artefacts and summaries",
3318
+ "type": "object",
3319
+ "properties": {
3320
+ "artefacts": {
3321
+ "description": "References to generated artefacts",
3322
+ "type": "array",
3323
+ "items": {
3324
+ "title": "ArtefactReference",
3325
+ "description": "Reference to an external artefact",
3326
+ "type": "object",
3327
+ "properties": {
3328
+ "hash": {
3329
+ "type": "string"
3330
+ },
3331
+ "metadata": {
3332
+ "type": "object",
3333
+ "additionalProperties": {
3334
+ "anyOf": [
3335
+ {
3336
+ "type": "string"
3337
+ },
3338
+ {
3339
+ "type": "number"
3340
+ },
3341
+ {
3342
+ "type": "boolean"
3343
+ },
3344
+ {
3345
+ "type": "null"
3346
+ }
3347
+ ]
3348
+ },
3349
+ "propertyNames": {
3350
+ "type": "string"
3351
+ }
3352
+ },
3353
+ "type": {
3354
+ "type": "string",
3355
+ "enum": [
3356
+ "graph",
3357
+ "path-set",
3358
+ "subgraph",
3359
+ "embedding",
3360
+ "other"
3361
+ ]
3362
+ },
3363
+ "uri": {
3364
+ "type": "string"
3365
+ }
911
3366
  },
912
- {
913
- "type": "array",
914
- "items": {
3367
+ "required": [
3368
+ "type",
3369
+ "uri"
3370
+ ],
3371
+ "additionalProperties": false
3372
+ }
3373
+ },
3374
+ "extra": {
3375
+ "description": "Additional untyped outputs",
3376
+ "type": "object",
3377
+ "additionalProperties": {},
3378
+ "propertyNames": {
3379
+ "type": "string"
3380
+ }
3381
+ },
3382
+ "labels": {
3383
+ "description": "Classification labels",
3384
+ "type": "object",
3385
+ "additionalProperties": {
3386
+ "anyOf": [
3387
+ {
3388
+ "type": "string"
3389
+ },
3390
+ {
3391
+ "type": "number"
3392
+ },
3393
+ {
3394
+ "type": "boolean"
3395
+ },
3396
+ {
3397
+ "type": "null"
3398
+ }
3399
+ ]
3400
+ },
3401
+ "propertyNames": {
3402
+ "type": "string"
3403
+ }
3404
+ },
3405
+ "ranking": {
3406
+ "description": "Ranking results",
3407
+ "type": "array",
3408
+ "items": {
3409
+ "title": "RankedItem",
3410
+ "description": "A ranked item for ranking tasks",
3411
+ "type": "object",
3412
+ "properties": {
3413
+ "itemId": {
3414
+ "description": "Item identifier",
915
3415
  "type": "string"
3416
+ },
3417
+ "metadata": {
3418
+ "description": "Optional additional metadata",
3419
+ "type": "object",
3420
+ "additionalProperties": {
3421
+ "anyOf": [
3422
+ {
3423
+ "type": "string"
3424
+ },
3425
+ {
3426
+ "type": "number"
3427
+ },
3428
+ {
3429
+ "type": "boolean"
3430
+ },
3431
+ {
3432
+ "type": "null"
3433
+ }
3434
+ ]
3435
+ },
3436
+ "propertyNames": {
3437
+ "type": "string"
3438
+ }
3439
+ },
3440
+ "score": {
3441
+ "description": "Score or rank value",
3442
+ "type": "number"
916
3443
  }
917
- }
918
- ]
3444
+ },
3445
+ "required": [
3446
+ "itemId",
3447
+ "score"
3448
+ ],
3449
+ "additionalProperties": false
3450
+ }
3451
+ },
3452
+ "summary": {
3453
+ "description": "Scalar summary values",
3454
+ "type": "object",
3455
+ "additionalProperties": {
3456
+ "anyOf": [
3457
+ {
3458
+ "anyOf": [
3459
+ {
3460
+ "type": "string"
3461
+ },
3462
+ {
3463
+ "type": "number"
3464
+ },
3465
+ {
3466
+ "type": "boolean"
3467
+ },
3468
+ {
3469
+ "type": "null"
3470
+ }
3471
+ ]
3472
+ },
3473
+ {
3474
+ "type": "array",
3475
+ "items": {
3476
+ "anyOf": [
3477
+ {
3478
+ "type": "string"
3479
+ },
3480
+ {
3481
+ "type": "number"
3482
+ },
3483
+ {
3484
+ "type": "boolean"
3485
+ },
3486
+ {
3487
+ "type": "null"
3488
+ }
3489
+ ]
3490
+ }
3491
+ }
3492
+ ]
3493
+ },
3494
+ "propertyNames": {
3495
+ "type": "string"
3496
+ }
919
3497
  }
920
3498
  },
921
3499
  "additionalProperties": false
922
3500
  },
923
- "sut": {
924
- "description": "SUT to evaluate (or \"*\" for all SUTs)",
925
- "type": "string",
926
- "minLength": 1
927
- },
928
- "tags": {
929
- "description": "Tags for filtering",
930
- "type": "array",
931
- "items": {
932
- "type": "string"
933
- }
934
- },
935
- "targetRange": {
936
- "description": "Target range (required when type is target-range)",
3501
+ "provenance": {
3502
+ "title": "Provenance",
3503
+ "description": "Provenance for reproducibility",
937
3504
  "type": "object",
938
3505
  "properties": {
939
- "max": {
940
- "description": "Maximum value",
941
- "type": "number"
3506
+ "dependencyLockHash": {
3507
+ "description": "Hash of package-lock.json for dependency pinning",
3508
+ "type": "string"
942
3509
  },
943
- "maxInclusive": {
944
- "description": "Whether max is inclusive",
3510
+ "dirty": {
3511
+ "description": "Whether working directory had uncommitted changes",
945
3512
  "type": "boolean"
946
3513
  },
947
- "min": {
948
- "description": "Minimum value",
3514
+ "executionTimeMs": {
3515
+ "description": "Wall-clock execution time in milliseconds",
3516
+ "type": "number"
3517
+ },
3518
+ "finalMemoryBytes": {
3519
+ "description": "Memory usage at completion (bytes)",
3520
+ "type": "number"
3521
+ },
3522
+ "gitCommit": {
3523
+ "description": "Git commit hash",
3524
+ "type": "string"
3525
+ },
3526
+ "parentRunIds": {
3527
+ "description": "Parent run IDs (for derived results)",
3528
+ "type": "array",
3529
+ "items": {
3530
+ "type": "string"
3531
+ }
3532
+ },
3533
+ "peakMemoryBytes": {
3534
+ "description": "Peak memory usage during execution (bytes)",
3535
+ "type": "number"
3536
+ },
3537
+ "runtime": {
3538
+ "description": "Execution environment (platform and arch required; additional fields are language-specific)",
3539
+ "type": "object",
3540
+ "properties": {
3541
+ "arch": {
3542
+ "description": "CPU architecture",
3543
+ "type": "string"
3544
+ },
3545
+ "platform": {
3546
+ "description": "Operating system platform",
3547
+ "type": "string"
3548
+ }
3549
+ },
3550
+ "required": [
3551
+ "arch",
3552
+ "platform"
3553
+ ],
3554
+ "additionalProperties": {
3555
+ "type": "string"
3556
+ }
3557
+ },
3558
+ "timestamp": {
3559
+ "description": "Execution timestamp",
3560
+ "type": "string"
3561
+ }
3562
+ },
3563
+ "required": [
3564
+ "runtime"
3565
+ ],
3566
+ "additionalProperties": false
3567
+ },
3568
+ "run": {
3569
+ "title": "RunContext",
3570
+ "description": "Run identity and context",
3571
+ "type": "object",
3572
+ "properties": {
3573
+ "caseClass": {
3574
+ "description": "Case class for grouping",
3575
+ "type": "string"
3576
+ },
3577
+ "caseId": {
3578
+ "description": "Case identifier",
3579
+ "type": "string"
3580
+ },
3581
+ "config": {
3582
+ "description": "Configuration overrides for this run",
3583
+ "type": "object",
3584
+ "additionalProperties": {
3585
+ "anyOf": [
3586
+ {
3587
+ "type": "string"
3588
+ },
3589
+ {
3590
+ "type": "number"
3591
+ },
3592
+ {
3593
+ "type": "boolean"
3594
+ },
3595
+ {
3596
+ "type": "null"
3597
+ }
3598
+ ]
3599
+ },
3600
+ "propertyNames": {
3601
+ "type": "string"
3602
+ }
3603
+ },
3604
+ "repetition": {
3605
+ "description": "Repetition number for statistical runs",
3606
+ "type": "integer",
3607
+ "minimum": -9007199254740991,
3608
+ "maximum": 2147483647
3609
+ },
3610
+ "runId": {
3611
+ "description": "Deterministic run ID (hash of inputs)",
3612
+ "type": "string"
3613
+ },
3614
+ "seed": {
3615
+ "description": "Random seed if applicable",
949
3616
  "type": "number"
950
3617
  },
951
- "minInclusive": {
952
- "description": "Whether min is inclusive",
953
- "type": "boolean"
954
- }
955
- },
956
- "additionalProperties": false
957
- },
958
- "threshold": {
959
- "description": "Threshold operator and value (required when type is threshold)",
960
- "type": "object",
961
- "properties": {
962
- "operator": {
963
- "description": "Comparison operator",
3618
+ "sut": {
3619
+ "description": "SUT identifier",
3620
+ "type": "string"
3621
+ },
3622
+ "sutRole": {
3623
+ "description": "Role of the SUT in evaluation",
964
3624
  "type": "string",
965
3625
  "oneOf": [
966
3626
  {
967
- "description": "Greater than",
968
- "const": "gt"
969
- },
970
- {
971
- "description": "Greater than or equal to",
972
- "const": "gte"
973
- },
974
- {
975
- "description": "Less than",
976
- "const": "lt"
3627
+ "description": "The system being evaluated; the novel algorithm or implementation",
3628
+ "const": "primary"
977
3629
  },
978
3630
  {
979
- "description": "Less than or equal to",
980
- "const": "lte"
3631
+ "description": "A reference implementation for comparison",
3632
+ "const": "baseline"
981
3633
  },
982
3634
  {
983
- "description": "Equal to",
984
- "const": "eq"
3635
+ "description": "Ground truth provider; defines correct answers",
3636
+ "const": "oracle"
985
3637
  }
986
3638
  ]
987
3639
  },
988
- "value": {
989
- "description": "Threshold value",
990
- "type": "number"
3640
+ "sutVersion": {
3641
+ "description": "SUT version for reproducibility",
3642
+ "type": "string"
991
3643
  }
992
3644
  },
993
3645
  "required": [
994
- "operator",
995
- "value"
3646
+ "caseId",
3647
+ "runId",
3648
+ "sut",
3649
+ "sutRole"
996
3650
  ],
997
3651
  "additionalProperties": false
998
- },
999
- "type": {
1000
- "description": "Type of metrics criterion",
1001
- "type": "string",
1002
- "oneOf": [
1003
- {
1004
- "description": "Compare a metric against a fixed threshold value",
1005
- "const": "threshold"
1006
- },
1007
- {
1008
- "description": "Compare a metric against a baseline SUT",
1009
- "const": "baseline"
1010
- },
1011
- {
1012
- "description": "Check that a metric falls within a target range",
1013
- "const": "target-range"
1014
- }
1015
- ]
1016
3652
  }
1017
3653
  },
1018
3654
  "required": [
1019
- "criterionId",
1020
- "description",
1021
- "metric",
1022
- "sut",
1023
- "type"
3655
+ "correctness",
3656
+ "metrics",
3657
+ "outputs",
3658
+ "provenance",
3659
+ "run"
1024
3660
  ],
1025
3661
  "additionalProperties": false
1026
- },
1027
- "minItems": 1
3662
+ }
1028
3663
  },
1029
- "description": {
1030
- "description": "Evaluator description",
3664
+ "timestamp": {
3665
+ "description": "Generation timestamp",
1031
3666
  "type": "string"
1032
3667
  },
1033
- "name": {
1034
- "description": "Human-readable evaluator name",
3668
+ "version": {
3669
+ "description": "Schema version",
1035
3670
  "type": "string"
1036
- },
1037
- "options": {
1038
- "description": "Additional evaluator-specific options",
1039
- "type": "object",
1040
- "additionalProperties": {},
1041
- "propertyNames": {
1042
- "type": "string"
1043
- }
1044
3671
  }
1045
3672
  },
1046
3673
  "required": [
1047
- "criteria"
3674
+ "results",
3675
+ "timestamp",
3676
+ "version"
1048
3677
  ],
1049
- "additionalProperties": false,
1050
- "examples": [
1051
- {
1052
- "description": "Evaluate length metric against threshold, baseline, and target-range criteria",
1053
- "criteria": [
1054
- {
1055
- "description": "Measured length should be greater than zero",
1056
- "type": "threshold",
1057
- "criterionId": "length-threshold",
1058
- "metric": "length",
1059
- "sut": "*",
1060
- "threshold": {
1061
- "operator": "gt",
1062
- "value": 0
3678
+ "additionalProperties": false
3679
+ },
3680
+ "RobustnessAnalysisOutput": {
3681
+ "title": "RobustnessAnalysisOutput",
3682
+ "description": "Complete robustness analysis output",
3683
+ "type": "object",
3684
+ "properties": {
3685
+ "config": {
3686
+ "type": "object",
3687
+ "properties": {
3688
+ "intensityLevels": {
3689
+ "description": "Intensity levels tested",
3690
+ "type": "array",
3691
+ "items": {
3692
+ "type": "number"
1063
3693
  }
1064
3694
  },
1065
- {
1066
- "description": "Built-in .length should be at least as large as spread operator",
1067
- "type": "baseline",
1068
- "baseline": {
1069
- "operator": "gte",
1070
- "sut": "spread-length"
1071
- },
1072
- "criterionId": "length-baseline",
1073
- "metric": "length",
1074
- "sut": "builtin-length"
3695
+ "metrics": {
3696
+ "description": "Metrics analyzed",
3697
+ "type": "array",
3698
+ "items": {
3699
+ "type": "string"
3700
+ }
1075
3701
  },
1076
- {
1077
- "description": "Length should be in reasonable range [1, 100]",
1078
- "type": "target-range",
1079
- "criterionId": "length-target-range",
1080
- "metric": "length",
1081
- "sut": "*",
1082
- "targetRange": {
1083
- "max": 100,
1084
- "maxInclusive": true,
1085
- "min": 1,
1086
- "minInclusive": true
3702
+ "perturbations": {
3703
+ "description": "Perturbations applied",
3704
+ "type": "array",
3705
+ "items": {
3706
+ "type": "string"
1087
3707
  }
3708
+ },
3709
+ "runsPerLevel": {
3710
+ "description": "Runs per perturbation level",
3711
+ "type": "integer",
3712
+ "minimum": -9007199254740991,
3713
+ "maximum": 10000
1088
3714
  }
3715
+ },
3716
+ "required": [
3717
+ "metrics",
3718
+ "perturbations",
3719
+ "runsPerLevel"
1089
3720
  ],
1090
- "name": "Metrics-Only Evaluation"
3721
+ "additionalProperties": false
3722
+ },
3723
+ "results": {
3724
+ "description": "Individual analysis results",
3725
+ "type": "array",
3726
+ "items": {
3727
+ "title": "RobustnessAnalysisResult",
3728
+ "description": "Result of robustness analysis for a single SUT",
3729
+ "type": "object",
3730
+ "properties": {
3731
+ "baselineValue": {
3732
+ "type": "number"
3733
+ },
3734
+ "caseClass": {
3735
+ "type": "string"
3736
+ },
3737
+ "metric": {
3738
+ "type": "string"
3739
+ },
3740
+ "perturbation": {
3741
+ "type": "string"
3742
+ },
3743
+ "robustness": {
3744
+ "title": "RobustnessMetrics",
3745
+ "description": "Robustness analysis metrics",
3746
+ "type": "object",
3747
+ "properties": {
3748
+ "breakpoint": {
3749
+ "type": "number"
3750
+ },
3751
+ "coefficientOfVariation": {
3752
+ "type": "number"
3753
+ },
3754
+ "degradationCurve": {
3755
+ "type": "array",
3756
+ "items": {
3757
+ "type": "object",
3758
+ "properties": {
3759
+ "metricValue": {
3760
+ "type": "number"
3761
+ },
3762
+ "perturbationLevel": {
3763
+ "type": "number"
3764
+ },
3765
+ "stdDev": {
3766
+ "type": "number"
3767
+ }
3768
+ },
3769
+ "required": [
3770
+ "metricValue",
3771
+ "perturbationLevel"
3772
+ ],
3773
+ "additionalProperties": false
3774
+ }
3775
+ },
3776
+ "rankingStability": {
3777
+ "type": "number"
3778
+ },
3779
+ "stdUnderPerturbation": {
3780
+ "type": "number"
3781
+ },
3782
+ "varianceUnderPerturbation": {
3783
+ "type": "number"
3784
+ }
3785
+ },
3786
+ "required": [
3787
+ "coefficientOfVariation",
3788
+ "stdUnderPerturbation",
3789
+ "varianceUnderPerturbation"
3790
+ ],
3791
+ "additionalProperties": false
3792
+ },
3793
+ "runCount": {
3794
+ "type": "integer",
3795
+ "minimum": -9007199254740991,
3796
+ "maximum": 2147483647
3797
+ },
3798
+ "sut": {
3799
+ "type": "string"
3800
+ }
3801
+ },
3802
+ "required": [
3803
+ "baselineValue",
3804
+ "metric",
3805
+ "perturbation",
3806
+ "robustness",
3807
+ "runCount",
3808
+ "sut"
3809
+ ],
3810
+ "additionalProperties": false
3811
+ }
3812
+ },
3813
+ "timestamp": {
3814
+ "description": "Generation timestamp",
3815
+ "type": "string"
3816
+ },
3817
+ "version": {
3818
+ "description": "Schema version",
3819
+ "type": "string"
1091
3820
  }
1092
- ]
3821
+ },
3822
+ "required": [
3823
+ "config",
3824
+ "results",
3825
+ "timestamp",
3826
+ "version"
3827
+ ],
3828
+ "additionalProperties": false
1093
3829
  },
1094
3830
  "RobustnessEvaluatorConfig": {
1095
3831
  "title": "RobustnessEvaluatorConfig",
@@ -1173,6 +3909,156 @@
1173
3909
  "runsPerLevel": 10
1174
3910
  }
1175
3911
  ]
3912
+ },
3913
+ "RunContext": {
3914
+ "title": "RunContext",
3915
+ "description": "Run identity and context",
3916
+ "type": "object",
3917
+ "properties": {
3918
+ "caseClass": {
3919
+ "description": "Case class for grouping",
3920
+ "type": "string"
3921
+ },
3922
+ "caseId": {
3923
+ "description": "Case identifier",
3924
+ "type": "string"
3925
+ },
3926
+ "config": {
3927
+ "description": "Configuration overrides for this run",
3928
+ "type": "object",
3929
+ "additionalProperties": {
3930
+ "anyOf": [
3931
+ {
3932
+ "type": "string"
3933
+ },
3934
+ {
3935
+ "type": "number"
3936
+ },
3937
+ {
3938
+ "type": "boolean"
3939
+ },
3940
+ {
3941
+ "type": "null"
3942
+ }
3943
+ ]
3944
+ },
3945
+ "propertyNames": {
3946
+ "type": "string"
3947
+ }
3948
+ },
3949
+ "repetition": {
3950
+ "description": "Repetition number for statistical runs",
3951
+ "type": "integer",
3952
+ "minimum": -9007199254740991,
3953
+ "maximum": 2147483647
3954
+ },
3955
+ "runId": {
3956
+ "description": "Deterministic run ID (hash of inputs)",
3957
+ "type": "string"
3958
+ },
3959
+ "seed": {
3960
+ "description": "Random seed if applicable",
3961
+ "type": "number"
3962
+ },
3963
+ "sut": {
3964
+ "description": "SUT identifier",
3965
+ "type": "string"
3966
+ },
3967
+ "sutRole": {
3968
+ "description": "Role of the SUT in evaluation",
3969
+ "type": "string",
3970
+ "oneOf": [
3971
+ {
3972
+ "description": "The system being evaluated; the novel algorithm or implementation",
3973
+ "const": "primary"
3974
+ },
3975
+ {
3976
+ "description": "A reference implementation for comparison",
3977
+ "const": "baseline"
3978
+ },
3979
+ {
3980
+ "description": "Ground truth provider; defines correct answers",
3981
+ "const": "oracle"
3982
+ }
3983
+ ]
3984
+ },
3985
+ "sutVersion": {
3986
+ "description": "SUT version for reproducibility",
3987
+ "type": "string"
3988
+ }
3989
+ },
3990
+ "required": [
3991
+ "caseId",
3992
+ "runId",
3993
+ "sut",
3994
+ "sutRole"
3995
+ ],
3996
+ "additionalProperties": false
3997
+ },
3998
+ "SummaryStats": {
3999
+ "title": "SummaryStats",
4000
+ "description": "Summary statistics for a numeric metric",
4001
+ "type": "object",
4002
+ "properties": {
4003
+ "confidence95": {
4004
+ "description": "95% confidence interval [lower, upper]",
4005
+ "type": "array",
4006
+ "prefixItems": [
4007
+ {
4008
+ "type": "number"
4009
+ },
4010
+ {
4011
+ "type": "number"
4012
+ }
4013
+ ]
4014
+ },
4015
+ "max": {
4016
+ "description": "Maximum value",
4017
+ "type": "number"
4018
+ },
4019
+ "mean": {
4020
+ "description": "Arithmetic mean",
4021
+ "type": "number"
4022
+ },
4023
+ "median": {
4024
+ "description": "Median (50th percentile)",
4025
+ "type": "number"
4026
+ },
4027
+ "min": {
4028
+ "description": "Minimum value",
4029
+ "type": "number"
4030
+ },
4031
+ "n": {
4032
+ "description": "Number of observations",
4033
+ "type": "integer",
4034
+ "minimum": -9007199254740991,
4035
+ "maximum": 2147483647
4036
+ },
4037
+ "p25": {
4038
+ "description": "25th percentile",
4039
+ "type": "number"
4040
+ },
4041
+ "p75": {
4042
+ "description": "75th percentile",
4043
+ "type": "number"
4044
+ },
4045
+ "std": {
4046
+ "description": "Standard deviation (sample)",
4047
+ "type": "number"
4048
+ },
4049
+ "sum": {
4050
+ "description": "Sum of all values",
4051
+ "type": "number"
4052
+ }
4053
+ },
4054
+ "required": [
4055
+ "max",
4056
+ "mean",
4057
+ "median",
4058
+ "min",
4059
+ "n"
4060
+ ],
4061
+ "additionalProperties": false
1176
4062
  }
1177
4063
  }
1178
4064
  }