ppef 1.3.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -1
- package/dist/executor/__tests__/run-id.unit.test.js +41 -1
- package/dist/executor/__tests__/run-id.unit.test.js.map +1 -1
- package/dist/executor/index.d.ts +1 -1
- package/dist/executor/index.d.ts.map +1 -1
- package/dist/executor/index.js +1 -1
- package/dist/executor/index.js.map +1 -1
- package/dist/executor/run-id.d.ts +20 -2
- package/dist/executor/run-id.d.ts.map +1 -1
- package/dist/executor/run-id.js +54 -12
- package/dist/executor/run-id.js.map +1 -1
- package/dist/schemas/output-schemas.d.ts +924 -0
- package/dist/schemas/output-schemas.d.ts.map +1 -0
- package/dist/schemas/output-schemas.js +611 -0
- package/dist/schemas/output-schemas.js.map +1 -0
- package/dist/types/result.d.ts +2 -2
- package/dist/types/result.d.ts.map +1 -1
- package/package.json +1 -1
- package/ppef.schema.json +3365 -479
package/ppef.schema.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
-
"$id": "https://ppef.dev/schemas/v1.
|
|
3
|
+
"$id": "https://ppef.dev/schemas/v1.4.0/ppef.schema.json",
|
|
4
4
|
"title": "ExperimentConfig",
|
|
5
5
|
"description": "PPEF experiment configuration",
|
|
6
6
|
"type": "object",
|
|
@@ -439,165 +439,2600 @@
|
|
|
439
439
|
}
|
|
440
440
|
],
|
|
441
441
|
"$defs": {
|
|
442
|
+
"AggregatedResult": {
|
|
443
|
+
"title": "AggregatedResult",
|
|
444
|
+
"description": "Aggregated result for a SUT",
|
|
445
|
+
"type": "object",
|
|
446
|
+
"properties": {
|
|
447
|
+
"caseClass": {
|
|
448
|
+
"description": "Case class (if grouped)",
|
|
449
|
+
"type": "string"
|
|
450
|
+
},
|
|
451
|
+
"comparisons": {
|
|
452
|
+
"description": "Comparisons with baselines",
|
|
453
|
+
"type": "object",
|
|
454
|
+
"additionalProperties": {
|
|
455
|
+
"title": "ComparisonMetrics",
|
|
456
|
+
"description": "Comparison metrics between primary and baseline SUTs",
|
|
457
|
+
"type": "object",
|
|
458
|
+
"properties": {
|
|
459
|
+
"betterRate": {
|
|
460
|
+
"description": "Win rate (% of cases where primary beats baseline)",
|
|
461
|
+
"type": "number"
|
|
462
|
+
},
|
|
463
|
+
"deltas": {
|
|
464
|
+
"description": "Absolute deltas (primary - baseline)",
|
|
465
|
+
"type": "object",
|
|
466
|
+
"additionalProperties": {
|
|
467
|
+
"type": "number"
|
|
468
|
+
},
|
|
469
|
+
"propertyNames": {
|
|
470
|
+
"type": "string"
|
|
471
|
+
}
|
|
472
|
+
},
|
|
473
|
+
"effectSize": {
|
|
474
|
+
"description": "Effect size (Cohen's d)",
|
|
475
|
+
"type": "number"
|
|
476
|
+
},
|
|
477
|
+
"pValue": {
|
|
478
|
+
"description": "Statistical significance (p-value)",
|
|
479
|
+
"type": "number"
|
|
480
|
+
},
|
|
481
|
+
"ratios": {
|
|
482
|
+
"description": "Ratios (primary / baseline)",
|
|
483
|
+
"type": "object",
|
|
484
|
+
"additionalProperties": {
|
|
485
|
+
"type": "number"
|
|
486
|
+
},
|
|
487
|
+
"propertyNames": {
|
|
488
|
+
"type": "string"
|
|
489
|
+
}
|
|
490
|
+
},
|
|
491
|
+
"uStatistic": {
|
|
492
|
+
"description": "Mann-Whitney U statistic",
|
|
493
|
+
"type": "number"
|
|
494
|
+
}
|
|
495
|
+
},
|
|
496
|
+
"required": [
|
|
497
|
+
"deltas",
|
|
498
|
+
"ratios"
|
|
499
|
+
],
|
|
500
|
+
"additionalProperties": false
|
|
501
|
+
},
|
|
502
|
+
"propertyNames": {
|
|
503
|
+
"type": "string"
|
|
504
|
+
}
|
|
505
|
+
},
|
|
506
|
+
"correctness": {
|
|
507
|
+
"type": "object",
|
|
508
|
+
"properties": {
|
|
509
|
+
"failureBreakdown": {
|
|
510
|
+
"description": "Breakdown of failure types",
|
|
511
|
+
"type": "object",
|
|
512
|
+
"additionalProperties": {
|
|
513
|
+
"type": "number"
|
|
514
|
+
},
|
|
515
|
+
"propertyNames": {
|
|
516
|
+
"type": "string"
|
|
517
|
+
}
|
|
518
|
+
},
|
|
519
|
+
"matchesExpectedRate": {
|
|
520
|
+
"description": "Fraction of runs matching expected",
|
|
521
|
+
"type": "number"
|
|
522
|
+
},
|
|
523
|
+
"producedOutputRate": {
|
|
524
|
+
"description": "Fraction of runs that produced any output",
|
|
525
|
+
"type": "number"
|
|
526
|
+
},
|
|
527
|
+
"validRate": {
|
|
528
|
+
"description": "Fraction of runs that produced valid output",
|
|
529
|
+
"type": "number"
|
|
530
|
+
}
|
|
531
|
+
},
|
|
532
|
+
"required": [
|
|
533
|
+
"producedOutputRate",
|
|
534
|
+
"validRate"
|
|
535
|
+
],
|
|
536
|
+
"additionalProperties": false
|
|
537
|
+
},
|
|
538
|
+
"coverage": {
|
|
539
|
+
"title": "CoverageMetrics",
|
|
540
|
+
"description": "Coverage information",
|
|
541
|
+
"type": "object",
|
|
542
|
+
"properties": {
|
|
543
|
+
"caseCoverage": {
|
|
544
|
+
"description": "Fraction of cases covered",
|
|
545
|
+
"type": "number"
|
|
546
|
+
},
|
|
547
|
+
"metricCoverage": {
|
|
548
|
+
"description": "Metric availability (metric name -> coverage fraction)",
|
|
549
|
+
"type": "object",
|
|
550
|
+
"additionalProperties": {
|
|
551
|
+
"type": "number"
|
|
552
|
+
},
|
|
553
|
+
"propertyNames": {
|
|
554
|
+
"type": "string"
|
|
555
|
+
}
|
|
556
|
+
},
|
|
557
|
+
"missingCases": {
|
|
558
|
+
"description": "Missing case IDs",
|
|
559
|
+
"type": "array",
|
|
560
|
+
"items": {
|
|
561
|
+
"type": "string"
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
},
|
|
565
|
+
"required": [
|
|
566
|
+
"caseCoverage",
|
|
567
|
+
"metricCoverage"
|
|
568
|
+
],
|
|
569
|
+
"additionalProperties": false
|
|
570
|
+
},
|
|
571
|
+
"group": {
|
|
572
|
+
"type": "object",
|
|
573
|
+
"properties": {
|
|
574
|
+
"caseCount": {
|
|
575
|
+
"description": "Number of unique cases",
|
|
576
|
+
"type": "integer",
|
|
577
|
+
"minimum": -9007199254740991,
|
|
578
|
+
"maximum": 2147483647
|
|
579
|
+
},
|
|
580
|
+
"configHash": {
|
|
581
|
+
"description": "Hash of configuration",
|
|
582
|
+
"type": "string"
|
|
583
|
+
},
|
|
584
|
+
"runCount": {
|
|
585
|
+
"description": "Number of runs in this aggregate",
|
|
586
|
+
"type": "integer",
|
|
587
|
+
"minimum": -9007199254740991,
|
|
588
|
+
"maximum": 2147483647
|
|
589
|
+
}
|
|
590
|
+
},
|
|
591
|
+
"required": [
|
|
592
|
+
"caseCount",
|
|
593
|
+
"runCount"
|
|
594
|
+
],
|
|
595
|
+
"additionalProperties": false
|
|
596
|
+
},
|
|
597
|
+
"metadata": {
|
|
598
|
+
"description": "Additional metadata",
|
|
599
|
+
"type": "object",
|
|
600
|
+
"additionalProperties": {
|
|
601
|
+
"anyOf": [
|
|
602
|
+
{
|
|
603
|
+
"type": "string"
|
|
604
|
+
},
|
|
605
|
+
{
|
|
606
|
+
"type": "number"
|
|
607
|
+
},
|
|
608
|
+
{
|
|
609
|
+
"type": "boolean"
|
|
610
|
+
},
|
|
611
|
+
{
|
|
612
|
+
"type": "null"
|
|
613
|
+
}
|
|
614
|
+
]
|
|
615
|
+
},
|
|
616
|
+
"propertyNames": {
|
|
617
|
+
"type": "string"
|
|
618
|
+
}
|
|
619
|
+
},
|
|
620
|
+
"metrics": {
|
|
621
|
+
"description": "Aggregated metrics (metric name -> summary stats)",
|
|
622
|
+
"type": "object",
|
|
623
|
+
"additionalProperties": {
|
|
624
|
+
"title": "SummaryStats",
|
|
625
|
+
"description": "Summary statistics for a numeric metric",
|
|
626
|
+
"type": "object",
|
|
627
|
+
"properties": {
|
|
628
|
+
"confidence95": {
|
|
629
|
+
"description": "95% confidence interval [lower, upper]",
|
|
630
|
+
"type": "array",
|
|
631
|
+
"prefixItems": [
|
|
632
|
+
{
|
|
633
|
+
"type": "number"
|
|
634
|
+
},
|
|
635
|
+
{
|
|
636
|
+
"type": "number"
|
|
637
|
+
}
|
|
638
|
+
]
|
|
639
|
+
},
|
|
640
|
+
"max": {
|
|
641
|
+
"description": "Maximum value",
|
|
642
|
+
"type": "number"
|
|
643
|
+
},
|
|
644
|
+
"mean": {
|
|
645
|
+
"description": "Arithmetic mean",
|
|
646
|
+
"type": "number"
|
|
647
|
+
},
|
|
648
|
+
"median": {
|
|
649
|
+
"description": "Median (50th percentile)",
|
|
650
|
+
"type": "number"
|
|
651
|
+
},
|
|
652
|
+
"min": {
|
|
653
|
+
"description": "Minimum value",
|
|
654
|
+
"type": "number"
|
|
655
|
+
},
|
|
656
|
+
"n": {
|
|
657
|
+
"description": "Number of observations",
|
|
658
|
+
"type": "integer",
|
|
659
|
+
"minimum": -9007199254740991,
|
|
660
|
+
"maximum": 2147483647
|
|
661
|
+
},
|
|
662
|
+
"p25": {
|
|
663
|
+
"description": "25th percentile",
|
|
664
|
+
"type": "number"
|
|
665
|
+
},
|
|
666
|
+
"p75": {
|
|
667
|
+
"description": "75th percentile",
|
|
668
|
+
"type": "number"
|
|
669
|
+
},
|
|
670
|
+
"std": {
|
|
671
|
+
"description": "Standard deviation (sample)",
|
|
672
|
+
"type": "number"
|
|
673
|
+
},
|
|
674
|
+
"sum": {
|
|
675
|
+
"description": "Sum of all values",
|
|
676
|
+
"type": "number"
|
|
677
|
+
}
|
|
678
|
+
},
|
|
679
|
+
"required": [
|
|
680
|
+
"max",
|
|
681
|
+
"mean",
|
|
682
|
+
"median",
|
|
683
|
+
"min",
|
|
684
|
+
"n"
|
|
685
|
+
],
|
|
686
|
+
"additionalProperties": false
|
|
687
|
+
},
|
|
688
|
+
"propertyNames": {
|
|
689
|
+
"type": "string"
|
|
690
|
+
}
|
|
691
|
+
},
|
|
692
|
+
"sut": {
|
|
693
|
+
"description": "SUT identifier",
|
|
694
|
+
"type": "string"
|
|
695
|
+
},
|
|
696
|
+
"sutRole": {
|
|
697
|
+
"description": "Role of the SUT in evaluation",
|
|
698
|
+
"type": "string",
|
|
699
|
+
"oneOf": [
|
|
700
|
+
{
|
|
701
|
+
"description": "The system being evaluated; the novel algorithm or implementation",
|
|
702
|
+
"const": "primary"
|
|
703
|
+
},
|
|
704
|
+
{
|
|
705
|
+
"description": "A reference implementation for comparison",
|
|
706
|
+
"const": "baseline"
|
|
707
|
+
},
|
|
708
|
+
{
|
|
709
|
+
"description": "Ground truth provider; defines correct answers",
|
|
710
|
+
"const": "oracle"
|
|
711
|
+
}
|
|
712
|
+
]
|
|
713
|
+
}
|
|
714
|
+
},
|
|
715
|
+
"required": [
|
|
716
|
+
"correctness",
|
|
717
|
+
"group",
|
|
718
|
+
"metrics",
|
|
719
|
+
"sut",
|
|
720
|
+
"sutRole"
|
|
721
|
+
],
|
|
722
|
+
"additionalProperties": false
|
|
723
|
+
},
|
|
724
|
+
"AggregationOutput": {
|
|
725
|
+
"title": "AggregationOutput",
|
|
726
|
+
"description": "Complete aggregation output",
|
|
727
|
+
"type": "object",
|
|
728
|
+
"properties": {
|
|
729
|
+
"aggregates": {
|
|
730
|
+
"description": "Aggregated results",
|
|
731
|
+
"type": "array",
|
|
732
|
+
"items": {
|
|
733
|
+
"title": "AggregatedResult",
|
|
734
|
+
"description": "Aggregated result for a SUT",
|
|
735
|
+
"type": "object",
|
|
736
|
+
"properties": {
|
|
737
|
+
"caseClass": {
|
|
738
|
+
"description": "Case class (if grouped)",
|
|
739
|
+
"type": "string"
|
|
740
|
+
},
|
|
741
|
+
"comparisons": {
|
|
742
|
+
"description": "Comparisons with baselines",
|
|
743
|
+
"type": "object",
|
|
744
|
+
"additionalProperties": {
|
|
745
|
+
"title": "ComparisonMetrics",
|
|
746
|
+
"description": "Comparison metrics between primary and baseline SUTs",
|
|
747
|
+
"type": "object",
|
|
748
|
+
"properties": {
|
|
749
|
+
"betterRate": {
|
|
750
|
+
"description": "Win rate (% of cases where primary beats baseline)",
|
|
751
|
+
"type": "number"
|
|
752
|
+
},
|
|
753
|
+
"deltas": {
|
|
754
|
+
"description": "Absolute deltas (primary - baseline)",
|
|
755
|
+
"type": "object",
|
|
756
|
+
"additionalProperties": {
|
|
757
|
+
"type": "number"
|
|
758
|
+
},
|
|
759
|
+
"propertyNames": {
|
|
760
|
+
"type": "string"
|
|
761
|
+
}
|
|
762
|
+
},
|
|
763
|
+
"effectSize": {
|
|
764
|
+
"description": "Effect size (Cohen's d)",
|
|
765
|
+
"type": "number"
|
|
766
|
+
},
|
|
767
|
+
"pValue": {
|
|
768
|
+
"description": "Statistical significance (p-value)",
|
|
769
|
+
"type": "number"
|
|
770
|
+
},
|
|
771
|
+
"ratios": {
|
|
772
|
+
"description": "Ratios (primary / baseline)",
|
|
773
|
+
"type": "object",
|
|
774
|
+
"additionalProperties": {
|
|
775
|
+
"type": "number"
|
|
776
|
+
},
|
|
777
|
+
"propertyNames": {
|
|
778
|
+
"type": "string"
|
|
779
|
+
}
|
|
780
|
+
},
|
|
781
|
+
"uStatistic": {
|
|
782
|
+
"description": "Mann-Whitney U statistic",
|
|
783
|
+
"type": "number"
|
|
784
|
+
}
|
|
785
|
+
},
|
|
786
|
+
"required": [
|
|
787
|
+
"deltas",
|
|
788
|
+
"ratios"
|
|
789
|
+
],
|
|
790
|
+
"additionalProperties": false
|
|
791
|
+
},
|
|
792
|
+
"propertyNames": {
|
|
793
|
+
"type": "string"
|
|
794
|
+
}
|
|
795
|
+
},
|
|
796
|
+
"correctness": {
|
|
797
|
+
"type": "object",
|
|
798
|
+
"properties": {
|
|
799
|
+
"failureBreakdown": {
|
|
800
|
+
"description": "Breakdown of failure types",
|
|
801
|
+
"type": "object",
|
|
802
|
+
"additionalProperties": {
|
|
803
|
+
"type": "number"
|
|
804
|
+
},
|
|
805
|
+
"propertyNames": {
|
|
806
|
+
"type": "string"
|
|
807
|
+
}
|
|
808
|
+
},
|
|
809
|
+
"matchesExpectedRate": {
|
|
810
|
+
"description": "Fraction of runs matching expected",
|
|
811
|
+
"type": "number"
|
|
812
|
+
},
|
|
813
|
+
"producedOutputRate": {
|
|
814
|
+
"description": "Fraction of runs that produced any output",
|
|
815
|
+
"type": "number"
|
|
816
|
+
},
|
|
817
|
+
"validRate": {
|
|
818
|
+
"description": "Fraction of runs that produced valid output",
|
|
819
|
+
"type": "number"
|
|
820
|
+
}
|
|
821
|
+
},
|
|
822
|
+
"required": [
|
|
823
|
+
"producedOutputRate",
|
|
824
|
+
"validRate"
|
|
825
|
+
],
|
|
826
|
+
"additionalProperties": false
|
|
827
|
+
},
|
|
828
|
+
"coverage": {
|
|
829
|
+
"title": "CoverageMetrics",
|
|
830
|
+
"description": "Coverage information",
|
|
831
|
+
"type": "object",
|
|
832
|
+
"properties": {
|
|
833
|
+
"caseCoverage": {
|
|
834
|
+
"description": "Fraction of cases covered",
|
|
835
|
+
"type": "number"
|
|
836
|
+
},
|
|
837
|
+
"metricCoverage": {
|
|
838
|
+
"description": "Metric availability (metric name -> coverage fraction)",
|
|
839
|
+
"type": "object",
|
|
840
|
+
"additionalProperties": {
|
|
841
|
+
"type": "number"
|
|
842
|
+
},
|
|
843
|
+
"propertyNames": {
|
|
844
|
+
"type": "string"
|
|
845
|
+
}
|
|
846
|
+
},
|
|
847
|
+
"missingCases": {
|
|
848
|
+
"description": "Missing case IDs",
|
|
849
|
+
"type": "array",
|
|
850
|
+
"items": {
|
|
851
|
+
"type": "string"
|
|
852
|
+
}
|
|
853
|
+
}
|
|
854
|
+
},
|
|
855
|
+
"required": [
|
|
856
|
+
"caseCoverage",
|
|
857
|
+
"metricCoverage"
|
|
858
|
+
],
|
|
859
|
+
"additionalProperties": false
|
|
860
|
+
},
|
|
861
|
+
"group": {
|
|
862
|
+
"type": "object",
|
|
863
|
+
"properties": {
|
|
864
|
+
"caseCount": {
|
|
865
|
+
"description": "Number of unique cases",
|
|
866
|
+
"type": "integer",
|
|
867
|
+
"minimum": -9007199254740991,
|
|
868
|
+
"maximum": 2147483647
|
|
869
|
+
},
|
|
870
|
+
"configHash": {
|
|
871
|
+
"description": "Hash of configuration",
|
|
872
|
+
"type": "string"
|
|
873
|
+
},
|
|
874
|
+
"runCount": {
|
|
875
|
+
"description": "Number of runs in this aggregate",
|
|
876
|
+
"type": "integer",
|
|
877
|
+
"minimum": -9007199254740991,
|
|
878
|
+
"maximum": 2147483647
|
|
879
|
+
}
|
|
880
|
+
},
|
|
881
|
+
"required": [
|
|
882
|
+
"caseCount",
|
|
883
|
+
"runCount"
|
|
884
|
+
],
|
|
885
|
+
"additionalProperties": false
|
|
886
|
+
},
|
|
887
|
+
"metadata": {
|
|
888
|
+
"description": "Additional metadata",
|
|
889
|
+
"type": "object",
|
|
890
|
+
"additionalProperties": {
|
|
891
|
+
"anyOf": [
|
|
892
|
+
{
|
|
893
|
+
"type": "string"
|
|
894
|
+
},
|
|
895
|
+
{
|
|
896
|
+
"type": "number"
|
|
897
|
+
},
|
|
898
|
+
{
|
|
899
|
+
"type": "boolean"
|
|
900
|
+
},
|
|
901
|
+
{
|
|
902
|
+
"type": "null"
|
|
903
|
+
}
|
|
904
|
+
]
|
|
905
|
+
},
|
|
906
|
+
"propertyNames": {
|
|
907
|
+
"type": "string"
|
|
908
|
+
}
|
|
909
|
+
},
|
|
910
|
+
"metrics": {
|
|
911
|
+
"description": "Aggregated metrics (metric name -> summary stats)",
|
|
912
|
+
"type": "object",
|
|
913
|
+
"additionalProperties": {
|
|
914
|
+
"title": "SummaryStats",
|
|
915
|
+
"description": "Summary statistics for a numeric metric",
|
|
916
|
+
"type": "object",
|
|
917
|
+
"properties": {
|
|
918
|
+
"confidence95": {
|
|
919
|
+
"description": "95% confidence interval [lower, upper]",
|
|
920
|
+
"type": "array",
|
|
921
|
+
"prefixItems": [
|
|
922
|
+
{
|
|
923
|
+
"type": "number"
|
|
924
|
+
},
|
|
925
|
+
{
|
|
926
|
+
"type": "number"
|
|
927
|
+
}
|
|
928
|
+
]
|
|
929
|
+
},
|
|
930
|
+
"max": {
|
|
931
|
+
"description": "Maximum value",
|
|
932
|
+
"type": "number"
|
|
933
|
+
},
|
|
934
|
+
"mean": {
|
|
935
|
+
"description": "Arithmetic mean",
|
|
936
|
+
"type": "number"
|
|
937
|
+
},
|
|
938
|
+
"median": {
|
|
939
|
+
"description": "Median (50th percentile)",
|
|
940
|
+
"type": "number"
|
|
941
|
+
},
|
|
942
|
+
"min": {
|
|
943
|
+
"description": "Minimum value",
|
|
944
|
+
"type": "number"
|
|
945
|
+
},
|
|
946
|
+
"n": {
|
|
947
|
+
"description": "Number of observations",
|
|
948
|
+
"type": "integer",
|
|
949
|
+
"minimum": -9007199254740991,
|
|
950
|
+
"maximum": 2147483647
|
|
951
|
+
},
|
|
952
|
+
"p25": {
|
|
953
|
+
"description": "25th percentile",
|
|
954
|
+
"type": "number"
|
|
955
|
+
},
|
|
956
|
+
"p75": {
|
|
957
|
+
"description": "75th percentile",
|
|
958
|
+
"type": "number"
|
|
959
|
+
},
|
|
960
|
+
"std": {
|
|
961
|
+
"description": "Standard deviation (sample)",
|
|
962
|
+
"type": "number"
|
|
963
|
+
},
|
|
964
|
+
"sum": {
|
|
965
|
+
"description": "Sum of all values",
|
|
966
|
+
"type": "number"
|
|
967
|
+
}
|
|
968
|
+
},
|
|
969
|
+
"required": [
|
|
970
|
+
"max",
|
|
971
|
+
"mean",
|
|
972
|
+
"median",
|
|
973
|
+
"min",
|
|
974
|
+
"n"
|
|
975
|
+
],
|
|
976
|
+
"additionalProperties": false
|
|
977
|
+
},
|
|
978
|
+
"propertyNames": {
|
|
979
|
+
"type": "string"
|
|
980
|
+
}
|
|
981
|
+
},
|
|
982
|
+
"sut": {
|
|
983
|
+
"description": "SUT identifier",
|
|
984
|
+
"type": "string"
|
|
985
|
+
},
|
|
986
|
+
"sutRole": {
|
|
987
|
+
"description": "Role of the SUT in evaluation",
|
|
988
|
+
"type": "string",
|
|
989
|
+
"oneOf": [
|
|
990
|
+
{
|
|
991
|
+
"description": "The system being evaluated; the novel algorithm or implementation",
|
|
992
|
+
"const": "primary"
|
|
993
|
+
},
|
|
994
|
+
{
|
|
995
|
+
"description": "A reference implementation for comparison",
|
|
996
|
+
"const": "baseline"
|
|
997
|
+
},
|
|
998
|
+
{
|
|
999
|
+
"description": "Ground truth provider; defines correct answers",
|
|
1000
|
+
"const": "oracle"
|
|
1001
|
+
}
|
|
1002
|
+
]
|
|
1003
|
+
}
|
|
1004
|
+
},
|
|
1005
|
+
"required": [
|
|
1006
|
+
"correctness",
|
|
1007
|
+
"group",
|
|
1008
|
+
"metrics",
|
|
1009
|
+
"sut",
|
|
1010
|
+
"sutRole"
|
|
1011
|
+
],
|
|
1012
|
+
"additionalProperties": false
|
|
1013
|
+
}
|
|
1014
|
+
},
|
|
1015
|
+
"metadata": {
|
|
1016
|
+
"description": "Global metadata",
|
|
1017
|
+
"type": "object",
|
|
1018
|
+
"properties": {
|
|
1019
|
+
"caseClassesIncluded": {
|
|
1020
|
+
"description": "Case classes included",
|
|
1021
|
+
"type": "array",
|
|
1022
|
+
"items": {
|
|
1023
|
+
"type": "string"
|
|
1024
|
+
}
|
|
1025
|
+
},
|
|
1026
|
+
"sutsIncluded": {
|
|
1027
|
+
"description": "SUTs included",
|
|
1028
|
+
"type": "array",
|
|
1029
|
+
"items": {
|
|
1030
|
+
"type": "string"
|
|
1031
|
+
}
|
|
1032
|
+
},
|
|
1033
|
+
"totalCases": {
|
|
1034
|
+
"description": "Total unique cases",
|
|
1035
|
+
"type": "integer",
|
|
1036
|
+
"minimum": -9007199254740991,
|
|
1037
|
+
"maximum": 2147483647
|
|
1038
|
+
},
|
|
1039
|
+
"totalRuns": {
|
|
1040
|
+
"description": "Total runs processed",
|
|
1041
|
+
"type": "integer",
|
|
1042
|
+
"minimum": -9007199254740991,
|
|
1043
|
+
"maximum": 2147483647
|
|
1044
|
+
}
|
|
1045
|
+
},
|
|
1046
|
+
"required": [
|
|
1047
|
+
"sutsIncluded",
|
|
1048
|
+
"totalCases",
|
|
1049
|
+
"totalRuns"
|
|
1050
|
+
],
|
|
1051
|
+
"additionalProperties": false
|
|
1052
|
+
},
|
|
1053
|
+
"timestamp": {
|
|
1054
|
+
"description": "Generation timestamp",
|
|
1055
|
+
"type": "string"
|
|
1056
|
+
},
|
|
1057
|
+
"version": {
|
|
1058
|
+
"description": "Schema version",
|
|
1059
|
+
"type": "string"
|
|
1060
|
+
}
|
|
1061
|
+
},
|
|
1062
|
+
"required": [
|
|
1063
|
+
"aggregates",
|
|
1064
|
+
"timestamp",
|
|
1065
|
+
"version"
|
|
1066
|
+
],
|
|
1067
|
+
"additionalProperties": false
|
|
1068
|
+
},
|
|
1069
|
+
"ClaimEvaluationSummary": {
|
|
1070
|
+
"title": "ClaimEvaluationSummary",
|
|
1071
|
+
"description": "Summary of all claim evaluations",
|
|
1072
|
+
"type": "object",
|
|
1073
|
+
"properties": {
|
|
1074
|
+
"evaluations": {
|
|
1075
|
+
"description": "Individual claim evaluations",
|
|
1076
|
+
"type": "array",
|
|
1077
|
+
"items": {
|
|
1078
|
+
"title": "ClaimEvaluation",
|
|
1079
|
+
"description": "Result of evaluating a single claim",
|
|
1080
|
+
"type": "object",
|
|
1081
|
+
"properties": {
|
|
1082
|
+
"claim": {
|
|
1083
|
+
"title": "EvaluationClaimOutput",
|
|
1084
|
+
"description": "The claim being evaluated",
|
|
1085
|
+
"type": "object",
|
|
1086
|
+
"properties": {
|
|
1087
|
+
"baseline": {
|
|
1088
|
+
"description": "Baseline SUT for comparison",
|
|
1089
|
+
"type": "string"
|
|
1090
|
+
},
|
|
1091
|
+
"citation": {
|
|
1092
|
+
"description": "Citation/reference for the claim",
|
|
1093
|
+
"type": "string"
|
|
1094
|
+
},
|
|
1095
|
+
"claimId": {
|
|
1096
|
+
"description": "Unique identifier for this claim",
|
|
1097
|
+
"type": "string"
|
|
1098
|
+
},
|
|
1099
|
+
"description": {
|
|
1100
|
+
"description": "Human-readable description",
|
|
1101
|
+
"type": "string"
|
|
1102
|
+
},
|
|
1103
|
+
"direction": {
|
|
1104
|
+
"description": "Expected direction of difference",
|
|
1105
|
+
"type": "string",
|
|
1106
|
+
"oneOf": [
|
|
1107
|
+
{
|
|
1108
|
+
"description": "Primary SUT metric should be greater than baseline",
|
|
1109
|
+
"const": "greater"
|
|
1110
|
+
},
|
|
1111
|
+
{
|
|
1112
|
+
"description": "Primary SUT metric should be less than baseline",
|
|
1113
|
+
"const": "less"
|
|
1114
|
+
},
|
|
1115
|
+
{
|
|
1116
|
+
"description": "Primary SUT metric should be equal to baseline",
|
|
1117
|
+
"const": "equal"
|
|
1118
|
+
}
|
|
1119
|
+
]
|
|
1120
|
+
},
|
|
1121
|
+
"metric": {
|
|
1122
|
+
"description": "Metric being compared",
|
|
1123
|
+
"type": "string"
|
|
1124
|
+
},
|
|
1125
|
+
"minEffectSize": {
|
|
1126
|
+
"description": "Minimum effect size",
|
|
1127
|
+
"type": "number"
|
|
1128
|
+
},
|
|
1129
|
+
"scope": {
|
|
1130
|
+
"description": "Scope of claim validity",
|
|
1131
|
+
"type": "string",
|
|
1132
|
+
"oneOf": [
|
|
1133
|
+
{
|
|
1134
|
+
"description": "Claim applies across all cases and conditions",
|
|
1135
|
+
"const": "global"
|
|
1136
|
+
},
|
|
1137
|
+
{
|
|
1138
|
+
"description": "Claim applies within a specific case class",
|
|
1139
|
+
"const": "caseClass"
|
|
1140
|
+
},
|
|
1141
|
+
{
|
|
1142
|
+
"description": "Claim applies within a parameter range",
|
|
1143
|
+
"const": "parameterRange"
|
|
1144
|
+
},
|
|
1145
|
+
{
|
|
1146
|
+
"description": "Claim applies to local structural properties",
|
|
1147
|
+
"const": "localStructure"
|
|
1148
|
+
}
|
|
1149
|
+
]
|
|
1150
|
+
},
|
|
1151
|
+
"scopeConstraints": {
|
|
1152
|
+
"description": "Scope constraints",
|
|
1153
|
+
"type": "object",
|
|
1154
|
+
"additionalProperties": {
|
|
1155
|
+
"anyOf": [
|
|
1156
|
+
{
|
|
1157
|
+
"anyOf": [
|
|
1158
|
+
{
|
|
1159
|
+
"type": "string"
|
|
1160
|
+
},
|
|
1161
|
+
{
|
|
1162
|
+
"type": "number"
|
|
1163
|
+
},
|
|
1164
|
+
{
|
|
1165
|
+
"type": "boolean"
|
|
1166
|
+
},
|
|
1167
|
+
{
|
|
1168
|
+
"type": "null"
|
|
1169
|
+
}
|
|
1170
|
+
]
|
|
1171
|
+
},
|
|
1172
|
+
{
|
|
1173
|
+
"type": "array",
|
|
1174
|
+
"items": {
|
|
1175
|
+
"anyOf": [
|
|
1176
|
+
{
|
|
1177
|
+
"type": "string"
|
|
1178
|
+
},
|
|
1179
|
+
{
|
|
1180
|
+
"type": "number"
|
|
1181
|
+
},
|
|
1182
|
+
{
|
|
1183
|
+
"type": "boolean"
|
|
1184
|
+
},
|
|
1185
|
+
{
|
|
1186
|
+
"type": "null"
|
|
1187
|
+
}
|
|
1188
|
+
]
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
]
|
|
1192
|
+
},
|
|
1193
|
+
"propertyNames": {
|
|
1194
|
+
"type": "string"
|
|
1195
|
+
}
|
|
1196
|
+
},
|
|
1197
|
+
"significanceLevel": {
|
|
1198
|
+
"description": "Required significance level",
|
|
1199
|
+
"type": "number"
|
|
1200
|
+
},
|
|
1201
|
+
"sut": {
|
|
1202
|
+
"description": "Primary SUT being evaluated",
|
|
1203
|
+
"type": "string"
|
|
1204
|
+
},
|
|
1205
|
+
"tags": {
|
|
1206
|
+
"description": "Tags for filtering",
|
|
1207
|
+
"type": "array",
|
|
1208
|
+
"items": {
|
|
1209
|
+
"type": "string"
|
|
1210
|
+
}
|
|
1211
|
+
},
|
|
1212
|
+
"threshold": {
|
|
1213
|
+
"description": "Optional threshold for the difference",
|
|
1214
|
+
"type": "number"
|
|
1215
|
+
}
|
|
1216
|
+
},
|
|
1217
|
+
"required": [
|
|
1218
|
+
"baseline",
|
|
1219
|
+
"claimId",
|
|
1220
|
+
"description",
|
|
1221
|
+
"direction",
|
|
1222
|
+
"metric",
|
|
1223
|
+
"scope",
|
|
1224
|
+
"sut"
|
|
1225
|
+
],
|
|
1226
|
+
"additionalProperties": false
|
|
1227
|
+
},
|
|
1228
|
+
"evidence": {
|
|
1229
|
+
"title": "ClaimEvidence",
|
|
1230
|
+
"description": "Supporting evidence",
|
|
1231
|
+
"type": "object",
|
|
1232
|
+
"properties": {
|
|
1233
|
+
"baselineValue": {
|
|
1234
|
+
"description": "Baseline SUT metric value",
|
|
1235
|
+
"type": "number"
|
|
1236
|
+
},
|
|
1237
|
+
"delta": {
|
|
1238
|
+
"description": "Absolute delta (primary - baseline)",
|
|
1239
|
+
"type": "number"
|
|
1240
|
+
},
|
|
1241
|
+
"deltaCI95": {
|
|
1242
|
+
"description": "95% confidence interval for delta",
|
|
1243
|
+
"type": "array",
|
|
1244
|
+
"prefixItems": [
|
|
1245
|
+
{
|
|
1246
|
+
"type": "number"
|
|
1247
|
+
},
|
|
1248
|
+
{
|
|
1249
|
+
"type": "number"
|
|
1250
|
+
}
|
|
1251
|
+
]
|
|
1252
|
+
},
|
|
1253
|
+
"effectSize": {
|
|
1254
|
+
"description": "Effect size (Cohen's d)",
|
|
1255
|
+
"type": "number"
|
|
1256
|
+
},
|
|
1257
|
+
"n": {
|
|
1258
|
+
"description": "Number of observations",
|
|
1259
|
+
"type": "integer",
|
|
1260
|
+
"minimum": -9007199254740991,
|
|
1261
|
+
"maximum": 2147483647
|
|
1262
|
+
},
|
|
1263
|
+
"primaryValue": {
|
|
1264
|
+
"description": "Primary SUT metric value",
|
|
1265
|
+
"type": "number"
|
|
1266
|
+
},
|
|
1267
|
+
"pValue": {
|
|
1268
|
+
"description": "P-value from statistical test",
|
|
1269
|
+
"type": "number"
|
|
1270
|
+
},
|
|
1271
|
+
"ratio": {
|
|
1272
|
+
"description": "Ratio (primary / baseline)",
|
|
1273
|
+
"type": "number"
|
|
1274
|
+
}
|
|
1275
|
+
},
|
|
1276
|
+
"required": [
|
|
1277
|
+
"baselineValue",
|
|
1278
|
+
"delta",
|
|
1279
|
+
"primaryValue",
|
|
1280
|
+
"ratio"
|
|
1281
|
+
],
|
|
1282
|
+
"additionalProperties": false
|
|
1283
|
+
},
|
|
1284
|
+
"inconclusiveReason": {
|
|
1285
|
+
"description": "Reason for inconclusive status",
|
|
1286
|
+
"type": "string"
|
|
1287
|
+
},
|
|
1288
|
+
"notes": {
|
|
1289
|
+
"description": "Additional notes",
|
|
1290
|
+
"type": "array",
|
|
1291
|
+
"items": {
|
|
1292
|
+
"type": "string"
|
|
1293
|
+
}
|
|
1294
|
+
},
|
|
1295
|
+
"status": {
|
|
1296
|
+
"description": "Status of a claim evaluation",
|
|
1297
|
+
"type": "string",
|
|
1298
|
+
"enum": [
|
|
1299
|
+
"satisfied",
|
|
1300
|
+
"violated",
|
|
1301
|
+
"inconclusive"
|
|
1302
|
+
]
|
|
1303
|
+
}
|
|
1304
|
+
},
|
|
1305
|
+
"required": [
|
|
1306
|
+
"claim",
|
|
1307
|
+
"evidence",
|
|
1308
|
+
"status"
|
|
1309
|
+
],
|
|
1310
|
+
"additionalProperties": false
|
|
1311
|
+
}
|
|
1312
|
+
},
|
|
1313
|
+
"summary": {
|
|
1314
|
+
"type": "object",
|
|
1315
|
+
"properties": {
|
|
1316
|
+
"inconclusive": {
|
|
1317
|
+
"description": "Claims inconclusive",
|
|
1318
|
+
"type": "integer",
|
|
1319
|
+
"minimum": -9007199254740991,
|
|
1320
|
+
"maximum": 2147483647
|
|
1321
|
+
},
|
|
1322
|
+
"satisfactionRate": {
|
|
1323
|
+
"description": "Satisfaction rate (satisfied / (satisfied + violated))",
|
|
1324
|
+
"type": "number"
|
|
1325
|
+
},
|
|
1326
|
+
"satisfied": {
|
|
1327
|
+
"description": "Claims satisfied",
|
|
1328
|
+
"type": "integer",
|
|
1329
|
+
"minimum": -9007199254740991,
|
|
1330
|
+
"maximum": 2147483647
|
|
1331
|
+
},
|
|
1332
|
+
"total": {
|
|
1333
|
+
"description": "Total claims evaluated",
|
|
1334
|
+
"type": "integer",
|
|
1335
|
+
"minimum": -9007199254740991,
|
|
1336
|
+
"maximum": 2147483647
|
|
1337
|
+
},
|
|
1338
|
+
"violated": {
|
|
1339
|
+
"description": "Claims violated",
|
|
1340
|
+
"type": "integer",
|
|
1341
|
+
"minimum": -9007199254740991,
|
|
1342
|
+
"maximum": 2147483647
|
|
1343
|
+
}
|
|
1344
|
+
},
|
|
1345
|
+
"required": [
|
|
1346
|
+
"inconclusive",
|
|
1347
|
+
"satisfactionRate",
|
|
1348
|
+
"satisfied",
|
|
1349
|
+
"total",
|
|
1350
|
+
"violated"
|
|
1351
|
+
],
|
|
1352
|
+
"additionalProperties": false
|
|
1353
|
+
},
|
|
1354
|
+
"timestamp": {
|
|
1355
|
+
"description": "Generation timestamp",
|
|
1356
|
+
"type": "string"
|
|
1357
|
+
},
|
|
1358
|
+
"version": {
|
|
1359
|
+
"description": "Schema version",
|
|
1360
|
+
"type": "string"
|
|
1361
|
+
}
|
|
1362
|
+
},
|
|
1363
|
+
"required": [
|
|
1364
|
+
"evaluations",
|
|
1365
|
+
"summary",
|
|
1366
|
+
"timestamp",
|
|
1367
|
+
"version"
|
|
1368
|
+
],
|
|
1369
|
+
"additionalProperties": false
|
|
1370
|
+
},
|
|
442
1371
|
"ClaimsEvaluatorConfig": {
|
|
443
1372
|
"title": "ClaimsEvaluatorConfig",
|
|
444
1373
|
"description": "Configuration for the claims evaluator",
|
|
445
1374
|
"type": "object",
|
|
446
1375
|
"properties": {
|
|
447
|
-
"claims": {
|
|
448
|
-
"description": "Claims to evaluate",
|
|
1376
|
+
"claims": {
|
|
1377
|
+
"description": "Claims to evaluate",
|
|
1378
|
+
"type": "array",
|
|
1379
|
+
"items": {
|
|
1380
|
+
"title": "EvaluationClaim",
|
|
1381
|
+
"description": "An evaluation claim (hypothesis)",
|
|
1382
|
+
"type": "object",
|
|
1383
|
+
"properties": {
|
|
1384
|
+
"baseline": {
|
|
1385
|
+
"description": "Baseline SUT for comparison",
|
|
1386
|
+
"type": "string",
|
|
1387
|
+
"minLength": 1
|
|
1388
|
+
},
|
|
1389
|
+
"citation": {
|
|
1390
|
+
"description": "Citation/reference for the claim",
|
|
1391
|
+
"type": "string"
|
|
1392
|
+
},
|
|
1393
|
+
"claimId": {
|
|
1394
|
+
"description": "Unique claim identifier",
|
|
1395
|
+
"type": "string",
|
|
1396
|
+
"minLength": 1
|
|
1397
|
+
},
|
|
1398
|
+
"description": {
|
|
1399
|
+
"description": "Human-readable claim description",
|
|
1400
|
+
"type": "string",
|
|
1401
|
+
"minLength": 1
|
|
1402
|
+
},
|
|
1403
|
+
"direction": {
|
|
1404
|
+
"description": "Expected direction of difference",
|
|
1405
|
+
"type": "string",
|
|
1406
|
+
"oneOf": [
|
|
1407
|
+
{
|
|
1408
|
+
"description": "Primary SUT metric should be greater than baseline",
|
|
1409
|
+
"const": "greater"
|
|
1410
|
+
},
|
|
1411
|
+
{
|
|
1412
|
+
"description": "Primary SUT metric should be less than baseline",
|
|
1413
|
+
"const": "less"
|
|
1414
|
+
},
|
|
1415
|
+
{
|
|
1416
|
+
"description": "Primary SUT metric should be equal to baseline",
|
|
1417
|
+
"const": "equal"
|
|
1418
|
+
}
|
|
1419
|
+
]
|
|
1420
|
+
},
|
|
1421
|
+
"metric": {
|
|
1422
|
+
"description": "Metric being compared",
|
|
1423
|
+
"type": "string",
|
|
1424
|
+
"minLength": 1
|
|
1425
|
+
},
|
|
1426
|
+
"minEffectSize": {
|
|
1427
|
+
"description": "Minimum effect size (Cohen's d)",
|
|
1428
|
+
"type": "number",
|
|
1429
|
+
"minimum": 0
|
|
1430
|
+
},
|
|
1431
|
+
"scope": {
|
|
1432
|
+
"description": "Scope of claim validity",
|
|
1433
|
+
"type": "string",
|
|
1434
|
+
"oneOf": [
|
|
1435
|
+
{
|
|
1436
|
+
"description": "Claim applies across all cases and conditions",
|
|
1437
|
+
"const": "global"
|
|
1438
|
+
},
|
|
1439
|
+
{
|
|
1440
|
+
"description": "Claim applies within a specific case class",
|
|
1441
|
+
"const": "caseClass"
|
|
1442
|
+
},
|
|
1443
|
+
{
|
|
1444
|
+
"description": "Claim applies within a parameter range",
|
|
1445
|
+
"const": "parameterRange"
|
|
1446
|
+
},
|
|
1447
|
+
{
|
|
1448
|
+
"description": "Claim applies to local structural properties",
|
|
1449
|
+
"const": "localStructure"
|
|
1450
|
+
}
|
|
1451
|
+
]
|
|
1452
|
+
},
|
|
1453
|
+
"scopeConstraints": {
|
|
1454
|
+
"description": "Scope constraints",
|
|
1455
|
+
"type": "object",
|
|
1456
|
+
"additionalProperties": {
|
|
1457
|
+
"anyOf": [
|
|
1458
|
+
{
|
|
1459
|
+
"anyOf": [
|
|
1460
|
+
{
|
|
1461
|
+
"type": "string"
|
|
1462
|
+
},
|
|
1463
|
+
{
|
|
1464
|
+
"type": "number"
|
|
1465
|
+
},
|
|
1466
|
+
{
|
|
1467
|
+
"type": "boolean"
|
|
1468
|
+
},
|
|
1469
|
+
{
|
|
1470
|
+
"type": "null"
|
|
1471
|
+
}
|
|
1472
|
+
]
|
|
1473
|
+
},
|
|
1474
|
+
{
|
|
1475
|
+
"type": "array",
|
|
1476
|
+
"items": {
|
|
1477
|
+
"anyOf": [
|
|
1478
|
+
{
|
|
1479
|
+
"type": "string"
|
|
1480
|
+
},
|
|
1481
|
+
{
|
|
1482
|
+
"type": "number"
|
|
1483
|
+
},
|
|
1484
|
+
{
|
|
1485
|
+
"type": "boolean"
|
|
1486
|
+
},
|
|
1487
|
+
{
|
|
1488
|
+
"type": "null"
|
|
1489
|
+
}
|
|
1490
|
+
]
|
|
1491
|
+
}
|
|
1492
|
+
}
|
|
1493
|
+
]
|
|
1494
|
+
},
|
|
1495
|
+
"propertyNames": {
|
|
1496
|
+
"type": "string"
|
|
1497
|
+
}
|
|
1498
|
+
},
|
|
1499
|
+
"significanceLevel": {
|
|
1500
|
+
"description": "Required significance level (default: 0.05)",
|
|
1501
|
+
"type": "number",
|
|
1502
|
+
"minimum": 0,
|
|
1503
|
+
"maximum": 1
|
|
1504
|
+
},
|
|
1505
|
+
"sut": {
|
|
1506
|
+
"description": "Primary SUT being evaluated",
|
|
1507
|
+
"type": "string",
|
|
1508
|
+
"minLength": 1
|
|
1509
|
+
},
|
|
1510
|
+
"tags": {
|
|
1511
|
+
"description": "Tags for filtering",
|
|
1512
|
+
"type": "array",
|
|
1513
|
+
"items": {
|
|
1514
|
+
"type": "string"
|
|
1515
|
+
}
|
|
1516
|
+
},
|
|
1517
|
+
"threshold": {
|
|
1518
|
+
"description": "Optional threshold for the difference",
|
|
1519
|
+
"type": "number"
|
|
1520
|
+
}
|
|
1521
|
+
},
|
|
1522
|
+
"required": [
|
|
1523
|
+
"baseline",
|
|
1524
|
+
"claimId",
|
|
1525
|
+
"description",
|
|
1526
|
+
"direction",
|
|
1527
|
+
"metric",
|
|
1528
|
+
"scope",
|
|
1529
|
+
"sut"
|
|
1530
|
+
],
|
|
1531
|
+
"additionalProperties": false
|
|
1532
|
+
},
|
|
1533
|
+
"minItems": 1
|
|
1534
|
+
},
|
|
1535
|
+
"description": {
|
|
1536
|
+
"description": "Evaluator description",
|
|
1537
|
+
"type": "string"
|
|
1538
|
+
},
|
|
1539
|
+
"minEffectSize": {
|
|
1540
|
+
"description": "Global minimum effect size override",
|
|
1541
|
+
"type": "number",
|
|
1542
|
+
"minimum": 0
|
|
1543
|
+
},
|
|
1544
|
+
"name": {
|
|
1545
|
+
"description": "Human-readable evaluator name",
|
|
1546
|
+
"type": "string"
|
|
1547
|
+
},
|
|
1548
|
+
"options": {
|
|
1549
|
+
"description": "Additional evaluator-specific options",
|
|
1550
|
+
"type": "object",
|
|
1551
|
+
"additionalProperties": {},
|
|
1552
|
+
"propertyNames": {
|
|
1553
|
+
"type": "string"
|
|
1554
|
+
}
|
|
1555
|
+
},
|
|
1556
|
+
"significanceLevel": {
|
|
1557
|
+
"description": "Global significance level override",
|
|
1558
|
+
"type": "number",
|
|
1559
|
+
"minimum": 0,
|
|
1560
|
+
"maximum": 1
|
|
1561
|
+
}
|
|
1562
|
+
},
|
|
1563
|
+
"required": [
|
|
1564
|
+
"claims"
|
|
1565
|
+
],
|
|
1566
|
+
"additionalProperties": false,
|
|
1567
|
+
"examples": [
|
|
1568
|
+
{
|
|
1569
|
+
"claims": [
|
|
1570
|
+
{
|
|
1571
|
+
"description": "Built-in .length reports greater length than spread operator on emoji strings",
|
|
1572
|
+
"baseline": "spread-length",
|
|
1573
|
+
"claimId": "C001",
|
|
1574
|
+
"direction": "greater",
|
|
1575
|
+
"metric": "length",
|
|
1576
|
+
"scope": "global",
|
|
1577
|
+
"sut": "builtin-length"
|
|
1578
|
+
}
|
|
1579
|
+
],
|
|
1580
|
+
"significanceLevel": 0.05
|
|
1581
|
+
}
|
|
1582
|
+
]
|
|
1583
|
+
},
|
|
1584
|
+
"CorrectnessResult": {
|
|
1585
|
+
"title": "CorrectnessResult",
|
|
1586
|
+
"description": "Correctness assessment",
|
|
1587
|
+
"type": "object",
|
|
1588
|
+
"properties": {
|
|
1589
|
+
"expectedExists": {
|
|
1590
|
+
"description": "Whether expected output exists (oracle available)",
|
|
1591
|
+
"type": "boolean"
|
|
1592
|
+
},
|
|
1593
|
+
"failureType": {
|
|
1594
|
+
"description": "Failure classification if applicable",
|
|
1595
|
+
"type": "string",
|
|
1596
|
+
"enum": [
|
|
1597
|
+
"no_output",
|
|
1598
|
+
"invalid_structure",
|
|
1599
|
+
"constraint_violation",
|
|
1600
|
+
"exception",
|
|
1601
|
+
"oracle_mismatch",
|
|
1602
|
+
"timeout"
|
|
1603
|
+
]
|
|
1604
|
+
},
|
|
1605
|
+
"matchesExpected": {
|
|
1606
|
+
"description": "Whether output matches expected (null if no oracle)",
|
|
1607
|
+
"anyOf": [
|
|
1608
|
+
{
|
|
1609
|
+
"type": "boolean"
|
|
1610
|
+
},
|
|
1611
|
+
{
|
|
1612
|
+
"type": "null"
|
|
1613
|
+
}
|
|
1614
|
+
]
|
|
1615
|
+
},
|
|
1616
|
+
"notes": {
|
|
1617
|
+
"description": "Human-readable failure notes",
|
|
1618
|
+
"type": "array",
|
|
1619
|
+
"items": {
|
|
1620
|
+
"type": "string"
|
|
1621
|
+
}
|
|
1622
|
+
},
|
|
1623
|
+
"producedOutput": {
|
|
1624
|
+
"description": "Whether the SUT produced any output",
|
|
1625
|
+
"type": "boolean"
|
|
1626
|
+
},
|
|
1627
|
+
"valid": {
|
|
1628
|
+
"description": "Whether output is structurally valid",
|
|
1629
|
+
"type": "boolean"
|
|
1630
|
+
}
|
|
1631
|
+
},
|
|
1632
|
+
"required": [
|
|
1633
|
+
"expectedExists",
|
|
1634
|
+
"matchesExpected",
|
|
1635
|
+
"producedOutput",
|
|
1636
|
+
"valid"
|
|
1637
|
+
],
|
|
1638
|
+
"additionalProperties": false
|
|
1639
|
+
},
|
|
1640
|
+
"CustomEvaluatorConfig": {
|
|
1641
|
+
"title": "CustomEvaluatorConfig",
|
|
1642
|
+
"description": "Configuration for a custom evaluator",
|
|
1643
|
+
"type": "object",
|
|
1644
|
+
"properties": {
|
|
1645
|
+
"customType": {
|
|
1646
|
+
"description": "Custom evaluator type name",
|
|
1647
|
+
"type": "string",
|
|
1648
|
+
"minLength": 1
|
|
1649
|
+
},
|
|
1650
|
+
"description": {
|
|
1651
|
+
"description": "Evaluator description",
|
|
1652
|
+
"type": "string"
|
|
1653
|
+
},
|
|
1654
|
+
"name": {
|
|
1655
|
+
"description": "Human-readable evaluator name",
|
|
1656
|
+
"type": "string"
|
|
1657
|
+
},
|
|
1658
|
+
"options": {
|
|
1659
|
+
"description": "Additional evaluator-specific options",
|
|
1660
|
+
"type": "object",
|
|
1661
|
+
"additionalProperties": {},
|
|
1662
|
+
"propertyNames": {
|
|
1663
|
+
"type": "string"
|
|
1664
|
+
}
|
|
1665
|
+
}
|
|
1666
|
+
},
|
|
1667
|
+
"required": [
|
|
1668
|
+
"customType"
|
|
1669
|
+
],
|
|
1670
|
+
"additionalProperties": {}
|
|
1671
|
+
},
|
|
1672
|
+
"EvaluationResult": {
|
|
1673
|
+
"title": "EvaluationResult",
|
|
1674
|
+
"description": "Complete evaluation result",
|
|
1675
|
+
"type": "object",
|
|
1676
|
+
"properties": {
|
|
1677
|
+
"correctness": {
|
|
1678
|
+
"title": "CorrectnessResult",
|
|
1679
|
+
"description": "Correctness assessment",
|
|
1680
|
+
"type": "object",
|
|
1681
|
+
"properties": {
|
|
1682
|
+
"expectedExists": {
|
|
1683
|
+
"description": "Whether expected output exists (oracle available)",
|
|
1684
|
+
"type": "boolean"
|
|
1685
|
+
},
|
|
1686
|
+
"failureType": {
|
|
1687
|
+
"description": "Failure classification if applicable",
|
|
1688
|
+
"type": "string",
|
|
1689
|
+
"enum": [
|
|
1690
|
+
"no_output",
|
|
1691
|
+
"invalid_structure",
|
|
1692
|
+
"constraint_violation",
|
|
1693
|
+
"exception",
|
|
1694
|
+
"oracle_mismatch",
|
|
1695
|
+
"timeout"
|
|
1696
|
+
]
|
|
1697
|
+
},
|
|
1698
|
+
"matchesExpected": {
|
|
1699
|
+
"description": "Whether output matches expected (null if no oracle)",
|
|
1700
|
+
"anyOf": [
|
|
1701
|
+
{
|
|
1702
|
+
"type": "boolean"
|
|
1703
|
+
},
|
|
1704
|
+
{
|
|
1705
|
+
"type": "null"
|
|
1706
|
+
}
|
|
1707
|
+
]
|
|
1708
|
+
},
|
|
1709
|
+
"notes": {
|
|
1710
|
+
"description": "Human-readable failure notes",
|
|
1711
|
+
"type": "array",
|
|
1712
|
+
"items": {
|
|
1713
|
+
"type": "string"
|
|
1714
|
+
}
|
|
1715
|
+
},
|
|
1716
|
+
"producedOutput": {
|
|
1717
|
+
"description": "Whether the SUT produced any output",
|
|
1718
|
+
"type": "boolean"
|
|
1719
|
+
},
|
|
1720
|
+
"valid": {
|
|
1721
|
+
"description": "Whether output is structurally valid",
|
|
1722
|
+
"type": "boolean"
|
|
1723
|
+
}
|
|
1724
|
+
},
|
|
1725
|
+
"required": [
|
|
1726
|
+
"expectedExists",
|
|
1727
|
+
"matchesExpected",
|
|
1728
|
+
"producedOutput",
|
|
1729
|
+
"valid"
|
|
1730
|
+
],
|
|
1731
|
+
"additionalProperties": false
|
|
1732
|
+
},
|
|
1733
|
+
"error": {
|
|
1734
|
+
"description": "Error message if the run failed",
|
|
1735
|
+
"type": "string"
|
|
1736
|
+
},
|
|
1737
|
+
"metrics": {
|
|
1738
|
+
"title": "ResultMetrics",
|
|
1739
|
+
"description": "Numeric metrics",
|
|
1740
|
+
"type": "object",
|
|
1741
|
+
"properties": {
|
|
1742
|
+
"extra": {
|
|
1743
|
+
"description": "Additional metrics (overflow)",
|
|
1744
|
+
"type": "object",
|
|
1745
|
+
"additionalProperties": {
|
|
1746
|
+
"type": "number"
|
|
1747
|
+
},
|
|
1748
|
+
"propertyNames": {
|
|
1749
|
+
"type": "string"
|
|
1750
|
+
}
|
|
1751
|
+
},
|
|
1752
|
+
"numeric": {
|
|
1753
|
+
"description": "Primary numeric metrics",
|
|
1754
|
+
"type": "object",
|
|
1755
|
+
"additionalProperties": {
|
|
1756
|
+
"type": "number"
|
|
1757
|
+
},
|
|
1758
|
+
"propertyNames": {
|
|
1759
|
+
"type": "string"
|
|
1760
|
+
}
|
|
1761
|
+
}
|
|
1762
|
+
},
|
|
1763
|
+
"required": [
|
|
1764
|
+
"numeric"
|
|
1765
|
+
],
|
|
1766
|
+
"additionalProperties": {
|
|
1767
|
+
"anyOf": [
|
|
1768
|
+
{
|
|
1769
|
+
"type": "number"
|
|
1770
|
+
},
|
|
1771
|
+
{
|
|
1772
|
+
"type": "object",
|
|
1773
|
+
"additionalProperties": {
|
|
1774
|
+
"type": "number"
|
|
1775
|
+
},
|
|
1776
|
+
"propertyNames": {
|
|
1777
|
+
"type": "string"
|
|
1778
|
+
}
|
|
1779
|
+
}
|
|
1780
|
+
]
|
|
1781
|
+
}
|
|
1782
|
+
},
|
|
1783
|
+
"outputs": {
|
|
1784
|
+
"title": "ResultOutputs",
|
|
1785
|
+
"description": "Output artefacts and summaries",
|
|
1786
|
+
"type": "object",
|
|
1787
|
+
"properties": {
|
|
1788
|
+
"artefacts": {
|
|
1789
|
+
"description": "References to generated artefacts",
|
|
1790
|
+
"type": "array",
|
|
1791
|
+
"items": {
|
|
1792
|
+
"title": "ArtefactReference",
|
|
1793
|
+
"description": "Reference to an external artefact",
|
|
1794
|
+
"type": "object",
|
|
1795
|
+
"properties": {
|
|
1796
|
+
"hash": {
|
|
1797
|
+
"type": "string"
|
|
1798
|
+
},
|
|
1799
|
+
"metadata": {
|
|
1800
|
+
"type": "object",
|
|
1801
|
+
"additionalProperties": {
|
|
1802
|
+
"anyOf": [
|
|
1803
|
+
{
|
|
1804
|
+
"type": "string"
|
|
1805
|
+
},
|
|
1806
|
+
{
|
|
1807
|
+
"type": "number"
|
|
1808
|
+
},
|
|
1809
|
+
{
|
|
1810
|
+
"type": "boolean"
|
|
1811
|
+
},
|
|
1812
|
+
{
|
|
1813
|
+
"type": "null"
|
|
1814
|
+
}
|
|
1815
|
+
]
|
|
1816
|
+
},
|
|
1817
|
+
"propertyNames": {
|
|
1818
|
+
"type": "string"
|
|
1819
|
+
}
|
|
1820
|
+
},
|
|
1821
|
+
"type": {
|
|
1822
|
+
"type": "string",
|
|
1823
|
+
"enum": [
|
|
1824
|
+
"graph",
|
|
1825
|
+
"path-set",
|
|
1826
|
+
"subgraph",
|
|
1827
|
+
"embedding",
|
|
1828
|
+
"other"
|
|
1829
|
+
]
|
|
1830
|
+
},
|
|
1831
|
+
"uri": {
|
|
1832
|
+
"type": "string"
|
|
1833
|
+
}
|
|
1834
|
+
},
|
|
1835
|
+
"required": [
|
|
1836
|
+
"type",
|
|
1837
|
+
"uri"
|
|
1838
|
+
],
|
|
1839
|
+
"additionalProperties": false
|
|
1840
|
+
}
|
|
1841
|
+
},
|
|
1842
|
+
"extra": {
|
|
1843
|
+
"description": "Additional untyped outputs",
|
|
1844
|
+
"type": "object",
|
|
1845
|
+
"additionalProperties": {},
|
|
1846
|
+
"propertyNames": {
|
|
1847
|
+
"type": "string"
|
|
1848
|
+
}
|
|
1849
|
+
},
|
|
1850
|
+
"labels": {
|
|
1851
|
+
"description": "Classification labels",
|
|
1852
|
+
"type": "object",
|
|
1853
|
+
"additionalProperties": {
|
|
1854
|
+
"anyOf": [
|
|
1855
|
+
{
|
|
1856
|
+
"type": "string"
|
|
1857
|
+
},
|
|
1858
|
+
{
|
|
1859
|
+
"type": "number"
|
|
1860
|
+
},
|
|
1861
|
+
{
|
|
1862
|
+
"type": "boolean"
|
|
1863
|
+
},
|
|
1864
|
+
{
|
|
1865
|
+
"type": "null"
|
|
1866
|
+
}
|
|
1867
|
+
]
|
|
1868
|
+
},
|
|
1869
|
+
"propertyNames": {
|
|
1870
|
+
"type": "string"
|
|
1871
|
+
}
|
|
1872
|
+
},
|
|
1873
|
+
"ranking": {
|
|
1874
|
+
"description": "Ranking results",
|
|
1875
|
+
"type": "array",
|
|
1876
|
+
"items": {
|
|
1877
|
+
"title": "RankedItem",
|
|
1878
|
+
"description": "A ranked item for ranking tasks",
|
|
1879
|
+
"type": "object",
|
|
1880
|
+
"properties": {
|
|
1881
|
+
"itemId": {
|
|
1882
|
+
"description": "Item identifier",
|
|
1883
|
+
"type": "string"
|
|
1884
|
+
},
|
|
1885
|
+
"metadata": {
|
|
1886
|
+
"description": "Optional additional metadata",
|
|
1887
|
+
"type": "object",
|
|
1888
|
+
"additionalProperties": {
|
|
1889
|
+
"anyOf": [
|
|
1890
|
+
{
|
|
1891
|
+
"type": "string"
|
|
1892
|
+
},
|
|
1893
|
+
{
|
|
1894
|
+
"type": "number"
|
|
1895
|
+
},
|
|
1896
|
+
{
|
|
1897
|
+
"type": "boolean"
|
|
1898
|
+
},
|
|
1899
|
+
{
|
|
1900
|
+
"type": "null"
|
|
1901
|
+
}
|
|
1902
|
+
]
|
|
1903
|
+
},
|
|
1904
|
+
"propertyNames": {
|
|
1905
|
+
"type": "string"
|
|
1906
|
+
}
|
|
1907
|
+
},
|
|
1908
|
+
"score": {
|
|
1909
|
+
"description": "Score or rank value",
|
|
1910
|
+
"type": "number"
|
|
1911
|
+
}
|
|
1912
|
+
},
|
|
1913
|
+
"required": [
|
|
1914
|
+
"itemId",
|
|
1915
|
+
"score"
|
|
1916
|
+
],
|
|
1917
|
+
"additionalProperties": false
|
|
1918
|
+
}
|
|
1919
|
+
},
|
|
1920
|
+
"summary": {
|
|
1921
|
+
"description": "Scalar summary values",
|
|
1922
|
+
"type": "object",
|
|
1923
|
+
"additionalProperties": {
|
|
1924
|
+
"anyOf": [
|
|
1925
|
+
{
|
|
1926
|
+
"anyOf": [
|
|
1927
|
+
{
|
|
1928
|
+
"type": "string"
|
|
1929
|
+
},
|
|
1930
|
+
{
|
|
1931
|
+
"type": "number"
|
|
1932
|
+
},
|
|
1933
|
+
{
|
|
1934
|
+
"type": "boolean"
|
|
1935
|
+
},
|
|
1936
|
+
{
|
|
1937
|
+
"type": "null"
|
|
1938
|
+
}
|
|
1939
|
+
]
|
|
1940
|
+
},
|
|
1941
|
+
{
|
|
1942
|
+
"type": "array",
|
|
1943
|
+
"items": {
|
|
1944
|
+
"anyOf": [
|
|
1945
|
+
{
|
|
1946
|
+
"type": "string"
|
|
1947
|
+
},
|
|
1948
|
+
{
|
|
1949
|
+
"type": "number"
|
|
1950
|
+
},
|
|
1951
|
+
{
|
|
1952
|
+
"type": "boolean"
|
|
1953
|
+
},
|
|
1954
|
+
{
|
|
1955
|
+
"type": "null"
|
|
1956
|
+
}
|
|
1957
|
+
]
|
|
1958
|
+
}
|
|
1959
|
+
}
|
|
1960
|
+
]
|
|
1961
|
+
},
|
|
1962
|
+
"propertyNames": {
|
|
1963
|
+
"type": "string"
|
|
1964
|
+
}
|
|
1965
|
+
}
|
|
1966
|
+
},
|
|
1967
|
+
"additionalProperties": false
|
|
1968
|
+
},
|
|
1969
|
+
"provenance": {
|
|
1970
|
+
"title": "Provenance",
|
|
1971
|
+
"description": "Provenance for reproducibility",
|
|
1972
|
+
"type": "object",
|
|
1973
|
+
"properties": {
|
|
1974
|
+
"dependencyLockHash": {
|
|
1975
|
+
"description": "Hash of package-lock.json for dependency pinning",
|
|
1976
|
+
"type": "string"
|
|
1977
|
+
},
|
|
1978
|
+
"dirty": {
|
|
1979
|
+
"description": "Whether working directory had uncommitted changes",
|
|
1980
|
+
"type": "boolean"
|
|
1981
|
+
},
|
|
1982
|
+
"executionTimeMs": {
|
|
1983
|
+
"description": "Wall-clock execution time in milliseconds",
|
|
1984
|
+
"type": "number"
|
|
1985
|
+
},
|
|
1986
|
+
"finalMemoryBytes": {
|
|
1987
|
+
"description": "Memory usage at completion (bytes)",
|
|
1988
|
+
"type": "number"
|
|
1989
|
+
},
|
|
1990
|
+
"gitCommit": {
|
|
1991
|
+
"description": "Git commit hash",
|
|
1992
|
+
"type": "string"
|
|
1993
|
+
},
|
|
1994
|
+
"parentRunIds": {
|
|
1995
|
+
"description": "Parent run IDs (for derived results)",
|
|
1996
|
+
"type": "array",
|
|
1997
|
+
"items": {
|
|
1998
|
+
"type": "string"
|
|
1999
|
+
}
|
|
2000
|
+
},
|
|
2001
|
+
"peakMemoryBytes": {
|
|
2002
|
+
"description": "Peak memory usage during execution (bytes)",
|
|
2003
|
+
"type": "number"
|
|
2004
|
+
},
|
|
2005
|
+
"runtime": {
|
|
2006
|
+
"description": "Execution environment (platform and arch required; additional fields are language-specific)",
|
|
2007
|
+
"type": "object",
|
|
2008
|
+
"properties": {
|
|
2009
|
+
"arch": {
|
|
2010
|
+
"description": "CPU architecture",
|
|
2011
|
+
"type": "string"
|
|
2012
|
+
},
|
|
2013
|
+
"platform": {
|
|
2014
|
+
"description": "Operating system platform",
|
|
2015
|
+
"type": "string"
|
|
2016
|
+
}
|
|
2017
|
+
},
|
|
2018
|
+
"required": [
|
|
2019
|
+
"arch",
|
|
2020
|
+
"platform"
|
|
2021
|
+
],
|
|
2022
|
+
"additionalProperties": {
|
|
2023
|
+
"type": "string"
|
|
2024
|
+
}
|
|
2025
|
+
},
|
|
2026
|
+
"timestamp": {
|
|
2027
|
+
"description": "Execution timestamp",
|
|
2028
|
+
"type": "string"
|
|
2029
|
+
}
|
|
2030
|
+
},
|
|
2031
|
+
"required": [
|
|
2032
|
+
"runtime"
|
|
2033
|
+
],
|
|
2034
|
+
"additionalProperties": false
|
|
2035
|
+
},
|
|
2036
|
+
"run": {
|
|
2037
|
+
"title": "RunContext",
|
|
2038
|
+
"description": "Run identity and context",
|
|
2039
|
+
"type": "object",
|
|
2040
|
+
"properties": {
|
|
2041
|
+
"caseClass": {
|
|
2042
|
+
"description": "Case class for grouping",
|
|
2043
|
+
"type": "string"
|
|
2044
|
+
},
|
|
2045
|
+
"caseId": {
|
|
2046
|
+
"description": "Case identifier",
|
|
2047
|
+
"type": "string"
|
|
2048
|
+
},
|
|
2049
|
+
"config": {
|
|
2050
|
+
"description": "Configuration overrides for this run",
|
|
2051
|
+
"type": "object",
|
|
2052
|
+
"additionalProperties": {
|
|
2053
|
+
"anyOf": [
|
|
2054
|
+
{
|
|
2055
|
+
"type": "string"
|
|
2056
|
+
},
|
|
2057
|
+
{
|
|
2058
|
+
"type": "number"
|
|
2059
|
+
},
|
|
2060
|
+
{
|
|
2061
|
+
"type": "boolean"
|
|
2062
|
+
},
|
|
2063
|
+
{
|
|
2064
|
+
"type": "null"
|
|
2065
|
+
}
|
|
2066
|
+
]
|
|
2067
|
+
},
|
|
2068
|
+
"propertyNames": {
|
|
2069
|
+
"type": "string"
|
|
2070
|
+
}
|
|
2071
|
+
},
|
|
2072
|
+
"repetition": {
|
|
2073
|
+
"description": "Repetition number for statistical runs",
|
|
2074
|
+
"type": "integer",
|
|
2075
|
+
"minimum": -9007199254740991,
|
|
2076
|
+
"maximum": 2147483647
|
|
2077
|
+
},
|
|
2078
|
+
"runId": {
|
|
2079
|
+
"description": "Deterministic run ID (hash of inputs)",
|
|
2080
|
+
"type": "string"
|
|
2081
|
+
},
|
|
2082
|
+
"seed": {
|
|
2083
|
+
"description": "Random seed if applicable",
|
|
2084
|
+
"type": "number"
|
|
2085
|
+
},
|
|
2086
|
+
"sut": {
|
|
2087
|
+
"description": "SUT identifier",
|
|
2088
|
+
"type": "string"
|
|
2089
|
+
},
|
|
2090
|
+
"sutRole": {
|
|
2091
|
+
"description": "Role of the SUT in evaluation",
|
|
2092
|
+
"type": "string",
|
|
2093
|
+
"oneOf": [
|
|
2094
|
+
{
|
|
2095
|
+
"description": "The system being evaluated; the novel algorithm or implementation",
|
|
2096
|
+
"const": "primary"
|
|
2097
|
+
},
|
|
2098
|
+
{
|
|
2099
|
+
"description": "A reference implementation for comparison",
|
|
2100
|
+
"const": "baseline"
|
|
2101
|
+
},
|
|
2102
|
+
{
|
|
2103
|
+
"description": "Ground truth provider; defines correct answers",
|
|
2104
|
+
"const": "oracle"
|
|
2105
|
+
}
|
|
2106
|
+
]
|
|
2107
|
+
},
|
|
2108
|
+
"sutVersion": {
|
|
2109
|
+
"description": "SUT version for reproducibility",
|
|
2110
|
+
"type": "string"
|
|
2111
|
+
}
|
|
2112
|
+
},
|
|
2113
|
+
"required": [
|
|
2114
|
+
"caseId",
|
|
2115
|
+
"runId",
|
|
2116
|
+
"sut",
|
|
2117
|
+
"sutRole"
|
|
2118
|
+
],
|
|
2119
|
+
"additionalProperties": false
|
|
2120
|
+
}
|
|
2121
|
+
},
|
|
2122
|
+
"required": [
|
|
2123
|
+
"correctness",
|
|
2124
|
+
"metrics",
|
|
2125
|
+
"outputs",
|
|
2126
|
+
"provenance",
|
|
2127
|
+
"run"
|
|
2128
|
+
],
|
|
2129
|
+
"additionalProperties": false
|
|
2130
|
+
},
|
|
2131
|
+
"ExploratoryEvaluationSummary": {
|
|
2132
|
+
"title": "ExploratoryEvaluationSummary",
|
|
2133
|
+
"description": "Summary of exploratory evaluation results",
|
|
2134
|
+
"type": "object",
|
|
2135
|
+
"properties": {
|
|
2136
|
+
"caseClassEffects": {
|
|
2137
|
+
"description": "Case-class effects",
|
|
2138
|
+
"type": "array",
|
|
2139
|
+
"items": {
|
|
2140
|
+
"title": "CaseClassEffect",
|
|
2141
|
+
"description": "Effect of a case class on SUT performance",
|
|
2142
|
+
"type": "object",
|
|
2143
|
+
"properties": {
|
|
2144
|
+
"caseClass": {
|
|
2145
|
+
"type": "string"
|
|
2146
|
+
},
|
|
2147
|
+
"deviationFromMean": {
|
|
2148
|
+
"type": "number"
|
|
2149
|
+
},
|
|
2150
|
+
"metric": {
|
|
2151
|
+
"type": "string"
|
|
2152
|
+
},
|
|
2153
|
+
"percentageDeviation": {
|
|
2154
|
+
"type": "number"
|
|
2155
|
+
},
|
|
2156
|
+
"significant": {
|
|
2157
|
+
"type": "boolean"
|
|
2158
|
+
},
|
|
2159
|
+
"sut": {
|
|
2160
|
+
"type": "string"
|
|
2161
|
+
}
|
|
2162
|
+
},
|
|
2163
|
+
"required": [
|
|
2164
|
+
"caseClass",
|
|
2165
|
+
"deviationFromMean",
|
|
2166
|
+
"metric",
|
|
2167
|
+
"significant",
|
|
2168
|
+
"sut"
|
|
2169
|
+
],
|
|
2170
|
+
"additionalProperties": false
|
|
2171
|
+
}
|
|
2172
|
+
},
|
|
2173
|
+
"metricCorrelations": {
|
|
2174
|
+
"description": "Metric correlations",
|
|
2175
|
+
"type": "array",
|
|
2176
|
+
"items": {
|
|
2177
|
+
"title": "MetricCorrelation",
|
|
2178
|
+
"description": "Correlation between two metrics",
|
|
2179
|
+
"type": "object",
|
|
2180
|
+
"properties": {
|
|
2181
|
+
"interpretation": {
|
|
2182
|
+
"type": "string"
|
|
2183
|
+
},
|
|
2184
|
+
"metricA": {
|
|
2185
|
+
"type": "string"
|
|
2186
|
+
},
|
|
2187
|
+
"metricB": {
|
|
2188
|
+
"type": "string"
|
|
2189
|
+
},
|
|
2190
|
+
"pearsonR": {
|
|
2191
|
+
"type": "number"
|
|
2192
|
+
},
|
|
2193
|
+
"spearmanRho": {
|
|
2194
|
+
"type": "number"
|
|
2195
|
+
}
|
|
2196
|
+
},
|
|
2197
|
+
"required": [
|
|
2198
|
+
"interpretation",
|
|
2199
|
+
"metricA",
|
|
2200
|
+
"metricB",
|
|
2201
|
+
"pearsonR"
|
|
2202
|
+
],
|
|
2203
|
+
"additionalProperties": false
|
|
2204
|
+
}
|
|
2205
|
+
},
|
|
2206
|
+
"pairwiseComparisons": {
|
|
2207
|
+
"description": "Pairwise comparisons between SUTs",
|
|
2208
|
+
"type": "array",
|
|
2209
|
+
"items": {
|
|
2210
|
+
"title": "PairwiseComparison",
|
|
2211
|
+
"description": "Pairwise comparison between two SUTs",
|
|
2212
|
+
"type": "object",
|
|
2213
|
+
"properties": {
|
|
2214
|
+
"delta": {
|
|
2215
|
+
"type": "number"
|
|
2216
|
+
},
|
|
2217
|
+
"effectSize": {
|
|
2218
|
+
"type": "number"
|
|
2219
|
+
},
|
|
2220
|
+
"metric": {
|
|
2221
|
+
"type": "string"
|
|
2222
|
+
},
|
|
2223
|
+
"pValue": {
|
|
2224
|
+
"type": "number"
|
|
2225
|
+
},
|
|
2226
|
+
"ratio": {
|
|
2227
|
+
"type": "number"
|
|
2228
|
+
},
|
|
2229
|
+
"significant": {
|
|
2230
|
+
"type": "boolean"
|
|
2231
|
+
},
|
|
2232
|
+
"sutA": {
|
|
2233
|
+
"type": "string"
|
|
2234
|
+
},
|
|
2235
|
+
"sutB": {
|
|
2236
|
+
"type": "string"
|
|
2237
|
+
}
|
|
2238
|
+
},
|
|
2239
|
+
"required": [
|
|
2240
|
+
"delta",
|
|
2241
|
+
"metric",
|
|
2242
|
+
"ratio",
|
|
2243
|
+
"significant",
|
|
2244
|
+
"sutA",
|
|
2245
|
+
"sutB"
|
|
2246
|
+
],
|
|
2247
|
+
"additionalProperties": false
|
|
2248
|
+
}
|
|
2249
|
+
},
|
|
2250
|
+
"rankings": {
|
|
2251
|
+
"description": "SUT rankings per metric",
|
|
2252
|
+
"type": "object",
|
|
2253
|
+
"additionalProperties": {
|
|
2254
|
+
"type": "array",
|
|
2255
|
+
"items": {
|
|
2256
|
+
"title": "SutMetricRanking",
|
|
2257
|
+
"description": "Ranking of a SUT for a specific metric",
|
|
2258
|
+
"type": "object",
|
|
2259
|
+
"properties": {
|
|
2260
|
+
"mean": {
|
|
2261
|
+
"type": "number"
|
|
2262
|
+
},
|
|
2263
|
+
"median": {
|
|
2264
|
+
"type": "number"
|
|
2265
|
+
},
|
|
2266
|
+
"n": {
|
|
2267
|
+
"type": "integer",
|
|
2268
|
+
"minimum": -9007199254740991,
|
|
2269
|
+
"maximum": 2147483647
|
|
2270
|
+
},
|
|
2271
|
+
"rank": {
|
|
2272
|
+
"type": "integer",
|
|
2273
|
+
"minimum": -9007199254740991,
|
|
2274
|
+
"maximum": 2147483647
|
|
2275
|
+
},
|
|
2276
|
+
"std": {
|
|
2277
|
+
"type": "number"
|
|
2278
|
+
},
|
|
2279
|
+
"sut": {
|
|
2280
|
+
"type": "string"
|
|
2281
|
+
}
|
|
2282
|
+
},
|
|
2283
|
+
"required": [
|
|
2284
|
+
"mean",
|
|
2285
|
+
"median",
|
|
2286
|
+
"n",
|
|
2287
|
+
"rank",
|
|
2288
|
+
"sut"
|
|
2289
|
+
],
|
|
2290
|
+
"additionalProperties": false
|
|
2291
|
+
}
|
|
2292
|
+
},
|
|
2293
|
+
"propertyNames": {
|
|
2294
|
+
"type": "string"
|
|
2295
|
+
}
|
|
2296
|
+
},
|
|
2297
|
+
"summary": {
|
|
2298
|
+
"type": "object",
|
|
2299
|
+
"properties": {
|
|
2300
|
+
"bestSutPerMetric": {
|
|
2301
|
+
"type": "object",
|
|
2302
|
+
"additionalProperties": {
|
|
2303
|
+
"type": "string"
|
|
2304
|
+
},
|
|
2305
|
+
"propertyNames": {
|
|
2306
|
+
"type": "string"
|
|
2307
|
+
}
|
|
2308
|
+
},
|
|
2309
|
+
"caseClassesAnalyzed": {
|
|
2310
|
+
"type": "integer",
|
|
2311
|
+
"minimum": -9007199254740991,
|
|
2312
|
+
"maximum": 2147483647
|
|
2313
|
+
},
|
|
2314
|
+
"metricsAnalyzed": {
|
|
2315
|
+
"type": "integer",
|
|
2316
|
+
"minimum": -9007199254740991,
|
|
2317
|
+
"maximum": 2147483647
|
|
2318
|
+
},
|
|
2319
|
+
"pairwiseComparisonsCount": {
|
|
2320
|
+
"type": "integer",
|
|
2321
|
+
"minimum": -9007199254740991,
|
|
2322
|
+
"maximum": 2147483647
|
|
2323
|
+
},
|
|
2324
|
+
"significantDifferences": {
|
|
2325
|
+
"type": "integer",
|
|
2326
|
+
"minimum": -9007199254740991,
|
|
2327
|
+
"maximum": 2147483647
|
|
2328
|
+
},
|
|
2329
|
+
"sutsAnalyzed": {
|
|
2330
|
+
"type": "integer",
|
|
2331
|
+
"minimum": -9007199254740991,
|
|
2332
|
+
"maximum": 2147483647
|
|
2333
|
+
}
|
|
2334
|
+
},
|
|
2335
|
+
"required": [
|
|
2336
|
+
"bestSutPerMetric",
|
|
2337
|
+
"metricsAnalyzed",
|
|
2338
|
+
"pairwiseComparisonsCount",
|
|
2339
|
+
"significantDifferences",
|
|
2340
|
+
"sutsAnalyzed"
|
|
2341
|
+
],
|
|
2342
|
+
"additionalProperties": false
|
|
2343
|
+
},
|
|
2344
|
+
"timestamp": {
|
|
2345
|
+
"description": "Generation timestamp",
|
|
2346
|
+
"type": "string"
|
|
2347
|
+
},
|
|
2348
|
+
"version": {
|
|
2349
|
+
"description": "Schema version",
|
|
2350
|
+
"type": "string"
|
|
2351
|
+
}
|
|
2352
|
+
},
|
|
2353
|
+
"required": [
|
|
2354
|
+
"pairwiseComparisons",
|
|
2355
|
+
"rankings",
|
|
2356
|
+
"summary",
|
|
2357
|
+
"timestamp",
|
|
2358
|
+
"version"
|
|
2359
|
+
],
|
|
2360
|
+
"additionalProperties": false
|
|
2361
|
+
},
|
|
2362
|
+
"ExploratoryEvaluatorConfig": {
|
|
2363
|
+
"title": "ExploratoryEvaluatorConfig",
|
|
2364
|
+
"description": "Configuration for the exploratory evaluator",
|
|
2365
|
+
"type": "object",
|
|
2366
|
+
"properties": {
|
|
2367
|
+
"analyzeCaseClassEffects": {
|
|
2368
|
+
"description": "Whether to analyze case-class effects",
|
|
2369
|
+
"type": "boolean"
|
|
2370
|
+
},
|
|
2371
|
+
"computeCorrelations": {
|
|
2372
|
+
"description": "Whether to compute metric correlations",
|
|
2373
|
+
"type": "boolean"
|
|
2374
|
+
},
|
|
2375
|
+
"description": {
|
|
2376
|
+
"description": "Evaluator description",
|
|
2377
|
+
"type": "string"
|
|
2378
|
+
},
|
|
2379
|
+
"metricDirections": {
|
|
2380
|
+
"description": "Metric directions for ranking interpretation",
|
|
2381
|
+
"type": "object",
|
|
2382
|
+
"additionalProperties": {
|
|
2383
|
+
"description": "Metric direction for ranking",
|
|
2384
|
+
"type": "string",
|
|
2385
|
+
"oneOf": [
|
|
2386
|
+
{
|
|
2387
|
+
"description": "Higher values indicate better performance",
|
|
2388
|
+
"const": "higher-better"
|
|
2389
|
+
},
|
|
2390
|
+
{
|
|
2391
|
+
"description": "Lower values indicate better performance",
|
|
2392
|
+
"const": "lower-better"
|
|
2393
|
+
}
|
|
2394
|
+
]
|
|
2395
|
+
},
|
|
2396
|
+
"propertyNames": {
|
|
2397
|
+
"type": "string"
|
|
2398
|
+
}
|
|
2399
|
+
},
|
|
2400
|
+
"metrics": {
|
|
2401
|
+
"description": "Metrics to analyze (all if not specified)",
|
|
2402
|
+
"type": "array",
|
|
2403
|
+
"items": {
|
|
2404
|
+
"type": "string",
|
|
2405
|
+
"minLength": 1
|
|
2406
|
+
}
|
|
2407
|
+
},
|
|
2408
|
+
"minEffectSize": {
|
|
2409
|
+
"description": "Minimum effect size to consider meaningful",
|
|
2410
|
+
"type": "number",
|
|
2411
|
+
"minimum": 0
|
|
2412
|
+
},
|
|
2413
|
+
"name": {
|
|
2414
|
+
"description": "Human-readable evaluator name",
|
|
2415
|
+
"type": "string"
|
|
2416
|
+
},
|
|
2417
|
+
"options": {
|
|
2418
|
+
"description": "Additional evaluator-specific options",
|
|
2419
|
+
"type": "object",
|
|
2420
|
+
"additionalProperties": {},
|
|
2421
|
+
"propertyNames": {
|
|
2422
|
+
"type": "string"
|
|
2423
|
+
}
|
|
2424
|
+
},
|
|
2425
|
+
"significanceLevel": {
|
|
2426
|
+
"description": "Significance level for statistical tests (default: 0.05)",
|
|
2427
|
+
"type": "number",
|
|
2428
|
+
"minimum": 0,
|
|
2429
|
+
"maximum": 1
|
|
2430
|
+
},
|
|
2431
|
+
"suts": {
|
|
2432
|
+
"description": "SUTs to include (all if not specified)",
|
|
2433
|
+
"type": "array",
|
|
2434
|
+
"items": {
|
|
2435
|
+
"type": "string",
|
|
2436
|
+
"minLength": 1
|
|
2437
|
+
}
|
|
2438
|
+
}
|
|
2439
|
+
},
|
|
2440
|
+
"additionalProperties": false,
|
|
2441
|
+
"examples": [
|
|
2442
|
+
{
|
|
2443
|
+
"analyzeCaseClassEffects": true,
|
|
2444
|
+
"computeCorrelations": false,
|
|
2445
|
+
"metricDirections": {
|
|
2446
|
+
"length": "higher-better"
|
|
2447
|
+
},
|
|
2448
|
+
"metrics": [
|
|
2449
|
+
"length"
|
|
2450
|
+
]
|
|
2451
|
+
}
|
|
2452
|
+
]
|
|
2453
|
+
},
|
|
2454
|
+
"MetricsEvaluationSummary": {
|
|
2455
|
+
"title": "MetricsEvaluationSummary",
|
|
2456
|
+
"description": "Summary of metrics evaluation",
|
|
2457
|
+
"type": "object",
|
|
2458
|
+
"properties": {
|
|
2459
|
+
"results": {
|
|
2460
|
+
"description": "Individual criterion results",
|
|
2461
|
+
"type": "array",
|
|
2462
|
+
"items": {
|
|
2463
|
+
"title": "MetricsCriterionResult",
|
|
2464
|
+
"description": "Result of evaluating a single metrics criterion",
|
|
2465
|
+
"type": "object",
|
|
2466
|
+
"properties": {
|
|
2467
|
+
"criterion": {
|
|
2468
|
+
"title": "MetricsCriterionOutput",
|
|
2469
|
+
"description": "A metrics evaluation criterion",
|
|
2470
|
+
"type": "object",
|
|
2471
|
+
"properties": {
|
|
2472
|
+
"baseline": {
|
|
2473
|
+
"type": "object",
|
|
2474
|
+
"properties": {
|
|
2475
|
+
"operator": {
|
|
2476
|
+
"description": "Comparison operator",
|
|
2477
|
+
"type": "string",
|
|
2478
|
+
"oneOf": [
|
|
2479
|
+
{
|
|
2480
|
+
"description": "Greater than",
|
|
2481
|
+
"const": "gt"
|
|
2482
|
+
},
|
|
2483
|
+
{
|
|
2484
|
+
"description": "Greater than or equal to",
|
|
2485
|
+
"const": "gte"
|
|
2486
|
+
},
|
|
2487
|
+
{
|
|
2488
|
+
"description": "Less than",
|
|
2489
|
+
"const": "lt"
|
|
2490
|
+
},
|
|
2491
|
+
{
|
|
2492
|
+
"description": "Less than or equal to",
|
|
2493
|
+
"const": "lte"
|
|
2494
|
+
},
|
|
2495
|
+
{
|
|
2496
|
+
"description": "Equal to",
|
|
2497
|
+
"const": "eq"
|
|
2498
|
+
}
|
|
2499
|
+
]
|
|
2500
|
+
},
|
|
2501
|
+
"sut": {
|
|
2502
|
+
"type": "string"
|
|
2503
|
+
}
|
|
2504
|
+
},
|
|
2505
|
+
"required": [
|
|
2506
|
+
"operator",
|
|
2507
|
+
"sut"
|
|
2508
|
+
],
|
|
2509
|
+
"additionalProperties": false
|
|
2510
|
+
},
|
|
2511
|
+
"criterionId": {
|
|
2512
|
+
"description": "Unique identifier",
|
|
2513
|
+
"type": "string"
|
|
2514
|
+
},
|
|
2515
|
+
"description": {
|
|
2516
|
+
"description": "Human-readable description",
|
|
2517
|
+
"type": "string"
|
|
2518
|
+
},
|
|
2519
|
+
"metric": {
|
|
2520
|
+
"description": "Metric to evaluate",
|
|
2521
|
+
"type": "string"
|
|
2522
|
+
},
|
|
2523
|
+
"scopeConstraints": {
|
|
2524
|
+
"type": "object",
|
|
2525
|
+
"properties": {
|
|
2526
|
+
"caseClass": {
|
|
2527
|
+
"anyOf": [
|
|
2528
|
+
{
|
|
2529
|
+
"type": "string"
|
|
2530
|
+
},
|
|
2531
|
+
{
|
|
2532
|
+
"type": "array",
|
|
2533
|
+
"items": {
|
|
2534
|
+
"type": "string"
|
|
2535
|
+
}
|
|
2536
|
+
}
|
|
2537
|
+
]
|
|
2538
|
+
}
|
|
2539
|
+
},
|
|
2540
|
+
"additionalProperties": false
|
|
2541
|
+
},
|
|
2542
|
+
"sut": {
|
|
2543
|
+
"description": "SUT to evaluate (or \"*\" for all SUTs)",
|
|
2544
|
+
"type": "string"
|
|
2545
|
+
},
|
|
2546
|
+
"tags": {
|
|
2547
|
+
"type": "array",
|
|
2548
|
+
"items": {
|
|
2549
|
+
"type": "string"
|
|
2550
|
+
}
|
|
2551
|
+
},
|
|
2552
|
+
"targetRange": {
|
|
2553
|
+
"type": "object",
|
|
2554
|
+
"properties": {
|
|
2555
|
+
"max": {
|
|
2556
|
+
"type": "number"
|
|
2557
|
+
},
|
|
2558
|
+
"maxInclusive": {
|
|
2559
|
+
"type": "boolean"
|
|
2560
|
+
},
|
|
2561
|
+
"min": {
|
|
2562
|
+
"type": "number"
|
|
2563
|
+
},
|
|
2564
|
+
"minInclusive": {
|
|
2565
|
+
"type": "boolean"
|
|
2566
|
+
}
|
|
2567
|
+
},
|
|
2568
|
+
"additionalProperties": false
|
|
2569
|
+
},
|
|
2570
|
+
"threshold": {
|
|
2571
|
+
"type": "object",
|
|
2572
|
+
"properties": {
|
|
2573
|
+
"operator": {
|
|
2574
|
+
"description": "Comparison operator",
|
|
2575
|
+
"type": "string",
|
|
2576
|
+
"oneOf": [
|
|
2577
|
+
{
|
|
2578
|
+
"description": "Greater than",
|
|
2579
|
+
"const": "gt"
|
|
2580
|
+
},
|
|
2581
|
+
{
|
|
2582
|
+
"description": "Greater than or equal to",
|
|
2583
|
+
"const": "gte"
|
|
2584
|
+
},
|
|
2585
|
+
{
|
|
2586
|
+
"description": "Less than",
|
|
2587
|
+
"const": "lt"
|
|
2588
|
+
},
|
|
2589
|
+
{
|
|
2590
|
+
"description": "Less than or equal to",
|
|
2591
|
+
"const": "lte"
|
|
2592
|
+
},
|
|
2593
|
+
{
|
|
2594
|
+
"description": "Equal to",
|
|
2595
|
+
"const": "eq"
|
|
2596
|
+
}
|
|
2597
|
+
]
|
|
2598
|
+
},
|
|
2599
|
+
"value": {
|
|
2600
|
+
"type": "number"
|
|
2601
|
+
}
|
|
2602
|
+
},
|
|
2603
|
+
"required": [
|
|
2604
|
+
"operator",
|
|
2605
|
+
"value"
|
|
2606
|
+
],
|
|
2607
|
+
"additionalProperties": false
|
|
2608
|
+
},
|
|
2609
|
+
"type": {
|
|
2610
|
+
"description": "Type of metrics criterion",
|
|
2611
|
+
"type": "string",
|
|
2612
|
+
"oneOf": [
|
|
2613
|
+
{
|
|
2614
|
+
"description": "Compare a metric against a fixed threshold value",
|
|
2615
|
+
"const": "threshold"
|
|
2616
|
+
},
|
|
2617
|
+
{
|
|
2618
|
+
"description": "Compare a metric against a baseline SUT",
|
|
2619
|
+
"const": "baseline"
|
|
2620
|
+
},
|
|
2621
|
+
{
|
|
2622
|
+
"description": "Check that a metric falls within a target range",
|
|
2623
|
+
"const": "target-range"
|
|
2624
|
+
}
|
|
2625
|
+
]
|
|
2626
|
+
}
|
|
2627
|
+
},
|
|
2628
|
+
"required": [
|
|
2629
|
+
"criterionId",
|
|
2630
|
+
"description",
|
|
2631
|
+
"metric",
|
|
2632
|
+
"sut",
|
|
2633
|
+
"type"
|
|
2634
|
+
],
|
|
2635
|
+
"additionalProperties": false
|
|
2636
|
+
},
|
|
2637
|
+
"expected": {
|
|
2638
|
+
"type": "object",
|
|
2639
|
+
"properties": {
|
|
2640
|
+
"baselineValue": {
|
|
2641
|
+
"type": "number"
|
|
2642
|
+
},
|
|
2643
|
+
"targetRange": {
|
|
2644
|
+
"type": "object",
|
|
2645
|
+
"properties": {
|
|
2646
|
+
"max": {
|
|
2647
|
+
"type": "number"
|
|
2648
|
+
},
|
|
2649
|
+
"min": {
|
|
2650
|
+
"type": "number"
|
|
2651
|
+
}
|
|
2652
|
+
},
|
|
2653
|
+
"additionalProperties": false
|
|
2654
|
+
},
|
|
2655
|
+
"threshold": {
|
|
2656
|
+
"type": "number"
|
|
2657
|
+
},
|
|
2658
|
+
"type": {
|
|
2659
|
+
"description": "Type of metrics criterion",
|
|
2660
|
+
"type": "string",
|
|
2661
|
+
"oneOf": [
|
|
2662
|
+
{
|
|
2663
|
+
"description": "Compare a metric against a fixed threshold value",
|
|
2664
|
+
"const": "threshold"
|
|
2665
|
+
},
|
|
2666
|
+
{
|
|
2667
|
+
"description": "Compare a metric against a baseline SUT",
|
|
2668
|
+
"const": "baseline"
|
|
2669
|
+
},
|
|
2670
|
+
{
|
|
2671
|
+
"description": "Check that a metric falls within a target range",
|
|
2672
|
+
"const": "target-range"
|
|
2673
|
+
}
|
|
2674
|
+
]
|
|
2675
|
+
}
|
|
2676
|
+
},
|
|
2677
|
+
"required": [
|
|
2678
|
+
"type"
|
|
2679
|
+
],
|
|
2680
|
+
"additionalProperties": false
|
|
2681
|
+
},
|
|
2682
|
+
"inconclusiveReason": {
|
|
2683
|
+
"type": "string"
|
|
2684
|
+
},
|
|
2685
|
+
"observed": {
|
|
2686
|
+
"type": "array",
|
|
2687
|
+
"items": {
|
|
2688
|
+
"type": "object",
|
|
2689
|
+
"properties": {
|
|
2690
|
+
"sut": {
|
|
2691
|
+
"type": "string"
|
|
2692
|
+
},
|
|
2693
|
+
"value": {
|
|
2694
|
+
"type": "number"
|
|
2695
|
+
}
|
|
2696
|
+
},
|
|
2697
|
+
"required": [
|
|
2698
|
+
"sut",
|
|
2699
|
+
"value"
|
|
2700
|
+
],
|
|
2701
|
+
"additionalProperties": false
|
|
2702
|
+
}
|
|
2703
|
+
},
|
|
2704
|
+
"status": {
|
|
2705
|
+
"type": "string",
|
|
2706
|
+
"enum": [
|
|
2707
|
+
"pass",
|
|
2708
|
+
"fail",
|
|
2709
|
+
"inconclusive"
|
|
2710
|
+
]
|
|
2711
|
+
}
|
|
2712
|
+
},
|
|
2713
|
+
"required": [
|
|
2714
|
+
"criterion",
|
|
2715
|
+
"expected",
|
|
2716
|
+
"observed",
|
|
2717
|
+
"status"
|
|
2718
|
+
],
|
|
2719
|
+
"additionalProperties": false
|
|
2720
|
+
}
|
|
2721
|
+
},
|
|
2722
|
+
"summary": {
|
|
2723
|
+
"type": "object",
|
|
2724
|
+
"properties": {
|
|
2725
|
+
"failed": {
|
|
2726
|
+
"description": "Criteria failed",
|
|
2727
|
+
"type": "integer",
|
|
2728
|
+
"minimum": -9007199254740991,
|
|
2729
|
+
"maximum": 2147483647
|
|
2730
|
+
},
|
|
2731
|
+
"inconclusive": {
|
|
2732
|
+
"description": "Criteria inconclusive",
|
|
2733
|
+
"type": "integer",
|
|
2734
|
+
"minimum": -9007199254740991,
|
|
2735
|
+
"maximum": 2147483647
|
|
2736
|
+
},
|
|
2737
|
+
"passed": {
|
|
2738
|
+
"description": "Criteria passed",
|
|
2739
|
+
"type": "integer",
|
|
2740
|
+
"minimum": -9007199254740991,
|
|
2741
|
+
"maximum": 2147483647
|
|
2742
|
+
},
|
|
2743
|
+
"passRate": {
|
|
2744
|
+
"description": "Overall pass rate",
|
|
2745
|
+
"type": "number"
|
|
2746
|
+
},
|
|
2747
|
+
"passRateBySut": {
|
|
2748
|
+
"description": "Pass rate by SUT",
|
|
2749
|
+
"type": "object",
|
|
2750
|
+
"additionalProperties": {
|
|
2751
|
+
"type": "number"
|
|
2752
|
+
},
|
|
2753
|
+
"propertyNames": {
|
|
2754
|
+
"type": "string"
|
|
2755
|
+
}
|
|
2756
|
+
},
|
|
2757
|
+
"total": {
|
|
2758
|
+
"description": "Total criteria evaluated",
|
|
2759
|
+
"type": "integer",
|
|
2760
|
+
"minimum": -9007199254740991,
|
|
2761
|
+
"maximum": 2147483647
|
|
2762
|
+
}
|
|
2763
|
+
},
|
|
2764
|
+
"required": [
|
|
2765
|
+
"failed",
|
|
2766
|
+
"inconclusive",
|
|
2767
|
+
"passed",
|
|
2768
|
+
"passRate",
|
|
2769
|
+
"passRateBySut",
|
|
2770
|
+
"total"
|
|
2771
|
+
],
|
|
2772
|
+
"additionalProperties": false
|
|
2773
|
+
},
|
|
2774
|
+
"timestamp": {
|
|
2775
|
+
"description": "Generation timestamp",
|
|
2776
|
+
"type": "string"
|
|
2777
|
+
},
|
|
2778
|
+
"version": {
|
|
2779
|
+
"description": "Schema version",
|
|
2780
|
+
"type": "string"
|
|
2781
|
+
}
|
|
2782
|
+
},
|
|
2783
|
+
"required": [
|
|
2784
|
+
"results",
|
|
2785
|
+
"summary",
|
|
2786
|
+
"timestamp",
|
|
2787
|
+
"version"
|
|
2788
|
+
],
|
|
2789
|
+
"additionalProperties": false
|
|
2790
|
+
},
|
|
2791
|
+
"MetricsEvaluatorConfig": {
|
|
2792
|
+
"title": "MetricsEvaluatorConfig",
|
|
2793
|
+
"description": "Configuration for the metrics evaluator",
|
|
2794
|
+
"type": "object",
|
|
2795
|
+
"properties": {
|
|
2796
|
+
"criteria": {
|
|
2797
|
+
"description": "Criteria to evaluate",
|
|
449
2798
|
"type": "array",
|
|
450
2799
|
"items": {
|
|
451
|
-
"title": "
|
|
452
|
-
"description": "
|
|
2800
|
+
"title": "MetricsCriterion",
|
|
2801
|
+
"description": "A metrics evaluation criterion",
|
|
453
2802
|
"type": "object",
|
|
2803
|
+
"allOf": [
|
|
2804
|
+
{
|
|
2805
|
+
"if": {
|
|
2806
|
+
"properties": {
|
|
2807
|
+
"type": {
|
|
2808
|
+
"const": "threshold"
|
|
2809
|
+
}
|
|
2810
|
+
},
|
|
2811
|
+
"required": [
|
|
2812
|
+
"type"
|
|
2813
|
+
]
|
|
2814
|
+
},
|
|
2815
|
+
"then": {
|
|
2816
|
+
"required": [
|
|
2817
|
+
"threshold"
|
|
2818
|
+
]
|
|
2819
|
+
}
|
|
2820
|
+
},
|
|
2821
|
+
{
|
|
2822
|
+
"if": {
|
|
2823
|
+
"properties": {
|
|
2824
|
+
"type": {
|
|
2825
|
+
"const": "baseline"
|
|
2826
|
+
}
|
|
2827
|
+
},
|
|
2828
|
+
"required": [
|
|
2829
|
+
"type"
|
|
2830
|
+
]
|
|
2831
|
+
},
|
|
2832
|
+
"then": {
|
|
2833
|
+
"required": [
|
|
2834
|
+
"baseline"
|
|
2835
|
+
]
|
|
2836
|
+
}
|
|
2837
|
+
},
|
|
2838
|
+
{
|
|
2839
|
+
"if": {
|
|
2840
|
+
"properties": {
|
|
2841
|
+
"type": {
|
|
2842
|
+
"const": "target-range"
|
|
2843
|
+
}
|
|
2844
|
+
},
|
|
2845
|
+
"required": [
|
|
2846
|
+
"type"
|
|
2847
|
+
]
|
|
2848
|
+
},
|
|
2849
|
+
"then": {
|
|
2850
|
+
"required": [
|
|
2851
|
+
"targetRange"
|
|
2852
|
+
]
|
|
2853
|
+
}
|
|
2854
|
+
}
|
|
2855
|
+
],
|
|
454
2856
|
"properties": {
|
|
455
2857
|
"baseline": {
|
|
456
|
-
"description": "Baseline
|
|
457
|
-
"type": "
|
|
458
|
-
"
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
2858
|
+
"description": "Baseline comparison (required when type is baseline)",
|
|
2859
|
+
"type": "object",
|
|
2860
|
+
"properties": {
|
|
2861
|
+
"operator": {
|
|
2862
|
+
"description": "Comparison operator",
|
|
2863
|
+
"type": "string",
|
|
2864
|
+
"oneOf": [
|
|
2865
|
+
{
|
|
2866
|
+
"description": "Greater than",
|
|
2867
|
+
"const": "gt"
|
|
2868
|
+
},
|
|
2869
|
+
{
|
|
2870
|
+
"description": "Greater than or equal to",
|
|
2871
|
+
"const": "gte"
|
|
2872
|
+
},
|
|
2873
|
+
{
|
|
2874
|
+
"description": "Less than",
|
|
2875
|
+
"const": "lt"
|
|
2876
|
+
},
|
|
2877
|
+
{
|
|
2878
|
+
"description": "Less than or equal to",
|
|
2879
|
+
"const": "lte"
|
|
2880
|
+
},
|
|
2881
|
+
{
|
|
2882
|
+
"description": "Equal to",
|
|
2883
|
+
"const": "eq"
|
|
2884
|
+
}
|
|
2885
|
+
]
|
|
2886
|
+
},
|
|
2887
|
+
"sut": {
|
|
2888
|
+
"description": "Baseline SUT identifier",
|
|
2889
|
+
"type": "string",
|
|
2890
|
+
"minLength": 1
|
|
2891
|
+
}
|
|
2892
|
+
},
|
|
2893
|
+
"required": [
|
|
2894
|
+
"operator",
|
|
2895
|
+
"sut"
|
|
2896
|
+
],
|
|
2897
|
+
"additionalProperties": false
|
|
463
2898
|
},
|
|
464
|
-
"
|
|
465
|
-
"description": "Unique
|
|
2899
|
+
"criterionId": {
|
|
2900
|
+
"description": "Unique criterion identifier",
|
|
466
2901
|
"type": "string",
|
|
467
2902
|
"minLength": 1
|
|
468
2903
|
},
|
|
469
2904
|
"description": {
|
|
470
|
-
"description": "Human-readable
|
|
2905
|
+
"description": "Human-readable description",
|
|
471
2906
|
"type": "string",
|
|
472
2907
|
"minLength": 1
|
|
473
2908
|
},
|
|
474
|
-
"
|
|
475
|
-
"description": "
|
|
2909
|
+
"metric": {
|
|
2910
|
+
"description": "Metric to evaluate",
|
|
476
2911
|
"type": "string",
|
|
477
|
-
"
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
"
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
2912
|
+
"minLength": 1
|
|
2913
|
+
},
|
|
2914
|
+
"scopeConstraints": {
|
|
2915
|
+
"description": "Optional scope constraints",
|
|
2916
|
+
"type": "object",
|
|
2917
|
+
"properties": {
|
|
2918
|
+
"caseClass": {
|
|
2919
|
+
"description": "Case class filter",
|
|
2920
|
+
"anyOf": [
|
|
2921
|
+
{
|
|
2922
|
+
"type": "string"
|
|
2923
|
+
},
|
|
2924
|
+
{
|
|
2925
|
+
"type": "array",
|
|
2926
|
+
"items": {
|
|
2927
|
+
"type": "string"
|
|
2928
|
+
}
|
|
2929
|
+
}
|
|
2930
|
+
]
|
|
489
2931
|
}
|
|
490
|
-
|
|
2932
|
+
},
|
|
2933
|
+
"additionalProperties": false
|
|
491
2934
|
},
|
|
492
|
-
"
|
|
493
|
-
"description": "
|
|
2935
|
+
"sut": {
|
|
2936
|
+
"description": "SUT to evaluate (or \"*\" for all SUTs)",
|
|
494
2937
|
"type": "string",
|
|
495
2938
|
"minLength": 1
|
|
496
2939
|
},
|
|
497
|
-
"
|
|
498
|
-
"description": "
|
|
499
|
-
"type": "
|
|
500
|
-
"
|
|
2940
|
+
"tags": {
|
|
2941
|
+
"description": "Tags for filtering",
|
|
2942
|
+
"type": "array",
|
|
2943
|
+
"items": {
|
|
2944
|
+
"type": "string"
|
|
2945
|
+
}
|
|
501
2946
|
},
|
|
502
|
-
"
|
|
503
|
-
"description": "
|
|
504
|
-
"type": "
|
|
505
|
-
"
|
|
506
|
-
{
|
|
507
|
-
"description": "
|
|
508
|
-
"
|
|
2947
|
+
"targetRange": {
|
|
2948
|
+
"description": "Target range (required when type is target-range)",
|
|
2949
|
+
"type": "object",
|
|
2950
|
+
"properties": {
|
|
2951
|
+
"max": {
|
|
2952
|
+
"description": "Maximum value",
|
|
2953
|
+
"type": "number"
|
|
509
2954
|
},
|
|
510
|
-
{
|
|
511
|
-
"description": "
|
|
512
|
-
"
|
|
2955
|
+
"maxInclusive": {
|
|
2956
|
+
"description": "Whether max is inclusive",
|
|
2957
|
+
"type": "boolean"
|
|
513
2958
|
},
|
|
514
|
-
{
|
|
515
|
-
"description": "
|
|
516
|
-
"
|
|
2959
|
+
"min": {
|
|
2960
|
+
"description": "Minimum value",
|
|
2961
|
+
"type": "number"
|
|
517
2962
|
},
|
|
518
|
-
{
|
|
519
|
-
"description": "
|
|
520
|
-
"
|
|
2963
|
+
"minInclusive": {
|
|
2964
|
+
"description": "Whether min is inclusive",
|
|
2965
|
+
"type": "boolean"
|
|
521
2966
|
}
|
|
522
|
-
|
|
2967
|
+
},
|
|
2968
|
+
"additionalProperties": false
|
|
523
2969
|
},
|
|
524
|
-
"
|
|
525
|
-
"description": "
|
|
2970
|
+
"threshold": {
|
|
2971
|
+
"description": "Threshold operator and value (required when type is threshold)",
|
|
526
2972
|
"type": "object",
|
|
527
|
-
"
|
|
528
|
-
"
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
},
|
|
552
|
-
{
|
|
553
|
-
"type": "number"
|
|
554
|
-
},
|
|
555
|
-
{
|
|
556
|
-
"type": "boolean"
|
|
557
|
-
},
|
|
558
|
-
{
|
|
559
|
-
"type": "null"
|
|
560
|
-
}
|
|
561
|
-
]
|
|
2973
|
+
"properties": {
|
|
2974
|
+
"operator": {
|
|
2975
|
+
"description": "Comparison operator",
|
|
2976
|
+
"type": "string",
|
|
2977
|
+
"oneOf": [
|
|
2978
|
+
{
|
|
2979
|
+
"description": "Greater than",
|
|
2980
|
+
"const": "gt"
|
|
2981
|
+
},
|
|
2982
|
+
{
|
|
2983
|
+
"description": "Greater than or equal to",
|
|
2984
|
+
"const": "gte"
|
|
2985
|
+
},
|
|
2986
|
+
{
|
|
2987
|
+
"description": "Less than",
|
|
2988
|
+
"const": "lt"
|
|
2989
|
+
},
|
|
2990
|
+
{
|
|
2991
|
+
"description": "Less than or equal to",
|
|
2992
|
+
"const": "lte"
|
|
2993
|
+
},
|
|
2994
|
+
{
|
|
2995
|
+
"description": "Equal to",
|
|
2996
|
+
"const": "eq"
|
|
562
2997
|
}
|
|
563
|
-
|
|
564
|
-
|
|
2998
|
+
]
|
|
2999
|
+
},
|
|
3000
|
+
"value": {
|
|
3001
|
+
"description": "Threshold value",
|
|
3002
|
+
"type": "number"
|
|
3003
|
+
}
|
|
565
3004
|
},
|
|
566
|
-
"
|
|
567
|
-
"
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
"description": "Required significance level (default: 0.05)",
|
|
572
|
-
"type": "number",
|
|
573
|
-
"minimum": 0,
|
|
574
|
-
"maximum": 1
|
|
3005
|
+
"required": [
|
|
3006
|
+
"operator",
|
|
3007
|
+
"value"
|
|
3008
|
+
],
|
|
3009
|
+
"additionalProperties": false
|
|
575
3010
|
},
|
|
576
|
-
"
|
|
577
|
-
"description": "
|
|
3011
|
+
"type": {
|
|
3012
|
+
"description": "Type of metrics criterion",
|
|
578
3013
|
"type": "string",
|
|
579
|
-
"
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
3014
|
+
"oneOf": [
|
|
3015
|
+
{
|
|
3016
|
+
"description": "Compare a metric against a fixed threshold value",
|
|
3017
|
+
"const": "threshold"
|
|
3018
|
+
},
|
|
3019
|
+
{
|
|
3020
|
+
"description": "Compare a metric against a baseline SUT",
|
|
3021
|
+
"const": "baseline"
|
|
3022
|
+
},
|
|
3023
|
+
{
|
|
3024
|
+
"description": "Check that a metric falls within a target range",
|
|
3025
|
+
"const": "target-range"
|
|
3026
|
+
}
|
|
3027
|
+
]
|
|
591
3028
|
}
|
|
592
3029
|
},
|
|
593
3030
|
"required": [
|
|
594
|
-
"
|
|
595
|
-
"claimId",
|
|
3031
|
+
"criterionId",
|
|
596
3032
|
"description",
|
|
597
|
-
"direction",
|
|
598
3033
|
"metric",
|
|
599
|
-
"
|
|
600
|
-
"
|
|
3034
|
+
"sut",
|
|
3035
|
+
"type"
|
|
601
3036
|
],
|
|
602
3037
|
"additionalProperties": false
|
|
603
3038
|
},
|
|
@@ -607,11 +3042,6 @@
|
|
|
607
3042
|
"description": "Evaluator description",
|
|
608
3043
|
"type": "string"
|
|
609
3044
|
},
|
|
610
|
-
"minEffectSize": {
|
|
611
|
-
"description": "Global minimum effect size override",
|
|
612
|
-
"type": "number",
|
|
613
|
-
"minimum": 0
|
|
614
|
-
},
|
|
615
3045
|
"name": {
|
|
616
3046
|
"description": "Human-readable evaluator name",
|
|
617
3047
|
"type": "string"
|
|
@@ -623,473 +3053,779 @@
|
|
|
623
3053
|
"propertyNames": {
|
|
624
3054
|
"type": "string"
|
|
625
3055
|
}
|
|
626
|
-
},
|
|
627
|
-
"significanceLevel": {
|
|
628
|
-
"description": "Global significance level override",
|
|
629
|
-
"type": "number",
|
|
630
|
-
"minimum": 0,
|
|
631
|
-
"maximum": 1
|
|
632
3056
|
}
|
|
633
3057
|
},
|
|
634
3058
|
"required": [
|
|
635
|
-
"
|
|
3059
|
+
"criteria"
|
|
636
3060
|
],
|
|
637
3061
|
"additionalProperties": false,
|
|
638
3062
|
"examples": [
|
|
639
3063
|
{
|
|
640
|
-
"
|
|
3064
|
+
"description": "Evaluate length metric against threshold, baseline, and target-range criteria",
|
|
3065
|
+
"criteria": [
|
|
641
3066
|
{
|
|
642
|
-
"description": "
|
|
643
|
-
"
|
|
644
|
-
"
|
|
645
|
-
"
|
|
3067
|
+
"description": "Measured length should be greater than zero",
|
|
3068
|
+
"type": "threshold",
|
|
3069
|
+
"criterionId": "length-threshold",
|
|
3070
|
+
"metric": "length",
|
|
3071
|
+
"sut": "*",
|
|
3072
|
+
"threshold": {
|
|
3073
|
+
"operator": "gt",
|
|
3074
|
+
"value": 0
|
|
3075
|
+
}
|
|
3076
|
+
},
|
|
3077
|
+
{
|
|
3078
|
+
"description": "Built-in .length should be at least as large as spread operator",
|
|
3079
|
+
"type": "baseline",
|
|
3080
|
+
"baseline": {
|
|
3081
|
+
"operator": "gte",
|
|
3082
|
+
"sut": "spread-length"
|
|
3083
|
+
},
|
|
3084
|
+
"criterionId": "length-baseline",
|
|
646
3085
|
"metric": "length",
|
|
647
|
-
"scope": "global",
|
|
648
3086
|
"sut": "builtin-length"
|
|
3087
|
+
},
|
|
3088
|
+
{
|
|
3089
|
+
"description": "Length should be in reasonable range [1, 100]",
|
|
3090
|
+
"type": "target-range",
|
|
3091
|
+
"criterionId": "length-target-range",
|
|
3092
|
+
"metric": "length",
|
|
3093
|
+
"sut": "*",
|
|
3094
|
+
"targetRange": {
|
|
3095
|
+
"max": 100,
|
|
3096
|
+
"maxInclusive": true,
|
|
3097
|
+
"min": 1,
|
|
3098
|
+
"minInclusive": true
|
|
3099
|
+
}
|
|
649
3100
|
}
|
|
650
3101
|
],
|
|
651
|
-
"
|
|
3102
|
+
"name": "Metrics-Only Evaluation"
|
|
652
3103
|
}
|
|
653
3104
|
]
|
|
654
3105
|
},
|
|
655
|
-
"
|
|
656
|
-
"title": "
|
|
657
|
-
"description": "
|
|
3106
|
+
"Provenance": {
|
|
3107
|
+
"title": "Provenance",
|
|
3108
|
+
"description": "Provenance information for reproducibility",
|
|
658
3109
|
"type": "object",
|
|
659
3110
|
"properties": {
|
|
660
|
-
"
|
|
661
|
-
"description": "
|
|
662
|
-
"type": "string",
|
|
663
|
-
"minLength": 1
|
|
664
|
-
},
|
|
665
|
-
"description": {
|
|
666
|
-
"description": "Evaluator description",
|
|
667
|
-
"type": "string"
|
|
668
|
-
},
|
|
669
|
-
"name": {
|
|
670
|
-
"description": "Human-readable evaluator name",
|
|
3111
|
+
"dependencyLockHash": {
|
|
3112
|
+
"description": "Hash of package-lock.json for dependency pinning",
|
|
671
3113
|
"type": "string"
|
|
672
3114
|
},
|
|
673
|
-
"
|
|
674
|
-
"description": "
|
|
675
|
-
"type": "object",
|
|
676
|
-
"additionalProperties": {},
|
|
677
|
-
"propertyNames": {
|
|
678
|
-
"type": "string"
|
|
679
|
-
}
|
|
680
|
-
}
|
|
681
|
-
},
|
|
682
|
-
"required": [
|
|
683
|
-
"customType"
|
|
684
|
-
],
|
|
685
|
-
"additionalProperties": {}
|
|
686
|
-
},
|
|
687
|
-
"ExploratoryEvaluatorConfig": {
|
|
688
|
-
"title": "ExploratoryEvaluatorConfig",
|
|
689
|
-
"description": "Configuration for the exploratory evaluator",
|
|
690
|
-
"type": "object",
|
|
691
|
-
"properties": {
|
|
692
|
-
"analyzeCaseClassEffects": {
|
|
693
|
-
"description": "Whether to analyze case-class effects",
|
|
3115
|
+
"dirty": {
|
|
3116
|
+
"description": "Whether working directory had uncommitted changes",
|
|
694
3117
|
"type": "boolean"
|
|
695
3118
|
},
|
|
696
|
-
"
|
|
697
|
-
"description": "
|
|
698
|
-
"type": "
|
|
3119
|
+
"executionTimeMs": {
|
|
3120
|
+
"description": "Wall-clock execution time in milliseconds",
|
|
3121
|
+
"type": "number"
|
|
699
3122
|
},
|
|
700
|
-
"
|
|
701
|
-
"description": "
|
|
702
|
-
"type": "
|
|
3123
|
+
"finalMemoryBytes": {
|
|
3124
|
+
"description": "Memory usage at completion (bytes)",
|
|
3125
|
+
"type": "number"
|
|
703
3126
|
},
|
|
704
|
-
"
|
|
705
|
-
"description": "
|
|
706
|
-
"type": "
|
|
707
|
-
"additionalProperties": {
|
|
708
|
-
"description": "Metric direction for ranking",
|
|
709
|
-
"type": "string",
|
|
710
|
-
"oneOf": [
|
|
711
|
-
{
|
|
712
|
-
"description": "Higher values indicate better performance",
|
|
713
|
-
"const": "higher-better"
|
|
714
|
-
},
|
|
715
|
-
{
|
|
716
|
-
"description": "Lower values indicate better performance",
|
|
717
|
-
"const": "lower-better"
|
|
718
|
-
}
|
|
719
|
-
]
|
|
720
|
-
},
|
|
721
|
-
"propertyNames": {
|
|
722
|
-
"type": "string"
|
|
723
|
-
}
|
|
3127
|
+
"gitCommit": {
|
|
3128
|
+
"description": "Git commit hash",
|
|
3129
|
+
"type": "string"
|
|
724
3130
|
},
|
|
725
|
-
"
|
|
726
|
-
"description": "
|
|
3131
|
+
"parentRunIds": {
|
|
3132
|
+
"description": "Parent run IDs (for derived results)",
|
|
727
3133
|
"type": "array",
|
|
728
3134
|
"items": {
|
|
729
|
-
"type": "string"
|
|
730
|
-
"minLength": 1
|
|
3135
|
+
"type": "string"
|
|
731
3136
|
}
|
|
732
3137
|
},
|
|
733
|
-
"
|
|
734
|
-
"description": "
|
|
735
|
-
"type": "number"
|
|
736
|
-
"minimum": 0
|
|
737
|
-
},
|
|
738
|
-
"name": {
|
|
739
|
-
"description": "Human-readable evaluator name",
|
|
740
|
-
"type": "string"
|
|
3138
|
+
"peakMemoryBytes": {
|
|
3139
|
+
"description": "Peak memory usage during execution (bytes)",
|
|
3140
|
+
"type": "number"
|
|
741
3141
|
},
|
|
742
|
-
"
|
|
743
|
-
"description": "
|
|
3142
|
+
"runtime": {
|
|
3143
|
+
"description": "Execution environment (platform and arch required; additional fields are language-specific)",
|
|
744
3144
|
"type": "object",
|
|
745
|
-
"
|
|
746
|
-
|
|
3145
|
+
"properties": {
|
|
3146
|
+
"arch": {
|
|
3147
|
+
"description": "CPU architecture",
|
|
3148
|
+
"type": "string"
|
|
3149
|
+
},
|
|
3150
|
+
"platform": {
|
|
3151
|
+
"description": "Operating system platform",
|
|
3152
|
+
"type": "string"
|
|
3153
|
+
}
|
|
3154
|
+
},
|
|
3155
|
+
"required": [
|
|
3156
|
+
"arch",
|
|
3157
|
+
"platform"
|
|
3158
|
+
],
|
|
3159
|
+
"additionalProperties": {
|
|
747
3160
|
"type": "string"
|
|
748
3161
|
}
|
|
749
3162
|
},
|
|
750
|
-
"
|
|
751
|
-
"description": "
|
|
752
|
-
"type": "
|
|
753
|
-
"minimum": 0,
|
|
754
|
-
"maximum": 1
|
|
755
|
-
},
|
|
756
|
-
"suts": {
|
|
757
|
-
"description": "SUTs to include (all if not specified)",
|
|
758
|
-
"type": "array",
|
|
759
|
-
"items": {
|
|
760
|
-
"type": "string",
|
|
761
|
-
"minLength": 1
|
|
762
|
-
}
|
|
3163
|
+
"timestamp": {
|
|
3164
|
+
"description": "Execution timestamp",
|
|
3165
|
+
"type": "string"
|
|
763
3166
|
}
|
|
764
3167
|
},
|
|
765
|
-
"
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
"computeCorrelations": false,
|
|
770
|
-
"metricDirections": {
|
|
771
|
-
"length": "higher-better"
|
|
772
|
-
},
|
|
773
|
-
"metrics": [
|
|
774
|
-
"length"
|
|
775
|
-
]
|
|
776
|
-
}
|
|
777
|
-
]
|
|
3168
|
+
"required": [
|
|
3169
|
+
"runtime"
|
|
3170
|
+
],
|
|
3171
|
+
"additionalProperties": false
|
|
778
3172
|
},
|
|
779
|
-
"
|
|
780
|
-
"title": "
|
|
781
|
-
"description": "
|
|
3173
|
+
"ResultBatch": {
|
|
3174
|
+
"title": "ResultBatch",
|
|
3175
|
+
"description": "Batch of evaluation results",
|
|
782
3176
|
"type": "object",
|
|
783
3177
|
"properties": {
|
|
784
|
-
"
|
|
785
|
-
"description": "
|
|
786
|
-
"type": "
|
|
787
|
-
"
|
|
788
|
-
"
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
3178
|
+
"metadata": {
|
|
3179
|
+
"description": "Optional batch-level metadata",
|
|
3180
|
+
"type": "object",
|
|
3181
|
+
"additionalProperties": {
|
|
3182
|
+
"anyOf": [
|
|
3183
|
+
{
|
|
3184
|
+
"type": "string"
|
|
3185
|
+
},
|
|
792
3186
|
{
|
|
793
|
-
"
|
|
794
|
-
"properties": {
|
|
795
|
-
"type": {
|
|
796
|
-
"const": "threshold"
|
|
797
|
-
}
|
|
798
|
-
},
|
|
799
|
-
"required": [
|
|
800
|
-
"type"
|
|
801
|
-
]
|
|
802
|
-
},
|
|
803
|
-
"then": {
|
|
804
|
-
"required": [
|
|
805
|
-
"threshold"
|
|
806
|
-
]
|
|
807
|
-
}
|
|
3187
|
+
"type": "number"
|
|
808
3188
|
},
|
|
809
3189
|
{
|
|
810
|
-
"
|
|
811
|
-
"properties": {
|
|
812
|
-
"type": {
|
|
813
|
-
"const": "baseline"
|
|
814
|
-
}
|
|
815
|
-
},
|
|
816
|
-
"required": [
|
|
817
|
-
"type"
|
|
818
|
-
]
|
|
819
|
-
},
|
|
820
|
-
"then": {
|
|
821
|
-
"required": [
|
|
822
|
-
"baseline"
|
|
823
|
-
]
|
|
824
|
-
}
|
|
3190
|
+
"type": "boolean"
|
|
825
3191
|
},
|
|
826
3192
|
{
|
|
827
|
-
"
|
|
828
|
-
"properties": {
|
|
829
|
-
"type": {
|
|
830
|
-
"const": "target-range"
|
|
831
|
-
}
|
|
832
|
-
},
|
|
833
|
-
"required": [
|
|
834
|
-
"type"
|
|
835
|
-
]
|
|
836
|
-
},
|
|
837
|
-
"then": {
|
|
838
|
-
"required": [
|
|
839
|
-
"targetRange"
|
|
840
|
-
]
|
|
841
|
-
}
|
|
3193
|
+
"type": "null"
|
|
842
3194
|
}
|
|
843
|
-
]
|
|
3195
|
+
]
|
|
3196
|
+
},
|
|
3197
|
+
"propertyNames": {
|
|
3198
|
+
"type": "string"
|
|
3199
|
+
}
|
|
3200
|
+
},
|
|
3201
|
+
"results": {
|
|
3202
|
+
"description": "All results in this batch",
|
|
3203
|
+
"type": "array",
|
|
3204
|
+
"items": {
|
|
3205
|
+
"title": "EvaluationResult",
|
|
3206
|
+
"description": "Complete evaluation result",
|
|
3207
|
+
"type": "object",
|
|
844
3208
|
"properties": {
|
|
845
|
-
"
|
|
846
|
-
"
|
|
3209
|
+
"correctness": {
|
|
3210
|
+
"title": "CorrectnessResult",
|
|
3211
|
+
"description": "Correctness assessment",
|
|
847
3212
|
"type": "object",
|
|
848
3213
|
"properties": {
|
|
849
|
-
"
|
|
850
|
-
"description": "
|
|
3214
|
+
"expectedExists": {
|
|
3215
|
+
"description": "Whether expected output exists (oracle available)",
|
|
3216
|
+
"type": "boolean"
|
|
3217
|
+
},
|
|
3218
|
+
"failureType": {
|
|
3219
|
+
"description": "Failure classification if applicable",
|
|
851
3220
|
"type": "string",
|
|
852
|
-
"
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
},
|
|
3221
|
+
"enum": [
|
|
3222
|
+
"no_output",
|
|
3223
|
+
"invalid_structure",
|
|
3224
|
+
"constraint_violation",
|
|
3225
|
+
"exception",
|
|
3226
|
+
"oracle_mismatch",
|
|
3227
|
+
"timeout"
|
|
3228
|
+
]
|
|
3229
|
+
},
|
|
3230
|
+
"matchesExpected": {
|
|
3231
|
+
"description": "Whether output matches expected (null if no oracle)",
|
|
3232
|
+
"anyOf": [
|
|
865
3233
|
{
|
|
866
|
-
"
|
|
867
|
-
"const": "lte"
|
|
3234
|
+
"type": "boolean"
|
|
868
3235
|
},
|
|
869
3236
|
{
|
|
870
|
-
"
|
|
871
|
-
"const": "eq"
|
|
3237
|
+
"type": "null"
|
|
872
3238
|
}
|
|
873
3239
|
]
|
|
874
3240
|
},
|
|
875
|
-
"
|
|
876
|
-
"description": "
|
|
877
|
-
"type": "
|
|
878
|
-
"
|
|
3241
|
+
"notes": {
|
|
3242
|
+
"description": "Human-readable failure notes",
|
|
3243
|
+
"type": "array",
|
|
3244
|
+
"items": {
|
|
3245
|
+
"type": "string"
|
|
3246
|
+
}
|
|
3247
|
+
},
|
|
3248
|
+
"producedOutput": {
|
|
3249
|
+
"description": "Whether the SUT produced any output",
|
|
3250
|
+
"type": "boolean"
|
|
3251
|
+
},
|
|
3252
|
+
"valid": {
|
|
3253
|
+
"description": "Whether output is structurally valid",
|
|
3254
|
+
"type": "boolean"
|
|
879
3255
|
}
|
|
880
3256
|
},
|
|
881
3257
|
"required": [
|
|
882
|
-
"
|
|
883
|
-
"
|
|
3258
|
+
"expectedExists",
|
|
3259
|
+
"matchesExpected",
|
|
3260
|
+
"producedOutput",
|
|
3261
|
+
"valid"
|
|
884
3262
|
],
|
|
885
3263
|
"additionalProperties": false
|
|
886
3264
|
},
|
|
887
|
-
"
|
|
888
|
-
"description": "
|
|
889
|
-
"type": "string"
|
|
890
|
-
"minLength": 1
|
|
891
|
-
},
|
|
892
|
-
"description": {
|
|
893
|
-
"description": "Human-readable description",
|
|
894
|
-
"type": "string",
|
|
895
|
-
"minLength": 1
|
|
896
|
-
},
|
|
897
|
-
"metric": {
|
|
898
|
-
"description": "Metric to evaluate",
|
|
899
|
-
"type": "string",
|
|
900
|
-
"minLength": 1
|
|
3265
|
+
"error": {
|
|
3266
|
+
"description": "Error message if the run failed",
|
|
3267
|
+
"type": "string"
|
|
901
3268
|
},
|
|
902
|
-
"
|
|
903
|
-
"
|
|
3269
|
+
"metrics": {
|
|
3270
|
+
"title": "ResultMetrics",
|
|
3271
|
+
"description": "Numeric metrics",
|
|
904
3272
|
"type": "object",
|
|
905
3273
|
"properties": {
|
|
906
|
-
"
|
|
907
|
-
"description": "
|
|
908
|
-
"
|
|
909
|
-
|
|
3274
|
+
"extra": {
|
|
3275
|
+
"description": "Additional metrics (overflow)",
|
|
3276
|
+
"type": "object",
|
|
3277
|
+
"additionalProperties": {
|
|
3278
|
+
"type": "number"
|
|
3279
|
+
},
|
|
3280
|
+
"propertyNames": {
|
|
3281
|
+
"type": "string"
|
|
3282
|
+
}
|
|
3283
|
+
},
|
|
3284
|
+
"numeric": {
|
|
3285
|
+
"description": "Primary numeric metrics",
|
|
3286
|
+
"type": "object",
|
|
3287
|
+
"additionalProperties": {
|
|
3288
|
+
"type": "number"
|
|
3289
|
+
},
|
|
3290
|
+
"propertyNames": {
|
|
3291
|
+
"type": "string"
|
|
3292
|
+
}
|
|
3293
|
+
}
|
|
3294
|
+
},
|
|
3295
|
+
"required": [
|
|
3296
|
+
"numeric"
|
|
3297
|
+
],
|
|
3298
|
+
"additionalProperties": {
|
|
3299
|
+
"anyOf": [
|
|
3300
|
+
{
|
|
3301
|
+
"type": "number"
|
|
3302
|
+
},
|
|
3303
|
+
{
|
|
3304
|
+
"type": "object",
|
|
3305
|
+
"additionalProperties": {
|
|
3306
|
+
"type": "number"
|
|
3307
|
+
},
|
|
3308
|
+
"propertyNames": {
|
|
910
3309
|
"type": "string"
|
|
3310
|
+
}
|
|
3311
|
+
}
|
|
3312
|
+
]
|
|
3313
|
+
}
|
|
3314
|
+
},
|
|
3315
|
+
"outputs": {
|
|
3316
|
+
"title": "ResultOutputs",
|
|
3317
|
+
"description": "Output artefacts and summaries",
|
|
3318
|
+
"type": "object",
|
|
3319
|
+
"properties": {
|
|
3320
|
+
"artefacts": {
|
|
3321
|
+
"description": "References to generated artefacts",
|
|
3322
|
+
"type": "array",
|
|
3323
|
+
"items": {
|
|
3324
|
+
"title": "ArtefactReference",
|
|
3325
|
+
"description": "Reference to an external artefact",
|
|
3326
|
+
"type": "object",
|
|
3327
|
+
"properties": {
|
|
3328
|
+
"hash": {
|
|
3329
|
+
"type": "string"
|
|
3330
|
+
},
|
|
3331
|
+
"metadata": {
|
|
3332
|
+
"type": "object",
|
|
3333
|
+
"additionalProperties": {
|
|
3334
|
+
"anyOf": [
|
|
3335
|
+
{
|
|
3336
|
+
"type": "string"
|
|
3337
|
+
},
|
|
3338
|
+
{
|
|
3339
|
+
"type": "number"
|
|
3340
|
+
},
|
|
3341
|
+
{
|
|
3342
|
+
"type": "boolean"
|
|
3343
|
+
},
|
|
3344
|
+
{
|
|
3345
|
+
"type": "null"
|
|
3346
|
+
}
|
|
3347
|
+
]
|
|
3348
|
+
},
|
|
3349
|
+
"propertyNames": {
|
|
3350
|
+
"type": "string"
|
|
3351
|
+
}
|
|
3352
|
+
},
|
|
3353
|
+
"type": {
|
|
3354
|
+
"type": "string",
|
|
3355
|
+
"enum": [
|
|
3356
|
+
"graph",
|
|
3357
|
+
"path-set",
|
|
3358
|
+
"subgraph",
|
|
3359
|
+
"embedding",
|
|
3360
|
+
"other"
|
|
3361
|
+
]
|
|
3362
|
+
},
|
|
3363
|
+
"uri": {
|
|
3364
|
+
"type": "string"
|
|
3365
|
+
}
|
|
911
3366
|
},
|
|
912
|
-
|
|
913
|
-
"type"
|
|
914
|
-
"
|
|
3367
|
+
"required": [
|
|
3368
|
+
"type",
|
|
3369
|
+
"uri"
|
|
3370
|
+
],
|
|
3371
|
+
"additionalProperties": false
|
|
3372
|
+
}
|
|
3373
|
+
},
|
|
3374
|
+
"extra": {
|
|
3375
|
+
"description": "Additional untyped outputs",
|
|
3376
|
+
"type": "object",
|
|
3377
|
+
"additionalProperties": {},
|
|
3378
|
+
"propertyNames": {
|
|
3379
|
+
"type": "string"
|
|
3380
|
+
}
|
|
3381
|
+
},
|
|
3382
|
+
"labels": {
|
|
3383
|
+
"description": "Classification labels",
|
|
3384
|
+
"type": "object",
|
|
3385
|
+
"additionalProperties": {
|
|
3386
|
+
"anyOf": [
|
|
3387
|
+
{
|
|
3388
|
+
"type": "string"
|
|
3389
|
+
},
|
|
3390
|
+
{
|
|
3391
|
+
"type": "number"
|
|
3392
|
+
},
|
|
3393
|
+
{
|
|
3394
|
+
"type": "boolean"
|
|
3395
|
+
},
|
|
3396
|
+
{
|
|
3397
|
+
"type": "null"
|
|
3398
|
+
}
|
|
3399
|
+
]
|
|
3400
|
+
},
|
|
3401
|
+
"propertyNames": {
|
|
3402
|
+
"type": "string"
|
|
3403
|
+
}
|
|
3404
|
+
},
|
|
3405
|
+
"ranking": {
|
|
3406
|
+
"description": "Ranking results",
|
|
3407
|
+
"type": "array",
|
|
3408
|
+
"items": {
|
|
3409
|
+
"title": "RankedItem",
|
|
3410
|
+
"description": "A ranked item for ranking tasks",
|
|
3411
|
+
"type": "object",
|
|
3412
|
+
"properties": {
|
|
3413
|
+
"itemId": {
|
|
3414
|
+
"description": "Item identifier",
|
|
915
3415
|
"type": "string"
|
|
3416
|
+
},
|
|
3417
|
+
"metadata": {
|
|
3418
|
+
"description": "Optional additional metadata",
|
|
3419
|
+
"type": "object",
|
|
3420
|
+
"additionalProperties": {
|
|
3421
|
+
"anyOf": [
|
|
3422
|
+
{
|
|
3423
|
+
"type": "string"
|
|
3424
|
+
},
|
|
3425
|
+
{
|
|
3426
|
+
"type": "number"
|
|
3427
|
+
},
|
|
3428
|
+
{
|
|
3429
|
+
"type": "boolean"
|
|
3430
|
+
},
|
|
3431
|
+
{
|
|
3432
|
+
"type": "null"
|
|
3433
|
+
}
|
|
3434
|
+
]
|
|
3435
|
+
},
|
|
3436
|
+
"propertyNames": {
|
|
3437
|
+
"type": "string"
|
|
3438
|
+
}
|
|
3439
|
+
},
|
|
3440
|
+
"score": {
|
|
3441
|
+
"description": "Score or rank value",
|
|
3442
|
+
"type": "number"
|
|
916
3443
|
}
|
|
917
|
-
}
|
|
918
|
-
|
|
3444
|
+
},
|
|
3445
|
+
"required": [
|
|
3446
|
+
"itemId",
|
|
3447
|
+
"score"
|
|
3448
|
+
],
|
|
3449
|
+
"additionalProperties": false
|
|
3450
|
+
}
|
|
3451
|
+
},
|
|
3452
|
+
"summary": {
|
|
3453
|
+
"description": "Scalar summary values",
|
|
3454
|
+
"type": "object",
|
|
3455
|
+
"additionalProperties": {
|
|
3456
|
+
"anyOf": [
|
|
3457
|
+
{
|
|
3458
|
+
"anyOf": [
|
|
3459
|
+
{
|
|
3460
|
+
"type": "string"
|
|
3461
|
+
},
|
|
3462
|
+
{
|
|
3463
|
+
"type": "number"
|
|
3464
|
+
},
|
|
3465
|
+
{
|
|
3466
|
+
"type": "boolean"
|
|
3467
|
+
},
|
|
3468
|
+
{
|
|
3469
|
+
"type": "null"
|
|
3470
|
+
}
|
|
3471
|
+
]
|
|
3472
|
+
},
|
|
3473
|
+
{
|
|
3474
|
+
"type": "array",
|
|
3475
|
+
"items": {
|
|
3476
|
+
"anyOf": [
|
|
3477
|
+
{
|
|
3478
|
+
"type": "string"
|
|
3479
|
+
},
|
|
3480
|
+
{
|
|
3481
|
+
"type": "number"
|
|
3482
|
+
},
|
|
3483
|
+
{
|
|
3484
|
+
"type": "boolean"
|
|
3485
|
+
},
|
|
3486
|
+
{
|
|
3487
|
+
"type": "null"
|
|
3488
|
+
}
|
|
3489
|
+
]
|
|
3490
|
+
}
|
|
3491
|
+
}
|
|
3492
|
+
]
|
|
3493
|
+
},
|
|
3494
|
+
"propertyNames": {
|
|
3495
|
+
"type": "string"
|
|
3496
|
+
}
|
|
919
3497
|
}
|
|
920
3498
|
},
|
|
921
3499
|
"additionalProperties": false
|
|
922
3500
|
},
|
|
923
|
-
"
|
|
924
|
-
"
|
|
925
|
-
"
|
|
926
|
-
"minLength": 1
|
|
927
|
-
},
|
|
928
|
-
"tags": {
|
|
929
|
-
"description": "Tags for filtering",
|
|
930
|
-
"type": "array",
|
|
931
|
-
"items": {
|
|
932
|
-
"type": "string"
|
|
933
|
-
}
|
|
934
|
-
},
|
|
935
|
-
"targetRange": {
|
|
936
|
-
"description": "Target range (required when type is target-range)",
|
|
3501
|
+
"provenance": {
|
|
3502
|
+
"title": "Provenance",
|
|
3503
|
+
"description": "Provenance for reproducibility",
|
|
937
3504
|
"type": "object",
|
|
938
3505
|
"properties": {
|
|
939
|
-
"
|
|
940
|
-
"description": "
|
|
941
|
-
"type": "
|
|
3506
|
+
"dependencyLockHash": {
|
|
3507
|
+
"description": "Hash of package-lock.json for dependency pinning",
|
|
3508
|
+
"type": "string"
|
|
942
3509
|
},
|
|
943
|
-
"
|
|
944
|
-
"description": "Whether
|
|
3510
|
+
"dirty": {
|
|
3511
|
+
"description": "Whether working directory had uncommitted changes",
|
|
945
3512
|
"type": "boolean"
|
|
946
3513
|
},
|
|
947
|
-
"
|
|
948
|
-
"description": "
|
|
3514
|
+
"executionTimeMs": {
|
|
3515
|
+
"description": "Wall-clock execution time in milliseconds",
|
|
3516
|
+
"type": "number"
|
|
3517
|
+
},
|
|
3518
|
+
"finalMemoryBytes": {
|
|
3519
|
+
"description": "Memory usage at completion (bytes)",
|
|
3520
|
+
"type": "number"
|
|
3521
|
+
},
|
|
3522
|
+
"gitCommit": {
|
|
3523
|
+
"description": "Git commit hash",
|
|
3524
|
+
"type": "string"
|
|
3525
|
+
},
|
|
3526
|
+
"parentRunIds": {
|
|
3527
|
+
"description": "Parent run IDs (for derived results)",
|
|
3528
|
+
"type": "array",
|
|
3529
|
+
"items": {
|
|
3530
|
+
"type": "string"
|
|
3531
|
+
}
|
|
3532
|
+
},
|
|
3533
|
+
"peakMemoryBytes": {
|
|
3534
|
+
"description": "Peak memory usage during execution (bytes)",
|
|
3535
|
+
"type": "number"
|
|
3536
|
+
},
|
|
3537
|
+
"runtime": {
|
|
3538
|
+
"description": "Execution environment (platform and arch required; additional fields are language-specific)",
|
|
3539
|
+
"type": "object",
|
|
3540
|
+
"properties": {
|
|
3541
|
+
"arch": {
|
|
3542
|
+
"description": "CPU architecture",
|
|
3543
|
+
"type": "string"
|
|
3544
|
+
},
|
|
3545
|
+
"platform": {
|
|
3546
|
+
"description": "Operating system platform",
|
|
3547
|
+
"type": "string"
|
|
3548
|
+
}
|
|
3549
|
+
},
|
|
3550
|
+
"required": [
|
|
3551
|
+
"arch",
|
|
3552
|
+
"platform"
|
|
3553
|
+
],
|
|
3554
|
+
"additionalProperties": {
|
|
3555
|
+
"type": "string"
|
|
3556
|
+
}
|
|
3557
|
+
},
|
|
3558
|
+
"timestamp": {
|
|
3559
|
+
"description": "Execution timestamp",
|
|
3560
|
+
"type": "string"
|
|
3561
|
+
}
|
|
3562
|
+
},
|
|
3563
|
+
"required": [
|
|
3564
|
+
"runtime"
|
|
3565
|
+
],
|
|
3566
|
+
"additionalProperties": false
|
|
3567
|
+
},
|
|
3568
|
+
"run": {
|
|
3569
|
+
"title": "RunContext",
|
|
3570
|
+
"description": "Run identity and context",
|
|
3571
|
+
"type": "object",
|
|
3572
|
+
"properties": {
|
|
3573
|
+
"caseClass": {
|
|
3574
|
+
"description": "Case class for grouping",
|
|
3575
|
+
"type": "string"
|
|
3576
|
+
},
|
|
3577
|
+
"caseId": {
|
|
3578
|
+
"description": "Case identifier",
|
|
3579
|
+
"type": "string"
|
|
3580
|
+
},
|
|
3581
|
+
"config": {
|
|
3582
|
+
"description": "Configuration overrides for this run",
|
|
3583
|
+
"type": "object",
|
|
3584
|
+
"additionalProperties": {
|
|
3585
|
+
"anyOf": [
|
|
3586
|
+
{
|
|
3587
|
+
"type": "string"
|
|
3588
|
+
},
|
|
3589
|
+
{
|
|
3590
|
+
"type": "number"
|
|
3591
|
+
},
|
|
3592
|
+
{
|
|
3593
|
+
"type": "boolean"
|
|
3594
|
+
},
|
|
3595
|
+
{
|
|
3596
|
+
"type": "null"
|
|
3597
|
+
}
|
|
3598
|
+
]
|
|
3599
|
+
},
|
|
3600
|
+
"propertyNames": {
|
|
3601
|
+
"type": "string"
|
|
3602
|
+
}
|
|
3603
|
+
},
|
|
3604
|
+
"repetition": {
|
|
3605
|
+
"description": "Repetition number for statistical runs",
|
|
3606
|
+
"type": "integer",
|
|
3607
|
+
"minimum": -9007199254740991,
|
|
3608
|
+
"maximum": 2147483647
|
|
3609
|
+
},
|
|
3610
|
+
"runId": {
|
|
3611
|
+
"description": "Deterministic run ID (hash of inputs)",
|
|
3612
|
+
"type": "string"
|
|
3613
|
+
},
|
|
3614
|
+
"seed": {
|
|
3615
|
+
"description": "Random seed if applicable",
|
|
949
3616
|
"type": "number"
|
|
950
3617
|
},
|
|
951
|
-
"
|
|
952
|
-
"description": "
|
|
953
|
-
"type": "
|
|
954
|
-
}
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
},
|
|
958
|
-
"threshold": {
|
|
959
|
-
"description": "Threshold operator and value (required when type is threshold)",
|
|
960
|
-
"type": "object",
|
|
961
|
-
"properties": {
|
|
962
|
-
"operator": {
|
|
963
|
-
"description": "Comparison operator",
|
|
3618
|
+
"sut": {
|
|
3619
|
+
"description": "SUT identifier",
|
|
3620
|
+
"type": "string"
|
|
3621
|
+
},
|
|
3622
|
+
"sutRole": {
|
|
3623
|
+
"description": "Role of the SUT in evaluation",
|
|
964
3624
|
"type": "string",
|
|
965
3625
|
"oneOf": [
|
|
966
3626
|
{
|
|
967
|
-
"description": "
|
|
968
|
-
"const": "
|
|
969
|
-
},
|
|
970
|
-
{
|
|
971
|
-
"description": "Greater than or equal to",
|
|
972
|
-
"const": "gte"
|
|
973
|
-
},
|
|
974
|
-
{
|
|
975
|
-
"description": "Less than",
|
|
976
|
-
"const": "lt"
|
|
3627
|
+
"description": "The system being evaluated; the novel algorithm or implementation",
|
|
3628
|
+
"const": "primary"
|
|
977
3629
|
},
|
|
978
3630
|
{
|
|
979
|
-
"description": "
|
|
980
|
-
"const": "
|
|
3631
|
+
"description": "A reference implementation for comparison",
|
|
3632
|
+
"const": "baseline"
|
|
981
3633
|
},
|
|
982
3634
|
{
|
|
983
|
-
"description": "
|
|
984
|
-
"const": "
|
|
3635
|
+
"description": "Ground truth provider; defines correct answers",
|
|
3636
|
+
"const": "oracle"
|
|
985
3637
|
}
|
|
986
3638
|
]
|
|
987
3639
|
},
|
|
988
|
-
"
|
|
989
|
-
"description": "
|
|
990
|
-
"type": "
|
|
3640
|
+
"sutVersion": {
|
|
3641
|
+
"description": "SUT version for reproducibility",
|
|
3642
|
+
"type": "string"
|
|
991
3643
|
}
|
|
992
3644
|
},
|
|
993
3645
|
"required": [
|
|
994
|
-
"
|
|
995
|
-
"
|
|
3646
|
+
"caseId",
|
|
3647
|
+
"runId",
|
|
3648
|
+
"sut",
|
|
3649
|
+
"sutRole"
|
|
996
3650
|
],
|
|
997
3651
|
"additionalProperties": false
|
|
998
|
-
},
|
|
999
|
-
"type": {
|
|
1000
|
-
"description": "Type of metrics criterion",
|
|
1001
|
-
"type": "string",
|
|
1002
|
-
"oneOf": [
|
|
1003
|
-
{
|
|
1004
|
-
"description": "Compare a metric against a fixed threshold value",
|
|
1005
|
-
"const": "threshold"
|
|
1006
|
-
},
|
|
1007
|
-
{
|
|
1008
|
-
"description": "Compare a metric against a baseline SUT",
|
|
1009
|
-
"const": "baseline"
|
|
1010
|
-
},
|
|
1011
|
-
{
|
|
1012
|
-
"description": "Check that a metric falls within a target range",
|
|
1013
|
-
"const": "target-range"
|
|
1014
|
-
}
|
|
1015
|
-
]
|
|
1016
3652
|
}
|
|
1017
3653
|
},
|
|
1018
3654
|
"required": [
|
|
1019
|
-
"
|
|
1020
|
-
"
|
|
1021
|
-
"
|
|
1022
|
-
"
|
|
1023
|
-
"
|
|
3655
|
+
"correctness",
|
|
3656
|
+
"metrics",
|
|
3657
|
+
"outputs",
|
|
3658
|
+
"provenance",
|
|
3659
|
+
"run"
|
|
1024
3660
|
],
|
|
1025
3661
|
"additionalProperties": false
|
|
1026
|
-
}
|
|
1027
|
-
"minItems": 1
|
|
3662
|
+
}
|
|
1028
3663
|
},
|
|
1029
|
-
"
|
|
1030
|
-
"description": "
|
|
3664
|
+
"timestamp": {
|
|
3665
|
+
"description": "Generation timestamp",
|
|
1031
3666
|
"type": "string"
|
|
1032
3667
|
},
|
|
1033
|
-
"
|
|
1034
|
-
"description": "
|
|
3668
|
+
"version": {
|
|
3669
|
+
"description": "Schema version",
|
|
1035
3670
|
"type": "string"
|
|
1036
|
-
},
|
|
1037
|
-
"options": {
|
|
1038
|
-
"description": "Additional evaluator-specific options",
|
|
1039
|
-
"type": "object",
|
|
1040
|
-
"additionalProperties": {},
|
|
1041
|
-
"propertyNames": {
|
|
1042
|
-
"type": "string"
|
|
1043
|
-
}
|
|
1044
3671
|
}
|
|
1045
3672
|
},
|
|
1046
3673
|
"required": [
|
|
1047
|
-
"
|
|
3674
|
+
"results",
|
|
3675
|
+
"timestamp",
|
|
3676
|
+
"version"
|
|
1048
3677
|
],
|
|
1049
|
-
"additionalProperties": false
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
"
|
|
1061
|
-
|
|
1062
|
-
|
|
3678
|
+
"additionalProperties": false
|
|
3679
|
+
},
|
|
3680
|
+
"RobustnessAnalysisOutput": {
|
|
3681
|
+
"title": "RobustnessAnalysisOutput",
|
|
3682
|
+
"description": "Complete robustness analysis output",
|
|
3683
|
+
"type": "object",
|
|
3684
|
+
"properties": {
|
|
3685
|
+
"config": {
|
|
3686
|
+
"type": "object",
|
|
3687
|
+
"properties": {
|
|
3688
|
+
"intensityLevels": {
|
|
3689
|
+
"description": "Intensity levels tested",
|
|
3690
|
+
"type": "array",
|
|
3691
|
+
"items": {
|
|
3692
|
+
"type": "number"
|
|
1063
3693
|
}
|
|
1064
3694
|
},
|
|
1065
|
-
{
|
|
1066
|
-
"description": "
|
|
1067
|
-
"type": "
|
|
1068
|
-
"
|
|
1069
|
-
"
|
|
1070
|
-
|
|
1071
|
-
},
|
|
1072
|
-
"criterionId": "length-baseline",
|
|
1073
|
-
"metric": "length",
|
|
1074
|
-
"sut": "builtin-length"
|
|
3695
|
+
"metrics": {
|
|
3696
|
+
"description": "Metrics analyzed",
|
|
3697
|
+
"type": "array",
|
|
3698
|
+
"items": {
|
|
3699
|
+
"type": "string"
|
|
3700
|
+
}
|
|
1075
3701
|
},
|
|
1076
|
-
{
|
|
1077
|
-
"description": "
|
|
1078
|
-
"type": "
|
|
1079
|
-
"
|
|
1080
|
-
|
|
1081
|
-
"sut": "*",
|
|
1082
|
-
"targetRange": {
|
|
1083
|
-
"max": 100,
|
|
1084
|
-
"maxInclusive": true,
|
|
1085
|
-
"min": 1,
|
|
1086
|
-
"minInclusive": true
|
|
3702
|
+
"perturbations": {
|
|
3703
|
+
"description": "Perturbations applied",
|
|
3704
|
+
"type": "array",
|
|
3705
|
+
"items": {
|
|
3706
|
+
"type": "string"
|
|
1087
3707
|
}
|
|
3708
|
+
},
|
|
3709
|
+
"runsPerLevel": {
|
|
3710
|
+
"description": "Runs per perturbation level",
|
|
3711
|
+
"type": "integer",
|
|
3712
|
+
"minimum": -9007199254740991,
|
|
3713
|
+
"maximum": 10000
|
|
1088
3714
|
}
|
|
3715
|
+
},
|
|
3716
|
+
"required": [
|
|
3717
|
+
"metrics",
|
|
3718
|
+
"perturbations",
|
|
3719
|
+
"runsPerLevel"
|
|
1089
3720
|
],
|
|
1090
|
-
"
|
|
3721
|
+
"additionalProperties": false
|
|
3722
|
+
},
|
|
3723
|
+
"results": {
|
|
3724
|
+
"description": "Individual analysis results",
|
|
3725
|
+
"type": "array",
|
|
3726
|
+
"items": {
|
|
3727
|
+
"title": "RobustnessAnalysisResult",
|
|
3728
|
+
"description": "Result of robustness analysis for a single SUT",
|
|
3729
|
+
"type": "object",
|
|
3730
|
+
"properties": {
|
|
3731
|
+
"baselineValue": {
|
|
3732
|
+
"type": "number"
|
|
3733
|
+
},
|
|
3734
|
+
"caseClass": {
|
|
3735
|
+
"type": "string"
|
|
3736
|
+
},
|
|
3737
|
+
"metric": {
|
|
3738
|
+
"type": "string"
|
|
3739
|
+
},
|
|
3740
|
+
"perturbation": {
|
|
3741
|
+
"type": "string"
|
|
3742
|
+
},
|
|
3743
|
+
"robustness": {
|
|
3744
|
+
"title": "RobustnessMetrics",
|
|
3745
|
+
"description": "Robustness analysis metrics",
|
|
3746
|
+
"type": "object",
|
|
3747
|
+
"properties": {
|
|
3748
|
+
"breakpoint": {
|
|
3749
|
+
"type": "number"
|
|
3750
|
+
},
|
|
3751
|
+
"coefficientOfVariation": {
|
|
3752
|
+
"type": "number"
|
|
3753
|
+
},
|
|
3754
|
+
"degradationCurve": {
|
|
3755
|
+
"type": "array",
|
|
3756
|
+
"items": {
|
|
3757
|
+
"type": "object",
|
|
3758
|
+
"properties": {
|
|
3759
|
+
"metricValue": {
|
|
3760
|
+
"type": "number"
|
|
3761
|
+
},
|
|
3762
|
+
"perturbationLevel": {
|
|
3763
|
+
"type": "number"
|
|
3764
|
+
},
|
|
3765
|
+
"stdDev": {
|
|
3766
|
+
"type": "number"
|
|
3767
|
+
}
|
|
3768
|
+
},
|
|
3769
|
+
"required": [
|
|
3770
|
+
"metricValue",
|
|
3771
|
+
"perturbationLevel"
|
|
3772
|
+
],
|
|
3773
|
+
"additionalProperties": false
|
|
3774
|
+
}
|
|
3775
|
+
},
|
|
3776
|
+
"rankingStability": {
|
|
3777
|
+
"type": "number"
|
|
3778
|
+
},
|
|
3779
|
+
"stdUnderPerturbation": {
|
|
3780
|
+
"type": "number"
|
|
3781
|
+
},
|
|
3782
|
+
"varianceUnderPerturbation": {
|
|
3783
|
+
"type": "number"
|
|
3784
|
+
}
|
|
3785
|
+
},
|
|
3786
|
+
"required": [
|
|
3787
|
+
"coefficientOfVariation",
|
|
3788
|
+
"stdUnderPerturbation",
|
|
3789
|
+
"varianceUnderPerturbation"
|
|
3790
|
+
],
|
|
3791
|
+
"additionalProperties": false
|
|
3792
|
+
},
|
|
3793
|
+
"runCount": {
|
|
3794
|
+
"type": "integer",
|
|
3795
|
+
"minimum": -9007199254740991,
|
|
3796
|
+
"maximum": 2147483647
|
|
3797
|
+
},
|
|
3798
|
+
"sut": {
|
|
3799
|
+
"type": "string"
|
|
3800
|
+
}
|
|
3801
|
+
},
|
|
3802
|
+
"required": [
|
|
3803
|
+
"baselineValue",
|
|
3804
|
+
"metric",
|
|
3805
|
+
"perturbation",
|
|
3806
|
+
"robustness",
|
|
3807
|
+
"runCount",
|
|
3808
|
+
"sut"
|
|
3809
|
+
],
|
|
3810
|
+
"additionalProperties": false
|
|
3811
|
+
}
|
|
3812
|
+
},
|
|
3813
|
+
"timestamp": {
|
|
3814
|
+
"description": "Generation timestamp",
|
|
3815
|
+
"type": "string"
|
|
3816
|
+
},
|
|
3817
|
+
"version": {
|
|
3818
|
+
"description": "Schema version",
|
|
3819
|
+
"type": "string"
|
|
1091
3820
|
}
|
|
1092
|
-
|
|
3821
|
+
},
|
|
3822
|
+
"required": [
|
|
3823
|
+
"config",
|
|
3824
|
+
"results",
|
|
3825
|
+
"timestamp",
|
|
3826
|
+
"version"
|
|
3827
|
+
],
|
|
3828
|
+
"additionalProperties": false
|
|
1093
3829
|
},
|
|
1094
3830
|
"RobustnessEvaluatorConfig": {
|
|
1095
3831
|
"title": "RobustnessEvaluatorConfig",
|
|
@@ -1173,6 +3909,156 @@
|
|
|
1173
3909
|
"runsPerLevel": 10
|
|
1174
3910
|
}
|
|
1175
3911
|
]
|
|
3912
|
+
},
|
|
3913
|
+
"RunContext": {
|
|
3914
|
+
"title": "RunContext",
|
|
3915
|
+
"description": "Run identity and context",
|
|
3916
|
+
"type": "object",
|
|
3917
|
+
"properties": {
|
|
3918
|
+
"caseClass": {
|
|
3919
|
+
"description": "Case class for grouping",
|
|
3920
|
+
"type": "string"
|
|
3921
|
+
},
|
|
3922
|
+
"caseId": {
|
|
3923
|
+
"description": "Case identifier",
|
|
3924
|
+
"type": "string"
|
|
3925
|
+
},
|
|
3926
|
+
"config": {
|
|
3927
|
+
"description": "Configuration overrides for this run",
|
|
3928
|
+
"type": "object",
|
|
3929
|
+
"additionalProperties": {
|
|
3930
|
+
"anyOf": [
|
|
3931
|
+
{
|
|
3932
|
+
"type": "string"
|
|
3933
|
+
},
|
|
3934
|
+
{
|
|
3935
|
+
"type": "number"
|
|
3936
|
+
},
|
|
3937
|
+
{
|
|
3938
|
+
"type": "boolean"
|
|
3939
|
+
},
|
|
3940
|
+
{
|
|
3941
|
+
"type": "null"
|
|
3942
|
+
}
|
|
3943
|
+
]
|
|
3944
|
+
},
|
|
3945
|
+
"propertyNames": {
|
|
3946
|
+
"type": "string"
|
|
3947
|
+
}
|
|
3948
|
+
},
|
|
3949
|
+
"repetition": {
|
|
3950
|
+
"description": "Repetition number for statistical runs",
|
|
3951
|
+
"type": "integer",
|
|
3952
|
+
"minimum": -9007199254740991,
|
|
3953
|
+
"maximum": 2147483647
|
|
3954
|
+
},
|
|
3955
|
+
"runId": {
|
|
3956
|
+
"description": "Deterministic run ID (hash of inputs)",
|
|
3957
|
+
"type": "string"
|
|
3958
|
+
},
|
|
3959
|
+
"seed": {
|
|
3960
|
+
"description": "Random seed if applicable",
|
|
3961
|
+
"type": "number"
|
|
3962
|
+
},
|
|
3963
|
+
"sut": {
|
|
3964
|
+
"description": "SUT identifier",
|
|
3965
|
+
"type": "string"
|
|
3966
|
+
},
|
|
3967
|
+
"sutRole": {
|
|
3968
|
+
"description": "Role of the SUT in evaluation",
|
|
3969
|
+
"type": "string",
|
|
3970
|
+
"oneOf": [
|
|
3971
|
+
{
|
|
3972
|
+
"description": "The system being evaluated; the novel algorithm or implementation",
|
|
3973
|
+
"const": "primary"
|
|
3974
|
+
},
|
|
3975
|
+
{
|
|
3976
|
+
"description": "A reference implementation for comparison",
|
|
3977
|
+
"const": "baseline"
|
|
3978
|
+
},
|
|
3979
|
+
{
|
|
3980
|
+
"description": "Ground truth provider; defines correct answers",
|
|
3981
|
+
"const": "oracle"
|
|
3982
|
+
}
|
|
3983
|
+
]
|
|
3984
|
+
},
|
|
3985
|
+
"sutVersion": {
|
|
3986
|
+
"description": "SUT version for reproducibility",
|
|
3987
|
+
"type": "string"
|
|
3988
|
+
}
|
|
3989
|
+
},
|
|
3990
|
+
"required": [
|
|
3991
|
+
"caseId",
|
|
3992
|
+
"runId",
|
|
3993
|
+
"sut",
|
|
3994
|
+
"sutRole"
|
|
3995
|
+
],
|
|
3996
|
+
"additionalProperties": false
|
|
3997
|
+
},
|
|
3998
|
+
"SummaryStats": {
|
|
3999
|
+
"title": "SummaryStats",
|
|
4000
|
+
"description": "Summary statistics for a numeric metric",
|
|
4001
|
+
"type": "object",
|
|
4002
|
+
"properties": {
|
|
4003
|
+
"confidence95": {
|
|
4004
|
+
"description": "95% confidence interval [lower, upper]",
|
|
4005
|
+
"type": "array",
|
|
4006
|
+
"prefixItems": [
|
|
4007
|
+
{
|
|
4008
|
+
"type": "number"
|
|
4009
|
+
},
|
|
4010
|
+
{
|
|
4011
|
+
"type": "number"
|
|
4012
|
+
}
|
|
4013
|
+
]
|
|
4014
|
+
},
|
|
4015
|
+
"max": {
|
|
4016
|
+
"description": "Maximum value",
|
|
4017
|
+
"type": "number"
|
|
4018
|
+
},
|
|
4019
|
+
"mean": {
|
|
4020
|
+
"description": "Arithmetic mean",
|
|
4021
|
+
"type": "number"
|
|
4022
|
+
},
|
|
4023
|
+
"median": {
|
|
4024
|
+
"description": "Median (50th percentile)",
|
|
4025
|
+
"type": "number"
|
|
4026
|
+
},
|
|
4027
|
+
"min": {
|
|
4028
|
+
"description": "Minimum value",
|
|
4029
|
+
"type": "number"
|
|
4030
|
+
},
|
|
4031
|
+
"n": {
|
|
4032
|
+
"description": "Number of observations",
|
|
4033
|
+
"type": "integer",
|
|
4034
|
+
"minimum": -9007199254740991,
|
|
4035
|
+
"maximum": 2147483647
|
|
4036
|
+
},
|
|
4037
|
+
"p25": {
|
|
4038
|
+
"description": "25th percentile",
|
|
4039
|
+
"type": "number"
|
|
4040
|
+
},
|
|
4041
|
+
"p75": {
|
|
4042
|
+
"description": "75th percentile",
|
|
4043
|
+
"type": "number"
|
|
4044
|
+
},
|
|
4045
|
+
"std": {
|
|
4046
|
+
"description": "Standard deviation (sample)",
|
|
4047
|
+
"type": "number"
|
|
4048
|
+
},
|
|
4049
|
+
"sum": {
|
|
4050
|
+
"description": "Sum of all values",
|
|
4051
|
+
"type": "number"
|
|
4052
|
+
}
|
|
4053
|
+
},
|
|
4054
|
+
"required": [
|
|
4055
|
+
"max",
|
|
4056
|
+
"mean",
|
|
4057
|
+
"median",
|
|
4058
|
+
"min",
|
|
4059
|
+
"n"
|
|
4060
|
+
],
|
|
4061
|
+
"additionalProperties": false
|
|
1176
4062
|
}
|
|
1177
4063
|
}
|
|
1178
4064
|
}
|