ppef 1.2.1 → 1.3.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/README.md +270 -6
  2. package/dist/__tests__/cli/evaluate-command.integration.test.js +60 -0
  3. package/dist/__tests__/cli/evaluate-command.integration.test.js.map +1 -1
  4. package/dist/__tests__/examples.integration.test.d.ts +8 -0
  5. package/dist/__tests__/examples.integration.test.d.ts.map +1 -0
  6. package/dist/__tests__/examples.integration.test.js +236 -0
  7. package/dist/__tests__/examples.integration.test.js.map +1 -0
  8. package/dist/cli/__tests__/commands.unit.test.js +12 -5
  9. package/dist/cli/__tests__/commands.unit.test.js.map +1 -1
  10. package/dist/cli/__tests__/config-loader.unit.test.js +30 -25
  11. package/dist/cli/__tests__/config-loader.unit.test.js.map +1 -1
  12. package/dist/cli/__tests__/evaluator-schemas.unit.test.d.ts +9 -0
  13. package/dist/cli/__tests__/evaluator-schemas.unit.test.d.ts.map +1 -0
  14. package/dist/cli/__tests__/evaluator-schemas.unit.test.js +334 -0
  15. package/dist/cli/__tests__/evaluator-schemas.unit.test.js.map +1 -0
  16. package/dist/cli/commands/aggregate.d.ts.map +1 -1
  17. package/dist/cli/commands/aggregate.js +20 -12
  18. package/dist/cli/commands/aggregate.js.map +1 -1
  19. package/dist/cli/commands/evaluate.d.ts.map +1 -1
  20. package/dist/cli/commands/evaluate.js +130 -24
  21. package/dist/cli/commands/evaluate.js.map +1 -1
  22. package/dist/cli/commands/plan.d.ts.map +1 -1
  23. package/dist/cli/commands/plan.js +40 -6
  24. package/dist/cli/commands/plan.js.map +1 -1
  25. package/dist/cli/commands/run.d.ts +9 -0
  26. package/dist/cli/commands/run.d.ts.map +1 -1
  27. package/dist/cli/commands/run.js +71 -12
  28. package/dist/cli/commands/run.js.map +1 -1
  29. package/dist/cli/commands/validate.d.ts.map +1 -1
  30. package/dist/cli/commands/validate.js +55 -0
  31. package/dist/cli/commands/validate.js.map +1 -1
  32. package/dist/cli/config-loader.d.ts +6 -3
  33. package/dist/cli/config-loader.d.ts.map +1 -1
  34. package/dist/cli/config-loader.js +31 -106
  35. package/dist/cli/config-loader.js.map +1 -1
  36. package/dist/cli/evaluator-schemas.d.ts +395 -0
  37. package/dist/cli/evaluator-schemas.d.ts.map +1 -0
  38. package/dist/cli/evaluator-schemas.js +285 -0
  39. package/dist/cli/evaluator-schemas.js.map +1 -0
  40. package/dist/cli/index.d.ts.map +1 -1
  41. package/dist/cli/index.js +11 -1
  42. package/dist/cli/index.js.map +1 -1
  43. package/dist/cli/module-loader.d.ts.map +1 -1
  44. package/dist/cli/module-loader.js +38 -20
  45. package/dist/cli/module-loader.js.map +1 -1
  46. package/dist/cli/type-utils.d.ts +31 -0
  47. package/dist/cli/type-utils.d.ts.map +1 -0
  48. package/dist/cli/type-utils.js +38 -0
  49. package/dist/cli/type-utils.js.map +1 -0
  50. package/dist/cli/types.d.ts +284 -94
  51. package/dist/cli/types.d.ts.map +1 -1
  52. package/dist/cli/types.js +177 -1
  53. package/dist/cli/types.js.map +1 -1
  54. package/dist/collector/schema.js.map +1 -1
  55. package/dist/evaluators/claims-evaluator.d.ts.map +1 -1
  56. package/dist/evaluators/claims-evaluator.js +1 -1
  57. package/dist/evaluators/claims-evaluator.js.map +1 -1
  58. package/dist/evaluators/exploratory-evaluator.js.map +1 -1
  59. package/dist/executor/__tests__/worker-entry.integration.test.d.ts.map +1 -1
  60. package/dist/executor/__tests__/worker-entry.integration.test.js +19 -4
  61. package/dist/executor/__tests__/worker-entry.integration.test.js.map +1 -1
  62. package/dist/executor/binary-sut.d.ts.map +1 -1
  63. package/dist/executor/binary-sut.js +2 -1
  64. package/dist/executor/binary-sut.js.map +1 -1
  65. package/dist/executor/checkpoint-storage.d.ts.map +1 -1
  66. package/dist/executor/checkpoint-storage.js +13 -4
  67. package/dist/executor/checkpoint-storage.js.map +1 -1
  68. package/dist/executor/executor.d.ts +22 -0
  69. package/dist/executor/executor.d.ts.map +1 -1
  70. package/dist/executor/executor.js +133 -6
  71. package/dist/executor/executor.js.map +1 -1
  72. package/dist/executor/parallel-executor.d.ts.map +1 -1
  73. package/dist/executor/parallel-executor.js +9 -2
  74. package/dist/executor/parallel-executor.js.map +1 -1
  75. package/dist/executor/worker-entry.js +3 -1
  76. package/dist/executor/worker-entry.js.map +1 -1
  77. package/dist/executor/worker-executor.d.ts +9 -0
  78. package/dist/executor/worker-executor.d.ts.map +1 -1
  79. package/dist/executor/worker-executor.js +88 -9
  80. package/dist/executor/worker-executor.js.map +1 -1
  81. package/dist/executor/worker-threads-executor.d.ts.map +1 -1
  82. package/dist/executor/worker-threads-executor.js +52 -18
  83. package/dist/executor/worker-threads-executor.js.map +1 -1
  84. package/dist/index.cjs +1 -1
  85. package/dist/index.d.ts +1 -0
  86. package/dist/index.d.ts.map +1 -1
  87. package/dist/index.js +2 -0
  88. package/dist/index.js.map +1 -1
  89. package/dist/renderers/latex-renderer.d.ts.map +1 -1
  90. package/dist/renderers/latex-renderer.js +20 -12
  91. package/dist/renderers/latex-renderer.js.map +1 -1
  92. package/dist/schemas/__tests__/json-schema-validator.unit.test.d.ts +8 -0
  93. package/dist/schemas/__tests__/json-schema-validator.unit.test.d.ts.map +1 -0
  94. package/dist/schemas/__tests__/json-schema-validator.unit.test.js +170 -0
  95. package/dist/schemas/__tests__/json-schema-validator.unit.test.js.map +1 -0
  96. package/dist/schemas/index.d.ts +7 -0
  97. package/dist/schemas/index.d.ts.map +1 -0
  98. package/dist/schemas/index.js +7 -0
  99. package/dist/schemas/index.js.map +1 -0
  100. package/dist/schemas/json-schema-validator.d.ts +59 -0
  101. package/dist/schemas/json-schema-validator.d.ts.map +1 -0
  102. package/dist/schemas/json-schema-validator.js +67 -0
  103. package/dist/schemas/json-schema-validator.js.map +1 -0
  104. package/dist/types/case.d.ts +4 -0
  105. package/dist/types/case.d.ts.map +1 -1
  106. package/dist/types/result.d.ts +2 -0
  107. package/dist/types/result.d.ts.map +1 -1
  108. package/dist/types/sut.d.ts +4 -0
  109. package/dist/types/sut.d.ts.map +1 -1
  110. package/package.json +15 -5
  111. package/ppef.schema.json +1178 -0
@@ -0,0 +1,1178 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://ppef.dev/schemas/v1.3.0/ppef.schema.json",
4
+ "title": "ExperimentConfig",
5
+ "description": "PPEF experiment configuration",
6
+ "type": "object",
7
+ "properties": {
8
+ "cases": {
9
+ "description": "Test cases to run",
10
+ "type": "array",
11
+ "items": {
12
+ "title": "CaseConfig",
13
+ "description": "Test case configuration",
14
+ "type": "object",
15
+ "properties": {
16
+ "exportName": {
17
+ "description": "Name of the export to use as case factory",
18
+ "type": "string",
19
+ "minLength": 1
20
+ },
21
+ "id": {
22
+ "description": "Unique case identifier",
23
+ "type": "string",
24
+ "minLength": 1
25
+ },
26
+ "inputSchema": {
27
+ "description": "Per-case input schema (overrides schemas.input)",
28
+ "type": "object",
29
+ "additionalProperties": {},
30
+ "propertyNames": {
31
+ "type": "string"
32
+ }
33
+ },
34
+ "module": {
35
+ "description": "Path to module file (relative to config file)",
36
+ "type": "string",
37
+ "minLength": 1
38
+ }
39
+ },
40
+ "required": [
41
+ "exportName",
42
+ "id",
43
+ "module"
44
+ ],
45
+ "additionalProperties": false
46
+ }
47
+ },
48
+ "evaluators": {
49
+ "description": "Evaluator configurations to run after experiment completion",
50
+ "type": "array",
51
+ "items": {
52
+ "title": "EvaluatorEntry",
53
+ "description": "An evaluator configuration entry",
54
+ "type": "object",
55
+ "properties": {
56
+ "config": {
57
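+ "$comment": "Intended pairing with the sibling 'type' field: claims -> ClaimsEvaluatorConfig, metrics -> MetricsEvaluatorConfig, robustness -> RobustnessEvaluatorConfig, exploratory -> ExploratoryEvaluatorConfig, custom -> CustomEvaluatorConfig. The anyOf below does not itself enforce this pairing; any listed config shape validates regardless of the 'type' value.",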
58
+ "anyOf": [
59
+ {
60
+ "$ref": "#/$defs/ClaimsEvaluatorConfig"
61
+ },
62
+ {
63
+ "$ref": "#/$defs/MetricsEvaluatorConfig"
64
+ },
65
+ {
66
+ "$ref": "#/$defs/RobustnessEvaluatorConfig"
67
+ },
68
+ {
69
+ "$ref": "#/$defs/ExploratoryEvaluatorConfig"
70
+ },
71
+ {
72
+ "$ref": "#/$defs/CustomEvaluatorConfig"
73
+ }
74
+ ]
75
+ },
76
+ "type": {
77
+ "description": "Evaluation type",
78
+ "type": "string",
79
+ "oneOf": [
80
+ {
81
+ "description": "Test explicit hypotheses with statistical significance",
82
+ "const": "claims"
83
+ },
84
+ {
85
+ "description": "Measure sensitivity under perturbations",
86
+ "const": "robustness"
87
+ },
88
+ {
89
+ "description": "Evaluate against thresholds, baselines, or target ranges",
90
+ "const": "metrics"
91
+ },
92
+ {
93
+ "description": "Hypothesis-free analysis: rankings, correlations",
94
+ "const": "exploratory"
95
+ },
96
+ {
97
+ "description": "User-defined evaluator loaded from a module",
98
+ "const": "custom"
99
+ }
100
+ ]
101
+ }
102
+ },
103
+ "required": [
104
+ "config",
105
+ "type"
106
+ ],
107
+ "additionalProperties": false
108
+ }
109
+ },
110
+ "executor": {
111
+ "title": "ExecutorConfig",
112
+ "description": "Executor configuration",
113
+ "type": "object",
114
+ "properties": {
115
+ "collectProvenance": {
116
+ "description": "Whether to collect provenance information",
117
+ "type": "boolean",
118
+ "default": true
119
+ },
120
+ "concurrency": {
121
+ "description": "Number of concurrent runs",
122
+ "type": "integer",
123
+ "minimum": 1,
124
+ "maximum": 256
125
+ },
126
+ "continueOnError": {
127
+ "description": "Continue execution if a single run fails",
128
+ "type": "boolean",
129
+ "default": true
130
+ },
131
+ "repetitions": {
132
+ "description": "Number of repetitions per case",
133
+ "type": "integer",
134
+ "minimum": 1,
135
+ "maximum": 10000,
136
+ "default": 1
137
+ },
138
+ "seedBase": {
139
+ "description": "Random seed base",
140
+ "type": "integer",
141
+ "minimum": 0,
142
+ "maximum": 2147483647,
143
+ "default": 42
144
+ },
145
+ "timeoutMs": {
146
+ "description": "Timeout per run in milliseconds (0 = no timeout)",
147
+ "type": "integer",
148
+ "minimum": 0,
149
+ "maximum": 86400000,
150
+ "default": 0
151
+ }
152
+ },
153
+ "additionalProperties": false
154
+ },
155
+ "experiment": {
156
+ "title": "ExperimentMeta",
157
+ "description": "Experiment metadata",
158
+ "type": "object",
159
+ "properties": {
160
+ "description": {
161
+ "description": "Experiment description",
162
+ "type": "string"
163
+ },
164
+ "name": {
165
+ "description": "Human-readable experiment name",
166
+ "type": "string",
167
+ "minLength": 1
168
+ },
169
+ "version": {
170
+ "description": "Experiment version string",
171
+ "type": "string"
172
+ }
173
+ },
174
+ "required": [
175
+ "name"
176
+ ],
177
+ "additionalProperties": false
178
+ },
179
+ "metricsExtractor": {
180
+ "title": "MetricsExtractorConfig",
181
+ "description": "Metrics extractor configuration",
182
+ "type": "object",
183
+ "properties": {
184
+ "exportName": {
185
+ "description": "Name of the export to use as metrics extractor",
186
+ "type": "string",
187
+ "minLength": 1
188
+ },
189
+ "module": {
190
+ "description": "Path to module file (relative to config file)",
191
+ "type": "string",
192
+ "minLength": 1
193
+ }
194
+ },
195
+ "required": [
196
+ "exportName",
197
+ "module"
198
+ ],
199
+ "additionalProperties": false
200
+ },
201
+ "output": {
202
+ "title": "OutputConfig",
203
+ "description": "Output configuration",
204
+ "type": "object",
205
+ "properties": {
206
+ "aggregate": {
207
+ "description": "Whether to aggregate results",
208
+ "type": "boolean",
209
+ "default": true
210
+ },
211
+ "format": {
212
+ "description": "Output format",
213
+ "type": "string",
214
+ "enum": [
215
+ "json",
216
+ "json-pretty"
217
+ ],
218
+ "default": "json-pretty"
219
+ },
220
+ "path": {
221
+ "description": "Output directory path",
222
+ "type": "string"
223
+ }
224
+ },
225
+ "additionalProperties": false
226
+ },
227
+ "schemas": {
228
+ "description": "Optional JSON Schemas for input/output validation",
229
+ "type": "object",
230
+ "properties": {
231
+ "input": {
232
+ "description": "Schema for case inputs",
233
+ "type": "object",
234
+ "additionalProperties": {},
235
+ "propertyNames": {
236
+ "type": "string"
237
+ }
238
+ },
239
+ "output": {
240
+ "description": "Schema for SUT outputs",
241
+ "type": "object",
242
+ "additionalProperties": {},
243
+ "propertyNames": {
244
+ "type": "string"
245
+ }
246
+ }
247
+ },
248
+ "additionalProperties": false
249
+ },
250
+ "suts": {
251
+ "description": "Systems Under Test to evaluate",
252
+ "type": "array",
253
+ "items": {
254
+ "title": "SutConfig",
255
+ "description": "System Under Test configuration",
256
+ "type": "object",
257
+ "properties": {
258
+ "binaryArgs": {
259
+ "description": "Arguments to pass to binary command",
260
+ "type": "array",
261
+ "items": {
262
+ "type": "string"
263
+ }
264
+ },
265
+ "binaryCommand": {
266
+ "description": "Command to execute (when type=\"binary\")",
267
+ "type": "string"
268
+ },
269
+ "binaryInputFormat": {
270
+ "description": "How to serialize inputs to stdin",
271
+ "type": "string",
272
+ "enum": [
273
+ "json",
274
+ "raw",
275
+ "lines"
276
+ ]
277
+ },
278
+ "binaryOutputFormat": {
279
+ "description": "How to deserialize stdout",
280
+ "type": "string",
281
+ "enum": [
282
+ "json",
283
+ "raw",
284
+ "lines"
285
+ ]
286
+ },
287
+ "binaryTimeout": {
288
+ "description": "Binary SUT timeout per run in milliseconds",
289
+ "type": "integer",
290
+ "minimum": 0,
291
+ "maximum": 86400000
292
+ },
293
+ "config": {
294
+ "description": "Optional configuration to pass to factory",
295
+ "type": "object",
296
+ "additionalProperties": {},
297
+ "propertyNames": {
298
+ "type": "string"
299
+ }
300
+ },
301
+ "exportName": {
302
+ "description": "Name of the export to use as factory",
303
+ "type": "string",
304
+ "minLength": 1
305
+ },
306
+ "id": {
307
+ "description": "Unique SUT identifier",
308
+ "type": "string",
309
+ "minLength": 1
310
+ },
311
+ "module": {
312
+ "description": "Path to module file (relative to config file)",
313
+ "type": "string",
314
+ "minLength": 1
315
+ },
316
+ "outputSchema": {
317
+ "description": "Per-SUT output schema (overrides schemas.output)",
318
+ "type": "object",
319
+ "additionalProperties": {},
320
+ "propertyNames": {
321
+ "type": "string"
322
+ }
323
+ },
324
+ "registration": {
325
+ "title": "SutRegistration",
326
+ "description": "SUT registration metadata",
327
+ "type": "object",
328
+ "properties": {
329
+ "description": {
330
+ "description": "Optional SUT description",
331
+ "type": "string"
332
+ },
333
+ "name": {
334
+ "description": "Human-readable SUT name",
335
+ "type": "string",
336
+ "minLength": 1
337
+ },
338
+ "role": {
339
+ "description": "Role of the SUT in evaluation",
340
+ "type": "string",
341
+ "oneOf": [
342
+ {
343
+ "description": "The system being evaluated; the novel algorithm or implementation",
344
+ "const": "primary"
345
+ },
346
+ {
347
+ "description": "A reference implementation for comparison",
348
+ "const": "baseline"
349
+ },
350
+ {
351
+ "description": "Ground truth provider; defines correct answers",
352
+ "const": "oracle"
353
+ }
354
+ ]
355
+ },
356
+ "tags": {
357
+ "description": "Searchable tags",
358
+ "type": "array",
359
+ "items": {
360
+ "type": "string"
361
+ }
362
+ },
363
+ "version": {
364
+ "description": "SUT version string",
365
+ "type": "string",
366
+ "minLength": 1
367
+ }
368
+ },
369
+ "required": [
370
+ "name",
371
+ "role",
372
+ "version"
373
+ ],
374
+ "additionalProperties": false
375
+ },
376
+ "type": {
377
+ "description": "SUT type: \"module\" (default) or \"binary\"",
378
+ "type": "string",
379
+ "enum": [
380
+ "module",
381
+ "binary"
382
+ ]
383
+ }
384
+ },
385
+ "required": [
386
+ "exportName",
387
+ "id",
388
+ "module",
389
+ "registration"
390
+ ],
391
+ "additionalProperties": false
392
+ }
393
+ }
394
+ },
395
+ "required": [
396
+ "cases",
397
+ "executor",
398
+ "experiment",
399
+ "metricsExtractor",
400
+ "output",
401
+ "suts"
402
+ ],
403
+ "additionalProperties": false,
404
+ "examples": [
405
+ {
406
+ "cases": [
407
+ {
408
+ "exportName": "createCase",
409
+ "id": "hello-world",
410
+ "module": "./case.mjs"
411
+ }
412
+ ],
413
+ "executor": {
414
+ "repetitions": 3
415
+ },
416
+ "experiment": {
417
+ "description": "Compare string length implementations",
418
+ "name": "string-length"
419
+ },
420
+ "metricsExtractor": {
421
+ "exportName": "extract",
422
+ "module": "./metrics.mjs"
423
+ },
424
+ "output": {
425
+ "path": "./results"
426
+ },
427
+ "suts": [
428
+ {
429
+ "exportName": "createSut",
430
+ "id": "builtin-length",
431
+ "module": "./sut.mjs",
432
+ "registration": {
433
+ "name": "Built-in .length",
434
+ "role": "primary",
435
+ "version": "1.0.0"
436
+ }
437
+ }
438
+ ]
439
+ }
440
+ ],
441
+ "$defs": {
442
+ "ClaimsEvaluatorConfig": {
443
+ "title": "ClaimsEvaluatorConfig",
444
+ "description": "Configuration for the claims evaluator",
445
+ "type": "object",
446
+ "properties": {
447
+ "claims": {
448
+ "description": "Claims to evaluate",
449
+ "type": "array",
450
+ "items": {
451
+ "title": "EvaluationClaim",
452
+ "description": "An evaluation claim (hypothesis)",
453
+ "type": "object",
454
+ "properties": {
455
+ "baseline": {
456
+ "description": "Baseline SUT for comparison",
457
+ "type": "string",
458
+ "minLength": 1
459
+ },
460
+ "citation": {
461
+ "description": "Citation/reference for the claim",
462
+ "type": "string"
463
+ },
464
+ "claimId": {
465
+ "description": "Unique claim identifier",
466
+ "type": "string",
467
+ "minLength": 1
468
+ },
469
+ "description": {
470
+ "description": "Human-readable claim description",
471
+ "type": "string",
472
+ "minLength": 1
473
+ },
474
+ "direction": {
475
+ "description": "Expected direction of difference",
476
+ "type": "string",
477
+ "oneOf": [
478
+ {
479
+ "description": "Primary SUT metric should be greater than baseline",
480
+ "const": "greater"
481
+ },
482
+ {
483
+ "description": "Primary SUT metric should be less than baseline",
484
+ "const": "less"
485
+ },
486
+ {
487
+ "description": "Primary SUT metric should be equal to baseline",
488
+ "const": "equal"
489
+ }
490
+ ]
491
+ },
492
+ "metric": {
493
+ "description": "Metric being compared",
494
+ "type": "string",
495
+ "minLength": 1
496
+ },
497
+ "minEffectSize": {
498
+ "description": "Minimum effect size (Cohen's d)",
499
+ "type": "number",
500
+ "minimum": 0
501
+ },
502
+ "scope": {
503
+ "description": "Scope of claim validity",
504
+ "type": "string",
505
+ "oneOf": [
506
+ {
507
+ "description": "Claim applies across all cases and conditions",
508
+ "const": "global"
509
+ },
510
+ {
511
+ "description": "Claim applies within a specific case class",
512
+ "const": "caseClass"
513
+ },
514
+ {
515
+ "description": "Claim applies within a parameter range",
516
+ "const": "parameterRange"
517
+ },
518
+ {
519
+ "description": "Claim applies to local structural properties",
520
+ "const": "localStructure"
521
+ }
522
+ ]
523
+ },
524
+ "scopeConstraints": {
525
+ "description": "Scope constraints",
526
+ "type": "object",
527
+ "additionalProperties": {
528
+ "anyOf": [
529
+ {
530
+ "anyOf": [
531
+ {
532
+ "type": "string"
533
+ },
534
+ {
535
+ "type": "number"
536
+ },
537
+ {
538
+ "type": "boolean"
539
+ },
540
+ {
541
+ "type": "null"
542
+ }
543
+ ]
544
+ },
545
+ {
546
+ "type": "array",
547
+ "items": {
548
+ "anyOf": [
549
+ {
550
+ "type": "string"
551
+ },
552
+ {
553
+ "type": "number"
554
+ },
555
+ {
556
+ "type": "boolean"
557
+ },
558
+ {
559
+ "type": "null"
560
+ }
561
+ ]
562
+ }
563
+ }
564
+ ]
565
+ },
566
+ "propertyNames": {
567
+ "type": "string"
568
+ }
569
+ },
570
+ "significanceLevel": {
571
+ "description": "Required significance level (default: 0.05)",
572
+ "type": "number",
573
+ "minimum": 0,
574
+ "maximum": 1
575
+ },
576
+ "sut": {
577
+ "description": "Primary SUT being evaluated",
578
+ "type": "string",
579
+ "minLength": 1
580
+ },
581
+ "tags": {
582
+ "description": "Tags for filtering",
583
+ "type": "array",
584
+ "items": {
585
+ "type": "string"
586
+ }
587
+ },
588
+ "threshold": {
589
+ "description": "Optional threshold for the difference",
590
+ "type": "number"
591
+ }
592
+ },
593
+ "required": [
594
+ "baseline",
595
+ "claimId",
596
+ "description",
597
+ "direction",
598
+ "metric",
599
+ "scope",
600
+ "sut"
601
+ ],
602
+ "additionalProperties": false
603
+ },
604
+ "minItems": 1
605
+ },
606
+ "description": {
607
+ "description": "Evaluator description",
608
+ "type": "string"
609
+ },
610
+ "minEffectSize": {
611
+ "description": "Global minimum effect size override",
612
+ "type": "number",
613
+ "minimum": 0
614
+ },
615
+ "name": {
616
+ "description": "Human-readable evaluator name",
617
+ "type": "string"
618
+ },
619
+ "options": {
620
+ "description": "Additional evaluator-specific options",
621
+ "type": "object",
622
+ "additionalProperties": {},
623
+ "propertyNames": {
624
+ "type": "string"
625
+ }
626
+ },
627
+ "significanceLevel": {
628
+ "description": "Global significance level override",
629
+ "type": "number",
630
+ "minimum": 0,
631
+ "maximum": 1
632
+ }
633
+ },
634
+ "required": [
635
+ "claims"
636
+ ],
637
+ "additionalProperties": false,
638
+ "examples": [
639
+ {
640
+ "claims": [
641
+ {
642
+ "description": "Built-in .length reports greater length than spread operator on emoji strings",
643
+ "baseline": "spread-length",
644
+ "claimId": "C001",
645
+ "direction": "greater",
646
+ "metric": "length",
647
+ "scope": "global",
648
+ "sut": "builtin-length"
649
+ }
650
+ ],
651
+ "significanceLevel": 0.05
652
+ }
653
+ ]
654
+ },
655
+ "CustomEvaluatorConfig": {
656
+ "title": "CustomEvaluatorConfig",
657
+ "description": "Configuration for a custom evaluator",
658
+ "type": "object",
659
+ "properties": {
660
+ "customType": {
661
+ "description": "Custom evaluator type name",
662
+ "type": "string",
663
+ "minLength": 1
664
+ },
665
+ "description": {
666
+ "description": "Evaluator description",
667
+ "type": "string"
668
+ },
669
+ "name": {
670
+ "description": "Human-readable evaluator name",
671
+ "type": "string"
672
+ },
673
+ "options": {
674
+ "description": "Additional evaluator-specific options",
675
+ "type": "object",
676
+ "additionalProperties": {},
677
+ "propertyNames": {
678
+ "type": "string"
679
+ }
680
+ }
681
+ },
682
+ "required": [
683
+ "customType"
684
+ ],
685
+ "additionalProperties": {}
686
+ },
687
+ "ExploratoryEvaluatorConfig": {
688
+ "title": "ExploratoryEvaluatorConfig",
689
+ "description": "Configuration for the exploratory evaluator",
690
+ "type": "object",
691
+ "properties": {
692
+ "analyzeCaseClassEffects": {
693
+ "description": "Whether to analyze case-class effects",
694
+ "type": "boolean"
695
+ },
696
+ "computeCorrelations": {
697
+ "description": "Whether to compute metric correlations",
698
+ "type": "boolean"
699
+ },
700
+ "description": {
701
+ "description": "Evaluator description",
702
+ "type": "string"
703
+ },
704
+ "metricDirections": {
705
+ "description": "Metric directions for ranking interpretation",
706
+ "type": "object",
707
+ "additionalProperties": {
708
+ "description": "Metric direction for ranking",
709
+ "type": "string",
710
+ "oneOf": [
711
+ {
712
+ "description": "Higher values indicate better performance",
713
+ "const": "higher-better"
714
+ },
715
+ {
716
+ "description": "Lower values indicate better performance",
717
+ "const": "lower-better"
718
+ }
719
+ ]
720
+ },
721
+ "propertyNames": {
722
+ "type": "string"
723
+ }
724
+ },
725
+ "metrics": {
726
+ "description": "Metrics to analyze (all if not specified)",
727
+ "type": "array",
728
+ "items": {
729
+ "type": "string",
730
+ "minLength": 1
731
+ }
732
+ },
733
+ "minEffectSize": {
734
+ "description": "Minimum effect size to consider meaningful",
735
+ "type": "number",
736
+ "minimum": 0
737
+ },
738
+ "name": {
739
+ "description": "Human-readable evaluator name",
740
+ "type": "string"
741
+ },
742
+ "options": {
743
+ "description": "Additional evaluator-specific options",
744
+ "type": "object",
745
+ "additionalProperties": {},
746
+ "propertyNames": {
747
+ "type": "string"
748
+ }
749
+ },
750
+ "significanceLevel": {
751
+ "description": "Significance level for statistical tests (default: 0.05)",
752
+ "type": "number",
753
+ "minimum": 0,
754
+ "maximum": 1
755
+ },
756
+ "suts": {
757
+ "description": "SUTs to include (all if not specified)",
758
+ "type": "array",
759
+ "items": {
760
+ "type": "string",
761
+ "minLength": 1
762
+ }
763
+ }
764
+ },
765
+ "additionalProperties": false,
766
+ "examples": [
767
+ {
768
+ "analyzeCaseClassEffects": true,
769
+ "computeCorrelations": false,
770
+ "metricDirections": {
771
+ "length": "higher-better"
772
+ },
773
+ "metrics": [
774
+ "length"
775
+ ]
776
+ }
777
+ ]
778
+ },
779
+ "MetricsEvaluatorConfig": {
780
+ "title": "MetricsEvaluatorConfig",
781
+ "description": "Configuration for the metrics evaluator",
782
+ "type": "object",
783
+ "properties": {
784
+ "criteria": {
785
+ "description": "Criteria to evaluate",
786
+ "type": "array",
787
+ "items": {
788
+ "title": "MetricsCriterion",
789
+ "description": "A metrics evaluation criterion",
790
+ "type": "object",
791
+ "allOf": [
792
+ {
793
+ "if": {
794
+ "properties": {
795
+ "type": {
796
+ "const": "threshold"
797
+ }
798
+ },
799
+ "required": [
800
+ "type"
801
+ ]
802
+ },
803
+ "then": {
804
+ "required": [
805
+ "threshold"
806
+ ]
807
+ }
808
+ },
809
+ {
810
+ "if": {
811
+ "properties": {
812
+ "type": {
813
+ "const": "baseline"
814
+ }
815
+ },
816
+ "required": [
817
+ "type"
818
+ ]
819
+ },
820
+ "then": {
821
+ "required": [
822
+ "baseline"
823
+ ]
824
+ }
825
+ },
826
+ {
827
+ "if": {
828
+ "properties": {
829
+ "type": {
830
+ "const": "target-range"
831
+ }
832
+ },
833
+ "required": [
834
+ "type"
835
+ ]
836
+ },
837
+ "then": {
838
+ "required": [
839
+ "targetRange"
840
+ ]
841
+ }
842
+ }
843
+ ],
844
+ "properties": {
845
+ "baseline": {
846
+ "description": "Baseline comparison (required when type is baseline)",
847
+ "type": "object",
848
+ "properties": {
849
+ "operator": {
850
+ "description": "Comparison operator",
851
+ "type": "string",
852
+ "oneOf": [
853
+ {
854
+ "description": "Greater than",
855
+ "const": "gt"
856
+ },
857
+ {
858
+ "description": "Greater than or equal to",
859
+ "const": "gte"
860
+ },
861
+ {
862
+ "description": "Less than",
863
+ "const": "lt"
864
+ },
865
+ {
866
+ "description": "Less than or equal to",
867
+ "const": "lte"
868
+ },
869
+ {
870
+ "description": "Equal to",
871
+ "const": "eq"
872
+ }
873
+ ]
874
+ },
875
+ "sut": {
876
+ "description": "Baseline SUT identifier",
877
+ "type": "string",
878
+ "minLength": 1
879
+ }
880
+ },
881
+ "required": [
882
+ "operator",
883
+ "sut"
884
+ ],
885
+ "additionalProperties": false
886
+ },
887
+ "criterionId": {
888
+ "description": "Unique criterion identifier",
889
+ "type": "string",
890
+ "minLength": 1
891
+ },
892
+ "description": {
893
+ "description": "Human-readable description",
894
+ "type": "string",
895
+ "minLength": 1
896
+ },
897
+ "metric": {
898
+ "description": "Metric to evaluate",
899
+ "type": "string",
900
+ "minLength": 1
901
+ },
902
+ "scopeConstraints": {
903
+ "description": "Optional scope constraints",
904
+ "type": "object",
905
+ "properties": {
906
+ "caseClass": {
907
+ "description": "Case class filter",
908
+ "anyOf": [
909
+ {
910
+ "type": "string"
911
+ },
912
+ {
913
+ "type": "array",
914
+ "items": {
915
+ "type": "string"
916
+ }
917
+ }
918
+ ]
919
+ }
920
+ },
921
+ "additionalProperties": false
922
+ },
923
+ "sut": {
924
+ "description": "SUT to evaluate (or \"*\" for all SUTs)",
925
+ "type": "string",
926
+ "minLength": 1
927
+ },
928
+ "tags": {
929
+ "description": "Tags for filtering",
930
+ "type": "array",
931
+ "items": {
932
+ "type": "string"
933
+ }
934
+ },
935
+ "targetRange": {
936
+ "description": "Target range (required when type is target-range)",
937
+ "type": "object",
938
+ "properties": {
939
+ "max": {
940
+ "description": "Maximum value",
941
+ "type": "number"
942
+ },
943
+ "maxInclusive": {
944
+ "description": "Whether max is inclusive",
945
+ "type": "boolean"
946
+ },
947
+ "min": {
948
+ "description": "Minimum value",
949
+ "type": "number"
950
+ },
951
+ "minInclusive": {
952
+ "description": "Whether min is inclusive",
953
+ "type": "boolean"
954
+ }
955
+ },
956
+ "additionalProperties": false
957
+ },
958
+ "threshold": {
959
+ "description": "Threshold operator and value (required when type is threshold)",
960
+ "type": "object",
961
+ "properties": {
962
+ "operator": {
963
+ "description": "Comparison operator",
964
+ "type": "string",
965
+ "oneOf": [
966
+ {
967
+ "description": "Greater than",
968
+ "const": "gt"
969
+ },
970
+ {
971
+ "description": "Greater than or equal to",
972
+ "const": "gte"
973
+ },
974
+ {
975
+ "description": "Less than",
976
+ "const": "lt"
977
+ },
978
+ {
979
+ "description": "Less than or equal to",
980
+ "const": "lte"
981
+ },
982
+ {
983
+ "description": "Equal to",
984
+ "const": "eq"
985
+ }
986
+ ]
987
+ },
988
+ "value": {
989
+ "description": "Threshold value",
990
+ "type": "number"
991
+ }
992
+ },
993
+ "required": [
994
+ "operator",
995
+ "value"
996
+ ],
997
+ "additionalProperties": false
998
+ },
999
+ "type": {
1000
+ "description": "Type of metrics criterion",
1001
+ "type": "string",
1002
+ "oneOf": [
1003
+ {
1004
+ "description": "Compare a metric against a fixed threshold value",
1005
+ "const": "threshold"
1006
+ },
1007
+ {
1008
+ "description": "Compare a metric against a baseline SUT",
1009
+ "const": "baseline"
1010
+ },
1011
+ {
1012
+ "description": "Check that a metric falls within a target range",
1013
+ "const": "target-range"
1014
+ }
1015
+ ]
1016
+ }
1017
+ },
1018
+ "required": [
1019
+ "criterionId",
1020
+ "description",
1021
+ "metric",
1022
+ "sut",
1023
+ "type"
1024
+ ],
1025
+ "additionalProperties": false
1026
+ },
1027
+ "minItems": 1
1028
+ },
1029
+ "description": {
1030
+ "description": "Evaluator description",
1031
+ "type": "string"
1032
+ },
1033
+ "name": {
1034
+ "description": "Human-readable evaluator name",
1035
+ "type": "string"
1036
+ },
1037
+ "options": {
1038
+ "description": "Additional evaluator-specific options",
1039
+ "type": "object",
1040
+ "additionalProperties": {},
1041
+ "propertyNames": {
1042
+ "type": "string"
1043
+ }
1044
+ }
1045
+ },
1046
+ "required": [
1047
+ "criteria"
1048
+ ],
1049
+ "additionalProperties": false,
1050
+ "examples": [
1051
+ {
1052
+ "description": "Evaluate length metric against threshold, baseline, and target-range criteria",
1053
+ "criteria": [
1054
+ {
1055
+ "description": "Measured length should be greater than zero",
1056
+ "type": "threshold",
1057
+ "criterionId": "length-threshold",
1058
+ "metric": "length",
1059
+ "sut": "*",
1060
+ "threshold": {
1061
+ "operator": "gt",
1062
+ "value": 0
1063
+ }
1064
+ },
1065
+ {
1066
+ "description": "Built-in .length should be at least as large as spread operator",
1067
+ "type": "baseline",
1068
+ "baseline": {
1069
+ "operator": "gte",
1070
+ "sut": "spread-length"
1071
+ },
1072
+ "criterionId": "length-baseline",
1073
+ "metric": "length",
1074
+ "sut": "builtin-length"
1075
+ },
1076
+ {
1077
+ "description": "Length should be in reasonable range [1, 100]",
1078
+ "type": "target-range",
1079
+ "criterionId": "length-target-range",
1080
+ "metric": "length",
1081
+ "sut": "*",
1082
+ "targetRange": {
1083
+ "max": 100,
1084
+ "maxInclusive": true,
1085
+ "min": 1,
1086
+ "minInclusive": true
1087
+ }
1088
+ }
1089
+ ],
1090
+ "name": "Metrics-Only Evaluation"
1091
+ }
1092
+ ]
1093
+ },
1094
+ "RobustnessEvaluatorConfig": {
1095
+ "title": "RobustnessEvaluatorConfig",
1096
+ "description": "Configuration for the robustness evaluator",
1097
+ "type": "object",
1098
+ "properties": {
1099
+ "description": {
1100
+ "description": "Evaluator description",
1101
+ "type": "string"
1102
+ },
1103
+ "intensityLevels": {
1104
+ "description": "Intensity levels tested",
1105
+ "type": "array",
1106
+ "items": {
1107
+ "type": "number"
1108
+ }
1109
+ },
1110
+ "metrics": {
1111
+ "description": "Metrics to analyze",
1112
+ "type": "array",
1113
+ "items": {
1114
+ "type": "string",
1115
+ "minLength": 1
1116
+ },
1117
+ "minItems": 1
1118
+ },
1119
+ "name": {
1120
+ "description": "Human-readable evaluator name",
1121
+ "type": "string"
1122
+ },
1123
+ "options": {
1124
+ "description": "Additional evaluator-specific options",
1125
+ "type": "object",
1126
+ "additionalProperties": {},
1127
+ "propertyNames": {
1128
+ "type": "string"
1129
+ }
1130
+ },
1131
+ "perturbations": {
1132
+ "description": "Perturbations applied",
1133
+ "type": "array",
1134
+ "items": {
1135
+ "type": "string",
1136
+ "minLength": 1
1137
+ },
1138
+ "minItems": 1
1139
+ },
1140
+ "runsPerLevel": {
1141
+ "description": "Number of runs per perturbation level",
1142
+ "type": "integer",
1143
+ "minimum": 1,
1144
+ "maximum": 10000
1145
+ }
1146
+ },
1147
+ "required": [
1148
+ "metrics",
1149
+ "perturbations"
1150
+ ],
1151
+ "additionalProperties": false,
1152
+ "examples": [
1153
+ {
1154
+ "description": "Analyze algorithm robustness under perturbations",
1155
+ "intensityLevels": [
1156
+ 0.1,
1157
+ 0.2,
1158
+ 0.3,
1159
+ 0.4,
1160
+ 0.5
1161
+ ],
1162
+ "metrics": [
1163
+ "executionTime",
1164
+ "accuracy",
1165
+ "f1Score"
1166
+ ],
1167
+ "name": "Robustness Analysis",
1168
+ "perturbations": [
1169
+ "edge-removal",
1170
+ "noise",
1171
+ "seed-shift"
1172
+ ],
1173
+ "runsPerLevel": 10
1174
+ }
1175
+ ]
1176
+ }
1177
+ }
1178
+ }