metaensemble 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. evals/README.md +147 -0
  2. evals/__init__.py +0 -0
  3. evals/cassettes/README.md +10 -0
  4. evals/cassettes/bootstrap.jsonl +800 -0
  5. evals/configs/default.yaml +59 -0
  6. evals/datasets/__init__.py +0 -0
  7. evals/datasets/suite_a/tasks.yaml +123 -0
  8. evals/datasets/suite_b/items.yaml +90 -0
  9. evals/runners/__init__.py +12 -0
  10. evals/runners/api.py +518 -0
  11. evals/runners/metrics.py +132 -0
  12. metaensemble/__init__.py +13 -0
  13. metaensemble/cli.py +1362 -0
  14. metaensemble/commands/dispatch.md +39 -0
  15. metaensemble/commands/executors.md +12 -0
  16. metaensemble/commands/ledger.md +19 -0
  17. metaensemble/commands/limits.md +12 -0
  18. metaensemble/commands/perf.md +12 -0
  19. metaensemble/commands/relaunch.md +29 -0
  20. metaensemble/commands/standup.md +14 -0
  21. metaensemble/config/budgets.example.yaml +72 -0
  22. metaensemble/config/quality.example.yaml +82 -0
  23. metaensemble/hooks/__init__.py +1 -0
  24. metaensemble/hooks/_common.py +148 -0
  25. metaensemble/hooks/deliverable_sync.py +73 -0
  26. metaensemble/hooks/file_event.py +303 -0
  27. metaensemble/hooks/post_task.py +460 -0
  28. metaensemble/hooks/pre_task.py +548 -0
  29. metaensemble/hooks/session_start.py +212 -0
  30. metaensemble/hooks/session_summary.py +392 -0
  31. metaensemble/hooks/subagent_stop.py +94 -0
  32. metaensemble/lib/__init__.py +1 -0
  33. metaensemble/lib/config.py +414 -0
  34. metaensemble/lib/cost_gate.py +299 -0
  35. metaensemble/lib/dispatch.py +341 -0
  36. metaensemble/lib/doctor.py +1563 -0
  37. metaensemble/lib/file_events.py +395 -0
  38. metaensemble/lib/ids.py +91 -0
  39. metaensemble/lib/installer.py +5018 -0
  40. metaensemble/lib/ledger.py +812 -0
  41. metaensemble/lib/manifest.py +141 -0
  42. metaensemble/lib/native_state.py +463 -0
  43. metaensemble/lib/overlaps.py +155 -0
  44. metaensemble/lib/quality_gate.py +155 -0
  45. metaensemble/lib/quality_runners.py +446 -0
  46. metaensemble/lib/reconcile.py +420 -0
  47. metaensemble/lib/recording.py +422 -0
  48. metaensemble/lib/relaunch.py +174 -0
  49. metaensemble/lib/runtime_payload.py +42 -0
  50. metaensemble/lib/runtime_state.py +308 -0
  51. metaensemble/lib/sidecar.py +166 -0
  52. metaensemble/lib/topology.py +181 -0
  53. metaensemble/lib/transcript.py +432 -0
  54. metaensemble/output-styles/deliverable.md +33 -0
  55. metaensemble/output-styles/wire.md +38 -0
  56. metaensemble/roles/architect.md +52 -0
  57. metaensemble/roles/backend.md +43 -0
  58. metaensemble/roles/code-quality.md +49 -0
  59. metaensemble/roles/data-engineer.md +42 -0
  60. metaensemble/roles/devops.md +42 -0
  61. metaensemble/roles/docs.md +41 -0
  62. metaensemble/roles/frontend.md +42 -0
  63. metaensemble/roles/ml-engineer.md +42 -0
  64. metaensemble/roles/test-engineer.md +42 -0
  65. metaensemble/schemas/brief.schema.json +80 -0
  66. metaensemble/schemas/manifest.schema.json +142 -0
  67. metaensemble/schemas/role.schema.json +84 -0
  68. metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
  69. metaensemble/state/migrations/001_init.sql +72 -0
  70. metaensemble/state/migrations/002_outcome_extended.sql +86 -0
  71. metaensemble/state/migrations/003_run_provenance.sql +36 -0
  72. metaensemble/statusline/me_status.py +187 -0
  73. metaensemble/tools/__init__.py +7 -0
  74. metaensemble/tools/executors.py +62 -0
  75. metaensemble/tools/ledger.py +121 -0
  76. metaensemble/tools/limits.py +165 -0
  77. metaensemble/tools/perf.py +150 -0
  78. metaensemble/tools/standup.py +177 -0
  79. metaensemble/tools/stats.py +115 -0
  80. metaensemble-0.2.0.dist-info/METADATA +221 -0
  81. metaensemble-0.2.0.dist-info/RECORD +85 -0
  82. metaensemble-0.2.0.dist-info/WHEEL +5 -0
  83. metaensemble-0.2.0.dist-info/entry_points.txt +2 -0
  84. metaensemble-0.2.0.dist-info/licenses/LICENSE +21 -0
  85. metaensemble-0.2.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,800 @@
1
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1600.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 900, "tokens_out": 450}
2
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1653.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 913, "tokens_out": 461}
3
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1706.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 926, "tokens_out": 472}
4
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1759.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 939, "tokens_out": 483}
5
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1812.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 952, "tokens_out": 494}
6
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1617.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 909, "tokens_out": 455}
7
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1670.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 922, "tokens_out": 466}
8
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1723.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 935, "tokens_out": 477}
9
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1776.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 948, "tokens_out": 488}
10
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1829.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 961, "tokens_out": 499}
11
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1634.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 918, "tokens_out": 460}
12
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1687.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 931, "tokens_out": 471}
13
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1740.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 944, "tokens_out": 482}
14
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1793.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 957, "tokens_out": 493}
15
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1846.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 970, "tokens_out": 504}
16
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1651.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 927, "tokens_out": 465}
17
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1704.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 940, "tokens_out": 476}
18
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1757.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 953, "tokens_out": 487}
19
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1810.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 966, "tokens_out": 498}
20
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1863.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 979, "tokens_out": 509}
21
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1668.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 936, "tokens_out": 470}
22
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1721.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 949, "tokens_out": 481}
23
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1774.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 962, "tokens_out": 492}
24
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1827.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 975, "tokens_out": 503}
25
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1880.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 988, "tokens_out": 514}
26
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1685.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 945, "tokens_out": 475}
27
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1738.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 958, "tokens_out": 486}
28
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1791.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 971, "tokens_out": 497}
29
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1844.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 984, "tokens_out": 508}
30
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1897.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 997, "tokens_out": 519}
31
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1702.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 954, "tokens_out": 480}
32
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1755.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 967, "tokens_out": 491}
33
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1808.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 980, "tokens_out": 502}
34
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1861.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 993, "tokens_out": 513}
35
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1914.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1006, "tokens_out": 524}
36
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1719.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 963, "tokens_out": 485}
37
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1772.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 976, "tokens_out": 496}
38
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1825.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 989, "tokens_out": 507}
39
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1878.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1002, "tokens_out": 518}
40
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1931.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1015, "tokens_out": 529}
41
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1736.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 972, "tokens_out": 490}
42
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1789.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 985, "tokens_out": 501}
43
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1842.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 998, "tokens_out": 512}
44
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1895.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1011, "tokens_out": 523}
45
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1948.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1024, "tokens_out": 534}
46
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1753.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 981, "tokens_out": 495}
47
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1806.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 994, "tokens_out": 506}
48
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1859.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1007, "tokens_out": 517}
49
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1912.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1020, "tokens_out": 528}
50
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1965.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1033, "tokens_out": 539}
51
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1770.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 990, "tokens_out": 500}
52
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1823.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1003, "tokens_out": 511}
53
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1876.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1016, "tokens_out": 522}
54
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1929.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1029, "tokens_out": 533}
55
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1982.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1042, "tokens_out": 544}
56
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1787.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 999, "tokens_out": 505}
57
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1840.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1012, "tokens_out": 516}
58
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1893.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1025, "tokens_out": 527}
59
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1946.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1038, "tokens_out": 538}
60
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1999.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1051, "tokens_out": 549}
61
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1804.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1008, "tokens_out": 510}
62
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1857.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1021, "tokens_out": 521}
63
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1910.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1034, "tokens_out": 532}
64
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1963.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1047, "tokens_out": 543}
65
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2016.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1060, "tokens_out": 554}
66
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1821.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1017, "tokens_out": 515}
67
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1874.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1030, "tokens_out": 526}
68
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1927.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1043, "tokens_out": 537}
69
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1980.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1056, "tokens_out": 548}
70
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2033.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1069, "tokens_out": 559}
71
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1838.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1026, "tokens_out": 520}
72
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1891.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1039, "tokens_out": 531}
73
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1944.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1052, "tokens_out": 542}
74
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1997.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1065, "tokens_out": 553}
75
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2050.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1078, "tokens_out": 564}
76
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1855.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1035, "tokens_out": 525}
77
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1908.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1048, "tokens_out": 536}
78
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1961.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1061, "tokens_out": 547}
79
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2014.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1074, "tokens_out": 558}
80
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2067.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1087, "tokens_out": 569}
81
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1872.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1044, "tokens_out": 530}
82
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1925.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1057, "tokens_out": 541}
83
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1978.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1070, "tokens_out": 552}
84
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2031.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1083, "tokens_out": 563}
85
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2084.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1096, "tokens_out": 574}
86
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1889.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1053, "tokens_out": 535}
87
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1942.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1066, "tokens_out": 546}
88
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1995.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1079, "tokens_out": 557}
89
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2048.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1092, "tokens_out": 568}
90
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2101.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1105, "tokens_out": 579}
91
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1906.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1062, "tokens_out": 540}
92
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1959.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1075, "tokens_out": 551}
93
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2012.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1088, "tokens_out": 562}
94
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2065.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1101, "tokens_out": 573}
95
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2118.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1114, "tokens_out": 584}
96
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1923.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1071, "tokens_out": 545}
97
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 1976.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1084, "tokens_out": 556}
98
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2029.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1097, "tokens_out": 567}
99
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2082.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1110, "tokens_out": 578}
100
+ {"budget_exceeded": false, "cell_id": "B1_single_agent", "duration_ms": 2135.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1123, "tokens_out": 589}
101
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1600.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1050, "tokens_out": 520}
102
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1653.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1063, "tokens_out": 531}
103
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1706.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1076, "tokens_out": 542}
104
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1759.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1089, "tokens_out": 553}
105
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1812.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1102, "tokens_out": 564}
106
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1617.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1059, "tokens_out": 525}
107
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1670.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1072, "tokens_out": 536}
108
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1723.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1085, "tokens_out": 547}
109
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1776.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1098, "tokens_out": 558}
110
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1829.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1111, "tokens_out": 569}
111
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1634.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1068, "tokens_out": 530}
112
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1687.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1081, "tokens_out": 541}
113
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1740.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1094, "tokens_out": 552}
114
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1793.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1107, "tokens_out": 563}
115
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1846.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1120, "tokens_out": 574}
116
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1651.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1077, "tokens_out": 535}
117
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1704.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1090, "tokens_out": 546}
118
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1757.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1103, "tokens_out": 557}
119
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1810.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1116, "tokens_out": 568}
120
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1863.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1129, "tokens_out": 579}
121
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1668.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1086, "tokens_out": 540}
122
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1721.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1099, "tokens_out": 551}
123
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1774.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1112, "tokens_out": 562}
124
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1827.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1125, "tokens_out": 573}
125
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1880.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1138, "tokens_out": 584}
126
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1685.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1095, "tokens_out": 545}
127
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1738.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1108, "tokens_out": 556}
128
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1791.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1121, "tokens_out": 567}
129
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1844.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1134, "tokens_out": 578}
130
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1897.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1147, "tokens_out": 589}
131
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1702.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1104, "tokens_out": 550}
132
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1755.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1117, "tokens_out": 561}
133
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1808.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1130, "tokens_out": 572}
134
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1861.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1143, "tokens_out": 583}
135
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1914.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1156, "tokens_out": 594}
136
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1719.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1113, "tokens_out": 555}
137
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1772.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1126, "tokens_out": 566}
138
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1825.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1139, "tokens_out": 577}
139
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1878.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1152, "tokens_out": 588}
140
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1931.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1165, "tokens_out": 599}
141
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1736.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1122, "tokens_out": 560}
142
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1789.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1135, "tokens_out": 571}
143
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1842.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1148, "tokens_out": 582}
144
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1895.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1161, "tokens_out": 593}
145
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1948.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1174, "tokens_out": 604}
146
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1753.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1131, "tokens_out": 565}
147
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1806.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1144, "tokens_out": 576}
148
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1859.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1157, "tokens_out": 587}
149
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1912.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1170, "tokens_out": 598}
150
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1965.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1183, "tokens_out": 609}
151
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1770.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1140, "tokens_out": 570}
152
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1823.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1153, "tokens_out": 581}
153
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1876.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1166, "tokens_out": 592}
154
+ {"budget_exceeded": true, "cell_id": "B2_single_agent_prompted", "duration_ms": 1929.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1179, "tokens_out": 603}
155
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1982.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1192, "tokens_out": 614}
156
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1787.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1149, "tokens_out": 575}
157
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1840.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1162, "tokens_out": 586}
158
+ {"budget_exceeded": true, "cell_id": "B2_single_agent_prompted", "duration_ms": 1893.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1175, "tokens_out": 597}
159
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1946.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1188, "tokens_out": 608}
160
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1999.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1201, "tokens_out": 619}
161
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1804.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1158, "tokens_out": 580}
162
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1857.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1171, "tokens_out": 591}
163
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1910.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1184, "tokens_out": 602}
164
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1963.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1197, "tokens_out": 613}
165
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2016.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1210, "tokens_out": 624}
166
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1821.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1167, "tokens_out": 585}
167
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1874.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1180, "tokens_out": 596}
168
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1927.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1193, "tokens_out": 607}
169
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1980.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1206, "tokens_out": 618}
170
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2033.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1219, "tokens_out": 629}
171
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1838.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1176, "tokens_out": 590}
172
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1891.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1189, "tokens_out": 601}
173
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1944.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1202, "tokens_out": 612}
174
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1997.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1215, "tokens_out": 623}
175
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2050.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1228, "tokens_out": 634}
176
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1855.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1185, "tokens_out": 595}
177
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1908.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1198, "tokens_out": 606}
178
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1961.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1211, "tokens_out": 617}
179
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2014.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1224, "tokens_out": 628}
180
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2067.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1237, "tokens_out": 639}
181
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1872.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1194, "tokens_out": 600}
182
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1925.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1207, "tokens_out": 611}
183
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1978.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1220, "tokens_out": 622}
184
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2031.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1233, "tokens_out": 633}
185
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2084.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1246, "tokens_out": 644}
186
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1889.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1203, "tokens_out": 605}
187
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1942.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1216, "tokens_out": 616}
188
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1995.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1229, "tokens_out": 627}
189
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2048.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1242, "tokens_out": 638}
190
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2101.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1255, "tokens_out": 649}
191
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1906.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1212, "tokens_out": 610}
192
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1959.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1225, "tokens_out": 621}
193
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2012.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1238, "tokens_out": 632}
194
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2065.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1251, "tokens_out": 643}
195
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2118.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1264, "tokens_out": 654}
196
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1923.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1221, "tokens_out": 615}
197
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 1976.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1234, "tokens_out": 626}
198
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2029.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1247, "tokens_out": 637}
199
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2082.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1260, "tokens_out": 648}
200
+ {"budget_exceeded": false, "cell_id": "B2_single_agent_prompted", "duration_ms": 2135.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1273, "tokens_out": 659}
201
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1600.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1150, "tokens_out": 560}
202
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1653.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1163, "tokens_out": 571}
203
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1706.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1176, "tokens_out": 582}
204
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1759.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1189, "tokens_out": 593}
205
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1812.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1202, "tokens_out": 604}
206
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1617.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1159, "tokens_out": 565}
207
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1670.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1172, "tokens_out": 576}
208
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1723.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1185, "tokens_out": 587}
209
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1776.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1198, "tokens_out": 598}
210
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1829.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1211, "tokens_out": 609}
211
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1634.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1168, "tokens_out": 570}
212
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1687.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1181, "tokens_out": 581}
213
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1740.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1194, "tokens_out": 592}
214
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1793.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1207, "tokens_out": 603}
215
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1846.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1220, "tokens_out": 614}
216
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1651.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1177, "tokens_out": 575}
217
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1704.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1190, "tokens_out": 586}
218
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1757.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1203, "tokens_out": 597}
219
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1810.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1216, "tokens_out": 608}
220
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1863.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1229, "tokens_out": 619}
221
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1668.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1186, "tokens_out": 580}
222
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1721.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1199, "tokens_out": 591}
223
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1774.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1212, "tokens_out": 602}
224
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1827.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1225, "tokens_out": 613}
225
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1880.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1238, "tokens_out": 624}
226
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1685.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1195, "tokens_out": 585}
227
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1738.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1208, "tokens_out": 596}
228
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1791.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1221, "tokens_out": 607}
229
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1844.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1234, "tokens_out": 618}
230
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1897.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1247, "tokens_out": 629}
231
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1702.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1204, "tokens_out": 590}
232
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1755.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1217, "tokens_out": 601}
233
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1808.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1230, "tokens_out": 612}
234
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1861.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1243, "tokens_out": 623}
235
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1914.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1256, "tokens_out": 634}
236
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1719.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1213, "tokens_out": 595}
237
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1772.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1226, "tokens_out": 606}
238
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1825.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1239, "tokens_out": 617}
239
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1878.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1252, "tokens_out": 628}
240
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1931.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1265, "tokens_out": 639}
241
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1736.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1222, "tokens_out": 600}
242
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1789.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1235, "tokens_out": 611}
243
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1842.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1248, "tokens_out": 622}
244
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1895.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1261, "tokens_out": 633}
245
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1948.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1274, "tokens_out": 644}
246
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1753.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1231, "tokens_out": 605}
247
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1806.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1244, "tokens_out": 616}
248
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1859.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1257, "tokens_out": 627}
249
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1912.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1270, "tokens_out": 638}
250
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1965.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1283, "tokens_out": 649}
251
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1770.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1240, "tokens_out": 610}
252
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1823.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1253, "tokens_out": 621}
253
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1876.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1266, "tokens_out": 632}
254
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1929.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1279, "tokens_out": 643}
255
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1982.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1292, "tokens_out": 654}
256
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1787.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1249, "tokens_out": 615}
257
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1840.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1262, "tokens_out": 626}
258
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1893.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1275, "tokens_out": 637}
259
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1946.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1288, "tokens_out": 648}
260
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1999.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1301, "tokens_out": 659}
261
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1804.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1258, "tokens_out": 620}
262
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1857.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1271, "tokens_out": 631}
263
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1910.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1284, "tokens_out": 642}
264
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1963.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1297, "tokens_out": 653}
265
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2016.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1310, "tokens_out": 664}
266
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1821.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1267, "tokens_out": 625}
267
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1874.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1280, "tokens_out": 636}
268
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1927.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1293, "tokens_out": 647}
269
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1980.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1306, "tokens_out": 658}
270
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2033.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1319, "tokens_out": 669}
271
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1838.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1276, "tokens_out": 630}
272
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1891.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1289, "tokens_out": 641}
273
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1944.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1302, "tokens_out": 652}
274
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1997.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1315, "tokens_out": 663}
275
+ {"budget_exceeded": true, "cell_id": "B3_subagent_default", "duration_ms": 2050.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1328, "tokens_out": 674}
276
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1855.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1285, "tokens_out": 635}
277
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1908.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1298, "tokens_out": 646}
278
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1961.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1311, "tokens_out": 657}
279
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2014.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1324, "tokens_out": 668}
280
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2067.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1337, "tokens_out": 679}
281
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1872.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1294, "tokens_out": 640}
282
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1925.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1307, "tokens_out": 651}
283
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1978.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1320, "tokens_out": 662}
284
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2031.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1333, "tokens_out": 673}
285
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2084.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1346, "tokens_out": 684}
286
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1889.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1303, "tokens_out": 645}
287
+ {"budget_exceeded": true, "cell_id": "B3_subagent_default", "duration_ms": 1942.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1316, "tokens_out": 656}
288
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1995.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1329, "tokens_out": 667}
289
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2048.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1342, "tokens_out": 678}
290
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2101.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1355, "tokens_out": 689}
291
+ {"budget_exceeded": true, "cell_id": "B3_subagent_default", "duration_ms": 1906.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1312, "tokens_out": 650}
292
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1959.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1325, "tokens_out": 661}
293
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2012.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1338, "tokens_out": 672}
294
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2065.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1351, "tokens_out": 683}
295
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2118.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1364, "tokens_out": 694}
296
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1923.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1321, "tokens_out": 655}
297
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 1976.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1334, "tokens_out": 666}
298
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2029.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1347, "tokens_out": 677}
299
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2082.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1360, "tokens_out": 688}
300
+ {"budget_exceeded": false, "cell_id": "B3_subagent_default", "duration_ms": 2135.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1373, "tokens_out": 699}
301
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1600.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1300, "tokens_out": 620}
302
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1653.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1313, "tokens_out": 631}
303
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1706.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1326, "tokens_out": 642}
304
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1759.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1339, "tokens_out": 653}
305
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1812.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1352, "tokens_out": 664}
306
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1617.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1309, "tokens_out": 625}
307
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1670.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1322, "tokens_out": 636}
308
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1723.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1335, "tokens_out": 647}
309
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1776.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1348, "tokens_out": 658}
310
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1829.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1361, "tokens_out": 669}
311
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1634.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1318, "tokens_out": 630}
312
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1687.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1331, "tokens_out": 641}
313
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1740.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1344, "tokens_out": 652}
314
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1793.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1357, "tokens_out": 663}
315
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1846.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1370, "tokens_out": 674}
316
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1651.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1327, "tokens_out": 635}
317
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1704.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1340, "tokens_out": 646}
318
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1757.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1353, "tokens_out": 657}
319
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1810.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1366, "tokens_out": 668}
320
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1863.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1379, "tokens_out": 679}
321
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1668.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1336, "tokens_out": 640}
322
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1721.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1349, "tokens_out": 651}
323
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1774.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1362, "tokens_out": 662}
324
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1827.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1375, "tokens_out": 673}
325
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1880.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1388, "tokens_out": 684}
326
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1685.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1345, "tokens_out": 645}
327
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1738.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1358, "tokens_out": 656}
328
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1791.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1371, "tokens_out": 667}
329
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1844.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1384, "tokens_out": 678}
330
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1897.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1397, "tokens_out": 689}
331
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1702.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1354, "tokens_out": 650}
332
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1755.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1367, "tokens_out": 661}
333
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1808.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1380, "tokens_out": 672}
334
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1861.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1393, "tokens_out": 683}
335
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1914.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1406, "tokens_out": 694}
336
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1719.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1363, "tokens_out": 655}
337
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1772.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1376, "tokens_out": 666}
338
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1825.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1389, "tokens_out": 677}
339
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1878.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1402, "tokens_out": 688}
340
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1931.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1415, "tokens_out": 699}
341
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1736.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1372, "tokens_out": 660}
342
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1789.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1385, "tokens_out": 671}
343
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1842.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1398, "tokens_out": 682}
344
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1895.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1411, "tokens_out": 693}
345
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1948.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1424, "tokens_out": 704}
346
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1753.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1381, "tokens_out": 665}
347
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1806.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1394, "tokens_out": 676}
348
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1859.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1407, "tokens_out": 687}
349
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1912.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1420, "tokens_out": 698}
350
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1965.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1433, "tokens_out": 709}
351
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1770.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1390, "tokens_out": 670}
352
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1823.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1403, "tokens_out": 681}
353
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1876.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1416, "tokens_out": 692}
354
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1929.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1429, "tokens_out": 703}
355
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1982.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1442, "tokens_out": 714}
356
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1787.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1399, "tokens_out": 675}
357
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1840.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1412, "tokens_out": 686}
358
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1893.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1425, "tokens_out": 697}
359
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1946.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1438, "tokens_out": 708}
360
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1999.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1451, "tokens_out": 719}
361
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1804.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1408, "tokens_out": 680}
362
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1857.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1421, "tokens_out": 691}
363
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1910.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1434, "tokens_out": 702}
364
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1963.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1447, "tokens_out": 713}
365
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2016.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1460, "tokens_out": 724}
366
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1821.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1417, "tokens_out": 685}
367
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1874.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1430, "tokens_out": 696}
368
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1927.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1443, "tokens_out": 707}
369
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1980.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1456, "tokens_out": 718}
370
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2033.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1469, "tokens_out": 729}
371
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1838.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1426, "tokens_out": 690}
372
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1891.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1439, "tokens_out": 701}
373
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1944.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1452, "tokens_out": 712}
374
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1997.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1465, "tokens_out": 723}
375
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2050.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1478, "tokens_out": 734}
376
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1855.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1435, "tokens_out": 695}
377
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1908.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1448, "tokens_out": 706}
378
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1961.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1461, "tokens_out": 717}
379
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2014.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1474, "tokens_out": 728}
380
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2067.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1487, "tokens_out": 739}
381
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1872.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1444, "tokens_out": 700}
382
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1925.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1457, "tokens_out": 711}
383
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1978.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1470, "tokens_out": 722}
384
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2031.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1483, "tokens_out": 733}
385
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2084.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1496, "tokens_out": 744}
386
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1889.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1453, "tokens_out": 705}
387
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1942.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1466, "tokens_out": 716}
388
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1995.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1479, "tokens_out": 727}
389
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2048.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1492, "tokens_out": 738}
390
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2101.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1505, "tokens_out": 749}
391
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1906.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1462, "tokens_out": 710}
392
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1959.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1475, "tokens_out": 721}
393
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2012.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1488, "tokens_out": 732}
394
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2065.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1501, "tokens_out": 743}
395
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2118.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1514, "tokens_out": 754}
396
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1923.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1471, "tokens_out": 715}
397
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 1976.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1484, "tokens_out": 726}
398
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2029.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1497, "tokens_out": 737}
399
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2082.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1510, "tokens_out": 748}
400
+ {"budget_exceeded": false, "cell_id": "B4_best_prompt", "duration_ms": 2135.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1523, "tokens_out": 759}
401
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1600.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1750, "tokens_out": 820}
402
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1653.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1763, "tokens_out": 831}
403
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1706.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1776, "tokens_out": 842}
404
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1759.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1789, "tokens_out": 853}
405
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1812.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1802, "tokens_out": 864}
406
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1617.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1759, "tokens_out": 825}
407
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1670.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1772, "tokens_out": 836}
408
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1723.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1785, "tokens_out": 847}
409
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1776.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1798, "tokens_out": 858}
410
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1829.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1811, "tokens_out": 869}
411
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1634.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1768, "tokens_out": 830}
412
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1687.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1781, "tokens_out": 841}
413
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1740.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1794, "tokens_out": 852}
414
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1793.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1807, "tokens_out": 863}
415
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1846.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1820, "tokens_out": 874}
416
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1651.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1777, "tokens_out": 835}
417
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1704.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1790, "tokens_out": 846}
418
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1757.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1803, "tokens_out": 857}
419
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1810.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1816, "tokens_out": 868}
420
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1863.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1829, "tokens_out": 879}
421
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1668.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1786, "tokens_out": 840}
422
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1721.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1799, "tokens_out": 851}
423
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1774.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1812, "tokens_out": 862}
424
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1827.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1825, "tokens_out": 873}
425
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1880.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1838, "tokens_out": 884}
426
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1685.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1795, "tokens_out": 845}
427
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1738.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1808, "tokens_out": 856}
428
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1791.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1821, "tokens_out": 867}
429
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1844.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1834, "tokens_out": 878}
430
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1897.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1847, "tokens_out": 889}
431
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1702.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1804, "tokens_out": 850}
432
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1755.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1817, "tokens_out": 861}
433
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1808.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1830, "tokens_out": 872}
434
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1861.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1843, "tokens_out": 883}
435
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1914.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1856, "tokens_out": 894}
436
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1719.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1813, "tokens_out": 855}
437
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1772.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1826, "tokens_out": 866}
438
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1825.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1839, "tokens_out": 877}
439
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1878.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1852, "tokens_out": 888}
440
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1931.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1865, "tokens_out": 899}
441
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1736.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1822, "tokens_out": 860}
442
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1789.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1835, "tokens_out": 871}
443
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1842.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1848, "tokens_out": 882}
444
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1895.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1861, "tokens_out": 893}
445
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1948.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1874, "tokens_out": 904}
446
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1753.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1831, "tokens_out": 865}
447
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1806.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1844, "tokens_out": 876}
448
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1859.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1857, "tokens_out": 887}
449
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1912.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1870, "tokens_out": 898}
450
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1965.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1883, "tokens_out": 909}
451
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1770.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1840, "tokens_out": 870}
452
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1823.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1853, "tokens_out": 881}
453
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1876.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1866, "tokens_out": 892}
454
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1929.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1879, "tokens_out": 903}
455
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1982.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1892, "tokens_out": 914}
456
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1787.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1849, "tokens_out": 875}
457
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1840.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1862, "tokens_out": 886}
458
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1893.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1875, "tokens_out": 897}
459
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1946.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1888, "tokens_out": 908}
460
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1999.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1901, "tokens_out": 919}
461
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1804.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1858, "tokens_out": 880}
462
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1857.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1871, "tokens_out": 891}
463
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1910.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1884, "tokens_out": 902}
464
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1963.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1897, "tokens_out": 913}
465
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2016.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1910, "tokens_out": 924}
466
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1821.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1867, "tokens_out": 885}
467
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1874.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1880, "tokens_out": 896}
468
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1927.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1893, "tokens_out": 907}
469
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1980.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1906, "tokens_out": 918}
470
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2033.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1919, "tokens_out": 929}
471
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1838.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1876, "tokens_out": 890}
472
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1891.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1889, "tokens_out": 901}
473
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1944.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1902, "tokens_out": 912}
474
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1997.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1915, "tokens_out": 923}
475
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2050.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1928, "tokens_out": 934}
476
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1855.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1885, "tokens_out": 895}
477
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1908.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1898, "tokens_out": 906}
478
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1961.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1911, "tokens_out": 917}
479
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2014.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1924, "tokens_out": 928}
480
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2067.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1937, "tokens_out": 939}
481
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1872.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1894, "tokens_out": 900}
482
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1925.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1907, "tokens_out": 911}
483
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1978.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1920, "tokens_out": 922}
484
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2031.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1933, "tokens_out": 933}
485
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2084.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1946, "tokens_out": 944}
486
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1889.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1903, "tokens_out": 905}
487
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1942.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1916, "tokens_out": 916}
488
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1995.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1929, "tokens_out": 927}
489
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2048.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1942, "tokens_out": 938}
490
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2101.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1955, "tokens_out": 949}
491
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1906.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1912, "tokens_out": 910}
492
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1959.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1925, "tokens_out": 921}
493
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2012.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1938, "tokens_out": 932}
494
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2065.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1951, "tokens_out": 943}
495
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2118.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1964, "tokens_out": 954}
496
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1923.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1921, "tokens_out": 915}
497
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 1976.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1934, "tokens_out": 926}
498
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2029.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1947, "tokens_out": 937}
499
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2082.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1960, "tokens_out": 948}
500
+ {"budget_exceeded": false, "cell_id": "MM_full", "duration_ms": 2135.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1973, "tokens_out": 959}
501
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1600.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1500, "tokens_out": 740}
502
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1653.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1513, "tokens_out": 751}
503
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1706.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1526, "tokens_out": 762}
504
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1759.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1539, "tokens_out": 773}
505
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1812.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1552, "tokens_out": 784}
506
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1617.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1509, "tokens_out": 745}
507
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1670.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1522, "tokens_out": 756}
508
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1723.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1535, "tokens_out": 767}
509
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1776.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1548, "tokens_out": 778}
510
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1829.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1561, "tokens_out": 789}
511
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1634.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1518, "tokens_out": 750}
512
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1687.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1531, "tokens_out": 761}
513
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1740.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1544, "tokens_out": 772}
514
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1793.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1557, "tokens_out": 783}
515
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1846.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1570, "tokens_out": 794}
516
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1651.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1527, "tokens_out": 755}
517
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1704.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1540, "tokens_out": 766}
518
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1757.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1553, "tokens_out": 777}
519
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1810.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1566, "tokens_out": 788}
520
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1863.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1579, "tokens_out": 799}
521
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1668.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1536, "tokens_out": 760}
522
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1721.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1549, "tokens_out": 771}
523
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1774.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1562, "tokens_out": 782}
524
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1827.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1575, "tokens_out": 793}
525
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1880.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1588, "tokens_out": 804}
526
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1685.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1545, "tokens_out": 765}
527
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1738.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1558, "tokens_out": 776}
528
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1791.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1571, "tokens_out": 787}
529
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1844.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1584, "tokens_out": 798}
530
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1897.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1597, "tokens_out": 809}
531
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1702.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1554, "tokens_out": 770}
532
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1755.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1567, "tokens_out": 781}
533
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1808.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1580, "tokens_out": 792}
534
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1861.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1593, "tokens_out": 803}
535
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1914.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1606, "tokens_out": 814}
536
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1719.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1563, "tokens_out": 775}
537
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1772.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1576, "tokens_out": 786}
538
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1825.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1589, "tokens_out": 797}
539
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1878.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1602, "tokens_out": 808}
540
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1931.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1615, "tokens_out": 819}
541
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1736.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1572, "tokens_out": 780}
542
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1789.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1585, "tokens_out": 791}
543
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1842.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1598, "tokens_out": 802}
544
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1895.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1611, "tokens_out": 813}
545
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1948.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1624, "tokens_out": 824}
546
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1753.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1581, "tokens_out": 785}
547
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1806.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1594, "tokens_out": 796}
548
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1859.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1607, "tokens_out": 807}
549
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1912.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1620, "tokens_out": 818}
550
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1965.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1633, "tokens_out": 829}
551
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1770.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1590, "tokens_out": 790}
552
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1823.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1603, "tokens_out": 801}
553
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1876.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1616, "tokens_out": 812}
554
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1929.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1629, "tokens_out": 823}
555
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1982.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1642, "tokens_out": 834}
556
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1787.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1599, "tokens_out": 795}
557
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1840.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1612, "tokens_out": 806}
558
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1893.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1625, "tokens_out": 817}
559
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1946.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1638, "tokens_out": 828}
560
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1999.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1651, "tokens_out": 839}
561
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1804.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1608, "tokens_out": 800}
562
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1857.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1621, "tokens_out": 811}
563
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1910.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1634, "tokens_out": 822}
564
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1963.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1647, "tokens_out": 833}
565
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2016.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1660, "tokens_out": 844}
566
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1821.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1617, "tokens_out": 805}
567
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1874.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1630, "tokens_out": 816}
568
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1927.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1643, "tokens_out": 827}
569
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1980.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1656, "tokens_out": 838}
570
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2033.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1669, "tokens_out": 849}
571
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1838.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1626, "tokens_out": 810}
572
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1891.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1639, "tokens_out": 821}
573
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1944.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1652, "tokens_out": 832}
574
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1997.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1665, "tokens_out": 843}
575
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2050.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1678, "tokens_out": 854}
576
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1855.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1635, "tokens_out": 815}
577
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1908.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1648, "tokens_out": 826}
578
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1961.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1661, "tokens_out": 837}
579
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2014.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1674, "tokens_out": 848}
580
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2067.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1687, "tokens_out": 859}
581
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1872.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1644, "tokens_out": 820}
582
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1925.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1657, "tokens_out": 831}
583
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1978.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1670, "tokens_out": 842}
584
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2031.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1683, "tokens_out": 853}
585
+ {"budget_exceeded": true, "cell_id": "MM_minus_manifest", "duration_ms": 2084.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1696, "tokens_out": 864}
586
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1889.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1653, "tokens_out": 825}
587
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1942.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1666, "tokens_out": 836}
588
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1995.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1679, "tokens_out": 847}
589
+ {"budget_exceeded": true, "cell_id": "MM_minus_manifest", "duration_ms": 2048.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1692, "tokens_out": 858}
590
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2101.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1705, "tokens_out": 869}
591
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1906.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1662, "tokens_out": 830}
592
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1959.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1675, "tokens_out": 841}
593
+ {"budget_exceeded": true, "cell_id": "MM_minus_manifest", "duration_ms": 2012.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1688, "tokens_out": 852}
594
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2065.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1701, "tokens_out": 863}
595
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2118.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1714, "tokens_out": 874}
596
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1923.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1671, "tokens_out": 835}
597
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 1976.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1684, "tokens_out": 846}
598
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2029.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1697, "tokens_out": 857}
599
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2082.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1710, "tokens_out": 868}
600
+ {"budget_exceeded": false, "cell_id": "MM_minus_manifest", "duration_ms": 2135.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1723, "tokens_out": 879}
601
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1600.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1450, "tokens_out": 700}
602
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1653.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1463, "tokens_out": 711}
603
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1706.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1476, "tokens_out": 722}
604
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1759.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1489, "tokens_out": 733}
605
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1812.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1502, "tokens_out": 744}
606
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1617.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1459, "tokens_out": 705}
607
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1670.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1472, "tokens_out": 716}
608
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1723.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1485, "tokens_out": 727}
609
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1776.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1498, "tokens_out": 738}
610
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1829.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1511, "tokens_out": 749}
611
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1634.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1468, "tokens_out": 710}
612
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1687.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1481, "tokens_out": 721}
613
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1740.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1494, "tokens_out": 732}
614
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1793.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1507, "tokens_out": 743}
615
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1846.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1520, "tokens_out": 754}
616
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1651.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1477, "tokens_out": 715}
617
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1704.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1490, "tokens_out": 726}
618
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1757.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1503, "tokens_out": 737}
619
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1810.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1516, "tokens_out": 748}
620
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1863.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1529, "tokens_out": 759}
621
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1668.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1486, "tokens_out": 720}
622
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1721.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1499, "tokens_out": 731}
623
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1774.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1512, "tokens_out": 742}
624
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1827.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1525, "tokens_out": 753}
625
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1880.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1538, "tokens_out": 764}
626
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1685.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1495, "tokens_out": 725}
627
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1738.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1508, "tokens_out": 736}
628
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1791.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1521, "tokens_out": 747}
629
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1844.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1534, "tokens_out": 758}
630
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1897.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1547, "tokens_out": 769}
631
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1702.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1504, "tokens_out": 730}
632
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1755.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1517, "tokens_out": 741}
633
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1808.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1530, "tokens_out": 752}
634
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1861.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1543, "tokens_out": 763}
635
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1914.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1556, "tokens_out": 774}
636
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1719.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1513, "tokens_out": 735}
637
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1772.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1526, "tokens_out": 746}
638
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1825.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1539, "tokens_out": 757}
639
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1878.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1552, "tokens_out": 768}
640
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1931.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1565, "tokens_out": 779}
641
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1736.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1522, "tokens_out": 740}
642
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1789.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1535, "tokens_out": 751}
643
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1842.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1548, "tokens_out": 762}
644
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1895.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1561, "tokens_out": 773}
645
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1948.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1574, "tokens_out": 784}
646
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1753.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1531, "tokens_out": 745}
647
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1806.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1544, "tokens_out": 756}
648
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1859.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1557, "tokens_out": 767}
649
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1912.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1570, "tokens_out": 778}
650
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1965.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1583, "tokens_out": 789}
651
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1770.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1540, "tokens_out": 750}
652
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1823.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1553, "tokens_out": 761}
653
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1876.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1566, "tokens_out": 772}
654
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1929.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1579, "tokens_out": 783}
655
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1982.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1592, "tokens_out": 794}
656
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1787.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1549, "tokens_out": 755}
657
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1840.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1562, "tokens_out": 766}
658
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1893.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1575, "tokens_out": 777}
659
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1946.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1588, "tokens_out": 788}
660
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1999.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1601, "tokens_out": 799}
661
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1804.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1558, "tokens_out": 760}
662
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1857.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1571, "tokens_out": 771}
663
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1910.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1584, "tokens_out": 782}
664
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1963.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1597, "tokens_out": 793}
665
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2016.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1610, "tokens_out": 804}
666
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1821.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1567, "tokens_out": 765}
667
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1874.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1580, "tokens_out": 776}
668
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1927.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1593, "tokens_out": 787}
669
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1980.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1606, "tokens_out": 798}
670
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2033.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1619, "tokens_out": 809}
671
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1838.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1576, "tokens_out": 770}
672
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1891.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1589, "tokens_out": 781}
673
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1944.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1602, "tokens_out": 792}
674
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1997.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1615, "tokens_out": 803}
675
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2050.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1628, "tokens_out": 814}
676
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1855.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1585, "tokens_out": 775}
677
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1908.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1598, "tokens_out": 786}
678
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1961.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1611, "tokens_out": 797}
679
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2014.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1624, "tokens_out": 808}
680
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2067.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1637, "tokens_out": 819}
681
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1872.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1594, "tokens_out": 780}
682
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1925.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1607, "tokens_out": 791}
683
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1978.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1620, "tokens_out": 802}
684
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2031.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1633, "tokens_out": 813}
685
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2084.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1646, "tokens_out": 824}
686
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1889.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1603, "tokens_out": 785}
687
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1942.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1616, "tokens_out": 796}
688
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1995.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1629, "tokens_out": 807}
689
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2048.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1642, "tokens_out": 818}
690
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2101.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1655, "tokens_out": 829}
691
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1906.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1612, "tokens_out": 790}
692
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1959.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1625, "tokens_out": 801}
693
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2012.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1638, "tokens_out": 812}
694
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2065.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1651, "tokens_out": 823}
695
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2118.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1664, "tokens_out": 834}
696
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1923.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1621, "tokens_out": 795}
697
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 1976.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1634, "tokens_out": 806}
698
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2029.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1647, "tokens_out": 817}
699
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2082.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1660, "tokens_out": 828}
700
+ {"budget_exceeded": false, "cell_id": "MM_minus_ledger", "duration_ms": 2135.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1673, "tokens_out": 839}
701
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1600.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1500, "tokens_out": 760}
702
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1653.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1513, "tokens_out": 771}
703
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1706.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1526, "tokens_out": 782}
704
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1759.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1539, "tokens_out": 793}
705
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1812.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a1_bugfix_off_by_one", "tokens_in": 1552, "tokens_out": 804}
706
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1617.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1509, "tokens_out": 765}
707
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1670.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1522, "tokens_out": 776}
708
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1723.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1535, "tokens_out": 787}
709
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1776.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1548, "tokens_out": 798}
710
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1829.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a2_refactor_module", "tokens_in": 1561, "tokens_out": 809}
711
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1634.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1518, "tokens_out": 770}
712
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1687.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1531, "tokens_out": 781}
713
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1740.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1544, "tokens_out": 792}
714
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1793.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1557, "tokens_out": 803}
715
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1846.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a3_doc_update", "tokens_in": 1570, "tokens_out": 814}
716
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1651.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1527, "tokens_out": 775}
717
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1704.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1540, "tokens_out": 786}
718
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1757.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1553, "tokens_out": 797}
719
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1810.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1566, "tokens_out": 808}
720
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1863.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a4_test_addition", "tokens_in": 1579, "tokens_out": 819}
721
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1668.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1536, "tokens_out": 780}
722
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1721.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1549, "tokens_out": 791}
723
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1774.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1562, "tokens_out": 802}
724
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1827.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1575, "tokens_out": 813}
725
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1880.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a5_design_review", "tokens_in": 1588, "tokens_out": 824}
726
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1685.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1545, "tokens_out": 785}
727
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1738.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1558, "tokens_out": 796}
728
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1791.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1571, "tokens_out": 807}
729
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1844.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1584, "tokens_out": 818}
730
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1897.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a6_security_review", "tokens_in": 1597, "tokens_out": 829}
731
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1702.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1554, "tokens_out": 790}
732
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1755.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1567, "tokens_out": 801}
733
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1808.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1580, "tokens_out": 812}
734
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1861.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1593, "tokens_out": 823}
735
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1914.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a7_perf_tune", "tokens_in": 1606, "tokens_out": 834}
736
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1719.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1563, "tokens_out": 795}
737
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1772.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1576, "tokens_out": 806}
738
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1825.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1589, "tokens_out": 817}
739
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1878.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1602, "tokens_out": 828}
740
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1931.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "a8_infra_change", "tokens_in": 1615, "tokens_out": 839}
741
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1736.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1572, "tokens_out": 800}
742
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1789.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1585, "tokens_out": 811}
743
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1842.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1598, "tokens_out": 822}
744
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1895.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1611, "tokens_out": 833}
745
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1948.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_01", "tokens_in": 1624, "tokens_out": 844}
746
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1753.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1581, "tokens_out": 805}
747
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1806.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1594, "tokens_out": 816}
748
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1859.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1607, "tokens_out": 827}
749
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1912.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1620, "tokens_out": 838}
750
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1965.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_02", "tokens_in": 1633, "tokens_out": 849}
751
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1770.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1590, "tokens_out": 810}
752
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1823.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1603, "tokens_out": 821}
753
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1876.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1616, "tokens_out": 832}
754
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1929.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1629, "tokens_out": 843}
755
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1982.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_03", "tokens_in": 1642, "tokens_out": 854}
756
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1787.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1599, "tokens_out": 815}
757
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1840.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1612, "tokens_out": 826}
758
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1893.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1625, "tokens_out": 837}
759
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1946.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1638, "tokens_out": 848}
760
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1999.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_04", "tokens_in": 1651, "tokens_out": 859}
761
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1804.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1608, "tokens_out": 820}
762
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1857.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1621, "tokens_out": 831}
763
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1910.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1634, "tokens_out": 842}
764
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1963.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1647, "tokens_out": 853}
765
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2016.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_05", "tokens_in": 1660, "tokens_out": 864}
766
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1821.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1617, "tokens_out": 825}
767
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1874.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1630, "tokens_out": 836}
768
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1927.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1643, "tokens_out": 847}
769
+ {"budget_exceeded": true, "cell_id": "MM_minus_quality_gate", "duration_ms": 1980.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1656, "tokens_out": 858}
770
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2033.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_06", "tokens_in": 1669, "tokens_out": 869}
771
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1838.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1626, "tokens_out": 830}
772
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1891.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1639, "tokens_out": 841}
773
+ {"budget_exceeded": true, "cell_id": "MM_minus_quality_gate", "duration_ms": 1944.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1652, "tokens_out": 852}
774
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1997.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1665, "tokens_out": 863}
775
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2050.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_07", "tokens_in": 1678, "tokens_out": 874}
776
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1855.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1635, "tokens_out": 835}
777
+ {"budget_exceeded": true, "cell_id": "MM_minus_quality_gate", "duration_ms": 1908.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1648, "tokens_out": 846}
778
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1961.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1661, "tokens_out": 857}
779
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2014.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1674, "tokens_out": 868}
780
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2067.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_08", "tokens_in": 1687, "tokens_out": 879}
781
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1872.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1644, "tokens_out": 840}
782
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1925.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1657, "tokens_out": 851}
783
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1978.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1670, "tokens_out": 862}
784
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2031.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1683, "tokens_out": 873}
785
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2084.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_09", "tokens_in": 1696, "tokens_out": 884}
786
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1889.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1653, "tokens_out": 845}
787
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1942.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1666, "tokens_out": 856}
788
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1995.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1679, "tokens_out": 867}
789
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2048.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1692, "tokens_out": 878}
790
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2101.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_10", "tokens_in": 1705, "tokens_out": 889}
791
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1906.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1662, "tokens_out": 850}
792
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1959.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1675, "tokens_out": 861}
793
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2012.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1688, "tokens_out": 872}
794
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2065.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1701, "tokens_out": 883}
795
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2118.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_11", "tokens_in": 1714, "tokens_out": 894}
796
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1923.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 0, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1671, "tokens_out": 855}
797
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 1976.0, "failure_reason": "bootstrap_negative_fixture", "minimum_useful_answer_score": 0.2, "passed": false, "quality_score": 0.25, "seed": 1, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1684, "tokens_out": 866}
798
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2029.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 2, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1697, "tokens_out": 877}
799
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2082.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 3, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1710, "tokens_out": 888}
800
+ {"budget_exceeded": false, "cell_id": "MM_minus_quality_gate", "duration_ms": 2135.0, "failure_reason": null, "minimum_useful_answer_score": 0.78, "passed": true, "quality_score": 0.72, "seed": 4, "source": "bootstrap_fixture_not_empirical", "task_id": "b_smoke_12", "tokens_in": 1723, "tokens_out": 899}