@researai/deepscientist 1.5.7 → 1.5.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/LICENSE +186 -21
  2. package/README.md +8 -4
  3. package/bin/ds.js +224 -9
  4. package/docs/en/00_QUICK_START.md +2 -2
  5. package/docs/en/07_MEMORY_AND_MCP.md +40 -3
  6. package/docs/en/99_ACKNOWLEDGEMENTS.md +1 -0
  7. package/docs/zh/00_QUICK_START.md +2 -2
  8. package/docs/zh/07_MEMORY_AND_MCP.md +40 -3
  9. package/docs/zh/99_ACKNOWLEDGEMENTS.md +1 -0
  10. package/install.sh +34 -0
  11. package/package.json +2 -2
  12. package/pyproject.toml +2 -2
  13. package/src/deepscientist/__init__.py +1 -1
  14. package/src/deepscientist/acp/envelope.py +1 -0
  15. package/src/deepscientist/artifact/metrics.py +814 -83
  16. package/src/deepscientist/artifact/schemas.py +1 -0
  17. package/src/deepscientist/artifact/service.py +2001 -229
  18. package/src/deepscientist/bash_exec/monitor.py +1 -1
  19. package/src/deepscientist/bash_exec/service.py +17 -9
  20. package/src/deepscientist/channels/qq.py +17 -0
  21. package/src/deepscientist/channels/relay.py +16 -0
  22. package/src/deepscientist/config/models.py +6 -0
  23. package/src/deepscientist/config/service.py +70 -2
  24. package/src/deepscientist/daemon/api/handlers.py +414 -14
  25. package/src/deepscientist/daemon/api/router.py +4 -0
  26. package/src/deepscientist/daemon/app.py +292 -21
  27. package/src/deepscientist/gitops/diff.py +6 -10
  28. package/src/deepscientist/mcp/server.py +191 -40
  29. package/src/deepscientist/prompts/builder.py +65 -19
  30. package/src/deepscientist/quest/node_traces.py +129 -2
  31. package/src/deepscientist/quest/service.py +140 -34
  32. package/src/deepscientist/quest/stage_views.py +175 -33
  33. package/src/deepscientist/registries/baseline.py +56 -4
  34. package/src/deepscientist/runners/codex.py +1 -1
  35. package/src/prompts/connectors/qq.md +1 -1
  36. package/src/prompts/contracts/shared_interaction.md +14 -0
  37. package/src/prompts/system.md +113 -32
  38. package/src/skills/analysis-campaign/SKILL.md +10 -14
  39. package/src/skills/baseline/SKILL.md +51 -38
  40. package/src/skills/baseline/references/baseline-plan-template.md +2 -0
  41. package/src/skills/decision/SKILL.md +12 -8
  42. package/src/skills/experiment/SKILL.md +28 -16
  43. package/src/skills/experiment/references/main-experiment-plan-template.md +2 -0
  44. package/src/skills/figure-polish/SKILL.md +1 -0
  45. package/src/skills/finalize/SKILL.md +3 -8
  46. package/src/skills/idea/SKILL.md +18 -8
  47. package/src/skills/idea/references/literature-survey-template.md +24 -0
  48. package/src/skills/idea/references/related-work-playbook.md +4 -0
  49. package/src/skills/idea/references/selection-gate.md +9 -0
  50. package/src/skills/intake-audit/SKILL.md +2 -8
  51. package/src/skills/rebuttal/SKILL.md +2 -8
  52. package/src/skills/review/SKILL.md +2 -8
  53. package/src/skills/scout/SKILL.md +2 -8
  54. package/src/skills/write/SKILL.md +53 -17
  55. package/src/skills/write/templates/DEEPSCIENTIST_NOTES.md +21 -0
  56. package/src/skills/write/templates/README.md +408 -0
  57. package/src/skills/write/templates/UPSTREAM_LICENSE.txt +21 -0
  58. package/src/skills/write/templates/aaai2026/README.md +534 -0
  59. package/src/skills/write/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  60. package/src/skills/write/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  61. package/src/skills/write/templates/aaai2026/aaai2026.bib +111 -0
  62. package/src/skills/write/templates/aaai2026/aaai2026.bst +1493 -0
  63. package/src/skills/write/templates/aaai2026/aaai2026.sty +315 -0
  64. package/src/skills/write/templates/acl/README.md +50 -0
  65. package/src/skills/write/templates/acl/acl.sty +312 -0
  66. package/src/skills/write/templates/acl/acl_latex.tex +377 -0
  67. package/src/skills/write/templates/acl/acl_lualatex.tex +101 -0
  68. package/src/skills/write/templates/acl/acl_natbib.bst +1940 -0
  69. package/src/skills/write/templates/acl/anthology.bib.txt +26 -0
  70. package/src/skills/write/templates/acl/custom.bib +70 -0
  71. package/src/skills/write/templates/acl/formatting.md +326 -0
  72. package/src/skills/write/templates/asplos2027/main.tex +459 -0
  73. package/src/skills/write/templates/asplos2027/references.bib +135 -0
  74. package/src/skills/write/templates/colm2025/README.md +3 -0
  75. package/src/skills/write/templates/colm2025/colm2025_conference.bib +11 -0
  76. package/src/skills/write/templates/colm2025/colm2025_conference.bst +1440 -0
  77. package/src/skills/write/templates/colm2025/colm2025_conference.sty +218 -0
  78. package/src/skills/write/templates/colm2025/colm2025_conference.tex +305 -0
  79. package/src/skills/write/templates/colm2025/fancyhdr.sty +485 -0
  80. package/src/skills/write/templates/colm2025/math_commands.tex +508 -0
  81. package/src/skills/write/templates/colm2025/natbib.sty +1246 -0
  82. package/src/skills/write/templates/iclr2026/fancyhdr.sty +485 -0
  83. package/src/skills/write/templates/iclr2026/iclr2026_conference.bib +24 -0
  84. package/src/skills/write/templates/iclr2026/iclr2026_conference.bst +1440 -0
  85. package/src/skills/write/templates/iclr2026/iclr2026_conference.sty +246 -0
  86. package/src/skills/write/templates/iclr2026/iclr2026_conference.tex +414 -0
  87. package/src/skills/write/templates/iclr2026/math_commands.tex +508 -0
  88. package/src/skills/write/templates/iclr2026/natbib.sty +1246 -0
  89. package/src/skills/write/templates/icml2026/algorithm.sty +79 -0
  90. package/src/skills/write/templates/icml2026/algorithmic.sty +201 -0
  91. package/src/skills/write/templates/icml2026/example_paper.bib +75 -0
  92. package/src/skills/write/templates/icml2026/example_paper.tex +662 -0
  93. package/src/skills/write/templates/icml2026/fancyhdr.sty +864 -0
  94. package/src/skills/write/templates/icml2026/icml2026.bst +1443 -0
  95. package/src/skills/write/templates/icml2026/icml2026.sty +767 -0
  96. package/src/skills/write/templates/neurips2025/Makefile +36 -0
  97. package/src/skills/write/templates/neurips2025/extra_pkgs.tex +53 -0
  98. package/src/skills/write/templates/neurips2025/main.tex +38 -0
  99. package/src/skills/write/templates/neurips2025/neurips.sty +382 -0
  100. package/src/skills/write/templates/nsdi2027/main.tex +426 -0
  101. package/src/skills/write/templates/nsdi2027/references.bib +151 -0
  102. package/src/skills/write/templates/nsdi2027/usenix-2020-09.sty +83 -0
  103. package/src/skills/write/templates/osdi2026/main.tex +429 -0
  104. package/src/skills/write/templates/osdi2026/references.bib +150 -0
  105. package/src/skills/write/templates/osdi2026/usenix-2020-09.sty +83 -0
  106. package/src/skills/write/templates/sosp2026/main.tex +532 -0
  107. package/src/skills/write/templates/sosp2026/references.bib +148 -0
  108. package/src/tui/package.json +1 -1
  109. package/src/ui/dist/assets/{AiManusChatView-BS3V4ZOk.js → AiManusChatView-BKZ103sn.js} +110 -14
  110. package/src/ui/dist/assets/{AnalysisPlugin-DLPXQsmr.js → AnalysisPlugin-mTTzGAlK.js} +1 -1
  111. package/src/ui/dist/assets/{AutoFigurePlugin-C-Fr9knQ.js → AutoFigurePlugin-C_wWw4AP.js} +5 -5
  112. package/src/ui/dist/assets/{CliPlugin-Dd8AHzFg.js → CliPlugin-BH58n3GY.js} +9 -9
  113. package/src/ui/dist/assets/{CodeEditorPlugin-Dg-RepTl.js → CodeEditorPlugin-BKGRUH7e.js} +8 -8
  114. package/src/ui/dist/assets/{CodeViewerPlugin-D2J_3nyt.js → CodeViewerPlugin-BMADwFWJ.js} +5 -5
  115. package/src/ui/dist/assets/{DocViewerPlugin-ChRLLKNb.js → DocViewerPlugin-ZOnTIHLN.js} +3 -3
  116. package/src/ui/dist/assets/{GitDiffViewerPlugin-DgHfcved.js → GitDiffViewerPlugin-CQ7h1Djm.js} +830 -86
  117. package/src/ui/dist/assets/{ImageViewerPlugin-C89GZMBy.js → ImageViewerPlugin-GVS5MsnC.js} +5 -5
  118. package/src/ui/dist/assets/{LabCopilotPanel-BUfIwUcb.js → LabCopilotPanel-BZNv1JML.js} +10 -10
  119. package/src/ui/dist/assets/{LabPlugin-zvUmQUMq.js → LabPlugin-TWcJsdQA.js} +1 -1
  120. package/src/ui/dist/assets/{LatexPlugin-C1SSNuWp.js → LatexPlugin-DIjHiR2x.js} +7 -7
  121. package/src/ui/dist/assets/{MarkdownViewerPlugin-D2Mf5tU5.js → MarkdownViewerPlugin-D3ooGAH0.js} +4 -4
  122. package/src/ui/dist/assets/{MarketplacePlugin-CF4LgiS2.js → MarketplacePlugin-DfVfE9hN.js} +3 -3
  123. package/src/ui/dist/assets/{NotebookEditor-BM7Bgwlv.js → NotebookEditor-DDl0_Mc0.js} +1 -1
  124. package/src/ui/dist/assets/{index-Be0NAmh8.js → NotebookEditor-s8JhzuX1.js} +12 -155
  125. package/src/ui/dist/assets/{PdfLoader-Bc5qfD-Z.js → PdfLoader-C2Sf6SJM.js} +1 -1
  126. package/src/ui/dist/assets/{PdfMarkdownPlugin-sh1-IRcp.js → PdfMarkdownPlugin-CXFLoIsa.js} +3 -3
  127. package/src/ui/dist/assets/{PdfViewerPlugin-C_a7CpWG.js → PdfViewerPlugin-BYTmz2fK.js} +10 -10
  128. package/src/ui/dist/assets/{SearchPlugin-L4z3HcLf.js → SearchPlugin-CjWBI1O9.js} +1 -1
  129. package/src/ui/dist/assets/{Stepper-Dk4aQ3fN.js → Stepper-B0Dd8CxK.js} +1 -1
  130. package/src/ui/dist/assets/{TextViewerPlugin-BsNtlKVo.js → TextViewerPlugin-DdOBU3-S.js} +4 -4
  131. package/src/ui/dist/assets/{VNCViewer-BpeDcZ5_.js → VNCViewer-B8HGgLwQ.js} +9 -9
  132. package/src/ui/dist/assets/{bibtex-C4QI-bbj.js → bibtex-CKaefIN2.js} +1 -1
  133. package/src/ui/dist/assets/{code-DuMINRsg.js → code-BWAY76JP.js} +1 -1
  134. package/src/ui/dist/assets/{file-content-C3N-432K.js → file-content-C1NwU5oQ.js} +1 -1
  135. package/src/ui/dist/assets/{file-diff-panel-CffQ4ZMg.js → file-diff-panel-CywslwB9.js} +1 -1
  136. package/src/ui/dist/assets/{file-socket-CRH59PCO.js → file-socket-B4kzuOBQ.js} +1 -1
  137. package/src/ui/dist/assets/{file-utils-vYGtW2mI.js → file-utils-H2fjA46S.js} +1 -1
  138. package/src/ui/dist/assets/{image-DBVGaooo.js → image-D-NZM-6P.js} +1 -1
  139. package/src/ui/dist/assets/{index-B1P6hQRJ.js → index-7Chr1g9c.js} +3734 -1862
  140. package/src/ui/dist/assets/{index-DjSFDmgB.js → index-BdM1Gqfr.js} +2 -2
  141. package/src/ui/dist/assets/{index-BpjYH9Vg.js → index-CDxNdQdz.js} +1 -1
  142. package/src/ui/dist/assets/{index-Do9N28uB.css → index-DGIYDuTv.css} +163 -34
  143. package/src/ui/dist/assets/index-DHZJ_0TI.js +159 -0
  144. package/src/ui/dist/assets/{message-square-BsPDBhiY.js → message-square-BzjLiXir.js} +1 -1
  145. package/src/ui/dist/assets/{monaco-BTkdPojV.js → monaco-Cb2uKKe6.js} +1 -1
  146. package/src/ui/dist/assets/{popover-cWjCk-vc.js → popover-Bg72DGgT.js} +1 -1
  147. package/src/ui/dist/assets/{project-sync-CXn530xb.js → project-sync-Ce_0BglY.js} +1 -1
  148. package/src/ui/dist/assets/{sigma-04Jr12jg.js → sigma-DPaACDrh.js} +1 -1
  149. package/src/ui/dist/assets/{tooltip-BdVDl0G5.js → tooltip-C_mA6R0w.js} +1 -1
  150. package/src/ui/dist/assets/{trash-CB_GlQyC.js → trash-BvTgE5__.js} +1 -1
  151. package/src/ui/dist/assets/{useCliAccess-BL932NwS.js → useCliAccess-CgPeMOwP.js} +1 -1
  152. package/src/ui/dist/assets/{useFileDiffOverlay-B2WK7Tvq.js → useFileDiffOverlay-xPhz7P5B.js} +1 -1
  153. package/src/ui/dist/assets/{wrap-text-YC68g12z.js → wrap-text-C3Un3YQr.js} +1 -1
  154. package/src/ui/dist/assets/{zoom-out-C0RJvFiJ.js → zoom-out-BgxLa0Ri.js} +1 -1
  155. package/src/ui/dist/index.html +5 -2
  156. /package/src/ui/dist/assets/{index-CccQYZjX.css → NotebookEditor-CccQYZjX.css} +0 -0
@@ -4,6 +4,27 @@ from collections import OrderedDict
4
4
  from typing import Any
5
5
 
6
6
 
7
+ class MetricContractValidationError(ValueError):
8
+ def __init__(
9
+ self,
10
+ message: str,
11
+ *,
12
+ error_code: str = "metric_contract_validation_failed",
13
+ details: dict[str, Any] | None = None,
14
+ ) -> None:
15
+ super().__init__(message)
16
+ self.error_code = error_code
17
+ self.details = details or {}
18
+
19
+ def as_payload(self) -> dict[str, Any]:
20
+ return {
21
+ "ok": False,
22
+ "error_code": self.error_code,
23
+ "message": str(self),
24
+ **self.details,
25
+ }
26
+
27
+
7
28
  def as_metric_id(value: object, *, fallback: str | None = None) -> str:
8
29
  text = str(value or "").strip()
9
30
  if text:
@@ -34,6 +55,17 @@ def infer_metric_direction(metric_id: str) -> str:
34
55
  return "maximize"
35
56
 
36
57
 
58
+ def normalize_metric_direction(value: object, *, metric_id: str | None = None) -> str:
59
+ text = str(value or "").strip().lower().replace("-", "_").replace(" ", "_")
60
+ if text in {"maximize", "max", "higher", "higher_better", "more_is_better", "greater_is_better"}:
61
+ return "maximize"
62
+ if text in {"minimize", "min", "lower", "lower_better", "less_is_better", "smaller_is_better"}:
63
+ return "minimize"
64
+ if metric_id:
65
+ return infer_metric_direction(metric_id)
66
+ return "maximize"
67
+
68
+
37
69
  def normalize_metrics_summary(summary: object) -> dict[str, Any]:
38
70
  if not isinstance(summary, dict):
39
71
  return {}
@@ -46,6 +78,409 @@ def normalize_metrics_summary(summary: object) -> dict[str, Any]:
46
78
  return normalized
47
79
 
48
80
 
81
+ def flatten_metric_leaf_map(summary: object, *, separator: str = ".") -> dict[str, Any]:
82
+ flattened: OrderedDict[str, Any] = OrderedDict()
83
+
84
+ def visit(value: object, path: tuple[str, ...]) -> None:
85
+ if isinstance(value, dict):
86
+ for key, child in value.items():
87
+ normalized_key = str(key or "").strip()
88
+ if not normalized_key:
89
+ continue
90
+ visit(child, (*path, normalized_key))
91
+ return
92
+ if path:
93
+ flattened[separator.join(path)] = value
94
+
95
+ if isinstance(summary, dict):
96
+ for key, value in summary.items():
97
+ normalized_key = str(key or "").strip()
98
+ if not normalized_key:
99
+ continue
100
+ visit(value, (normalized_key,))
101
+ return dict(flattened)
102
+
103
+
104
+ def _resolve_origin_path_value(summary: object, origin_path: object) -> Any:
105
+ if not isinstance(summary, dict):
106
+ return None
107
+ normalized_path = str(origin_path or "").strip().replace("/", ".")
108
+ if not normalized_path:
109
+ return None
110
+ current: Any = summary
111
+ for part in normalized_path.split("."):
112
+ normalized_part = str(part or "").strip()
113
+ if not normalized_part:
114
+ continue
115
+ if not isinstance(current, dict) or normalized_part not in current:
116
+ return None
117
+ current = current[normalized_part]
118
+ return current
119
+
120
+
121
+ def _metric_explanation_fields(metric: dict[str, Any]) -> dict[str, str | None]:
122
+ description = str(metric.get("description") or metric.get("explanation") or "").strip() or None
123
+ derivation = str(metric.get("derivation") or metric.get("how_derived") or "").strip() or None
124
+ source_ref = str(metric.get("source_ref") or metric.get("source") or "").strip() or None
125
+ origin_path = str(metric.get("origin_path") or metric.get("source_path") or "").strip() or None
126
+ return {
127
+ "description": description,
128
+ "derivation": derivation,
129
+ "source_ref": source_ref,
130
+ "origin_path": origin_path,
131
+ }
132
+
133
+
134
+ def resolve_metric_value_from_summary(
135
+ metric_id: str,
136
+ *,
137
+ metrics_summary: object = None,
138
+ primary_metric: object = None,
139
+ origin_path: object = None,
140
+ ) -> float | None:
141
+ normalized_metric_id = str(metric_id or "").strip()
142
+ if not normalized_metric_id:
143
+ return None
144
+ summary = normalize_metrics_summary(metrics_summary)
145
+ direct_value = summary.get(normalized_metric_id)
146
+ direct_number = to_number(direct_value)
147
+ if direct_number is not None:
148
+ return direct_number
149
+ origin_value = _resolve_origin_path_value(metrics_summary, origin_path)
150
+ origin_number = to_number(origin_value)
151
+ if origin_number is not None:
152
+ return origin_number
153
+ if isinstance(primary_metric, dict):
154
+ primary_metric_id = str(
155
+ primary_metric.get("metric_id") or primary_metric.get("name") or primary_metric.get("id") or ""
156
+ ).strip()
157
+ if primary_metric_id == normalized_metric_id:
158
+ primary_number = to_number(primary_metric.get("value"))
159
+ if primary_number is not None:
160
+ return primary_number
161
+ elif isinstance(primary_metric, str) and primary_metric.strip() == normalized_metric_id:
162
+ return None
163
+ return None
164
+
165
+
166
+ def canonicalize_baseline_submission(
167
+ *,
168
+ metric_contract: object,
169
+ metrics_summary: object = None,
170
+ primary_metric: object = None,
171
+ ) -> dict[str, Any]:
172
+ contract_payload = metric_contract if isinstance(metric_contract, dict) else {}
173
+ explicit_metrics = contract_payload.get("metrics") if isinstance(contract_payload.get("metrics"), list) else []
174
+ normalized_contract = normalize_metric_contract(
175
+ contract_payload,
176
+ metrics_summary=None,
177
+ primary_metric=primary_metric,
178
+ )
179
+ canonical_metrics: OrderedDict[str, float] = OrderedDict()
180
+ metric_details: list[dict[str, Any]] = []
181
+ unresolved_metric_ids: list[str] = []
182
+
183
+ if explicit_metrics:
184
+ for metric in normalized_contract.get("metrics", []):
185
+ if not isinstance(metric, dict):
186
+ continue
187
+ metric_id = str(metric.get("metric_id") or "").strip()
188
+ if not metric_id:
189
+ continue
190
+ explanation = _metric_explanation_fields(metric)
191
+ value = resolve_metric_value_from_summary(
192
+ metric_id,
193
+ metrics_summary=metrics_summary,
194
+ primary_metric=primary_metric,
195
+ origin_path=explanation.get("origin_path"),
196
+ )
197
+ required = bool(metric.get("required", True))
198
+ detail = {
199
+ **metric,
200
+ "metric_id": metric_id,
201
+ "required": required,
202
+ **explanation,
203
+ }
204
+ if value is None:
205
+ if required:
206
+ unresolved_metric_ids.append(metric_id)
207
+ detail["value"] = None
208
+ else:
209
+ canonical_metrics[metric_id] = value
210
+ detail["value"] = value
211
+ metric_details.append(detail)
212
+ else:
213
+ for metric_id, value in extract_numeric_metric_map(metrics_summary=metrics_summary).items():
214
+ canonical_metrics[metric_id] = value
215
+ metric_details.append(
216
+ {
217
+ "metric_id": metric_id,
218
+ "required": True,
219
+ "description": None,
220
+ "derivation": None,
221
+ "source_ref": None,
222
+ "origin_path": None,
223
+ "value": value,
224
+ }
225
+ )
226
+
227
+ return {
228
+ "metric_contract": normalized_contract,
229
+ "metrics_summary": dict(canonical_metrics),
230
+ "metric_details": metric_details,
231
+ "unresolved_metric_ids": unresolved_metric_ids,
232
+ "source_leaf_map": flatten_metric_leaf_map(metrics_summary),
233
+ }
234
+
235
+
236
+ def validate_baseline_metric_contract_submission(
237
+ *,
238
+ metric_contract: object,
239
+ metrics_summary: object = None,
240
+ primary_metric: object = None,
241
+ ) -> dict[str, Any]:
242
+ canonical = canonicalize_baseline_submission(
243
+ metric_contract=metric_contract,
244
+ metrics_summary=metrics_summary,
245
+ primary_metric=primary_metric,
246
+ )
247
+ normalized_contract = canonical["metric_contract"]
248
+ metric_details = canonical["metric_details"]
249
+ canonical_metrics = canonical["metrics_summary"]
250
+ explicit_metrics = normalized_contract.get("metrics") if isinstance(normalized_contract.get("metrics"), list) else []
251
+ if not explicit_metrics:
252
+ raise MetricContractValidationError(
253
+ "Baseline metric contract must define explicit metric entries for every canonical metric.",
254
+ error_code="baseline_metric_contract_missing_entries",
255
+ details={
256
+ "validation_stage": "baseline",
257
+ "baseline_metric_ids": [],
258
+ "baseline_metric_details": metric_details,
259
+ "source_metric_paths": sorted(canonical["source_leaf_map"].keys()),
260
+ },
261
+ )
262
+
263
+ missing_explanations: list[dict[str, Any]] = []
264
+ for detail in metric_details:
265
+ if not isinstance(detail, dict):
266
+ continue
267
+ missing_fields: list[str] = []
268
+ if not str(detail.get("description") or "").strip():
269
+ missing_fields.append("description")
270
+ if not (str(detail.get("derivation") or "").strip() or str(detail.get("origin_path") or "").strip()):
271
+ missing_fields.append("derivation_or_origin_path")
272
+ if not str(detail.get("source_ref") or "").strip():
273
+ missing_fields.append("source_ref")
274
+ if missing_fields:
275
+ missing_explanations.append(
276
+ {
277
+ "metric_id": detail.get("metric_id"),
278
+ "missing_fields": missing_fields,
279
+ "detail": detail,
280
+ }
281
+ )
282
+
283
+ if canonical["unresolved_metric_ids"]:
284
+ raise MetricContractValidationError(
285
+ "Baseline metric contract is missing canonical values for one or more required metrics.",
286
+ error_code="baseline_metric_values_missing",
287
+ details={
288
+ "validation_stage": "baseline",
289
+ "missing_metric_ids": canonical["unresolved_metric_ids"],
290
+ "baseline_metric_ids": list(canonical_metrics.keys()),
291
+ "baseline_metric_details": metric_details,
292
+ "source_metric_paths": sorted(canonical["source_leaf_map"].keys()),
293
+ },
294
+ )
295
+
296
+ if missing_explanations:
297
+ raise MetricContractValidationError(
298
+ "Baseline metric contract must explain every canonical metric with description, derivation/origin path, and source reference.",
299
+ error_code="baseline_metric_explanations_missing",
300
+ details={
301
+ "validation_stage": "baseline",
302
+ "baseline_metric_ids": list(canonical_metrics.keys()),
303
+ "baseline_metric_details": metric_details,
304
+ "missing_explanations": missing_explanations,
305
+ "source_metric_paths": sorted(canonical["source_leaf_map"].keys()),
306
+ },
307
+ )
308
+
309
+ if not canonical_metrics:
310
+ raise MetricContractValidationError(
311
+ "Baseline metric contract did not yield any canonical numeric metrics.",
312
+ error_code="baseline_metric_contract_empty",
313
+ details={
314
+ "validation_stage": "baseline",
315
+ "baseline_metric_ids": [],
316
+ "baseline_metric_details": metric_details,
317
+ "source_metric_paths": sorted(canonical["source_leaf_map"].keys()),
318
+ },
319
+ )
320
+
321
+ return canonical
322
+
323
+
324
+ def validate_main_experiment_against_baseline_contract(
325
+ *,
326
+ baseline_contract_payload: object,
327
+ run_metric_contract: object = None,
328
+ metric_rows: object = None,
329
+ metrics_summary: object = None,
330
+ dataset_scope: object = None,
331
+ ) -> dict[str, Any]:
332
+ baseline_payload = baseline_contract_payload if isinstance(baseline_contract_payload, dict) else {}
333
+ if not baseline_payload:
334
+ raise MetricContractValidationError(
335
+ "Canonical baseline metric contract JSON is missing, so main-experiment metric validation cannot run.",
336
+ error_code="baseline_metric_contract_json_missing",
337
+ details={
338
+ "validation_stage": "main_experiment",
339
+ "baseline_metric_ids": [],
340
+ "baseline_metric_details": [],
341
+ },
342
+ )
343
+ baseline_metrics_summary = extract_numeric_metric_map(metrics_summary=baseline_payload.get("metrics_summary"))
344
+ baseline_contract = normalize_metric_contract(
345
+ baseline_payload.get("metric_contract"),
346
+ metrics_summary=baseline_metrics_summary,
347
+ primary_metric=baseline_payload.get("primary_metric"),
348
+ )
349
+ baseline_details = []
350
+ required_metric_ids: list[str] = []
351
+ baseline_meta_map = extract_metric_meta_map(
352
+ metric_contract=baseline_contract,
353
+ metrics_summary=baseline_metrics_summary,
354
+ )
355
+ for metric in baseline_contract.get("metrics", []):
356
+ if not isinstance(metric, dict):
357
+ continue
358
+ metric_id = str(metric.get("metric_id") or "").strip()
359
+ if not metric_id or metric_id not in baseline_metrics_summary:
360
+ continue
361
+ detail = {
362
+ **metric,
363
+ **_metric_explanation_fields(metric),
364
+ "metric_id": metric_id,
365
+ "baseline_value": baseline_metrics_summary.get(metric_id),
366
+ }
367
+ baseline_details.append(detail)
368
+ if bool(metric.get("required", True)) and not bool(metric.get("supplementary", False)):
369
+ required_metric_ids.append(metric_id)
370
+
371
+ if not required_metric_ids:
372
+ raise MetricContractValidationError(
373
+ "Canonical baseline metric contract does not expose any required numeric metrics for comparison.",
374
+ error_code="baseline_metric_contract_empty",
375
+ details={
376
+ "validation_stage": "main_experiment",
377
+ "baseline_metric_ids": [],
378
+ "baseline_metric_details": baseline_details,
379
+ },
380
+ )
381
+
382
+ run_numeric_metrics = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
383
+ run_meta_map = extract_metric_meta_map(
384
+ metric_contract=run_metric_contract,
385
+ metric_rows=metric_rows,
386
+ metrics_summary=metrics_summary,
387
+ )
388
+ missing_metric_ids = [metric_id for metric_id in required_metric_ids if metric_id not in run_numeric_metrics]
389
+ extra_metric_ids = [metric_id for metric_id in run_numeric_metrics.keys() if metric_id not in required_metric_ids]
390
+ direction_mismatches: list[dict[str, Any]] = []
391
+ for metric_id in required_metric_ids:
392
+ if metric_id not in run_numeric_metrics:
393
+ continue
394
+ baseline_direction = normalize_metric_direction(
395
+ (baseline_meta_map.get(metric_id) or {}).get("direction"),
396
+ metric_id=metric_id,
397
+ )
398
+ run_direction = normalize_metric_direction(
399
+ (run_meta_map.get(metric_id) or {}).get("direction"),
400
+ metric_id=metric_id,
401
+ )
402
+ if baseline_direction != run_direction:
403
+ direction_mismatches.append(
404
+ {
405
+ "metric_id": metric_id,
406
+ "expected_direction": baseline_direction,
407
+ "actual_direction": run_direction,
408
+ }
409
+ )
410
+
411
+ expected_eval = (
412
+ dict(baseline_contract.get("evaluation_protocol") or {})
413
+ if isinstance(baseline_contract.get("evaluation_protocol"), dict)
414
+ else {}
415
+ )
416
+ actual_eval = (
417
+ dict((run_metric_contract or {}).get("evaluation_protocol") or {})
418
+ if isinstance((run_metric_contract or {}).get("evaluation_protocol"), dict)
419
+ else {}
420
+ )
421
+ expected_scope = str(
422
+ expected_eval.get("scope_id")
423
+ or expected_eval.get("dataset_scope")
424
+ or dataset_scope
425
+ or ""
426
+ ).strip() or None
427
+ actual_scopes = sorted(
428
+ {
429
+ str(row.get("scope_id") or row.get("scope") or dataset_scope or "").strip()
430
+ for row in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary)
431
+ if isinstance(row, dict) and str(row.get("metric_id") or "").strip() in required_metric_ids
432
+ }
433
+ - {""}
434
+ )
435
+ scope_mismatch = bool(expected_scope and actual_scopes and any(scope != expected_scope for scope in actual_scopes))
436
+ eval_protocol_mismatch: dict[str, Any] | None = None
437
+ if expected_eval and actual_eval:
438
+ expected_code_hashes = expected_eval.get("code_hashes") if isinstance(expected_eval.get("code_hashes"), dict) else {}
439
+ actual_code_hashes = actual_eval.get("code_hashes") if isinstance(actual_eval.get("code_hashes"), dict) else {}
440
+ expected_code_paths = expected_eval.get("code_paths") if isinstance(expected_eval.get("code_paths"), list) else []
441
+ actual_code_paths = actual_eval.get("code_paths") if isinstance(actual_eval.get("code_paths"), list) else []
442
+ if (
443
+ str(expected_eval.get("scope_id") or expected_eval.get("dataset_scope") or "").strip()
444
+ and str(expected_eval.get("scope_id") or expected_eval.get("dataset_scope") or "").strip()
445
+ != str(actual_eval.get("scope_id") or actual_eval.get("dataset_scope") or "").strip()
446
+ ) or (expected_code_hashes and actual_code_hashes and expected_code_hashes != actual_code_hashes) or (
447
+ expected_code_paths and actual_code_paths and expected_code_paths != actual_code_paths
448
+ ):
449
+ eval_protocol_mismatch = {
450
+ "expected": expected_eval,
451
+ "actual": actual_eval,
452
+ }
453
+
454
+ if missing_metric_ids or direction_mismatches or scope_mismatch or eval_protocol_mismatch:
455
+ details: dict[str, Any] = {
456
+ "validation_stage": "main_experiment",
457
+ "baseline_metric_ids": required_metric_ids,
458
+ "baseline_metric_details": baseline_details,
459
+ "missing_metric_ids": missing_metric_ids,
460
+ "extra_metric_ids": extra_metric_ids,
461
+ }
462
+ if direction_mismatches:
463
+ details["direction_mismatches"] = direction_mismatches
464
+ if scope_mismatch:
465
+ details["evaluation_protocol_mismatch"] = {
466
+ "expected_scope_id": expected_scope,
467
+ "actual_scope_ids": actual_scopes,
468
+ }
469
+ if eval_protocol_mismatch:
470
+ details["evaluation_protocol_mismatch"] = eval_protocol_mismatch
471
+ raise MetricContractValidationError(
472
+ "Main experiment must cover every required baseline metric and stay aligned with the canonical evaluation contract.",
473
+ error_code="main_experiment_metric_validation_failed",
474
+ details=details,
475
+ )
476
+
477
+ return {
478
+ "baseline_metric_ids": required_metric_ids,
479
+ "baseline_metric_details": baseline_details,
480
+ "extra_metric_ids": extra_metric_ids,
481
+ }
482
+
483
+
49
484
  def _normalize_metric_entry(metric: object, *, fallback_id: str | None = None) -> dict[str, Any]:
50
485
  if isinstance(metric, str):
51
486
  metric_id = as_metric_id(metric, fallback=fallback_id)
@@ -71,9 +506,7 @@ def _normalize_metric_entry(metric: object, *, fallback_id: str | None = None) -
71
506
  metric_id = as_metric_id(
72
507
  metric.get("metric_id") or metric.get("id") or metric.get("name") or fallback_id,
73
508
  )
74
- direction = str(metric.get("direction") or "").strip().lower()
75
- if direction not in {"maximize", "minimize"}:
76
- direction = infer_metric_direction(metric_id)
509
+ direction = normalize_metric_direction(metric.get("direction"), metric_id=metric_id)
77
510
  decimals_raw = metric.get("decimals")
78
511
  decimals = int(decimals_raw) if isinstance(decimals_raw, int) else None
79
512
  chart_group = str(metric.get("chart_group") or "default").strip() or "default"
@@ -93,6 +526,7 @@ def normalize_metric_contract(
93
526
  *,
94
527
  baseline_id: str | None = None,
95
528
  metrics_summary: object = None,
529
+ metric_rows: object = None,
96
530
  primary_metric: object = None,
97
531
  baseline_variants: object = None,
98
532
  ) -> dict[str, Any]:
@@ -104,7 +538,7 @@ def normalize_metric_contract(
104
538
  normalized = _normalize_metric_entry(metric, fallback_id=f"metric_{index + 1}")
105
539
  metrics_by_id[normalized["metric_id"]] = normalized
106
540
 
107
- summary_metrics = normalize_metrics_summary(metrics_summary)
541
+ summary_metrics = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
108
542
  for metric_id in summary_metrics.keys():
109
543
  metrics_by_id.setdefault(metric_id, _normalize_metric_entry({}, fallback_id=metric_id))
110
544
 
@@ -112,7 +546,7 @@ def normalize_metric_contract(
112
546
  for variant in baseline_variants:
113
547
  if not isinstance(variant, dict):
114
548
  continue
115
- for metric_id in normalize_metrics_summary(variant.get("metrics_summary")).keys():
549
+ for metric_id in extract_numeric_metric_map(metrics_summary=variant.get("metrics_summary")).keys():
116
550
  metrics_by_id.setdefault(metric_id, _normalize_metric_entry({}, fallback_id=metric_id))
117
551
 
118
552
  primary_metric_id = str(contract_payload.get("primary_metric_id") or "").strip()
@@ -131,7 +565,13 @@ def normalize_metric_contract(
131
565
  if primary_metric_id:
132
566
  metrics_by_id.setdefault(primary_metric_id, _normalize_metric_entry({}, fallback_id=primary_metric_id))
133
567
 
568
+ preserved_top_level = {
569
+ key: value
570
+ for key, value in contract_payload.items()
571
+ if key not in {"contract_id", "primary_metric_id", "metrics"}
572
+ }
134
573
  return {
574
+ **preserved_top_level,
135
575
  "contract_id": str(contract_payload.get("contract_id") or baseline_id or "default").strip() or "default",
136
576
  "primary_metric_id": primary_metric_id or None,
137
577
  "metrics": list(metrics_by_id.values()),
@@ -152,10 +592,10 @@ def selected_baseline_metrics(entry: dict[str, Any] | None, selected_variant_id:
152
592
  if selected_variant is None and variants:
153
593
  selected_variant = next((item for item in variants if isinstance(item, dict)), None)
154
594
  if isinstance(selected_variant, dict):
155
- summary = normalize_metrics_summary(selected_variant.get("metrics_summary"))
595
+ summary = extract_numeric_metric_map(metrics_summary=selected_variant.get("metrics_summary"))
156
596
  if summary:
157
597
  return summary
158
- return normalize_metrics_summary(entry.get("metrics_summary"))
598
+ return extract_numeric_metric_map(metrics_summary=entry.get("metrics_summary"))
159
599
 
160
600
 
161
601
  def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str | None = None) -> list[dict[str, Any]]:
@@ -169,9 +609,8 @@ def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str
169
609
  if not isinstance(variant, dict):
170
610
  continue
171
611
  variant_id = str(variant.get("variant_id") or "").strip() or None
172
- metrics_summary = normalize_metrics_summary(variant.get("metrics_summary"))
612
+ metrics_summary = extract_numeric_metric_map(metrics_summary=variant.get("metrics_summary"))
173
613
  for metric_id, value in metrics_summary.items():
174
- numeric_value = to_number(value)
175
614
  lines.append(
176
615
  {
177
616
  "metric_id": metric_id,
@@ -179,14 +618,13 @@ def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str
179
618
  "baseline_id": baseline_id,
180
619
  "variant_id": variant_id,
181
620
  "selected": bool(selected_id and variant_id == selected_id),
182
- "value": numeric_value,
621
+ "value": value,
183
622
  "raw_value": value,
184
623
  }
185
624
  )
186
625
  if lines:
187
626
  return lines
188
- for metric_id, value in normalize_metrics_summary(entry.get("metrics_summary")).items():
189
- numeric_value = to_number(value)
627
+ for metric_id, value in extract_numeric_metric_map(metrics_summary=entry.get("metrics_summary")).items():
190
628
  lines.append(
191
629
  {
192
630
  "metric_id": metric_id,
@@ -194,7 +632,7 @@ def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str
194
632
  "baseline_id": baseline_id,
195
633
  "variant_id": None,
196
634
  "selected": True,
197
- "value": numeric_value,
635
+ "value": value,
198
636
  "raw_value": value,
199
637
  }
200
638
  )
@@ -240,29 +678,252 @@ def normalize_metric_rows(
240
678
  return rows
241
679
 
242
680
 
681
def extract_numeric_metric_map(
    *,
    metric_rows: object = None,
    metrics_summary: object = None,
) -> dict[str, float]:
    """Build a ``metric_id -> number`` mapping from rows and/or a summary.

    Rows returned by ``normalize_metric_rows`` are consulted first; for each
    row the ``numeric_value`` field is used when the key is present, otherwise
    ``value``.  Entries from ``normalize_metrics_summary`` then fill in any
    metric id that the rows did not already provide.  Anything ``to_number``
    cannot convert (it returns ``None``) is left out.
    """
    collected: dict[str, float] = {}

    for entry in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary):
        if not isinstance(entry, dict):
            continue
        key = str(entry.get("metric_id") or "").strip()
        number = to_number(entry.get("numeric_value", entry.get("value")))
        if key and number is not None:
            # A later row with the same id overwrites the earlier value.
            collected[key] = number

    for key, raw in normalize_metrics_summary(metrics_summary).items():
        number = to_number(raw)
        if not key or number is None:
            continue
        # Row-derived values take precedence over the summary.
        collected.setdefault(key, number)

    return collected
701
+
702
+
703
def extract_metric_raw_value_map(
    *,
    metric_rows: object = None,
    metrics_summary: object = None,
) -> dict[str, Any]:
    """Build a ``metric_id -> raw value`` mapping without numeric coercion.

    Each normalized row contributes its ``value`` field as-is (``None`` when
    the field is absent).  Summary entries are only used for metric ids that
    no row supplied.
    """
    raw_by_id: dict[str, Any] = {}

    for entry in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary):
        if isinstance(entry, dict):
            key = str(entry.get("metric_id") or "").strip()
            if key:
                raw_by_id[key] = entry.get("value")

    for key, raw in normalize_metrics_summary(metrics_summary).items():
        # Rows win; the summary only fills gaps.
        raw_by_id.setdefault(key, raw)

    return raw_by_id
721
+
722
+
723
def extract_metric_meta_map(
    *,
    metric_contract: object = None,
    metric_rows: object = None,
    metrics_summary: object = None,
) -> dict[str, dict[str, Any]]:
    """Collect per-metric metadata (label, direction, unit, ...) keyed by id.

    Three sources are layered in order:

    1. the ``metrics`` entries of the normalized metric contract;
    2. the normalized metric rows, whose truthy fields override the
       contract-derived ones for the same metric id;
    3. a default entry (``_normalize_metric_entry``) for every numeric metric
       id that neither of the above produced.
    """
    normalized_contract = normalize_metric_contract(
        metric_contract,
        metrics_summary=metrics_summary,
        metric_rows=metric_rows,
    )
    by_id: dict[str, dict[str, Any]] = {}

    # Layer 1: contract entries.
    for entry in normalized_contract.get("metrics", []):
        if not isinstance(entry, dict):
            continue
        key = str(entry.get("metric_id") or "").strip()
        if not key:
            continue
        merged = dict(entry)
        merged["metric_id"] = key
        merged["direction"] = normalize_metric_direction(entry.get("direction"), metric_id=key)
        merged["label"] = str(entry.get("label") or key).strip() or key
        by_id[key] = merged

    # Layer 2: row-level overrides.
    for row in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary):
        if not isinstance(row, dict):
            continue
        key = str(row.get("metric_id") or "").strip()
        if not key:
            continue
        base = dict(by_id.get(key) or _normalize_metric_entry({}, fallback_id=key))
        label_text = str(row.get("label") or row.get("name") or base.get("label") or key).strip() or key
        row_decimals = row.get("decimals")
        if not isinstance(row_decimals, int):
            # Only an integer on the row overrides the existing precision.
            row_decimals = base.get("decimals")
        overrides = {
            "metric_id": key,
            "label": label_text,
            "direction": normalize_metric_direction(row.get("direction") or base.get("direction"), metric_id=key),
            "unit": str(row.get("unit") or base.get("unit") or "").strip() or None,
            "decimals": row_decimals,
            "chart_group": str(row.get("chart_group") or base.get("chart_group") or "default").strip() or "default",
        }
        by_id[key] = {**base, **overrides}

    # Layer 3: make sure every numeric metric has at least a default entry.
    for key in extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary).keys():
        by_id.setdefault(key, _normalize_metric_entry({}, fallback_id=key))

    return by_id
770
+
771
+
772
def extract_metric_comparison_map(
    baseline_comparisons: object,
) -> dict[str, dict[str, Any]]:
    """Index the ``items`` of a baseline-comparison payload by metric id.

    Args:
        baseline_comparisons: Expected to be a dict with an ``items`` list of
            per-metric comparison dicts.  Any other shape yields ``{}``.

    Returns:
        A mapping from the stripped ``metric_id`` to the comparison item it
        came from.  When several items share an id, the last one wins.
        Non-dict items and items whose id is missing or blank are skipped.
    """
    if not isinstance(baseline_comparisons, dict):
        return {}
    items = baseline_comparisons.get("items")
    # ``"items": None`` (or any non-sequence) must not crash the caller.
    if not isinstance(items, (list, tuple)):
        return {}

    indexed: dict[str, dict[str, Any]] = {}
    for item in items:
        if not isinstance(item, dict):
            continue
        metric_id = str(item.get("metric_id") or "").strip()
        # Skip whitespace-only ids instead of filing them under "".
        if metric_id:
            indexed[metric_id] = item
    return indexed
781
+
782
+
783
def extract_metric_delta_map(
    *,
    metric_rows: object = None,
    baseline_comparisons: object = None,
) -> dict[str, float]:
    """Build a ``metric_id -> delta`` mapping.

    Deltas recorded in ``baseline_comparisons`` items are preferred.  A
    row-level ``delta`` is used only for metric ids that have no usable
    comparison delta.  Values ``to_number`` cannot convert are dropped.
    """
    deltas: dict[str, float] = {}

    for key, comparison in extract_metric_comparison_map(baseline_comparisons).items():
        number = to_number(comparison.get("delta"))
        if number is not None:
            deltas[key] = number

    for entry in normalize_metric_rows(metric_rows):
        if not isinstance(entry, dict):
            continue
        key = str(entry.get("metric_id") or "").strip()
        if key and key not in deltas:
            number = to_number(entry.get("delta"))
            if number is not None:
                deltas[key] = number

    return deltas
803
+
804
+
805
def resolve_primary_metric_id(
    *,
    metric_contract: object = None,
    metric_rows: object = None,
    metrics_summary: object = None,
    primary_metric: object = None,
    progress_eval: object = None,
    baseline_comparisons: object = None,
) -> str | None:
    """Pick the primary metric id among the metrics that have numeric values.

    Candidates are tried in this order and the first one that actually has a
    numeric value wins:

    1. ``progress_eval["primary_metric_id"]``
    2. ``baseline_comparisons["primary_metric_id"]``
    3. the normalized contract's ``primary_metric_id``
    4. ``primary_metric`` itself (a dict's ``metric_id``/``name``/``id``, or a
       plain string)

    If none match, the first numeric metric id is returned.  Returns ``None``
    when there are no numeric metrics at all.
    """
    numeric = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
    if not numeric:
        return None

    contract = normalize_metric_contract(
        metric_contract,
        metrics_summary=metrics_summary,
        metric_rows=metric_rows,
        primary_metric=primary_metric,
    )

    def _clean(value: object) -> str:
        return str(value or "").strip()

    preferred: list[str] = [
        _clean(progress_eval.get("primary_metric_id")) if isinstance(progress_eval, dict) else "",
        _clean(baseline_comparisons.get("primary_metric_id")) if isinstance(baseline_comparisons, dict) else "",
        _clean(contract.get("primary_metric_id")),
    ]
    if isinstance(primary_metric, dict):
        preferred.append(
            _clean(primary_metric.get("metric_id") or primary_metric.get("name") or primary_metric.get("id") or "")
        )
    elif isinstance(primary_metric, str):
        preferred.append(primary_metric.strip())

    for name in preferred:
        # Blank candidates are placeholders for absent sources.
        if name and name in numeric:
            return name
    return next(iter(numeric.keys()), None)
847
+
848
+
849
def extract_latest_metric(payload: dict[str, Any] | None) -> dict[str, Any] | None:
    """Summarize the primary metric of a single payload.

    Reads ``metric_rows``, ``metrics_summary``, ``metric_contract``,
    ``primary_metric``, ``progress_eval`` and ``baseline_comparisons`` from
    ``payload``.  Returns ``None`` when the payload is not a non-empty dict or
    carries no numeric metric; otherwise a dict with ``key`` and ``value`` and,
    when available, ``delta_vs_baseline``, ``label``, ``direction``, ``unit``
    and ``decimals``.
    """
    if not isinstance(payload, dict) or not payload:
        return None

    rows = payload.get("metric_rows")
    summary = payload.get("metrics_summary")
    contract = payload.get("metric_contract")
    comparisons = payload.get("baseline_comparisons")

    numeric = extract_numeric_metric_map(metric_rows=rows, metrics_summary=summary)
    if not numeric:
        return None

    primary_id = resolve_primary_metric_id(
        metric_contract=contract,
        metric_rows=rows,
        metrics_summary=summary,
        primary_metric=payload.get("primary_metric"),
        progress_eval=payload.get("progress_eval"),
        baseline_comparisons=comparisons,
    )
    if not primary_id:
        return None
    primary_value = numeric.get(primary_id)
    if primary_value is None:
        return None

    meta_by_id = extract_metric_meta_map(
        metric_contract=contract,
        metric_rows=rows,
        metrics_summary=summary,
    )
    deltas = extract_metric_delta_map(
        metric_rows=rows,
        baseline_comparisons=comparisons,
    )
    meta = meta_by_id.get(primary_id) or {}

    latest: dict[str, Any] = {"key": primary_id, "value": primary_value}
    if primary_id in deltas:
        latest["delta_vs_baseline"] = deltas[primary_id]
    # Optional descriptive fields are copied only when they carry a value.
    for field in ("label", "direction", "unit"):
        if meta.get(field):
            latest[field] = meta[field]
    if meta.get("decimals") is not None:
        latest["decimals"] = meta["decimals"]
    return latest
898
+
899
+
243
900
  def compare_with_baseline(
244
901
  *,
245
902
  metrics_summary: object,
903
+ metric_rows: object = None,
246
904
  metric_contract: object,
247
905
  baseline_metrics: object,
248
906
  ) -> dict[str, Any]:
249
- run_summary = normalize_metrics_summary(metrics_summary)
250
- baseline_summary = normalize_metrics_summary(baseline_metrics)
251
- contract = normalize_metric_contract(metric_contract, metrics_summary=run_summary)
907
+ run_summary = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
908
+ baseline_summary = extract_numeric_metric_map(metrics_summary=baseline_metrics)
909
+ contract = normalize_metric_contract(metric_contract, metrics_summary=run_summary, metric_rows=metric_rows)
252
910
  items: list[dict[str, Any]] = []
253
- metric_ids = [item["metric_id"] for item in contract.get("metrics", [])]
911
+ metric_meta = extract_metric_meta_map(
912
+ metric_contract=contract,
913
+ metric_rows=metric_rows,
914
+ metrics_summary=run_summary,
915
+ )
916
+ metric_ids = [
917
+ metric_id
918
+ for metric_id in metric_meta.keys()
919
+ if metric_id in run_summary or metric_id in baseline_summary
920
+ ]
254
921
  for metric_id in baseline_summary.keys():
255
922
  if metric_id not in metric_ids:
256
923
  metric_ids.append(metric_id)
257
924
  for metric_id in run_summary.keys():
258
925
  if metric_id not in metric_ids:
259
926
  metric_ids.append(metric_id)
260
-
261
- metric_meta = {
262
- item["metric_id"]: item
263
- for item in contract.get("metrics", [])
264
- if isinstance(item, dict) and item.get("metric_id")
265
- }
266
927
  for metric_id in metric_ids:
267
928
  meta = metric_meta.get(metric_id) or _normalize_metric_entry({}, fallback_id=metric_id)
268
929
  run_value = run_summary.get(metric_id)
@@ -276,7 +937,7 @@ def compare_with_baseline(
276
937
  delta = run_number - baseline_number
277
938
  if baseline_number not in {0.0, -0.0}:
278
939
  relative_delta = delta / abs(baseline_number)
279
- direction = meta.get("direction") or infer_metric_direction(metric_id)
940
+ direction = normalize_metric_direction(meta.get("direction"), metric_id=metric_id)
280
941
  if direction == "maximize":
281
942
  better = run_number > baseline_number
282
943
  else:
@@ -285,7 +946,7 @@ def compare_with_baseline(
285
946
  {
286
947
  "metric_id": metric_id,
287
948
  "label": meta.get("label") or metric_id,
288
- "direction": meta.get("direction") or infer_metric_direction(metric_id),
949
+ "direction": normalize_metric_direction(meta.get("direction"), metric_id=metric_id),
289
950
  "unit": meta.get("unit"),
290
951
  "decimals": meta.get("decimals"),
291
952
  "chart_group": meta.get("chart_group"),
@@ -299,7 +960,10 @@ def compare_with_baseline(
299
960
  }
300
961
  )
301
962
 
302
- primary_metric_id = str(contract.get("primary_metric_id") or "").strip() or None
963
+ primary_metric_id = resolve_primary_metric_id(
964
+ metric_contract=contract,
965
+ metrics_summary=run_summary,
966
+ )
303
967
  primary_item = next((item for item in items if item["metric_id"] == primary_metric_id), None)
304
968
  if primary_item is None and items:
305
969
  primary_item = items[0]
@@ -372,6 +1036,60 @@ def compute_progress_eval(
372
1036
  }
373
1037
 
374
1038
 
1039
+ def _record_sort_key(record: dict[str, Any]) -> str:
1040
+ return str(record.get("updated_at") or record.get("created_at") or "")
1041
+
1042
+
1043
+ def _record_dedupe_key(record: dict[str, Any]) -> str:
1044
+ run_id = str(record.get("run_id") or "").strip()
1045
+ if run_id:
1046
+ return f"run:{run_id}"
1047
+ artifact_id = str(record.get("artifact_id") or "").strip()
1048
+ if artifact_id:
1049
+ return f"artifact:{artifact_id}"
1050
+ result_path = str(((record.get("paths") or {}) if isinstance(record.get("paths"), dict) else {}).get("result_json") or "").strip()
1051
+ if result_path:
1052
+ return f"path:{result_path}"
1053
+ branch_name = str(record.get("branch") or "").strip()
1054
+ return f"record:{branch_name}:{_record_sort_key(record)}"
1055
+
1056
+
1057
def _record_richness(record: dict[str, Any]) -> tuple[int, int, int, int, str]:
    """Score how much metric information a run record carries.

    The tuple compares lexicographically: number of numeric metrics, number
    of baseline-comparison items, whether ``paths["result_json"]`` is set,
    number of metric metadata entries, and finally the record's sort key as
    a tie-breaker.
    """
    rows = record.get("metric_rows")
    summary = record.get("metrics_summary")

    numeric_count = len(extract_numeric_metric_map(metric_rows=rows, metrics_summary=summary))
    comparison_count = len(extract_metric_comparison_map(record.get("baseline_comparisons")))

    paths = record.get("paths")
    result_json = paths.get("result_json") if isinstance(paths, dict) else None
    result_path_flag = 1 if result_json else 0

    meta_count = len(
        extract_metric_meta_map(
            metric_contract=record.get("metric_contract"),
            metric_rows=rows,
            metrics_summary=summary,
        )
    )
    return (
        numeric_count,
        comparison_count,
        result_path_flag,
        meta_count,
        _record_sort_key(record),
    )
1078
+
1079
+
1080
def dedupe_run_records(run_records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collapse duplicate run records, keeping the richest one per identity.

    Non-dict entries are ignored.  Records are visited in sort-key order; for
    each dedupe key the incoming record replaces the kept one unless it is
    strictly less rich.  The survivors are returned sorted by sort key.
    """
    candidates = [entry for entry in run_records if isinstance(entry, dict)]
    candidates.sort(key=_record_sort_key)

    kept: dict[str, dict[str, Any]] = {}
    for entry in candidates:
        identity = _record_dedupe_key(entry)
        if identity in kept and _record_richness(entry) < _record_richness(kept[identity]):
            # The record already kept carries more information.
            continue
        kept[identity] = entry

    survivors = list(kept.values())
    survivors.sort(key=_record_sort_key)
    return survivors
1091
+
1092
+
375
1093
  def build_metrics_timeline(
376
1094
  *,
377
1095
  quest_id: str,
@@ -379,81 +1097,95 @@ def build_metrics_timeline(
379
1097
  baseline_entry: dict[str, Any] | None = None,
380
1098
  selected_variant_id: str | None = None,
381
1099
  ) -> dict[str, Any]:
382
- ordered_runs = sorted(
383
- [item for item in run_records if isinstance(item, dict)],
384
- key=lambda item: str(item.get("updated_at") or item.get("created_at") or ""),
385
- )
1100
+ ordered_runs = dedupe_run_records(run_records)
1101
+ baseline_metrics = selected_baseline_metrics(baseline_entry, selected_variant_id)
386
1102
  contract = normalize_metric_contract(
387
1103
  None,
388
1104
  baseline_id=str((baseline_entry or {}).get("baseline_id") or ""),
389
- metrics_summary=(baseline_entry or {}).get("metrics_summary"),
1105
+ metrics_summary=baseline_metrics,
390
1106
  primary_metric=(baseline_entry or {}).get("primary_metric"),
391
1107
  baseline_variants=(baseline_entry or {}).get("baseline_variants"),
392
1108
  )
1109
+ primary_metric_id = str(contract.get("primary_metric_id") or "").strip() or None
393
1110
  for record in ordered_runs:
394
- run_contract = record.get("metric_contract")
395
- if run_contract:
396
- contract = normalize_metric_contract(run_contract, metrics_summary=record.get("metrics_summary"))
1111
+ candidate = resolve_primary_metric_id(
1112
+ metric_contract=record.get("metric_contract"),
1113
+ metric_rows=record.get("metric_rows"),
1114
+ metrics_summary=record.get("metrics_summary"),
1115
+ progress_eval=record.get("progress_eval"),
1116
+ baseline_comparisons=record.get("baseline_comparisons"),
1117
+ )
1118
+ if candidate:
1119
+ primary_metric_id = candidate
397
1120
  break
398
1121
 
399
1122
  series_map: OrderedDict[str, dict[str, Any]] = OrderedDict()
1123
+ baseline_meta_map = extract_metric_meta_map(
1124
+ metric_contract=(baseline_entry or {}).get("metric_contract"),
1125
+ metrics_summary=baseline_metrics,
1126
+ )
1127
+
1128
+ def ensure_series(metric_id: str, meta: dict[str, Any] | None = None) -> dict[str, Any]:
1129
+ resolved_meta = meta or baseline_meta_map.get(metric_id) or _normalize_metric_entry({}, fallback_id=metric_id)
1130
+ if metric_id not in series_map:
1131
+ series_map[metric_id] = {
1132
+ "metric_id": metric_id,
1133
+ "label": resolved_meta.get("label") or metric_id,
1134
+ "direction": normalize_metric_direction(resolved_meta.get("direction"), metric_id=metric_id),
1135
+ "unit": resolved_meta.get("unit"),
1136
+ "decimals": resolved_meta.get("decimals"),
1137
+ "chart_group": resolved_meta.get("chart_group"),
1138
+ "baselines": [],
1139
+ "points": [],
1140
+ }
1141
+ else:
1142
+ series_map[metric_id]["label"] = resolved_meta.get("label") or series_map[metric_id]["label"]
1143
+ series_map[metric_id]["direction"] = normalize_metric_direction(
1144
+ resolved_meta.get("direction") or series_map[metric_id]["direction"],
1145
+ metric_id=metric_id,
1146
+ )
1147
+ series_map[metric_id]["unit"] = resolved_meta.get("unit") or series_map[metric_id]["unit"]
1148
+ if resolved_meta.get("decimals") is not None:
1149
+ series_map[metric_id]["decimals"] = resolved_meta.get("decimals")
1150
+ series_map[metric_id]["chart_group"] = (
1151
+ resolved_meta.get("chart_group") or series_map[metric_id]["chart_group"]
1152
+ )
1153
+ return series_map[metric_id]
1154
+
400
1155
  for metric in contract.get("metrics", []):
401
1156
  metric_id = str(metric.get("metric_id") or "").strip()
402
1157
  if not metric_id:
403
1158
  continue
404
- series_map[metric_id] = {
405
- "metric_id": metric_id,
406
- "label": metric.get("label") or metric_id,
407
- "direction": metric.get("direction") or infer_metric_direction(metric_id),
408
- "unit": metric.get("unit"),
409
- "decimals": metric.get("decimals"),
410
- "chart_group": metric.get("chart_group"),
411
- "baselines": [],
412
- "points": [],
413
- }
1159
+ ensure_series(metric_id, metric)
414
1160
 
415
1161
  for line in baseline_metric_lines(baseline_entry, selected_variant_id):
416
1162
  metric_id = str(line.get("metric_id") or "").strip()
417
1163
  if not metric_id:
418
1164
  continue
419
- series_map.setdefault(
420
- metric_id,
421
- {
422
- "metric_id": metric_id,
423
- "label": metric_id,
424
- "direction": infer_metric_direction(metric_id),
425
- "unit": None,
426
- "decimals": None,
427
- "chart_group": "default",
428
- "baselines": [],
429
- "points": [],
430
- },
431
- )
432
- series_map[metric_id]["baselines"].append(line)
1165
+ ensure_series(metric_id).setdefault("baselines", []).append(line)
433
1166
 
434
1167
  for index, record in enumerate(ordered_runs, start=1):
435
- summary = normalize_metrics_summary(record.get("metrics_summary"))
1168
+ numeric_metrics = extract_numeric_metric_map(
1169
+ metric_rows=record.get("metric_rows"),
1170
+ metrics_summary=record.get("metrics_summary"),
1171
+ )
1172
+ raw_values = extract_metric_raw_value_map(
1173
+ metric_rows=record.get("metric_rows"),
1174
+ metrics_summary=record.get("metrics_summary"),
1175
+ )
436
1176
  progress = record.get("progress_eval") if isinstance(record.get("progress_eval"), dict) else {}
437
- comparisons = record.get("baseline_comparisons") if isinstance(record.get("baseline_comparisons"), dict) else {}
438
- comparison_by_id = {
439
- str(item.get("metric_id") or "").strip(): item
440
- for item in comparisons.get("items", [])
441
- if isinstance(item, dict) and item.get("metric_id")
442
- }
443
- for metric_id, raw_value in summary.items():
444
- series_map.setdefault(
445
- metric_id,
446
- {
447
- "metric_id": metric_id,
448
- "label": metric_id,
449
- "direction": infer_metric_direction(metric_id),
450
- "unit": None,
451
- "decimals": None,
452
- "chart_group": "default",
453
- "baselines": [],
454
- "points": [],
455
- },
456
- )
1177
+ comparison_by_id = extract_metric_comparison_map(record.get("baseline_comparisons"))
1178
+ delta_by_id = extract_metric_delta_map(
1179
+ metric_rows=record.get("metric_rows"),
1180
+ baseline_comparisons=record.get("baseline_comparisons"),
1181
+ )
1182
+ record_meta = extract_metric_meta_map(
1183
+ metric_contract=record.get("metric_contract"),
1184
+ metric_rows=record.get("metric_rows"),
1185
+ metrics_summary=record.get("metrics_summary"),
1186
+ )
1187
+ for metric_id, numeric_value in numeric_metrics.items():
1188
+ ensure_series(metric_id, record_meta.get(metric_id))
457
1189
  comparison = comparison_by_id.get(metric_id, {})
458
1190
  series_map[metric_id]["points"].append(
459
1191
  {
@@ -463,9 +1195,9 @@ def build_metrics_timeline(
463
1195
  "created_at": record.get("updated_at") or record.get("created_at"),
464
1196
  "branch": record.get("branch"),
465
1197
  "idea_id": record.get("idea_id"),
466
- "value": to_number(raw_value),
467
- "raw_value": raw_value,
468
- "delta_vs_baseline": comparison.get("delta"),
1198
+ "value": numeric_value,
1199
+ "raw_value": raw_values.get(metric_id, numeric_value),
1200
+ "delta_vs_baseline": delta_by_id.get(metric_id),
469
1201
  "relative_delta_vs_baseline": comparison.get("relative_delta"),
470
1202
  "breakthrough": bool(progress.get("breakthrough")),
471
1203
  "breakthrough_level": progress.get("breakthrough_level"),
@@ -473,7 +1205,6 @@ def build_metrics_timeline(
473
1205
  }
474
1206
  )
475
1207
 
476
- primary_metric_id = str(contract.get("primary_metric_id") or "").strip() or None
477
1208
  series = [item for item in series_map.values() if item["points"] or item["baselines"]]
478
1209
  return {
479
1210
  "quest_id": quest_id,