@researai/deepscientist 1.5.7 → 1.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148):
  1. package/README.md +4 -0
  2. package/bin/ds.js +220 -5
  3. package/docs/en/07_MEMORY_AND_MCP.md +40 -3
  4. package/docs/en/99_ACKNOWLEDGEMENTS.md +1 -0
  5. package/docs/zh/07_MEMORY_AND_MCP.md +40 -3
  6. package/docs/zh/99_ACKNOWLEDGEMENTS.md +1 -0
  7. package/install.sh +34 -0
  8. package/package.json +1 -1
  9. package/pyproject.toml +1 -1
  10. package/src/deepscientist/__init__.py +1 -1
  11. package/src/deepscientist/acp/envelope.py +1 -0
  12. package/src/deepscientist/artifact/metrics.py +813 -80
  13. package/src/deepscientist/artifact/schemas.py +1 -0
  14. package/src/deepscientist/artifact/service.py +1101 -99
  15. package/src/deepscientist/bash_exec/monitor.py +1 -1
  16. package/src/deepscientist/bash_exec/service.py +17 -9
  17. package/src/deepscientist/channels/qq.py +17 -0
  18. package/src/deepscientist/channels/relay.py +16 -0
  19. package/src/deepscientist/config/models.py +6 -0
  20. package/src/deepscientist/config/service.py +70 -2
  21. package/src/deepscientist/daemon/api/handlers.py +284 -14
  22. package/src/deepscientist/daemon/api/router.py +1 -0
  23. package/src/deepscientist/daemon/app.py +291 -20
  24. package/src/deepscientist/gitops/diff.py +6 -10
  25. package/src/deepscientist/mcp/server.py +188 -39
  26. package/src/deepscientist/prompts/builder.py +51 -18
  27. package/src/deepscientist/quest/service.py +83 -34
  28. package/src/deepscientist/quest/stage_views.py +74 -29
  29. package/src/deepscientist/runners/codex.py +1 -1
  30. package/src/prompts/connectors/qq.md +1 -1
  31. package/src/prompts/contracts/shared_interaction.md +14 -0
  32. package/src/prompts/system.md +106 -32
  33. package/src/skills/analysis-campaign/SKILL.md +10 -14
  34. package/src/skills/baseline/SKILL.md +51 -38
  35. package/src/skills/baseline/references/baseline-plan-template.md +2 -0
  36. package/src/skills/decision/SKILL.md +12 -8
  37. package/src/skills/experiment/SKILL.md +28 -16
  38. package/src/skills/experiment/references/main-experiment-plan-template.md +2 -0
  39. package/src/skills/figure-polish/SKILL.md +1 -0
  40. package/src/skills/finalize/SKILL.md +3 -8
  41. package/src/skills/idea/SKILL.md +2 -8
  42. package/src/skills/intake-audit/SKILL.md +2 -8
  43. package/src/skills/rebuttal/SKILL.md +2 -8
  44. package/src/skills/review/SKILL.md +2 -8
  45. package/src/skills/scout/SKILL.md +2 -8
  46. package/src/skills/write/SKILL.md +52 -16
  47. package/src/skills/write/templates/DEEPSCIENTIST_NOTES.md +21 -0
  48. package/src/skills/write/templates/README.md +408 -0
  49. package/src/skills/write/templates/UPSTREAM_LICENSE.txt +21 -0
  50. package/src/skills/write/templates/aaai2026/README.md +534 -0
  51. package/src/skills/write/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  52. package/src/skills/write/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  53. package/src/skills/write/templates/aaai2026/aaai2026.bib +111 -0
  54. package/src/skills/write/templates/aaai2026/aaai2026.bst +1493 -0
  55. package/src/skills/write/templates/aaai2026/aaai2026.sty +315 -0
  56. package/src/skills/write/templates/acl/README.md +50 -0
  57. package/src/skills/write/templates/acl/acl.sty +312 -0
  58. package/src/skills/write/templates/acl/acl_latex.tex +377 -0
  59. package/src/skills/write/templates/acl/acl_lualatex.tex +101 -0
  60. package/src/skills/write/templates/acl/acl_natbib.bst +1940 -0
  61. package/src/skills/write/templates/acl/anthology.bib.txt +26 -0
  62. package/src/skills/write/templates/acl/custom.bib +70 -0
  63. package/src/skills/write/templates/acl/formatting.md +326 -0
  64. package/src/skills/write/templates/asplos2027/main.tex +459 -0
  65. package/src/skills/write/templates/asplos2027/references.bib +135 -0
  66. package/src/skills/write/templates/colm2025/README.md +3 -0
  67. package/src/skills/write/templates/colm2025/colm2025_conference.bib +11 -0
  68. package/src/skills/write/templates/colm2025/colm2025_conference.bst +1440 -0
  69. package/src/skills/write/templates/colm2025/colm2025_conference.sty +218 -0
  70. package/src/skills/write/templates/colm2025/colm2025_conference.tex +305 -0
  71. package/src/skills/write/templates/colm2025/fancyhdr.sty +485 -0
  72. package/src/skills/write/templates/colm2025/math_commands.tex +508 -0
  73. package/src/skills/write/templates/colm2025/natbib.sty +1246 -0
  74. package/src/skills/write/templates/iclr2026/fancyhdr.sty +485 -0
  75. package/src/skills/write/templates/iclr2026/iclr2026_conference.bib +24 -0
  76. package/src/skills/write/templates/iclr2026/iclr2026_conference.bst +1440 -0
  77. package/src/skills/write/templates/iclr2026/iclr2026_conference.sty +246 -0
  78. package/src/skills/write/templates/iclr2026/iclr2026_conference.tex +414 -0
  79. package/src/skills/write/templates/iclr2026/math_commands.tex +508 -0
  80. package/src/skills/write/templates/iclr2026/natbib.sty +1246 -0
  81. package/src/skills/write/templates/icml2026/algorithm.sty +79 -0
  82. package/src/skills/write/templates/icml2026/algorithmic.sty +201 -0
  83. package/src/skills/write/templates/icml2026/example_paper.bib +75 -0
  84. package/src/skills/write/templates/icml2026/example_paper.tex +662 -0
  85. package/src/skills/write/templates/icml2026/fancyhdr.sty +864 -0
  86. package/src/skills/write/templates/icml2026/icml2026.bst +1443 -0
  87. package/src/skills/write/templates/icml2026/icml2026.sty +767 -0
  88. package/src/skills/write/templates/neurips2025/Makefile +36 -0
  89. package/src/skills/write/templates/neurips2025/extra_pkgs.tex +53 -0
  90. package/src/skills/write/templates/neurips2025/main.tex +38 -0
  91. package/src/skills/write/templates/neurips2025/neurips.sty +382 -0
  92. package/src/skills/write/templates/nsdi2027/main.tex +426 -0
  93. package/src/skills/write/templates/nsdi2027/references.bib +151 -0
  94. package/src/skills/write/templates/nsdi2027/usenix-2020-09.sty +83 -0
  95. package/src/skills/write/templates/osdi2026/main.tex +429 -0
  96. package/src/skills/write/templates/osdi2026/references.bib +150 -0
  97. package/src/skills/write/templates/osdi2026/usenix-2020-09.sty +83 -0
  98. package/src/skills/write/templates/sosp2026/main.tex +532 -0
  99. package/src/skills/write/templates/sosp2026/references.bib +148 -0
  100. package/src/tui/package.json +1 -1
  101. package/src/ui/dist/assets/{AiManusChatView-BS3V4ZOk.js → AiManusChatView-m2FNtwbn.js} +110 -14
  102. package/src/ui/dist/assets/{AnalysisPlugin-DLPXQsmr.js → AnalysisPlugin-BMTF8EGL.js} +1 -1
  103. package/src/ui/dist/assets/{AutoFigurePlugin-C-Fr9knQ.js → AutoFigurePlugin-DxPdMUNb.js} +5 -5
  104. package/src/ui/dist/assets/{CliPlugin-Dd8AHzFg.js → CliPlugin-BEOWgxCI.js} +9 -9
  105. package/src/ui/dist/assets/{CodeEditorPlugin-Dg-RepTl.js → CodeEditorPlugin-BCXvjqmb.js} +8 -8
  106. package/src/ui/dist/assets/{CodeViewerPlugin-D2J_3nyt.js → CodeViewerPlugin-DaJcy3nD.js} +5 -5
  107. package/src/ui/dist/assets/{DocViewerPlugin-ChRLLKNb.js → DocViewerPlugin-ByfeIq4K.js} +3 -3
  108. package/src/ui/dist/assets/{GitDiffViewerPlugin-DgHfcved.js → GitDiffViewerPlugin-Cksf3VZ-.js} +830 -86
  109. package/src/ui/dist/assets/{ImageViewerPlugin-C89GZMBy.js → ImageViewerPlugin-CFz-OsTS.js} +5 -5
  110. package/src/ui/dist/assets/{LabCopilotPanel-BUfIwUcb.js → LabCopilotPanel-CJ1cJzoX.js} +10 -10
  111. package/src/ui/dist/assets/{LabPlugin-zvUmQUMq.js → LabPlugin-BF3dVJwa.js} +1 -1
  112. package/src/ui/dist/assets/{LatexPlugin-C1SSNuWp.js → LatexPlugin-DDkwZ6Sj.js} +7 -7
  113. package/src/ui/dist/assets/{MarkdownViewerPlugin-D2Mf5tU5.js → MarkdownViewerPlugin-HAuvurcT.js} +4 -4
  114. package/src/ui/dist/assets/{MarketplacePlugin-CF4LgiS2.js → MarketplacePlugin-BtoTYy2C.js} +3 -3
  115. package/src/ui/dist/assets/{index-Be0NAmh8.js → NotebookEditor-CSJYx7b-.js} +12 -155
  116. package/src/ui/dist/assets/{NotebookEditor-BM7Bgwlv.js → NotebookEditor-DQgRezm_.js} +1 -1
  117. package/src/ui/dist/assets/{PdfLoader-Bc5qfD-Z.js → PdfLoader-DPa_-fv6.js} +1 -1
  118. package/src/ui/dist/assets/{PdfMarkdownPlugin-sh1-IRcp.js → PdfMarkdownPlugin-BZpXOEjm.js} +3 -3
  119. package/src/ui/dist/assets/{PdfViewerPlugin-C_a7CpWG.js → PdfViewerPlugin-BT8a6wGR.js} +10 -10
  120. package/src/ui/dist/assets/{SearchPlugin-L4z3HcLf.js → SearchPlugin-D_blveZi.js} +1 -1
  121. package/src/ui/dist/assets/{Stepper-Dk4aQ3fN.js → Stepper-DH2k75Vo.js} +1 -1
  122. package/src/ui/dist/assets/{TextViewerPlugin-BsNtlKVo.js → TextViewerPlugin-Btx0M3hX.js} +4 -4
  123. package/src/ui/dist/assets/{VNCViewer-BpeDcZ5_.js → VNCViewer-DImJO4rO.js} +9 -9
  124. package/src/ui/dist/assets/{bibtex-C4QI-bbj.js → bibtex-B-Hqu0Sg.js} +1 -1
  125. package/src/ui/dist/assets/{code-DuMINRsg.js → code-BUfXGJSl.js} +1 -1
  126. package/src/ui/dist/assets/{file-content-C3N-432K.js → file-content-VqamwI3X.js} +1 -1
  127. package/src/ui/dist/assets/{file-diff-panel-CffQ4ZMg.js → file-diff-panel-C_wOoS7a.js} +1 -1
  128. package/src/ui/dist/assets/{file-socket-CRH59PCO.js → file-socket-D2bTuMVP.js} +1 -1
  129. package/src/ui/dist/assets/{file-utils-vYGtW2mI.js → file-utils--zJCPN1i.js} +1 -1
  130. package/src/ui/dist/assets/{image-DBVGaooo.js → image-BZkGJ4mM.js} +1 -1
  131. package/src/ui/dist/assets/{index-DjSFDmgB.js → index-CxkvSeKw.js} +2 -2
  132. package/src/ui/dist/assets/{index-BpjYH9Vg.js → index-D9QIGcmc.js} +1 -1
  133. package/src/ui/dist/assets/{index-Do9N28uB.css → index-DXZ1daiJ.css} +163 -34
  134. package/src/ui/dist/assets/index-DdRW6RMJ.js +159 -0
  135. package/src/ui/dist/assets/{index-B1P6hQRJ.js → index-DjggJovS.js} +3029 -1780
  136. package/src/ui/dist/assets/{message-square-BsPDBhiY.js → message-square-FUIPIhU2.js} +1 -1
  137. package/src/ui/dist/assets/{monaco-BTkdPojV.js → monaco-DHMc7kKM.js} +1 -1
  138. package/src/ui/dist/assets/{popover-cWjCk-vc.js → popover-B85oCgCS.js} +1 -1
  139. package/src/ui/dist/assets/{project-sync-CXn530xb.js → project-sync-DOMCcPac.js} +1 -1
  140. package/src/ui/dist/assets/{sigma-04Jr12jg.js → sigma-BO2rQrl3.js} +1 -1
  141. package/src/ui/dist/assets/{tooltip-BdVDl0G5.js → tooltip-B1OspAkx.js} +1 -1
  142. package/src/ui/dist/assets/{trash-CB_GlQyC.js → trash-BsVEH_dV.js} +1 -1
  143. package/src/ui/dist/assets/{useCliAccess-BL932NwS.js → useCliAccess-b8L6JuZm.js} +1 -1
  144. package/src/ui/dist/assets/{useFileDiffOverlay-B2WK7Tvq.js → useFileDiffOverlay-BY7uA9hV.js} +1 -1
  145. package/src/ui/dist/assets/{wrap-text-YC68g12z.js → wrap-text-BwyVuUIK.js} +1 -1
  146. package/src/ui/dist/assets/{zoom-out-C0RJvFiJ.js → zoom-out-RDpLugQP.js} +1 -1
  147. package/src/ui/dist/index.html +5 -2
  148. /package/src/ui/dist/assets/{index-CccQYZjX.css → NotebookEditor-CccQYZjX.css} +0 -0
@@ -4,6 +4,27 @@ from collections import OrderedDict
4
4
  from typing import Any
5
5
 
6
6
 
7
class MetricContractValidationError(ValueError):
    """Validation failure for a metric contract or metric submission.

    Besides the human-readable message, the error carries a machine-readable
    ``error_code`` and a ``details`` mapping that :meth:`as_payload` merges
    into a structured failure payload.
    """

    def __init__(
        self,
        message: str,
        *,
        error_code: str = "metric_contract_validation_failed",
        details: dict[str, Any] | None = None,
    ) -> None:
        """Store the message, error code and a private copy of ``details``."""
        super().__init__(message)
        self.error_code = error_code
        # Shallow-copy so later mutation of the caller's dict cannot change
        # what this error reports.
        self.details = dict(details) if details else {}

    def as_payload(self) -> dict[str, Any]:
        """Return a failure payload: envelope fields followed by the details.

        The envelope keys ``ok``, ``error_code`` and ``message`` always
        reflect this error; a same-named key in ``details`` is ignored rather
        than allowed to overwrite them (e.g. flipping ``ok`` to ``True``).
        """
        payload: dict[str, Any] = {
            "ok": False,
            "error_code": self.error_code,
            "message": str(self),
        }
        for key, value in self.details.items():
            # setdefault keeps the reserved envelope values authoritative.
            payload.setdefault(key, value)
        return payload
26
+
27
+
7
28
  def as_metric_id(value: object, *, fallback: str | None = None) -> str:
8
29
  text = str(value or "").strip()
9
30
  if text:
@@ -34,6 +55,17 @@ def infer_metric_direction(metric_id: str) -> str:
34
55
  return "maximize"
35
56
 
36
57
 
58
def normalize_metric_direction(value: object, *, metric_id: str | None = None) -> str:
    """Map a free-form direction label to ``"maximize"`` or ``"minimize"``.

    The label is lower-cased and hyphens/spaces are turned into underscores
    before it is compared with the known aliases. An unrecognized label falls
    back to ``infer_metric_direction(metric_id)`` when ``metric_id`` is
    truthy, and to ``"maximize"`` otherwise.
    """
    maximize_aliases = (
        "maximize",
        "max",
        "higher",
        "higher_better",
        "more_is_better",
        "greater_is_better",
    )
    minimize_aliases = (
        "minimize",
        "min",
        "lower",
        "lower_better",
        "less_is_better",
        "smaller_is_better",
    )
    token = str(value or "").strip().lower()
    token = token.replace("-", "_").replace(" ", "_")
    if token in maximize_aliases:
        return "maximize"
    if token in minimize_aliases:
        return "minimize"
    return infer_metric_direction(metric_id) if metric_id else "maximize"
67
+
68
+
37
69
  def normalize_metrics_summary(summary: object) -> dict[str, Any]:
38
70
  if not isinstance(summary, dict):
39
71
  return {}
@@ -46,6 +78,409 @@ def normalize_metrics_summary(summary: object) -> dict[str, Any]:
46
78
  return normalized
47
79
 
48
80
 
81
def flatten_metric_leaf_map(summary: object, *, separator: str = ".") -> dict[str, Any]:
    """Flatten a nested mapping into ``{"joined.path": leaf_value}``.

    Nested dicts are walked depth-first in insertion order. Keys are
    stringified and stripped; keys that end up empty (or are falsy) are
    skipped together with their subtree. Any non-dict value is a leaf. A
    non-dict ``summary`` yields an empty dict.
    """

    def walk(node: object, trail: tuple[str, ...]):
        if not isinstance(node, dict):
            # Leaves are only recorded once at least one key has been seen.
            if trail:
                yield separator.join(trail), node
            return
        for raw_key, child in node.items():
            label = str(raw_key or "").strip()
            if label:
                yield from walk(child, (*trail, label))

    return dict(walk(summary, ()))
102
+
103
+
104
+ def _resolve_origin_path_value(summary: object, origin_path: object) -> Any:
105
+ if not isinstance(summary, dict):
106
+ return None
107
+ normalized_path = str(origin_path or "").strip().replace("/", ".")
108
+ if not normalized_path:
109
+ return None
110
+ current: Any = summary
111
+ for part in normalized_path.split("."):
112
+ normalized_part = str(part or "").strip()
113
+ if not normalized_part:
114
+ continue
115
+ if not isinstance(current, dict) or normalized_part not in current:
116
+ return None
117
+ current = current[normalized_part]
118
+ return current
119
+
120
+
121
+ def _metric_explanation_fields(metric: dict[str, Any]) -> dict[str, str | None]:
122
+ description = str(metric.get("description") or metric.get("explanation") or "").strip() or None
123
+ derivation = str(metric.get("derivation") or metric.get("how_derived") or "").strip() or None
124
+ source_ref = str(metric.get("source_ref") or metric.get("source") or "").strip() or None
125
+ origin_path = str(metric.get("origin_path") or metric.get("source_path") or "").strip() or None
126
+ return {
127
+ "description": description,
128
+ "derivation": derivation,
129
+ "source_ref": source_ref,
130
+ "origin_path": origin_path,
131
+ }
132
+
133
+
134
def resolve_metric_value_from_summary(
    metric_id: str,
    *,
    metrics_summary: object = None,
    primary_metric: object = None,
    origin_path: object = None,
) -> float | None:
    """Resolve a numeric value for ``metric_id`` from the submitted data.

    Sources are tried in this order, and the first one that ``to_number``
    accepts wins:

    1. the entry keyed by ``metric_id`` in the normalized metrics summary;
    2. the value found at ``origin_path`` inside the raw ``metrics_summary``;
    3. ``primary_metric["value"]`` when ``primary_metric`` is a dict whose
       ``metric_id``/``name``/``id`` equals ``metric_id``.

    Returns ``None`` when ``metric_id`` is blank or no source yields a number.
    A string ``primary_metric`` carries no value, so it never contributes.
    """
    normalized_metric_id = str(metric_id or "").strip()
    if not normalized_metric_id:
        return None

    summary = normalize_metrics_summary(metrics_summary)
    direct_number = to_number(summary.get(normalized_metric_id))
    if direct_number is not None:
        return direct_number

    # The origin path is resolved against the raw (un-normalized) summary.
    origin_number = to_number(_resolve_origin_path_value(metrics_summary, origin_path))
    if origin_number is not None:
        return origin_number

    if isinstance(primary_metric, dict):
        primary_metric_id = str(
            primary_metric.get("metric_id") or primary_metric.get("name") or primary_metric.get("id") or ""
        ).strip()
        if primary_metric_id == normalized_metric_id:
            # May still be None when the primary value is not numeric.
            return to_number(primary_metric.get("value"))
    return None
164
+
165
+
166
def canonicalize_baseline_submission(
    *,
    metric_contract: object,
    metrics_summary: object = None,
    primary_metric: object = None,
) -> dict[str, Any]:
    """Build the canonical view of a baseline metric submission.

    When the contract lists explicit metrics, each normalized contract metric
    is resolved to a number via ``resolve_metric_value_from_summary``;
    required metrics without a value are reported as unresolved. Without
    explicit metrics, every numeric metric found in ``metrics_summary`` is
    taken as-is with empty explanation fields.

    Returns a dict with the keys ``metric_contract``, ``metrics_summary``,
    ``metric_details``, ``unresolved_metric_ids`` and ``source_leaf_map``.
    """
    contract_payload = metric_contract if isinstance(metric_contract, dict) else {}
    declared_metrics = contract_payload.get("metrics")
    has_explicit_metrics = isinstance(declared_metrics, list) and bool(declared_metrics)

    # The summary is deliberately withheld here so the normalized contract is
    # not extended with metrics that only appear in the submitted summary.
    normalized_contract = normalize_metric_contract(
        contract_payload,
        metrics_summary=None,
        primary_metric=primary_metric,
    )

    resolved_values: dict[str, float] = {}
    details: list[dict[str, Any]] = []
    unresolved: list[str] = []

    if has_explicit_metrics:
        for entry in normalized_contract.get("metrics", []):
            if not isinstance(entry, dict):
                continue
            metric_id = str(entry.get("metric_id") or "").strip()
            if not metric_id:
                continue
            explanation = _metric_explanation_fields(entry)
            resolved = resolve_metric_value_from_summary(
                metric_id,
                metrics_summary=metrics_summary,
                primary_metric=primary_metric,
                origin_path=explanation.get("origin_path"),
            )
            is_required = bool(entry.get("required", True))
            record = {
                **entry,
                "metric_id": metric_id,
                "required": is_required,
                **explanation,
            }
            record["value"] = resolved
            if resolved is not None:
                resolved_values[metric_id] = resolved
            elif is_required:
                unresolved.append(metric_id)
            details.append(record)
    else:
        numeric_map = extract_numeric_metric_map(metrics_summary=metrics_summary)
        for metric_id, number in numeric_map.items():
            resolved_values[metric_id] = number
            details.append(
                {
                    "metric_id": metric_id,
                    "required": True,
                    "description": None,
                    "derivation": None,
                    "source_ref": None,
                    "origin_path": None,
                    "value": number,
                }
            )

    return {
        "metric_contract": normalized_contract,
        "metrics_summary": dict(resolved_values),
        "metric_details": details,
        "unresolved_metric_ids": unresolved,
        "source_leaf_map": flatten_metric_leaf_map(metrics_summary),
    }
234
+
235
+
236
def validate_baseline_metric_contract_submission(
    *,
    metric_contract: object,
    metrics_summary: object = None,
    primary_metric: object = None,
) -> dict[str, Any]:
    """Canonicalize a baseline submission and reject incomplete contracts.

    Checks run in this order, each raising ``MetricContractValidationError``:

    1. ``baseline_metric_contract_missing_entries`` - the normalized contract
       has no metric entries;
    2. ``baseline_metric_values_missing`` - a required metric has no value;
    3. ``baseline_metric_explanations_missing`` - a metric lacks a
       description, a derivation/origin path, or a source reference;
    4. ``baseline_metric_contract_empty`` - no numeric metric was produced.

    Returns the dict built by ``canonicalize_baseline_submission``.
    """
    canonical = canonicalize_baseline_submission(
        metric_contract=metric_contract,
        metrics_summary=metrics_summary,
        primary_metric=primary_metric,
    )
    contract = canonical["metric_contract"]
    details_list = canonical["metric_details"]
    numeric_metrics = canonical["metrics_summary"]
    unresolved_ids = canonical["unresolved_metric_ids"]
    source_paths = sorted(canonical["source_leaf_map"].keys())

    def _has_text(record: dict[str, Any], key: str) -> bool:
        return bool(str(record.get(key) or "").strip())

    contract_metrics = contract.get("metrics")
    if not (isinstance(contract_metrics, list) and contract_metrics):
        raise MetricContractValidationError(
            "Baseline metric contract must define explicit metric entries for every canonical metric.",
            error_code="baseline_metric_contract_missing_entries",
            details={
                "validation_stage": "baseline",
                "baseline_metric_ids": [],
                "baseline_metric_details": details_list,
                "source_metric_paths": source_paths,
            },
        )

    if unresolved_ids:
        raise MetricContractValidationError(
            "Baseline metric contract is missing canonical values for one or more required metrics.",
            error_code="baseline_metric_values_missing",
            details={
                "validation_stage": "baseline",
                "missing_metric_ids": unresolved_ids,
                "baseline_metric_ids": list(numeric_metrics.keys()),
                "baseline_metric_details": details_list,
                "source_metric_paths": source_paths,
            },
        )

    incomplete: list[dict[str, Any]] = []
    for record in details_list:
        if not isinstance(record, dict):
            continue
        gaps: list[str] = []
        if not _has_text(record, "description"):
            gaps.append("description")
        # Either a derivation or an origin path is enough.
        if not _has_text(record, "derivation") and not _has_text(record, "origin_path"):
            gaps.append("derivation_or_origin_path")
        if not _has_text(record, "source_ref"):
            gaps.append("source_ref")
        if gaps:
            incomplete.append(
                {
                    "metric_id": record.get("metric_id"),
                    "missing_fields": gaps,
                    "detail": record,
                }
            )

    if incomplete:
        raise MetricContractValidationError(
            "Baseline metric contract must explain every canonical metric with description, derivation/origin path, and source reference.",
            error_code="baseline_metric_explanations_missing",
            details={
                "validation_stage": "baseline",
                "baseline_metric_ids": list(numeric_metrics.keys()),
                "baseline_metric_details": details_list,
                "missing_explanations": incomplete,
                "source_metric_paths": source_paths,
            },
        )

    if not numeric_metrics:
        raise MetricContractValidationError(
            "Baseline metric contract did not yield any canonical numeric metrics.",
            error_code="baseline_metric_contract_empty",
            details={
                "validation_stage": "baseline",
                "baseline_metric_ids": [],
                "baseline_metric_details": details_list,
                "source_metric_paths": source_paths,
            },
        )

    return canonical
322
+
323
+
324
def validate_main_experiment_against_baseline_contract(
    *,
    baseline_contract_payload: object,
    run_metric_contract: object = None,
    metric_rows: object = None,
    metrics_summary: object = None,
    dataset_scope: object = None,
) -> dict[str, Any]:
    """Check a main-experiment run against the canonical baseline contract.

    The run must report every required, non-supplementary baseline metric
    that has a numeric baseline value, use the same optimization direction
    for each of them, and agree with the baseline evaluation protocol on
    scope, code hashes and code paths (hashes/paths are only compared when
    both sides provide them).

    Returns a dict with ``baseline_metric_ids``, ``baseline_metric_details``
    and ``extra_metric_ids`` (run metrics that are not required by the
    baseline).

    Raises:
        MetricContractValidationError: with error code
            ``baseline_metric_contract_json_missing`` when the baseline
            payload is not a non-empty dict,
            ``baseline_metric_contract_empty`` when the baseline exposes no
            required numeric metric, or
            ``main_experiment_metric_validation_failed`` for missing metrics,
            direction mismatches, or scope/protocol mismatches.
    """
    baseline_payload = baseline_contract_payload if isinstance(baseline_contract_payload, dict) else {}
    if not baseline_payload:
        raise MetricContractValidationError(
            "Canonical baseline metric contract JSON is missing, so main-experiment metric validation cannot run.",
            error_code="baseline_metric_contract_json_missing",
            details={
                "validation_stage": "main_experiment",
                "baseline_metric_ids": [],
                "baseline_metric_details": [],
            },
        )

    baseline_metrics_summary = extract_numeric_metric_map(metrics_summary=baseline_payload.get("metrics_summary"))
    baseline_contract = normalize_metric_contract(
        baseline_payload.get("metric_contract"),
        metrics_summary=baseline_metrics_summary,
        primary_metric=baseline_payload.get("primary_metric"),
    )
    baseline_meta_map = extract_metric_meta_map(
        metric_contract=baseline_contract,
        metrics_summary=baseline_metrics_summary,
    )

    baseline_details: list[dict[str, Any]] = []
    required_metric_ids: list[str] = []
    for metric in baseline_contract.get("metrics", []):
        if not isinstance(metric, dict):
            continue
        metric_id = str(metric.get("metric_id") or "").strip()
        # Only metrics with a numeric baseline value can be compared.
        if not metric_id or metric_id not in baseline_metrics_summary:
            continue
        baseline_details.append(
            {
                **metric,
                **_metric_explanation_fields(metric),
                "metric_id": metric_id,
                "baseline_value": baseline_metrics_summary.get(metric_id),
            }
        )
        if bool(metric.get("required", True)) and not bool(metric.get("supplementary", False)):
            required_metric_ids.append(metric_id)

    if not required_metric_ids:
        raise MetricContractValidationError(
            "Canonical baseline metric contract does not expose any required numeric metrics for comparison.",
            error_code="baseline_metric_contract_empty",
            details={
                "validation_stage": "main_experiment",
                "baseline_metric_ids": [],
                "baseline_metric_details": baseline_details,
            },
        )

    # Set for O(1) membership tests; the list keeps the reporting order.
    required_id_set = set(required_metric_ids)
    # The parameter is typed ``object``: anything that is not a dict is
    # treated as "no run contract" instead of failing on ``.get``.
    run_contract = run_metric_contract if isinstance(run_metric_contract, dict) else {}

    run_numeric_metrics = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
    run_meta_map = extract_metric_meta_map(
        metric_contract=run_metric_contract,
        metric_rows=metric_rows,
        metrics_summary=metrics_summary,
    )
    missing_metric_ids = [metric_id for metric_id in required_metric_ids if metric_id not in run_numeric_metrics]
    extra_metric_ids = [metric_id for metric_id in run_numeric_metrics if metric_id not in required_id_set]

    direction_mismatches: list[dict[str, Any]] = []
    for metric_id in required_metric_ids:
        if metric_id not in run_numeric_metrics:
            continue
        baseline_direction = normalize_metric_direction(
            (baseline_meta_map.get(metric_id) or {}).get("direction"),
            metric_id=metric_id,
        )
        run_direction = normalize_metric_direction(
            (run_meta_map.get(metric_id) or {}).get("direction"),
            metric_id=metric_id,
        )
        if baseline_direction != run_direction:
            direction_mismatches.append(
                {
                    "metric_id": metric_id,
                    "expected_direction": baseline_direction,
                    "actual_direction": run_direction,
                }
            )

    raw_expected_eval = baseline_contract.get("evaluation_protocol")
    expected_eval = dict(raw_expected_eval) if isinstance(raw_expected_eval, dict) else {}
    raw_actual_eval = run_contract.get("evaluation_protocol")
    actual_eval = dict(raw_actual_eval) if isinstance(raw_actual_eval, dict) else {}

    expected_scope = str(
        expected_eval.get("scope_id")
        or expected_eval.get("dataset_scope")
        or dataset_scope
        or ""
    ).strip() or None
    # Scopes reported on the rows of required metrics; rows without a scope
    # fall back to ``dataset_scope`` and blank scopes are dropped.
    actual_scopes = sorted(
        {
            str(row.get("scope_id") or row.get("scope") or dataset_scope or "").strip()
            for row in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary)
            if isinstance(row, dict) and str(row.get("metric_id") or "").strip() in required_id_set
        }
        - {""}
    )
    scope_mismatch = bool(expected_scope and actual_scopes and any(scope != expected_scope for scope in actual_scopes))

    eval_protocol_mismatch: dict[str, Any] | None = None
    if expected_eval and actual_eval:
        expected_code_hashes = expected_eval.get("code_hashes") if isinstance(expected_eval.get("code_hashes"), dict) else {}
        actual_code_hashes = actual_eval.get("code_hashes") if isinstance(actual_eval.get("code_hashes"), dict) else {}
        expected_code_paths = expected_eval.get("code_paths") if isinstance(expected_eval.get("code_paths"), list) else []
        actual_code_paths = actual_eval.get("code_paths") if isinstance(actual_eval.get("code_paths"), list) else []
        expected_protocol_scope = str(expected_eval.get("scope_id") or expected_eval.get("dataset_scope") or "").strip()
        actual_protocol_scope = str(actual_eval.get("scope_id") or actual_eval.get("dataset_scope") or "").strip()
        scope_differs = bool(expected_protocol_scope) and expected_protocol_scope != actual_protocol_scope
        hashes_differ = bool(expected_code_hashes and actual_code_hashes and expected_code_hashes != actual_code_hashes)
        paths_differ = bool(expected_code_paths and actual_code_paths and expected_code_paths != actual_code_paths)
        if scope_differs or hashes_differ or paths_differ:
            eval_protocol_mismatch = {
                "expected": expected_eval,
                "actual": actual_eval,
            }

    if missing_metric_ids or direction_mismatches or scope_mismatch or eval_protocol_mismatch:
        details: dict[str, Any] = {
            "validation_stage": "main_experiment",
            "baseline_metric_ids": required_metric_ids,
            "baseline_metric_details": baseline_details,
            "missing_metric_ids": missing_metric_ids,
            "extra_metric_ids": extra_metric_ids,
        }
        if direction_mismatches:
            details["direction_mismatches"] = direction_mismatches
        # Merge both mismatch kinds under one key so that a row-scope
        # mismatch is not discarded when the protocols also disagree.
        protocol_details: dict[str, Any] = {}
        if scope_mismatch:
            protocol_details["expected_scope_id"] = expected_scope
            protocol_details["actual_scope_ids"] = actual_scopes
        if eval_protocol_mismatch:
            protocol_details.update(eval_protocol_mismatch)
        if protocol_details:
            details["evaluation_protocol_mismatch"] = protocol_details
        raise MetricContractValidationError(
            "Main experiment must cover every required baseline metric and stay aligned with the canonical evaluation contract.",
            error_code="main_experiment_metric_validation_failed",
            details=details,
        )

    return {
        "baseline_metric_ids": required_metric_ids,
        "baseline_metric_details": baseline_details,
        "extra_metric_ids": extra_metric_ids,
    }
482
+
483
+
49
484
  def _normalize_metric_entry(metric: object, *, fallback_id: str | None = None) -> dict[str, Any]:
50
485
  if isinstance(metric, str):
51
486
  metric_id = as_metric_id(metric, fallback=fallback_id)
@@ -93,6 +528,7 @@ def normalize_metric_contract(
93
528
  *,
94
529
  baseline_id: str | None = None,
95
530
  metrics_summary: object = None,
531
+ metric_rows: object = None,
96
532
  primary_metric: object = None,
97
533
  baseline_variants: object = None,
98
534
  ) -> dict[str, Any]:
@@ -104,7 +540,7 @@ def normalize_metric_contract(
104
540
  normalized = _normalize_metric_entry(metric, fallback_id=f"metric_{index + 1}")
105
541
  metrics_by_id[normalized["metric_id"]] = normalized
106
542
 
107
- summary_metrics = normalize_metrics_summary(metrics_summary)
543
+ summary_metrics = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
108
544
  for metric_id in summary_metrics.keys():
109
545
  metrics_by_id.setdefault(metric_id, _normalize_metric_entry({}, fallback_id=metric_id))
110
546
 
@@ -112,7 +548,7 @@ def normalize_metric_contract(
112
548
  for variant in baseline_variants:
113
549
  if not isinstance(variant, dict):
114
550
  continue
115
- for metric_id in normalize_metrics_summary(variant.get("metrics_summary")).keys():
551
+ for metric_id in extract_numeric_metric_map(metrics_summary=variant.get("metrics_summary")).keys():
116
552
  metrics_by_id.setdefault(metric_id, _normalize_metric_entry({}, fallback_id=metric_id))
117
553
 
118
554
  primary_metric_id = str(contract_payload.get("primary_metric_id") or "").strip()
@@ -131,7 +567,13 @@ def normalize_metric_contract(
131
567
  if primary_metric_id:
132
568
  metrics_by_id.setdefault(primary_metric_id, _normalize_metric_entry({}, fallback_id=primary_metric_id))
133
569
 
570
+ preserved_top_level = {
571
+ key: value
572
+ for key, value in contract_payload.items()
573
+ if key not in {"contract_id", "primary_metric_id", "metrics"}
574
+ }
134
575
  return {
576
+ **preserved_top_level,
135
577
  "contract_id": str(contract_payload.get("contract_id") or baseline_id or "default").strip() or "default",
136
578
  "primary_metric_id": primary_metric_id or None,
137
579
  "metrics": list(metrics_by_id.values()),
@@ -152,10 +594,10 @@ def selected_baseline_metrics(entry: dict[str, Any] | None, selected_variant_id:
152
594
  if selected_variant is None and variants:
153
595
  selected_variant = next((item for item in variants if isinstance(item, dict)), None)
154
596
  if isinstance(selected_variant, dict):
155
- summary = normalize_metrics_summary(selected_variant.get("metrics_summary"))
597
+ summary = extract_numeric_metric_map(metrics_summary=selected_variant.get("metrics_summary"))
156
598
  if summary:
157
599
  return summary
158
- return normalize_metrics_summary(entry.get("metrics_summary"))
600
+ return extract_numeric_metric_map(metrics_summary=entry.get("metrics_summary"))
159
601
 
160
602
 
161
603
  def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str | None = None) -> list[dict[str, Any]]:
@@ -169,9 +611,8 @@ def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str
169
611
  if not isinstance(variant, dict):
170
612
  continue
171
613
  variant_id = str(variant.get("variant_id") or "").strip() or None
172
- metrics_summary = normalize_metrics_summary(variant.get("metrics_summary"))
614
+ metrics_summary = extract_numeric_metric_map(metrics_summary=variant.get("metrics_summary"))
173
615
  for metric_id, value in metrics_summary.items():
174
- numeric_value = to_number(value)
175
616
  lines.append(
176
617
  {
177
618
  "metric_id": metric_id,
@@ -179,14 +620,13 @@ def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str
179
620
  "baseline_id": baseline_id,
180
621
  "variant_id": variant_id,
181
622
  "selected": bool(selected_id and variant_id == selected_id),
182
- "value": numeric_value,
623
+ "value": value,
183
624
  "raw_value": value,
184
625
  }
185
626
  )
186
627
  if lines:
187
628
  return lines
188
- for metric_id, value in normalize_metrics_summary(entry.get("metrics_summary")).items():
189
- numeric_value = to_number(value)
629
+ for metric_id, value in extract_numeric_metric_map(metrics_summary=entry.get("metrics_summary")).items():
190
630
  lines.append(
191
631
  {
192
632
  "metric_id": metric_id,
@@ -194,7 +634,7 @@ def baseline_metric_lines(entry: dict[str, Any] | None, selected_variant_id: str
194
634
  "baseline_id": baseline_id,
195
635
  "variant_id": None,
196
636
  "selected": True,
197
- "value": numeric_value,
637
+ "value": value,
198
638
  "raw_value": value,
199
639
  }
200
640
  )
@@ -240,29 +680,252 @@ def normalize_metric_rows(
240
680
  return rows
241
681
 
242
682
 
683
def extract_numeric_metric_map(
    *,
    metric_rows: object = None,
    metrics_summary: object = None,
) -> dict[str, float]:
    """Collect the numeric metric values of a record, keyed by metric id.

    Rows produced by ``normalize_metric_rows`` are read first; entries from
    ``normalize_metrics_summary`` only fill in metric ids that no row
    supplied. Metrics whose value cannot be converted by ``to_number`` are
    left out.

    Args:
        metric_rows: Raw metric rows, passed through ``normalize_metric_rows``.
        metrics_summary: Raw metrics summary, passed to both normalizers.

    Returns:
        A plain dict mapping metric id to its numeric value, in first-seen
        order.
    """
    metrics: dict[str, float] = {}
    for row in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary):
        if not isinstance(row, dict):
            continue
        metric_id = str(row.get("metric_id") or "").strip()
        if not metric_id:
            continue
        # ``dict.get(key, default)`` only uses the default when the key is
        # absent, so a row carrying ``numeric_value: None`` used to hide a
        # perfectly usable ``value``. Try both explicitly instead.
        numeric_value = to_number(row.get("numeric_value"))
        if numeric_value is None:
            numeric_value = to_number(row.get("value"))
        if numeric_value is None:
            continue
        metrics[metric_id] = numeric_value
    for metric_id, value in normalize_metrics_summary(metrics_summary).items():
        if not metric_id or metric_id in metrics:
            continue
        numeric_value = to_number(value)
        if numeric_value is not None:
            metrics[metric_id] = numeric_value
    return metrics
703
+
704
+
705
def extract_metric_raw_value_map(
    *,
    metric_rows: object = None,
    metrics_summary: object = None,
) -> dict[str, Any]:
    """Return the unconverted ``value`` of every metric, keyed by metric id.

    Values found in the normalized metric rows win; the normalized metrics
    summary only contributes ids that the rows did not provide.
    """
    raw_by_id: OrderedDict[str, Any] = OrderedDict()
    for entry in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary):
        if isinstance(entry, dict):
            key = str(entry.get("metric_id") or "").strip()
            if key:
                raw_by_id[key] = entry.get("value")
    # Summary entries are a fallback only: never overwrite a row value.
    for key, raw in normalize_metrics_summary(metrics_summary).items():
        raw_by_id.setdefault(key, raw)
    return dict(raw_by_id)
723
+
724
+
725
def extract_metric_meta_map(
    *,
    metric_contract: object = None,
    metric_rows: object = None,
    metrics_summary: object = None,
) -> dict[str, dict[str, Any]]:
    """Build per-metric display metadata, keyed by metric id.

    Metadata is layered in three passes:

    1. entries of the normalized metric contract,
    2. overrides taken from the normalized metric rows (label, direction,
       unit, decimals, chart group),
    3. default entries for any metric id that has a numeric value but no
       metadata yet.
    """
    normalized_contract = normalize_metric_contract(
        metric_contract,
        metrics_summary=metrics_summary,
        metric_rows=metric_rows,
    )
    meta_by_id: OrderedDict[str, dict[str, Any]] = OrderedDict()

    # Pass 1: contract entries, with id/direction/label normalized.
    for contract_entry in normalized_contract.get("metrics", []):
        if not isinstance(contract_entry, dict):
            continue
        key = str(contract_entry.get("metric_id") or "").strip()
        if not key:
            continue
        seeded = dict(contract_entry)
        seeded["metric_id"] = key
        seeded["direction"] = normalize_metric_direction(contract_entry.get("direction"), metric_id=key)
        seeded["label"] = str(contract_entry.get("label") or key).strip() or key
        meta_by_id[key] = seeded

    # Pass 2: row-level fields override what the contract provided.
    for row in normalize_metric_rows(metric_rows, metrics_summary=metrics_summary):
        if not isinstance(row, dict):
            continue
        key = str(row.get("metric_id") or "").strip()
        if not key:
            continue
        base = dict(meta_by_id.get(key) or _normalize_metric_entry({}, fallback_id=key))
        label = str(row.get("label") or row.get("name") or base.get("label") or key).strip() or key
        row_decimals = row.get("decimals")
        decimals = row_decimals if isinstance(row_decimals, int) else base.get("decimals")
        merged = dict(base)
        merged["metric_id"] = key
        merged["label"] = label
        merged["direction"] = normalize_metric_direction(
            row.get("direction") or base.get("direction"),
            metric_id=key,
        )
        merged["unit"] = str(row.get("unit") or base.get("unit") or "").strip() or None
        merged["decimals"] = decimals
        merged["chart_group"] = (
            str(row.get("chart_group") or base.get("chart_group") or "default").strip() or "default"
        )
        meta_by_id[key] = merged

    # Pass 3: every metric with a numeric value gets at least a default entry.
    numeric_ids = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
    for key in numeric_ids:
        meta_by_id.setdefault(key, _normalize_metric_entry({}, fallback_id=key))
    return dict(meta_by_id)
772
+
773
+
774
def extract_metric_comparison_map(
    baseline_comparisons: object,
) -> dict[str, dict[str, Any]]:
    """Index the ``items`` of a baseline-comparison payload by metric id.

    Args:
        baseline_comparisons: Expected to be a dict holding an ``items`` list
            of per-metric comparison dicts. Anything else yields ``{}``.

    Returns:
        A dict mapping the stripped ``metric_id`` of each item to the item
        itself. Non-dict items and items without a usable id are skipped;
        when an id repeats, the last item wins.
    """
    if not isinstance(baseline_comparisons, dict):
        return {}
    items = baseline_comparisons.get("items")
    # ``items`` may be present but null (or otherwise malformed); iterating
    # it blindly would raise TypeError, so treat that as "no comparisons".
    if not isinstance(items, (list, tuple)):
        return {}
    comparison_by_id: dict[str, dict[str, Any]] = {}
    for item in items:
        if not isinstance(item, dict):
            continue
        metric_id = str(item.get("metric_id") or "").strip()
        # Check the *stripped* id so whitespace-only ids do not end up
        # registered under the empty-string key.
        if metric_id:
            comparison_by_id[metric_id] = item
    return comparison_by_id
783
+
784
+
785
def extract_metric_delta_map(
    *,
    metric_rows: object = None,
    baseline_comparisons: object = None,
) -> dict[str, float]:
    """Return the numeric delta-versus-baseline for each metric id.

    Deltas recorded in ``baseline_comparisons`` take priority; the ``delta``
    field of a normalized metric row is used only for ids that the
    comparisons did not cover. Deltas that ``to_number`` cannot convert are
    omitted.
    """
    deltas: OrderedDict[str, float] = OrderedDict()
    for key, comparison in extract_metric_comparison_map(baseline_comparisons).items():
        parsed = to_number(comparison.get("delta"))
        if parsed is None:
            continue
        deltas[key] = parsed
    for entry in normalize_metric_rows(metric_rows):
        if not isinstance(entry, dict):
            continue
        key = str(entry.get("metric_id") or "").strip()
        # Skip blank ids and ids already resolved from the comparisons.
        if key and key not in deltas:
            parsed = to_number(entry.get("delta"))
            if parsed is not None:
                deltas[key] = parsed
    return dict(deltas)
805
+
806
+
807
def resolve_primary_metric_id(
    *,
    metric_contract: object = None,
    metric_rows: object = None,
    metrics_summary: object = None,
    primary_metric: object = None,
    progress_eval: object = None,
    baseline_comparisons: object = None,
) -> str | None:
    """Pick the id of the primary metric among the record's numeric metrics.

    Candidate ids are considered in this order: ``progress_eval``,
    ``baseline_comparisons``, the normalized metric contract, then
    ``primary_metric`` (a dict with ``metric_id``/``name``/``id``, or a plain
    string). The first candidate that actually has a numeric value is
    returned; if none matches, the first numeric metric id is used.

    Returns:
        The chosen metric id, or ``None`` when there are no numeric metrics.
    """
    available = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
    if not available:
        return None

    normalized_contract = normalize_metric_contract(
        metric_contract,
        metrics_summary=metrics_summary,
        metric_rows=metric_rows,
        primary_metric=primary_metric,
    )

    declared = [
        progress_eval.get("primary_metric_id") if isinstance(progress_eval, dict) else None,
        baseline_comparisons.get("primary_metric_id") if isinstance(baseline_comparisons, dict) else None,
        normalized_contract.get("primary_metric_id"),
    ]
    if isinstance(primary_metric, dict):
        declared.append(
            primary_metric.get("metric_id") or primary_metric.get("name") or primary_metric.get("id") or ""
        )
    elif isinstance(primary_metric, str):
        declared.append(primary_metric)

    # Normalize to stripped strings and drop blanks, keeping priority order.
    candidates = [text for text in (str(raw or "").strip() for raw in declared) if text]

    match = next((name for name in candidates if name in available), None)
    if match is not None:
        return match
    return next(iter(available), None)
849
+
850
+
851
+ def extract_latest_metric(payload: dict[str, Any] | None) -> dict[str, Any] | None:
852
+ if not isinstance(payload, dict) or not payload:
853
+ return None
854
+ numeric_metrics = extract_numeric_metric_map(
855
+ metric_rows=payload.get("metric_rows"),
856
+ metrics_summary=payload.get("metrics_summary"),
857
+ )
858
+ if not numeric_metrics:
859
+ return None
860
+
861
+ metric_id = resolve_primary_metric_id(
862
+ metric_contract=payload.get("metric_contract"),
863
+ metric_rows=payload.get("metric_rows"),
864
+ metrics_summary=payload.get("metrics_summary"),
865
+ primary_metric=payload.get("primary_metric"),
866
+ progress_eval=payload.get("progress_eval"),
867
+ baseline_comparisons=payload.get("baseline_comparisons"),
868
+ )
869
+ if not metric_id:
870
+ return None
871
+ metric_value = numeric_metrics.get(metric_id)
872
+ if metric_value is None:
873
+ return None
874
+
875
+ meta_map = extract_metric_meta_map(
876
+ metric_contract=payload.get("metric_contract"),
877
+ metric_rows=payload.get("metric_rows"),
878
+ metrics_summary=payload.get("metrics_summary"),
879
+ )
880
+ delta_map = extract_metric_delta_map(
881
+ metric_rows=payload.get("metric_rows"),
882
+ baseline_comparisons=payload.get("baseline_comparisons"),
883
+ )
884
+ meta = meta_map.get(metric_id) or {}
885
+ result = {
886
+ "key": metric_id,
887
+ "value": metric_value,
888
+ }
889
+ if metric_id in delta_map:
890
+ result["delta_vs_baseline"] = delta_map[metric_id]
891
+ if meta.get("label"):
892
+ result["label"] = meta["label"]
893
+ if meta.get("direction"):
894
+ result["direction"] = meta["direction"]
895
+ if meta.get("unit"):
896
+ result["unit"] = meta["unit"]
897
+ if meta.get("decimals") is not None:
898
+ result["decimals"] = meta["decimals"]
899
+ return result
900
+
901
+
243
902
  def compare_with_baseline(
244
903
  *,
245
904
  metrics_summary: object,
905
+ metric_rows: object = None,
246
906
  metric_contract: object,
247
907
  baseline_metrics: object,
248
908
  ) -> dict[str, Any]:
249
- run_summary = normalize_metrics_summary(metrics_summary)
250
- baseline_summary = normalize_metrics_summary(baseline_metrics)
251
- contract = normalize_metric_contract(metric_contract, metrics_summary=run_summary)
909
+ run_summary = extract_numeric_metric_map(metric_rows=metric_rows, metrics_summary=metrics_summary)
910
+ baseline_summary = extract_numeric_metric_map(metrics_summary=baseline_metrics)
911
+ contract = normalize_metric_contract(metric_contract, metrics_summary=run_summary, metric_rows=metric_rows)
252
912
  items: list[dict[str, Any]] = []
253
- metric_ids = [item["metric_id"] for item in contract.get("metrics", [])]
913
+ metric_meta = extract_metric_meta_map(
914
+ metric_contract=contract,
915
+ metric_rows=metric_rows,
916
+ metrics_summary=run_summary,
917
+ )
918
+ metric_ids = [
919
+ metric_id
920
+ for metric_id in metric_meta.keys()
921
+ if metric_id in run_summary or metric_id in baseline_summary
922
+ ]
254
923
  for metric_id in baseline_summary.keys():
255
924
  if metric_id not in metric_ids:
256
925
  metric_ids.append(metric_id)
257
926
  for metric_id in run_summary.keys():
258
927
  if metric_id not in metric_ids:
259
928
  metric_ids.append(metric_id)
260
-
261
- metric_meta = {
262
- item["metric_id"]: item
263
- for item in contract.get("metrics", [])
264
- if isinstance(item, dict) and item.get("metric_id")
265
- }
266
929
  for metric_id in metric_ids:
267
930
  meta = metric_meta.get(metric_id) or _normalize_metric_entry({}, fallback_id=metric_id)
268
931
  run_value = run_summary.get(metric_id)
@@ -276,7 +939,7 @@ def compare_with_baseline(
276
939
  delta = run_number - baseline_number
277
940
  if baseline_number not in {0.0, -0.0}:
278
941
  relative_delta = delta / abs(baseline_number)
279
- direction = meta.get("direction") or infer_metric_direction(metric_id)
942
+ direction = normalize_metric_direction(meta.get("direction"), metric_id=metric_id)
280
943
  if direction == "maximize":
281
944
  better = run_number > baseline_number
282
945
  else:
@@ -285,7 +948,7 @@ def compare_with_baseline(
285
948
  {
286
949
  "metric_id": metric_id,
287
950
  "label": meta.get("label") or metric_id,
288
- "direction": meta.get("direction") or infer_metric_direction(metric_id),
951
+ "direction": normalize_metric_direction(meta.get("direction"), metric_id=metric_id),
289
952
  "unit": meta.get("unit"),
290
953
  "decimals": meta.get("decimals"),
291
954
  "chart_group": meta.get("chart_group"),
@@ -299,7 +962,10 @@ def compare_with_baseline(
299
962
  }
300
963
  )
301
964
 
302
- primary_metric_id = str(contract.get("primary_metric_id") or "").strip() or None
965
+ primary_metric_id = resolve_primary_metric_id(
966
+ metric_contract=contract,
967
+ metrics_summary=run_summary,
968
+ )
303
969
  primary_item = next((item for item in items if item["metric_id"] == primary_metric_id), None)
304
970
  if primary_item is None and items:
305
971
  primary_item = items[0]
@@ -372,6 +1038,60 @@ def compute_progress_eval(
372
1038
  }
373
1039
 
374
1040
 
1041
+ def _record_sort_key(record: dict[str, Any]) -> str:
1042
+ return str(record.get("updated_at") or record.get("created_at") or "")
1043
+
1044
+
1045
+ def _record_dedupe_key(record: dict[str, Any]) -> str:
1046
+ run_id = str(record.get("run_id") or "").strip()
1047
+ if run_id:
1048
+ return f"run:{run_id}"
1049
+ artifact_id = str(record.get("artifact_id") or "").strip()
1050
+ if artifact_id:
1051
+ return f"artifact:{artifact_id}"
1052
+ result_path = str(((record.get("paths") or {}) if isinstance(record.get("paths"), dict) else {}).get("result_json") or "").strip()
1053
+ if result_path:
1054
+ return f"path:{result_path}"
1055
+ branch_name = str(record.get("branch") or "").strip()
1056
+ return f"record:{branch_name}:{_record_sort_key(record)}"
1057
+
1058
+
1059
def _record_richness(record: dict[str, Any]) -> tuple[int, int, int, int, str]:
    """Score how much metric information a run record carries.

    The result is a tuple compared lexicographically: number of numeric
    metrics, number of baseline comparisons, whether ``paths["result_json"]``
    is set (1 or 0), number of metric metadata entries, and finally the
    record's sort key.
    """
    rows = record.get("metric_rows")
    summary = record.get("metrics_summary")

    numeric_count = len(extract_numeric_metric_map(metric_rows=rows, metrics_summary=summary))
    comparison_count = len(extract_metric_comparison_map(record.get("baseline_comparisons")))

    paths = record.get("paths")
    result_json = paths.get("result_json") if isinstance(paths, dict) else None
    has_result_path = 1 if result_json else 0

    meta_count = len(
        extract_metric_meta_map(
            metric_contract=record.get("metric_contract"),
            metric_rows=rows,
            metrics_summary=summary,
        )
    )
    return (numeric_count, comparison_count, has_result_path, meta_count, _record_sort_key(record))
1080
+
1081
+
1082
def dedupe_run_records(run_records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collapse run records that share a dedupe key, keeping the richest one.

    Non-dict entries are ignored. Records are visited in sort-key order; a
    record replaces the one already kept for its key unless it is strictly
    less rich according to ``_record_richness``. The survivors are returned
    sorted by their sort key.
    """
    candidates = [item for item in run_records if isinstance(item, dict)]
    candidates.sort(key=_record_sort_key)

    best_by_key: OrderedDict[str, dict[str, Any]] = OrderedDict()
    for candidate in candidates:
        key = _record_dedupe_key(candidate)
        incumbent = best_by_key.get(key)
        # Ties go to the candidate, i.e. the later record in sort order.
        if incumbent is not None and _record_richness(candidate) < _record_richness(incumbent):
            continue
        best_by_key[key] = candidate
    return sorted(best_by_key.values(), key=_record_sort_key)
1093
+
1094
+
375
1095
  def build_metrics_timeline(
376
1096
  *,
377
1097
  quest_id: str,
@@ -379,81 +1099,95 @@ def build_metrics_timeline(
379
1099
  baseline_entry: dict[str, Any] | None = None,
380
1100
  selected_variant_id: str | None = None,
381
1101
  ) -> dict[str, Any]:
382
- ordered_runs = sorted(
383
- [item for item in run_records if isinstance(item, dict)],
384
- key=lambda item: str(item.get("updated_at") or item.get("created_at") or ""),
385
- )
1102
+ ordered_runs = dedupe_run_records(run_records)
1103
+ baseline_metrics = selected_baseline_metrics(baseline_entry, selected_variant_id)
386
1104
  contract = normalize_metric_contract(
387
1105
  None,
388
1106
  baseline_id=str((baseline_entry or {}).get("baseline_id") or ""),
389
- metrics_summary=(baseline_entry or {}).get("metrics_summary"),
1107
+ metrics_summary=baseline_metrics,
390
1108
  primary_metric=(baseline_entry or {}).get("primary_metric"),
391
1109
  baseline_variants=(baseline_entry or {}).get("baseline_variants"),
392
1110
  )
1111
+ primary_metric_id = str(contract.get("primary_metric_id") or "").strip() or None
393
1112
  for record in ordered_runs:
394
- run_contract = record.get("metric_contract")
395
- if run_contract:
396
- contract = normalize_metric_contract(run_contract, metrics_summary=record.get("metrics_summary"))
1113
+ candidate = resolve_primary_metric_id(
1114
+ metric_contract=record.get("metric_contract"),
1115
+ metric_rows=record.get("metric_rows"),
1116
+ metrics_summary=record.get("metrics_summary"),
1117
+ progress_eval=record.get("progress_eval"),
1118
+ baseline_comparisons=record.get("baseline_comparisons"),
1119
+ )
1120
+ if candidate:
1121
+ primary_metric_id = candidate
397
1122
  break
398
1123
 
399
1124
  series_map: OrderedDict[str, dict[str, Any]] = OrderedDict()
1125
+ baseline_meta_map = extract_metric_meta_map(
1126
+ metric_contract=(baseline_entry or {}).get("metric_contract"),
1127
+ metrics_summary=baseline_metrics,
1128
+ )
1129
+
1130
+ def ensure_series(metric_id: str, meta: dict[str, Any] | None = None) -> dict[str, Any]:
1131
+ resolved_meta = meta or baseline_meta_map.get(metric_id) or _normalize_metric_entry({}, fallback_id=metric_id)
1132
+ if metric_id not in series_map:
1133
+ series_map[metric_id] = {
1134
+ "metric_id": metric_id,
1135
+ "label": resolved_meta.get("label") or metric_id,
1136
+ "direction": normalize_metric_direction(resolved_meta.get("direction"), metric_id=metric_id),
1137
+ "unit": resolved_meta.get("unit"),
1138
+ "decimals": resolved_meta.get("decimals"),
1139
+ "chart_group": resolved_meta.get("chart_group"),
1140
+ "baselines": [],
1141
+ "points": [],
1142
+ }
1143
+ else:
1144
+ series_map[metric_id]["label"] = resolved_meta.get("label") or series_map[metric_id]["label"]
1145
+ series_map[metric_id]["direction"] = normalize_metric_direction(
1146
+ resolved_meta.get("direction") or series_map[metric_id]["direction"],
1147
+ metric_id=metric_id,
1148
+ )
1149
+ series_map[metric_id]["unit"] = resolved_meta.get("unit") or series_map[metric_id]["unit"]
1150
+ if resolved_meta.get("decimals") is not None:
1151
+ series_map[metric_id]["decimals"] = resolved_meta.get("decimals")
1152
+ series_map[metric_id]["chart_group"] = (
1153
+ resolved_meta.get("chart_group") or series_map[metric_id]["chart_group"]
1154
+ )
1155
+ return series_map[metric_id]
1156
+
400
1157
  for metric in contract.get("metrics", []):
401
1158
  metric_id = str(metric.get("metric_id") or "").strip()
402
1159
  if not metric_id:
403
1160
  continue
404
- series_map[metric_id] = {
405
- "metric_id": metric_id,
406
- "label": metric.get("label") or metric_id,
407
- "direction": metric.get("direction") or infer_metric_direction(metric_id),
408
- "unit": metric.get("unit"),
409
- "decimals": metric.get("decimals"),
410
- "chart_group": metric.get("chart_group"),
411
- "baselines": [],
412
- "points": [],
413
- }
1161
+ ensure_series(metric_id, metric)
414
1162
 
415
1163
  for line in baseline_metric_lines(baseline_entry, selected_variant_id):
416
1164
  metric_id = str(line.get("metric_id") or "").strip()
417
1165
  if not metric_id:
418
1166
  continue
419
- series_map.setdefault(
420
- metric_id,
421
- {
422
- "metric_id": metric_id,
423
- "label": metric_id,
424
- "direction": infer_metric_direction(metric_id),
425
- "unit": None,
426
- "decimals": None,
427
- "chart_group": "default",
428
- "baselines": [],
429
- "points": [],
430
- },
431
- )
432
- series_map[metric_id]["baselines"].append(line)
1167
+ ensure_series(metric_id).setdefault("baselines", []).append(line)
433
1168
 
434
1169
  for index, record in enumerate(ordered_runs, start=1):
435
- summary = normalize_metrics_summary(record.get("metrics_summary"))
1170
+ numeric_metrics = extract_numeric_metric_map(
1171
+ metric_rows=record.get("metric_rows"),
1172
+ metrics_summary=record.get("metrics_summary"),
1173
+ )
1174
+ raw_values = extract_metric_raw_value_map(
1175
+ metric_rows=record.get("metric_rows"),
1176
+ metrics_summary=record.get("metrics_summary"),
1177
+ )
436
1178
  progress = record.get("progress_eval") if isinstance(record.get("progress_eval"), dict) else {}
437
- comparisons = record.get("baseline_comparisons") if isinstance(record.get("baseline_comparisons"), dict) else {}
438
- comparison_by_id = {
439
- str(item.get("metric_id") or "").strip(): item
440
- for item in comparisons.get("items", [])
441
- if isinstance(item, dict) and item.get("metric_id")
442
- }
443
- for metric_id, raw_value in summary.items():
444
- series_map.setdefault(
445
- metric_id,
446
- {
447
- "metric_id": metric_id,
448
- "label": metric_id,
449
- "direction": infer_metric_direction(metric_id),
450
- "unit": None,
451
- "decimals": None,
452
- "chart_group": "default",
453
- "baselines": [],
454
- "points": [],
455
- },
456
- )
1179
+ comparison_by_id = extract_metric_comparison_map(record.get("baseline_comparisons"))
1180
+ delta_by_id = extract_metric_delta_map(
1181
+ metric_rows=record.get("metric_rows"),
1182
+ baseline_comparisons=record.get("baseline_comparisons"),
1183
+ )
1184
+ record_meta = extract_metric_meta_map(
1185
+ metric_contract=record.get("metric_contract"),
1186
+ metric_rows=record.get("metric_rows"),
1187
+ metrics_summary=record.get("metrics_summary"),
1188
+ )
1189
+ for metric_id, numeric_value in numeric_metrics.items():
1190
+ ensure_series(metric_id, record_meta.get(metric_id))
457
1191
  comparison = comparison_by_id.get(metric_id, {})
458
1192
  series_map[metric_id]["points"].append(
459
1193
  {
@@ -463,9 +1197,9 @@ def build_metrics_timeline(
463
1197
  "created_at": record.get("updated_at") or record.get("created_at"),
464
1198
  "branch": record.get("branch"),
465
1199
  "idea_id": record.get("idea_id"),
466
- "value": to_number(raw_value),
467
- "raw_value": raw_value,
468
- "delta_vs_baseline": comparison.get("delta"),
1200
+ "value": numeric_value,
1201
+ "raw_value": raw_values.get(metric_id, numeric_value),
1202
+ "delta_vs_baseline": delta_by_id.get(metric_id),
469
1203
  "relative_delta_vs_baseline": comparison.get("relative_delta"),
470
1204
  "breakthrough": bool(progress.get("breakthrough")),
471
1205
  "breakthrough_level": progress.get("breakthrough_level"),
@@ -473,7 +1207,6 @@ def build_metrics_timeline(
473
1207
  }
474
1208
  )
475
1209
 
476
- primary_metric_id = str(contract.get("primary_metric_id") or "").strip() or None
477
1210
  series = [item for item in series_map.values() if item["points"] or item["baselines"]]
478
1211
  return {
479
1212
  "quest_id": quest_id,