fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,1405 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Refactored ASV comparison utility with clean abstractions and a small CLI.
5
+
6
+ Design goals
7
+ ------------
8
+ - Keep pure, unit-testable functions where possible
9
+ - Separate IO (tarballs, tempdirs) from comparison logic
10
+ - Encapsulate ASV access behind thin helpers
11
+ - Preserve behavior of the original script
12
+
13
+ Usage (same behavior):
14
+ python asv_compare_refactor.py
15
+ Prints a JSON object mapping benchmark names to comparison details.
16
+
17
+ Env vars respected (same names as original):
18
+ T_BENCH_TASK_LOGS_PATH | T_BENCH_CONTAINER_LOGS_PATH | default '/logs'
19
+ T_BENCH_FACTOR (float, e.g. '1.05')
20
+ T_BENCH_USE_STATS ('0'/'false' to disable)
21
+
22
+ Assumptions:
23
+ - `asv` and `asv_runner` are installed
24
+ - `asv.test.conf.json` is present in CWD (same as original)
25
+ """
26
+
27
+ import itertools
28
+ import json
29
+ import math
30
+ import os
31
+ import tarfile
32
+ import tempfile
33
+ from argparse import Namespace
34
+ from collections.abc import Iterable, Iterator
35
+ from dataclasses import dataclass
36
+ from enum import Enum
37
+ from pathlib import Path
38
+ from typing import Dict, List, Optional, Tuple
39
+
40
+ import asv
41
+
42
+ try: # asv._stats may not exist in older ASV versions
43
+ from asv import _stats as _ASV_STATS
44
+ except Exception: # pragma: no cover
45
+ _ASV_STATS = None
46
+ from asv import results as asv_results
47
+ from asv.benchmarks import Benchmarks
48
+ from asv_runner.statistics import get_err
49
+
50
+ # Hardcoded harness basenames to exclude from coverage (explicit list).
51
+ _HARNESS_BASENAMES: set[str] = {
52
+ "formulacode_testrunner.py",
53
+ "jinja_patch_plugin_pandas.py",
54
+ "asv_discover_and_cover.py",
55
+ "parser.py",
56
+ # Generated helper used only to list benchmarks; not core project code
57
+ "get_benchmarks.py",
58
+ }
59
+
60
+
61
+ def _agent_key(agent_name: str, model_name: str) -> str:
62
+ """Create compound agent:model key."""
63
+ return f"{agent_name}:{model_name}"
64
+
65
+
66
+ def _normalize_agent_key(agent_key: str, model_by_name: dict[str, str]) -> str:
67
+ """Ensure agent key is agent:model, falling back to agent:agent if needed."""
68
+ if ":" in agent_key:
69
+ agent_name, model_name = agent_key.split(":", 1)
70
+ if model_name:
71
+ return agent_key
72
+ model_name = model_by_name.get(agent_name, agent_name)
73
+ return _agent_key(agent_name, model_name)
74
+ model_name = model_by_name.get(agent_key, agent_key)
75
+ return _agent_key(agent_key, model_name)
76
+
77
+
78
+ def _find_agent_key(agents_data: dict, target_agent: str) -> str | None:
79
+ """Find the full key for an agent by its agent_name prefix."""
80
+ for key in agents_data:
81
+ if key.split(":")[0] == target_agent:
82
+ return key
83
+ return None
84
+
85
+
86
+ # -----------------------------
87
+ # Errors
88
+ # -----------------------------
89
+
90
+
91
class ExtractionError(RuntimeError):
    """Raised when a tarball member would be extracted outside the destination."""
93
+
94
+
95
+ # -----------------------------
96
+ # Utility & IO layer
97
+ # -----------------------------
98
+
99
+
100
def find_tarball(root: Path, name: str) -> Path:
    """Locate the tarball for ``name`` directly under ``root``.

    An exact ``{name}.tar.gz`` wins. Otherwise the most recently modified
    ``*.tar.gz`` whose stem contains ``name`` is returned.

    Raises:
        FileNotFoundError: if neither an exact nor a partial match exists.
    """
    preferred = root / f"{name}.tar.gz"
    if preferred.is_file():
        return preferred

    matches = [tb for tb in sorted(root.glob("*.tar.gz")) if name in tb.stem]
    if matches:
        # Newest by modification time; ties resolve to the first in sorted order.
        return max(matches, key=lambda tb: tb.stat().st_mtime)
    raise FileNotFoundError(f"Could not find {name}.tar.gz in {root}")
108
+
109
+
110
@dataclass
class AgentTarball:
    """A discovered agent tarball plus the identity parsed from its filename."""

    path: Path  # tarball location as yielded by the directory glob
    agent_number: int  # N from "postrun_agent-{N}-..."
    agent_name: str  # agent type: the text between "{N}-" and the colon
    model_name: str  # model name: the text between the colon and ".tar.gz"


def discover_agent_tarballs(root: Path) -> list[AgentTarball]:
    """
    Discover all agent tarballs in the new naming scheme:
    postrun_agent-{N}-{agent_type}:{model_name}.tar.gz

    The model name may contain dots (e.g. ``gpt-4.1``); files whose name does
    not follow the scheme are ignored.

    Returns list sorted by agent_number.
    """
    import re

    # The model group is matched lazily up to the first ".tar.gz" so that
    # dotted model names are kept instead of making the whole file unmatched.
    pattern = re.compile(r"postrun_agent-(\d+)-([^:]+):(.+?)\.tar\.gz")

    agents = []
    for tarball in root.glob("postrun_agent-*.tar.gz"):
        match = pattern.match(tarball.name)
        if match is None:
            continue
        number, agent_name, model_name = match.groups()
        agents.append(AgentTarball(tarball, int(number), agent_name, model_name))

    return sorted(agents, key=lambda a: a.agent_number)
140
+
141
+
142
+ def _safe_extract(tar: tarfile.TarFile, dest: Path) -> None:
143
+ base = dest.resolve()
144
+ for m in tar.getmembers():
145
+ target = (dest / m.name).resolve()
146
+ if base not in target.parents and target != base:
147
+ raise ExtractionError("Refusing to extract path outside destination")
148
+ # Keep current behavior despite Python 3.14 deprecation notice.
149
+ tar.extractall(dest)
150
+
151
+
152
def extract_topdir(tb: Path, dest_root: Path) -> Path:
    """Extract the gzip tarball ``tb`` into ``dest_root`` and pick its top directory.

    Selection order: the only sub-directory if there is exactly one; else the
    most recently modified sub-directory that contains ``results/``; else the
    first sub-directory listed; else ``dest_root`` itself.
    """
    with tarfile.open(tb, "r:gz") as archive:
        _safe_extract(archive, dest_root)

    subdirs = [entry for entry in dest_root.iterdir() if entry.is_dir()]
    if len(subdirs) == 1:
        return subdirs[0]

    with_results = [entry for entry in subdirs if (entry / "results").is_dir()]
    if with_results:
        return max(with_results, key=lambda entry: entry.stat().st_mtime)
    if subdirs:
        return subdirs[0]
    return dest_root
160
+
161
+
162
def read_sha(root: Path) -> Optional[str]:
    """Return the stripped contents of ``root/sha.txt``, or ``None`` if it cannot be read."""
    sha_file = root / "sha.txt"
    try:
        text = sha_file.read_text(encoding="utf-8")
    except Exception:
        # Best effort: a missing or unreadable file simply means "no sha".
        return None
    return text.strip()
168
+
169
+
170
def detect_machines(results_root: Path) -> List[str]:
    """List the names of the sub-directories of ``results_root``, sorted.

    Returns an empty list when ``results_root`` is not a directory.
    """
    if results_root.is_dir():
        return sorted(entry.name for entry in results_root.iterdir() if entry.is_dir())
    return []
174
+
175
+
176
def deconstruct_benchmark(
    bench: str,
) -> tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Split a benchmark string into ``(module, class, func, params)``.

    Accepted shapes include:
    - "module.func"
    - "module.Class.method(args)"
    - "pkg.subpkg.module.func"
    - "pkg.subpkg.Class.method(1, 'x')"
    - "func" (module-less)

    ``params`` keeps its surrounding parentheses. Any component that is not
    present is returned as ``None``.
    """
    head = bench.strip()

    # --- Peel off a trailing "(...)" parameter list, if there is one ---
    params: Optional[str] = None
    if head.endswith(")") and "(" in head:
        open_idx = head.index("(")
        close_idx = head.rindex(")")
        if close_idx > open_idx:
            params = head[open_idx : close_idx + 1].strip()
            head = head[:open_idx].strip()

    # --- Dotted name, ignoring empty segments ---
    segments = [seg for seg in head.split(".") if seg]
    if not segments:
        return (None, None, None, params)

    func = segments[-1]
    if len(segments) == 1:
        # Just "func"
        return (None, None, func, params)

    # Heuristic: with three or more segments, a penultimate segment that
    # starts with an uppercase letter is taken to be a class name.
    owner = segments[-2]
    classlike = owner[:1].isupper() and any(ch.isalpha() for ch in owner)
    if len(segments) >= 3 and classlike:
        return (".".join(segments[:-2]) or None, owner, func, params)
    return (".".join(segments[:-1]) or None, None, func, params)
229
+
230
+
231
def pick_machine_for_commit(results_root: Path, commit: Optional[str]) -> Optional[str]:
    """Choose which machine directory under ``results_root`` to read results from.

    Preference order:
    1. ``"dockertest"`` when present and either no commit is given or it has
       at least one result for ``commit``.
    2. The first machine (in sorted order) with a result for ``commit``.
    3. The first machine in sorted order.

    Returns ``None`` when no machine directories exist.
    """
    machines = detect_machines(results_root)
    if not machines:
        return None

    def has_result(machine: str) -> bool:
        # Only the first result is pulled from the iterator.
        found = asv_results.iter_results_for_machine_and_hash(str(results_root), machine, commit)
        return any(True for _ in found)

    if "dockertest" in machines and (not commit or has_result("dockertest")):
        return "dockertest"
    if commit:
        for machine in machines:
            if has_result(machine):
                return machine
    return machines[0]
245
+
246
+
247
def load_from_benchmarks_file(path: Path) -> Benchmarks:
    """Build a ``Benchmarks`` collection from a benchmarks.json file.

    Only dict entries that carry a 'name' field are kept, so metadata entries
    (such as "version") and malformed values are dropped.

    Raises:
        ValueError: if ``path`` is not an existing regular file.
    """
    if not path.is_file():
        raise ValueError(f"Benchmark list file {path} missing!")

    raw = asv.util.load_json(path, api_version=Benchmarks.api_version)

    entries = []
    for entry in raw.values():
        # A valid benchmark entry is a dict with a 'name' field.
        if isinstance(entry, dict) and "name" in entry:
            entries.append(entry)

    return Benchmarks(Namespace(benchmark_dir=path.parent), entries, regex=None)
263
+
264
+
265
def is_core_file(filepath: str) -> bool:
    """
    Check if a file is a core source file (not test/init/etc).

    The path is lower-cased and backslashes are turned into ``/`` before
    matching. A file is rejected when it is any of:
    - Test files (test/ or tests/ directories, test_/tests_ prefixes, _test.py suffix)
    - __init__.py files
    - Setup/config files (setup.py, conf.py, conftest.py, config.py)
    - Build/dist/.egg, site-packages or __pycache__ content, .pyc files
    - ASV benchmark sources (benchmarks/)
    - One of the harness scripts listed in ``_HARNESS_BASENAMES``

    Every directory/file pattern is anchored with ``(^|/)`` so that it applies
    both to absolute paths and to repo-relative paths such as ``setup.py`` or
    ``build/lib/mod.py``.

    Returns True only for remaining paths that end in ``.py``.
    """
    import re

    # Normalize path
    filepath = filepath.lower().replace("\\", "/")

    # Exclude patterns
    exclude_patterns = [
        r"(^|/)tests?/",  # test/ or tests/ directory
        r"(^|/)tests?_",  # test_ or tests_ prefix
        r"_test\.py$",  # _test.py suffix
        r"(^|/)__init__\.py$",  # __init__.py files
        r"(^|/)setup\.py$",  # setup.py
        r"(^|/)conf\.py$",  # conf.py (sphinx config)
        r"(^|/)conftest\.py$",  # conftest.py (pytest)
        r"(^|/)config\.py$",  # config.py
        r"(^|/)(build|dist|\.egg)/",  # build/dist directories
        r"(^|/)site-packages/",  # site-packages
        r"(^|/)__pycache__/",  # pycache
        r"\.pyc$",  # compiled python
        r"(^|/)benchmarks/",  # ASV benchmark sources
    ]
    if any(re.search(pattern, filepath) for pattern in exclude_patterns):
        return False

    # Exclude harness file basenames from this adapter directory and generated ones
    base = filepath.rsplit("/", 1)[-1]
    if base in _HARNESS_BASENAMES:
        return False

    # Must be a .py file
    return filepath.endswith(".py")
311
+
312
+
313
def get_folder_contents(root: Path) -> Optional[RunData]:
    """Load RunData from an extracted run directory.

    Returns ``None`` when ``root/results/benchmarks.json`` does not exist or
    when the loaded run contains no machine/environment names.
    """
    results_dir = root / "results"
    bench_json = results_dir / "benchmarks.json"
    if not bench_json.exists():
        return None

    benchmarks = load_from_benchmarks_file(bench_json)
    sha = read_sha(root)
    machine = pick_machine_for_commit(results_dir, sha)
    run = RunData.from_asv(results_dir, benchmarks, machine, sha)
    return run if run.machine_env_names else None
326
+
327
+
328
+ def _collect_core_file_coverage(benchmarks_json_path: Path) -> dict[str, dict]:
329
+ """
330
+ Collect per-core-file max coverage from benchmarks.json.
331
+
332
+ Returns mapping filepath -> {percent, covered, num_statements}
333
+
334
+ Prefers global_coverage if available (union of all benchmark runs),
335
+ otherwise aggregates per-benchmark coverage using max across params.
336
+
337
+ Gracefully handles missing or None coverage data by returning empty dict.
338
+ """
339
+ import json
340
+
341
+ try:
342
+ with open(benchmarks_json_path) as f:
343
+ benchmarks = json.load(f)
344
+ except Exception:
345
+ return {}
346
+
347
+ # First check if global_coverage is available (preferred)
348
+ if "global_coverage" in benchmarks:
349
+ global_cov = benchmarks["global_coverage"]
350
+ if isinstance(global_cov, dict) and "files" in global_cov:
351
+ all_files_coverage = global_cov["files"]
352
+ if isinstance(all_files_coverage, dict) and len(all_files_coverage) > 0:
353
+ # Filter for core files only
354
+ core_files_coverage = {
355
+ filepath: cov
356
+ for filepath, cov in all_files_coverage.items()
357
+ if isinstance(cov, dict) and is_core_file(filepath)
358
+ }
359
+ # Only return if we found core files with coverage
360
+ if core_files_coverage:
361
+ return core_files_coverage
362
+
363
+ # Fallback: collect max coverage across all benchmarks and params
364
+ all_files_coverage = {}
365
+
366
+ for bench_name, bench_data in benchmarks.items():
367
+ if bench_name in ("version", "global_coverage"): # Skip metadata
368
+ continue
369
+
370
+ # Skip if bench_data is not a dict (malformed data)
371
+ if not isinstance(bench_data, dict):
372
+ continue
373
+
374
+ # Gracefully handle missing or None coverage data
375
+ coverage_data = bench_data.get("coverage")
376
+ if coverage_data is None or not isinstance(coverage_data, dict):
377
+ continue
378
+
379
+ by_param = coverage_data.get("by_param")
380
+ if by_param is None or not isinstance(by_param, dict):
381
+ continue
382
+
383
+ # Aggregate coverage across all params for this benchmark (max)
384
+ for param_idx, param_data in by_param.items():
385
+ # Skip if param_data is not a dict
386
+ if not isinstance(param_data, dict):
387
+ continue
388
+
389
+ files = param_data.get("files")
390
+ if files is None or not isinstance(files, dict):
391
+ continue
392
+
393
+ for filepath, file_cov in files.items():
394
+ # Skip if file_cov is not a dict
395
+ if not isinstance(file_cov, dict):
396
+ continue
397
+
398
+ try:
399
+ percent = float(file_cov.get("percent", 0.0) or 0.0)
400
+ covered = int(file_cov.get("covered", 0) or 0)
401
+ num_statements = int(file_cov.get("num_statements", 0) or 0)
402
+ except (ValueError, TypeError):
403
+ # Skip this file if coverage data is malformed
404
+ continue
405
+ if num_statements > 0 and covered > num_statements:
406
+ covered = num_statements
407
+ if filepath not in all_files_coverage:
408
+ all_files_coverage[filepath] = {
409
+ "percent": percent,
410
+ "covered": covered,
411
+ "num_statements": num_statements,
412
+ **file_cov,
413
+ }
414
+ else:
415
+ # Keep the max coverage across params/benchmarks
416
+ if percent > float(all_files_coverage[filepath]["percent"]):
417
+ all_files_coverage[filepath] = {
418
+ "percent": percent,
419
+ "covered": covered,
420
+ "num_statements": num_statements,
421
+ **file_cov,
422
+ }
423
+
424
+ # Filter for core files only
425
+ core_files_coverage = {filepath: cov for filepath, cov in all_files_coverage.items() if is_core_file(filepath)}
426
+
427
+ return core_files_coverage
428
+
429
+
430
+ def _stats_from_core_cov(
431
+ core_files_coverage: dict[str, dict],
432
+ ) -> tuple[float, int, int]:
433
+ """
434
+ Calculate max coverage statistics from per-file coverage data.
435
+
436
+ Returns the maximum coverage percentage among all core files.
437
+ """
438
+ if not core_files_coverage:
439
+ return (0.0, 0, 0)
440
+
441
+ # Get max coverage percentage across all core files
442
+ max_coverage = max((min(f["percent"], 100.0) for f in core_files_coverage.values()), default=0.0)
443
+
444
+ num_covered = sum(1 for f in core_files_coverage.values() if f["percent"] > 0)
445
+ total_core = len(core_files_coverage)
446
+
447
+ return (max_coverage, num_covered, total_core)
448
+
449
+
450
def calculate_max_coverage(benchmarks_json_path: Path) -> tuple[float, int, int]:
    """
    Summarise core-file coverage recorded in a benchmarks.json file.

    The per-file data comes from ``_collect_core_file_coverage`` (which
    prefers global_coverage and otherwise aggregates per-benchmark coverage)
    and is reduced by ``_stats_from_core_cov``.

    Returns:
        (max_coverage_percent, num_core_files_covered, total_core_files)
    """
    return _stats_from_core_cov(_collect_core_file_coverage(benchmarks_json_path))
464
+
465
+
466
def calculate_max_coverage_from_tarball(tarball_path: Path) -> tuple[float, int, int]:
    """Compute max coverage from the benchmarks.json inside a tarball.

    The tarball is unpacked into a temporary directory that is removed again
    before returning. Best effort: any exception, or a tarball without
    ``results/benchmarks.json``, yields ``(0.0, 0, 0)``.
    """
    nothing = (0.0, 0, 0)
    try:
        with tempfile.TemporaryDirectory(prefix="tmp-cov-") as scratch:
            top = extract_topdir(tarball_path, Path(scratch))
            bench_json = top / "results" / "benchmarks.json"
            if not bench_json.exists():
                return nothing
            return calculate_max_coverage(bench_json)
    except Exception:
        return nothing
483
+
484
+
485
+ # Backward compatibility aliases
486
+ calculate_mean_coverage = calculate_max_coverage
487
+ calculate_mean_coverage_from_tarball = calculate_max_coverage_from_tarball
488
+
489
+
490
def collect_core_file_coverage_from_tarball(tarball_path: Path) -> dict[str, dict]:
    """Return the per-core-file coverage mapping stored inside a tarball.

    The tarball is unpacked into a temporary directory that is removed again
    before returning. Best effort: any exception, or a tarball without
    ``results/benchmarks.json``, yields an empty dict.
    """
    try:
        with tempfile.TemporaryDirectory(prefix="tmp-cov-") as scratch:
            top = extract_topdir(tarball_path, Path(scratch))
            bench_json = top / "results" / "benchmarks.json"
            if not bench_json.exists():
                return {}
            return _collect_core_file_coverage(bench_json)
    except Exception:
        return {}
505
+
506
+
507
def get_tarball_contents(tarball_pth: Path) -> Optional[RunData]:
    """
    Load RunData from a tarball, with fallback to extracted directory.

    The tarball is unpacked into a temporary directory and read from there.
    When that yields nothing, or fails with an archive/OS error (corrupted
    tarball, permission error, unsafe member path), the sibling directory
    named like the tarball without its ``.tar.gz`` suffix is read instead, if
    it exists.

    Returns:
        The loaded RunData, or ``None`` when no usable data was found.

    Raises:
        tarfile.TarError, OSError, EOFError, ExtractionError: the original
            failure, when reading the tarball failed and there is no
            already-extracted directory to fall back to.
    """
    # "x.tar.gz" -> "x": strip both suffixes to get the extracted directory.
    extracted_dir = tarball_pth.with_suffix("").with_suffix("")

    contents: Optional[RunData] = None
    failure: Optional[BaseException] = None
    try:
        with tempfile.TemporaryDirectory(prefix="tmp-") as pre_tmp:
            root = extract_topdir(tarball_pth, Path(pre_tmp))
            contents = get_folder_contents(root)
    except (tarfile.TarError, OSError, EOFError, ExtractionError) as exc:
        # Remember the error; it is re-raised below unless the fallback applies.
        failure = exc

    if contents is None and extracted_dir.is_dir():
        return get_folder_contents(extracted_dir)
    if failure is not None:
        raise failure
    return contents
520
+
521
+
522
def gmean(data: List[float]) -> float:
    """Compute the geometric mean of the positive values in ``data``.

    Non-positive values have no logarithm and are ignored, both in the sum
    and in the count. Returns NaN when ``data`` holds no positive value
    (including when it is empty).
    """
    logs = [math.log(x) for x in data if x > 0]
    if not logs:
        return float("nan")
    # Divide by the number of values actually summed, not by len(data).
    return math.exp(sum(logs) / len(logs))
526
+
527
+
528
+ # -----------------------------
529
+ # ASV access layer
530
+ # -----------------------------
531
+
532
+
533
@dataclass(frozen=True)
class ASVConfigPair:
    """Immutable pair of ASV configs: one for the 'pre' run, one for the 'post' run."""

    pre_cfg: asv.config.Config
    post_cfg: asv.config.Config

    @staticmethod
    def load(pre_results_dir: Path, post_results_dir: Path, asv_conf_json_loc: Path) -> ASVConfigPair:
        """Load the same ASV config file twice and point each copy at its own results dir."""
        configs = []
        for results_dir in (pre_results_dir, post_results_dir):
            cfg = asv.config.Config.load(asv_conf_json_loc)
            cfg.results_dir = results_dir
            configs.append(cfg)
        pre, post = configs
        return ASVConfigPair(pre, post)
545
+
546
+
547
def results_default_iter(
    results_dir: Path,
    machine: Optional[str],
    commit_hash: Optional[str],
    env_names: Optional[Iterable[str]] = None,
) -> Iterator[Tuple[str, List[Tuple[str, ...]], list, list, list, Optional[str], str, str]]:
    """Iterate over every result key stored for ``machine`` at ``commit_hash``.

    Yields ``(key, params, value, stats, samples, version, machine, env_name)``.
    Nothing is yielded when ``machine`` or ``commit_hash`` is ``None``. When
    ``env_names`` is given, results from other environments are skipped.
    """
    if machine is None or commit_hash is None:
        return

    stored = asv.results.iter_results_for_machine_and_hash(str(results_dir), machine, commit_hash)
    for result in stored:
        if env_names is not None and result.env_name not in env_names:
            continue
        for key in result.get_all_result_keys():
            params = result.get_result_params(key)
            value = result.get_result_value(key, params)
            stats = result.get_result_stats(key, params)
            samples = result.get_result_samples(key, params)
            version = result.benchmark_version.get(key)
            yield (key, params, value, stats, samples, version, result.params["machine"], result.env_name)
574
+
575
+
576
def unroll_result(
    benchmark_name: str,
    params: List[Tuple[str, ...]],
    *values: List[Optional[List[Optional[float]]]],
) -> Iterator[Tuple[str, Optional[float], Optional[list], Optional[list]]]:
    """Expand one benchmark key into one tuple per parameter combination.

    Each yielded tuple is the result name followed by the matching item of
    every sequence in ``values``. The name is ``benchmark_name`` itself for an
    unparameterized benchmark, otherwise ``benchmark_name(p1, p2, ...)``. A
    ``None`` entry in ``values`` stands for "``None`` for every combination".
    """
    combinations = 1
    for axis in params:
        combinations *= len(axis)

    columns = [[None] * combinations if column is None else column for column in values]

    for chosen, row in zip(itertools.product(*params), zip(*columns)):
        label = f"{benchmark_name}({', '.join(chosen)})" if chosen else benchmark_name
        yield (label, *row)  # type: ignore[misc]
600
+
601
+
602
+ # -----------------------------
603
+ # Comparison core
604
+ # -----------------------------
605
+
606
+ BenchmarkKey = Tuple[str, str] # (bench_name, machine/env)
607
+
608
+
609
+ def _isna(value: Optional[float]) -> bool:
610
+ # None (failed) or NaN (skipped)
611
+ return value is None or value != value
612
+
613
+
614
def pct_change(pre: float, post: float) -> float:
    """Return the change from ``pre`` to ``post`` as a percentage of ``pre``.

    NaN is returned when ``pre`` is zero or either value is not finite.
    """
    comparable = pre != 0 and math.isfinite(pre) and math.isfinite(post)
    if not comparable:
        return float("nan")
    return ((post - pre) / pre) * 100.0
618
+
619
+
620
class ChangeMark(str, Enum):
    """One-character marker describing how a benchmark changed from pre to post."""

    # Significant changes.
    BETTER = "-"  # post better
    WORSE = "+"  # post worse
    # Failure transitions.
    INTRO_FAIL = "!"
    FIXED_FAIL = "*"
    # No verdict.
    INCOMPARABLE = "x"
    INSIGNIFICANT = "~"
    NONE = " "
628
+
629
+
630
@dataclass
class EnvSettings:
    """Run settings read from the process environment."""

    logs_root: Path
    factor: float
    use_stats: bool

    @staticmethod
    def from_env() -> EnvSettings:
        """Build the settings from the ``T_BENCH_*`` environment variables.

        - logs root: ``T_BENCH_TASK_LOGS_PATH``, else
          ``T_BENCH_CONTAINER_LOGS_PATH``, else ``/logs`` (resolved)
        - factor: ``T_BENCH_FACTOR`` parsed as float, default ``1.0``
        - use_stats: true unless ``T_BENCH_USE_STATS`` is ``0``, ``false`` or ``False``
        """
        env = os.environ
        raw_root = env.get("T_BENCH_TASK_LOGS_PATH") or env.get("T_BENCH_CONTAINER_LOGS_PATH") or "/logs"
        stats_flag = env.get("T_BENCH_USE_STATS", "1")
        return EnvSettings(
            logs_root=Path(raw_root).resolve(),
            factor=float(env.get("T_BENCH_FACTOR", "1.0")),
            use_stats=stats_flag not in ("0", "false", "False"),
        )
648
+
649
+
650
@dataclass
class BenchMeta:
    """Unit and pre/post version strings recorded for one benchmark."""

    unit: Optional[str]
    # Benchmark version on each side of the comparison.
    version_pre: Optional[str]
    version_post: Optional[str]
655
+
656
+
657
@dataclass
class BenchDetail:
    """Full comparison record for one benchmark on one machine/environment."""

    # Verdict.
    change: ChangeMark
    # Measured values and their unit.
    pre_time: Optional[float]
    post_time: Optional[float]
    unit: str
    # Derived comparison numbers.
    ratio: Optional[float]
    delta_pct: float
    significant: bool
    color: str
    # Error estimates.
    pre_err: Optional[float]
    post_err: Optional[float]
    # Raw statistics and samples for each side.
    pre_stats: Optional[dict]
    post_stats: Optional[dict]
    pre_samples: Optional[list]
    post_samples: Optional[list]
    # Benchmark versions (pre, post).
    version_1: Optional[str]
    version_2: Optional[str]
    # "<machine>/<env>" identifier the values belong to.
    machine_env_name: str
676
+
677
+
678
class Comparator:
    """Classify a pre/post benchmark pair as better, worse, failed or unchanged.

    ``factor`` is the minimum ratio by which one value must beat the other;
    ``use_stats`` enables the statistical difference check when ASV's
    ``_stats`` module is available.
    """

    def __init__(self, factor: float = 1.0, use_stats: bool = True) -> None:
        self.factor = factor
        self.use_stats = use_stats

    def _is_result_better(
        self,
        a: float,
        b: float,
        a_ss: Optional[Tuple[Optional[dict], Optional[list]]],
        b_ss: Optional[Tuple[Optional[dict], Optional[list]]],
    ) -> bool:
        """Return True when ``a`` is better (smaller) than ``b`` by at least ``factor``.

        When statistics are enabled and both sides carry stats with a
        ``repeat`` other than 1, a pair that is not statistically different is
        never considered better.
        """
        stats_usable = (
            self.use_stats
            and _ASV_STATS is not None
            and a_ss
            and b_ss
            and a_ss[0]
            and b_ss[0]
            and (a_ss[0].get("repeat", 0) != 1 and b_ss[0].get("repeat", 0) != 1)
        )
        if stats_usable and not _ASV_STATS.is_different(a_ss[1], b_ss[1], a_ss[0], b_ss[0]):
            return False
        return a < b / self.factor

    def classify(
        self,
        key: BenchmarkKey,
        unit: Optional[str],
        pre_val: Optional[float],
        post_val: Optional[float],
        pre_ss: Optional[Tuple[Optional[dict], Optional[list]]],
        post_ss: Optional[Tuple[Optional[dict], Optional[list]]],
        versions: Tuple[Optional[str], Optional[str]],
    ) -> BenchDetail:
        """Compare one benchmark's pre and post results and describe the change.

        Checks run in this order: differing benchmark versions (incomparable),
        a failure introduced or fixed, both sides missing or NaN, significantly
        better, significantly worse, and finally insignificant / no change.
        """
        _, machine_env = key
        version_1, version_2 = versions

        def build(mark: ChangeMark, significant: bool, **extra) -> BenchDetail:
            # Everything except mark/significance/extras is shared by all outcomes.
            return self._mk(
                mark, significant, unit, pre_val, post_val, pre_ss, post_ss, versions, machine_env, **extra
            )

        if version_1 is not None and version_2 is not None and version_1 != version_2:
            return build(ChangeMark.INCOMPARABLE, False, color="lightgrey")

        pre_failed = pre_val is None
        post_failed = post_val is None
        if pre_failed != post_failed:
            # Exactly one side failed: either a new failure or a fixed one.
            return build(ChangeMark.INTRO_FAIL if post_failed else ChangeMark.FIXED_FAIL, True)
        if pre_failed or _isna(pre_val) or _isna(post_val):
            # Both failed, or at least one side was skipped (NaN).
            return build(ChangeMark.NONE, False)

        assert pre_val is not None and post_val is not None
        delta = pct_change(pre_val, post_val)
        try:
            ratio = post_val / pre_val
        except ZeroDivisionError:
            ratio = None

        if self._is_result_better(post_val, pre_val, post_ss, pre_ss):
            return build(ChangeMark.BETTER, True, ratio=ratio, delta_pct_val=delta, color="green")
        if self._is_result_better(pre_val, post_val, pre_ss, post_ss):
            return build(ChangeMark.WORSE, True, ratio=ratio, delta_pct_val=delta, color="red")

        # Not significant. If raw comparison would pass without stats, mark '~'.
        raw_differs = self._is_result_better(post_val, pre_val, None, None) or self._is_result_better(
            pre_val, post_val, None, None
        )
        mark = ChangeMark.INSIGNIFICANT if raw_differs else ChangeMark.NONE
        return build(mark, False, ratio=ratio, delta_pct_val=delta)

    @staticmethod
    def _err(value: Optional[float], ss: Optional[Tuple[Optional[dict], Optional[list]]]) -> Optional[float]:
        """Return the error estimate for ``value``, or None without a value or stats."""
        if value is not None and ss and ss[0]:
            return get_err(value, ss[0])
        return None

    def _mk(
        self,
        mark: ChangeMark,
        significant: bool,
        unit: Optional[str],
        pre_val: Optional[float],
        post_val: Optional[float],
        pre_ss: Optional[Tuple[Optional[dict], Optional[list]]],
        post_ss: Optional[Tuple[Optional[dict], Optional[list]]],
        versions: Tuple[Optional[str], Optional[str]],
        machine_env: str,
        ratio: Optional[float] = None,
        delta_pct_val: float = float("nan"),
        color: str = "default",
    ) -> BenchDetail:
        """Assemble a BenchDetail, deriving errors, stats and samples from the inputs."""
        pre_stats, pre_samples = (pre_ss[0], pre_ss[1]) if pre_ss else (None, None)
        post_stats, post_samples = (post_ss[0], post_ss[1]) if post_ss else (None, None)
        pre_err = self._err(pre_val, pre_ss)
        post_err = self._err(post_val, post_ss)
        return BenchDetail(
            change=mark,
            pre_time=pre_val,
            post_time=post_val,
            unit=unit or "",
            ratio=ratio,
            delta_pct=delta_pct_val,
            significant=significant,
            color=color,
            pre_err=pre_err,
            post_err=post_err,
            pre_stats=pre_stats,
            post_stats=post_stats,
            pre_samples=pre_samples,
            post_samples=post_samples,
            version_1=versions[0],
            version_2=versions[1],
            machine_env_name=machine_env,
        )
885
+
886
+
887
+ # -----------------------------
888
+ # Aggregation
889
+ # -----------------------------
890
+
891
+
892
@dataclass
class RunData:
    """Per-benchmark results of one run, keyed by ``(benchmark name, "machine/env")``."""

    units: Dict[BenchmarkKey, Optional[str]]
    values: Dict[BenchmarkKey, Optional[float]]
    stats: Dict[BenchmarkKey, Tuple[Optional[dict], Optional[list]]]
    versions: Dict[BenchmarkKey, Optional[str]]
    machine_env_names: set[str]

    @staticmethod
    def from_asv(
        results_dir: Path,
        benchmarks: Dict,
        machine: Optional[str],
        sha: Optional[str],
    ) -> RunData:
        """Build a ``RunData`` from the records yielded by ``results_default_iter``.

        Args:
            results_dir: Directory handed to ``results_default_iter``.
            benchmarks: Mapping from benchmark key to a metadata dict; only its
                ``"unit"`` entry is read.
            machine: Passed through to ``results_default_iter``.
            sha: Passed through to ``results_default_iter``.

        Returns:
            A ``RunData`` with one entry per item produced by ``unroll_result``.
        """
        run = RunData({}, {}, {}, {}, set())

        for record in results_default_iter(results_dir, machine, sha):
            key, params, value, sstats, samples, version, machine_name, env_name = record
            machine_env_name = f"{machine_name}/{env_name}"
            run.machine_env_names.add(machine_env_name)

            for name, item_value, item_stats, item_samples in unroll_result(key, params, value, sstats, samples):
                bench_key: BenchmarkKey = (name, machine_env_name)
                # The unit is looked up by the original key, not the unrolled name.
                run.units[bench_key] = benchmarks.get(key, {}).get("unit")
                run.values[bench_key] = item_value
                run.stats[bench_key] = (item_stats, item_samples)
                run.versions[bench_key] = version

        return run
932
+
933
+
934
+ # -----------------------------
935
+ # Agent Advantage Calculation
936
+ # -----------------------------
937
+
938
+
939
@dataclass
class AgentAdvantageResult:
    """Results of agent advantage calculation for a single agent."""

    # Headline advantage; calculate_agent_advantage sets it to the level-4 value.
    agent_advantage: float
    # Advantage aggregated per module group.
    agent_advantage_level1: float
    # Advantage aggregated per (module, class) group.
    agent_advantage_level2: float
    # Advantage aggregated per (module, class, function) group.
    agent_advantage_level3: float
    # Advantage over all valid benchmarks taken as a single group.
    agent_advantage_level4: float
    # Field retained for backward compatibility. Represents the agent's name.
    last_agent: str
    # Number of common benchmarks that passed the validity filters.
    num_valid_benchmarks: int
    # Number of benchmark keys present for nop, oracle and the agent.
    total_benchmarks: int
    # benchmark name -> {"agent/nop", "oracle/nop", "advantage"}
    per_benchmark_speedups: Dict[str, Dict[str, float]]
953
+
954
+
955
def calculate_agent_advantage(
    agents_data: Dict[str, RunData],
    last_agent_name: str,
    factor: float = 1.0,
    use_stats: bool = True,
) -> AgentAdvantageResult:
    """
    Calculate paper-aligned per-task advantage for one agent.

    For every valid benchmark present in nop/oracle/agent:
      - speedup_agent  = nop_median / agent_median
      - speedup_oracle = nop_median / oracle_median
      - advantage      = speedup_agent - speedup_oracle

    A benchmark is valid when all three runs have samples and all three
    sample medians are finite and strictly positive. Non-positive medians are
    skipped because the geometric mean used below is undefined for them and a
    single such value would otherwise turn a whole level into NaN.

    Then aggregate as:
      - level-1: module groups
      - level-2: class groups
      - level-3: function groups
      - level-4: overall
    where each group uses geometric mean speedups and each level is the
    arithmetic mean over its group-level advantages.

    Benchmarks are visited in a fixed (sorted) order so the reported numbers
    and the per-benchmark mapping do not depend on set iteration order.

    Args:
        agents_data: Dict mapping agent key to RunData.
        last_agent_name: Key in ``agents_data`` of the agent to score (the
            name is historical; any agent key is accepted).
        factor: Unused (kept for call compatibility).
        use_stats: Unused (kept for call compatibility).

    Returns:
        AgentAdvantageResult with the per-level advantages and per-benchmark
        speedups. All metrics are 0.0 when no benchmark is valid.

    Raises:
        ValueError: If the nop baseline, the oracle, or ``last_agent_name`` is
            missing from ``agents_data``.
    """
    from collections import defaultdict

    import numpy as np

    # Find baseline agents by their agent_name prefix (keys are agent:model).
    nop_key = _find_agent_key(agents_data, "nop")
    oracle_key = _find_agent_key(agents_data, "oracle")

    if nop_key is None:
        raise ValueError("Missing 'nop' agent data for baseline comparison")
    if oracle_key is None:
        raise ValueError("Missing 'oracle' agent data for comparison")
    if last_agent_name not in agents_data:
        raise ValueError(f"Missing data for last agent: {last_agent_name}")

    _ = (factor, use_stats)

    # Order matters below: (nop, oracle, agent).
    runs = (agents_data[nop_key], agents_data[oracle_key], agents_data[last_agent_name])

    # Only benchmarks measured by all three runs can be compared.
    common_keys = set(runs[0].values) & set(runs[1].values) & set(runs[2].values)
    total_benchmarks = len(common_keys)

    per_benchmark: Dict[str, Dict[str, float]] = {}
    num_valid = 0

    # Sorting by repr gives a stable order without requiring the key elements
    # to be mutually comparable.
    for bk in sorted(common_keys, key=repr):
        samples = [run.stats.get(bk, (None, None))[1] for run in runs]
        if any(not s for s in samples):
            continue

        # Medians are robust to outliers; nanmedian ignores NaN samples.
        medians = [float(np.nanmedian(s)) for s in samples]
        if not all(np.isfinite(m) and m > 0 for m in medians):
            continue
        nop_median, oracle_median, agent_median = medians

        # Speedup ratios (T_nop / T_x); division may still overflow to inf.
        agent_speedup = nop_median / agent_median
        oracle_speedup = nop_median / oracle_median
        if not (np.isfinite(agent_speedup) and np.isfinite(oracle_speedup)):
            continue

        num_valid += 1
        # NOTE: keyed by benchmark name only, so the same benchmark measured in
        # several machine/env combinations keeps the last one in sort order.
        per_benchmark[bk[0]] = {
            "agent/nop": agent_speedup,
            "oracle/nop": oracle_speedup,
            "advantage": agent_speedup - oracle_speedup,
        }

    # If no valid benchmarks were found, return zeroed metrics.
    if num_valid == 0:
        return AgentAdvantageResult(
            agent_advantage=0.0,
            agent_advantage_level1=0.0,
            agent_advantage_level2=0.0,
            agent_advantage_level3=0.0,
            agent_advantage_level4=0.0,
            last_agent=last_agent_name,
            num_valid_benchmarks=0,
            total_benchmarks=total_benchmarks,
            per_benchmark_speedups={},
        )

    # level name -> group key -> (agent speedups, oracle speedups)
    level_names = ("level1", "level2", "level3", "level4")
    groups = {level: defaultdict(lambda: ([], [])) for level in level_names}

    for benchmark_name, speedups in per_benchmark.items():
        module, cls, func, _params = deconstruct_benchmark(benchmark_name)
        memberships = (
            ("level1", (module,)),
            ("level2", (module, cls)),
            ("level3", (module, cls, func)),
            ("level4", ()),
        )
        for level, group_key in memberships:
            agent_values, oracle_values = groups[level][group_key]
            agent_values.append(speedups["agent/nop"])
            oracle_values.append(speedups["oracle/nop"])

    def level_advantage(level: str) -> float:
        """Mean over groups of (gmean agent speedup - gmean oracle speedup)."""
        diffs = [
            float(gmean(agent_values)) - float(gmean(oracle_values))
            for agent_values, oracle_values in groups[level].values()
        ]
        return float(np.mean(diffs)) if diffs else 0.0

    level1_advantage = level_advantage("level1")
    level2_advantage = level_advantage("level2")
    level3_advantage = level_advantage("level3")
    level4_advantage = level_advantage("level4")

    return AgentAdvantageResult(
        # Level 4 (overall) doubles as the headline agent_advantage.
        agent_advantage=level4_advantage,
        agent_advantage_level1=level1_advantage,
        agent_advantage_level2=level2_advantage,
        agent_advantage_level3=level3_advantage,
        agent_advantage_level4=level4_advantage,
        last_agent=last_agent_name,
        num_valid_benchmarks=num_valid,
        total_benchmarks=total_benchmarks,
        per_benchmark_speedups=per_benchmark,
    )
1143
+
1144
+
1145
def calculate_all_agent_advantages(
    agents_data: Dict[str, RunData],
    factor: float = 1.0,
    use_stats: bool = True,
) -> Dict[str, AgentAdvantageResult]:
    """Score every agent except the 'nop' baseline.

    'oracle' is deliberately scored too, so a run containing only 'nop' and
    'oracle' still yields a result for 'oracle'.

    Args:
        agents_data: Mapping from agent key (agent:model) to its RunData.
        factor: Forwarded to ``calculate_agent_advantage``.
        use_stats: Forwarded to ``calculate_agent_advantage``.

    Returns:
        Mapping: agent_key (agent:model) -> AgentAdvantageResult.

    Raises:
        ValueError: If the 'nop' or 'oracle' data is missing, or if nothing
            besides 'nop' is present.
    """
    # Both baselines must be present before any agent can be scored.
    required = (
        ("nop", "Missing 'nop' agent data for baseline comparison"),
        ("oracle", "Missing 'oracle' agent data for comparison"),
    )
    for baseline, message in required:
        if _find_agent_key(agents_data, baseline) is None:
            raise ValueError(message)

    # The agent name is the part of the key before the first ':'.
    scored: Dict[str, AgentAdvantageResult] = {
        key: calculate_agent_advantage(agents_data, key, factor=factor, use_stats=use_stats)
        for key in agents_data
        if key.partition(":")[0] != "nop"
    }

    if scored:
        return scored
    # Edge case: no agents beyond the 'nop' baseline were present
    raise ValueError("No agents found beyond 'nop' baseline")
1177
+
1178
+
1179
+ # -----------------------------
1180
+ # Orchestration
1181
+ # -----------------------------
1182
+
1183
+
1184
class Runner:
    """Load every discovered agent tarball and assemble the advantage/coverage report."""

    def __init__(self, env: EnvSettings) -> None:
        self.env = env

    @staticmethod
    def _is_readable(path) -> bool:
        """Best-effort readability probe; an inconclusive probe counts as readable."""
        try:
            return bool(os.access(path, os.R_OK))
        except (OSError, TypeError, ValueError):
            # The probe only exists to produce a friendlier message; if it
            # cannot run, let the real read below report the problem.
            return True

    @staticmethod
    def _load_coverage(tarball_path) -> Tuple[Tuple[float, int, int], Dict[str, dict]]:
        """Return (mean-coverage tuple, per-file core coverage) for one tarball."""
        # Prefer a pre-extracted sibling directory named like the tarball without both suffixes
        # (e.g., postrun_agent-2-oracle:oracle.tar.gz -> postrun_agent-2-oracle:oracle)
        extracted_dir = tarball_path.with_suffix("").with_suffix("")
        bjson = extracted_dir / "results" / "benchmarks.json"
        if bjson.exists():
            return calculate_mean_coverage(bjson), _collect_core_file_coverage(bjson)
        # Fall back to reading benchmarks.json from inside the tarball
        return (
            calculate_mean_coverage_from_tarball(tarball_path),
            collect_core_file_coverage_from_tarball(tarball_path),
        )

    @staticmethod
    def _select_summary_key(all_advantages: Dict[str, AgentAdvantageResult], last_agent_key: str) -> str:
        """Pick the agent whose result fills the backward-compatible top-level fields."""
        if last_agent_key in all_advantages:
            return last_agent_key
        # The last agent was the 'nop' baseline: use the last scored agent instead.
        scored_keys = list(all_advantages)
        if not scored_keys:
            raise ValueError("No agents available for summary beyond 'nop'")
        return scored_keys[-1]

    @staticmethod
    def _summarize_coverage(
        agents_coverage: Dict[str, Tuple[float, int, int]],
        agent_model_by_name: Dict[str, str],
    ) -> Tuple[Dict[str, dict], float]:
        """Return (per-agent coverage info, highest positive coverage percent or 0.0)."""
        max_coverage = max((cov[0] for cov in agents_coverage.values() if cov[0] > 0), default=0.0)
        coverage_by_agent = {
            _normalize_agent_key(agent_key, agent_model_by_name): {
                "max_coverage_percent": cov_pct,
                "num_core_files_covered": num_covered,
                "total_core_files": total_core,
            }
            for agent_key, (cov_pct, num_covered, total_core) in agents_coverage.items()
        }
        return coverage_by_agent, max_coverage

    @staticmethod
    def _summarize_common_coverage(
        core_cov_by_agent: Dict[str, Dict[str, dict]],
        agent_model_by_name: Dict[str, str],
    ) -> Tuple[Dict[str, dict], float]:
        """Coverage restricted to the files every agent reported.

        Returns (per-agent info, highest positive percent or 0.0); both are
        empty/zero when the agents share no file.
        """
        file_sets = [set(files_map.keys()) for files_map in core_cov_by_agent.values()]
        common_files = set.intersection(*file_sets) if file_sets else set()
        if not common_files:
            return {}, 0.0

        by_agent: Dict[str, dict] = {}
        positive_percents = []
        for agent_key, files_map in core_cov_by_agent.items():
            covered = 0
            statements = 0
            files_hit = 0
            for fp in common_files:
                cov = files_map.get(fp, {"covered": 0, "num_statements": 0, "percent": 0.0})
                covered += int(cov.get("covered", 0) or 0)
                statements += int(cov.get("num_statements", 0) or 0)
                if float(cov.get("percent", 0.0) or 0.0) > 0:
                    files_hit += 1
            # Clamp so the percentage never exceeds 100.
            pct = (min(covered, statements) / statements) * 100.0 if statements > 0 else 0.0
            by_agent[_normalize_agent_key(agent_key, agent_model_by_name)] = {
                "max_coverage_percent": pct,
                "num_core_files_covered": files_hit,
                "total_core_files": len(common_files),
            }
            if pct > 0:
                positive_percents.append(pct)
        return by_agent, max(positive_percents, default=0.0)

    def run(self) -> Dict[str, dict]:
        """Run agent advantage calculation on discovered agent tarballs.

        Returns:
            The report dict, or ``{"error": message}`` when tarballs are
            missing/unreadable/unloadable or the calculation raises ValueError.
        """
        agent_tarballs = discover_agent_tarballs(self.env.logs_root)

        if not agent_tarballs:
            return {"error": "No agent tarballs found in the logs directory"}
        if len(agent_tarballs) < 2:
            return {"error": "Need at least 2 agents (nop and oracle) for comparison"}

        # Last agent is the one with highest agent_number
        last_agent = agent_tarballs[-1]
        agent_model_by_name: Dict[str, str] = {tb.agent_name: (tb.model_name or tb.agent_name) for tb in agent_tarballs}

        # All three maps use the compound key agent_name:model_name.
        agents_data: Dict[str, RunData] = {}
        agents_coverage: Dict[str, Tuple[float, int, int]] = {}
        core_cov_by_agent: Dict[str, Dict[str, dict]] = {}

        for agent_tb in agent_tarballs:
            # Friendly permissions check before attempting to read tarball
            if not self._is_readable(agent_tb.path):
                return {
                    "error": (
                        f"Tarball not readable (permission denied): {agent_tb.path}. "
                        "Fix with 'sudo chmod 644 <file>' or rerun so the container writes world-readable artifacts (umask 022)."
                    )
                }

            agent_key = _agent_key(agent_tb.agent_name, agent_tb.model_name)
            run_data = get_tarball_contents(agent_tb.path)
            if run_data is None:
                return {"error": f"Failed to load data for agent {agent_key}"}
            agents_data[agent_key] = run_data

            cov_tuple, core_cov = self._load_coverage(agent_tb.path)
            core_cov_by_agent[agent_key] = core_cov
            agents_coverage[agent_key] = cov_tuple

        try:
            # Per-agent advantages (excluding only 'nop')
            all_advantages = calculate_all_agent_advantages(
                agents_data,
                factor=self.env.factor,
                use_stats=self.env.use_stats,
            )

            last_agent_key = _agent_key(last_agent.agent_name, last_agent.model_name)
            result = all_advantages[self._select_summary_key(all_advantages, last_agent_key)]

            coverage_by_agent, max_coverage = self._summarize_coverage(agents_coverage, agent_model_by_name)
            coverage_by_agent_common, max_coverage_common = self._summarize_common_coverage(
                core_cov_by_agent, agent_model_by_name
            )

            # Assemble detailed per-agent advantages for logging
            advantages_by_agent: Dict[str, dict] = {}
            per_benchmark_speedups_by_agent: Dict[str, Dict[str, Dict[str, float]]] = {}
            for agent_key, adv in all_advantages.items():
                agent_id = _normalize_agent_key(agent_key, agent_model_by_name)
                advantages_by_agent[agent_id] = {
                    "agent_advantage": adv.agent_advantage,
                    "agent_advantage_level1": adv.agent_advantage_level1,
                    "agent_advantage_level2": adv.agent_advantage_level2,
                    "agent_advantage_level3": adv.agent_advantage_level3,
                    "agent_advantage_level4": adv.agent_advantage_level4,
                    "num_valid_benchmarks": adv.num_valid_benchmarks,
                    "total_benchmarks": adv.total_benchmarks,
                }
                per_benchmark_speedups_by_agent[agent_id] = adv.per_benchmark_speedups

            return {
                # Backward-compatible fields (summarize using the last available agent
                # which can be 'oracle' as well; only 'nop' is excluded)
                "agent_advantage": result.agent_advantage,
                "agent_advantage_level1": result.agent_advantage_level1,
                "agent_advantage_level2": result.agent_advantage_level2,
                "agent_advantage_level3": result.agent_advantage_level3,
                "agent_advantage_level4": result.agent_advantage_level4,
                "last_agent": _normalize_agent_key(result.last_agent, agent_model_by_name),
                "num_valid_benchmarks": result.num_valid_benchmarks,
                "total_benchmarks": result.total_benchmarks,
                "per_benchmark_speedups": result.per_benchmark_speedups,
                # Per-agent fields
                "agent_advantages_by_agent": advantages_by_agent,
                "per_benchmark_speedups_by_agent": per_benchmark_speedups_by_agent,
                # Coverage (max across agents)
                "max_coverage_percent": max_coverage,
                "coverage_by_agent": coverage_by_agent,
                "max_coverage_percent_common": max_coverage_common,
                "coverage_by_agent_common": coverage_by_agent_common,
            }
        except ValueError as e:
            return {"error": str(e)}
1364
+
1365
+
1366
def main():
    """Run the report and print it, plus a snapshot summary, between stdout markers.

    The result dict is printed as JSON between ``FORMULACODE_RESULT_START`` and
    ``FORMULACODE_RESULT_END``. The snapshot block is always printed as well:
    it holds the contents of ``/logs/summary_<T_BENCH_AGENT_MODEL_NAME>.json``
    when that file can be read and parsed, otherwise a placeholder with
    ``failed`` set to 1.

    Returns:
        The dict produced by ``Runner.run``.
    """
    out = Runner(EnvSettings.from_env()).run()
    print("FORMULACODE_RESULT_START")
    print(json.dumps(out, sort_keys=True))
    print("FORMULACODE_RESULT_END")

    # Output snapshot results - always print markers
    snapshot_data = None
    agent_model_name = os.environ.get("T_BENCH_AGENT_MODEL_NAME", "")
    if agent_model_name:
        snapshot_path = Path(f"/logs/summary_{agent_model_name}.json")
        if snapshot_path.exists():
            try:
                snapshot_data = json.loads(snapshot_path.read_text(encoding="utf-8"))
            except Exception:
                # Unreadable or malformed snapshot: fall through to the placeholder.
                snapshot_data = None

    if snapshot_data is None:
        # Placeholder reported when no snapshot summary could be loaded.
        snapshot_data = {
            "total": 0,
            "passed": 0,
            "failed": 1,
            "skipped": 0,
            "timestamp": "",
            "snapshot_dir": "",
            "benchmark_dir": "",
        }

    print("FORMULACODE_SNAPSHOT_START")
    print(json.dumps(snapshot_data, sort_keys=True))
    print("FORMULACODE_SNAPSHOT_END")

    return out
1403
+
1404
+
1405
# Runs unconditionally when this file is executed or imported (there is no
# __main__ guard); the report dict stays available as the module-level `out`.
out = main()