zeno-mobile-runner 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +497 -0
- package/CONTRIBUTING.md +42 -0
- package/FEATURES.md +111 -0
- package/LICENSE +21 -0
- package/README.md +176 -0
- package/SECURITY.md +34 -0
- package/build.zig +38 -0
- package/build.zig.zon +7 -0
- package/clients/README.md +149 -0
- package/clients/go/README.md +24 -0
- package/clients/go/examples/fake-session/main.go +93 -0
- package/clients/go/go.mod +3 -0
- package/clients/go/zmr/client.go +432 -0
- package/clients/kotlin/README.md +35 -0
- package/clients/kotlin/build.gradle.kts +35 -0
- package/clients/kotlin/settings.gradle.kts +15 -0
- package/clients/kotlin/src/main/kotlin/dev/zmr/FakeSession.kt +86 -0
- package/clients/kotlin/src/main/kotlin/dev/zmr/ZmrClient.kt +67 -0
- package/clients/python/README.md +29 -0
- package/clients/python/examples/fake_session.py +48 -0
- package/clients/python/pyproject.toml +13 -0
- package/clients/python/zmr_client.py +202 -0
- package/clients/rust/Cargo.lock +107 -0
- package/clients/rust/Cargo.toml +10 -0
- package/clients/rust/README.md +19 -0
- package/clients/rust/examples/fake_session.rs +70 -0
- package/clients/rust/src/lib.rs +461 -0
- package/clients/swift/Package.swift +16 -0
- package/clients/swift/README.md +36 -0
- package/clients/swift/Sources/ZMRClient/ZMRClient.swift +114 -0
- package/clients/swift/Sources/ZMRFakeSession/main.swift +86 -0
- package/clients/typescript/README.md +34 -0
- package/clients/typescript/examples/fake-session.mjs +36 -0
- package/clients/typescript/index.d.ts +144 -0
- package/clients/typescript/index.mjs +192 -0
- package/clients/typescript/package.json +8 -0
- package/docs/adr/0001-agent-native-runner-boundary.md +31 -0
- package/docs/adr/0002-app-local-zmr-contract.md +39 -0
- package/docs/adr/0003-ios-simulator-xctest-shim.md +41 -0
- package/docs/adr/0004-benchmark-claims-and-baseline-collection.md +37 -0
- package/docs/adr/README.md +12 -0
- package/docs/ai-agents.md +154 -0
- package/docs/app-integration.md +330 -0
- package/docs/benchmarking.md +273 -0
- package/docs/client-installation.md +133 -0
- package/docs/clients.md +98 -0
- package/docs/config.md +175 -0
- package/docs/demo.md +259 -0
- package/docs/frameworks.md +72 -0
- package/docs/install.md +95 -0
- package/docs/npm.md +356 -0
- package/docs/protocol-fixtures/README.md +8 -0
- package/docs/protocol-fixtures/core-session.requests.jsonl +8 -0
- package/docs/protocol-fixtures/core-session.responses.jsonl +8 -0
- package/docs/protocol-versioning.md +65 -0
- package/docs/protocol.md +560 -0
- package/docs/scenario-authoring.md +88 -0
- package/docs/trace-privacy.md +88 -0
- package/docs/troubleshooting.md +256 -0
- package/examples/android-app-auth-probe.json +89 -0
- package/examples/android-app-error-state.json +13 -0
- package/examples/android-app-login-smoke.json +192 -0
- package/examples/android-app-onboarding.json +12 -0
- package/examples/android-app-referral-deep-link.json +12 -0
- package/examples/android-shim-smoke.json +19 -0
- package/examples/demo-failure.json +12 -0
- package/examples/demo-fake.json +14 -0
- package/examples/ios-dev-client-open-link.json +26 -0
- package/examples/ios-dev-client-route-snapshot.json +24 -0
- package/examples/ios-shim-smoke.json +23 -0
- package/examples/ios-smoke.json +9 -0
- package/go.work +3 -0
- package/npm/agents.mjs +183 -0
- package/npm/app-config.mjs +95 -0
- package/npm/build-zmr.mjs +21 -0
- package/npm/commands.mjs +104 -0
- package/npm/generated-files.mjs +50 -0
- package/npm/index.mjs +75 -0
- package/npm/init-app.mjs +80 -0
- package/npm/package-scripts.mjs +72 -0
- package/npm/postinstall.mjs +21 -0
- package/npm/scaffold.mjs +179 -0
- package/npm/scenarios.mjs +93 -0
- package/npm/setup.mjs +69 -0
- package/npm/wizard.mjs +117 -0
- package/npm/zmr.mjs +23 -0
- package/package.json +118 -0
- package/schemas/README.md +26 -0
- package/schemas/action-result.schema.json +27 -0
- package/schemas/capabilities-output.schema.json +98 -0
- package/schemas/devices-output.schema.json +25 -0
- package/schemas/doctor-output.schema.json +51 -0
- package/schemas/explain-output.schema.json +51 -0
- package/schemas/import-output.schema.json +23 -0
- package/schemas/init-output.schema.json +71 -0
- package/schemas/json-rpc.schema.json +55 -0
- package/schemas/release-manifest.schema.json +43 -0
- package/schemas/release-readiness-output.schema.json +127 -0
- package/schemas/run-output.schema.json +43 -0
- package/schemas/scenario.schema.json +128 -0
- package/schemas/schemas-output.schema.json +26 -0
- package/schemas/semantic-snapshot.schema.json +116 -0
- package/schemas/snapshot.schema.json +60 -0
- package/schemas/trace-event.schema.json +14 -0
- package/schemas/trace-manifest.schema.json +59 -0
- package/schemas/validate-output.schema.json +42 -0
- package/schemas/version-output.schema.json +23 -0
- package/schemas/zmr-config.schema.json +75 -0
- package/scripts/android-emulator.sh +126 -0
- package/scripts/assert-ios-physical-ready.sh +213 -0
- package/scripts/benchmark-command.sh +307 -0
- package/scripts/benchmark.sh +359 -0
- package/scripts/benchmark_gate.py +117 -0
- package/scripts/benchmark_result_row.py +88 -0
- package/scripts/compare-benchmarks.py +288 -0
- package/scripts/create-android-demo-app.sh +342 -0
- package/scripts/create-ios-demo-app.sh +261 -0
- package/scripts/demo-android-real.sh +232 -0
- package/scripts/demo-ios-real.sh +270 -0
- package/scripts/demo.sh +464 -0
- package/scripts/device-matrix.sh +338 -0
- package/scripts/ensure-ios-shim-target.rb +237 -0
- package/scripts/install-android-shim.sh +281 -0
- package/scripts/install-ios-shim.sh +589 -0
- package/scripts/pilot-gate.sh +560 -0
- package/scripts/release-readiness.py +838 -0
- package/scripts/release-readiness.sh +91 -0
- package/scripts/run-android-pilot.sh +561 -0
- package/scripts/run-ios-pilot.sh +509 -0
- package/shims/android/README.md +21 -0
- package/shims/android/ZMRShimInstrumentedTest.java +152 -0
- package/shims/android/protocol.md +18 -0
- package/shims/ios/README.md +50 -0
- package/shims/ios/ZMRShim.swift +110 -0
- package/shims/ios/ZMRShimUITestCase.swift +518 -0
- package/shims/ios/protocol.md +74 -0
- package/skills/zmr-mobile-testing/SKILL.md +127 -0
- package/src/android.zig +344 -0
- package/src/android_device_info.zig +99 -0
- package/src/android_emulator.zig +154 -0
- package/src/android_screen_recording.zig +112 -0
- package/src/android_shell.zig +112 -0
- package/src/bundle.zig +124 -0
- package/src/bundle_redaction.zig +272 -0
- package/src/bundle_tar.zig +123 -0
- package/src/cli_devices.zig +97 -0
- package/src/cli_doctor.zig +114 -0
- package/src/cli_import.zig +70 -0
- package/src/cli_info.zig +39 -0
- package/src/cli_init.zig +72 -0
- package/src/cli_output.zig +467 -0
- package/src/cli_run.zig +259 -0
- package/src/cli_serve.zig +287 -0
- package/src/cli_trace.zig +111 -0
- package/src/cli_validate.zig +41 -0
- package/src/command.zig +211 -0
- package/src/config.zig +305 -0
- package/src/config_diagnostics.zig +212 -0
- package/src/config_paths.zig +49 -0
- package/src/device_registry.zig +37 -0
- package/src/doctor.zig +412 -0
- package/src/doctor_hints.zig +52 -0
- package/src/errors.zig +55 -0
- package/src/fake_device.zig +163 -0
- package/src/health.zig +28 -0
- package/src/importer.zig +343 -0
- package/src/importer_json.zig +100 -0
- package/src/importer_model.zig +103 -0
- package/src/ios.zig +399 -0
- package/src/ios_devices.zig +219 -0
- package/src/ios_lifecycle.zig +72 -0
- package/src/ios_shim.zig +242 -0
- package/src/ios_snapshot.zig +20 -0
- package/src/json_fields.zig +80 -0
- package/src/json_rpc.zig +150 -0
- package/src/json_rpc_methods.zig +318 -0
- package/src/json_rpc_observation.zig +31 -0
- package/src/json_rpc_params.zig +52 -0
- package/src/json_rpc_protocol.zig +110 -0
- package/src/json_rpc_trace.zig +73 -0
- package/src/main.zig +131 -0
- package/src/mcp.zig +234 -0
- package/src/mcp_protocol.zig +64 -0
- package/src/mcp_trace.zig +83 -0
- package/src/report.zig +346 -0
- package/src/report_html.zig +63 -0
- package/src/report_values.zig +27 -0
- package/src/run_options.zig +152 -0
- package/src/runner.zig +280 -0
- package/src/runner_actions.zig +109 -0
- package/src/runner_config.zig +6 -0
- package/src/runner_diagnostics.zig +268 -0
- package/src/runner_events.zig +170 -0
- package/src/runner_native.zig +88 -0
- package/src/runner_waits.zig +300 -0
- package/src/scaffold.zig +472 -0
- package/src/scenario.zig +346 -0
- package/src/scenario_fields.zig +50 -0
- package/src/schema_registry.zig +53 -0
- package/src/selector.zig +84 -0
- package/src/semantic.zig +171 -0
- package/src/trace.zig +315 -0
- package/src/trace_json.zig +340 -0
- package/src/trace_summary.zig +218 -0
- package/src/trace_summary_diagnostic.zig +202 -0
- package/src/types.zig +120 -0
- package/src/uiautomator.zig +164 -0
- package/src/validation.zig +187 -0
- package/src/version.zig +22 -0
- package/viewer/app.js +373 -0
- package/viewer/index.html +126 -0
- package/viewer/parser.js +233 -0
- package/viewer/styles.css +585 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
import math
|
|
5
|
+
import statistics
|
|
6
|
+
import sys
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def parse_args():
    """Parse the command-line options of the benchmark gate.

    Returns:
        argparse.Namespace with ``results`` (required) and the optional
        thresholds ``min_pass_rate``, ``max_failures``, ``max_mean_ms`` and
        ``max_p95_ms``; a threshold that is not supplied stays ``None``.
    """
    cli = argparse.ArgumentParser(description="Gate benchmark results by pass rate and duration thresholds.")
    cli.add_argument("--results", required=True, help="Path to benchmark results.jsonl.")
    # Every threshold is optional and shares the same registration shape.
    thresholds = (
        ("--min-pass-rate", float, "Minimum pass rate percentage, for example 100."),
        ("--max-failures", int, "Maximum allowed failed runs."),
        ("--max-mean-ms", int, "Maximum allowed mean duration in ms."),
        ("--max-p95-ms", int, "Maximum allowed p95 duration in ms."),
    )
    for flag, value_type, help_text in thresholds:
        cli.add_argument(flag, type=value_type, default=None, help=help_text)
    return cli.parse_args()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def is_pass(row):
    """Report whether a benchmark row counts as a passing run.

    A row passes when its ``status`` equals ``"ok"`` and its ``traceStatus``
    is either absent/``None`` or ``"passed"``.
    """
    command_ok = row.get("status") == "ok"
    trace_ok = row.get("traceStatus") in (None, "passed")
    return command_ok and trace_ok
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def p95(durations):
    """Return the 95th-percentile duration using the nearest-rank method.

    An empty input yields ``0``. Otherwise the values are sorted and the
    element at 1-based rank ``ceil(0.95 * n)`` is returned.
    """
    if not durations:
        return 0
    ranked = sorted(durations)
    rank = math.ceil(len(ranked) * 0.95)  # 1-based nearest rank
    return ranked[max(0, rank - 1)]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def read_rows(path):
    """Load benchmark rows from a JSONL file.

    Blank lines are skipped. Every remaining line must decode to a JSON
    object.

    Raises:
        SystemExit: with a ``path:line`` prefixed message when a line is not
            valid JSON or does not decode to an object.
    """

    def decode(line_number, text):
        # Turn one non-blank line into a dict or abort with its location.
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as exc:
            raise SystemExit(f"{path}:{line_number}: invalid json: {exc}") from exc
        if not isinstance(parsed, dict):
            raise SystemExit(f"{path}:{line_number}: expected object row")
        return parsed

    with Path(path).open(encoding="utf-8") as handle:
        numbered = ((number, raw.strip()) for number, raw in enumerate(handle, start=1))
        return [decode(number, text) for number, text in numbered if text]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def summarize(tool, rows):
    """Aggregate the rows of one tool into a summary dict.

    Args:
        tool: label stored under ``"tool"``.
        rows: list of row dicts; ``durationMs`` is read with a default of 0
            and converted with ``int``.

    Returns:
        Dict with ``tool``, ``runs``, ``passed``, ``failures``, ``passRate``
        (percentage, 0.0 for no rows), ``meanMs`` (rounded mean, 0 for no
        rows), ``p95Ms`` and ``failureRows`` (the rows that did not pass).
    """
    total = len(rows)
    durations = [int(row.get("durationMs", 0)) for row in rows]
    failed_rows = [row for row in rows if not is_pass(row)]
    passed = total - len(failed_rows)

    summary = {"tool": tool, "runs": total, "passed": passed, "failures": len(failed_rows)}
    summary["passRate"] = (passed / total * 100.0) if rows else 0.0
    summary["meanMs"] = round(statistics.mean(durations)) if durations else 0
    summary["p95Ms"] = p95(durations)
    summary["failureRows"] = failed_rows
    return summary
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def format_summary(summary):
    """Render a summary dict as a single human-readable line.

    Reads ``tool``, ``runs``, ``passRate`` (shown with two decimals),
    ``failures``, ``meanMs`` and ``p95Ms`` from ``summary``.
    """
    fields = (
        f"{summary['tool']}:",
        f"runs={summary['runs']}",
        f"passRate={summary['passRate']:.2f}%",
        f"failures={summary['failures']}",
        f"meanMs={summary['meanMs']}",
        f"p95Ms={summary['p95Ms']}",
    )
    return " ".join(fields)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def violations(summary, args):
    """List the thresholds in ``args`` that ``summary`` violates.

    A threshold set to ``None`` is not checked. Values exactly at a
    threshold are accepted.

    Returns:
        List of problem strings in the order pass rate, failures, mean, p95;
        empty when every configured gate holds.
    """
    found = []

    floor = args.min_pass_rate
    if floor is not None and summary["passRate"] < floor:
        found.append(f"passRate {summary['passRate']:.2f}% < {floor:.2f}%")

    failure_cap = args.max_failures
    if failure_cap is not None and summary["failures"] > failure_cap:
        found.append(f"failures {summary['failures']} > {failure_cap}")

    mean_cap = args.max_mean_ms
    if mean_cap is not None and summary["meanMs"] > mean_cap:
        found.append(f"meanMs {summary['meanMs']} > {mean_cap}")

    p95_cap = args.max_p95_ms
    if p95_cap is not None and summary["p95Ms"] > p95_cap:
        found.append(f"p95Ms {summary['p95Ms']} > {p95_cap}")

    return found
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main():
    """Run the benchmark gate and return the process exit code.

    Returns:
        2 when the results file holds no rows, 1 when any tool violates a
        configured threshold, otherwise 0.
    """
    args = parse_args()
    rows = read_rows(args.results)
    if not rows:
        print(f"no benchmark rows found: {args.results}", file=sys.stderr)
        return 2

    # Group rows per tool; rows without a tool label land under "unknown".
    grouped = defaultdict(list)
    for row in rows:
        grouped[str(row.get("tool", "unknown"))].append(row)

    exit_code = 0
    for tool in sorted(grouped):
        summary = summarize(tool, grouped[tool])
        print(format_summary(summary))
        for problem in violations(summary, args):
            print(f"gate failed for {tool}: {problem}", file=sys.stderr)
            exit_code = 1

    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_args():
    """Parse the options describing one benchmark run.

    ``--tool``, ``--run``, ``--command-status``, ``--duration-ms`` and
    ``--trace-dir`` are required (the middle three are integers); the
    context flags ``--platform``, ``--device``, ``--app-id``, ``--scenario``
    and ``--app-build`` are optional strings.
    """
    cli = argparse.ArgumentParser(description="Build one benchmark results JSON row.")
    cli.add_argument("--tool", required=True)
    for int_flag in ("--run", "--command-status", "--duration-ms"):
        cli.add_argument(int_flag, required=True, type=int)
    cli.add_argument("--trace-dir", required=True)
    for context_flag in ("--platform", "--device", "--app-id", "--scenario", "--app-build"):
        cli.add_argument(context_flag)
    return cli.parse_args()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def read_zmr_trace(trace_dir):
    """Extract the outcome fields from ``<trace_dir>/events.jsonl``.

    The file is read best-effort: blank lines, lines that are not valid
    JSON, and lines that decode to something other than a JSON object are
    skipped. Only the last ``step.error`` and the last ``scenario.end``
    events are kept.

    Args:
        trace_dir: directory expected to contain ``events.jsonl``.

    Returns:
        Dict with any of ``traceStatus``, ``traceError`` and
        ``failedStepIndex``. Values from ``scenario.end`` win; the last
        ``step.error`` payload (``error`` / ``index``) is the fallback.
        Empty when the events file does not exist or holds no usable data.
    """
    events_path = Path(trace_dir) / "events.jsonl"
    if not events_path.exists():
        return {}

    last_step_error = {}
    last_scenario_end = {}

    with events_path.open(encoding="utf-8") as events:
        for line in events:
            line = line.strip()
            if not line:
                continue
            try:
                event = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, number, string) has no
            # ``kind``/``payload``; skip it like any other unusable line
            # instead of failing on ``.get``.
            if not isinstance(event, dict):
                continue

            payload = event.get("payload")
            if not isinstance(payload, dict):
                payload = {}

            kind = event.get("kind")
            if kind == "step.error":
                last_step_error = payload
            elif kind == "scenario.end":
                last_scenario_end = payload

    trace = {}
    if "status" in last_scenario_end:
        trace["traceStatus"] = last_scenario_end["status"]
    if "error" in last_scenario_end:
        trace["traceError"] = last_scenario_end["error"]
    elif "error" in last_step_error:
        trace["traceError"] = last_step_error["error"]
    if "failedStepIndex" in last_scenario_end:
        trace["failedStepIndex"] = last_scenario_end["failedStepIndex"]
    elif "index" in last_step_error:
        trace["failedStepIndex"] = last_step_error["index"]
    return trace
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
    """Print one compact JSON benchmark row built from the CLI options.

    ``status`` is ``"ok"`` for a zero command status and ``"failed"``
    otherwise. Optional context fields are included only when they have a
    truthy value, and trace details are merged in when the tool is ``zmr``.
    """
    args = parse_args()
    status = "ok" if args.command_status == 0 else "failed"
    row = {
        "tool": args.tool,
        "run": args.run,
        "status": status,
        "durationMs": args.duration_ms,
        "traceDir": args.trace_dir,
    }

    optional_fields = (
        ("platform", args.platform),
        ("device", args.device),
        ("appId", args.app_id),
        ("scenario", args.scenario),
        ("appBuild", args.app_build),
    )
    for key, value in optional_fields:
        if value:
            row[key] = value

    if args.tool == "zmr":
        row.update(read_zmr_trace(args.trace_dir))

    print(json.dumps(row, separators=(",", ":")))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
import math
|
|
5
|
+
import shlex
|
|
6
|
+
import statistics
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
CONTEXT_FIELDS = ("platform", "device", "appId", "scenario", "appBuild")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse_args():
    """Parse and validate the comparison tool's command-line options.

    The four gate options must be non-negative when given, and all four are
    required whenever ``--evidence-out`` is used. Violations are reported
    through ``parser.error``.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="Compare benchmark JSONL rows for two runner labels.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "--evidence-out requires --min-candidate-pass-rate, "
            "--max-candidate-failures, --min-mean-speedup, and "
            "--min-p95-speedup so market-claim evidence includes explicit gates."
        ),
    )
    parser.add_argument("--results", required=True, help="Path to a benchmark results.jsonl file.")
    parser.add_argument("--candidate", default="zmr", help="Candidate tool label. Default: zmr.")
    parser.add_argument("--baseline", required=True, help="Baseline tool label to compare against.")
    parser.add_argument("--format", choices=("markdown", "json"), default="markdown", help="Output format.")
    parser.add_argument("--out", help="Optional output file. Defaults to stdout.")
    parser.add_argument("--min-candidate-pass-rate", type=float, help="Minimum candidate pass rate percentage.")
    parser.add_argument("--max-candidate-failures", type=int, help="Maximum allowed candidate failures.")
    parser.add_argument("--min-mean-speedup", type=float, help="Minimum required mean speedup versus baseline.")
    parser.add_argument("--min-p95-speedup", type=float, help="Minimum required p95 speedup versus baseline.")
    parser.add_argument("--evidence-out", help="Optional JSONL file to append a market-claim readiness evidence row.")
    args = parser.parse_args()

    gate_names = (
        "min_candidate_pass_rate",
        "max_candidate_failures",
        "min_mean_speedup",
        "min_p95_speedup",
    )
    # Map each attribute name to the flag spelling used in error messages.
    gate_flags = {name: "--" + name.replace("_", "-") for name in gate_names}

    for name in gate_names:
        supplied = getattr(args, name)
        if supplied is not None and supplied < 0:
            parser.error(f"{gate_flags[name]} must be non-negative")

    if args.evidence_out:
        absent = [gate_flags[name] for name in gate_names if getattr(args, name) is None]
        if absent:
            parser.error("; ".join(f"{flag} is required with --evidence-out" for flag in absent))
    return args
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def is_pass(row):
    """Tell whether a benchmark row represents a successful run.

    The command ``status`` must be ``"ok"``; in addition ``traceStatus``
    must be missing/``None`` or equal to ``"passed"``.
    """
    if row.get("status") == "ok":
        return row.get("traceStatus") in (None, "passed")
    return False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def p95(durations):
    """Compute the nearest-rank 95th percentile of ``durations``.

    Returns ``0`` for an empty input; otherwise the sorted value at index
    ``ceil(0.95 * n) - 1`` (never below index 0).
    """
    if not durations:
        return 0
    ascending = sorted(durations)
    position = math.ceil(len(ascending) * 0.95) - 1
    if position < 0:
        position = 0
    return ascending[position]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def read_rows(path):
    """Read a JSONL results file into a list of dict rows.

    Lines that are empty after stripping are ignored.

    Raises:
        SystemExit: naming ``path`` and the 1-based line number when a line
            is invalid JSON or is not a JSON object.
    """
    collected = []
    with Path(path).open(encoding="utf-8") as handle:
        for line_number, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    decoded = json.loads(text)
                except json.JSONDecodeError as exc:
                    raise SystemExit(f"{path}:{line_number}: invalid json: {exc}") from exc
                if isinstance(decoded, dict):
                    collected.append(decoded)
                else:
                    raise SystemExit(f"{path}:{line_number}: expected object row")
    return collected
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def summarize(tool, rows):
    """Condense one tool's rows into run counts and timing statistics.

    Args:
        tool: label stored under ``"tool"``.
        rows: list of row dicts; ``durationMs`` defaults to 0 and is
            converted with ``int``.

    Returns:
        Dict with ``tool``, ``runs``, ``passed``, ``failures``, ``passRate``
        (percentage, 0.0 without rows), ``meanMs`` (rounded mean, 0 without
        rows) and ``p95Ms``.
    """
    run_count = len(rows)
    durations = [int(row.get("durationMs", 0)) for row in rows]
    failure_count = len([row for row in rows if not is_pass(row)])
    passed = run_count - failure_count

    summary = {"tool": tool, "runs": run_count, "passed": passed, "failures": failure_count}
    summary["passRate"] = (passed / run_count * 100.0) if rows else 0.0
    summary["meanMs"] = round(statistics.mean(durations)) if durations else 0
    summary["p95Ms"] = p95(durations)
    return summary
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def ratio(baseline_value, candidate_value):
    """Return ``baseline_value / candidate_value`` as a speedup factor.

    Yields ``None`` when either value is zero or negative, since no
    meaningful ratio exists then.
    """
    unusable = baseline_value <= 0 or candidate_value <= 0
    return None if unusable else baseline_value / candidate_value
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def percent_delta(candidate_value, baseline_value):
    """Return the candidate's change relative to the baseline, in percent.

    Negative results mean the candidate value is smaller than the baseline.
    Yields ``None`` when the baseline is zero or negative.
    """
    if baseline_value <= 0:
        return None
    difference = candidate_value - baseline_value
    return (difference / baseline_value) * 100.0
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def comparison(candidate, baseline):
    """Combine two summaries with their speedup and delta metrics.

    Args:
        candidate: summary dict providing ``meanMs`` and ``p95Ms``.
        baseline: summary dict providing ``meanMs`` and ``p95Ms``.

    Returns:
        Dict holding both summaries plus ``meanSpeedup``, ``p95Speedup``,
        ``meanDeltaPct`` and ``p95DeltaPct``.
    """
    base_mean, cand_mean = baseline["meanMs"], candidate["meanMs"]
    base_p95, cand_p95 = baseline["p95Ms"], candidate["p95Ms"]

    result = {"candidate": candidate, "baseline": baseline}
    result["meanSpeedup"] = ratio(base_mean, cand_mean)
    result["p95Speedup"] = ratio(base_p95, cand_p95)
    result["meanDeltaPct"] = percent_delta(cand_mean, base_mean)
    result["p95DeltaPct"] = percent_delta(cand_p95, base_p95)
    return result
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def benchmark_context(candidate_rows, baseline_rows, fields=None):
    """Check that every row was collected in the same benchmark context.

    For each context field, all rows from both tools must carry the same
    non-empty value (compared as stripped strings).

    Args:
        candidate_rows: list of row dicts for the candidate tool.
        baseline_rows: list of row dicts for the baseline tool.
        fields: iterable of field names to check; defaults to the
            module-level ``CONTEXT_FIELDS``.

    Returns:
        Dict with ``sameContext`` (True when no problems were found),
        ``context`` (the agreed value per consistent field) and
        ``contextProblems`` (``"<field> missing"`` or
        ``"<field> mismatch: a, b"`` entries).
    """
    if fields is None:
        fields = CONTEXT_FIELDS
    rows = candidate_rows + baseline_rows
    context = {}
    problems = []
    for field in fields:
        values = []
        for row in rows:
            raw = row.get(field)
            # A JSON null must count as missing; str(None) would otherwise
            # turn it into the concrete-looking value "None".
            values.append("" if raw is None else str(raw).strip())
        concrete = [value for value in values if value]
        unique = sorted(set(concrete))
        if not values or len(concrete) != len(values):
            # No rows at all is also "missing": there is nothing to agree on.
            problems.append(f"{field} missing")
        elif len(unique) != 1:
            problems.append(f"{field} mismatch: {', '.join(unique)}")
        else:
            context[field] = unique[0]
    return {
        "sameContext": not problems,
        "context": context,
        "contextProblems": problems,
    }
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def format_ratio(value):
    """Format a speedup factor as ``"1.23x"``, or ``"n/a"`` for ``None``."""
    if value is None:
        return "n/a"
    return f"{value:.2f}x"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def format_pct(value):
    """Format a signed percentage with one decimal, or ``"n/a"`` for ``None``."""
    if value is None:
        return "n/a"
    return f"{value:+.1f}%"
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def markdown_report(data):
    """Render the comparison as a Markdown document.

    Args:
        data: comparison dict with ``candidate`` and ``baseline`` summaries,
            the speedup/delta metrics and, optionally, ``sameContext``.

    Returns:
        Markdown text ending in a newline: a title, a two-row results
        table, the speedup bullets and an interpretation note.
    """

    def table_row(summary):
        # One table line per tool summary.
        return (
            f"| {summary['tool']} | {summary['runs']} | {summary['passRate']:.2f}% "
            f"| {summary['failures']} | {summary['meanMs']} | {summary['p95Ms']} |"
        )

    same_context = "yes" if data.get("sameContext") else "no"
    mean_speedup = format_ratio(data["meanSpeedup"])
    mean_delta = format_pct(data["meanDeltaPct"])
    p95_speedup = format_ratio(data["p95Speedup"])
    p95_delta = format_pct(data["p95DeltaPct"])

    report = [
        "# Benchmark Comparison",
        "",
        "| Tool | Runs | Pass rate | Failures | Mean ms | P95 ms |",
        "| --- | ---: | ---: | ---: | ---: | ---: |",
        table_row(data["candidate"]),
        table_row(data["baseline"]),
        "",
        f"- Mean speedup: {mean_speedup} ({mean_delta} candidate vs baseline)",
        f"- P95 speedup: {p95_speedup} ({p95_delta} candidate vs baseline)",
        f"- Same benchmark context: {same_context}",
        "",
        "Interpretation: negative deltas mean the candidate was faster for that metric. Compare only runs collected on the same host, device state, app build, and scenario.",
    ]
    return "\n".join(report) + "\n"
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def gate_failures(data, args):
    """Collect every gate that the comparison fails.

    When ``args.evidence_out`` is set, both tools need at least 20 runs and
    the rows must share one benchmark context. The pass-rate, failure-count
    and speedup gates apply whenever their option is not ``None``; a
    speedup of ``None`` fails a configured speedup gate.

    Returns:
        List of failure messages; empty when all gates pass.
    """
    candidate, baseline = data["candidate"], data["baseline"]
    evidence_mode = bool(args.evidence_out)
    found = []

    if evidence_mode:
        for label, summary in (("candidateRuns", candidate), ("baselineRuns", baseline)):
            if summary["runs"] < 20:
                found.append(f"{label} {summary['runs']} below minimum 20")

    min_rate = args.min_candidate_pass_rate
    if min_rate is not None and candidate["passRate"] < min_rate:
        found.append(
            f"candidate passRate {candidate['passRate']:.2f}% below minimum {min_rate:.2f}%"
        )

    max_failed = args.max_candidate_failures
    if max_failed is not None and candidate["failures"] > max_failed:
        found.append(f"candidate failures={candidate['failures']} above maximum {max_failed}")

    speedup_gates = (
        ("meanSpeedup", args.min_mean_speedup),
        ("p95Speedup", args.min_p95_speedup),
    )
    for key, minimum in speedup_gates:
        if minimum is None:
            continue
        speedup = data[key]
        if speedup is None or speedup < minimum:
            shown = "n/a" if speedup is None else f"{speedup:.2f}x"
            found.append(f"{key} {shown} below minimum {minimum:.2f}x")

    if evidence_mode and not data.get("sameContext"):
        details = "; ".join(data.get("contextProblems", [])) or "missing context"
        found.append(f"same benchmark context evidence required ({details})")
    return found
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def write_evidence(args, data, failures, duration_ms):
    """Append one evidence row describing this comparison run.

    Does nothing unless ``args.evidence_out`` is set. Otherwise the parent
    directory is created if needed and a compact, key-sorted JSON line is
    appended to the file.

    Args:
        args: parsed options (gate thresholds, labels, results path).
        data: comparison dict including ``sameContext`` and ``context``.
        failures: gate failure messages; non-empty marks the row ``failed``
            and is joined into ``error``.
        duration_ms: elapsed time recorded as ``durationMs``.
    """
    if not args.evidence_out:
        return

    target = Path(args.evidence_out)
    target.parent.mkdir(parents=True, exist_ok=True)

    candidate, baseline = data["candidate"], data["baseline"]
    # Record the exact invocation, shell-quoted so it can be replayed.
    invocation = " ".join(shlex.quote(part) for part in sys.argv)
    row = {
        "name": "competitive benchmark comparison",
        "status": "failed" if failures else "passed",
        "durationMs": duration_ms,
        "command": invocation,
        "candidate": args.candidate,
        "baseline": args.baseline,
        "results": args.results,
        "minCandidatePassRate": args.min_candidate_pass_rate,
        "maxCandidateFailures": args.max_candidate_failures,
        "minMeanSpeedup": args.min_mean_speedup,
        "minP95Speedup": args.min_p95_speedup,
        "candidateRuns": candidate["runs"],
        "baselineRuns": baseline["runs"],
        "candidatePassRate": candidate["passRate"],
        "candidateFailures": candidate["failures"],
        "meanSpeedup": data["meanSpeedup"],
        "p95Speedup": data["p95Speedup"],
        "sameContext": data["sameContext"],
        "context": data["context"],
    }
    if failures:
        row["error"] = "; ".join(failures)

    encoded = json.dumps(row, sort_keys=True, separators=(",", ":"))
    with target.open("a", encoding="utf-8") as handle:
        handle.write(encoded + "\n")
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def main():
    """Compare candidate and baseline rows and return the exit code.

    The report is written to ``--out`` or stdout before the gates are
    evaluated; an evidence row is appended when requested.

    Returns:
        2 when either tool has no rows in the results file, 1 when any gate
        fails, otherwise 0.
    """
    started = time.monotonic()
    args = parse_args()

    # Group rows per tool; rows without a tool label land under "unknown".
    by_tool = defaultdict(list)
    for row in read_rows(args.results):
        by_tool[str(row.get("tool", "unknown"))].append(row)

    missing = [tool for tool in (args.candidate, args.baseline) if tool not in by_tool]
    if missing:
        print(f"missing benchmark rows for: {', '.join(missing)}", file=sys.stderr)
        return 2

    candidate_rows = by_tool[args.candidate]
    baseline_rows = by_tool[args.baseline]
    data = comparison(
        summarize(args.candidate, candidate_rows),
        summarize(args.baseline, baseline_rows),
    )
    data.update(benchmark_context(candidate_rows, baseline_rows))

    if args.format == "json":
        output = json.dumps(data, sort_keys=True) + "\n"
    else:
        output = markdown_report(data)

    if args.out:
        Path(args.out).write_text(output, encoding="utf-8")
    else:
        sys.stdout.write(output)

    failures = gate_failures(data, args)
    duration_ms = round((time.monotonic() - started) * 1000)
    write_evidence(args, data, failures, duration_ms)

    for failure in failures:
        print(f"benchmark comparison gate failed: {failure}", file=sys.stderr)
    return 1 if failures else 0


if __name__ == "__main__":
    raise SystemExit(main())
|