@moleculeagora/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/index.js +30368 -0
- package/dist/index.js.map +1 -0
- package/dist/python-v1/agora_runtime.py +282 -0
- package/dist/python-v1/answer-set-metric.py +264 -0
- package/dist/python-v1/assertion-set-evaluation.py +879 -0
- package/dist/python-v1/exact-match.py +60 -0
- package/dist/python-v1/l4-composition.py +435 -0
- package/dist/python-v1/multi-output-tabular-metric.py +392 -0
- package/dist/python-v1/panel-ranking-metric.py +622 -0
- package/dist/python-v1/project-test.py +256 -0
- package/dist/python-v1/protein-binder-assay-metric.py +600 -0
- package/dist/python-v1/public-tool-metric.py +161 -0
- package/dist/python-v1/ranking-metric.py +426 -0
- package/dist/python-v1/reference-artifact-assertion.py +532 -0
- package/dist/python-v1/rubric-validation.py +246 -0
- package/dist/python-v1/solver-python-stdio-test.py +160 -0
- package/dist/python-v1/statistical-endpoint-test-v2.py +629 -0
- package/dist/python-v1/statistical-endpoint-test.py +442 -0
- package/dist/python-v1/table-metric.py +1291 -0
- package/dist/release-metadata.json +7 -0
- package/package.json +67 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
from agora_runtime import (
|
|
2
|
+
fail_runtime,
|
|
3
|
+
load_json_file,
|
|
4
|
+
load_runtime_context,
|
|
5
|
+
reject_submission,
|
|
6
|
+
resolve_evaluation_artifact,
|
|
7
|
+
resolve_scoring_asset,
|
|
8
|
+
resolve_submission_artifact,
|
|
9
|
+
write_score,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def require_string(value, label):
    """Return ``value`` with surrounding whitespace removed.

    ``label`` names the field in the error message. When ``value`` is not a
    string, or is blank once stripped, the problem is reported through
    ``fail_runtime`` (imported from ``agora_runtime``; presumably it does not
    return -- confirm).
    """
    is_text = isinstance(value, str)
    if not (is_text and value.strip()):
        fail_runtime(f"{label} must be a non-empty string.")
    return value.strip()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def require_string_list(value, label):
    """Return a list of stripped, non-empty strings taken from ``value``.

    ``None`` is treated as "not provided" and yields an empty list. Anything
    that is not a list, or any element that is not a non-blank string, is
    reported through ``fail_runtime`` using ``label`` (plus the element index)
    in the message.
    """
    if value is None:
        return []
    if not isinstance(value, list):
        fail_runtime(f"{label} must be an array of strings.")
    cleaned = []
    for position, entry in enumerate(value):
        usable = isinstance(entry, str) and entry.strip()
        if not usable:
            fail_runtime(f"{label}[{position}] must be a non-empty string.")
        cleaned.append(entry.strip())
    return cleaned
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def require_number(value, label):
    """Return ``value`` as a finite ``float``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.
    Non-finite values (NaN, +/-Infinity, which Python's JSON parser accepts)
    and integers too large to convert to ``float`` are rejected as well, so
    they cannot poison later arithmetic such as weight sums. Problems are
    reported through ``fail_runtime`` using ``label`` in the message.
    """
    # Function-scope import: this script has no standard-library import block.
    import math

    if isinstance(value, bool) or not isinstance(value, (int, float)):
        fail_runtime(f"{label} must be a number.")
    try:
        number = float(value)
    except OverflowError:
        # An integer beyond the float range cannot be represented.
        fail_runtime(f"{label} must be a finite number.")
    if not math.isfinite(number):
        fail_runtime(f"{label} must be a finite number.")
    return number
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def require_positive_number(value, label):
    """Return ``value`` as a float after checking it is greater than zero.

    Type validation and conversion are delegated to ``require_number``; a
    result that is zero or negative is reported through ``fail_runtime``.
    """
    parsed = require_number(value, label)
    non_positive = parsed <= 0
    if non_positive:
        fail_runtime(f"{label} must be positive.")
    return parsed
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def require_positive_int(value, label):
    """Return ``value`` unchanged once it is known to be an integer above zero.

    Booleans are not accepted as integers. Both the type failure and the
    range failure are reported through ``fail_runtime`` with the same message.
    """
    message = f"{label} must be a positive integer."
    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_plain_int:
        fail_runtime(message)
    if value <= 0:
        fail_runtime(message)
    return value
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def require_object(value, label):
    """Return ``value`` when it is a ``dict`` (a decoded JSON object).

    Any other type is reported through ``fail_runtime`` using ``label``.
    """
    if isinstance(value, dict):
        return value
    fail_runtime(f"{label} must be a JSON object.")
    return value
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def normalize_check(value, index):
    """Validate one rubric check and return its normalized dict form.

    Every check carries ``id``, ``weight`` and ``rule``. The remaining keys
    depend on the rule:

    * ``required_path``: ``path``
    * ``string_min_length`` / ``array_min_length``: ``path`` and ``min``
    * ``forbidden_terms_absent``: non-empty ``paths`` and ``terms``
    * ``enum``: ``path`` and a non-empty ``allowed`` list

    ``index`` is the check's position in ``rubric.checks`` and is used only to
    build error labels. Invalid input and unknown rules are reported through
    ``fail_runtime``.
    """
    prefix = f"rubric.checks[{index}]"
    check = require_object(value, prefix)

    def text_field(key):
        # Required non-empty string stored under ``key``.
        return require_string(check.get(key), f"{prefix}.{key}")

    def list_field(key):
        # List of non-empty strings stored under ``key`` (missing -> []).
        return require_string_list(check.get(key), f"{prefix}.{key}")

    def min_field():
        return require_positive_int(check.get("min"), f"{prefix}.min")

    # The rule is validated first so its error takes precedence over id/weight.
    rule = text_field("rule")
    normalized = {
        "id": text_field("id"),
        "weight": require_positive_number(check.get("weight"), f"{prefix}.weight"),
        "rule": rule,
    }

    if rule == "required_path":
        normalized["path"] = text_field("path")
    elif rule in ("string_min_length", "array_min_length"):
        normalized["path"] = text_field("path")
        normalized["min"] = min_field()
    elif rule == "forbidden_terms_absent":
        normalized["paths"] = list_field("paths")
        normalized["terms"] = list_field("terms")
        if not (normalized["paths"] and normalized["terms"]):
            fail_runtime(f"{prefix} must declare non-empty paths and terms.")
    elif rule == "enum":
        normalized["path"] = text_field("path")
        normalized["allowed"] = list_field("allowed")
        if not normalized["allowed"]:
            fail_runtime(f"{prefix}.allowed must be non-empty.")
    else:
        fail_runtime(
            f"rubric.checks[{index}].rule={rule} is not supported. Supported rules are required_path, string_min_length, array_min_length, forbidden_terms_absent, enum."
        )
        return None
    return normalized
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def normalize_aggregation(rubric):
    """Return the rubric's aggregation strategy after validating it.

    ``rubric["aggregation"]`` must be an object whose ``strategy`` is the
    string ``weighted_average``, the only strategy handled here. Anything
    else is reported through ``fail_runtime``.
    """
    aggregation = require_object(rubric.get("aggregation"), "rubric.aggregation")
    strategy = require_string(
        aggregation.get("strategy"),
        "rubric.aggregation.strategy",
    )
    if strategy == "weighted_average":
        return strategy
    fail_runtime(
        f"rubric.aggregation.strategy={strategy} is not supported. Supported strategy is weighted_average."
    )
    return strategy
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def normalize_checks(rubric):
    """Return the normalized list of checks declared in ``rubric["checks"]``.

    The value must be a non-empty list. Each entry is validated by
    ``normalize_check``, and only after all entries are normalized are the
    ids checked for uniqueness. Violations are reported through
    ``fail_runtime``.
    """
    raw_checks = rubric.get("checks")
    if not (isinstance(raw_checks, list) and raw_checks):
        fail_runtime("rubric.checks must be a non-empty array.")

    checks = []
    for index, raw_check in enumerate(raw_checks):
        checks.append(normalize_check(raw_check, index))

    known_ids = set()
    for entry in checks:
        identifier = entry["id"]
        if identifier in known_ids:
            fail_runtime(f"rubric.checks id {identifier} is duplicated.")
        known_ids.add(identifier)
    return checks
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def has_required_content(value):
    """Tell whether ``value`` counts as "present" for a required field.

    ``None``, blank strings, and empty lists/dicts do not count. Every other
    value does, including ``0`` and ``False``.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return value.strip() != ""
    if isinstance(value, (list, dict)):
        return len(value) != 0
    return True
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def evaluate_check(check, record):
    """Return ``True`` when ``record`` satisfies the normalized ``check``.

    ``check["path"]`` / ``check["paths"]`` are looked up as top-level keys of
    ``record``. Rule semantics:

    * ``required_path``: key exists and ``has_required_content`` accepts it.
    * ``string_min_length``: value is a string whose stripped length is at
      least ``min``.
    * ``array_min_length``: value is a list with at least ``min`` items.
    * ``forbidden_terms_absent``: no listed term occurs (case-insensitively)
      in any listed string field; non-string fields are skipped.
    * ``enum``: value is a string contained in ``allowed``.

    An unknown rule is reported through ``fail_runtime``.
    """
    rule = check["rule"]

    if rule == "required_path":
        key = check["path"]
        if key not in record:
            return False
        return has_required_content(record.get(key))

    if rule == "string_min_length":
        candidate = record.get(check["path"])
        if not isinstance(candidate, str):
            return False
        return len(candidate.strip()) >= check["min"]

    if rule == "array_min_length":
        candidate = record.get(check["path"])
        if not isinstance(candidate, list):
            return False
        return len(candidate) >= check["min"]

    if rule == "forbidden_terms_absent":
        needles = [term.lower() for term in check["terms"]]
        for field in check["paths"]:
            text = record.get(field)
            if isinstance(text, str):
                haystack = text.lower()
                for needle in needles:
                    if needle in haystack:
                        return False
        return True

    if rule == "enum":
        candidate = record.get(check["path"])
        if not isinstance(candidate, str):
            return False
        return candidate in check["allowed"]

    fail_runtime(f"rubric.checks rule {rule} is not supported.")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def load_json_object(path, label, *, reject_invalid):
    """Load the JSON file at ``path`` and return it, requiring a JSON object.

    ``label`` names the file in error messages. ``reject_invalid`` selects how
    problems are reported: when true, ``reject_submission`` is called first
    (used for submitter-provided files); ``fail_runtime`` is then called in
    either case. Both helpers come from ``agora_runtime`` and presumably do
    not return -- confirm.
    """

    def bail(message):
        # Shared reporting path for load errors and wrong top-level type.
        if reject_invalid:
            reject_submission(message)
        fail_runtime(message)

    try:
        data = load_json_file(path, label=label)
    except RuntimeError as error:
        bail(str(error))
    if not isinstance(data, dict):
        bail(f"{label} must be a JSON object.")
    return data
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def main():
    """Score a JSON submission against a rubric of weighted pass/fail checks.

    Steps:

    1. Load the runtime context and the ``compiled_config`` scoring asset.
    2. Resolve and load the rubric (evaluation artifact) and the submission
       record. A malformed submission is rejected; a malformed rubric or
       config is a runtime failure.
    3. Evaluate every rubric check against the record.
    4. Write the weighted-average score and per-check details through
       ``write_score``.
    """
    runtime_context = load_runtime_context()
    config_path = resolve_scoring_asset(
        runtime_context,
        "compiled_config",
        kind="config",
    )
    # Load through load_json_object so a config that parses as JSON but is not
    # an object is reported via fail_runtime instead of crashing on ``.get``.
    config = load_json_object(config_path, "compiled_config", reject_invalid=False)
    rubric_role = require_string(config.get("evaluation_role"), "compiled_config.evaluation_role")
    submission_role = require_string(config.get("submission_role"), "compiled_config.submission_role")
    final_score_key = require_string(
        runtime_context.get("final_score_key"),
        "runtime_context.final_score_key",
    )
    rubric_path = resolve_evaluation_artifact(runtime_context, rubric_role)
    submission_path = resolve_submission_artifact(runtime_context, submission_role)
    rubric = load_json_object(
        rubric_path,
        f"evaluation artifact {rubric_role}",
        reject_invalid=False,
    )
    record = load_json_object(
        submission_path,
        f"submission artifact {submission_role}",
        reject_invalid=True,
    )
    aggregation_strategy = normalize_aggregation(rubric)
    checks = normalize_checks(rubric)
    total_weight = sum(check["weight"] for check in checks)
    # Defensive: each weight is validated as positive, so this guards only
    # against a future change to that validation.
    if total_weight <= 0:
        fail_runtime("rubric.checks must declare a positive total weight.")
    check_results = []
    earned_weight = 0.0
    for check in checks:
        passed = evaluate_check(check, record)
        rule_score = 1.0 if passed else 0.0
        earned_weight += check["weight"] * rule_score
        check_results.append(
            {
                "id": check["id"],
                "rule": check["rule"],
                "weight": check["weight"],
                "score": rule_score,
                "passed": passed,
            }
        )
    score = earned_weight / total_weight
    passed_checks = sum(1 for result in check_results if result["passed"])
    write_score(
        score=score,
        details={
            final_score_key: score,
            # Same value as "passed_checks"; both keys are emitted.
            "validated_fields": passed_checks,
            "passed_checks": passed_checks,
            "total_checks": len(checks),
            "earned_weight": earned_weight,
            "total_weight": total_weight,
            "aggregation_strategy": aggregation_strategy,
            "check_results": check_results,
        },
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from agora_runtime import (
|
|
7
|
+
fail_runtime,
|
|
8
|
+
load_json_file,
|
|
9
|
+
load_runtime_context,
|
|
10
|
+
resolve_evaluation_artifact,
|
|
11
|
+
resolve_scoring_asset,
|
|
12
|
+
resolve_submission_artifact,
|
|
13
|
+
safe_extract_zip,
|
|
14
|
+
write_score,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def require_string(value, label):
    """Return ``value`` stripped of surrounding whitespace.

    A non-string or blank ``value`` is reported through ``fail_runtime``
    (imported from ``agora_runtime``; presumably it does not return --
    confirm), with ``label`` naming the offending field.
    """
    if isinstance(value, str):
        trimmed = value.strip()
        if trimmed:
            return trimmed
    fail_runtime(f"{label} must be a non-empty string.")
    return value.strip()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def require_int(value, label):
    """Return ``value`` once it is known to be a positive integer.

    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so
    without the extra test a JSON ``true`` would be accepted as ``1`` (for
    ``timeout_ms`` that means a one-millisecond timeout). Invalid values are
    reported through ``fail_runtime`` using ``label`` in the message.
    """
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        fail_runtime(f"{label} must be a positive integer.")
    return value
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def load_json_object(path, label):
    """Load the JSON file at ``path`` and return it, requiring a JSON object.

    A ``RuntimeError`` raised by ``load_json_file`` and a non-object top-level
    value are both reported through ``fail_runtime``; ``label`` names the file
    in those messages.
    """
    try:
        payload = load_json_file(path, label=label)
    except RuntimeError as error:
        fail_runtime(str(error))
    if isinstance(payload, dict):
        return payload
    fail_runtime(f"{label} must be a JSON object.")
    return payload
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def read_required_text(path, label):
    """Return the UTF-8 text content of ``path``.

    ``path`` is a ``pathlib.Path``; ``label`` describes the file in error
    messages. A missing file, any other OS-level read error, and content that
    is not valid UTF-8 are all reported through ``fail_runtime`` rather than
    escaping as an unhandled exception.
    """
    try:
        return path.read_text(encoding="utf-8")
    except FileNotFoundError:
        fail_runtime(f"Missing {label} at {path}.")
    except (OSError, UnicodeDecodeError) as error:
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # listed explicitly to be caught here.
        fail_runtime(f"Unable to read {label}: {error}.")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def normalize_output(text, strip_trailing_whitespace):
    """Return ``text`` prepared for comparison.

    With ``strip_trailing_whitespace`` true, all whitespace at the end of the
    whole string is removed; otherwise ``text`` is returned untouched.
    """
    return text.rstrip() if strip_trailing_whitespace else text
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def run_python_solution(solution_path, stdin_text, timeout_ms, working_dir):
    """Run the submitted script once and return the completed process.

    The script is executed with the current interpreter, ``stdin_text`` on
    standard input and ``working_dir`` as its working directory. stdout and
    stderr are captured as text.

    Returns the ``subprocess.CompletedProcess`` (a non-zero exit code is not
    raised; callers inspect ``returncode``), or ``None`` when the run exceeds
    ``timeout_ms`` milliseconds.
    """
    try:
        return subprocess.run(
            [sys.executable, str(solution_path)],
            input=stdin_text,
            capture_output=True,
            text=True,
            # Decode as UTF-8 regardless of the host locale, matching how the
            # expected output files are read. errors="replace" keeps
            # undecodable bytes from the untrusted solution from raising
            # UnicodeDecodeError in the scorer; such output contains U+FFFD.
            encoding="utf-8",
            errors="replace",
            timeout=timeout_ms / 1000.0,
            cwd=str(working_dir),
            check=False,
        )
    except subprocess.TimeoutExpired:
        return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _resolve_harness_file(working_dir, relative_path, label):
    """Join a manifest-declared path onto ``working_dir``, refusing escapes.

    Absolute paths and ``..`` segments could otherwise point outside the
    extracted harness; such paths are reported through ``fail_runtime``.
    """
    root = working_dir.resolve()
    candidate = (root / relative_path).resolve()
    if root not in candidate.parents:
        fail_runtime(f"{label} must reference a file inside the harness archive.")
    return working_dir / relative_path


def main():
    """Run a submitted Python script against a stdin/stdout test harness.

    The harness (evaluation artifact) is a zip containing
    ``agora-harness.json`` plus the stdin and expected-stdout files it names.
    Each test feeds one stdin file to the solution. A test passes when the
    solution exits with code 0 within the timeout and its stdout equals the
    expected text (optionally ignoring trailing whitespace). The score is the
    fraction of passed tests, written through ``write_score``.
    """
    runtime_context = load_runtime_context()
    config_path = resolve_scoring_asset(
        runtime_context,
        "compiled_config",
        kind="config",
    )
    config = load_json_object(config_path, "compiled_config")
    harness_role = require_string(config.get("evaluation_role"), "compiled_config.evaluation_role")
    solution_role = require_string(config.get("submission_role"), "compiled_config.submission_role")
    final_score_key = require_string(
        runtime_context.get("final_score_key"),
        "runtime_context.final_score_key",
    )
    harness_path = resolve_evaluation_artifact(runtime_context, harness_role)
    solution_path = resolve_submission_artifact(runtime_context, solution_role)
    with tempfile.TemporaryDirectory(prefix="agora-harness-") as working_root:
        working_dir = Path(working_root)
        safe_extract_zip(
            harness_path,
            working_dir,
            label=f"evaluation artifact {harness_role}",
        )
        manifest_path = working_dir / "agora-harness.json"
        harness_manifest = load_json_object(manifest_path, "agora-harness.json")
        version = require_string(harness_manifest.get("version"), "agora-harness.version")
        language = require_string(harness_manifest.get("language"), "agora-harness.language")
        if version != "v1":
            fail_runtime(f"Unsupported harness version {version}.")
        if language != "python":
            fail_runtime(f"Unsupported harness language {language}.")
        timeout_ms = require_int(
            harness_manifest.get("timeout_ms", 30000),
            "agora-harness.timeout_ms",
        )
        strip_trailing_whitespace = bool(
            harness_manifest.get("strip_trailing_whitespace", False)
        )
        tests = harness_manifest.get("tests")
        if not isinstance(tests, list) or not tests:
            fail_runtime("agora-harness.tests must declare at least one test case.")
        passed_tests = 0
        for index, test_case in enumerate(tests):
            if not isinstance(test_case, dict):
                fail_runtime(f"agora-harness.tests[{index}] must be an object.")
            stdin_label = f"agora-harness.tests[{index}].stdin_path"
            expected_label = f"agora-harness.tests[{index}].expected_stdout_path"
            # Manifest paths are confined to the extracted harness directory.
            stdin_path = _resolve_harness_file(
                working_dir,
                require_string(test_case.get("stdin_path"), stdin_label),
                stdin_label,
            )
            expected_stdout_path = _resolve_harness_file(
                working_dir,
                require_string(test_case.get("expected_stdout_path"), expected_label),
                expected_label,
            )
            stdin_text = read_required_text(
                stdin_path,
                f"test stdin {stdin_path.name}",
            )
            expected_stdout = normalize_output(
                read_required_text(
                    expected_stdout_path,
                    f"test expected stdout {expected_stdout_path.name}",
                ),
                strip_trailing_whitespace,
            )
            # NOTE(review): the solution runs with the extracted harness as
            # its working directory, so it can read the expected-output files.
            # Confirm the surrounding sandbox makes this acceptable, or run it
            # in a separate empty directory.
            run = run_python_solution(
                solution_path,
                stdin_text,
                timeout_ms,
                working_dir,
            )
            # A timeout (None) or a non-zero exit counts as a failed test.
            if run is None or run.returncode != 0:
                continue
            actual_stdout = normalize_output(run.stdout, strip_trailing_whitespace)
            if actual_stdout == expected_stdout:
                passed_tests += 1
    total_tests = len(tests)
    score = passed_tests / total_tests
    write_score(
        score=score,
        details={
            final_score_key: score,
            "tests_passed": passed_tests,
            "total_tests": total_tests,
        },
    )


if __name__ == "__main__":
    main()
|