@moleculeagora/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/index.js +30368 -0
- package/dist/index.js.map +1 -0
- package/dist/python-v1/agora_runtime.py +282 -0
- package/dist/python-v1/answer-set-metric.py +264 -0
- package/dist/python-v1/assertion-set-evaluation.py +879 -0
- package/dist/python-v1/exact-match.py +60 -0
- package/dist/python-v1/l4-composition.py +435 -0
- package/dist/python-v1/multi-output-tabular-metric.py +392 -0
- package/dist/python-v1/panel-ranking-metric.py +622 -0
- package/dist/python-v1/project-test.py +256 -0
- package/dist/python-v1/protein-binder-assay-metric.py +600 -0
- package/dist/python-v1/public-tool-metric.py +161 -0
- package/dist/python-v1/ranking-metric.py +426 -0
- package/dist/python-v1/reference-artifact-assertion.py +532 -0
- package/dist/python-v1/rubric-validation.py +246 -0
- package/dist/python-v1/solver-python-stdio-test.py +160 -0
- package/dist/python-v1/statistical-endpoint-test-v2.py +629 -0
- package/dist/python-v1/statistical-endpoint-test.py +442 -0
- package/dist/python-v1/table-metric.py +1291 -0
- package/dist/release-metadata.json +7 -0
- package/package.json +67 -0
|
@@ -0,0 +1,622 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import math
|
|
3
|
+
|
|
4
|
+
from agora_runtime import (
|
|
5
|
+
fail_runtime,
|
|
6
|
+
load_json_file,
|
|
7
|
+
load_runtime_context,
|
|
8
|
+
reject_submission,
|
|
9
|
+
resolve_evaluation_artifact,
|
|
10
|
+
resolve_scoring_asset,
|
|
11
|
+
resolve_submission_artifact,
|
|
12
|
+
write_score,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Metric identifiers this scorer accepts, in the order they are listed in
# configuration error messages.
SUPPORTED_METRICS = ("grouped_ndcg", "top_1_accuracy", "pairwise_accuracy")
# Same identifiers as a set, used for membership checks.
SUPPORTED_METRIC_SET = {*SUPPORTED_METRICS}
# Accepted ways of combining per-panel scores into one final score.
SUPPORTED_AGGREGATIONS = {"weighted_mean", "macro_mean"}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def format_metric_list(metrics):
    """Render metric names as an English "a, b, or c" list.

    Args:
        metrics: Iterable of metric names.

    Returns:
        An empty string for no items, the sole item for one item,
        ``"a or b"`` for two items, and ``"a, b, or c"`` for three or more.
    """
    names = list(metrics)
    if not names:
        # Previously an empty iterable fell through to names[-1] and raised
        # IndexError; an empty list simply has nothing to render.
        return ""
    if len(names) == 1:
        return names[0]
    *leading, last = names
    if len(names) == 2:
        return f"{leading[0]} or {last}"
    return f"{', '.join(leading)}, or {last}"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def require_string(value, label):
    """Return ``value`` stripped of surrounding whitespace.

    Anything that is not a string, or that is empty once stripped, is
    reported through ``fail_runtime`` using ``label`` to name the field.
    """
    if isinstance(value, str):
        trimmed = value.strip()
        if trimmed:
            return trimmed
    fail_runtime(f"{label} must be a non-empty string.")
    return value.strip()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def require_metric(config):
    """Read ``config["metric"]``, lower-case it and check it is supported.

    An unsupported metric is reported through ``fail_runtime`` with the list
    of supported metric names.
    """
    raw_metric = config.get("metric")
    metric = require_string(raw_metric, "compiled_config.metric").lower()
    if metric in SUPPORTED_METRIC_SET:
        return metric
    fail_runtime(
        f"compiled_config.metric must be one of {format_metric_list(SUPPORTED_METRICS)}."
    )
    return metric
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def require_aggregation(config):
    """Read ``config["aggregation"]`` and check it is a supported aggregation.

    The value is compared as written (stripped, but not lower-cased); an
    unsupported value is reported through ``fail_runtime``.
    """
    raw_aggregation = config.get("aggregation")
    aggregation = require_string(raw_aggregation, "compiled_config.aggregation")
    if aggregation in SUPPORTED_AGGREGATIONS:
        return aggregation
    fail_runtime(
        "compiled_config.aggregation must be macro_mean or weighted_mean for panel_ranking_metric."
    )
    return aggregation
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def require_metric_params(config, metric, panel_field, aggregation):
    """Check that ``config["metric_params"]`` agrees with the top-level config.

    ``metric_params`` must be an object whose ``metric`` (compared
    lower-cased), ``panel_field`` and ``aggregation`` entries equal the
    values passed in. Each mismatch is reported through ``fail_runtime``.

    Returns:
        The ``metric_params`` object itself.
    """
    params = config.get("metric_params")
    if not isinstance(params, dict):
        fail_runtime("compiled_config.metric_params must be an object.")

    declared_metric = require_string(
        params.get("metric"), "compiled_config.metric_params.metric"
    ).lower()
    if declared_metric != metric:
        fail_runtime(
            f"compiled_config.metric_params.metric must equal compiled_config.metric {metric!r}."
        )

    declared_panel_field = require_string(
        params.get("panel_field"), "compiled_config.metric_params.panel_field"
    )
    if declared_panel_field != panel_field:
        fail_runtime(
            "compiled_config.metric_params.panel_field must match compiled_config.panel_field."
        )

    declared_aggregation = require_string(
        params.get("aggregation"), "compiled_config.metric_params.aggregation"
    )
    if declared_aggregation != aggregation:
        fail_runtime(
            "compiled_config.metric_params.aggregation must match compiled_config.aggregation."
        )

    return params
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def require_policy(policies, key, allowed):
    """Return ``policies[key]`` after checking it is one of ``allowed``.

    A value outside ``allowed`` is reported through ``fail_runtime``; the
    message lists the allowed values in sorted order.
    """
    policy = require_string(policies.get(key), f"policies.{key}")
    if policy in allowed:
        return policy
    options = ", ".join(sorted(allowed))
    fail_runtime(f"policies.{key} must be one of {options}.")
    return policy
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def find_slot(runtime_context, lane, role):
    """Return the first slot in ``runtime_context["<lane>_slots"]`` with ``role``.

    Entries that are not dicts are skipped. A missing or non-list slot
    collection, or the absence of a matching slot, is reported through
    ``fail_runtime``.
    """
    collection_name = f"{lane}_slots"
    candidates = runtime_context.get(collection_name)
    if not isinstance(candidates, list):
        fail_runtime(f"Runtime context is missing {collection_name}.")

    found = next(
        (
            candidate
            for candidate in candidates
            if isinstance(candidate, dict) and candidate.get("role") == role
        ),
        None,
    )
    if found is not None:
        return found
    fail_runtime(f"Runtime context is missing {lane} slot for role {role}.")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def require_csv_slot_columns(runtime_context, lane, role):
    """Return the ``(record_key, value_field)`` column names for a CSV slot.

    The slot for ``lane``/``role`` must carry a ``validator`` object whose
    ``kind`` is ``"csv_columns"``; otherwise the problem is reported through
    ``fail_runtime``. Both column names must be non-empty strings.
    """
    validator = find_slot(runtime_context, lane, role).get("validator")
    is_csv_validator = (
        isinstance(validator, dict) and validator.get("kind") == "csv_columns"
    )
    if not is_csv_validator:
        fail_runtime(
            f"{lane} role {role} must use validator.kind=csv_columns for panel_ranking_metric."
        )

    label_prefix = f"{lane}.{role}.validator"
    record_key = require_string(
        validator.get("record_key"), f"{label_prefix}.record_key"
    )
    value_field = require_string(
        validator.get("value_field"), f"{label_prefix}.value_field"
    )
    return record_key, value_field
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def read_csv_rows(path, label, *, invalid_handler):
    """Read a headered UTF-8 CSV file into column names and row dicts.

    Args:
        path: Object exposing a pathlib-style ``open`` method.
        label: Human-readable artifact name used in error messages.
        invalid_handler: Callable given an error message whenever the file
            cannot be used. NOTE(review): this function assumes the handler
            does not return normally (it raises or exits) - confirm against
            the runtime helpers passed in by callers.

    Returns:
        A ``(fieldnames, rows)`` tuple. ``fieldnames`` are the header names
        with surrounding whitespace stripped; ``rows`` is a list of dicts
        keyed by those same stripped names.
    """
    try:
        with path.open("r", encoding="utf-8", newline="") as handle:
            reader = csv.DictReader(handle)
            fieldnames = reader.fieldnames
            if not fieldnames:
                invalid_handler(f"{label} must include a CSV header row.")
            normalized_fieldnames = []
            for fieldname in fieldnames:
                if not isinstance(fieldname, str) or not fieldname.strip():
                    invalid_handler(f"{label} contains an empty CSV column name.")
                normalized_fieldnames.append(fieldname.strip())
            # Re-key every row with the stripped header names so lookups by
            # the names returned to the caller succeed even when the header
            # was padded (e.g. "id, score"). Non-string keys (DictReader's
            # restkey for surplus cells) are kept unchanged.
            rows = [
                {
                    (name.strip() if isinstance(name, str) else name): cell
                    for name, cell in row.items()
                }
                for row in reader
            ]
    except FileNotFoundError:
        invalid_handler(f"Missing {label} at {path}.")
    except UnicodeDecodeError as error:
        # UnicodeDecodeError derives from ValueError, not OSError, so it
        # needs its own clause to reach invalid_handler.
        invalid_handler(f"{label} must be UTF-8 encoded text: {error}.")
    except csv.Error as error:
        invalid_handler(f"{label} is not valid CSV: {error}.")
    except OSError as error:
        invalid_handler(f"Unable to read {label}: {error}.")
    return normalized_fieldnames, rows
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def parse_reference_relevance(raw_value, label):
    """Parse a reference relevance cell into a finite, non-negative float.

    Blank, non-numeric, non-finite and negative values are each reported
    through ``fail_runtime`` using ``label`` to identify the cell.
    """
    text = ""
    if isinstance(raw_value, str):
        text = raw_value.strip()
    if text == "":
        fail_runtime(f"{label} is blank.")

    try:
        relevance = float(text)
    except ValueError:
        fail_runtime(f"{label} must be numeric, received {text!r}.")

    if math.isinf(relevance) or math.isnan(relevance):
        fail_runtime(f"{label} must be finite.")
    if relevance < 0:
        fail_runtime(f"{label} must be non-negative for panel_ranking_metric.")
    return relevance
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def parse_submission_score(raw_value, label, invalid_value_policy):
    """Parse a submitted score cell into a finite float.

    Returns:
        The parsed float, or ``None`` when the cell is blank, non-numeric or
        non-finite. When ``invalid_value_policy`` is ``"reject"`` such a cell
        is first reported through ``reject_submission``.
    """
    text = raw_value.strip() if isinstance(raw_value, str) else ""

    score = None
    problem = None
    if not text:
        problem = f"{label} is blank."
    else:
        try:
            score = float(text)
        except ValueError:
            problem = f"{label} must be numeric, received {text!r}."
        else:
            if not math.isfinite(score):
                problem = f"{label} must be finite."

    if problem is None:
        return score
    if invalid_value_policy == "reject":
        reject_submission(problem)
    return None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def load_reference_panel_records(path, role, record_key, value_field, panel_field):
    """Load the reference CSV into a mapping of record id to relevance and panel.

    The file must contain the ``record_key``, ``value_field`` and
    ``panel_field`` columns. Every row needs a non-blank, unique record id, a
    non-blank panel value and a valid relevance. Any violation, or a file
    with no rows, is reported through ``fail_runtime``.

    Returns:
        Dict mapping each stripped record id to
        ``{"relevance": float, "panel": str}``.
    """
    artifact = f"evaluation artifact {role}"
    fieldnames, rows = read_csv_rows(path, artifact, invalid_handler=fail_runtime)

    required_columns = (
        ("record key column", record_key),
        ("value column", value_field),
        ("panel field column", panel_field),
    )
    for description, column in required_columns:
        if column not in fieldnames:
            fail_runtime(f"{artifact} is missing {description} {column}.")

    records = {}
    # Row numbers start at 2 because line 1 of the file is the header.
    for row_number, row in enumerate(rows, 2):
        key_cell = row.get(record_key)
        record_id = key_cell.strip() if isinstance(key_cell, str) else ""
        if not record_id:
            fail_runtime(f"{artifact} row {row_number} is missing {record_key}.")
        if record_id in records:
            fail_runtime(f"{artifact} contains duplicate record id {record_id!r}.")

        panel_cell = row.get(panel_field)
        panel = panel_cell.strip() if isinstance(panel_cell, str) else ""
        if not panel:
            fail_runtime(
                f"{artifact} row {row_number} is missing panel field column {panel_field}."
            )

        relevance = parse_reference_relevance(
            row.get(value_field),
            f"{artifact} row {row_number} column {value_field}",
        )
        records[record_id] = {"relevance": relevance, "panel": panel}

    if not records:
        fail_runtime(f"{artifact} must contain at least one ranked row.")
    return records
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def load_submission_scores(
    path,
    role,
    record_key,
    value_field,
    duplicate_id_policy,
    invalid_value_policy,
):
    """Load the submission CSV into a mapping of record id to score.

    Missing columns are reported through ``reject_submission``. Rows with a
    blank id or an unparseable score are reported when
    ``invalid_value_policy`` is ``"reject"`` and skipped otherwise. A
    repeated id is reported when ``duplicate_id_policy`` is ``"reject"``;
    otherwise the first stored score for that id is kept.

    Returns:
        Dict mapping each stripped record id to its float score.
    """
    artifact = f"submission artifact {role}"
    fieldnames, rows = read_csv_rows(
        path, artifact, invalid_handler=reject_submission
    )
    if record_key not in fieldnames:
        reject_submission(f"{artifact} is missing record key column {record_key}.")
    if value_field not in fieldnames:
        reject_submission(f"{artifact} is missing value column {value_field}.")

    reject_invalid = invalid_value_policy == "reject"
    reject_duplicates = duplicate_id_policy == "reject"

    scores = {}
    # Row numbers start at 2 because line 1 of the file is the header.
    for row_number, row in enumerate(rows, 2):
        key_cell = row.get(record_key)
        record_id = key_cell.strip() if isinstance(key_cell, str) else ""
        if not record_id:
            if reject_invalid:
                reject_submission(
                    f"{artifact} row {row_number} is missing {record_key}."
                )
            continue

        if record_id in scores:
            if reject_duplicates:
                reject_submission(
                    f"{artifact} contains duplicate record id {record_id!r}."
                )
            continue

        score = parse_submission_score(
            row.get(value_field),
            f"{artifact} row {row_number} column {value_field}",
            invalid_value_policy,
        )
        if score is not None:
            scores[record_id] = score
    return scores
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def discount(rank_index):
    """Return the DCG position weight ``1 / log2(rank_index + 2)``.

    ``rank_index`` is zero-based, so the first position has weight 1.0.
    """
    denominator = math.log2(rank_index + 2.0)
    return 1.0 / denominator
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def dcg(labels):
    """Return the discounted cumulative gain of ``labels`` in the given order.

    Each label contributes an exponential gain ``2 ** label - 1`` weighted by
    ``discount`` of its zero-based position.
    """
    return sum(
        (2.0 ** relevance - 1.0) * discount(position)
        for position, relevance in enumerate(labels)
    )
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def rank_submission_ids(submission_by_id):
    """Return record ids ordered from highest to lowest submitted score.

    Ids with equal scores are ordered by ascending record id, so the ranking
    does not depend on dict insertion order.
    """
    ordered_pairs = sorted(
        submission_by_id.items(),
        key=lambda pair: (-pair[1], pair[0]),
    )
    return [record_id for record_id, _score in ordered_pairs]
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def has_comparable_pairs(relevances):
    """Return True when ``relevances`` holds at least two distinct values.

    Iteration stops as soon as a second distinct value is seen.
    """
    distinct = set()
    for value in relevances:
        if value not in distinct:
            distinct.add(value)
            if len(distinct) == 2:
                return True
    return False
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def validate_reference_panel_preconditions(metric, panel_label, reference_relevances):
    """Check that a reference panel can be scored with ``metric``.

    * ``grouped_ndcg`` needs at least two rows and a positive maximum relevance.
    * ``top_1_accuracy`` needs a positive maximum relevance.
    * Any other metric is treated as ``pairwise_accuracy`` and needs at least
      two distinct relevance values.

    Violations are reported through ``fail_runtime``.
    """
    if metric == "grouped_ndcg":
        if len(reference_relevances) < 2:
            fail_runtime(
                f"panel_ranking_metric metric grouped_ndcg requires at least two reference rows in panel {panel_label}."
            )
        if max(reference_relevances) <= 0:
            fail_runtime(
                f"panel_ranking_metric metric grouped_ndcg requires at least one positive reference relevance in panel {panel_label}."
            )
    elif metric == "top_1_accuracy":
        if max(reference_relevances) <= 0:
            fail_runtime(
                f"panel_ranking_metric metric top_1_accuracy requires a positive winning relevance in panel {panel_label}."
            )
    elif not has_comparable_pairs(reference_relevances):
        fail_runtime(
            f"panel_ranking_metric metric pairwise_accuracy requires at least one unequal reference relevance pair in panel {panel_label}."
        )
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def validate_scored_panel_preconditions(metric, panel_label, scored_relevances):
    """Check that the rows a submission actually scored allow ``metric``.

    * ``grouped_ndcg`` needs at least two scored rows and a positive maximum
      relevance among them.
    * ``pairwise_accuracy`` needs at least two distinct relevance values
      among the scored rows.
    * Other metrics have no check here.

    Violations are reported through ``reject_submission``.
    """
    if metric == "grouped_ndcg":
        if len(scored_relevances) < 2:
            reject_submission(
                f"panel_ranking_metric metric grouped_ndcg requires at least two scored rows in panel {panel_label}."
            )
        if max(scored_relevances) <= 0:
            reject_submission(
                f"panel_ranking_metric metric grouped_ndcg requires at least one positive relevance among scored rows in panel {panel_label}."
            )
    elif metric == "pairwise_accuracy":
        if not has_comparable_pairs(scored_relevances):
            reject_submission(
                f"panel_ranking_metric metric pairwise_accuracy requires at least one comparable scored row pair in panel {panel_label}."
            )
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def compute_grouped_ndcg(reference_by_id, submission_by_id, scored_ids):
    """Return the NDCG of one panel over the rows in ``scored_ids``.

    Rows are ranked by submitted score. The ideal ordering is built from the
    same scored rows sorted by descending reference relevance. An ideal DCG
    of zero is reported through ``reject_submission``.
    """
    panel_scores = {record_id: submission_by_id[record_id] for record_id in scored_ids}
    submitted_order = rank_submission_ids(panel_scores)
    achieved_labels = [
        reference_by_id[record_id]["relevance"] for record_id in submitted_order
    ]
    best_possible = dcg(sorted(achieved_labels, reverse=True))
    if best_possible == 0.0:
        reject_submission(
            "panel_ranking_metric metric grouped_ndcg has no positive scored relevance after applying runtime policies."
        )
    return dcg(achieved_labels) / best_possible
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def compute_top_1_accuracy(reference_by_id, submission_by_id, panel_record_ids, scored_ids):
    """Return 1.0 when the top-ranked scored row is a reference winner, else 0.0.

    Winners are all rows in ``panel_record_ids`` sharing the panel's maximum
    reference relevance, whether or not the submission scored them. The
    top-ranked row is chosen among ``scored_ids`` only.
    """
    panel_relevance = {
        record_id: reference_by_id[record_id]["relevance"]
        for record_id in panel_record_ids
    }
    best_relevance = max(panel_relevance.values())
    winners = {
        record_id
        for record_id, relevance in panel_relevance.items()
        if relevance == best_relevance
    }
    panel_scores = {record_id: submission_by_id[record_id] for record_id in scored_ids}
    submitted_order = rank_submission_ids(panel_scores)
    return 1.0 if submitted_order[0] in winners else 0.0
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def compute_pairwise_accuracy(reference_by_id, submission_by_id, scored_ids):
    """Return the share of comparable row pairs the submission orders correctly.

    A pair is comparable when its two reference relevances differ. A
    comparable pair earns 1.0 when the submitted scores order it the same way
    as the reference, 0.5 when the submitted scores tie, and 0.0 otherwise.
    Having no comparable pairs is reported through ``reject_submission``.
    """
    credit = 0.0
    comparable_pairs = 0
    for first_position, first_id in enumerate(scored_ids):
        first_relevance = reference_by_id[first_id]["relevance"]
        for second_id in scored_ids[first_position + 1 :]:
            second_relevance = reference_by_id[second_id]["relevance"]
            if first_relevance == second_relevance:
                # Equal reference relevance carries no ordering information.
                continue

            comparable_pairs += 1
            first_score = submission_by_id[first_id]
            second_score = submission_by_id[second_id]
            if first_score == second_score:
                credit += 0.5
            elif (first_relevance > second_relevance and first_score > second_score) or (
                second_relevance > first_relevance and second_score > first_score
            ):
                credit += 1.0

    if comparable_pairs == 0:
        reject_submission(
            "panel_ranking_metric metric pairwise_accuracy has no comparable scored row pairs after applying runtime policies."
        )
    return credit / comparable_pairs
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def compute_panel_metric(metric, reference_by_id, submission_by_id, panel_record_ids, scored_ids):
    """Compute the selected metric for one panel.

    ``top_1_accuracy`` and ``grouped_ndcg`` go to their dedicated functions;
    any other metric name is computed as pairwise accuracy.
    """
    if metric == "top_1_accuracy":
        return compute_top_1_accuracy(
            reference_by_id, submission_by_id, panel_record_ids, scored_ids
        )
    if metric == "grouped_ndcg":
        return compute_grouped_ndcg(reference_by_id, submission_by_id, scored_ids)
    return compute_pairwise_accuracy(reference_by_id, submission_by_id, scored_ids)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def format_panel_label(panel_field, panel_value):
    """Return ``"<panel_field>=<repr(panel_value)>"`` for use in messages."""
    return "{}={!r}".format(panel_field, panel_value)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def aggregate_panel_scores(panel_scores, aggregation):
    """Combine per-panel scores into one final score.

    Args:
        panel_scores: List of ``{"score": ..., "weight": ...}`` dicts.
        aggregation: ``"macro_mean"`` for an unweighted mean of the scores;
            any other value produces the weight-weighted mean.

    An empty list, or a non-positive total weight in the weighted case, is
    reported through ``reject_submission``.
    """
    if not panel_scores:
        reject_submission(
            "Submission produced no scoreable panels after applying runtime policies."
        )

    if aggregation == "macro_mean":
        scores = [entry["score"] for entry in panel_scores]
        return sum(scores) / len(scores)

    weight_sum = sum(entry["weight"] for entry in panel_scores)
    if weight_sum <= 0:
        reject_submission(
            "Submission produced no weighted panel rows after applying runtime policies."
        )
    weighted_total = sum(entry["score"] * entry["weight"] for entry in panel_scores)
    return weighted_total / weight_sum
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def main():
    """Score a panel-ranking submission and write the result.

    Steps, in order:
      1. Load the runtime context and the ``compiled_config`` scoring asset,
         then validate metric, panel field, aggregation, roles, objective,
         final score key and execution policies.
      2. Load the reference CSV, group its records by panel value and check
         that every panel can be scored with the selected metric.
      3. Load the submission CSV and apply the coverage policy.
      4. Score each panel (in sorted panel order), clamp the metric to
         [0, 1], optionally scale by coverage, then aggregate the panel
         scores and pass the result to ``write_score``.
    """
    context = load_runtime_context()

    # --- Compiled scorer configuration -----------------------------------
    compiled_config_path = resolve_scoring_asset(
        context, "compiled_config", kind="config"
    )
    try:
        config = load_json_file(compiled_config_path, label="compiled_config")
    except RuntimeError as error:
        fail_runtime(str(error))

    metric = require_metric(config)
    panel_field = require_string(
        config.get("panel_field"), "compiled_config.panel_field"
    )
    aggregation = require_aggregation(config)
    require_metric_params(config, metric, panel_field, aggregation)
    evaluation_role = require_string(
        config.get("evaluation_role"), "compiled_config.evaluation_role"
    )
    submission_role = require_string(
        config.get("submission_role"), "compiled_config.submission_role"
    )

    # --- Runtime context settings ----------------------------------------
    objective = require_string(context.get("objective"), "runtime_context.objective")
    if objective != "maximize":
        fail_runtime(f"panel_ranking_metric metric {metric} requires objective=maximize.")
    final_score_key = require_string(
        context.get("final_score_key"), "runtime_context.final_score_key"
    )

    policies = context.get("policies")
    if not isinstance(policies, dict):
        fail_runtime("Runtime context is missing execution policies.")
    coverage_policy = require_policy(
        policies, "coverage_policy", {"reject", "ignore", "penalize"}
    )
    duplicate_id_policy = require_policy(
        policies, "duplicate_id_policy", {"reject", "ignore"}
    )
    invalid_value_policy = require_policy(
        policies, "invalid_value_policy", {"reject", "ignore"}
    )

    evaluation_record_key, evaluation_value_field = require_csv_slot_columns(
        context, "evaluation", evaluation_role
    )
    submission_record_key, submission_value_field = require_csv_slot_columns(
        context, "submission", submission_role
    )
    evaluation_path = resolve_evaluation_artifact(context, evaluation_role)
    submission_path = resolve_submission_artifact(context, submission_role)

    # --- Reference data, grouped by panel --------------------------------
    reference_by_id = load_reference_panel_records(
        evaluation_path,
        evaluation_role,
        evaluation_record_key,
        evaluation_value_field,
        panel_field,
    )
    panels = {}
    for record_id, record in reference_by_id.items():
        members = panels.get(record["panel"])
        if members is None:
            panels[record["panel"]] = [record_id]
        else:
            members.append(record_id)

    # The reference is checked for every panel before the submission is read.
    for panel_value in sorted(panels):
        validate_reference_panel_preconditions(
            metric,
            format_panel_label(panel_field, panel_value),
            [reference_by_id[record_id]["relevance"] for record_id in panels[panel_value]],
        )

    # --- Submission data and coverage ------------------------------------
    submission_by_id = load_submission_scores(
        submission_path,
        submission_role,
        submission_record_key,
        submission_value_field,
        duplicate_id_policy,
        invalid_value_policy,
    )
    missing_ids = [
        record_id for record_id in reference_by_id if record_id not in submission_by_id
    ]
    if missing_ids and coverage_policy == "reject":
        reject_submission(
            f"Submission is missing rankings for {len(missing_ids)} required rows; first missing id is {missing_ids[0]!r}."
        )

    # --- Per-panel scoring -----------------------------------------------
    panel_scores = []
    panel_details = []
    total_scored = 0

    for panel_value in sorted(panels):
        panel_record_ids = panels[panel_value]
        panel_label = format_panel_label(panel_field, panel_value)

        # Split the panel's reference rows into scored and missing ones.
        scored_ids = []
        panel_missing_ids = []
        for record_id in panel_record_ids:
            if record_id in submission_by_id:
                scored_ids.append(record_id)
            else:
                panel_missing_ids.append(record_id)

        if not scored_ids:
            reject_submission(
                f"Submission produced no ranked rows for panel {panel_label} after applying runtime policies."
            )

        validate_scored_panel_preconditions(
            metric,
            panel_label,
            [reference_by_id[record_id]["relevance"] for record_id in scored_ids],
        )
        raw_metric = compute_panel_metric(
            metric, reference_by_id, submission_by_id, panel_record_ids, scored_ids
        )

        normalized_score = max(0.0, min(raw_metric, 1.0))
        if coverage_policy == "penalize":
            # Scale the panel score by the fraction of reference rows scored.
            normalized_score *= len(scored_ids) / len(panel_record_ids)

        aggregation_weight = len(scored_ids)
        total_scored += aggregation_weight
        panel_scores.append({"score": normalized_score, "weight": aggregation_weight})
        panel_details.append(
            {
                "panel": {panel_field: panel_value},
                "reference_row_count": len(panel_record_ids),
                "rows_scored": len(scored_ids),
                "missing_count": len(panel_missing_ids),
                "selected_metric_value": raw_metric,
                "normalized_score": normalized_score,
                "aggregation_weight": aggregation_weight,
            }
        )

    # --- Final aggregation and output ------------------------------------
    final_score = aggregate_panel_scores(panel_scores, aggregation)
    details = {
        final_score_key: final_score,
        "selected_metric": metric,
        "aggregation": aggregation,
        "panel_field": panel_field,
        "panel_count": len(panel_details),
        "rows_scored": total_scored,
        "missing_count": len(missing_ids),
        "panels": panel_details,
    }
    write_score(score=final_score, details=details)
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
# Run the scorer only when this file is executed as a script.
if __name__ == "__main__":
    main()
|