themis-eval 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the differences between two publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
- themis/_version.py +1 -1
- themis/evaluation/pipelines/standard_pipeline.py +2 -1
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
- themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
- themis/generation/plan.py +28 -6
- {themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/METADATA +1 -1
- {themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/RECORD +10 -10
- {themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/WHEEL +0 -0
- {themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/top_level.txt +0 -0
themis/_version.py
CHANGED
|
@@ -9,7 +9,7 @@ def _detect_version() -> str:
|
|
|
9
9
|
try:
|
|
10
10
|
return metadata.version("themis-eval")
|
|
11
11
|
except metadata.PackageNotFoundError: # pragma: no cover - local dev only
|
|
12
|
-
return "0.2.
|
|
12
|
+
return "0.2.3" # Fallback for development
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
__version__ = _detect_version()
|
|
themis/evaluation/pipelines/standard_pipeline.py
CHANGED
|
@@ -233,7 +233,8 @@ class EvaluationPipeline:
|
|
|
233
233
|
if reference is not None
|
|
234
234
|
else []
|
|
235
235
|
)
|
|
236
|
-
metadata
|
|
236
|
+
# Preserve all task metadata for metrics, add sample_id
|
|
237
|
+
metadata = {**record.task.metadata, "sample_id": sample_id}
|
|
237
238
|
extract_start = time.perf_counter()
|
|
238
239
|
item_scores_for_item: list[core_entities.MetricScore] = []
|
|
239
240
|
for metric in self._metrics:
|
|
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py
CHANGED
|
@@ -37,13 +37,16 @@ class AttemptAwareEvaluationStrategy:
|
|
|
37
37
|
grouped.setdefault(score.metric_name, []).append(score)
|
|
38
38
|
for metric_name, group in grouped.items():
|
|
39
39
|
value = sum(item.value for item in group) / len(group)
|
|
40
|
+
# Preserve original metadata from first score
|
|
41
|
+
base_metadata = group[0].metadata.copy() if group[0].metadata else {}
|
|
40
42
|
aggregated.append(
|
|
41
43
|
core_entities.MetricScore(
|
|
42
44
|
metric_name=metric_name,
|
|
43
45
|
value=value,
|
|
44
46
|
metadata={
|
|
45
|
-
|
|
46
|
-
"
|
|
47
|
+
**base_metadata, # Preserve all original metadata
|
|
48
|
+
"attempts": len(group), # Add aggregation-specific field
|
|
49
|
+
"sample_id": base_metadata.get("sample_id"),
|
|
47
50
|
},
|
|
48
51
|
details={},
|
|
49
52
|
)
|
|
themis/evaluation/strategies/judge_evaluation_strategy.py
CHANGED
|
@@ -49,6 +49,8 @@ class JudgeEvaluationStrategy:
|
|
|
49
49
|
counts = Counter(labels)
|
|
50
50
|
agreement = max(counts.values()) / max(1, len(labels))
|
|
51
51
|
|
|
52
|
+
# Preserve original metadata from first score
|
|
53
|
+
base_metadata = group[0].metadata.copy() if group[0].metadata else {}
|
|
52
54
|
aggregated.append(
|
|
53
55
|
core_entities.MetricScore(
|
|
54
56
|
metric_name=metric_name,
|
|
@@ -58,7 +60,10 @@ class JudgeEvaluationStrategy:
|
|
|
58
60
|
"agreement": agreement,
|
|
59
61
|
"labels": labels,
|
|
60
62
|
},
|
|
61
|
-
metadata={
|
|
63
|
+
metadata={
|
|
64
|
+
**base_metadata, # Preserve all original metadata
|
|
65
|
+
"sample_id": base_metadata.get("sample_id"),
|
|
66
|
+
},
|
|
62
67
|
)
|
|
63
68
|
)
|
|
64
69
|
return aggregated
|
themis/generation/plan.py
CHANGED
|
@@ -88,9 +88,20 @@ class GenerationPlan:
|
|
|
88
88
|
}
|
|
89
89
|
if dataset_id is not None:
|
|
90
90
|
metadata["dataset_id"] = dataset_id
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
91
|
+
|
|
92
|
+
# If metadata_fields is explicitly specified, use it as a filter (existing behavior)
|
|
93
|
+
# Otherwise, include all fields by default (new behavior for custom metrics)
|
|
94
|
+
if self.metadata_fields:
|
|
95
|
+
# Explicit filter - only include specified fields
|
|
96
|
+
for field_name in self.metadata_fields:
|
|
97
|
+
if field_name in row:
|
|
98
|
+
metadata[field_name] = row[field_name]
|
|
99
|
+
else:
|
|
100
|
+
# No filter - include all fields except those used for other purposes
|
|
101
|
+
for field_name, field_value in row.items():
|
|
102
|
+
if field_name not in (self.dataset_id_field, self.reference_field):
|
|
103
|
+
metadata[field_name] = field_value
|
|
104
|
+
|
|
94
105
|
return metadata
|
|
95
106
|
|
|
96
107
|
def _build_reference(
|
|
@@ -209,9 +220,20 @@ class CartesianExpansionStrategy:
|
|
|
209
220
|
}
|
|
210
221
|
if dataset_id is not None:
|
|
211
222
|
metadata["dataset_id"] = dataset_id
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
223
|
+
|
|
224
|
+
# If metadata_fields is explicitly specified, use it as a filter (existing behavior)
|
|
225
|
+
# Otherwise, include all fields by default (new behavior for custom metrics)
|
|
226
|
+
if context.metadata_fields:
|
|
227
|
+
# Explicit filter - only include specified fields
|
|
228
|
+
for field_name in context.metadata_fields:
|
|
229
|
+
if field_name in row:
|
|
230
|
+
metadata[field_name] = row[field_name]
|
|
231
|
+
else:
|
|
232
|
+
# No filter - include all fields except those used for other purposes
|
|
233
|
+
for field_name, field_value in row.items():
|
|
234
|
+
if field_name not in (context.dataset_id_field, context.reference_field):
|
|
235
|
+
metadata[field_name] = field_value
|
|
236
|
+
|
|
215
237
|
return metadata
|
|
216
238
|
|
|
217
239
|
def _build_reference(
|
|
{themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/RECORD
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
|
|
2
|
-
themis/_version.py,sha256=
|
|
2
|
+
themis/_version.py,sha256=Tk4OCTQHYoZ61gm9JnkdgajR0vkBHbVm5OUjInzyJug,378
|
|
3
3
|
themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
|
|
4
4
|
themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
|
|
@@ -88,7 +88,7 @@ themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2
|
|
|
88
88
|
themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
|
|
89
89
|
themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
|
|
90
90
|
themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
|
|
91
|
-
themis/evaluation/pipelines/standard_pipeline.py,sha256=
|
|
91
|
+
themis/evaluation/pipelines/standard_pipeline.py,sha256=GI5_ImebBuM6D8GpGKLoNq4p3JhTq-ocOThlah8RxME,14754
|
|
92
92
|
themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
|
|
93
93
|
themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
|
|
94
94
|
themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
|
|
@@ -97,10 +97,10 @@ themis/evaluation/statistics/effect_sizes.py,sha256=EWFVDilczpR8rR3_YurWy7QcjYcN
|
|
|
97
97
|
themis/evaluation/statistics/hypothesis_tests.py,sha256=MVlVsY8wXifbBG5aSwauFShsQtIKqYREJApbriojS2o,10042
|
|
98
98
|
themis/evaluation/statistics/types.py,sha256=hW0RYWs-G4C_njNl0ZGG9lJROgU2CfLWfnTQDWYmWuw,3685
|
|
99
99
|
themis/evaluation/strategies/__init__.py,sha256=3f5LQkzlu3pRbN7dgDbdYOUNZTRexcn6f8D8I5-C724,439
|
|
100
|
-
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=
|
|
100
|
+
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK8rBeDXPFD2YWPSprez2iwSB-8yfyWhAlylug,1959
|
|
101
101
|
themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
|
|
102
102
|
themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
|
|
103
|
-
themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=
|
|
103
|
+
themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
|
|
104
104
|
themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
|
|
105
105
|
themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
|
|
106
106
|
themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
|
|
@@ -121,7 +121,7 @@ themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJ
|
|
|
121
121
|
themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo,7557
|
|
122
122
|
themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
|
|
123
123
|
themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
|
|
124
|
-
themis/generation/plan.py,sha256=
|
|
124
|
+
themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
|
|
125
125
|
themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
|
|
126
126
|
themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
|
|
127
127
|
themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
|
|
@@ -150,8 +150,8 @@ themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,130
|
|
|
150
150
|
themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
|
|
151
151
|
themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
|
|
152
152
|
themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
|
|
153
|
-
themis_eval-0.2.
|
|
154
|
-
themis_eval-0.2.
|
|
155
|
-
themis_eval-0.2.
|
|
156
|
-
themis_eval-0.2.
|
|
157
|
-
themis_eval-0.2.
|
|
153
|
+
themis_eval-0.2.3.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
|
|
154
|
+
themis_eval-0.2.3.dist-info/METADATA,sha256=4N7tBOyUi8PAlFT2qJseKIABjHOzkFmLtfqVVUSFz84,15235
|
|
155
|
+
themis_eval-0.2.3.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
156
|
+
themis_eval-0.2.3.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
|
|
157
|
+
themis_eval-0.2.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|