themis-eval 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
themis/_version.py CHANGED
@@ -9,7 +9,7 @@ def _detect_version() -> str:
9
9
  try:
10
10
  return metadata.version("themis-eval")
11
11
  except metadata.PackageNotFoundError: # pragma: no cover - local dev only
12
- return "0.2.2" # Fallback for development
12
+ return "0.2.3" # Fallback for development
13
13
 
14
14
 
15
15
  __version__ = _detect_version()
themis/evaluation/pipelines/standard_pipeline.py CHANGED
@@ -233,7 +233,8 @@ class EvaluationPipeline:
233
233
  if reference is not None
234
234
  else []
235
235
  )
236
- metadata = {"sample_id": sample_id}
236
+ # Preserve all task metadata for metrics, add sample_id
237
+ metadata = {**record.task.metadata, "sample_id": sample_id}
237
238
  extract_start = time.perf_counter()
238
239
  item_scores_for_item: list[core_entities.MetricScore] = []
239
240
  for metric in self._metrics:
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py CHANGED
@@ -37,13 +37,16 @@ class AttemptAwareEvaluationStrategy:
37
37
  grouped.setdefault(score.metric_name, []).append(score)
38
38
  for metric_name, group in grouped.items():
39
39
  value = sum(item.value for item in group) / len(group)
40
+ # Preserve original metadata from first score
41
+ base_metadata = group[0].metadata.copy() if group[0].metadata else {}
40
42
  aggregated.append(
41
43
  core_entities.MetricScore(
42
44
  metric_name=metric_name,
43
45
  value=value,
44
46
  metadata={
45
- "attempts": len(group),
46
- "sample_id": group[0].metadata.get("sample_id"),
47
+ **base_metadata, # Preserve all original metadata
48
+ "attempts": len(group), # Add aggregation-specific field
49
+ "sample_id": base_metadata.get("sample_id"),
47
50
  },
48
51
  details={},
49
52
  )
themis/evaluation/strategies/judge_evaluation_strategy.py CHANGED
@@ -49,6 +49,8 @@ class JudgeEvaluationStrategy:
49
49
  counts = Counter(labels)
50
50
  agreement = max(counts.values()) / max(1, len(labels))
51
51
 
52
+ # Preserve original metadata from first score
53
+ base_metadata = group[0].metadata.copy() if group[0].metadata else {}
52
54
  aggregated.append(
53
55
  core_entities.MetricScore(
54
56
  metric_name=metric_name,
@@ -58,7 +60,10 @@ class JudgeEvaluationStrategy:
58
60
  "agreement": agreement,
59
61
  "labels": labels,
60
62
  },
61
- metadata={"sample_id": group[0].metadata.get("sample_id")},
63
+ metadata={
64
+ **base_metadata, # Preserve all original metadata
65
+ "sample_id": base_metadata.get("sample_id"),
66
+ },
62
67
  )
63
68
  )
64
69
  return aggregated
themis/generation/plan.py CHANGED
@@ -88,9 +88,20 @@ class GenerationPlan:
88
88
  }
89
89
  if dataset_id is not None:
90
90
  metadata["dataset_id"] = dataset_id
91
- for field_name in self.metadata_fields:
92
- if field_name in row:
93
- metadata[field_name] = row[field_name]
91
+
92
+ # If metadata_fields is explicitly specified, use it as a filter (existing behavior)
93
+ # Otherwise, include all fields by default (new behavior for custom metrics)
94
+ if self.metadata_fields:
95
+ # Explicit filter - only include specified fields
96
+ for field_name in self.metadata_fields:
97
+ if field_name in row:
98
+ metadata[field_name] = row[field_name]
99
+ else:
100
+ # No filter - include all fields except those used for other purposes
101
+ for field_name, field_value in row.items():
102
+ if field_name not in (self.dataset_id_field, self.reference_field):
103
+ metadata[field_name] = field_value
104
+
94
105
  return metadata
95
106
 
96
107
  def _build_reference(
@@ -209,9 +220,20 @@ class CartesianExpansionStrategy:
209
220
  }
210
221
  if dataset_id is not None:
211
222
  metadata["dataset_id"] = dataset_id
212
- for field_name in context.metadata_fields:
213
- if field_name in row:
214
- metadata[field_name] = row[field_name]
223
+
224
+ # If metadata_fields is explicitly specified, use it as a filter (existing behavior)
225
+ # Otherwise, include all fields by default (new behavior for custom metrics)
226
+ if context.metadata_fields:
227
+ # Explicit filter - only include specified fields
228
+ for field_name in context.metadata_fields:
229
+ if field_name in row:
230
+ metadata[field_name] = row[field_name]
231
+ else:
232
+ # No filter - include all fields except those used for other purposes
233
+ for field_name, field_value in row.items():
234
+ if field_name not in (context.dataset_id_field, context.reference_field):
235
+ metadata[field_name] = field_value
236
+
215
237
  return metadata
216
238
 
217
239
  def _build_reference(
themis_eval-0.2.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: themis-eval
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Lightweight evaluation platform for LLM experiments
5
5
  Author: Pittawat Taveekitworachai
6
6
  License: MIT
themis_eval-0.2.3.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
1
1
  themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
2
- themis/_version.py,sha256=y0Oqv0Je2udPmKCy5_D8Lib7GNLGxtLVp8b5WdavITg,378
2
+ themis/_version.py,sha256=Tk4OCTQHYoZ61gm9JnkdgajR0vkBHbVm5OUjInzyJug,378
3
3
  themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
4
4
  themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
@@ -88,7 +88,7 @@ themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2
88
88
  themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
89
89
  themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
90
90
  themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
91
- themis/evaluation/pipelines/standard_pipeline.py,sha256=nDd_bkqAVQxgwG9RK6G_fsgqwZth3058uG3p4QM0Dck,14650
91
+ themis/evaluation/pipelines/standard_pipeline.py,sha256=GI5_ImebBuM6D8GpGKLoNq4p3JhTq-ocOThlah8RxME,14754
92
92
  themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
93
93
  themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
94
94
  themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
@@ -97,10 +97,10 @@ themis/evaluation/statistics/effect_sizes.py,sha256=EWFVDilczpR8rR3_YurWy7QcjYcN
97
97
  themis/evaluation/statistics/hypothesis_tests.py,sha256=MVlVsY8wXifbBG5aSwauFShsQtIKqYREJApbriojS2o,10042
98
98
  themis/evaluation/statistics/types.py,sha256=hW0RYWs-G4C_njNl0ZGG9lJROgU2CfLWfnTQDWYmWuw,3685
99
99
  themis/evaluation/strategies/__init__.py,sha256=3f5LQkzlu3pRbN7dgDbdYOUNZTRexcn6f8D8I5-C724,439
100
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=O3dlsQ2F0Ucv2Dhjz2Qf-jpPhwaVs3zrdQDRRu9du5w,1714
100
+ themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK8rBeDXPFD2YWPSprez2iwSB-8yfyWhAlylug,1959
101
101
  themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
102
102
  themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
103
- themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=58pDB30y1VpM_1KPB6sGS0JImGZk5WTgnK9CKDF8N5k,2304
103
+ themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
104
104
  themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
105
105
  themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
106
106
  themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
@@ -121,7 +121,7 @@ themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJ
121
121
  themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo,7557
122
122
  themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
123
123
  themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
124
- themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
124
+ themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
125
125
  themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
126
126
  themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
127
127
  themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
@@ -150,8 +150,8 @@ themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,130
150
150
  themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
151
151
  themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
152
152
  themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
153
- themis_eval-0.2.2.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
154
- themis_eval-0.2.2.dist-info/METADATA,sha256=eOlF2Obimv_822azCt0vwhLaBz3CKsuvJPgDHMA3WFU,15235
155
- themis_eval-0.2.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
156
- themis_eval-0.2.2.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
157
- themis_eval-0.2.2.dist-info/RECORD,,
153
+ themis_eval-0.2.3.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
154
+ themis_eval-0.2.3.dist-info/METADATA,sha256=4N7tBOyUi8PAlFT2qJseKIABjHOzkFmLtfqVVUSFz84,15235
155
+ themis_eval-0.2.3.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
156
+ themis_eval-0.2.3.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
157
+ themis_eval-0.2.3.dist-info/RECORD,,