PyPI - themis-eval - Versions diffs - 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

themis-eval 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

themis/_version.py CHANGED Viewed

@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.2"  # Fallback for development
+        return "0.2.3"  # Fallback for development
 __version__ = _detect_version()

themis/evaluation/pipelines/standard_pipeline.py CHANGED Viewed

@@ -233,7 +233,8 @@ class EvaluationPipeline:
                             if reference is not None
                             else []
                         )
-                        metadata = {"sample_id": sample_id}
+                        # Preserve all task metadata for metrics, add sample_id
+                        metadata = {**record.task.metadata, "sample_id": sample_id}
                         extract_start = time.perf_counter()
                         item_scores_for_item: list[core_entities.MetricScore] = []
                         for metric in self._metrics:

themis/evaluation/strategies/attempt_aware_evaluation_strategy.py CHANGED Viewed

@@ -37,13 +37,16 @@ class AttemptAwareEvaluationStrategy:
             grouped.setdefault(score.metric_name, []).append(score)
         for metric_name, group in grouped.items():
             value = sum(item.value for item in group) / len(group)
+            # Preserve original metadata from first score
+            base_metadata = group[0].metadata.copy() if group[0].metadata else {}
             aggregated.append(
                 core_entities.MetricScore(
                     metric_name=metric_name,
                     value=value,
                     metadata={
-                        "attempts": len(group),
-                        "sample_id": group[0].metadata.get("sample_id"),
+                        **base_metadata,  # Preserve all original metadata
+                        "attempts": len(group),  # Add aggregation-specific field
+                        "sample_id": base_metadata.get("sample_id"),
                     },
                     details={},
                 )

themis/evaluation/strategies/judge_evaluation_strategy.py CHANGED Viewed

@@ -49,6 +49,8 @@ class JudgeEvaluationStrategy:
                 counts = Counter(labels)
                 agreement = max(counts.values()) / max(1, len(labels))
+            # Preserve original metadata from first score
+            base_metadata = group[0].metadata.copy() if group[0].metadata else {}
             aggregated.append(
                 core_entities.MetricScore(
                     metric_name=metric_name,
@@ -58,7 +60,10 @@ class JudgeEvaluationStrategy:
                         "agreement": agreement,
                         "labels": labels,
                     },
-                    metadata={"sample_id": group[0].metadata.get("sample_id")},
+                    metadata={
+                        **base_metadata,  # Preserve all original metadata
+                        "sample_id": base_metadata.get("sample_id"),
+                    },
                 )
             )
         return aggregated

themis/generation/plan.py CHANGED Viewed

@@ -88,9 +88,20 @@ class GenerationPlan:
         }
         if dataset_id is not None:
             metadata["dataset_id"] = dataset_id
-        for field_name in self.metadata_fields:
-            if field_name in row:
-                metadata[field_name] = row[field_name]
+        # If metadata_fields is explicitly specified, use it as a filter (existing behavior)
+        # Otherwise, include all fields by default (new behavior for custom metrics)
+        if self.metadata_fields:
+            # Explicit filter - only include specified fields
+            for field_name in self.metadata_fields:
+                if field_name in row:
+                    metadata[field_name] = row[field_name]
+        else:
+            # No filter - include all fields except those used for other purposes
+            for field_name, field_value in row.items():
+                if field_name not in (self.dataset_id_field, self.reference_field):
+                    metadata[field_name] = field_value
         return metadata
     def _build_reference(
@@ -209,9 +220,20 @@ class CartesianExpansionStrategy:
         }
         if dataset_id is not None:
             metadata["dataset_id"] = dataset_id
-        for field_name in context.metadata_fields:
-            if field_name in row:
-                metadata[field_name] = row[field_name]
+        # If metadata_fields is explicitly specified, use it as a filter (existing behavior)
+        # Otherwise, include all fields by default (new behavior for custom metrics)
+        if context.metadata_fields:
+            # Explicit filter - only include specified fields
+            for field_name in context.metadata_fields:
+                if field_name in row:
+                    metadata[field_name] = row[field_name]
+        else:
+            # No filter - include all fields except those used for other purposes
+            for field_name, field_value in row.items():
+                if field_name not in (context.dataset_id_field, context.reference_field):
+                    metadata[field_name] = field_value
         return metadata
     def _build_reference(

{themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.2
+Version: 0.2.3
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT

{themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
-themis/_version.py,sha256=y0Oqv0Je2udPmKCy5_D8Lib7GNLGxtLVp8b5WdavITg,378
+themis/_version.py,sha256=Tk4OCTQHYoZ61gm9JnkdgajR0vkBHbVm5OUjInzyJug,378
 themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
 themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
@@ -88,7 +88,7 @@ themis/evaluation/metrics/nlp/meteor.py,sha256=QZT09s4aiUcVvDJDVPZYjzi5SxXdS2gn2
 themis/evaluation/metrics/nlp/rouge.py,sha256=YL05qluF-KsesHYFRfm5zELJlcvo6RvaKp7xKy6BuLI,4365
 themis/evaluation/pipelines/__init__.py,sha256=5YI1xaUULHisctFxrumN4XRpWYneoonX7nd9zBtsjvQ,384
 themis/evaluation/pipelines/composable_pipeline.py,sha256=nNP9MSvQQJvaSBw5_gO3FeyhGm9So2ZlGqh5qSvE8Ac,10905
-themis/evaluation/pipelines/standard_pipeline.py,sha256=nDd_bkqAVQxgwG9RK6G_fsgqwZth3058uG3p4QM0Dck,14650
+themis/evaluation/pipelines/standard_pipeline.py,sha256=GI5_ImebBuM6D8GpGKLoNq4p3JhTq-ocOThlah8RxME,14754
 themis/evaluation/statistics/__init__.py,sha256=TTrScTLAW7EHNq0nbjuJs6iP3_HgDx1yy3EtYXx5JCk,1257
 themis/evaluation/statistics/bootstrap.py,sha256=JUQ8rtzFvW2e41I2pLJ7pqgSEjuJ1r6McyYLI42At9g,2409
 themis/evaluation/statistics/confidence_intervals.py,sha256=CN5EO2gWiSITQubuWuPryngnGXhGwczY9kO3mcG6JVc,3676
@@ -97,10 +97,10 @@ themis/evaluation/statistics/effect_sizes.py,sha256=EWFVDilczpR8rR3_YurWy7QcjYcN
 themis/evaluation/statistics/hypothesis_tests.py,sha256=MVlVsY8wXifbBG5aSwauFShsQtIKqYREJApbriojS2o,10042
 themis/evaluation/statistics/types.py,sha256=hW0RYWs-G4C_njNl0ZGG9lJROgU2CfLWfnTQDWYmWuw,3685
 themis/evaluation/strategies/__init__.py,sha256=3f5LQkzlu3pRbN7dgDbdYOUNZTRexcn6f8D8I5-C724,439
-themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=O3dlsQ2F0Ucv2Dhjz2Qf-jpPhwaVs3zrdQDRRu9du5w,1714
+themis/evaluation/strategies/attempt_aware_evaluation_strategy.py,sha256=MFcBdtK8rBeDXPFD2YWPSprez2iwSB-8yfyWhAlylug,1959
 themis/evaluation/strategies/default_evaluation_strategy.py,sha256=LShW-3Nxg_W4Ln-4qUvHJZqe5YMt64gHoK3uNJYLQNo,693
 themis/evaluation/strategies/evaluation_strategy.py,sha256=YFF-bXkz4Z52GuCw52FcklfEnf8dK8_z_I40DJRcmwE,669
-themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=58pDB30y1VpM_1KPB6sGS0JImGZk5WTgnK9CKDF8N5k,2304
+themis/evaluation/strategies/judge_evaluation_strategy.py,sha256=IRSgnnD2R6qrjiOTyA_PIOHUfQj4FqutkU3pKMth0CQ,2562
 themis/experiment/__init__.py,sha256=dGranqpESugmmfbQlTU9efwspazW6j3vcmAKEtAoWZk,182
 themis/experiment/builder.py,sha256=AEjCDeSOI2B0i0PBjkfY1GUDNrYGTGiqPvt0SxnDQFo,5618
 themis/experiment/cache_manager.py,sha256=Fd8Qxifrmyn8f2zjAyPrLv-ZU4Dcp-MKo8-09BoW7tY,4361
@@ -121,7 +121,7 @@ themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJ
 themis/generation/batching.py,sha256=ddpgpn1pq_EwipvTg-K4WcoSs3c2rbW37jEA5Pa_spo,7557
 themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,4910
 themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
-themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
+themis/generation/plan.py,sha256=k6_gdKFM12nrKz7ac1c5vTZsFanIKJJgyQ8IhvakDNQ,17158
 themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
 themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
 themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
@@ -150,8 +150,8 @@ themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,130
 themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
 themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
 themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
-themis_eval-0.2.2.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
-themis_eval-0.2.2.dist-info/METADATA,sha256=eOlF2Obimv_822azCt0vwhLaBz3CKsuvJPgDHMA3WFU,15235
-themis_eval-0.2.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-themis_eval-0.2.2.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
-themis_eval-0.2.2.dist-info/RECORD,,
+themis_eval-0.2.3.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
+themis_eval-0.2.3.dist-info/METADATA,sha256=4N7tBOyUi8PAlFT2qJseKIABjHOzkFmLtfqVVUSFz84,15235
+themis_eval-0.2.3.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+themis_eval-0.2.3.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
+themis_eval-0.2.3.dist-info/RECORD,,

{themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{themis_eval-0.2.2.dist-info → themis_eval-0.2.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

themis-eval 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

themis-eval 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl