themis-eval 0.2.2 (tar.gz) → 0.2.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. {themis_eval-0.2.2/themis_eval.egg-info → themis_eval-0.2.3}/PKG-INFO +1 -1
  2. {themis_eval-0.2.2 → themis_eval-0.2.3}/pyproject.toml +1 -1
  3. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/_version.py +1 -1
  4. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/pipelines/standard_pipeline.py +2 -1
  5. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
  6. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
  7. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/plan.py +28 -6
  8. {themis_eval-0.2.2 → themis_eval-0.2.3/themis_eval.egg-info}/PKG-INFO +1 -1
  9. {themis_eval-0.2.2 → themis_eval-0.2.3}/LICENSE +0 -0
  10. {themis_eval-0.2.2 → themis_eval-0.2.3}/README.md +0 -0
  11. {themis_eval-0.2.2 → themis_eval-0.2.3}/setup.cfg +0 -0
  12. {themis_eval-0.2.2 → themis_eval-0.2.3}/tests/test_package_metadata.py +0 -0
  13. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/__init__.py +0 -0
  14. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/api.py +0 -0
  15. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/backends/__init__.py +0 -0
  16. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/backends/execution.py +0 -0
  17. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/backends/storage.py +0 -0
  18. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/__init__.py +0 -0
  19. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/__main__.py +0 -0
  20. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/__init__.py +0 -0
  21. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/benchmarks.py +0 -0
  22. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/comparison.py +0 -0
  23. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/config_commands.py +0 -0
  24. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/cost.py +0 -0
  25. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/demo.py +0 -0
  26. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/info.py +0 -0
  27. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/leaderboard.py +0 -0
  28. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/math_benchmarks.py +0 -0
  29. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/mcq_benchmarks.py +0 -0
  30. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/results.py +0 -0
  31. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/sample_run.py +0 -0
  32. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/commands/visualize.py +0 -0
  33. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/main.py +0 -0
  34. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/new_project.py +0 -0
  35. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/cli/utils.py +0 -0
  36. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/comparison/__init__.py +0 -0
  37. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/comparison/engine.py +0 -0
  38. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/comparison/reports.py +0 -0
  39. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/comparison/statistics.py +0 -0
  40. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/config/__init__.py +0 -0
  41. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/config/loader.py +0 -0
  42. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/config/registry.py +0 -0
  43. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/config/runtime.py +0 -0
  44. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/config/schema.py +0 -0
  45. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/core/__init__.py +0 -0
  46. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/core/conversation.py +0 -0
  47. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/core/entities.py +0 -0
  48. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/core/serialization.py +0 -0
  49. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/core/tools.py +0 -0
  50. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/core/types.py +0 -0
  51. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/__init__.py +0 -0
  52. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/base.py +0 -0
  53. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/commonsense_qa.py +0 -0
  54. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/competition_math.py +0 -0
  55. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/coqa.py +0 -0
  56. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/gpqa.py +0 -0
  57. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/gsm8k.py +0 -0
  58. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/gsm_symbolic.py +0 -0
  59. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/math500.py +0 -0
  60. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/med_qa.py +0 -0
  61. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/medmcqa.py +0 -0
  62. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/mmlu_pro.py +0 -0
  63. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/piqa.py +0 -0
  64. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/registry.py +0 -0
  65. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/schema.py +0 -0
  66. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/sciq.py +0 -0
  67. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/social_i_qa.py +0 -0
  68. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/datasets/super_gpqa.py +0 -0
  69. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/__init__.py +0 -0
  70. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/conditional.py +0 -0
  71. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/__init__.py +0 -0
  72. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
  73. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/exceptions.py +0 -0
  74. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/identity_extractor.py +0 -0
  75. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/json_field_extractor.py +0 -0
  76. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
  77. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/extractors/regex_extractor.py +0 -0
  78. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/math_verify_utils.py +0 -0
  79. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/__init__.py +0 -0
  80. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/code/__init__.py +0 -0
  81. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/code/codebleu.py +0 -0
  82. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/code/execution.py +0 -0
  83. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
  84. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/composite_metric.py +0 -0
  85. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/consistency_metric.py +0 -0
  86. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/exact_match.py +0 -0
  87. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
  88. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
  89. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/nlp/__init__.py +0 -0
  90. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
  91. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/nlp/bleu.py +0 -0
  92. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/nlp/meteor.py +0 -0
  93. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/nlp/rouge.py +0 -0
  94. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
  95. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/response_length.py +0 -0
  96. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
  97. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/pipeline.py +0 -0
  98. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/pipelines/__init__.py +0 -0
  99. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/pipelines/composable_pipeline.py +0 -0
  100. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/reports.py +0 -0
  101. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/__init__.py +0 -0
  102. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/bootstrap.py +0 -0
  103. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/confidence_intervals.py +0 -0
  104. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/distributions.py +0 -0
  105. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/effect_sizes.py +0 -0
  106. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
  107. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/statistics/types.py +0 -0
  108. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/strategies/__init__.py +0 -0
  109. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
  110. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
  111. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/__init__.py +0 -0
  112. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/builder.py +0 -0
  113. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/cache_manager.py +0 -0
  114. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/comparison.py +0 -0
  115. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/cost.py +0 -0
  116. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/definitions.py +0 -0
  117. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/export.py +0 -0
  118. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/export_csv.py +0 -0
  119. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/integration_manager.py +0 -0
  120. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/math.py +0 -0
  121. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/mcq.py +0 -0
  122. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/orchestrator.py +0 -0
  123. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/pricing.py +0 -0
  124. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/storage.py +0 -0
  125. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/experiment/visualization.py +0 -0
  126. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/__init__.py +0 -0
  127. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/agentic_runner.py +0 -0
  128. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/batching.py +0 -0
  129. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/clients.py +0 -0
  130. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/conversation_runner.py +0 -0
  131. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/providers/litellm_provider.py +0 -0
  132. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/providers/vllm_provider.py +0 -0
  133. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/router.py +0 -0
  134. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/runner.py +0 -0
  135. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/strategies.py +0 -0
  136. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/templates.py +0 -0
  137. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/turn_strategies.py +0 -0
  138. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/generation/types.py +0 -0
  139. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/integrations/__init__.py +0 -0
  140. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/integrations/huggingface.py +0 -0
  141. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/integrations/wandb.py +0 -0
  142. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/interfaces/__init__.py +0 -0
  143. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/presets/__init__.py +0 -0
  144. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/presets/benchmarks.py +0 -0
  145. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/presets/models.py +0 -0
  146. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/project/__init__.py +0 -0
  147. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/project/definitions.py +0 -0
  148. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/project/patterns.py +0 -0
  149. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/providers/__init__.py +0 -0
  150. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/providers/registry.py +0 -0
  151. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/py.typed +0 -0
  152. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/server/__init__.py +0 -0
  153. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/server/app.py +0 -0
  154. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/utils/api_generator.py +0 -0
  155. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/utils/cost_tracking.py +0 -0
  156. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/utils/dashboard.py +0 -0
  157. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/utils/logging_utils.py +0 -0
  158. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/utils/progress.py +0 -0
  159. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis/utils/tracing.py +0 -0
  160. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis_eval.egg-info/SOURCES.txt +0 -0
  161. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis_eval.egg-info/dependency_links.txt +0 -0
  162. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis_eval.egg-info/requires.txt +0 -0
  163. {themis_eval-0.2.2 → themis_eval-0.2.3}/themis_eval.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: themis-eval
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Lightweight evaluation platform for LLM experiments
5
5
  Author: Pittawat Taveekitworachai
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "themis-eval"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Lightweight evaluation platform for LLM experiments"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -9,7 +9,7 @@ def _detect_version() -> str:
9
9
  try:
10
10
  return metadata.version("themis-eval")
11
11
  except metadata.PackageNotFoundError: # pragma: no cover - local dev only
12
- return "0.2.2" # Fallback for development
12
+ return "0.2.3" # Fallback for development
13
13
 
14
14
 
15
15
  __version__ = _detect_version()
@@ -233,7 +233,8 @@ class EvaluationPipeline:
233
233
  if reference is not None
234
234
  else []
235
235
  )
236
- metadata = {"sample_id": sample_id}
236
+ # Preserve all task metadata for metrics, add sample_id
237
+ metadata = {**record.task.metadata, "sample_id": sample_id}
237
238
  extract_start = time.perf_counter()
238
239
  item_scores_for_item: list[core_entities.MetricScore] = []
239
240
  for metric in self._metrics:
@@ -37,13 +37,16 @@ class AttemptAwareEvaluationStrategy:
37
37
  grouped.setdefault(score.metric_name, []).append(score)
38
38
  for metric_name, group in grouped.items():
39
39
  value = sum(item.value for item in group) / len(group)
40
+ # Preserve original metadata from first score
41
+ base_metadata = group[0].metadata.copy() if group[0].metadata else {}
40
42
  aggregated.append(
41
43
  core_entities.MetricScore(
42
44
  metric_name=metric_name,
43
45
  value=value,
44
46
  metadata={
45
- "attempts": len(group),
46
- "sample_id": group[0].metadata.get("sample_id"),
47
+ **base_metadata, # Preserve all original metadata
48
+ "attempts": len(group), # Add aggregation-specific field
49
+ "sample_id": base_metadata.get("sample_id"),
47
50
  },
48
51
  details={},
49
52
  )
@@ -49,6 +49,8 @@ class JudgeEvaluationStrategy:
49
49
  counts = Counter(labels)
50
50
  agreement = max(counts.values()) / max(1, len(labels))
51
51
 
52
+ # Preserve original metadata from first score
53
+ base_metadata = group[0].metadata.copy() if group[0].metadata else {}
52
54
  aggregated.append(
53
55
  core_entities.MetricScore(
54
56
  metric_name=metric_name,
@@ -58,7 +60,10 @@ class JudgeEvaluationStrategy:
58
60
  "agreement": agreement,
59
61
  "labels": labels,
60
62
  },
61
- metadata={"sample_id": group[0].metadata.get("sample_id")},
63
+ metadata={
64
+ **base_metadata, # Preserve all original metadata
65
+ "sample_id": base_metadata.get("sample_id"),
66
+ },
62
67
  )
63
68
  )
64
69
  return aggregated
@@ -88,9 +88,20 @@ class GenerationPlan:
88
88
  }
89
89
  if dataset_id is not None:
90
90
  metadata["dataset_id"] = dataset_id
91
- for field_name in self.metadata_fields:
92
- if field_name in row:
93
- metadata[field_name] = row[field_name]
91
+
92
+ # If metadata_fields is explicitly specified, use it as a filter (existing behavior)
93
+ # Otherwise, include all fields by default (new behavior for custom metrics)
94
+ if self.metadata_fields:
95
+ # Explicit filter - only include specified fields
96
+ for field_name in self.metadata_fields:
97
+ if field_name in row:
98
+ metadata[field_name] = row[field_name]
99
+ else:
100
+ # No filter - include all fields except those used for other purposes
101
+ for field_name, field_value in row.items():
102
+ if field_name not in (self.dataset_id_field, self.reference_field):
103
+ metadata[field_name] = field_value
104
+
94
105
  return metadata
95
106
 
96
107
  def _build_reference(
@@ -209,9 +220,20 @@ class CartesianExpansionStrategy:
209
220
  }
210
221
  if dataset_id is not None:
211
222
  metadata["dataset_id"] = dataset_id
212
- for field_name in context.metadata_fields:
213
- if field_name in row:
214
- metadata[field_name] = row[field_name]
223
+
224
+ # If metadata_fields is explicitly specified, use it as a filter (existing behavior)
225
+ # Otherwise, include all fields by default (new behavior for custom metrics)
226
+ if context.metadata_fields:
227
+ # Explicit filter - only include specified fields
228
+ for field_name in context.metadata_fields:
229
+ if field_name in row:
230
+ metadata[field_name] = row[field_name]
231
+ else:
232
+ # No filter - include all fields except those used for other purposes
233
+ for field_name, field_value in row.items():
234
+ if field_name not in (context.dataset_id_field, context.reference_field):
235
+ metadata[field_name] = field_value
236
+
215
237
  return metadata
216
238
 
217
239
  def _build_reference(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: themis-eval
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Lightweight evaluation platform for LLM experiments
5
5
  Author: Pittawat Taveekitworachai
6
6
  License: MIT
File without changes
File without changes
File without changes
File without changes
File without changes