wisent: 0.5.12-py3-none-any.whl → 0.5.13-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release.


This version of wisent might be problematic.

Files changed (225)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/budget.py +2 -2
  7. wisent/core/agent/device_benchmarks.py +1 -1
  8. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  9. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  10. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  11. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  12. wisent/core/agent/diagnose.py +2 -1
  13. wisent/core/autonomous_agent.py +10 -2
  14. wisent/core/benchmark_extractors.py +293 -0
  15. wisent/core/bigcode_integration.py +20 -7
  16. wisent/core/branding.py +108 -0
  17. wisent/core/cli/__init__.py +15 -0
  18. wisent/core/cli/create_steering_vector.py +138 -0
  19. wisent/core/cli/evaluate_responses.py +715 -0
  20. wisent/core/cli/generate_pairs.py +128 -0
  21. wisent/core/cli/generate_pairs_from_task.py +119 -0
  22. wisent/core/cli/generate_responses.py +129 -0
  23. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  24. wisent/core/cli/generate_vector_from_task.py +147 -0
  25. wisent/core/cli/get_activations.py +191 -0
  26. wisent/core/cli/optimize_classification.py +339 -0
  27. wisent/core/cli/optimize_steering.py +364 -0
  28. wisent/core/cli/tasks.py +182 -0
  29. wisent/core/cli_logger.py +22 -0
  30. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  31. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  32. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  82. wisent/core/data_loaders/__init__.py +235 -0
  83. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  84. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  85. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  86. wisent/core/download_full_benchmarks.py +79 -2
  87. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  88. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  89. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  92. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  93. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  94. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  95. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  97. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  98. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  99. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  100. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  101. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  102. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  103. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  104. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  105. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  106. wisent/core/main.py +57 -0
  107. wisent/core/model_persistence.py +2 -2
  108. wisent/core/models/wisent_model.py +6 -6
  109. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  110. wisent/core/optuna/steering/steering_optimization.py +1 -1
  111. wisent/core/parser_arguments/__init__.py +10 -0
  112. wisent/core/parser_arguments/agent_parser.py +110 -0
  113. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  114. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  115. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  116. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  117. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  118. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  119. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  120. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  121. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  122. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  123. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  124. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  125. wisent/core/parser_arguments/main_parser.py +152 -0
  126. wisent/core/parser_arguments/model_config_parser.py +59 -0
  127. wisent/core/parser_arguments/monitor_parser.py +17 -0
  128. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  129. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  130. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  131. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  132. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  133. wisent/core/parser_arguments/tasks_parser.py +584 -0
  134. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  135. wisent/core/parser_arguments/utils.py +111 -0
  136. wisent/core/prompts/core/prompt_formater.py +3 -3
  137. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  138. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  139. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  141. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  142. wisent/core/steering_optimizer.py +45 -21
  143. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  144. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  145. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  146. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  147. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  148. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  149. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  150. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  151. wisent/core/tasks/livecodebench_task.py +4 -103
  152. wisent/core/timing_calibration.py +1 -1
  153. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
  154. wisent-0.5.13.dist-info/RECORD +294 -0
  155. wisent-0.5.13.dist-info/entry_points.txt +2 -0
  156. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  157. wisent/classifiers/core/atoms.py +0 -747
  158. wisent/classifiers/models/logistic.py +0 -29
  159. wisent/classifiers/models/mlp.py +0 -47
  160. wisent/cli/classifiers/classifier_rotator.py +0 -137
  161. wisent/cli/cli_logger.py +0 -142
  162. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  163. wisent/cli/wisent_cli/commands/listing.py +0 -154
  164. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  165. wisent/cli/wisent_cli/main.py +0 -93
  166. wisent/cli/wisent_cli/shell.py +0 -80
  167. wisent/cli/wisent_cli/ui.py +0 -69
  168. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  169. wisent/cli/wisent_cli/util/parsing.py +0 -126
  170. wisent/cli/wisent_cli/version.py +0 -4
  171. wisent/opti/methods/__init__.py +0 -0
  172. wisent/synthetic/__init__.py +0 -0
  173. wisent/synthetic/cleaners/__init__.py +0 -0
  174. wisent/synthetic/cleaners/core/__init__.py +0 -0
  175. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  176. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  177. wisent/synthetic/db_instructions/__init__.py +0 -0
  178. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  179. wisent/synthetic/generators/__init__.py +0 -0
  180. wisent/synthetic/generators/core/__init__.py +0 -0
  181. wisent/synthetic/generators/diversities/__init__.py +0 -0
  182. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  184. wisent-0.5.12.dist-info/RECORD +0 -220
  185. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  186. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  187. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  188. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  189. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  190. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  191. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  192. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  193. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  195. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  197. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  198. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  199. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  200. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  201. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  202. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  203. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  205. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  206. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  207. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  208. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  209. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  210. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  211. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  212. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  213. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  214. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  215. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  216. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  217. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  218. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  219. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  220. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  221. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  222. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  223. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
  224. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
  225. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
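
The hunk expanded below is the new wisent/core/cli/evaluate_responses.py (+715 lines), which implements the evaluate-responses command. As a rough orientation, the sketch that follows shows how its entry point might be driven; the attribute names on args (input, output, task, trait, verbose) are taken from the diff, while the paths and the task name are illustrative assumptions only.

# Hypothetical driver for the new evaluate-responses command.
# The attribute names on args mirror those read in the diff below;
# file paths and the task name are illustrative assumptions.
from types import SimpleNamespace

from wisent.core.cli.evaluate_responses import execute_evaluate_responses

args = SimpleNamespace(
    input="results/generated_responses.json",  # produced by a generate-responses run
    output="results/evaluation.json",          # where aggregated metrics are written
    task="truthfulqa_mc1",                     # must have an entry in task-evaluator.json
    trait=None,                                # only consulted for personalization tasks
    verbose=True,
)
execute_evaluate_responses(args)
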
@@ -0,0 +1,715 @@
+ """Evaluate responses command execution logic."""
+
+ import json
+ import os
+ import sys
+ from pathlib import Path
+
+
+ def execute_evaluate_responses(args):
+     """
+     Execute the evaluate-responses command.
+
+     Evaluates generated responses using benchmark-specific evaluators.
+     Routes to appropriate evaluator based on task type from task-evaluator.json.
+     """
+     from lm_eval.tasks import TaskManager
+     from wisent.core.evaluators.benchmark_specific import (
+         GenerationEvaluator,
+         ExactMatchEvaluator,
+         F1Evaluator,
+         PerplexityEvaluator
+     )
+
+     print(f"\n{'='*80}")
+     print(f"📊 EVALUATING GENERATED RESPONSES")
+     print(f"{'='*80}")
+     print(f" Input: {args.input}")
+     print(f"{'='*80}\n")
+
+     # Load input file
+     print(f"📂 Loading generated responses...")
+     try:
+         with open(args.input, 'r') as f:
+             input_data = json.load(f)
+
+         responses = input_data.get('responses', [])
+         task_name = args.task if args.task else input_data.get('task')
+         if not task_name:
+             print(f" ❌ Task name not found in input file and not provided via --task")
+             sys.exit(1)
+         print(f" ✓ Loaded {len(responses)} responses")
+         print(f" Task: {task_name}\n")
+     except Exception as e:
+         print(f" ❌ Failed to load input file: {e}")
+         sys.exit(1)
+
+     # Load task-evaluator mapping
+     print(f"📋 Loading task evaluation config...")
+     try:
+         # Find task-evaluator.json in project root
+         current_dir = Path(__file__).resolve()
+         project_root = None
+         for parent in current_dir.parents:
+             task_eval_file = parent / 'task-evaluator.json'
+             if task_eval_file.exists():
+                 project_root = parent
+                 break
+
+         if not project_root:
+             raise FileNotFoundError("Could not find task-evaluator.json")
+
+         with open(project_root / 'task-evaluator.json', 'r') as f:
+             task_evaluator_map = json.load(f)
+
+         # Get task evaluation type
+         task_config = task_evaluator_map['tasks'].get(task_name)
+         if not task_config:
+             raise ValueError(f"Task {task_name} not found in task-evaluator.json")
+
+         evaluation_type = task_config['evaluation_type']
+         primary_metric = task_config['primary_metric']
+
+         print(f" ✓ Task evaluation type: {evaluation_type}")
+         print(f" ✓ Primary metric: {primary_metric}\n")
+     except Exception as e:
+         print(f" ❌ Could not load task config: {e}")
+         sys.exit(1)
+
+     # Load task to get ground truth (skip for docker_execution and personalization)
+     task_docs = None
+     task = None
+     if evaluation_type not in ["docker_execution", "personalization"]:
+         print(f"📚 Loading task data...")
+         try:
+             tm = TaskManager()
+             task_dict = tm.load_task_or_group(task_name)
+             task = task_dict[task_name]
+
+             # Get validation docs
+             task_docs = list(task.validation_docs())
+
+             print(f" ✓ Loaded {len(task_docs)} task documents\n")
+         except Exception as e:
+             print(f" ❌ Could not load task: {e}")
+             sys.exit(1)
+
+     # Select evaluator based on evaluation type
+     print(f"🔧 Selecting evaluator for {evaluation_type} task...")
+     if evaluation_type == "docker_execution":
+         # Handle coding tasks with Docker execution
+         from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import (
+             CodingEvaluator,
+             EvaluatorConfig
+         )
+         from wisent.core.evaluators.benchmark_specific.coding.providers.livecodebench import LiveCodeBenchProvider
+
+         print(f" Using: CodingEvaluator (Docker sandbox execution)\n")
+
+         # Get Docker config from task config
+         docker_config = task_config.get('docker_config', {})
+         provider_name = task_config.get('provider', 'livecodebench')
+
+         # This will be handled separately - set evaluator to None for now
+         evaluator = None
+
+     elif evaluation_type == "multiple_choice":
+         evaluator = F1Evaluator()
+         print(f" Using: F1Evaluator (compares response to choice texts)\n")
+     elif evaluation_type == "generate_until":
+         if primary_metric == "exact_match":
+             evaluator = ExactMatchEvaluator()
+             print(f" Using: ExactMatchEvaluator (extracts and compares answers)\n")
+         elif primary_metric in ["em", "f1"]:
+             evaluator = F1Evaluator()
+             print(f" Using: F1Evaluator (token-level comparison)\n")
+         else:
+             evaluator = GenerationEvaluator()
+             print(f" Using: GenerationEvaluator (extracts and compares answers)\n")
+     elif evaluation_type == "loglikelihood_rolling":
+         evaluator = PerplexityEvaluator()
+         print(f" Using: PerplexityEvaluator (perplexity computation)\n")
+     elif evaluation_type == "personalization":
+         from wisent.core.evaluators.benchmark_specific import PersonalizationEvaluator
+         evaluator = PersonalizationEvaluator()
+         print(f" Using: PersonalizationEvaluator (personality trait evaluation)\n")
+     else:
+         evaluator = F1Evaluator()
+         print(f" Using: F1Evaluator (default fallback)\n")
+
+     # Evaluate responses
+     print(f"🎯 Evaluating responses...\n")
+     evaluation_results = []
+     task_results = []  # For aggregation
+
+     # Handle docker_execution separately - actual Docker execution
+     if evaluation_type == "docker_execution":
+         from wisent.core.evaluators.benchmark_specific.coding.providers.livecodebench.provider import LiveCodeBenchProvider
+         from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import CodingEvaluator, EvaluatorConfig, _make_schema
+         from wisent.core.evaluators.benchmark_specific.coding.safe_docker.recipes import RECIPE_REGISTRY
+         from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.python_sanitizer import PythonStandardizer
+         from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
+         from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.java_sanitizer import JavaStandardizer
+
+         _SANITIZERS = {
+             "python": PythonStandardizer(),
+             "cpp": CppStandardizer(),
+             "java": JavaStandardizer(),
+         }
+
+         print(f"🐳 Running Docker execution evaluation...")
+
+         # Get provider configuration
+         provider_name = task_config.get('provider', 'livecodebench')
+
+         # Initialize provider
+         if provider_name == 'livecodebench':
+             # Extract language from responses or use default
+             language = input_data.get('language', 'python')
+             release_version = input_data.get('release_version', 'all')
+
+             provider = LiveCodeBenchProvider(
+                 language=language,
+                 release_version=release_version,
+                 limit=None  # Load all problems
+             )
+             print(f" ✓ LiveCodeBench provider loaded")
+         else:
+             print(f" ❌ Unknown provider: {provider_name}")
+             sys.exit(1)
+
+         # Create mapping of problem_id to generated code
+         code_map = {}
+         for response_data in responses:
+             problem_id = response_data.get('problem_id')
+             generated_code = response_data.get('generated_response', '')
+             if problem_id:
+                 code_map[problem_id] = generated_code
+
+         print(f" ✓ Loaded {len(code_map)} generated solutions\n")
+
+         # Track which problems we want to evaluate
+         problems_to_evaluate = set(code_map.keys())
+
+         # Create model_fn that returns pre-generated code
+         def model_fn(task):
+             problem_id = task.options.get('problem_id', '')
+             if problem_id in code_map:
+                 # Return code as solution file
+                 return {'solution.py': code_map[problem_id]}
+             else:
+                 # No solution available for this problem - return empty
+                 return None
+
+         # Configure evaluator
+         config = EvaluatorConfig(
+             image=docker_config.get('image', 'coding/sandbox:polyglot-1.0'),
+             time_limit_s=docker_config.get('time_limit_s', 8),
+             cpu_limit_s=docker_config.get('cpu_limit_s', 3),
+             mem_limit_mb=docker_config.get('mem_limit_mb', 768),
+             self_repair=False  # No self-repair for evaluation
+         )
+
+         print(f"🐳 Docker Configuration:")
+         print(f" Image: {config.image}")
+         print(f" Time limit: {config.time_limit_s}s")
+         print(f" CPU limit: {config.cpu_limit_s}s")
+         print(f" Memory limit: {config.mem_limit_mb}MB\n")
+
+         # Run evaluation
+         evaluator = CodingEvaluator(provider, model_fn, cfg=config)
+
+         print(f"🎯 Executing code in Docker sandbox...\n")
+
+         evaluated_count = 0
+         skipped_count = 0
+
+         # Iterate through provider tasks manually to track problem_ids
+         for idx, task in enumerate(provider.iter_tasks()):
+             problem_id = task.options.get('problem_id', f'unknown_{idx}')
+
+             # Skip problems without solutions
+             if problem_id not in problems_to_evaluate:
+                 skipped_count += 1
+                 continue
+
+             # Get the generated code
+             files = model_fn(task)
+             if files is None:
+                 skipped_count += 1
+                 continue
+
+             # Merge with task files (test files)
+             files = {**task.files, **files}
+
+             # Optionally sanitize
+             if config.pre_sanitize:
+                 schema = _make_schema(task)
+                 sanitizer = _SANITIZERS.get(task.language)
+                 if sanitizer:
+                     raw = files.get(schema.file_name) or files.get("__raw__")
+                     if raw:
+                         out = sanitizer.normalize(raw, schema)
+                         files = {**files, schema.file_name: out.files.get(schema.file_name, raw)}
+
+             # Run the code in Docker
+             recipe = RECIPE_REGISTRY[task.language]
+             job = recipe.make_job(**task.options,
+                                   time_limit_s=config.time_limit_s,
+                                   cpu_limit_s=config.cpu_limit_s,
+                                   mem_limit_mb=config.mem_limit_mb)
+             result_obj = evaluator.exec.run(files, job)
+
+             evaluated_count += 1
+
+             # Check if passed
+             passed = (result_obj.status == "ok")
+
+             result = {
+                 'problem_id': problem_id,
+                 'passed': passed,
+                 'status': result_obj.status,
+                 'elapsed': result_obj.elapsed
+             }
+
+             evaluation_results.append(result)
+             task_results.append({
+                 'pass_rate': 1.0 if passed else 0.0
+             })
+
+             if args.verbose:
+                 status_icon = '✅' if passed else '❌'
+                 elapsed_time = result_obj.elapsed
+                 print(f"{status_icon} {problem_id}: {result_obj.status} ({elapsed_time:.2f}s)")
+         print(f"\n ✓ Evaluated {evaluated_count} problems (skipped {skipped_count})")
+
+         # Aggregate results
+         aggregated_metrics = {}
+         if task_results:
+             pass_rate = sum(r['pass_rate'] for r in task_results) / len(task_results)
+             aggregated_metrics['pass_rate'] = pass_rate
+             aggregated_metrics['total_passed'] = sum(r['pass_rate'] for r in task_results)
+             aggregated_metrics['total_problems'] = len(task_results)
+
+         # Save results
+         print(f"\n💾 Saving evaluation results...")
+         output_dir = os.path.dirname(args.output)
+         if output_dir:
+             os.makedirs(output_dir, exist_ok=True)
+
+         output_data = {
+             "input_file": args.input,
+             "task": input_data.get('task'),
+             "model": input_data.get('model'),
+             "evaluation_type": evaluation_type,
+             "evaluator_used": "CodingEvaluator",
+             "aggregated_metrics": aggregated_metrics,
+             "num_evaluated": len(task_results),
+             "num_total": len(responses),
+             "evaluations": evaluation_results
+         }
+
+         with open(args.output, 'w') as f:
+             json.dump(output_data, f, indent=2)
+
+         print(f" ✓ Results saved to: {args.output}\n")
+         print(f"{'='*80}")
+         print(f"✅ EVALUATION COMPLETE")
+         print(f"{'='*80}")
+         print(f" Total problems: {len(task_results)}")
+         print(f" Passed: {int(aggregated_metrics.get('total_passed', 0))}")
+         print(f" Failed: {len(task_results) - int(aggregated_metrics.get('total_passed', 0))}")
+         print(f" Pass rate: {aggregated_metrics.get('pass_rate', 0):.2%}")
+         print(f"{'='*80}\n")
+         return
+
+     # Handle personalization separately - LLM-as-judge evaluation
+     if evaluation_type == "personalization":
+         print(f"🎭 Running personality trait evaluation...")
+
+         # Extract judge model from task config if available
+         judge_model = task_config.get('judge_model', evaluator.default_judge_model)
+         use_mock = task_config.get('use_mock', False)
+
+         # Check if trait is specified via CLI
+         if hasattr(args, 'trait') and args.trait:
+             print(f" Target trait: {args.trait} (from CLI)")
+
+         if use_mock:
+             print(f" ⚠️ Using mock evaluation (no API calls)\n")
+         else:
+             print(f" Judge model: {judge_model}\n")
+
+         print(f" Evaluating {len(responses)} responses...\n")
+
+         evaluated_count = 0
+         trait_scores = []
+
+         for idx, response_data in enumerate(responses, 1):
+             if 'error' in response_data:
+                 if args.verbose:
+                     print(f"Response {idx}: Skipped (generation error)")
+                 evaluation_results.append({
+                     **response_data,
+                     "evaluation": {
+                         "error": "Generation failed"
+                     }
+                 })
+                 continue
+
+             try:
+                 generated_response = response_data.get('generated_response', '')
+                 prompt = response_data.get('prompt', '')
+
+                 # Extract trait information from CLI argument or response_data
+                 # CLI argument takes precedence
+                 if hasattr(args, 'trait') and args.trait:
+                     trait = args.trait
+                     trait_description = f'The trait: {trait}'
+                 else:
+                     trait = response_data.get('trait', 'unknown')
+                     trait_description = response_data.get('trait_description', f'The trait: {trait}')
+
+                 # If trait info is in a nested dict
+                 if isinstance(response_data.get('expected'), dict):
+                     trait = response_data['expected'].get('trait', trait)
+                     trait_description = response_data['expected'].get('trait_description', trait_description)
+
+                 # Call evaluator
+                 eval_result = evaluator.evaluate(
+                     response=generated_response,
+                     expected={
+                         'trait': trait,
+                         'trait_description': trait_description
+                     },
+                     prompt=prompt,
+                     judge_model=judge_model,
+                     use_mock=use_mock
+                 )
+
+                 evaluated_count += 1
+
+                 # Extract metrics from meta
+                 trait_score = eval_result.meta.get('trait_score', 0)
+                 intensity = eval_result.meta.get('intensity', 'unknown')
+
+                 trait_scores.append(trait_score)
+
+                 # Store result
+                 result = {
+                     'response_id': response_data.get('id', idx),
+                     'trait': trait,
+                     'trait_score': trait_score,
+                     'intensity': intensity,
+                     'ground_truth': eval_result.ground_truth,
+                     'confidence': eval_result.confidence,
+                     'explanation': eval_result.meta.get('explanation', ''),
+                     'judge_model': eval_result.meta.get('judge_model', judge_model)
+                 }
+
+                 evaluation_results.append(result)
+                 task_results.append({
+                     'trait_score': trait_score,
+                     'confidence': eval_result.confidence
+                 })
+
+                 if args.verbose:
+                     score_icon = '✅' if trait_score >= 7 else ('⚠️' if trait_score >= 4 else '❌')
+                     print(f"{score_icon} Response {idx} ({trait}): {trait_score}/10 ({intensity})")
+
+             except Exception as e:
+                 logger.exception(f"Error evaluating response {idx}: {e}")
+                 evaluation_results.append({
+                     **response_data,
+                     "evaluation": {
+                         "error": str(e)
+                     }
+                 })
+
+         print(f"\n ✓ Evaluated {evaluated_count} responses\n")
+
+         # Aggregate results
+         aggregated_metrics = {}
+         if task_results:
+             avg_trait_score = sum(r['trait_score'] for r in task_results) / len(task_results)
+             avg_confidence = sum(r['confidence'] for r in task_results) / len(task_results)
+
+             # Count by intensity thresholds
+             strong_count = sum(1 for s in trait_scores if s >= 7)
+             moderate_count = sum(1 for s in trait_scores if 4 <= s < 7)
+             weak_count = sum(1 for s in trait_scores if s < 4)
+
+             aggregated_metrics['avg_trait_score'] = avg_trait_score
+             aggregated_metrics['avg_confidence'] = avg_confidence
+             aggregated_metrics['strong_manifestation_rate'] = strong_count / len(trait_scores) if trait_scores else 0
+             aggregated_metrics['moderate_manifestation_rate'] = moderate_count / len(trait_scores) if trait_scores else 0
+             aggregated_metrics['weak_manifestation_rate'] = weak_count / len(trait_scores) if trait_scores else 0
+             aggregated_metrics['total_evaluated'] = len(task_results)
+
+         # Save results
+         print(f"💾 Saving evaluation results...")
+         output_dir = os.path.dirname(args.output)
+         if output_dir:
+             os.makedirs(output_dir, exist_ok=True)
+
+         output_data = {
+             "input_file": args.input,
+             "task": input_data.get('task'),
+             "model": input_data.get('model'),
+             "evaluation_type": evaluation_type,
+             "evaluator_used": "PersonalizationEvaluator",
+             "judge_model": judge_model,
+             "use_mock": use_mock,
+             "aggregated_metrics": aggregated_metrics,
+             "num_evaluated": len(task_results),
+             "num_total": len(responses),
+             "evaluations": evaluation_results
+         }
+
+         with open(args.output, 'w') as f:
+             json.dump(output_data, f, indent=2)
+
+         print(f" ✓ Results saved to: {args.output}\n")
+         print(f"{'='*80}")
+         print(f"✅ PERSONALIZATION EVALUATION COMPLETE")
+         print(f"{'='*80}")
+         print(f" Total responses: {len(task_results)}")
+         print(f" Average trait score: {aggregated_metrics.get('avg_trait_score', 0):.2f}/10")
+         print(f" Average confidence: {aggregated_metrics.get('avg_confidence', 0):.2%}")
+         print(f" Strong manifestation: {aggregated_metrics.get('strong_manifestation_rate', 0):.1%}")
+         print(f" Moderate manifestation: {aggregated_metrics.get('moderate_manifestation_rate', 0):.1%}")
+         print(f" Weak manifestation: {aggregated_metrics.get('weak_manifestation_rate', 0):.1%}")
+         print(f"{'='*80}\n")
+         return
+
+     for idx, response_data in enumerate(responses, 1):
+         if 'error' in response_data:
+             if args.verbose:
+                 print(f"Question {idx}: Skipped (generation error)")
+             evaluation_results.append({
+                 **response_data,
+                 "evaluation": {
+                     "error": "Generation failed"
+                 }
+             })
+             continue
+
+         try:
+             generated_response = response_data.get('generated_response', '')
+             prompt = response_data.get('prompt', '')
+
+             # Find matching task doc by question text
+             task_doc = None
+             if task_docs:
+                 for doc in task_docs:
+                     doc_question = doc.get('question', '').strip()
+                     if doc_question and doc_question in prompt:
+                         task_doc = doc
+                         break
+
+             if not task_doc:
+                 if args.verbose:
+                     print(f"Question {idx}: Could not match to task doc")
+                 evaluation_results.append({
+                     **response_data,
+                     "evaluation": {
+                         "error": "Could not match to task document"
+                     }
+                 })
+                 continue
+
+             # Get expected answer based on evaluation type
+             if evaluation_type == "multiple_choice":
+                 # Get all choice texts and gold index
+                 gold_idx = None
+                 choice_texts = []
+
+                 if 'mc1_targets' in task_doc:
+                     # truthfulqa_mc1 format
+                     labels = task_doc['mc1_targets']['labels']
+                     gold_idx = labels.index(1)
+                     choice_texts = task_doc['mc1_targets']['choices']
+                 elif 'choices' in task_doc:
+                     # arc_easy, piqa, etc. format
+                     answer_key = task_doc.get('answerKey', 'A')
+                     gold_idx = ord(answer_key) - ord('A')
+                     if isinstance(task_doc['choices'], dict):
+                         choice_texts = task_doc['choices']['text']
+                     else:
+                         choice_texts = task_doc['choices']
+                 elif 'gold' in task_doc:
+                     # Some tasks have gold directly
+                     gold_idx = task_doc['gold']
+                     choice_texts = task.doc_to_choice(task_doc)
+                 else:
+                     if args.verbose:
+                         print(f"Question {idx}: Unknown multiple-choice format")
+                     evaluation_results.append({
+                         **response_data,
+                         "evaluation": {
+                             "error": "Unknown task format"
+                         }
+                     })
+                     continue
+
+                 # Use F1Evaluator to match response to best choice
+                 best_score = 0.0
+                 best_choice_idx = None
+
+                 for i, choice_text in enumerate(choice_texts):
+                     result = evaluator.evaluate(generated_response, choice_text)
+                     if result.confidence > best_score:
+                         best_score = result.confidence
+                         best_choice_idx = i
+
+                 # Check if correct
+                 is_correct = (best_choice_idx == gold_idx)
+
+                 # Store result
+                 task_results.append({
+                     'acc': 1.0 if is_correct else 0.0,
+                     'f1_score': best_score
+                 })
+
+                 if args.verbose:
+                     doc_question = task_doc.get('question', '')
+                     print(f"Question {idx}:")
+                     print(f" Question: {doc_question[:60]}...")
+                     print(f" Predicted choice: {best_choice_idx} (F1: {best_score:.3f})")
+                     print(f" Correct choice: {gold_idx}")
+                     print(f" Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}\n")
+
+                 evaluation_results.append({
+                     **response_data,
+                     "evaluation": {
+                         "predicted_choice_idx": best_choice_idx,
+                         "predicted_choice_text": choice_texts[best_choice_idx] if best_choice_idx is not None else None,
+                         "correct_choice_idx": gold_idx,
+                         "correct_choice_text": choice_texts[gold_idx],
+                         "f1_score": best_score,
+                         "correct": is_correct
+                     }
+                 })
+
+             elif evaluation_type == "generate_until":
+                 # Get expected answer
+                 expected = None
+                 if 'answer' in task_doc:
+                     expected = task_doc['answer']
+                 elif 'answers' in task_doc:
+                     expected = task_doc['answers']
+                 elif 'target' in task_doc:
+                     expected = task_doc['target']
+                 else:
+                     if args.verbose:
+                         print(f"Question {idx}: No expected answer found")
+                     evaluation_results.append({
+                         **response_data,
+                         "evaluation": {
+                             "error": "No expected answer in task document"
+                         }
+                     })
+                     continue
+
+                 # Evaluate using selected evaluator
+                 result = evaluator.evaluate(generated_response, expected)
+
+                 is_correct = (result.ground_truth == "TRUTHFUL")
+
+                 # Store result
+                 task_results.append({
+                     'acc': 1.0 if is_correct else 0.0,
+                     'confidence': result.confidence
+                 })
+
+                 if args.verbose:
+                     doc_question = task_doc.get('question', '')
+                     print(f"Question {idx}:")
+                     print(f" Question: {doc_question[:60]}...")
+                     print(f" Ground truth: {result.ground_truth}")
+                     print(f" Confidence: {result.confidence:.3f}")
+                     print(f" Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}\n")
+
+                 evaluation_results.append({
+                     **response_data,
+                     "evaluation": {
+                         "ground_truth": result.ground_truth,
+                         "confidence": result.confidence,
+                         "details": result.details,
+                         "correct": is_correct
+                     }
+                 })
+
+             else:
+                 # Other evaluation types (loglikelihood_rolling, etc.)
+                 if args.verbose:
+                     print(f"Question {idx}: Evaluation type {evaluation_type} not fully implemented")
+                 evaluation_results.append({
+                     **response_data,
+                     "evaluation": {
+                         "error": f"Evaluation type {evaluation_type} not implemented"
+                     }
+                 })
+
+         except Exception as e:
+             print(f" ❌ Error evaluating question {idx}: {e}")
+             import traceback
+             traceback.print_exc()
+             evaluation_results.append({
+                 **response_data,
+                 "evaluation": {
+                     "error": str(e)
+                 }
+             })
+
+     # Aggregate results
+     aggregated_metrics = {}
+     if task_results:
+         # Get all metric keys
+         all_metric_keys = set()
+         for result in task_results:
+             all_metric_keys.update(result.keys())
+
+         # Aggregate each metric
+         for metric_key in all_metric_keys:
+             values = [r[metric_key] for r in task_results if metric_key in r]
+             if values:
+                 # Most tasks use mean aggregation
+                 aggregated_metrics[metric_key] = sum(values) / len(values)
+
+     # Save results
+     print(f"\n💾 Saving evaluation results...")
+     os.makedirs(os.path.dirname(args.output), exist_ok=True)
+
+     output_data = {
+         "input_file": args.input,
+         "task": input_data.get('task'),
+         "model": input_data.get('model'),
+         "evaluation_type": evaluation_type,
+         "evaluator_used": evaluator.name,
+         "aggregated_metrics": aggregated_metrics,
+         "num_evaluated": len(task_results),
+         "num_total": len(responses),
+         "evaluations": evaluation_results
+     }
+
+     with open(args.output, 'w') as f:
+         json.dump(output_data, f, indent=2)
+
+     print(f" ✓ Results saved to: {args.output}\n")
+
+     # Print summary
+     print(f"{'='*80}")
+     print(f"✅ EVALUATION COMPLETE")
+     print(f"{'='*80}")
+     print(f" Evaluator: {evaluator.name}")
+     print(f" Total responses: {len(responses)}")
+     print(f" Successfully evaluated: {len(task_results)}")
+     print(f" Failed/skipped: {len(responses) - len(task_results)}")
+     print(f"\n Metrics:")
+     for metric_name, metric_value in aggregated_metrics.items():
+         if isinstance(metric_value, float):
+             print(f" {metric_name}: {metric_value:.4f}")
+         else:
+             print(f" {metric_name}: {metric_value}")
+     print(f"{'='*80}\n")
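
For reference, the command resolves its routing table by walking parent directories until it finds task-evaluator.json. A minimal sketch of the structure the code above reads is given here; the key names (tasks, evaluation_type, primary_metric, provider, docker_config) come from the diff, while the concrete task entries and metric values are assumptions for illustration only.

# Illustrative only: a task-evaluator.json shaped the way evaluate_responses.py reads it.
# Key names come from the diff above; the individual entries are assumed.
import json

task_evaluator_map = {
    "tasks": {
        "truthfulqa_mc1": {
            "evaluation_type": "multiple_choice",   # routed to F1Evaluator
            "primary_metric": "acc",
        },
        "gsm8k": {
            "evaluation_type": "generate_until",
            "primary_metric": "exact_match",        # routed to ExactMatchEvaluator
        },
        "livecodebench": {
            "evaluation_type": "docker_execution",  # routed to CodingEvaluator
            "primary_metric": "pass_rate",
            "provider": "livecodebench",
            "docker_config": {"image": "coding/sandbox:polyglot-1.0", "time_limit_s": 8},
        },
    }
}

with open("task-evaluator.json", "w") as f:
    json.dump(task_evaluator_map, f, indent=2)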