eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,56 @@
1
+ import json
2
+ import re
3
+
4
+ from eval_framework.metrics.base import BaseMetric, MetricResult
5
+ from eval_framework.shared.types import Completion
6
+
7
+
8
class CheckJsonFormat(BaseMetric[Completion]):
    """Binary metric: 1.0 when the completion parses as JSON, 0.0 otherwise.

    A surrounding Markdown code fence (with or without a ``json`` language tag)
    is stripped before parsing.
    """

    NAME = "JSON Format"

    def _preprocess(self, completion: str) -> str:
        """Return *completion* without surrounding whitespace and code fences."""
        text = completion.strip()
        # Prefixes are removed one after another, tagged fences first, so the
        # bare fence only applies when no tagged variant matched.
        for fence in ("```json", "```Json", "```JSON", "```"):
            text = text.removeprefix(fence)
        return text.removesuffix("```").strip()

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score a single response.

        Returns one ``MetricResult``; its value is ``None`` when the response
        itself carries an error.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        candidate = self._preprocess(response.completion)

        try:
            json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            score = 0.0
        else:
            score = 1.0

        return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=True, error=response.error)]
34
+
35
+
36
class CheckPostScriptFormat(BaseMetric[Completion]):
    """
    Binary metric: 1.0 when the completion contains a postscript marker.

    This is a deliberately shallow check modelled on IFEval: it only looks for
    the marker text anywhere in the completion and does not parse what follows.

    NOTE(review): the pattern below matches only the exact upper-case markers
    "P.S." and "P.P.S."; variants such as "p. s." are not matched here --
    confirm whether that is intended.
    """

    NAME = "Postscript Format"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single result; its value is ``None`` for errored responses."""
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        postscript_pattern = r"\s*(P\.S\.|P\.P\.S\.)"
        markers = re.findall(postscript_pattern, response.completion, flags=re.MULTILINE)
        # Any hit at all counts; the number of markers is irrelevant.
        score = float(bool(markers))
        return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=True, error=response.error)]
@@ -0,0 +1,77 @@
1
+ import re
2
+
3
+ from eval_framework.metrics.base import BaseMetric, MetricResult
4
+ from eval_framework.shared.types import Completion
5
+
6
+
7
class GridDifference(BaseMetric[Completion]):
    """Score a completed grid against the ground-truth grid.

    Grids are compared as whitespace-separated tokens. Three results are
    emitted per response:

    * ``grid_difference_exact_match`` -- 1.0 when no differing cell was counted.
    * ``grid_difference_score`` -- ``1 - output_differences / input_differences``,
      i.e. the share of the input's wrong cells that the completion fixed.
      It is negative when the completion is worse than the input.
    * ``grid_difference_normalized_score`` -- the score clipped at 0.0.
    """

    NAME = "grid_difference"

    def count_differences(self, character_list_1: list[str], character_list_2: list[str]) -> int:
        """Count positions at which the two token lists disagree.

        Positions of ``character_list_2`` (the reference) that have no
        counterpart in ``character_list_1`` count as differences, so an empty
        or truncated candidate can no longer be reported as a perfect match.
        Surplus trailing tokens in ``character_list_1`` are ignored.
        """
        mismatched = sum(
            1 for character_1, character_2 in zip(character_list_1, character_list_2) if character_1 != character_2
        )
        missing = max(len(character_list_2) - len(character_list_1), 0)
        return mismatched + missing

    def calculate_score(
        self, output_ground_truth_difference_count: int, input_ground_truth_difference_count: int
    ) -> float:
        """Return ``1 - output_differences / input_differences``.

        When the input already equals the ground truth (zero input differences)
        the ratio is undefined: a flawless output then scores 1.0 and any
        output with differences scores 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        if input_ground_truth_difference_count == 0:
            return 1.0 if output_ground_truth_difference_count == 0 else 0.0
        return 1.0 - (float(output_ground_truth_difference_count) / float(input_ground_truth_difference_count))

    def extract_grid_from_prompt(self, prompt: str) -> str:
        """Return the grid text found between the two known prompt markers.

        Returns an empty string when the markers are not present.
        """
        start_marker = "Below is the input grid with masked regions:"
        end_marker = "Please output the completed grid"

        # DOTALL lets the non-greedy group span the grid's line breaks.
        match = re.search(re.escape(start_marker) + r"(.*?)" + re.escape(end_marker), prompt, re.DOTALL)
        if match is None:
            return ""
        return match.group(1).strip()

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Compute exact match, score and normalized score for one response.

        All three values are reported through a single ``None``-valued result
        when the response carries an error.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        input_grid = self.extract_grid_from_prompt(prompt=response.last_user_instruction).split()
        output_grid = response.completion.split()

        assert response.ground_truth_list[0], "Ground truth list is empty or not provided in the response."
        ground_truth_grid = response.ground_truth_list[0].split()

        input_ground_truth_differences_count = self.count_differences(input_grid, ground_truth_grid)
        output_ground_truth_differences_count = self.count_differences(output_grid, ground_truth_grid)

        exact_match = output_ground_truth_differences_count == 0
        if exact_match:
            score = 1.0
            normalized_score = 1.0
        else:
            score = self.calculate_score(
                output_ground_truth_differences_count,
                input_ground_truth_differences_count,
            )
            # The raw score may be negative; the normalized one never is.
            normalized_score = max(score, 0.0)

        return [
            MetricResult(
                metric_name=f"{self.NAME}_exact_match",
                value=float(exact_match),
                higher_is_better=True,
                error=response.error,
            ),
            MetricResult(metric_name=f"{self.NAME}_score", value=score, higher_is_better=True, error=response.error),
            MetricResult(
                metric_name=f"{self.NAME}_normalized_score",
                value=normalized_score,
                higher_is_better=True,
                error=response.error,
            ),
        ]
@@ -0,0 +1,73 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.external.ifeval_impl.utils import process_results
4
+ from eval_framework.metrics.base import BaseMetric, MetricResult
5
+ from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
6
+
7
+
8
class IFEvalMetricContext(BaseMetricContext):
    """Per-sample context that ``IFEvalMetric`` extracts from a response.

    The extracted instance is passed unchanged to ``process_results`` together
    with the completion.
    """

    # NOTE(review): the field meanings below are inferred from the names only;
    # confirm against ``process_results`` in ifeval_impl/utils.py.
    key: int  # presumably the sample identifier
    instruction_id_list: list[str]  # presumably the ids of the instructions to verify
    prompt: str  # presumably the prompt text the instructions refer to
    additional_kwargs: list[dict[str, Any]]  # presumably one kwargs dict per instruction id
13
+
14
+
15
class IFEvalMetric(BaseMetric[Completion]):
    """IFEval instruction-following metric.

    Emits prompt-level strict/loose accuracy plus one result per instruction
    for the instruction-level strict/loose accuracies.
    """

    NAME = "IFEval"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Grade one response with ``process_results``.

        For an errored response only the two prompt-level results are
        returned, both with value ``None``.
        """
        # The context is extracted before the error check, so extraction
        # problems surface for errored responses as well.
        context = extract_context_metric(response, IFEvalMetricContext)

        def build(suffix: str, value: float | None) -> MetricResult:
            return MetricResult(
                metric_name=f"{self.NAME}/{suffix}",
                value=value,
                higher_is_better=True,
                error=response.error,
            )

        prompt_level_keys = ("prompt_level_strict_acc", "prompt_level_loose_acc")
        if response.error is not None:
            return [build(key, None) for key in prompt_level_keys]

        grading = process_results(context, [response.completion])

        results = [build(key, float(grading[key])) for key in prompt_level_keys]
        # this framework does not support a custom aggregation step (see agg_inst_level_acc()) so work around
        # by returning the result for each instruction as a separate MetricResult
        for key in ("inst_level_strict_acc", "inst_level_loose_acc"):
            results.extend(build(key, float(outcome)) for outcome in grading[key])
        return results
@@ -0,0 +1,171 @@
1
+ import json
2
+ from collections.abc import Mapping
3
+ from typing import Any
4
+
5
+ import jsonschema # type: ignore
6
+ from pydantic import BaseModel
7
+
8
+ from eval_framework.metrics.base import BaseMetric, MetricResult
9
+ from eval_framework.shared.types import Completion
10
+
11
+
12
class JsonFormatEvaluation(BaseModel):
    """Outcome of the JSON parsing and schema checks done by ``JsonFormat``."""

    # True when the whole completion (backticks and comments stripped) parsed as JSON.
    is_just_json: bool = False
    # True when JSON was parsed, either from the whole completion or from an extracted fragment.
    is_valid_json: bool = False
    # Schema verdict; stays None when no validation verdict was reached.
    fulfills_schema: bool | None = None
    # Class name of the exception raised while extracting/parsing, if any.
    json_parsing_error: str | None = None
    # Class name of the jsonschema exception raised during validation, if any.
    schema_validation_error: str | None = None
18
+
19
+
20
class JsonFormat(BaseMetric[Completion]):
    """Check whether a completion is JSON and whether it satisfies a schema.

    The ground truth is a JSON document with a ``json_schema`` entry. Three
    results are emitted: ``is_just_json``, ``is_valid_json`` and
    ``fulfills_schema``.
    """

    NAME = "JSON Format"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return one result per check.

        Values are ``None`` for errored responses and 0.0 for empty
        completions.
        """
        keys = ["is_just_json", "is_valid_json", "fulfills_schema"]

        def uniform(value: float | None) -> list[MetricResult]:
            # Same value for every check; used by the two early exits.
            return [
                MetricResult(metric_name=f"{self.NAME}/{k}", value=value, higher_is_better=True, error=response.error)
                for k in keys
            ]

        if response.error is not None:
            return uniform(None)
        if response.completion == "":
            return uniform(0.0)

        json_dict, grading = self._extract_and_parse_json(response.completion)
        schema = json.loads(str(response.ground_truth))["json_schema"]
        if schema:
            if json_dict is None:
                # Nothing usable was parsed, so the schema cannot be fulfilled.
                grading.fulfills_schema = False
            else:
                grading = self._validate_json_against_schema(json_dict, schema, grading)

        trace = (grading.json_parsing_error or "") + (grading.schema_validation_error or "")
        results = []
        for key in keys:
            verdict = getattr(grading, key)
            results.append(
                MetricResult(
                    metric_name=f"{self.NAME}/{key}",
                    value=None if verdict is None else float(verdict),
                    higher_is_better=True,
                    error=response.error,
                    code_execution_trace=trace,
                )
            )
        return results

    @staticmethod
    def _validate_json_against_schema(
        json_obj: object, schema: Mapping[str, Any], evaluation_result: JsonFormatEvaluation
    ) -> JsonFormatEvaluation:
        """Return a copy of *evaluation_result* updated with the schema verdict.

        An invalid schema leaves ``fulfills_schema`` untouched and only records
        the error name.
        """
        outcome = evaluation_result.model_copy(deep=True)
        try:
            jsonschema.validate(json_obj, schema)
        except jsonschema.exceptions.ValidationError as error:
            outcome.fulfills_schema = False
            outcome.schema_validation_error = type(error).__name__
        except jsonschema.exceptions.SchemaError as error:
            outcome.schema_validation_error = type(error).__name__
        else:
            outcome.fulfills_schema = True
        return outcome

    @staticmethod
    def _extract_and_parse_json(completion: str) -> tuple[object, JsonFormatEvaluation]:
        """Parse *completion* as JSON, falling back to an embedded fragment.

        Returns the parsed value (``None`` when nothing could be parsed) and
        the evaluation describing which attempt succeeded.
        """
        report = JsonFormatEvaluation()

        # First attempt: the whole completion, minus backticks and comments.
        try:
            parsed = json.loads(remove_comments(completion.strip("`")))
        except Exception:
            pass
        else:
            report.is_just_json = True
            report.is_valid_json = True
            return parsed, report

        # Second attempt: a JSON object/array embedded in surrounding text.
        try:
            parsed = json.loads(remove_comments(get_json_object(completion)))
        except Exception as error:
            report.json_parsing_error = type(error).__name__
            return None, report

        report.is_valid_json = True
        return parsed, report
92
+
93
+
94
+ def get_json_object(text: str) -> str:
95
+ """
96
+ Extract the first valid JSON object or array from text.
97
+
98
+ This function handles nested brackets properly by using a bracket counting
99
+ approach to find complete JSON structures, rather than using regex which
100
+ can incorrectly match outer brackets containing non-JSON content.
101
+ """
102
+
103
+ def find_json_at_position(text: str, start_pos: int, open_char: str, close_char: str) -> str | None:
104
+ """Find a complete JSON object/array starting at the given position."""
105
+ if start_pos >= len(text) or text[start_pos] != open_char:
106
+ return None
107
+
108
+ bracket_count = 0
109
+ in_string = False
110
+ escaped = False
111
+
112
+ for i in range(start_pos, len(text)):
113
+ char = text[i]
114
+
115
+ if escaped:
116
+ escaped = False
117
+ continue
118
+
119
+ if char == "\\" and in_string:
120
+ escaped = True
121
+ continue
122
+
123
+ if char == '"' and not escaped:
124
+ in_string = not in_string
125
+ continue
126
+
127
+ if not in_string:
128
+ if char == open_char:
129
+ bracket_count += 1
130
+ elif char == close_char:
131
+ bracket_count -= 1
132
+ if bracket_count == 0:
133
+ # Found complete JSON structure
134
+ candidate = text[start_pos : i + 1]
135
+ # Test if it's valid JSON
136
+ try:
137
+ json.loads(candidate)
138
+ return candidate
139
+ except json.JSONDecodeError:
140
+ return None
141
+
142
+ return None
143
+
144
+ # Look for JSON objects {} and arrays []
145
+ json_candidates = []
146
+
147
+ # Search for objects starting with {
148
+ for i in range(len(text)):
149
+ if text[i] == "{":
150
+ candidate = find_json_at_position(text, i, "{", "}")
151
+ if candidate:
152
+ json_candidates.append(candidate)
153
+
154
+ # Search for arrays starting with [
155
+ for i in range(len(text)):
156
+ if text[i] == "[":
157
+ candidate = find_json_at_position(text, i, "[", "]")
158
+ if candidate:
159
+ json_candidates.append(candidate)
160
+
161
+ if not json_candidates:
162
+ raise RuntimeError(f"No valid JSON object found in {text}.")
163
+
164
+ # Return the longest valid JSON (most likely to be the main content)
165
+ return max(json_candidates, key=len)
166
+
167
+
168
def remove_comments(text: str, comment_indicator: str = "//") -> str:
    """Strip trailing line comments from *text* and drop blank lines.

    Each line is cut at the first *comment_indicator* found outside a
    double-quoted string, so JSON values that contain the indicator (for
    example ``"https://example.com"``) are left intact. Lines that are empty
    or whitespace-only after stripping are removed.

    Args:
        text: The text to clean, typically JSON with ``//`` comments.
        comment_indicator: Marker that starts a comment.

    Returns:
        The remaining lines joined with ``"\\n"``.

    Raises:
        ValueError: if *comment_indicator* is empty and *text* has lines.
    """

    def strip_trailing_comment(line: str) -> str:
        if not comment_indicator:
            # Same failure str.split("") would produce.
            raise ValueError("empty separator")
        in_string = False
        escaped = False
        for index, char in enumerate(line):
            if escaped:
                escaped = False
            elif in_string:
                if char == "\\":
                    escaped = True
                elif char == '"':
                    in_string = False
            elif line.startswith(comment_indicator, index):
                return line[:index]
            elif char == '"':
                in_string = True
        return line

    stripped_lines = (strip_trailing_comment(line) for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line.strip())
@@ -0,0 +1,74 @@
1
+ from eval_framework.exceptions import LogicError
2
+ from eval_framework.metrics.base import BaseMetric, MetricResult
3
+ from eval_framework.metrics.llm.graders.language import AVAILABLE_LANGUAGES
4
+ from eval_framework.shared.types import Completion
5
+
6
+
7
class LanguageChecker(BaseMetric[Completion]):
    """Binary metric: is the completion written in the ground-truth language?"""

    NAME = "Language Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Compare the completion's language with ``response.ground_truth``.

        Raises:
            LogicError: if the ground truth is missing or is not one of
                ``AVAILABLE_LANGUAGES``.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        expected = response.ground_truth
        if expected is None:
            raise LogicError("Language detection needs ground_truth.")
        if expected not in AVAILABLE_LANGUAGES:
            raise LogicError("Checking for unknown or unavailable language.")

        detected = response.get_completion_language()
        score = float(detected == expected)
        return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=True, error=response.error)]
23
+
24
+
25
class GermanCompletionChecker(BaseMetric[Completion]):
    """Binary metric: 1.0 when the raw completion's language equals ``"de"``."""

    NAME = "German Completion Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Return a single result; its value is ``None`` for errored responses."""
        failure = response.error
        if failure is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=failure)]

        is_german = response.get_raw_completion_language() == "de"
        return [
            MetricResult(metric_name=self.NAME, value=float(is_german), higher_is_better=True, error=response.error)
        ]
35
+
36
+
37
class LanguageConsistencyChecker(BaseMetric[Completion]):
    """Binary metric: does the completion's language match the instruction's?"""

    NAME = "Language Consistency"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Compare completion and instruction languages.

        Returns an empty list when both languages equal the empty string,
        i.e. nothing could be compared.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        answer_language = response.get_completion_language()
        instruction_language = response.get_instruction_language()
        languages_agree = answer_language == instruction_language

        if languages_agree and instruction_language == "":
            return []  # No language information could be determined

        return [
            MetricResult(
                metric_name=self.NAME, value=float(languages_agree), higher_is_better=True, error=response.error
            )
        ]
51
+
52
+
53
class LanguageRawConsistencyChecker(BaseMetric[Completion]):
    """Binary metric: does the raw completion's language match the instruction's?"""

    NAME = "Language Consistency Raw"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Compare raw-completion and instruction languages.

        Returns an empty list when both languages equal the empty string,
        i.e. nothing could be compared.
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        raw_language = response.get_raw_completion_language()
        instruction_language = response.get_instruction_language()
        languages_agree = raw_language == instruction_language

        if languages_agree and instruction_language == "":
            return []  # No language information could be determined

        result = MetricResult(
            metric_name=self.NAME,
            value=float(languages_agree),
            higher_is_better=True,
            error=response.error,
        )
        return [result]
@@ -0,0 +1,83 @@
1
+ import json
2
+ from enum import Enum
3
+
4
+ from eval_framework.metrics.base import BaseMetric, MetricResult
5
+ from eval_framework.metrics.completion.text_counter import ParagraphCounter, SentenceCounter, WordCounter
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LengthRequirementUnit(Enum):
    """Unit in which a length requirement is counted.

    The values are the strings looked up from the ``"unit"`` entry of the
    ground truth in ``LengthControl.calculate``.
    """

    WORDS = "words"
    SENTENCES = "sentences"
    PARAGRAPHS = "paragraphs"
13
+
14
+
15
class LengthRequirementType(Enum):
    """Kind of bound a length requirement imposes.

    The values are the strings looked up from the ``"type"`` entry of the
    ground truth in ``LengthControl.calculate``.
    """

    MIN = "minimum"  # count must be at least the expected count
    MAX = "maximum"  # count must be at most the expected count
    TARGET = "target"  # count must lie within the tolerance around the expected count
19
+
20
+
21
class LengthControl(BaseMetric[Completion]):
    """Check a completion's length against a requirement from the ground truth.

    The ground truth is a JSON document with the entries ``unit``, ``count``
    and ``type``. Three results are emitted: the signed and the absolute
    distance to the expected count (both relative to that count) and whether
    the requirement is fulfilled.
    """

    NAME = "length_control"

    def __init__(self, tolerance: float = 1 / 6) -> None:
        """Store *tolerance*, the allowed relative deviation for ``target`` requirements."""
        super().__init__()
        self.tolerance = tolerance

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score one response.

        For an errored response only the fulfilment result is returned, with
        value ``None``.

        Raises:
            NotImplementedError: if the unit or requirement type is not handled.
        """
        if response.error is not None:
            return [
                MetricResult(
                    metric_name=f"{self.NAME}/fulfills_length_requirement",
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                )
            ]

        expectations = json.loads(str(response.ground_truth))
        text = response.completion.strip()

        unit = LengthRequirementUnit(expectations["unit"])
        if unit == LengthRequirementUnit.WORDS:
            count = WordCounter._count_words(text)
        elif unit == LengthRequirementUnit.SENTENCES:
            count = SentenceCounter._count_sentences(text)
        elif unit == LengthRequirementUnit.PARAGRAPHS:
            count = ParagraphCounter._count_paragraphs(text)
        else:
            raise NotImplementedError(f"LengthRequirementUnit {expectations['unit']} is not supported.")

        expected_count = int(expectations["count"])
        # Signed deviation relative to the expected count; negative means too short.
        normalized_distance_to_target = (count - expected_count) / float(expected_count)
        absolute_normalized_distance_to_target = abs(normalized_distance_to_target)

        requirement = LengthRequirementType(expectations["type"])
        if requirement == LengthRequirementType.TARGET:
            fulfills_length_requirement = absolute_normalized_distance_to_target <= self.tolerance
        elif requirement == LengthRequirementType.MIN:
            fulfills_length_requirement = count >= expected_count
        elif requirement == LengthRequirementType.MAX:
            fulfills_length_requirement = count <= expected_count
        else:
            raise NotImplementedError(f"LengthRequirementType {expectations['type']} is not supported.")

        signed_result = MetricResult(
            metric_name=f"{self.NAME}/normalized_distance_to_target",
            value=float(normalized_distance_to_target),
            higher_is_better=False,
        )
        absolute_result = MetricResult(
            metric_name=f"{self.NAME}/absolute_normalized_distance_to_target",
            value=float(absolute_normalized_distance_to_target),
            higher_is_better=False,
        )
        fulfilment_result = MetricResult(
            metric_name=f"{self.NAME}/fulfills_length_requirement",
            value=float(fulfills_length_requirement),
            higher_is_better=True,
            error=response.error,
        )
        return [signed_result, absolute_result, fulfilment_result]