EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.

Potentially problematic release.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
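The most consequential structural change in this list is that the single euroeval/metrics.py module (deleted below) is replaced by a euroeval/metrics/ package with base.py, huggingface.py, llm_as_a_judge.py, pipeline.py and speed.py, while euroeval/human_evaluation.py is removed and entry_points.txt loses one entry. The snippet below is a minimal compatibility sketch, assuming the Metric base class now lives in euroeval.metrics.base; that import path is inferred from the file names above and is not confirmed by this diff.

# Hedged sketch: resolve the Metric base class under either module layout.
try:
    from euroeval.metrics.base import Metric  # assumed 16.0.0 location (unverified)
except ImportError:
    from euroeval.metrics import Metric  # 15.x monolithic module, shown below

print(Metric.__module__)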
euroeval/metrics.py DELETED
@@ -1,470 +0,0 @@
-"""All the metrics used in EuroEval."""
-
-import abc
-import logging
-import typing as t
-
-import evaluate
-import litellm
-from litellm.types.utils import Choices, ModelResponse
-from pydantic import BaseModel, Field
-from tqdm.auto import tqdm
-
-from .exceptions import InvalidBenchmark
-from .utils import HiddenPrints
-
-if t.TYPE_CHECKING:
-    from datasets.arrow_dataset import Dataset
-    from evaluate import EvaluationModule
-
-logger = logging.getLogger(__name__)
-
-
-class Metric(abc.ABC):
-    """Abstract base class for all metrics."""
-
-    def __init__(
-        self,
-        name: str,
-        pretty_name: str,
-        postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
-    ) -> None:
-        """Initialise the metric.
-
-        Args:
-            name:
-                The name of the metric in snake_case.
-            pretty_name:
-                The pretty name of the metric, used for display purposes.
-            postprocessing_fn:
-                A function to apply to the metric scores after they are computed,
-                taking the score to the postprocessed score along with its string
-                representation. Defaults to x -> (100 * x, f"{x:.2%}").
-        """
-        self.name = name
-        self.pretty_name = pretty_name
-        self.postprocessing_fn = (
-            postprocessing_fn
-            if postprocessing_fn is not None
-            else lambda x: (100 * x, f"{x:.2%}")
-        )
-
-    @abc.abstractmethod
-    def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
-    ) -> float | None:
-        """Calculate the metric score.
-
-        Args:
-            predictions:
-                The model predictions.
-            references:
-                The ground truth references.
-            dataset:
-                The dataset used for evaluation. This is only used in case any
-                additional metadata is used to compute the metrics.
-
-        Returns:
-            The calculated metric score, or None if the score should be ignored.
-        """
-        ...
-
-    def __hash__(self) -> int:
-        """Return a hash of the metric configuration."""
-        return hash(self.name)
-
-
-class HuggingFaceMetric(Metric):
-    """A metric which is implemented in the `evaluate` package.
-
-    Attributes:
-        name:
-            The name of the metric in snake_case.
-        pretty_name:
-            The pretty name of the metric, used for display purposes.
-        huggingface_id:
-            The Hugging Face ID of the metric.
-        results_key:
-            The name of the key used to extract the metric scores from the results
-            dictionary.
-        compute_kwargs:
-            Keyword arguments to pass to the metric's compute function. Defaults to
-            an empty dictionary.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        pretty_name: str,
-        huggingface_id: str,
-        results_key: str,
-        compute_kwargs: dict[str, t.Any] | None = None,
-        postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
-    ) -> None:
-        """Initialise the Hugging Face metric.
-
-        Args:
-            name:
-                The name of the metric in snake_case.
-            pretty_name:
-                The pretty name of the metric, used for display purposes.
-            huggingface_id:
-                The Hugging Face ID of the metric.
-            results_key:
-                The name of the key used to extract the metric scores from the results
-                dictionary.
-            compute_kwargs:
-                Keyword arguments to pass to the metric's compute function. Defaults to
-                an empty dictionary.
-            postprocessing_fn:
-                A function to apply to the metric scores after they are computed, taking
-                the score to the postprocessed score along with its string
-                representation. Defaults to x -> (100 * x, f"{x:.2%}").
-        """
-        super().__init__(
-            name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
-        )
-        self.huggingface_id = huggingface_id
-        self.results_key = results_key
-        self.compute_kwargs: dict[str, t.Any] = (
-            dict() if compute_kwargs is None else compute_kwargs
-        )
-        self.metric: "EvaluationModule | None" = None
-
-    def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
-    ) -> float | None:
-        """Calculate the metric score.
-
-        Args:
-            predictions:
-                The model predictions.
-            references:
-                The ground truth references.
-            dataset:
-                The dataset used for evaluation. This is only used in case any
-                additional metadata is used to compute the metrics.
-
-        Returns:
-            The calculated metric score, or None if the score should be ignored.
-        """
-        if self.metric is None:
-            self.metric = evaluate.load(path=self.huggingface_id)
-
-        with HiddenPrints():
-            results = self.metric.compute(
-                predictions=predictions, references=references, **self.compute_kwargs
-            )
-
-        # The metric returns None if we are running on multi-GPU and the current
-        # process is not the main process
-        if results is None:
-            return None
-
-        score = results[self.results_key]
-        if isinstance(score, list):
-            score = sum(score) / len(score)
-
-        return score
-
-
-class LLMAsAJudgeMetric(Metric):
-    """Use an LLM to judge the quality of the predictions."""
-
-    def __init__(
-        self,
-        name: str,
-        pretty_name: str,
-        judge_id: str,
-        judge_kwargs: dict[str, t.Any],
-        user_prompt: str,
-        response_format: t.Type[BaseModel],
-        scoring_fn: t.Callable[[BaseModel], float],
-        condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
-        system_prompt: str | None = None,
-    ) -> None:
-        """Initialise the LLM as a judge metric.
-
-        Args:
-            name:
-                The name of the metric in snake_case.
-            pretty_name:
-                The pretty name of the metric, used for display purposes.
-            judge_id:
-                The model ID of the LLM to use as a judge.
-            judge_kwargs:
-                Generation parameters for the judge model, such as temperature.
-            user_prompt:
-                The user prompt to use for the judge model. The prompt should be
-                formatted with the variables `prediction` and `condition`, to
-                include the model predictions and a description of what the prediction
-                should be judged on, respectively. If the condition is not needed,
-                it can be omitted from the prompt, but the `prediction` variable must
-                still be present.
-            response_format:
-                The response format to use for the judge model. This should be a
-                Pydantic model that defines the expected structure of the judge's
-                response.
-            scoring_fn:
-                A function that takes the judge's response and returns a score.
-            condition_formatting_fn (optional):
-                A function to format the condition string before it is included in the
-                user prompt. Defaults to a no-op function that returns the input
-                unchanged.
-            system_prompt (optional):
-                The system prompt to use for the judge model. If not provided, no system
-                prompt will be used.
-        """
-        super().__init__(name=name, pretty_name=pretty_name)
-        self.judge_id = judge_id
-        self.judge_kwargs = judge_kwargs
-        self.user_prompt = user_prompt
-        self.response_format = response_format
-        self.scoring_fn = scoring_fn
-        self.condition_formatting_fn = condition_formatting_fn
-        self.system_prompt = system_prompt
-
-    def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
-    ) -> float | None:
-        """Calculate the metric score using the judge model.
-
-        Args:
-            predictions:
-                The model predictions.
-            references:
-                The ground truth references.
-            dataset:
-                The dataset used for evaluation. This is only used in case any
-                additional metadata is used to compute the metrics.
-
-        Returns:
-            The calculated metric score, or None if the score should be ignored.
-
-        Raises:
-            InvalidBenchmark:
-                If the number of predictions does not match the number of references,
-                or if the user prompt requires a condition but none is provided.
-        """
-        if not predictions or not references:
-            return None
-        elif len(predictions) != len(references):
-            raise InvalidBenchmark(
-                f"The number of predictions ({len(predictions):,}) does not match the "
-                f"number of references ({len(references):,})."
-            )
-
-        # Prepare the messages for the LLM
-        conversations: list[list[dict[str, str]]] = [
-            [
-                dict(
-                    role="user",
-                    content=self._apply_user_prompt(
-                        prediction=prediction, condition=condition
-                    ),
-                )
-            ]
-            for prediction, condition in zip(predictions, references)
-        ]
-        if self.system_prompt:
-            conversations = [
-                [dict(role="system", content=self.system_prompt), *conversation]
-                for conversation in conversations
-            ]
-
-        # Get the judge generations
-        generations = [
-            litellm.completion(
-                model=self.judge_id,
-                messages=conversation,
-                response_format=self.response_format,
-                **self.judge_kwargs,
-            )
-            for conversation in tqdm(
-                iterable=conversations,
-                desc=f"Computing {self.pretty_name} scores",
-                unit="sample",
-            )
-        ]
-
-        # Extract the outputs from the generations
-        outputs: list[BaseModel] = list()
-        for generation in generations:
-            assert isinstance(generation, ModelResponse), (
-                f"The judge model did not return a valid response: {generation!r}"
-            )
-            choice = generation.choices[0]
-            assert isinstance(choice, Choices), (
-                f"The judge model did not return a valid choice: {choice!r}"
-            )
-            json_content = choice.message.content
-            assert json_content is not None, (
-                "The judge model returned a None content in the response message."
-            )
-            output = self.response_format.model_validate_json(json_data=json_content)
-            outputs.append(output)
-
-        # Calculate the scores using the scoring function
-        scores = [self.scoring_fn(output) for output in outputs]
-        if not scores:
-            logger.warning(f"No scores were calculated for {self.pretty_name}.")
-            return None
-        return sum(scores) / len(scores)
-
-    def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
-        """Apply the user prompt to the prediction and condition.
-
-        Args:
-            prediction:
-                The model prediction.
-            condition (optional):
-                A description of what the prediction should be judged on. If not
-                provided, it will be omitted from the prompt.
-
-        Returns:
-            The formatted user prompt with the prediction and reference.
-
-        Raises:
-            InvalidBenchmark:
-                If the user prompt requires a reference but none is provided.
-        """
-        condition_required = "{condition}" in self.user_prompt
-        if condition_required and condition is None:
-            raise InvalidBenchmark(
-                f"The user prompt for the {self.pretty_name!r} metric requires a "
-                "condition, but none was provided."
-            )
-        if condition is not None:
-            return self.user_prompt.format(
-                prediction=prediction, condition=self.condition_formatting_fn(condition)
-            )
-        return self.user_prompt.format(prediction=prediction)
-
-
-class SpeedMetric(Metric):
-    """Speed metric."""
-
-    def __init__(self, name: str, pretty_name: str) -> None:
-        """Initialise the speed metric.
-
-        Args:
-            name:
-                The name of the metric in snake_case.
-            pretty_name:
-                The pretty name of the metric, used for display purposes.
-        """
-        super().__init__(
-            name=name,
-            pretty_name=pretty_name,
-            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
-        )
-
-    def __call__(
-        self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
-    ) -> float | None:
-        """Not used with the speed metric, but required for consistency."""
-        raise NotImplementedError
-
-
-mcc_metric = HuggingFaceMetric(
-    name="mcc",
-    pretty_name="Matthew's Correlation Coefficient",
-    huggingface_id="matthews_correlation",
-    results_key="matthews_correlation",
-)
-
-macro_f1_metric = HuggingFaceMetric(
-    name="macro_f1",
-    pretty_name="Macro-average F1-score",
-    huggingface_id="f1",
-    results_key="f1",
-    compute_kwargs=dict(average="macro"),
-)
-
-micro_f1_metric = HuggingFaceMetric(
-    name="micro_f1",
-    pretty_name="Micro-average F1-score with MISC tags",
-    huggingface_id="seqeval",
-    results_key="overall_f1",
-)
-
-micro_f1_no_misc_metric = HuggingFaceMetric(
-    name="micro_f1_no_misc",
-    pretty_name="Micro-average F1-score without MISC tags",
-    huggingface_id="seqeval",
-    results_key="overall_f1",
-)
-
-f1_metric = HuggingFaceMetric(
-    name="f1",
-    pretty_name="F1-score",
-    huggingface_id="squad_v2",
-    results_key="f1",
-    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
-)
-
-em_metric = HuggingFaceMetric(
-    name="em",
-    pretty_name="Exact Match",
-    huggingface_id="squad_v2",
-    results_key="exact",
-    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
-)
-
-bert_score_metric = HuggingFaceMetric(
-    name="bertscore",
-    pretty_name="BERTScore",
-    huggingface_id="bertscore",
-    results_key="f1",
-    compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
-    ),
-)
-
-rouge_l_metric = HuggingFaceMetric(
-    name="rouge_l", pretty_name="ROUGE-L", huggingface_id="rouge", results_key="rougeL"
-)
-
-accuracy_metric = HuggingFaceMetric(
-    name="accuracy",
-    pretty_name="Accuracy",
-    huggingface_id="accuracy",
-    results_key="accuracy",
-)
-
-
-class Fluency(BaseModel):
-    """Response format for the fluency metric.
-
-    Attributes:
-        fluency:
-            The fluency rating, an integer between 1 and 5.
-    """
-
-    fluency: t.Annotated[int, Field(ge=1, le=5)]
-
-
-# Example LLM-as-a-judge metric, to measure the fluency of the LLM output
-fluency_metric = LLMAsAJudgeMetric(
-    name="fluency",
-    pretty_name="Fluency",
-    judge_id="gpt-4o-mini",
-    judge_kwargs=dict(temperature=0.0),
-    user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
-    "with the following definitions:\n"
-    "- 1: Very poor fluency, many grammatical errors\n"
-    "- 2: Poor fluency, several grammatical errors\n"
-    "- 3: Average fluency, a few grammatical errors\n"
-    "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
-    "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
-    "Text: {prediction!r}\n\n"
-    "Output your rating as a JSON object with a single key 'fluency'.",
-    response_format=Fluency,
-    scoring_fn=lambda output: (output.fluency - 1) / 4.0,
-)
-
-speed_metric = SpeedMetric(name="speed", pretty_name="Tokens per second")
-
-speed_short_metric = SpeedMetric(
-    name="speed_short", pretty_name="Tokens per second on short documents"
-)
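For reference, the metric objects defined in the removed module above were plain callables: you pass predictions, references and an optional dataset, get a raw score back, and the metric's postprocessing_fn turns that raw score into a display value. A minimal usage sketch against this 15.x API (assumes euroeval 15.16.0 is installed; the first call loads the matthews_correlation module via the evaluate package):

# Usage sketch of the removed 15.x metrics API defined above.
from euroeval.metrics import mcc_metric

raw = mcc_metric(predictions=[1, 0, 1, 0], references=[1, 1, 1, 0], dataset=None)
if raw is not None:
    score, formatted = mcc_metric.postprocessing_fn(raw)
    print(f"{mcc_metric.pretty_name}: {formatted}")  # Matthew's Correlation Coefficient: 57.74%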
euroeval-15.16.0.dist-info/RECORD DELETED
@@ -1,63 +0,0 @@
-euroeval/__init__.py,sha256=ZZoVc6tKWz_h8Pw2n26PV-q_Gd4TM_02O235ZBRUNJw,3756
-euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
-euroeval/benchmarker.py,sha256=6qo0ytRnvZLxTQZvo2Fryox5DFHGrLsa0tVGquLHdTQ,48419
-euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
-euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
-euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
-euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
-euroeval/data_models.py,sha256=qSCNq3PV7qo--gibqEvvu4cXkEkhGGAb6UiZW8U_KiU,22031
-euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
-euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
-euroeval/finetuning.py,sha256=Wzagme1n3lSZLWX0WbKMHtSUlAZr8t8_FJvggDZf72c,11393
-euroeval/generation.py,sha256=lmvu__6w3cLxi0zBtXSlyZvV8CJpV3BdajUoIEA9ElA,11639
-euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
-euroeval/human_evaluation.py,sha256=FLuTl1DHxCiWB_laVVQHIH86yXvA_ZeNNSrUmyExZXI,27579
-euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
-euroeval/metrics.py,sha256=m8nVnxUnwmIrlBfW8pkN4FCMjW3Sbg9Iq4oMZFAicEc,16227
-euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
-euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
-euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
-euroeval/scores.py,sha256=TatSbjia7Zwj71gQFyV_gCHyppMbOgeaZgNCib8G86k,2849
-euroeval/speed_benchmark.py,sha256=6bFGeMmtdl_6owkxNQ3ZKiyQQS58k0NApzlsbDgBW5s,4037
-euroeval/tasks.py,sha256=btxf29M5rUP7JjBl6u9aQlHQAxrJNP4bRbdEQtDnmDA,3376
-euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
-euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
-euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
-euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
-euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
-euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
-euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
-euroeval/benchmark_modules/litellm.py,sha256=ibdbOmxAO1VsuZX4uUs5MQ8pFPfqPJoleOOjAim3syY,55493
-euroeval/benchmark_modules/vllm.py,sha256=7PhfqqeRGFdzOL-RBJbrHEAMGfwrVWngF14dSeq9IpI,39072
-euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
-euroeval/dataset_configs/danish.py,sha256=0lDtvpgszXY1XaPjTU8yA3oNCU8W2OllvrBWgn6pkhk,4027
-euroeval/dataset_configs/dutch.py,sha256=ekZxLL9d09BUMijCxy9EFa2heNQVvySPySOjhWdtJc8,3815
-euroeval/dataset_configs/english.py,sha256=uQAaGWpHk8xqFCeIhmmPXYTb1cZomeEdRaRe9qIZQrg,2858
-euroeval/dataset_configs/faroese.py,sha256=gkgxQTWGFbfg9Eo1z-NSLROgKDcaij9tAN2mfgtrt0M,1647
-euroeval/dataset_configs/finnish.py,sha256=UZwy0_d17O2L-v2AKOu3OlDwFPcLGTZNAOt7ZKlr4K8,2679
-euroeval/dataset_configs/french.py,sha256=Hei2M4bGIz8hVtaPKQlQATcmK-0bFBNEocEszR3gia0,3014
-euroeval/dataset_configs/german.py,sha256=sRYtOl6CYf4kZkeINfff6xoKBG4OsDxb2b72lKwELGc,3192
-euroeval/dataset_configs/icelandic.py,sha256=g21IHjcwEZvf_yJ9PobeuBOqRiLOk0oCdEjY34g-UMk,4497
-euroeval/dataset_configs/italian.py,sha256=4SEmdUyfGbbwMPhv_9nL3JNJtoDKHLAlWuvr7Ihmi9o,3294
-euroeval/dataset_configs/norwegian.py,sha256=-WvQM44xCwjrqBzlAy4rjf6v87fGera2JmZV_069TeQ,6003
-euroeval/dataset_configs/portuguese.py,sha256=3SqbwD0PNTILGALzh50pVoEwC-spRD75ZeE2NEj151E,2367
-euroeval/dataset_configs/spanish.py,sha256=Bm0Z19Mh2qYXR0RIRlqEkzfVb5KiqJRectfuY7JLql4,3192
-euroeval/dataset_configs/swedish.py,sha256=js4paNsuC0nQzPpf6_BzHBf7MT60XUpP1-qM2uxRtQs,3445
-euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
-euroeval/prompt_templates/multiple_choice.py,sha256=wHnQCE5bv947L6hSK5zJitE37V-PbuNYAp156mWaIYA,5494
-euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
-euroeval/prompt_templates/reading_comprehension.py,sha256=3Nch-9zHfUDIwy-k5mP-TRhHQRQ9nad8HdhpJ1S8nGc,7072
-euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
-euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
-euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
-euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
-euroeval/task_group_utils/question_answering.py,sha256=6jpiHukzA7IrJh4vVYyZDDyvD5Xc2GsxoXzpm_PHpXw,27503
-euroeval/task_group_utils/sequence_classification.py,sha256=ihJO55f3Dy565d3ByYGMuSINasnjAADaTrM59LwZzA0,12977
-euroeval/task_group_utils/text_to_text.py,sha256=go0y6X9QAv5iywlLAclb8cqFX_3QlAT-1-VNZ9zMWFA,4832
-euroeval/task_group_utils/token_classification.py,sha256=BDqOfopdH5Bbj67HTEbZd9KZtNCDNket8NrCTfxZFzQ,17773
-euroeval-15.16.0.dist-info/METADATA,sha256=_oeIq0ZGzS0i7n51NdhNhuDX2A3_lDjYDD-6KgB1rW0,13536
-euroeval-15.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.16.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.16.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
-euroeval-15.16.0.dist-info/RECORD,,
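The RECORD file above follows the standard wheel format: one "path,hash,size" line per file, where the hash is the urlsafe base64-encoded SHA-256 digest with the trailing "=" padding stripped, and the RECORD entry for itself leaves the hash and size fields empty. A generic verification sketch for one such entry (not EuroEval-specific code, assuming the wheel has been unpacked to the current directory):

# Sketch: verify a single wheel RECORD entry ("path,sha256=<digest>,<size>").
import base64
import hashlib
from pathlib import Path


def check_record_entry(entry: str, root: Path = Path(".")) -> bool:
    """Return True if the file named in a RECORD line matches its recorded hash and size."""
    path, hash_spec, size = entry.rsplit(",", 2)
    if not hash_spec:  # the RECORD file lists itself with empty hash and size fields
        return True
    data = (root / path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return hash_spec == f"sha256={digest.decode()}" and int(size) == len(data)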