EuroEval 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (51)
  1. euroeval/__init__.py +5 -0
  2. euroeval/benchmark_config_factory.py +6 -1
  3. euroeval/benchmark_modules/base.py +2 -0
  4. euroeval/benchmark_modules/fresh.py +7 -1
  5. euroeval/benchmark_modules/hf.py +26 -21
  6. euroeval/benchmark_modules/litellm.py +258 -131
  7. euroeval/benchmark_modules/vllm.py +120 -68
  8. euroeval/benchmarker.py +11 -2
  9. euroeval/cli.py +14 -1
  10. euroeval/constants.py +7 -1
  11. euroeval/data_models.py +95 -20
  12. euroeval/dataset_configs/__init__.py +1 -0
  13. euroeval/dataset_configs/danish.py +14 -3
  14. euroeval/dataset_configs/dutch.py +14 -0
  15. euroeval/dataset_configs/english.py +22 -0
  16. euroeval/dataset_configs/estonian.py +15 -7
  17. euroeval/dataset_configs/finnish.py +14 -0
  18. euroeval/dataset_configs/french.py +14 -0
  19. euroeval/dataset_configs/german.py +23 -0
  20. euroeval/dataset_configs/italian.py +14 -0
  21. euroeval/dataset_configs/latvian.py +14 -0
  22. euroeval/dataset_configs/norwegian.py +14 -0
  23. euroeval/dataset_configs/polish.py +126 -0
  24. euroeval/dataset_configs/portuguese.py +14 -0
  25. euroeval/dataset_configs/spanish.py +14 -0
  26. euroeval/dataset_configs/swedish.py +25 -0
  27. euroeval/enums.py +12 -0
  28. euroeval/generation.py +17 -8
  29. euroeval/generation_utils.py +102 -16
  30. euroeval/metrics/pipeline.py +51 -9
  31. euroeval/model_cache.py +13 -1
  32. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  33. euroeval/prompt_templates/multiple_choice.py +27 -1
  34. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  35. euroeval/prompt_templates/reading_comprehension.py +11 -0
  36. euroeval/prompt_templates/sentiment_classification.py +15 -0
  37. euroeval/prompt_templates/summarization.py +27 -1
  38. euroeval/scores.py +5 -0
  39. euroeval/task_group_utils/multiple_choice_classification.py +2 -2
  40. euroeval/task_group_utils/question_answering.py +29 -29
  41. euroeval/task_group_utils/sequence_classification.py +71 -81
  42. euroeval/task_group_utils/token_classification.py +17 -3
  43. euroeval/tasks.py +12 -10
  44. euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
  45. euroeval/utils.py +67 -3
  46. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
  47. euroeval-16.1.0.dist-info/RECORD +70 -0
  48. euroeval-16.0.0.dist-info/RECORD +0 -69
  49. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
  50. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
  51. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py CHANGED
@@ -44,7 +44,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import apply_prompt, extract_few_shot_examples
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -52,7 +56,7 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
     apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
@@ -69,6 +73,7 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -97,6 +102,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
 
     def __init__(
         self,
@@ -120,42 +126,46 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
+
+        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+        # to call the `__init__` method of the `BenchmarkModule` class.
+        super(HuggingFaceEncoderModel, self).__init__(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokeniser=self._tokeniser
+            tokeniser=self._tokeniser, generative_type=self.generative_type
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
             tokeniser=self._tokeniser,
             model_id=model_config.model_id,
-            is_reasoning_model=self.end_of_reasoning_token is not None,
-        )
-
-        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
-        # to call the `__init__` method of the `BenchmarkModule` class.
-        super(HuggingFaceEncoderModel, self).__init__(
-            model_config=model_config,
-            dataset_config=dataset_config,
-            benchmark_config=benchmark_config,
-            log_metadata=log_metadata,
+            generative_type=self.generative_type,
         )
 
         self.buffer |= dict(
-            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
                 tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
                 log_metadata=self.log_metadata,
-            ),
+            )
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
@@ -187,16 +197,36 @@ class VLLMModel(HuggingFaceEncoderModel):
             The generative type of the model, or None if it has not been set yet.
         """
         if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.end_of_reasoning_token is not None:
-            return GenerativeType.REASONING
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
             has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-            return GenerativeType.INSTRUCTION_TUNED
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-            return GenerativeType.BASE
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
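Note on the generative-type hunk above: the property now resolves the type in a fixed order of precedence, namely an explicit benchmark-config override, then the new thinking/no-thinking model-ID parameter, then a detected end-of-reasoning token, then the chat-template/"instruct" heuristic, and finally a plain base model. A minimal standalone sketch of that precedence (the function name and boolean flags are illustrative, not part of EuroEval's API; the real GenerativeType enum lives in euroeval/enums.py):

    from enum import Enum, auto

    class GenerativeType(Enum):
        BASE = auto()
        INSTRUCTION_TUNED = auto()
        REASONING = auto()

    def resolve_generative_type(
        explicit: GenerativeType | None,  # benchmark_config.generative_type override
        param: str | None,                # '#thinking' / '#no-thinking' model-ID suffix
        has_reasoning_token: bool,        # an end-of-reasoning token was detected
        looks_instruction_tuned: bool,    # chat template present or 'instruct' in the ID
    ) -> GenerativeType:
        if explicit is not None:
            return explicit
        if param == "thinking":
            return GenerativeType.REASONING
        if param == "no-thinking":
            return GenerativeType.INSTRUCTION_TUNED
        if has_reasoning_token:
            return GenerativeType.REASONING
        if looks_instruction_tuned:
            return GenerativeType.INSTRUCTION_TUNED
        return GenerativeType.BASE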
@@ -285,7 +315,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                instruction_model=self.buffer["instruction_model"],
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
                 tokeniser=self._tokeniser,
             ),
@@ -313,7 +343,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.buffer["instruction_model"] is False:
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
         if self._tokeniser.pad_token_id is not None:
             assert isinstance(self._tokeniser.pad_token, str), (
@@ -337,31 +367,6 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)
 
-        structured_generation_schema = None
-        if self.dataset_config.task.uses_structured_output:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +387,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                 "error was. Skipping this evaluation."
             )
 
-        # Define the guided decoding that we will use for structured generation
-        if structured_generation_schema is not None:
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
             guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
             guided_decoding = GuidedDecodingParams(
@@ -392,8 +418,17 @@
                     for label in self.dataset_config.labels
                 ]
             )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
         else:
             guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )
 
         # Define the parameters used for vLLM generation
         max_tokens: int = (
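For reference, the structured-generation schema built in the hunks above uses pydantic's create_model and conlist: each NER tag becomes a required field holding at most five strings, and the resulting JSON schema is what gets handed to vLLM's guided decoding as GuidedDecodingParams(json=...). A small self-contained sketch, with illustrative tag names (in EuroEval they come from dataset_config.prompt_label_mapping):

    from pydantic import conlist, create_model

    # Illustrative NER tag names
    ner_tag_names = ["person", "location", "organisation", "miscellaneous"]

    keys_and_their_types = {
        tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

    schema = AnswerFormat.model_json_schema()
    print(sorted(schema["properties"]))
    # ['location', 'miscellaneous', 'organisation', 'person']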
@@ -425,9 +460,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if not self.buffer.get(
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
@@ -439,6 +472,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
@@ -466,12 +500,19 @@ class VLLMModel(HuggingFaceEncoderModel):
                     "Prompts are too long, so truncating them and trying again..."
                 )
                 logger.debug(f"The error message was: {str(e)}")
+
+                # If we have already tried truncating the prompts a few times, then
+                # we truncate a bit more aggressively
+                extra_truncation = 50 * truncation_attempts
+                truncation_attempts += 1
+
                 tokenized_prompts = self._tokeniser(
                     text=prompts,
                     truncation=True,
                     max_length=max(
                         min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-                        - max_tokens,
+                        - max_tokens
+                        - extra_truncation,
                         0,
                     ),
                 )
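The retry loop above now tightens the truncation budget by an extra 50 tokens per failed attempt. A worked example of the arithmetic, with MAX_CONTEXT_LENGTH = 8_192 as in euroeval/constants.py and illustrative values for the other two quantities:

    MAX_CONTEXT_LENGTH = 8_192  # as in euroeval/constants.py
    model_max_length = 131_072  # illustrative tokeniser limit
    max_tokens = 256            # illustrative generation budget

    for truncation_attempts in range(3):
        extra_truncation = 50 * truncation_attempts
        max_length = max(
            min(model_max_length, MAX_CONTEXT_LENGTH) - max_tokens - extra_truncation, 0
        )
        print(truncation_attempts, max_length)
    # 0 7936
    # 1 7886
    # 2 7836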
@@ -577,9 +618,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
 
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
+
         model_info = get_model_repo_info(
             model_id=model_id, revision=revision, benchmark_config=benchmark_config
         )
@@ -603,11 +645,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -616,8 +658,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
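The three hunks above replace the ad-hoc model_id.split("@") parsing with the new split_model_id helper from euroeval/utils.py, which also threads the new param component through to ModelConfig. The helper's body is not part of this diff; the following is only a plausible reconstruction of its behaviour, with a hypothetical return type name:

    from dataclasses import dataclass

    @dataclass
    class ModelIdComponents:  # hypothetical name; the real type lives in euroeval/utils.py
        model_id: str
        revision: str
        param: str | None

    def split_model_id(model_id: str) -> ModelIdComponents:
        """Sketch: split 'org/model@revision#param' into its components."""
        param: str | None = None
        if "#" in model_id:
            model_id, param = model_id.split("#", 1)
        revision = "main"
        if "@" in model_id:
            model_id, revision = model_id.split("@", 1)
        return ModelIdComponents(model_id=model_id, revision=revision, param=param)

    print(split_model_id("org/model@dev#thinking"))
    # ModelIdComponents(model_id='org/model', revision='dev', param='thinking')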
@@ -972,7 +1015,11 @@ def get_end_of_reasoning_token(
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -1050,7 +1097,7 @@ def get_custom_stop_tokens(
     model: "LLM",
     tokeniser: "PreTrainedTokenizer",
     model_id: str,
-    is_reasoning_model: bool,
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
 
@@ -1061,9 +1108,8 @@
             The tokeniser.
         model_id:
             The model ID.
-        is_reasoning_model:
-            Whether the model is a reasoning model. This is used to determine the number
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
 
     Returns:
         A list of stop tokens.
@@ -1073,12 +1119,18 @@
     prompt = "Hello"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
+    )
     completion = (
         model.generate(
             prompts=[prompt],
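The updated apply_chat_template calls above go through EuroEval's wrapper in euroeval/tokenisation_utils.py, now passing tokenise=False, add_generation_prompt=True and an enable_thinking flag derived from the generative type. Under the hood this corresponds roughly to the Hugging Face tokeniser call sketched below; the model ID is illustrative, and enable_thinking is simply forwarded to chat templates (such as Qwen3's) that understand it:

    from transformers import AutoTokenizer

    tokeniser = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")  # illustrative model

    prompt = tokeniser.apply_chat_template(
        [{"role": "user", "content": "What is your name?"}],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,  # extra kwargs are passed through to the chat template
    )
    print(prompt)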
euroeval/benchmarker.py CHANGED
@@ -19,7 +19,7 @@ from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device, ModelType
+from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
@@ -79,6 +79,7 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.9,
+        generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
@@ -151,6 +152,10 @@
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to None.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -199,6 +204,7 @@
             api_base=api_base,
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
+            generative_type=generative_type,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,
@@ -438,7 +444,7 @@
 
         # Skip if the model type should not be benchmarked on this dataset
         model_type = model_config.model_type
-        allowed_model_types = dataset_config.task.allowed_model_types
+        allowed_model_types = dataset_config.allowed_model_types
         if model_type not in allowed_model_types:
             logger.debug(
                 f"Skipping benchmarking {model_id} on "
@@ -804,6 +810,7 @@
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
+            model_param=model_config.param,
         )
 
         record = BenchmarkResult(
@@ -1108,6 +1115,8 @@
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:
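With the new generative_type argument, the automatically inferred type can be overridden from the Python API as well as from the CLI (see the --generative-type option added to cli.py below). A minimal usage sketch; the placeholder model ID is illustrative and the benchmark() call follows the usual EuroEval entry point:

    from euroeval import Benchmarker
    from euroeval.enums import GenerativeType

    # Force evaluation as a reasoning model instead of relying on auto-detection
    benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
    benchmarker.benchmark(model="<model-id>")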
euroeval/cli.py CHANGED
@@ -4,7 +4,7 @@ import click
 
 from .benchmarker import Benchmarker
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
 from .tasks import get_all_tasks
 
@@ -208,6 +208,14 @@ from .tasks import get_all_tasks
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -234,6 +242,7 @@ def benchmark(
     gpu_memory_utilization: float,
     debug: bool,
     requires_safetensors: bool,
+    generative_type: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -244,6 +253,9 @@
     tasks = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
+    generative_type_obj = (
+        GenerativeType[generative_type.upper()] if generative_type else None
+    )
 
     benchmarker = Benchmarker(
         language=languages,
@@ -268,6 +280,7 @@
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type_obj,
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,
euroeval/constants.py CHANGED
@@ -15,7 +15,7 @@ MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS = 32_768
+REASONING_MAX_TOKENS = 8_192
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10