EuroEval 15.9.2-py3-none-any.whl → 15.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



euroeval/benchmark_modules/hf.py CHANGED
@@ -378,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
378
378
  tokenizer=self._tokenizer,
379
379
  ),
380
380
  batched=True,
381
- batch_size=1,
381
+ batch_size=10,
382
382
  remove_columns=dataset["train"].column_names,
383
383
  load_from_cache_file=False,
384
384
  keep_in_memory=True,
@@ -389,7 +389,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
389
389
  tokenizer=self._tokenizer,
390
390
  ),
391
391
  batched=True,
392
- batch_size=1,
392
+ batch_size=10,
393
393
  remove_columns=dataset["val"].column_names,
394
394
  load_from_cache_file=False,
395
395
  keep_in_memory=True,
@@ -400,7 +400,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
400
400
  tokenizer=self._tokenizer,
401
401
  ),
402
402
  batched=True,
403
- batch_size=1,
403
+ batch_size=10,
404
404
  remove_columns=dataset["test"].column_names,
405
405
  load_from_cache_file=False,
406
406
  keep_in_memory=True,
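The three hunks above only bump the `batch_size` passed to `datasets.Dataset.map` from 1 to 10 when tokenising the train, validation and test splits. As a rough illustration of what such a call looks like (a minimal sketch with a placeholder tokeniser and column name, not EuroEval's exact code):

```python
# Minimal sketch of a batched tokenisation map, mirroring the change above.
# The checkpoint and column names are placeholders, not EuroEval's.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_batch(examples: dict) -> dict:
    # With batched=True, examples["text"] is a list of up to `batch_size` strings
    return tokenizer(examples["text"], truncation=True, padding=True)

dataset = Dataset.from_dict({"text": ["first example", "second example"]})
tokenised = dataset.map(
    tokenize_batch,
    batched=True,
    batch_size=10,  # was 1 in 15.9.2
    remove_columns=dataset.column_names,
    load_from_cache_file=False,
    keep_in_memory=True,
)
```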
euroeval/benchmark_modules/litellm.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Generative models from an inference API, using the LiteLLM framework."""
2
2
 
3
+ import asyncio
3
4
  import collections.abc as c
4
5
  import logging
5
6
  import os
@@ -29,6 +30,7 @@ from litellm.exceptions import (
29
30
  Timeout,
30
31
  )
31
32
  from litellm.llms.vertex_ai.common_utils import VertexAIError
33
+ from litellm.router import Router
32
34
  from litellm.types.utils import ChoiceLogprobs, ModelResponse
33
35
  from pydantic import conlist, create_model
34
36
  from requests.exceptions import RequestException
@@ -68,7 +70,7 @@ from ..task_group_utils import (
68
70
  from ..tokenization_utils import get_first_label_token_mapping
69
71
  from ..types import ExtractLabelsFunction
70
72
  from ..utils import (
71
- catch_coroutine_exception,
73
+ add_semaphore_and_catch_exception,
72
74
  create_model_cache_dir,
73
75
  log_once,
74
76
  safe_run,
@@ -201,6 +203,11 @@ class LiteLLMModel(BenchmarkModule):
201
203
  self.is_ollama = model_config.model_id.startswith(
202
204
  "ollama/"
203
205
  ) or model_config.model_id.startswith("ollama_chat/")
206
+ self._ollama_show: ollama.ShowResponse = (
207
+ ollama.show("/".join(model_config.model_id.split("/")[1:]))
208
+ if self.is_ollama
209
+ else ollama.ShowResponse(model_info=None)
210
+ )
204
211
 
205
212
  raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
206
213
 
@@ -224,7 +231,14 @@ class LiteLLMModel(BenchmarkModule):
224
231
  Returns:
225
232
  The generative type of the model, or None if it has not been set yet.
226
233
  """
227
- if self.model_config.revision == "thinking":
234
+ if self.is_ollama:
235
+ reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
236
+ type_ = (
237
+ GenerativeType.REASONING
238
+ if reasoning_model
239
+ else GenerativeType.INSTRUCTION_TUNED
240
+ )
241
+ elif self.model_config.revision in {"thinking"}:
228
242
  type_ = GenerativeType.REASONING
229
243
  elif re.fullmatch(
230
244
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
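The new `generative_type` branch above decides between reasoning and instruction-tuned behaviour for Ollama models by inspecting the capabilities reported by `ollama.show`. A rough sketch of that check, assuming a running local Ollama server and a hypothetical model ID:

```python
# Sketch of the capability check above; requires a running Ollama server and an
# ollama Python SDK that exposes `capabilities` on ShowResponse (>= 0.5).
import ollama

model_id = "ollama_chat/qwen3:4b"  # hypothetical model ID
ollama_name = "/".join(model_id.split("/")[1:])  # strip the "ollama_chat/" prefix

show_response = ollama.show(ollama_name)
is_reasoning = "thinking" in (show_response.capabilities or [])
print("reasoning model" if is_reasoning else "instruction-tuned model")
```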
@@ -251,8 +265,18 @@ class LiteLLMModel(BenchmarkModule):
251
265
  The generated model outputs.
252
266
  """
253
267
  assert "messages" in inputs, "The input must contain a 'messages' key."
254
- messages = inputs["messages"]
268
+ conversations: list[list[litellm.AllMessageValues]] = inputs["messages"]
255
269
 
270
+ # Get the mapping from labels to the first token in the label. We call this each
271
+ # time we generate a new dataset since the dataset config can change
272
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
273
+ dataset_config=self.dataset_config,
274
+ model_config=self.model_config,
275
+ tokenizer=None,
276
+ generative_type=self.generative_type,
277
+ )
278
+
279
+ # Set the core generation arguments
256
280
  generation_kwargs: dict[str, t.Any] = dict(
257
281
  model=self.model_config.model_id,
258
282
  max_completion_tokens=(
@@ -266,33 +290,30 @@ class LiteLLMModel(BenchmarkModule):
266
290
  api_key=self.benchmark_config.api_key,
267
291
  api_base=self.benchmark_config.api_base,
268
292
  api_version=self.benchmark_config.api_version,
293
+ max_retries=3,
269
294
  )
270
295
 
271
- # Get the mapping from labels to the first token in the label. We call this each
272
- # time we generate a new dataset since the dataset config can change
273
- self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
274
- dataset_config=self.dataset_config,
275
- model_config=self.model_config,
276
- tokenizer=None,
277
- generative_type=self.generative_type,
278
- )
279
-
280
- if self.buffer["first_label_token_mapping"]:
281
- generation_kwargs["logprobs"] = True
282
- generation_kwargs["top_logprobs"] = MAX_LOGPROBS
283
-
296
+ # Set up the `response_format` generation argument if we are dealing with a task
297
+ # using structured generation
284
298
  if self.dataset_config.task in TASKS_USING_JSON:
285
- for msg_list in messages:
286
- # msg_list is a list of {'role':…, 'content':…} dicts
287
- if not msg_list:
299
+ # Sanity check that "JSON" is included in the prompt, as some models require
300
+ # this
301
+ for conversation in conversations:
302
+ if not conversation:
288
303
  raise InvalidBenchmark(
289
- "Encountered an empty message list in 'messages'."
304
+ "Encountered an empty conversation in 'messages'."
290
305
  )
291
- last = msg_list[-1]
292
- assert isinstance(last, dict), (
293
- f"Expected dict message, got {type(last)}"
306
+ last_message = conversation[-1]
307
+ assert isinstance(last_message, dict), (
308
+ f"Expected dict message, got {type(last_message)}"
294
309
  )
295
- assert "json" in last["content"].lower(), (
310
+ assert "content" in last_message, (
311
+ "Expected 'content' key in the last message of the conversation."
312
+ )
313
+ assert isinstance(last_message["content"], str), (
314
+ "Expected 'content' to be a string."
315
+ )
316
+ assert "json" in last_message["content"].lower(), (
296
317
  "Prompt must contain 'json' for JSON tasks."
297
318
  )
298
319
 
@@ -328,6 +349,19 @@ class LiteLLMModel(BenchmarkModule):
328
349
  level=logging.DEBUG,
329
350
  )
330
351
 
352
+ # If the model is an Ollama reasoning model, we ensure that thinking is enabled
353
+ if self.is_ollama and self.generative_type == GenerativeType.REASONING:
354
+ generation_kwargs["think"] = True
355
+ log_once(
356
+ "Enabling thinking mode for Ollama model "
357
+ f"{self.model_config.model_id!r}",
358
+ level=logging.DEBUG,
359
+ )
360
+
361
+ # Handle manually set parameters
362
+ if self.buffer["first_label_token_mapping"]:
363
+ generation_kwargs["logprobs"] = True
364
+ generation_kwargs["top_logprobs"] = MAX_LOGPROBS
331
365
  if self.model_config.revision == "thinking":
332
366
  generation_kwargs["thinking"] = dict(
333
367
  type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
@@ -344,66 +378,67 @@ class LiteLLMModel(BenchmarkModule):
344
378
  level=logging.DEBUG,
345
379
  )
346
380
 
347
- # This drops generation kwargs that are not supported by the model
381
+ # Drop generation kwargs that are not supported by the model
348
382
  litellm.drop_params = True
349
383
 
350
- # Extract the generated sequences from the model response. Some APIs cannot
351
- # handle using newlines as stop sequences, so we try both.
352
- num_attempts = 10
353
-
354
- all_responses = {}
355
- all_failures = []
356
- to_run = list(enumerate(messages))
357
-
358
- for attempt in range(num_attempts):
359
- if not to_run:
384
+ all_responses: dict[int, ModelResponse] = {}
385
+ conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
386
+ enumerate(conversations)
387
+ )
388
+ for attempt in range(num_attempts := 10):
389
+ if not conversations_to_run:
360
390
  break
361
391
 
362
- batch_indices, batch_msgs = zip(*to_run)
363
- model_response, failures = safe_run(
392
+ batch_indices, batch_conversations = zip(*conversations_to_run)
393
+ successes, failures = safe_run(
364
394
  self._generate_async(
365
- messages=list(batch_msgs),
366
- generation_kwargs=generation_kwargs,
367
- max_retries=3,
368
- max_reruns=15,
395
+ model_id=self.model_config.model_id,
396
+ conversations=list(batch_conversations),
397
+ **generation_kwargs,
369
398
  )
370
399
  )
371
400
 
372
- for orig_idx, response in zip(batch_indices, model_response):
401
+ # Store the successful model outputs
402
+ for idx, response in successes:
403
+ orig_idx = batch_indices[idx]
373
404
  all_responses[orig_idx] = response
374
405
 
406
+ # If all requests were successful, break
375
407
  if not failures:
376
- to_run = []
408
+ conversations_to_run = []
377
409
  break
378
410
 
379
- all_failures.extend(failures)
380
- to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
411
+ # Put the failed requests back in the queue to try again
412
+ conversations_to_run = [
413
+ (batch_indices[idx], conversations[batch_indices[idx]])
414
+ for idx, _ in failures
415
+ ]
381
416
  logger.debug(
382
- f"Attempt {attempt + 1}/{num_attempts}: "
383
- f"retrying {len(to_run)} failed message(s)"
417
+ f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
418
+ f"{len(conversations_to_run):,} failed message(s)"
384
419
  )
385
420
 
421
+ # Attempt to handle the exceptions, to improve the chance of getting
422
+ # successful generations next time around
386
423
  for _, error in failures:
387
424
  self._handle_exception(error=error, generation_kwargs=generation_kwargs)
388
- else:
389
- raise InvalidBenchmark(
390
- message=f"Failed to generate text, after {num_attempts} attempts."
391
- )
392
425
 
393
- if to_run:
426
+ # Sleep for a second to avoid pinging the API server too quickly
427
+ sleep(1)
428
+ else:
394
429
  raise InvalidBenchmark(
395
- f"Failed to generate text after {num_attempts} attempts. "
396
- f"Errors: {all_failures}"
430
+ message=f"Failed to generate text, after {num_attempts:,} attempts."
397
431
  )
398
432
 
399
- ordered_responses = [all_responses[i] for i in range(len(messages))]
433
+ # Extract the generations from the model output
434
+ ordered_responses = [all_responses[i] for i in range(len(conversations))]
400
435
  model_output = self._create_model_output(
401
436
  model_responses=ordered_responses, model_id=self.model_config.model_id
402
437
  )
403
438
 
404
- if len(messages) != len(model_output.sequences):
439
+ if len(conversations) != len(model_output.sequences):
405
440
  raise InvalidBenchmark(
406
- f"Number of model inputs ({len(messages):,}) does not match the "
441
+ f"Number of model inputs ({len(conversations):,}) does not match the "
407
442
  f"number of model outputs ({len(model_output.sequences):,})."
408
443
  )
409
444
 
@@ -462,8 +497,8 @@ class LiteLLMModel(BenchmarkModule):
462
497
  f"The model {model_id!r} does not support logprobs, so disabling it.",
463
498
  level=logging.DEBUG,
464
499
  )
465
- generation_kwargs.pop("logprobs")
466
- generation_kwargs.pop("top_logprobs")
500
+ generation_kwargs.pop("logprobs", None)
501
+ generation_kwargs.pop("top_logprobs", None)
467
502
  return
468
503
  elif any(msg.lower() in error_msg for msg in temperature_messages):
469
504
  log_once(
@@ -471,7 +506,7 @@ class LiteLLMModel(BenchmarkModule):
471
506
  "temperature, so disabling it.",
472
507
  level=logging.DEBUG,
473
508
  )
474
- generation_kwargs.pop("temperature")
509
+ generation_kwargs.pop("temperature", None)
475
510
  return
476
511
  elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
477
512
  log_once(
@@ -503,14 +538,7 @@ class LiteLLMModel(BenchmarkModule):
503
538
  generation_kwargs["response_format"] = dict(type="json_object")
504
539
  return
505
540
  elif isinstance(
506
- error,
507
- (
508
- APIConnectionError,
509
- Timeout,
510
- ServiceUnavailableError,
511
- InternalServerError,
512
- SystemError,
513
- ),
541
+ error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
514
542
  ):
515
543
  logger.debug(
516
544
  f"Service temporarily unavailable. The error message was: {error}. "
@@ -518,6 +546,18 @@ class LiteLLMModel(BenchmarkModule):
518
546
  )
519
547
  sleep(5)
520
548
  return
549
+ elif isinstance(error, (APIConnectionError, OSError)):
550
+ # If there are too many I/O connections, we increase the number of allowed
551
+ # file descriptors
552
+ if "too many open files" in error_msg:
553
+ raise InvalidBenchmark(
554
+ "There are too many file descriptors running. See the current "
555
+ "value by running `ulimit -n`. Try increasing it by running "
556
+ "`ulimit -n <new-value>` and try again."
557
+ )
558
+ raise InvalidBenchmark(
559
+ f"Encountered {type(error)} during generation: {error}."
560
+ )
521
561
 
522
562
  if isinstance(error, RateLimitError):
523
563
  raise InvalidModel(
@@ -538,69 +578,66 @@ class LiteLLMModel(BenchmarkModule):
538
578
 
539
579
  async def _generate_async(
540
580
  self,
541
- messages: list[dict[str, t.Any]],
542
- generation_kwargs: dict[str, t.Any],
543
- max_retries: int,
544
- max_reruns: int,
545
- ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
581
+ model_id: str,
582
+ conversations: list[list[litellm.AllMessageValues]],
583
+ **generation_kwargs,
584
+ ) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
546
585
  """Generate outputs from the model asynchronously.
547
586
 
548
587
  Args:
549
- messages:
550
- The messages to pass to the model.
551
- generation_kwargs:
552
- The generation kwargs to pass to the model.
553
- max_retries:
554
- The maximum number of retries to make.
555
- max_reruns:
556
- The maximum number of reruns to make.
588
+ model_id:
589
+ The ID of the model to use for generation.
590
+ conversations:
591
+ The conversations to pass to the model.
592
+ **generation_kwargs:
593
+ Additional generation arguments to pass to the model.
557
594
 
558
595
  Returns:
559
- A tuple containing the successful responses and the failed responses.
596
+ A tuple (successes, failures), each being a list of tuples (idx, content),
597
+ where the `idx` corresponds to the index of `conversations`, and `content`
598
+ is either the model response or an Exception.
560
599
  """
561
- success = []
562
- all_failures = {}
563
- to_run = list(enumerate(messages))
564
- prev_fail_count = len(to_run)
565
- rerun_count = 0
566
-
567
- while to_run and rerun_count < max_reruns and prev_fail_count > 0:
568
- requests = [
569
- litellm.acompletion(
570
- messages=msg, max_retries=max_retries, **generation_kwargs
600
+ # Create a LiteLLM router, which will ensure that we only use a single client
601
+ # for all the requests, preventing "too many open files" errors
602
+ router = Router(
603
+ model_list=[
604
+ dict(
605
+ model_name=self.model_config.model_id,
606
+ litellm_params=generation_kwargs,
571
607
  )
572
- for _, msg in to_run
573
608
  ]
574
- wrapped_requests = [
575
- catch_coroutine_exception(request) for request in requests
576
- ]
577
- responses = await tqdm_async.gather(*wrapped_requests, leave=False)
578
-
579
- next_to_run = []
580
- current_fail_count = 0
609
+ )
581
610
 
582
- for (orig_idx, _), response in zip(to_run, responses):
583
- if isinstance(response, Exception):
584
- current_fail_count += 1
585
- all_failures[orig_idx] = response
586
- next_to_run.append((orig_idx, messages[orig_idx]))
587
- else:
588
- success.append(response)
611
+ # Get the LLM generations asynchronously
612
+ max_concurrent_calls = 20
613
+ semaphore = asyncio.Semaphore(max_concurrent_calls)
614
+ requests = [
615
+ add_semaphore_and_catch_exception(
616
+ router.acompletion(model=model_id, messages=conversation),
617
+ semaphore=semaphore,
618
+ )
619
+ for conversation in conversations
620
+ ]
621
+ responses = await tqdm_async.gather(*requests, leave=False)
589
622
 
590
- if current_fail_count >= prev_fail_count:
591
- logger.warning(
592
- "Retry loop aborting due to no progress: "
593
- f"current_fail_count={current_fail_count}, "
594
- f"prev_fail_count={prev_fail_count}"
595
- )
596
- break
623
+ # Separate the successful responses from the failed ones
624
+ successes = [
625
+ (idx, response)
626
+ for idx, response in enumerate(responses)
627
+ if not isinstance(response, Exception)
628
+ ]
629
+ failures = [
630
+ (idx, response)
631
+ for idx, response in enumerate(responses)
632
+ if isinstance(response, Exception)
633
+ ]
597
634
 
598
- prev_fail_count = current_fail_count
599
- to_run = next_to_run
600
- rerun_count += 1
635
+ # Close connections
636
+ for request in requests:
637
+ if hasattr(request, "close"):
638
+ request.close()
601
639
 
602
- failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
603
- return success, failures
640
+ return successes, failures
604
641
 
605
642
  @staticmethod
606
643
  def _create_model_output(
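The router introduced above funnels every request through one shared client instead of opening a new connection per call. A minimal sketch of that setup, with a hypothetical model name:

```python
# Minimal sketch of routing completions through a single litellm Router client.
import asyncio
from litellm.router import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o-mini",  # hypothetical model
            "litellm_params": {"model": "gpt-4o-mini", "max_retries": 3},
        }
    ]
)

async def main() -> None:
    response = await router.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```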
@@ -690,8 +727,7 @@ class LiteLLMModel(BenchmarkModule):
690
727
  # If it is an Ollama model then we can get the number of parameters from the
691
728
  # Ollama Python SDK
692
729
  if self.is_ollama:
693
- ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
694
- model_info = ollama.show(ollama_model_id).modelinfo
730
+ model_info = self._ollama_show.modelinfo
695
731
  if model_info is not None:
696
732
  num_params = model_info.get("general.parameter_count")
697
733
  if num_params is not None:
@@ -819,7 +855,7 @@ class LiteLLMModel(BenchmarkModule):
819
855
  # Python SDK
820
856
  if self.is_ollama:
821
857
  ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
822
- model_info = ollama.show(ollama_model_id).modelinfo
858
+ model_info = self._ollama_show.modelinfo
823
859
  if model_info is not None:
824
860
  context_length_keys = [
825
861
  key for key in model_info.keys() if "context_length" in key.lower()
euroeval/benchmark_modules/vllm.py CHANGED
@@ -7,12 +7,10 @@ import json
7
7
  import logging
8
8
  import os
9
9
  import re
10
- import sys
11
10
  import typing as t
12
11
  from functools import partial
13
12
  from pathlib import Path
14
13
  from time import sleep
15
- from types import MethodType
16
14
 
17
15
  import torch
18
16
  from datasets import DatasetDict
@@ -69,6 +67,7 @@ from ..tokenization_utils import (
69
67
  get_end_of_chat_token_ids,
70
68
  get_eos_token,
71
69
  get_first_label_token_mapping,
70
+ get_pad_token,
72
71
  should_prompts_be_stripped,
73
72
  )
74
73
  from ..types import ExtractLabelsFunction
@@ -81,17 +80,12 @@ from ..utils import (
81
80
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
82
81
 
83
82
  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
84
- from vllm import LLM, RequestOutput, SamplingParams
83
+ from vllm import LLM, SamplingParams
85
84
  from vllm.distributed.parallel_state import (
86
85
  destroy_distributed_environment,
87
86
  destroy_model_parallel,
88
87
  )
89
- from vllm.inputs import PromptType
90
88
  from vllm.lora.request import LoRARequest
91
- from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
92
- from vllm.pooling_params import PoolingParams
93
- from vllm.prompt_adapter.request import PromptAdapterRequest
94
- from vllm.sampling_params import RequestOutputKind
95
89
 
96
90
  if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
97
91
  from outlines.models.vllm import adapt_tokenizer
@@ -140,6 +134,9 @@ class VLLMModel(HuggingFaceEncoderModel):
140
134
  self.end_of_reasoning_token = get_end_of_reasoning_token(
141
135
  model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
142
136
  )
137
+ self.end_of_chat_token_ids = get_end_of_chat_token_ids(
138
+ tokenizer=self._tokenizer
139
+ )
143
140
  self.custom_stop_tokens = get_custom_stop_tokens(
144
141
  model=self._model,
145
142
  tokenizer=self._tokenizer,
@@ -193,7 +190,10 @@ class VLLMModel(HuggingFaceEncoderModel):
193
190
  return None
194
191
  elif self.end_of_reasoning_token is not None:
195
192
  return GenerativeType.REASONING
196
- elif self._tokenizer.chat_template is not None:
193
+ elif (
194
+ self._tokenizer.chat_template is not None
195
+ or "instruct" in self.model_config.model_id.lower()
196
+ ):
197
197
  return GenerativeType.INSTRUCTION_TUNED
198
198
  else:
199
199
  return GenerativeType.BASE
@@ -303,55 +303,29 @@ class VLLMModel(HuggingFaceEncoderModel):
303
303
  Returns:
304
304
  The generated model outputs.
305
305
  """
306
- # Define which tokens to use as stopping criteria. We want to use the padding
307
- # token, end-of-sentence token, and a double newline if the model isn't
308
- # instruction tuned (since these separate the few-shot examples in the input in
309
- # this case)
306
+ # Get stopping tokens
310
307
  stop_tokens: list[str] = self.custom_stop_tokens.copy()
311
308
  if self.buffer["instruction_model"] is False:
312
309
  stop_tokens.append("\n\n")
313
310
  if self._tokenizer.pad_token_id is not None:
311
+ assert isinstance(self._tokenizer.pad_token, str), (
312
+ f"The pad token for the model {self.model_config.model_id!r} "
313
+ f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
314
+ )
314
315
  stop_tokens.append(self._tokenizer.pad_token)
315
316
  if self._tokenizer.eos_token_id is not None:
317
+ assert isinstance(self._tokenizer.eos_token, str), (
318
+ f"The EOS token for the model {self.model_config.model_id!r} "
319
+ f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
320
+ )
316
321
  stop_tokens.append(self._tokenizer.eos_token)
317
322
  if self._tokenizer.pad_token_id is None:
318
323
  self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
319
324
  self._tokenizer.pad_token = self._tokenizer.eos_token
320
- if (
321
- self._tokenizer.bos_token_id is not None
322
- and self._tokenizer.pad_token_id is None
323
- ):
324
- self._tokenizer.pad_token_id = self._tokenizer.bos_token_id
325
- self._tokenizer.pad_token = self._tokenizer.bos_token
326
- elif (
327
- self._tokenizer.eos_token_id is not None
328
- and self._tokenizer.pad_token_id is None
329
- ):
330
- self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
331
- self._tokenizer.pad_token = self._tokenizer.eos_token
332
- elif self._tokenizer.pad_token_id is None:
333
- pad_token_candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
334
- pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
335
- for candidate in pad_token_candidates:
336
- if candidate in self._tokenizer.get_vocab():
337
- pad_token_id = self._tokenizer.get_vocab()[candidate]
338
- self._tokenizer.pad_token = candidate
339
- self._tokenizer.pad_token_id = pad_token_id
340
- break
341
- else:
342
- raise InvalidModel(
343
- "Could not find a suitable token to use as a padding token, since "
344
- "the model does not have a BOS, EOS, or padding token, and does "
345
- f"not have any of the following tokens in its vocabulary: "
346
- f"{pad_token_candidates}."
347
- )
348
-
349
- assert self._tokenizer.pad_token_id is not None
350
-
351
- # Add end of chat token as a stopping token, if it exists
352
- end_of_chat_token_ids = get_end_of_chat_token_ids(tokenizer=self._tokenizer)
353
- if end_of_chat_token_ids is not None:
354
- end_of_chat_token = self._tokenizer.decode(end_of_chat_token_ids).strip()
325
+ if self.end_of_chat_token_ids is not None:
326
+ end_of_chat_token = self._tokenizer.decode(
327
+ self.end_of_chat_token_ids
328
+ ).strip()
355
329
  if end_of_chat_token:
356
330
  stop_tokens.append(end_of_chat_token)
357
331
 
@@ -438,7 +412,7 @@ class VLLMModel(HuggingFaceEncoderModel):
438
412
  raw_outputs = self._model.generate(
439
413
  prompts=prompts,
440
414
  sampling_params=sampling_params,
441
- use_tqdm=(not input_is_a_test),
415
+ use_tqdm=False if input_is_a_test else get_pbar_without_leave,
442
416
  lora_request=self.buffer.get("lora_request"),
443
417
  )
444
418
  break
@@ -515,16 +489,13 @@ class VLLMModel(HuggingFaceEncoderModel):
515
489
  completion.split(self.end_of_reasoning_token)[-1]
516
490
  for completion in completions
517
491
  ]
518
- if self.custom_stop_tokens:
519
- stop_token_pattern = re.compile(
520
- "|".join(
521
- re.escape(stop_token) for stop_token in self.custom_stop_tokens
522
- )
523
- )
524
- completions = [
525
- re.split(pattern=stop_token_pattern, string=completion)[0]
526
- for completion in completions
527
- ]
492
+ stop_token_pattern = re.compile(
493
+ "|".join(re.escape(stop_token) for stop_token in stop_tokens)
494
+ )
495
+ completions = [
496
+ re.split(pattern=stop_token_pattern, string=completion)[0]
497
+ for completion in completions
498
+ ]
528
499
  completions = [completion.strip() for completion in completions]
529
500
 
530
501
  # Sanity check
@@ -824,10 +795,6 @@ def load_model_and_tokenizer(
824
795
  f"The model {model_id!r} could not be loaded. The error was {e!r}."
825
796
  )
826
797
 
827
- model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
828
- model._validate_and_add_requests = MethodType(
829
- _validate_and_add_requests_with_fixed_progress_bars, model
830
- )
831
798
  model.config = hf_model_config
832
799
 
833
800
  return model, tokenizer
@@ -911,90 +878,11 @@ def load_tokenizer(
911
878
  # Ensure that BOS, EOS and PAD tokens are set
912
879
  tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
913
880
  tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
914
- if tokenizer.pad_token_id is None:
915
- tokenizer.pad_token = tokenizer.eos_token
881
+ tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
916
882
 
917
883
  return tokenizer
918
884
 
919
885
 
920
- def _run_engine_with_fixed_progress_bars(
921
- self: "LLM", use_tqdm: bool
922
- ) -> list["RequestOutput"]:
923
- if use_tqdm:
924
- num_requests = self.llm_engine.get_num_unfinished_requests()
925
- pbar = tqdm(
926
- total=num_requests, leave=False, disable=hasattr(sys, "_called_from_test")
927
- )
928
- else:
929
- pbar = None
930
-
931
- # Run the engine.
932
- outputs: list["RequestOutput"] = list()
933
- while self.llm_engine.has_unfinished_requests():
934
- step_outputs = self.llm_engine.step()
935
- for output in step_outputs:
936
- if output.finished:
937
- outputs.append(output)
938
- if pbar is not None:
939
- pbar.update(1)
940
-
941
- if pbar is not None:
942
- pbar.close()
943
-
944
- # Sort the outputs by request ID. This is necessary because some requests may be
945
- # finished earlier than its previous requests.
946
- outputs = sorted(outputs, key=lambda x: int(x.request_id))
947
-
948
- return outputs
949
-
950
-
951
- def _validate_and_add_requests_with_fixed_progress_bars(
952
- self: "LLM",
953
- prompts: "PromptType | c.Sequence[PromptType]",
954
- params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]", # noqa: E501
955
- *,
956
- use_tqdm: bool,
957
- lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
958
- prompt_adapter_request: "PromptAdapterRequest | None",
959
- tokenization_kwargs: dict[str, t.Any] | None = None,
960
- guided_options: "GuidedDecodingRequest | None" = None,
961
- priority: list[int] | None = None,
962
- ) -> None:
963
- if isinstance(prompts, (str, dict)):
964
- # Convert a single prompt to a list.
965
- prompts = [prompts]
966
-
967
- num_requests = len(prompts)
968
- if isinstance(params, list) and len(params) != num_requests:
969
- raise ValueError("The lengths of prompts and params must be the same.")
970
- if isinstance(lora_request, list) and len(lora_request) != num_requests:
971
- raise ValueError("The lengths of prompts and lora_request must be the same.")
972
-
973
- for sp in params if isinstance(params, list) else (params,):
974
- if isinstance(sp, SamplingParams):
975
- self._add_guided_params(sp, guided_options)
976
-
977
- # We only care about the final output
978
- sp.output_kind = RequestOutputKind.FINAL_ONLY
979
-
980
- # Add requests to the engine.
981
- it = prompts
982
- if use_tqdm:
983
- it = tqdm(it, desc="Adding requests", leave=False)
984
-
985
- for i, prompt in enumerate(it):
986
- self._add_request(
987
- prompt,
988
- params[i] if isinstance(params, c.Sequence) else params,
989
- tokenization_kwargs=tokenization_kwargs,
990
- lora_request=lora_request[i]
991
- if isinstance(lora_request, c.Sequence)
992
- else lora_request,
993
- prompt_adapter_request=prompt_adapter_request,
994
- priority=priority[i] if priority else 0,
995
- )
996
-
997
-
998
886
  def clear_vllm() -> None:
999
887
  """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
1000
888
  with contextlib.suppress(ValueError):
@@ -1166,3 +1054,19 @@ def get_custom_stop_tokens(
1166
1054
  logger.debug(f"Found no custom stop tokens for model {model_id!r}.")
1167
1055
 
1168
1056
  return stop_tokens
1057
+
1058
+
1059
+ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
1060
+ """Get a progress bar for vLLM which disappears after completion.
1061
+
1062
+ Args:
1063
+ *tqdm_args:
1064
+ Positional arguments to pass to tqdm.
1065
+ **tqdm_kwargs:
1066
+ Additional keyword arguments to pass to tqdm.
1067
+
1068
+ Returns:
1069
+ A tqdm progress bar.
1070
+ """
1071
+ tqdm_kwargs.pop("leave", None) # Remove the 'leave' key if it exists
1072
+ return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
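As the earlier vllm.py hunk shows, this factory is passed as the `use_tqdm` argument of the vLLM `generate` call, so the progress bar is built through it. A small stand-alone sketch of the same behaviour:

```python
# Stand-alone sketch of a tqdm factory that forces leave=False, as above.
from tqdm.auto import tqdm

def pbar_without_leave(*args, **kwargs) -> tqdm:
    kwargs.pop("leave", None)  # drop any caller-supplied 'leave' value
    return tqdm(*args, leave=False, **kwargs)

for _ in pbar_without_leave(range(3), desc="processing", leave=True):
    pass  # the bar is removed from the terminal once the loop finishes
```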
euroeval/data_loading.py CHANGED
@@ -4,11 +4,11 @@ import logging
4
4
  import sys
5
5
  import time
6
6
 
7
+ import requests
7
8
  from datasets import Dataset, DatasetDict, load_dataset
8
9
  from datasets.exceptions import DatasetsError
9
10
  from huggingface_hub.errors import HfHubHTTPError
10
11
  from numpy.random import Generator
11
- from requests import ReadTimeout
12
12
 
13
13
  from .data_models import BenchmarkConfig, DatasetConfig
14
14
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -101,7 +101,13 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDic
101
101
  token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
102
102
  )
103
103
  break
104
- except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
104
+ except (
105
+ FileNotFoundError,
106
+ ConnectionError,
107
+ DatasetsError,
108
+ requests.ConnectionError,
109
+ requests.ReadTimeout,
110
+ ):
105
111
  logger.warning(
106
112
  f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
107
113
  )
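The hunk above only widens the set of exceptions that trigger a retry when pulling a dataset from the Hugging Face Hub. The surrounding pattern looks roughly like this (a simplified sketch with a hypothetical dataset ID):

```python
# Simplified sketch of the retry-on-transient-error pattern above.
import time
import requests
from datasets import load_dataset
from datasets.exceptions import DatasetsError

for _ in range(5):
    try:
        dataset = load_dataset("EuroEval/some-dataset")  # hypothetical ID
        break
    except (
        FileNotFoundError,
        ConnectionError,
        DatasetsError,
        requests.ConnectionError,
        requests.ReadTimeout,
    ):
        time.sleep(1)  # wait briefly, then retry
else:
    raise RuntimeError("Could not load the dataset after 5 attempts.")
```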
euroeval/finetuning.py CHANGED
@@ -200,6 +200,7 @@ def finetune_single_iteration(
200
200
  compute_metrics=model.compute_metrics,
201
201
  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
202
202
  data_collator=model.data_collator,
203
+ preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
203
204
  )
204
205
 
205
206
  if not benchmark_config.verbose:
@@ -316,3 +317,24 @@ def get_training_args(
316
317
  training_args._n_gpu = 1
317
318
 
318
319
  return training_args
320
+
321
+
322
+ def remove_extra_tensors_from_logits(
323
+ logits: torch.Tensor | tuple[torch.Tensor, ...], labels: torch.Tensor
324
+ ) -> torch.Tensor | tuple[torch.Tensor, ...]:
325
+ """If the logits are a tuple, return only the first element.
326
+
327
+ Args:
328
+ logits:
329
+ The logits to process.
330
+ labels:
331
+ The labels to use for the processing.
332
+
333
+ Returns:
334
+ The processed logits.
335
+ """
336
+ if isinstance(logits, tuple):
337
+ logits = logits[:-1]
338
+ if len(logits) == 1:
339
+ logits = logits[0]
340
+ return logits
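This helper is wired into the `Trainer` through the `preprocess_logits_for_metrics` argument added in the first finetuning.py hunk, so extra tensors returned alongside the class logits are stripped before metrics are computed. A small self-contained demonstration of its effect:

```python
# Demonstration of remove_extra_tensors_from_logits on a (logits, extra) tuple,
# as some models return; the labels argument is unused by the helper itself.
import torch

def remove_extra_tensors_from_logits(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[:-1]    # drop the trailing extra tensor
        if len(logits) == 1:
            logits = logits[0]  # unwrap if a single tensor remains
    return logits

logits = (torch.randn(4, 2), torch.randn(4, 8))
print(remove_extra_tensors_from_logits(logits, labels=torch.zeros(4)).shape)  # torch.Size([4, 2])
```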
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -12,6 +12,8 @@ from transformers.tokenization_utils import PreTrainedTokenizer
12
12
  from transformers.tokenization_utils_base import BatchEncoding
13
13
  from transformers.trainer import Trainer
14
14
 
15
+ from ..exceptions import InvalidBenchmark
16
+
15
17
  if t.TYPE_CHECKING:
16
18
  from ..types import Labels, Predictions
17
19
 
@@ -19,7 +21,7 @@ logger = logging.getLogger("euroeval")
19
21
 
20
22
 
21
23
  class MultipleChoiceClassificationTrainer(Trainer):
22
- """Trainer subclass for question answering tasks."""
24
+ """Trainer subclass for multiple-choice classification tasks."""
23
25
 
24
26
  def evaluate( # type: ignore[override]
25
27
  self,
@@ -57,6 +59,8 @@ class MultipleChoiceClassificationTrainer(Trainer):
57
59
  )
58
60
 
59
61
  predictions = output.predictions
62
+ if isinstance(predictions, tuple):
63
+ predictions = predictions[0]
60
64
  assert isinstance(predictions, np.ndarray)
61
65
 
62
66
  metrics = output.metrics
@@ -150,6 +154,12 @@ def postprocess_predictions_and_labels(
150
154
  Returns:
151
155
  The postprocessed predictions and labels.
152
156
  """
157
+ if predictions.ndim != 2 or predictions.shape[1] != 2:
158
+ raise InvalidBenchmark(
159
+ "Predictions must be a 2D array with shape (num_examples, 2). Found "
160
+ f"shape {predictions.shape}."
161
+ )
162
+
153
163
  mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}
154
164
 
155
165
  all_predictions: list[str] = list()
euroeval/task_group_utils/question_answering.py CHANGED
@@ -8,11 +8,11 @@ from collections import defaultdict
8
8
  import evaluate
9
9
  import numpy as np
10
10
  from evaluate import EvaluationModule
11
- from transformers.tokenization_utils import PreTrainedTokenizer
12
11
  from transformers.tokenization_utils_base import PreTrainedTokenizerBase
13
12
  from transformers.trainer import Trainer
14
13
 
15
14
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
15
+ from ..exceptions import InvalidBenchmark
16
16
  from ..tokenization_utils import get_special_token_metadata
17
17
  from ..utils import raise_if_model_output_contains_nan_values
18
18
 
@@ -20,6 +20,7 @@ if t.TYPE_CHECKING:
20
20
  import torch.nn as nn
21
21
  from datasets.arrow_dataset import Dataset
22
22
  from transformers.modeling_utils import PreTrainedModel
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
23
24
  from transformers.tokenization_utils_base import BatchEncoding
24
25
  from transformers.trainer_callback import TrainerCallback
25
26
  from transformers.trainer_utils import EvalPrediction
@@ -43,6 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
43
44
  compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
44
45
  callbacks: "list[TrainerCallback]",
45
46
  data_collator: "c.Callable",
47
+ **kwargs,
46
48
  ) -> None:
47
49
  """Initialise the trainer."""
48
50
  super().__init__(
@@ -54,6 +56,7 @@ class QuestionAnsweringTrainer(Trainer):
54
56
  compute_metrics=compute_metrics,
55
57
  callbacks=callbacks,
56
58
  data_collator=data_collator,
59
+ **kwargs,
57
60
  )
58
61
 
59
62
  # Get the CLS token id for the tokenizer
@@ -475,7 +478,7 @@ def prepare_test_examples(
475
478
 
476
479
 
477
480
  def postprocess_predictions_and_labels(
478
- predictions: tuple[np.ndarray, np.ndarray],
481
+ predictions: tuple[np.ndarray, ...],
479
482
  dataset: "Dataset",
480
483
  prepared_dataset: "Dataset",
481
484
  cls_token_index: int,
@@ -484,7 +487,7 @@ def postprocess_predictions_and_labels(
484
487
 
485
488
  Args:
486
489
  predictions:
487
- A pair of (start_logits, end_logits) predictions.
490
+ A tuple whose first two elements are (start_logits, end_logits).
488
491
  dataset:
489
492
  The dataset containing the examples.
490
493
  prepared_dataset:
@@ -495,7 +498,14 @@ def postprocess_predictions_and_labels(
495
498
  Returns:
496
499
  The postprocessed predictions and labels.
497
500
  """
498
- all_start_logits, all_end_logits = predictions
501
+ if len(predictions) < 2:
502
+ raise InvalidBenchmark(
503
+ "The predictions should be a tuple with the first two elements being "
504
+ "(start_logits, end_logits), but got {len(predictions)} elements instead: "
505
+ f"{predictions}."
506
+ )
507
+
508
+ all_start_logits, all_end_logits = predictions[:2]
499
509
 
500
510
  # Build a map from an example to its corresponding features, being the blocks of
501
511
  # text from the context that we're feeding into the model. An example can have
euroeval/tokenization_utils.py CHANGED
@@ -185,6 +185,11 @@ def get_bos_token(
185
185
  )
186
186
  return None, None
187
187
 
188
+ log_once(
189
+ f"Beginning-of-sequence token was not set, but detected it as {bos_token!r} "
190
+ f"with ID {bos_token_id}.",
191
+ level=logging.DEBUG,
192
+ )
188
193
  return bos_token, bos_token_id
189
194
 
190
195
 
@@ -221,9 +226,97 @@ def get_eos_token(
221
226
  )
222
227
  return None, None
223
228
 
229
+ log_once(
230
+ f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
231
+ f"ID {eos_token_id}.",
232
+ level=logging.DEBUG,
233
+ )
224
234
  return eos_token, eos_token_id
225
235
 
226
236
 
237
+ def get_pad_token(
238
+ tokenizer: "PreTrainedTokenizer",
239
+ ) -> tuple[str, int] | tuple[None, None]:
240
+ """Get the padding token from a tokenizer.
241
+
242
+ Args:
243
+ tokenizer:
244
+ The tokenizer.
245
+
246
+ Returns:
247
+ A pair (token, token_id) representing the padding token and its token ID, or
248
+ (None, None) if no padding token is found.
249
+ """
250
+ # If the tokenizer already has a padding token, return it
251
+ if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
252
+ assert isinstance(tokenizer.pad_token, str), (
253
+ "Expected tokenizer.pad_token to be a string, but got "
254
+ f"{type(tokenizer.pad_token)}."
255
+ )
256
+ assert isinstance(tokenizer.pad_token_id, int), (
257
+ "Expected tokenizer.pad_token_id to be an integer, but got "
258
+ f"{type(tokenizer.pad_token_id)}."
259
+ )
260
+ return (tokenizer.pad_token, tokenizer.pad_token_id)
261
+
262
+ # If the tokenizer has a BOS token, use it as the padding token
263
+ if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
264
+ assert isinstance(tokenizer.bos_token, str), (
265
+ "Expected tokenizer.bos_token to be a string, but got "
266
+ f"{type(tokenizer.bos_token)}."
267
+ )
268
+ assert isinstance(tokenizer.bos_token_id, int), (
269
+ "Expected tokenizer.bos_token_id to be an integer, but got "
270
+ f"{type(tokenizer.bos_token_id)}."
271
+ )
272
+ pad_token = tokenizer.bos_token
273
+ pad_token_id = tokenizer.bos_token_id
274
+
275
+ # If the tokenizer has an EOS token, use it as the padding token
276
+ elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
277
+ assert isinstance(tokenizer.eos_token, str), (
278
+ "Expected tokenizer.eos_token to be a string, but got "
279
+ f"{type(tokenizer.eos_token)}."
280
+ )
281
+ assert isinstance(tokenizer.eos_token_id, int), (
282
+ "Expected tokenizer.eos_token_id to be an integer, but got "
283
+ f"{type(tokenizer.eos_token_id)}."
284
+ )
285
+ pad_token = tokenizer.eos_token
286
+ pad_token_id = tokenizer.eos_token_id
287
+
288
+ # Otherwise, try to find a candidate padding token in the vocabulary
289
+ else:
290
+ pad_token_candidates = [
291
+ "<pad>",
292
+ "[pad]",
293
+ "<|endoftext|>",
294
+ "<|end▁of▁sentence|>",
295
+ "<|im_end|>",
296
+ ]
297
+ pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
298
+ for candidate in pad_token_candidates:
299
+ if candidate in tokenizer.get_vocab():
300
+ pad_token = candidate
301
+ pad_token_id = tokenizer.get_vocab()[candidate]
302
+ break
303
+ else:
304
+ log_once(
305
+ "Could not identify a padding token for the model. Please ensure that "
306
+ "this has been set in the tokenizer's configuration. Using no padding "
307
+ "token. This may lead to unexpected behavior in the model.",
308
+ level=logging.INFO,
309
+ )
310
+ return None, None
311
+
312
+ log_once(
313
+ f"Padding token was not set, but detected it as {pad_token!r} with ID "
314
+ f"{pad_token_id}.",
315
+ level=logging.DEBUG,
316
+ )
317
+ return pad_token, pad_token_id
318
+
319
+
227
320
  def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
228
321
  """Get the end token ID for chat models.
229
322
 
@@ -300,14 +393,14 @@ def get_first_label_token_mapping(
300
393
  if tokenizer is None:
301
394
  if output_scores:
302
395
  log_once(
303
- f"The model {model_config.model_id!r} will output scores, since the "
304
- "dataset supports it and no tokenizer is available.",
396
+ f"We will use logprobs with the model {model_config.model_id!r} "
397
+ "since the dataset supports it and no tokenizer is available.",
305
398
  level=logging.DEBUG,
306
399
  )
307
400
  else:
308
401
  log_once(
309
- f"The model {model_config.model_id!r} will not output scores, since "
310
- "the dataset does not support it and no tokenizer is available.",
402
+ f"We will not use logprobs with the model {model_config.model_id!r} "
403
+ "since the dataset does not support it and no tokenizer is available.",
311
404
  level=logging.DEBUG,
312
405
  )
313
406
  return output_scores
@@ -368,7 +461,7 @@ def get_first_label_token_mapping(
368
461
  if not matching_tokens:
369
462
  log_once(
370
463
  f"No matching token found in token_list for label '{label}', so "
371
- "we will not output scores.",
464
+ "we will not use logprobs with the model.",
372
465
  level=logging.DEBUG,
373
466
  )
374
467
  return False
@@ -378,8 +471,8 @@ def get_first_label_token_mapping(
378
471
  # tokens are distinct
379
472
  if len(first_tokens) == len(set(first_tokens)):
380
473
  log_once(
381
- "The model will output scores, since the first tokens of the labels "
382
- "are distinct.",
474
+ "We will use logprobs with the model since the first tokens of the "
475
+ "labels are distinct.",
383
476
  level=logging.DEBUG,
384
477
  )
385
478
  return {
@@ -388,7 +481,7 @@ def get_first_label_token_mapping(
388
481
  }
389
482
  else:
390
483
  log_once(
391
- "The model will not output scores, since the first tokens of the "
484
+ "We will not use logprobs with the model since the first tokens of the "
392
485
  "labels are not distinct. The first tokens for the labels "
393
486
  f"{local_labels} are {first_tokens}"
394
487
  )
@@ -398,7 +491,8 @@ def get_first_label_token_mapping(
398
491
  # evaluation errors. This will force the label extraction to rely on word edit
399
492
  # distance instead of logprobs.
400
493
  log_once(
401
- "The model will not output scores, since the dataset does not have labels.",
494
+ "We will not use logprobs with the model, since the dataset does not have "
495
+ "labels.",
402
496
  level=logging.DEBUG,
403
497
  )
404
498
  return False
euroeval/utils.py CHANGED
@@ -121,6 +121,8 @@ def block_terminal_output() -> None:
121
121
  logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
122
122
  logging.getLogger("accelerate").setLevel(logging.CRITICAL)
123
123
  logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
124
+ logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
125
+ logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
124
126
  logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
125
127
 
126
128
  # This suppresses vLLM logging
@@ -352,19 +354,22 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
352
354
  asyncio.set_event_loop(None)
353
355
 
354
356
 
355
- async def catch_coroutine_exception(
356
- coroutine: t.Coroutine[t.Any, t.Any, T],
357
+ async def add_semaphore_and_catch_exception(
358
+ coroutine: t.Coroutine[t.Any, t.Any, T], semaphore: asyncio.Semaphore
357
359
  ) -> T | Exception:
358
- """Run a coroutine, catching any exceptions and returning them.
360
+ """Run a coroutine with a semaphore.
359
361
 
360
362
  Args:
361
363
  coroutine:
362
364
  The coroutine to run.
365
+ semaphore:
366
+ The semaphore to use.
363
367
 
364
368
  Returns:
365
- The result of the coroutine, or the exception if it was raised.
369
+ The result of the coroutine.
366
370
  """
367
- try:
368
- return await coroutine
369
- except Exception as exc:
370
- return exc
371
+ async with semaphore:
372
+ try:
373
+ return await coroutine
374
+ except Exception as exc:
375
+ return exc
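In effect, each request coroutine is wrapped so that it waits for a semaphore slot before running and returns any exception instead of raising it, which lets `asyncio.gather` collect successes and failures together. A self-contained sketch of the same pattern:

```python
# Self-contained sketch of the semaphore-plus-exception-catching wrapper above.
import asyncio

async def add_semaphore_and_catch_exception(coroutine, semaphore: asyncio.Semaphore):
    async with semaphore:
        try:
            return await coroutine
        except Exception as exc:
            return exc

async def main() -> None:
    semaphore = asyncio.Semaphore(2)  # at most two coroutines run concurrently

    async def job(i: int) -> int:
        await asyncio.sleep(0.1)
        if i == 3:
            raise ValueError("boom")  # returned, not raised
        return i

    results = await asyncio.gather(
        *(add_semaphore_and_catch_exception(job(i), semaphore) for i in range(5))
    )
    print(results)  # [0, 1, 2, ValueError('boom'), 4]

asyncio.run(main())
```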
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.9.2
3
+ Version: 15.10.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
7
7
  Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
8
- Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>
8
+ Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
9
9
  License: MIT License
10
10
 
11
11
  Copyright (c) 2022-2024 Dan Saattrup Nielsen
@@ -37,13 +37,12 @@ Requires-Dist: demjson3>=3.0.6
37
37
  Requires-Dist: evaluate>=0.4.1
38
38
  Requires-Dist: huggingface-hub>=0.30.1
39
39
  Requires-Dist: levenshtein>=0.24.0
40
- Requires-Dist: litellm>=1.63.0
40
+ Requires-Dist: litellm>=1.72.2
41
41
  Requires-Dist: more-itertools>=10.5.0
42
42
  Requires-Dist: numpy<2.0.0,>=1.23.0
43
- Requires-Dist: ollama>=0.4.7
43
+ Requires-Dist: ollama>=0.5.1
44
44
  Requires-Dist: pandas>=2.2.0
45
45
  Requires-Dist: peft>=0.15.0
46
- Requires-Dist: protobuf~=3.20.0
47
46
  Requires-Dist: pydantic>=2.6.0
48
47
  Requires-Dist: pyinfer>=0.0.3
49
48
  Requires-Dist: python-dotenv>=1.0.1
@@ -62,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
61
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
62
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
63
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
66
65
  Provides-Extra: generative
67
66
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
67
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
68
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
71
70
  Provides-Extra: human-evaluation
72
71
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
72
  Provides-Extra: test
@@ -93,7 +92,7 @@ ______________________________________________________________________
93
92
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
94
93
 
95
94
 
96
- ## Maintainers
95
+ ## Maintainer
97
96
 
98
97
  - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
99
98
  dan.nielsen@alexandra.dk)
@@ -4,11 +4,11 @@ euroeval/benchmarker.py,sha256=wmgrYVS31PMhhrVienjaVHHyfnZAy51kUvC6OjooiOw,48047
4
4
  euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
5
5
  euroeval/cli.py,sha256=d8JztMi_RbpUlEBXidd6DQ-xeC-xhozf_qU6Vkzye20,8161
6
6
  euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
7
- euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
7
+ euroeval/data_loading.py,sha256=2rMLSy8pbntlwmImizMtkTiUzj93mcv5kzYjZELWWfU,4081
8
8
  euroeval/data_models.py,sha256=7nAGDpN58Y35Lt9JZE_y0y5iOYesw2htcwHc68MkBZU,22953
9
9
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
10
10
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
- euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
11
+ euroeval/finetuning.py,sha256=cx5SVgEsveMDNfoMxwLfAFsjZeKmYyHftaOZWZ-L9hA,11285
12
12
  euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
13
13
  euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
14
14
  euroeval/human_evaluation.py,sha256=zqbbJkqm2Uymf-88PxM3R9vVRR8SZJlq3QrqWEoiVeE,27643
@@ -19,15 +19,15 @@ euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,223
19
19
  euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
20
20
  euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
21
21
  euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
22
- euroeval/tokenization_utils.py,sha256=_B4KN3ZcuvVr8y3LedtfxBJfmPKjfVMjpbtl8bbQAuc,14278
22
+ euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
23
23
  euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
24
- euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
24
+ euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
25
25
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
26
26
  euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
27
27
  euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
28
- euroeval/benchmark_modules/hf.py,sha256=CoiaNakjhg6gm_5IbUUeevXQZebg2VrRLuhzEi2Hhrk,44617
29
- euroeval/benchmark_modules/litellm.py,sha256=SxSr_0C6b_jVavR3y9QyhfkCOP5-va4zijGfghFTArY,48362
30
- euroeval/benchmark_modules/vllm.py,sha256=SbQ_EYSwUFBVLsp9io1Q75A9S_H-iw6AzLOn3rlEhK0,43034
28
+ euroeval/benchmark_modules/hf.py,sha256=Nbtn5eZ4axbmL09M8dGZCBr07pn9-btbqGgQ6q7KbHg,44620
29
+ euroeval/benchmark_modules/litellm.py,sha256=LS4mBXXG6h4uJwySPc6SI6f0y_HuiKE7IprprqWpoCI,50601
30
+ euroeval/benchmark_modules/vllm.py,sha256=sgeltOVfZA9bu0AmXV7PtZvuRst0I8s6VOIp0CI6DO8,38880
31
31
  euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
32
32
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
33
33
  euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
@@ -49,13 +49,13 @@ euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLO
49
49
  euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
50
50
  euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
51
51
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
52
- euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
53
- euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
52
+ euroeval/task_group_utils/multiple_choice_classification.py,sha256=LQ6zD1UGi-jGCKI2xUJiQdAXoqb5QMpIJu41B2U0HPw,6543
53
+ euroeval/task_group_utils/question_answering.py,sha256=D4oJL2vQEjHghyxiiiq_vj1IQC6eryqNoLXuTiQEPmw,28071
54
54
  euroeval/task_group_utils/sequence_classification.py,sha256=zwRUgVHqLlREILwyg-yuDPkrIQOfqGVPsFBai-2D9a8,13525
55
55
  euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
56
56
  euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
57
- euroeval-15.9.2.dist-info/METADATA,sha256=LwHTlJ51OGVwcRTUPulH-gh8IFxu82CUFYHZ1uOUyT0,13555
58
- euroeval-15.9.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
59
- euroeval-15.9.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
60
- euroeval-15.9.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
61
- euroeval-15.9.2.dist-info/RECORD,,
57
+ euroeval-15.10.0.dist-info/METADATA,sha256=WUXtSfS6qvrlA25lazql3DvyS5chyMnBPKyu-l65A_I,13472
58
+ euroeval-15.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
59
+ euroeval-15.10.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
60
+ euroeval-15.10.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
61
+ euroeval-15.10.0.dist-info/RECORD,,