EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval has been flagged as potentially problematic.

Files changed (78)
  1. euroeval/__init__.py +9 -2
  2. euroeval/benchmark_config_factory.py +51 -50
  3. euroeval/benchmark_modules/base.py +9 -21
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +101 -71
  6. euroeval/benchmark_modules/litellm.py +115 -53
  7. euroeval/benchmark_modules/vllm.py +107 -92
  8. euroeval/benchmarker.py +144 -121
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +86 -8
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +80 -29
  14. euroeval/data_models.py +338 -330
  15. euroeval/dataset_configs/__init__.py +12 -3
  16. euroeval/dataset_configs/bulgarian.py +56 -0
  17. euroeval/dataset_configs/czech.py +75 -0
  18. euroeval/dataset_configs/danish.py +55 -93
  19. euroeval/dataset_configs/dutch.py +48 -87
  20. euroeval/dataset_configs/english.py +45 -77
  21. euroeval/dataset_configs/estonian.py +42 -34
  22. euroeval/dataset_configs/faroese.py +19 -60
  23. euroeval/dataset_configs/finnish.py +36 -69
  24. euroeval/dataset_configs/french.py +39 -75
  25. euroeval/dataset_configs/german.py +45 -82
  26. euroeval/dataset_configs/greek.py +64 -0
  27. euroeval/dataset_configs/icelandic.py +54 -91
  28. euroeval/dataset_configs/italian.py +42 -79
  29. euroeval/dataset_configs/latvian.py +28 -35
  30. euroeval/dataset_configs/lithuanian.py +28 -26
  31. euroeval/dataset_configs/norwegian.py +72 -115
  32. euroeval/dataset_configs/polish.py +33 -61
  33. euroeval/dataset_configs/portuguese.py +33 -66
  34. euroeval/dataset_configs/serbian.py +64 -0
  35. euroeval/dataset_configs/slovak.py +55 -0
  36. euroeval/dataset_configs/spanish.py +42 -77
  37. euroeval/dataset_configs/swedish.py +52 -90
  38. euroeval/dataset_configs/ukrainian.py +64 -0
  39. euroeval/exceptions.py +1 -1
  40. euroeval/finetuning.py +24 -17
  41. euroeval/generation.py +15 -14
  42. euroeval/generation_utils.py +8 -8
  43. euroeval/languages.py +395 -323
  44. euroeval/logging_utils.py +250 -0
  45. euroeval/metrics/base.py +0 -3
  46. euroeval/metrics/huggingface.py +21 -6
  47. euroeval/metrics/llm_as_a_judge.py +6 -4
  48. euroeval/metrics/pipeline.py +17 -9
  49. euroeval/metrics/speed.py +0 -3
  50. euroeval/model_cache.py +17 -19
  51. euroeval/model_config.py +4 -5
  52. euroeval/model_loading.py +3 -0
  53. euroeval/prompt_templates/__init__.py +2 -0
  54. euroeval/prompt_templates/classification.py +206 -0
  55. euroeval/prompt_templates/linguistic_acceptability.py +99 -42
  56. euroeval/prompt_templates/multiple_choice.py +102 -38
  57. euroeval/prompt_templates/named_entity_recognition.py +172 -51
  58. euroeval/prompt_templates/reading_comprehension.py +119 -42
  59. euroeval/prompt_templates/sentiment_classification.py +110 -40
  60. euroeval/prompt_templates/summarization.py +85 -40
  61. euroeval/prompt_templates/token_classification.py +279 -0
  62. euroeval/scores.py +11 -10
  63. euroeval/speed_benchmark.py +5 -6
  64. euroeval/task_group_utils/multiple_choice_classification.py +2 -4
  65. euroeval/task_group_utils/question_answering.py +24 -16
  66. euroeval/task_group_utils/sequence_classification.py +48 -35
  67. euroeval/task_group_utils/text_to_text.py +19 -9
  68. euroeval/task_group_utils/token_classification.py +21 -17
  69. euroeval/tasks.py +44 -1
  70. euroeval/tokenisation_utils.py +33 -22
  71. euroeval/types.py +10 -9
  72. euroeval/utils.py +35 -149
  73. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
  74. euroeval-16.5.0.dist-info/RECORD +81 -0
  75. euroeval-16.3.0.dist-info/RECORD +0 -71
  76. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  77. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  78. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import cache, cached_property, partial
+from functools import cached_property, partial
 from time import sleep
 
 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm
 
+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer
 
-logger = logging.getLogger("euroeval")
-
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
@@ -311,7 +310,7 @@ class LiteLLMModel(BenchmarkModule):
             InvalidBenchmark:
                 If the inputs do not contain either 'messages' or 'text' keys.
         """
-        model_inputs: list[list[litellm.AllMessageValues] | str]
+        model_inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str]
         if "messages" in inputs:
             model_inputs = inputs["messages"]
         elif "text" in inputs:
@@ -332,9 +331,9 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         all_responses: dict[int, "ModelResponse"] = {}
-        inputs_to_run: list[tuple[int, list[litellm.AllMessageValues] | str]] = list(
-            enumerate(model_inputs)
-        )
+        inputs_to_run: c.Sequence[
+            tuple[int, c.Sequence[litellm.AllMessageValues] | str]
+        ] = list(enumerate(model_inputs))
         for attempt in range(num_attempts := 10):
             if not inputs_to_run:
                 break
@@ -367,10 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-            logger.debug(
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
                 f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
-                f"{failures[0][1]}."
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )
 
         # Attempt to handle the exceptions, to improve the chance of getting
@@ -422,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -438,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +460,14 @@ class LiteLLMModel(BenchmarkModule):
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
+            "got an unexpected keyword argument 'response_format'",
+            "the model returned empty outputs",
         ]
 
-        if any(msg.lower() in error_msg for msg in stop_messages):
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -525,7 +540,7 @@ class LiteLLMModel(BenchmarkModule):
             )
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
             keys_and_their_types = {
-                tag_name: (list[str], ...) for tag_name in ner_tag_names
+                tag_name: (c.Sequence[str], ...) for tag_name in ner_tag_names
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-            logger.debug(
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if isinstance(error, RateLimitError):
-            raise InvalidModel(
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "Skipping."
-            ) from error
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
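For illustration, the retry-delay handling added in the hunk above amounts to parsing the provider's suggested wait time out of the error message and sleeping for that long before retrying. A standalone sketch using the same regex (the error message here is invented):

    import re
    import time

    error_msg = "rate exceeded for this deployment, please retry in 7.5 seconds"

    retry_match = re.search(
        pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
        string=error_msg,
    )
    if retry_match is not None:
        retry_seconds = float(retry_match.group(1))  # 7.5
        time.sleep(retry_seconds)  # back off before re-issuing the request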
@@ -648,9 +686,11 @@ class LiteLLMModel(BenchmarkModule):
     async def _generate_async(
         self,
         model_id: str,
-        inputs: list[list[litellm.AllMessageValues] | str],
+        inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str],
         **generation_kwargs,
-    ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
+    ) -> tuple[
+        c.Sequence[tuple[int, "ModelResponse"]], c.Sequence[tuple[int, Exception]]
+    ]:
         """Generate outputs from the model asynchronously.
 
         Args:
@@ -711,7 +751,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(*requests, leave=False)
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )
 
         # Separate the successful responses from the failed ones
         successes = [
@@ -731,13 +783,15 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-                logger.debug(f"RuntimeError during request.close(): {e}")
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )
 
         return successes, failures
 
     @staticmethod
     def _create_model_output(
-        model_responses: list["ModelResponse"], model_id: str
+        model_responses: c.Sequence["ModelResponse"], model_id: str
     ) -> GenerativeModelOutput:
         """Create a GenerativeModelOutput object from a list of ModelResponse objects.
 
@@ -756,10 +810,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-                logger.warning(
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue
 
@@ -810,7 +865,7 @@ class LiteLLMModel(BenchmarkModule):
                 )
                 continue
 
-            logprobs_list: list[list[tuple[str, float]]]
+            logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
             if isinstance(logprobs_obj, ChoiceLogprobs):
                 logprobs_list = [
                     [
@@ -847,11 +902,12 @@ class LiteLLMModel(BenchmarkModule):
             scores.append(logprobs_list)
 
         if not sequences:
-            logger.warning(
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
 
@@ -1105,7 +1161,7 @@ class LiteLLMModel(BenchmarkModule):
         return -1
 
     @property
-    def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
 
         Returns:
@@ -1130,6 +1186,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1205,17 +1262,19 @@ class LiteLLMModel(BenchmarkModule):
             ServiceUnavailableError,
             InternalServerError,
         ) as e:
-            logger.debug(
+            log(
                 f"Service temporarily unavailable. The error message was: {e}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
         except APIError as e:
             if "'503 Service Unavailable" not in str(e):
                 raise e
-            logger.warning(
+            log(
                 f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                "seconds..."
+                "seconds...",
+                level=logging.WARNING,
             )
             sleep(10)
         except (BadRequestError, NotFoundError):
@@ -1228,21 +1287,25 @@ class LiteLLMModel(BenchmarkModule):
                 case 0:
                     pass
                 case 1:
-                    logger.warning(
+                    log(
                         f"Could not find the model ID {model_id!r}. Did you mean "
-                        f"{candidate_models[0]!r}?"
+                        f"{candidate_models[0]!r}?",
+                        level=logging.WARNING,
                     )
                 case _:
                     candidate_models_str = "', '".join(candidate_models)
-                    logger.warning(
+                    log(
                         f"Could not find the model ID {model_id!r}. Did you mean "
-                        f"any of the following model IDs: '{candidate_models_str}'?"
+                        "any of the following model IDs: "
+                        f"'{candidate_models_str}'?",
+                        level=logging.WARNING,
                     )
             return False
         else:
-            logger.error(
+            log(
                 f"Failed to check if model {model_id!r} exists after {num_attempts} "
-                "attempts. Assuming it does not exist."
+                "attempts. Assuming it does not exist.",
+                level=logging.ERROR,
             )
             return False
 
@@ -1275,7 +1338,8 @@ class LiteLLMModel(BenchmarkModule):
                 "that the revision is actually the parameter and set the revision "
                 "to 'main'. In the future, use the new '#' syntax to specify the "
                 f"parameter (in this case, this would be {proper_model_id!r}), as this "
-                "will be an error in future versions of EuroEval."
+                "will be an error in future versions of EuroEval.",
+                level=logging.WARNING,
             )
             model_id_components.param = model_id_components.revision
             model_id_components.revision = "main"
@@ -1363,7 +1427,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    @cache
+    @cache_arguments()
    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.
 
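The `cache_arguments` decorator that replaces `functools.cache` here lives in the new `euroeval/caching_utils.py` module, which this diff does not show. As a rough idea only, a hedged sketch of one way such a decorator could work, keyed on the repr of the arguments so that unhashable values such as config objects can still be cached (the actual implementation in 16.5.0 may differ):

    import functools
    import typing as t

    def cache_arguments() -> t.Callable:
        """Cache results keyed on the repr of the call arguments (assumed behaviour)."""

        def decorator(func: t.Callable) -> t.Callable:
            results: dict[str, t.Any] = {}

            @functools.wraps(func)
            def wrapper(*args: t.Any, **kwargs: t.Any) -> t.Any:
                # Unlike functools.cache, this does not require hashable arguments.
                key = repr((args, sorted(kwargs.items())))
                if key not in results:
                    results[key] = func(*args, **kwargs)
                return results[key]

            return wrapper

        return decorator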
@@ -1483,7 +1547,7 @@ class LiteLLMModel(BenchmarkModule):
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        test_input: list[litellm.AllMessageValues] | str
+        test_input: c.Sequence[litellm.AllMessageValues] | str
         if self.generative_type == GenerativeType.BASE:
             test_input = "Test message"
         else:
@@ -1542,7 +1606,7 @@ def try_download_ollama_model(model_id: str) -> bool:
     )
 
     try:
-        downloaded_ollama_models: list[str] = [
+        downloaded_ollama_models: c.Sequence[str] = [
            model_obj.model
            for model_obj in ollama.list().models
            if model_obj.model is not None
@@ -1571,7 +1635,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1589,11 +1654,8 @@ def try_download_ollama_model(model_id: str) -> bool:
         ) from e
 
     # Download the model
-    with tqdm(
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
     ) as pbar:
         for status in response:
             if status.total is not None:
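Similarly, `get_pbar` comes from the new `euroeval/logging_utils.py` module, which is not part of this excerpt. Judging from the call site above and the tqdm styling used elsewhere in the file, it presumably returns a tqdm bar with shared defaults; a hedged sketch under that assumption:

    import typing as t

    from tqdm.auto import tqdm

    def get_pbar(**tqdm_kwargs: t.Any) -> tqdm:
        """Assumed helper: a tqdm progress bar with EuroEval-wide defaults."""
        # The defaults mirror the keyword argument dropped at the call site
        # (leave=False) and the styling passed to tqdm_async.gather above.
        defaults: dict[str, t.Any] = {"leave": False, "colour": "yellow", "ascii": "—▰"}
        return tqdm(**(defaults | tqdm_kwargs))

    # Usage, matching the Ollama download call site:
    # with get_pbar(desc="Downloading model", unit_scale=True, unit="B") as pbar:
    #     pbar.update(1024)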