EuroEval 15.9.2__py3-none-any.whl → 15.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/hf.py +3 -3
- euroeval/benchmark_modules/litellm.py +158 -122
- euroeval/benchmark_modules/vllm.py +47 -143
- euroeval/data_loading.py +8 -2
- euroeval/finetuning.py +22 -0
- euroeval/task_group_utils/multiple_choice_classification.py +11 -1
- euroeval/task_group_utils/question_answering.py +14 -4
- euroeval/tokenization_utils.py +103 -9
- euroeval/utils.py +13 -8
- {euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/METADATA +7 -8
- {euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/RECORD +14 -14
- {euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/WHEEL +0 -0
- {euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/hf.py
CHANGED
@@ -378,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 tokenizer=self._tokenizer,
 ),
 batched=True,
-batch_size=
+batch_size=10,
 remove_columns=dataset["train"].column_names,
 load_from_cache_file=False,
 keep_in_memory=True,
@@ -389,7 +389,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 tokenizer=self._tokenizer,
 ),
 batched=True,
-batch_size=
+batch_size=10,
 remove_columns=dataset["val"].column_names,
 load_from_cache_file=False,
 keep_in_memory=True,
@@ -400,7 +400,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 tokenizer=self._tokenizer,
 ),
 batched=True,
-batch_size=
+batch_size=10,
 remove_columns=dataset["test"].column_names,
 load_from_cache_file=False,
 keep_in_memory=True,
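Note: the change above pins the tokenisation batch size to 10. A minimal sketch (not EuroEval code; the model ID and toy data below are assumptions) of how `batch_size` affects `datasets.Dataset.map`:

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed model ID
dataset = Dataset.from_dict({"text": ["a sentence", "another sentence"] * 50})

tokenised = dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
    batch_size=10,  # the mapping function now receives at most 10 rows per call
    remove_columns=dataset.column_names,
    load_from_cache_file=False,
    keep_in_memory=True,
)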
euroeval/benchmark_modules/litellm.py
CHANGED
@@ -1,5 +1,6 @@
 """Generative models from an inference API, using the LiteLLM framework."""

+import asyncio
 import collections.abc as c
 import logging
 import os
@@ -29,6 +30,7 @@ from litellm.exceptions import (
 Timeout,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
+from litellm.router import Router
 from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
@@ -68,7 +70,7 @@ from ..task_group_utils (
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
-
+add_semaphore_and_catch_exception,
 create_model_cache_dir,
 log_once,
 safe_run,
@@ -201,6 +203,11 @@ class LiteLLMModel(BenchmarkModule):
 self.is_ollama = model_config.model_id.startswith(
 "ollama/"
 ) or model_config.model_id.startswith("ollama_chat/")
+self._ollama_show: ollama.ShowResponse = (
+ollama.show("/".join(model_config.model_id.split("/")[1:]))
+if self.is_ollama
+else ollama.ShowResponse(model_info=None)
+)

 raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)

@@ -224,7 +231,14 @@ class LiteLLMModel(BenchmarkModule):
 Returns:
 The generative type of the model, or None if it has not been set yet.
 """
-if self.
+if self.is_ollama:
+reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
+type_ = (
+GenerativeType.REASONING
+if reasoning_model
+else GenerativeType.INSTRUCTION_TUNED
+)
+elif self.model_config.revision in {"thinking"}:
 type_ = GenerativeType.REASONING
 elif re.fullmatch(
 pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
@@ -251,8 +265,18 @@ class LiteLLMModel(BenchmarkModule):
 The generated model outputs.
 """
 assert "messages" in inputs, "The input must contain a 'messages' key."
-
+conversations: list[list[litellm.AllMessageValues]] = inputs["messages"]

+# Get the mapping from labels to the first token in the label. We call this each
+# time we generate a new dataset since the dataset config can change
+self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+dataset_config=self.dataset_config,
+model_config=self.model_config,
+tokenizer=None,
+generative_type=self.generative_type,
+)
+
+# Set the core generation arguments
 generation_kwargs: dict[str, t.Any] = dict(
 model=self.model_config.model_id,
 max_completion_tokens=(
@@ -266,33 +290,30 @@ class LiteLLMModel(BenchmarkModule):
 api_key=self.benchmark_config.api_key,
 api_base=self.benchmark_config.api_base,
 api_version=self.benchmark_config.api_version,
+max_retries=3,
 )

-#
-#
-self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-dataset_config=self.dataset_config,
-model_config=self.model_config,
-tokenizer=None,
-generative_type=self.generative_type,
-)
-
-if self.buffer["first_label_token_mapping"]:
-generation_kwargs["logprobs"] = True
-generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-
+# Set up the `response_format` generation argument if we are dealing with a task
+# using structured generation
 if self.dataset_config.task in TASKS_USING_JSON:
-
-
-
+# Sanity check that "JSON" is included in the prompt, as some models require
+# this
+for conversation in conversations:
+if not conversation:
 raise InvalidBenchmark(
-"Encountered an empty
+"Encountered an empty conversation in 'messages'."
 )
-
-assert isinstance(
-f"Expected dict message, got {type(
+last_message = conversation[-1]
+assert isinstance(last_message, dict), (
+f"Expected dict message, got {type(last_message)}"
 )
-assert "
+assert "content" in last_message, (
+"Expected 'content' key in the last message of the conversation."
+)
+assert isinstance(last_message["content"], str), (
+"Expected 'content' to be a string."
+)
+assert "json" in last_message["content"].lower(), (
 "Prompt must contain 'json' for JSON tasks."
 )

@@ -328,6 +349,19 @@ class LiteLLMModel(BenchmarkModule):
 level=logging.DEBUG,
 )

+# If the model is an Ollama reasoning model, we ensure that thinking is enabled
+if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+generation_kwargs["think"] = True
+log_once(
+"Enabling thinking mode for Ollama model "
+f"{self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+
+# Handle manually set parameters
+if self.buffer["first_label_token_mapping"]:
+generation_kwargs["logprobs"] = True
+generation_kwargs["top_logprobs"] = MAX_LOGPROBS
 if self.model_config.revision == "thinking":
 generation_kwargs["thinking"] = dict(
 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
@@ -344,66 +378,67 @@ class LiteLLMModel(BenchmarkModule):
 level=logging.DEBUG,
 )

-#
+# Drop generation kwargs that are not supported by the model
 litellm.drop_params = True

-
-
-
-
-
-
-to_run = list(enumerate(messages))
-
-for attempt in range(num_attempts):
-if not to_run:
+all_responses: dict[int, ModelResponse] = {}
+conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
+enumerate(conversations)
+)
+for attempt in range(num_attempts := 10):
+if not conversations_to_run:
 break

-batch_indices,
-
+batch_indices, batch_conversations = zip(*conversations_to_run)
+successes, failures = safe_run(
 self._generate_async(
-
-
-
-max_reruns=15,
+model_id=self.model_config.model_id,
+conversations=list(batch_conversations),
+**generation_kwargs,
 )
 )

-
+# Store the successful model outputs
+for idx, response in successes:
+orig_idx = batch_indices[idx]
 all_responses[orig_idx] = response

+# If all requests were successful, break
 if not failures:
-
+conversations_to_run = []
 break

-
-
+# Put the failed requests back in the queue to try again
+conversations_to_run = [
+(batch_indices[idx], conversations[batch_indices[idx]])
+for idx, _ in failures
+]
 logger.debug(
-f"Attempt {attempt + 1}/{num_attempts}: "
-f"
+f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
+f"{len(conversations_to_run):,} failed message(s)"
 )

+# Attempt to handle the exceptions, to improve the chance of getting
+# successful generations next time around
 for _, error in failures:
 self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-else:
-raise InvalidBenchmark(
-message=f"Failed to generate text, after {num_attempts} attempts."
-)

-
+# Sleep for a second to avoid pinging the API server too quickly
+sleep(1)
+else:
 raise InvalidBenchmark(
-f"Failed to generate text after {num_attempts} attempts.
-f"Errors: {all_failures}"
+message=f"Failed to generate text, after {num_attempts:,} attempts."
 )

-
+# Extract the generations from the model output
+ordered_responses = [all_responses[i] for i in range(len(conversations))]
 model_output = self._create_model_output(
 model_responses=ordered_responses, model_id=self.model_config.model_id
 )

-if len(
+if len(conversations) != len(model_output.sequences):
 raise InvalidBenchmark(
-f"Number of model inputs ({len(
+f"Number of model inputs ({len(conversations):,}) does not match the "
 f"number of model outputs ({len(model_output.sequences):,})."
 )

@@ -462,8 +497,8 @@ class LiteLLMModel(BenchmarkModule):
 f"The model {model_id!r} does not support logprobs, so disabling it.",
 level=logging.DEBUG,
 )
-generation_kwargs.pop("logprobs")
-generation_kwargs.pop("top_logprobs")
+generation_kwargs.pop("logprobs", None)
+generation_kwargs.pop("top_logprobs", None)
 return
 elif any(msg.lower() in error_msg for msg in temperature_messages):
 log_once(
@@ -471,7 +506,7 @@ class LiteLLMModel(BenchmarkModule):
 "temperature, so disabling it.",
 level=logging.DEBUG,
 )
-generation_kwargs.pop("temperature")
+generation_kwargs.pop("temperature", None)
 return
 elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
 log_once(
@@ -503,14 +538,7 @@ class LiteLLMModel(BenchmarkModule):
 generation_kwargs["response_format"] = dict(type="json_object")
 return
 elif isinstance(
-error,
-(
-APIConnectionError,
-Timeout,
-ServiceUnavailableError,
-InternalServerError,
-SystemError,
-),
+error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
 ):
 logger.debug(
 f"Service temporarily unavailable. The error message was: {error}. "
@@ -518,6 +546,18 @@ class LiteLLMModel(BenchmarkModule):
 )
 sleep(5)
 return
+elif isinstance(error, (APIConnectionError, OSError)):
+# If there are too many I/O connections, we increase the number of allowed
+# file descriptors
+if "too many open files" in error_msg:
+raise InvalidBenchmark(
+"There are too many file descriptors running. See the current "
+"value by running `ulimit -n`. Try increasing it by running "
+"`ulimit -n <new-value>` and try again."
+)
+raise InvalidBenchmark(
+f"Encountered {type(error)} during generation: {error}."
+)

 if isinstance(error, RateLimitError):
 raise InvalidModel(
@@ -538,69 +578,66 @@ class LiteLLMModel(BenchmarkModule):

 async def _generate_async(
 self,
-
-
-
-
-) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
+model_id: str,
+conversations: list[list[litellm.AllMessageValues]],
+**generation_kwargs,
+) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
 """Generate outputs from the model asynchronously.

 Args:
-
-The
-
-The
-
-
-max_reruns:
-The maximum number of reruns to make.
+model_id:
+The ID of the model to use for generation.
+conversations:
+The conversations to pass to the model.
+**generation_kwargs:
+Additional generation arguments to pass to the model.

 Returns:
-A tuple
+A tuple (successes, failures), each being a list of tuples (idx, content),
+where the `idx` corresponds to the index of `conversations`, and `content`
+is either the model response or an Exception.
 """
-
-
-
-
-
-
-
-requests = [
-litellm.acompletion(
-messages=msg, max_retries=max_retries, **generation_kwargs
+# Create a LiteLLM router, which will ensure that we only use a single client
+# for all the requests, preventing "too many open files" errors
+router = Router(
+model_list=[
+dict(
+model_name=self.model_config.model_id,
+litellm_params=generation_kwargs,
 )
-for _, msg in to_run
 ]
-
-catch_coroutine_exception(request) for request in requests
-]
-responses = await tqdm_async.gather(*wrapped_requests, leave=False)
-
-next_to_run = []
-current_fail_count = 0
+)

-
-
-
-
-
-
-
+# Get the LLM generations asynchronously
+max_concurrent_calls = 20
+semaphore = asyncio.Semaphore(max_concurrent_calls)
+requests = [
+add_semaphore_and_catch_exception(
+router.acompletion(model=model_id, messages=conversation),
+semaphore=semaphore,
+)
+for conversation in conversations
+]
+responses = await tqdm_async.gather(*requests, leave=False)

-
-
-
-
-
-
-
+# Separate the successful responses from the failed ones
+successes = [
+(idx, response)
+for idx, response in enumerate(responses)
+if not isinstance(response, Exception)
+]
+failures = [
+(idx, response)
+for idx, response in enumerate(responses)
+if isinstance(response, Exception)
+]

-
-
-
+# Close connections
+for request in requests:
+if hasattr(request, "close"):
+request.close()

-
-return success, failures
+return successes, failures

 @staticmethod
 def _create_model_output(
@@ -690,8 +727,7 @@ class LiteLLMModel(BenchmarkModule):
 # If it is an Ollama model then we can get the number of parameters from the
 # Ollama Python SDK
 if self.is_ollama:
-
-model_info = ollama.show(ollama_model_id).modelinfo
+model_info = self._ollama_show.modelinfo
 if model_info is not None:
 num_params = model_info.get("general.parameter_count")
 if num_params is not None:
@@ -819,7 +855,7 @@ class LiteLLMModel(BenchmarkModule):
 # Python SDK
 if self.is_ollama:
 ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
-model_info =
+model_info = self._ollama_show.modelinfo
 if model_info is not None:
 context_length_keys = [
 key for key in model_info.keys() if "context_length" in key.lower()
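Note: the retry loop above keeps track of each conversation's original index so that responses can be re-ordered to match the inputs after several attempts. A minimal, self-contained sketch of that pattern (not the EuroEval implementation; `generate_batch` is a hypothetical stand-in for `_generate_async`):

def generate_with_retries(generate_batch, conversations, num_attempts=10):
    """`generate_batch` is assumed to return (successes, failures), both being
    lists of (local_index, result) pairs for the batch it was given."""
    all_responses = {}
    to_run = list(enumerate(conversations))
    for _ in range(num_attempts):
        if not to_run:
            break
        indices, batch = zip(*to_run)
        successes, failures = generate_batch(list(batch))
        for local_idx, response in successes:
            all_responses[indices[local_idx]] = response  # map back to original index
        if not failures:
            to_run = []
            break
        # Re-queue only the failed conversations, under their original indices
        to_run = [(indices[local_idx], conversations[indices[local_idx]]) for local_idx, _ in failures]
    else:
        raise RuntimeError(f"Failed to generate text after {num_attempts} attempts.")
    return [all_responses[i] for i in range(len(conversations))]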
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -7,12 +7,10 @@ import json
 import logging
 import os
 import re
-import sys
 import typing as t
 from functools import partial
 from pathlib import Path
 from time import sleep
-from types import MethodType

 import torch
 from datasets import DatasetDict
@@ -69,6 +67,7 @@ from ..tokenization_utils import (
 get_end_of_chat_token_ids,
 get_eos_token,
 get_first_label_token_mapping,
+get_pad_token,
 should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
@@ -81,17 +80,12 @@ from ..utils import (
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-from vllm import LLM,
+from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import (
 destroy_distributed_environment,
 destroy_model_parallel,
 )
-from vllm.inputs import PromptType
 from vllm.lora.request import LoRARequest
-from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
-from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import RequestOutputKind

 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
 from outlines.models.vllm import adapt_tokenizer
@@ -140,6 +134,9 @@ class VLLMModel(HuggingFaceEncoderModel):
 self.end_of_reasoning_token = get_end_of_reasoning_token(
 model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
 )
+self.end_of_chat_token_ids = get_end_of_chat_token_ids(
+tokenizer=self._tokenizer
+)
 self.custom_stop_tokens = get_custom_stop_tokens(
 model=self._model,
 tokenizer=self._tokenizer,
@@ -193,7 +190,10 @@ class VLLMModel(HuggingFaceEncoderModel):
 return None
 elif self.end_of_reasoning_token is not None:
 return GenerativeType.REASONING
-elif
+elif (
+self._tokenizer.chat_template is not None
+or "instruct" in self.model_config.model_id.lower()
+):
 return GenerativeType.INSTRUCTION_TUNED
 else:
 return GenerativeType.BASE
@@ -303,55 +303,29 @@ class VLLMModel(HuggingFaceEncoderModel):
 Returns:
 The generated model outputs.
 """
-#
-# token, end-of-sentence token, and a double newline if the model isn't
-# instruction tuned (since these separate the few-shot examples in the input in
-# this case)
+# Get stopping tokens
 stop_tokens: list[str] = self.custom_stop_tokens.copy()
 if self.buffer["instruction_model"] is False:
 stop_tokens.append("\n\n")
 if self._tokenizer.pad_token_id is not None:
+assert isinstance(self._tokenizer.pad_token, str), (
+f"The pad token for the model {self.model_config.model_id!r} "
+f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+)
 stop_tokens.append(self._tokenizer.pad_token)
 if self._tokenizer.eos_token_id is not None:
+assert isinstance(self._tokenizer.eos_token, str), (
+f"The EOS token for the model {self.model_config.model_id!r} "
+f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+)
 stop_tokens.append(self._tokenizer.eos_token)
 if self._tokenizer.pad_token_id is None:
 self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
 self._tokenizer.pad_token = self._tokenizer.eos_token
-if
-self._tokenizer.
-
-
-self._tokenizer.pad_token_id = self._tokenizer.bos_token_id
-self._tokenizer.pad_token = self._tokenizer.bos_token
-elif (
-self._tokenizer.eos_token_id is not None
-and self._tokenizer.pad_token_id is None
-):
-self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-self._tokenizer.pad_token = self._tokenizer.eos_token
-elif self._tokenizer.pad_token_id is None:
-pad_token_candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
-pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
-for candidate in pad_token_candidates:
-if candidate in self._tokenizer.get_vocab():
-pad_token_id = self._tokenizer.get_vocab()[candidate]
-self._tokenizer.pad_token = candidate
-self._tokenizer.pad_token_id = pad_token_id
-break
-else:
-raise InvalidModel(
-"Could not find a suitable token to use as a padding token, since "
-"the model does not have a BOS, EOS, or padding token, and does "
-f"not have any of the following tokens in its vocabulary: "
-f"{pad_token_candidates}."
-)
-
-assert self._tokenizer.pad_token_id is not None
-
-# Add end of chat token as a stopping token, if it exists
-end_of_chat_token_ids = get_end_of_chat_token_ids(tokenizer=self._tokenizer)
-if end_of_chat_token_ids is not None:
-end_of_chat_token = self._tokenizer.decode(end_of_chat_token_ids).strip()
+if self.end_of_chat_token_ids is not None:
+end_of_chat_token = self._tokenizer.decode(
+self.end_of_chat_token_ids
+).strip()
 if end_of_chat_token:
 stop_tokens.append(end_of_chat_token)

@@ -438,7 +412,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 raw_outputs = self._model.generate(
 prompts=prompts,
 sampling_params=sampling_params,
-use_tqdm=
+use_tqdm=False if input_is_a_test else get_pbar_without_leave,
 lora_request=self.buffer.get("lora_request"),
 )
 break
@@ -515,16 +489,13 @@ class VLLMModel(HuggingFaceEncoderModel):
 completion.split(self.end_of_reasoning_token)[-1]
 for completion in completions
 ]
-
-
-
-
-
-
-
-re.split(pattern=stop_token_pattern, string=completion)[0]
-for completion in completions
-]
+stop_token_pattern = re.compile(
+"|".join(re.escape(stop_token) for stop_token in stop_tokens)
+)
+completions = [
+re.split(pattern=stop_token_pattern, string=completion)[0]
+for completion in completions
+]
 completions = [completion.strip() for completion in completions]

 # Sanity check
@@ -824,10 +795,6 @@ def load_model_and_tokenizer(
 f"The model {model_id!r} could not be loaded. The error was {e!r}."
 )

-model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
-model._validate_and_add_requests = MethodType(
-_validate_and_add_requests_with_fixed_progress_bars, model
-)
 model.config = hf_model_config

 return model, tokenizer
@@ -911,90 +878,11 @@ def load_tokenizer(
 # Ensure that BOS, EOS and PAD tokens are set
 tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
 tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-
-tokenizer.pad_token = tokenizer.eos_token
+tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)

 return tokenizer


-def _run_engine_with_fixed_progress_bars(
-self: "LLM", use_tqdm: bool
-) -> list["RequestOutput"]:
-if use_tqdm:
-num_requests = self.llm_engine.get_num_unfinished_requests()
-pbar = tqdm(
-total=num_requests, leave=False, disable=hasattr(sys, "_called_from_test")
-)
-else:
-pbar = None
-
-# Run the engine.
-outputs: list["RequestOutput"] = list()
-while self.llm_engine.has_unfinished_requests():
-step_outputs = self.llm_engine.step()
-for output in step_outputs:
-if output.finished:
-outputs.append(output)
-if pbar is not None:
-pbar.update(1)
-
-if pbar is not None:
-pbar.close()
-
-# Sort the outputs by request ID. This is necessary because some requests may be
-# finished earlier than its previous requests.
-outputs = sorted(outputs, key=lambda x: int(x.request_id))
-
-return outputs
-
-
-def _validate_and_add_requests_with_fixed_progress_bars(
-self: "LLM",
-prompts: "PromptType | c.Sequence[PromptType]",
-params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]",  # noqa: E501
-*,
-use_tqdm: bool,
-lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
-prompt_adapter_request: "PromptAdapterRequest | None",
-tokenization_kwargs: dict[str, t.Any] | None = None,
-guided_options: "GuidedDecodingRequest | None" = None,
-priority: list[int] | None = None,
-) -> None:
-if isinstance(prompts, (str, dict)):
-# Convert a single prompt to a list.
-prompts = [prompts]
-
-num_requests = len(prompts)
-if isinstance(params, list) and len(params) != num_requests:
-raise ValueError("The lengths of prompts and params must be the same.")
-if isinstance(lora_request, list) and len(lora_request) != num_requests:
-raise ValueError("The lengths of prompts and lora_request must be the same.")
-
-for sp in params if isinstance(params, list) else (params,):
-if isinstance(sp, SamplingParams):
-self._add_guided_params(sp, guided_options)
-
-# We only care about the final output
-sp.output_kind = RequestOutputKind.FINAL_ONLY
-
-# Add requests to the engine.
-it = prompts
-if use_tqdm:
-it = tqdm(it, desc="Adding requests", leave=False)
-
-for i, prompt in enumerate(it):
-self._add_request(
-prompt,
-params[i] if isinstance(params, c.Sequence) else params,
-tokenization_kwargs=tokenization_kwargs,
-lora_request=lora_request[i]
-if isinstance(lora_request, c.Sequence)
-else lora_request,
-prompt_adapter_request=prompt_adapter_request,
-priority=priority[i] if priority else 0,
-)
-
-
 def clear_vllm() -> None:
 """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
 with contextlib.suppress(ValueError):
@@ -1166,3 +1054,19 @@ def get_custom_stop_tokens(
 logger.debug(f"Found no custom stop tokens for model {model_id!r}.")

 return stop_tokens
+
+
+def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
+"""Get a progress bar for vLLM which disappears after completion.
+
+Args:
+*tqdm_args:
+Positional arguments to pass to tqdm.
+**tqdm_kwargs:
+Additional keyword arguments to pass to tqdm.
+
+Returns:
+A tqdm progress bar.
+"""
+tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
+return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
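Note: a minimal, self-contained sketch of the stop-token truncation that the rewritten block above performs (the example stop tokens and completions below are made up):

import re

stop_tokens = ["</s>", "<|im_end|>", "\n\n"]
stop_token_pattern = re.compile("|".join(re.escape(stop_token) for stop_token in stop_tokens))

completions = ["positive</s> and some trailing text", "negative\n\nNext few-shot example"]
# Keep only the text before the first stop token, then strip surrounding whitespace
completions = [re.split(pattern=stop_token_pattern, string=completion)[0].strip() for completion in completions]
print(completions)  # ['positive', 'negative']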
euroeval/data_loading.py
CHANGED
@@ -4,11 +4,11 @@ import logging
 import sys
 import time

+import requests
 from datasets import Dataset, DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
-from requests import ReadTimeout

 from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -101,7 +101,13 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDic
 token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
 )
 break
-except (
+except (
+FileNotFoundError,
+ConnectionError,
+DatasetsError,
+requests.ConnectionError,
+requests.ReadTimeout,
+):
 logger.warning(
 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
 )
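Note: with this change, transient network failures while downloading a dataset are retried instead of aborting the benchmark. A minimal sketch of the same retry idea (the dataset ID below is a made-up placeholder, and the loop details differ from EuroEval's):

import time

import requests
from datasets import load_dataset
from datasets.exceptions import DatasetsError

for attempt in range(5):
    try:
        dataset = load_dataset("some-org/some-dataset")  # hypothetical dataset ID
        break
    except (FileNotFoundError, ConnectionError, DatasetsError, requests.ConnectionError, requests.ReadTimeout):
        time.sleep(5)  # wait before retrying the download
else:
    raise RuntimeError("Could not download the dataset after 5 attempts.")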
euroeval/finetuning.py
CHANGED
@@ -200,6 +200,7 @@ def finetune_single_iteration(
 compute_metrics=model.compute_metrics,
 callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
 data_collator=model.data_collator,
+preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
 )

 if not benchmark_config.verbose:
@@ -316,3 +317,24 @@ def get_training_args(
 training_args._n_gpu = 1

 return training_args
+
+
+def remove_extra_tensors_from_logits(
+logits: torch.Tensor | tuple[torch.Tensor, ...], labels: torch.Tensor
+) -> torch.Tensor | tuple[torch.Tensor, ...]:
+"""If the logits are a tuple, return only the first element.
+
+Args:
+logits:
+The logits to process.
+labels:
+The labels to use for the processing.
+
+Returns:
+The processed logits.
+"""
+if isinstance(logits, tuple):
+logits = logits[:-1]
+if len(logits) == 1:
+logits = logits[0]
+return logits
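Note: a minimal sketch (not EuroEval's actual setup; the helper below only loosely mirrors `remove_extra_tensors_from_logits`) of where `preprocess_logits_for_metrics` hooks into a `transformers.Trainer`:

from transformers import Trainer, TrainingArguments

def keep_only_logits(logits, labels):
    # Drop trailing tensors (e.g. hidden states) when the model returns a tuple,
    # so only the logits are accumulated for compute_metrics.
    if isinstance(logits, tuple):
        logits = logits[:-1]
        if len(logits) == 1:
            logits = logits[0]
    return logits

def build_trainer(model, train_dataset, eval_dataset, compute_metrics):
    return Trainer(
        model=model,
        args=TrainingArguments(output_dir="finetuning-output"),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=keep_only_logits,  # runs on every evaluation step
    )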
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED
@@ -12,6 +12,8 @@ from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer

+from ..exceptions import InvalidBenchmark
+
 if t.TYPE_CHECKING:
 from ..types import Labels, Predictions

@@ -19,7 +21,7 @@ logger = logging.getLogger("euroeval")


 class MultipleChoiceClassificationTrainer(Trainer):
-"""Trainer subclass for
+"""Trainer subclass for multiple-choice classification tasks."""

 def evaluate(  # type: ignore[override]
 self,
@@ -57,6 +59,8 @@ class MultipleChoiceClassificationTrainer(Trainer):
 )

 predictions = output.predictions
+if isinstance(predictions, tuple):
+predictions = predictions[0]
 assert isinstance(predictions, np.ndarray)

 metrics = output.metrics
@@ -150,6 +154,12 @@ def postprocess_predictions_and_labels(
 Returns:
 The postprocessed predictions and labels.
 """
+if predictions.ndim != 2 or predictions.shape[1] != 2:
+raise InvalidBenchmark(
+"Predictions must be a 2D array with shape (num_examples, 2). Found "
+f"shape {predictions.shape}."
+)
+
 mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}

 all_predictions: list[str] = list()
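Note: a small sketch (with made-up arrays) of the prediction unwrapping and shape check added above: `Trainer.predict` may return a tuple of arrays, and only the first one holds the two-class logits.

import numpy as np

predictions = (np.zeros((4, 2)), np.zeros((4, 8, 16)))  # (logits, extra tensor)
if isinstance(predictions, tuple):
    predictions = predictions[0]
if predictions.ndim != 2 or predictions.shape[1] != 2:
    raise ValueError(f"Expected shape (num_examples, 2), found {predictions.shape}.")
print(predictions.shape)  # (4, 2)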
euroeval/task_group_utils/question_answering.py
CHANGED
@@ -8,11 +8,11 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer

 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..tokenization_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values

@@ -20,6 +20,7 @@ if t.TYPE_CHECKING:
 import torch.nn as nn
 from datasets.arrow_dataset import Dataset
 from transformers.modeling_utils import PreTrainedModel
+from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import EvalPrediction
@@ -43,6 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
 compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
 callbacks: "list[TrainerCallback]",
 data_collator: "c.Callable",
+**kwargs,
 ) -> None:
 """Initialise the trainer."""
 super().__init__(
@@ -54,6 +56,7 @@ class QuestionAnsweringTrainer(Trainer):
 compute_metrics=compute_metrics,
 callbacks=callbacks,
 data_collator=data_collator,
+**kwargs,
 )

 # Get the CLS token id for the tokenizer
@@ -475,7 +478,7 @@ def prepare_test_examples(


 def postprocess_predictions_and_labels(
-predictions: tuple[np.ndarray,
+predictions: tuple[np.ndarray, ...],
 dataset: "Dataset",
 prepared_dataset: "Dataset",
 cls_token_index: int,
@@ -484,7 +487,7 @@ def postprocess_predictions_and_labels(

 Args:
 predictions:
-A
+A tuple whose first two elements are (start_logits, end_logits).
 dataset:
 The dataset containing the examples.
 prepared_dataset:
@@ -495,7 +498,14 @@ def postprocess_predictions_and_labels(
 Returns:
 The postprocessed predictions and labels.
 """
-
+if len(predictions) < 2:
+raise InvalidBenchmark(
+"The predictions should be a tuple with the first two elements being "
+"(start_logits, end_logits), but got {len(predictions)} elements instead: "
+f"{predictions}."
+)
+
+all_start_logits, all_end_logits = predictions[:2]

 # Build a map from an example to its corresponding features, being the blocks of
 # text from the context that we're feeding into the model. An example can have
euroeval/tokenization_utils.py
CHANGED
@@ -185,6 +185,11 @@ def get_bos_token(
 )
 return None, None

+log_once(
+f"Beginning-of-sequence token was not set, but detected it as {bos_token!r} "
+f"with ID {bos_token_id}.",
+level=logging.DEBUG,
+)
 return bos_token, bos_token_id


@@ -221,9 +226,97 @@ def get_eos_token(
 )
 return None, None

+log_once(
+f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
+f"ID {eos_token_id}.",
+level=logging.DEBUG,
+)
 return eos_token, eos_token_id


+def get_pad_token(
+tokenizer: "PreTrainedTokenizer",
+) -> tuple[str, int] | tuple[None, None]:
+"""Get the padding token from a tokenizer.
+
+Args:
+tokenizer:
+The tokenizer.
+
+Returns:
+A pair (token, token_id) representing the padding token and its token ID, or
+(None, None) if no padding token is found.
+"""
+# If the tokenizer already has a padding token, return it
+if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
+assert isinstance(tokenizer.pad_token, str), (
+"Expected tokenizer.pad_token to be a string, but got "
+f"{type(tokenizer.pad_token)}."
+)
+assert isinstance(tokenizer.pad_token_id, int), (
+"Expected tokenizer.pad_token_id to be an integer, but got "
+f"{type(tokenizer.pad_token_id)}."
+)
+return (tokenizer.pad_token, tokenizer.pad_token_id)
+
+# If the tokenizer has a BOS token, use it as the padding token
+if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
+assert isinstance(tokenizer.bos_token, str), (
+"Expected tokenizer.bos_token to be a string, but got "
+f"{type(tokenizer.bos_token)}."
+)
+assert isinstance(tokenizer.bos_token_id, int), (
+"Expected tokenizer.bos_token_id to be an integer, but got "
+f"{type(tokenizer.bos_token_id)}."
+)
+pad_token = tokenizer.bos_token
+pad_token_id = tokenizer.bos_token_id
+
+# If the tokenizer has an EOS token, use it as the padding token
+elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
+assert isinstance(tokenizer.eos_token, str), (
+"Expected tokenizer.eos_token to be a string, but got "
+f"{type(tokenizer.eos_token)}."
+)
+assert isinstance(tokenizer.eos_token_id, int), (
+"Expected tokenizer.eos_token_id to be an integer, but got "
+f"{type(tokenizer.eos_token_id)}."
+)
+pad_token = tokenizer.eos_token
+pad_token_id = tokenizer.eos_token_id
+
+# Otherwise, try to find a candidate padding token in the vocabulary
+else:
+pad_token_candidates = [
+"<pad>",
+"[pad]",
+"<|endoftext|>",
+"<|end▁of▁sentence|>",
+"<|im_end|>",
+]
+pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
+for candidate in pad_token_candidates:
+if candidate in tokenizer.get_vocab():
+pad_token = candidate
+pad_token_id = tokenizer.get_vocab()[candidate]
+break
+else:
+log_once(
+"Could not identify a padding token for the model. Please ensure that "
+"this has been set in the tokenizer's configuration. Using no padding "
+"token. This may lead to unexpected behavior in the model.",
+level=logging.INFO,
+)
+return None, None
+
+log_once(
+f"Padding token was not set, but detected it as {pad_token!r} with ID "
+f"{pad_token_id}.",
+level=logging.DEBUG,
+)
+return pad_token, pad_token_id
+
+
 def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
 """Get the end token ID for chat models.

@@ -300,14 +393,14 @@ def get_first_label_token_mapping(
 if tokenizer is None:
 if output_scores:
 log_once(
-f"
-"dataset supports it and no tokenizer is available.",
+f"We will use logprobs with the model {model_config.model_id!r} "
+"since the dataset supports it and no tokenizer is available.",
 level=logging.DEBUG,
 )
 else:
 log_once(
-f"
-"the dataset does not support it and no tokenizer is available.",
+f"We will not use logprobs with the model {model_config.model_id!r} "
+"since the dataset does not support it and no tokenizer is available.",
 level=logging.DEBUG,
 )
 return output_scores
@@ -368,7 +461,7 @@ def get_first_label_token_mapping(
 if not matching_tokens:
 log_once(
 f"No matching token found in token_list for label '{label}', so "
-"we will not
+"we will not use logprobs with the model.",
 level=logging.DEBUG,
 )
 return False
@@ -378,8 +471,8 @@ def get_first_label_token_mapping(
 # tokens are distinct
 if len(first_tokens) == len(set(first_tokens)):
 log_once(
-"
-"are distinct.",
+"We will use logprobs with the model since the first tokens of the "
+"labels are distinct.",
 level=logging.DEBUG,
 )
 return {
@@ -388,7 +481,7 @@ def get_first_label_token_mapping(
 }
 else:
 log_once(
-"
+"We will not use logprobs with the model since the first tokens of the "
 "labels are not distinct. The first tokens for the labels "
 f"{local_labels} are {first_tokens}"
 )
@@ -398,7 +491,8 @@ def get_first_label_token_mapping(
 # evaluation errors. This will force the label extraction to rely on word edit
 # distance instead of logprobs.
 log_once(
-"
+"We will not use logprobs with the model, since the dataset does not have "
+"labels.",
 level=logging.DEBUG,
 )
 return False
euroeval/utils.py
CHANGED
@@ -121,6 +121,8 @@ def block_terminal_output() -> None:
 logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
 logging.getLogger("accelerate").setLevel(logging.CRITICAL)
 logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
+logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
+logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
 logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)

 # This suppresses vLLM logging
@@ -352,19 +354,22 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
 asyncio.set_event_loop(None)


-async def
-coroutine: t.Coroutine[t.Any, t.Any, T],
+async def add_semaphore_and_catch_exception(
+coroutine: t.Coroutine[t.Any, t.Any, T], semaphore: asyncio.Semaphore
 ) -> T | Exception:
-"""Run a coroutine
+"""Run a coroutine with a semaphore.

 Args:
 coroutine:
 The coroutine to run.
+semaphore:
+The semaphore to use.

 Returns:
-The result of the coroutine
+The result of the coroutine.
 """
-
-
-
-
+async with semaphore:
+try:
+return await coroutine
+except Exception as exc:
+return exc
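Note: a small, self-contained sketch of the pattern behind `add_semaphore_and_catch_exception`: bounded concurrency plus per-task exception capture, so one failed request does not abort the whole batch (the fake request below is made up):

import asyncio

async def add_semaphore_and_catch_exception(coroutine, semaphore):
    async with semaphore:
        try:
            return await coroutine
        except Exception as exc:
            return exc

async def fake_request(i):  # stand-in for an API call
    await asyncio.sleep(0.01)
    if i == 2:
        raise ValueError("simulated failure")
    return f"response {i}"

async def main():
    semaphore = asyncio.Semaphore(20)  # at most 20 requests in flight at once
    tasks = [add_semaphore_and_catch_exception(fake_request(i), semaphore) for i in range(5)]
    responses = await asyncio.gather(*tasks)
    failures = [r for r in responses if isinstance(r, Exception)]
    print(len(failures))  # 1

asyncio.run(main())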
{euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/METADATA
CHANGED
@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.9.2
+Version: 15.10.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
 Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
-Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk
+Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
 License: MIT License

 Copyright (c) 2022-2024 Dan Saattrup Nielsen
@@ -37,13 +37,12 @@ Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
-Requires-Dist: litellm>=1.
+Requires-Dist: litellm>=1.72.2
 Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
-Requires-Dist: ollama>=0.
+Requires-Dist: ollama>=0.5.1
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: peft>=0.15.0
-Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
@@ -62,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.9.
+Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.9.
+Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -93,7 +92,7 @@ ______________________________________________________________________
 [](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)


-##
+## Maintainer

 - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
 dan.nielsen@alexandra.dk)
{euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/RECORD
CHANGED
@@ -4,11 +4,11 @@ euroeval/benchmarker.py,sha256=wmgrYVS31PMhhrVienjaVHHyfnZAy51kUvC6OjooiOw,48047
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=d8JztMi_RbpUlEBXidd6DQ-xeC-xhozf_qU6Vkzye20,8161
 euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
-euroeval/data_loading.py,sha256=
+euroeval/data_loading.py,sha256=2rMLSy8pbntlwmImizMtkTiUzj93mcv5kzYjZELWWfU,4081
 euroeval/data_models.py,sha256=7nAGDpN58Y35Lt9JZE_y0y5iOYesw2htcwHc68MkBZU,22953
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
-euroeval/finetuning.py,sha256=
+euroeval/finetuning.py,sha256=cx5SVgEsveMDNfoMxwLfAFsjZeKmYyHftaOZWZ-L9hA,11285
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=zqbbJkqm2Uymf-88PxM3R9vVRR8SZJlq3QrqWEoiVeE,27643
@@ -19,15 +19,15 @@ euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,223
 euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
-euroeval/tokenization_utils.py,sha256=
+euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
-euroeval/benchmark_modules/hf.py,sha256=
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=Nbtn5eZ4axbmL09M8dGZCBr07pn9-btbqGgQ6q7KbHg,44620
+euroeval/benchmark_modules/litellm.py,sha256=LS4mBXXG6h4uJwySPc6SI6f0y_HuiKE7IprprqWpoCI,50601
+euroeval/benchmark_modules/vllm.py,sha256=sgeltOVfZA9bu0AmXV7PtZvuRst0I8s6VOIp0CI6DO8,38880
 euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
@@ -49,13 +49,13 @@ euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLO
 euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
 euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
-euroeval/task_group_utils/multiple_choice_classification.py,sha256=
-euroeval/task_group_utils/question_answering.py,sha256=
+euroeval/task_group_utils/multiple_choice_classification.py,sha256=LQ6zD1UGi-jGCKI2xUJiQdAXoqb5QMpIJu41B2U0HPw,6543
+euroeval/task_group_utils/question_answering.py,sha256=D4oJL2vQEjHghyxiiiq_vj1IQC6eryqNoLXuTiQEPmw,28071
 euroeval/task_group_utils/sequence_classification.py,sha256=zwRUgVHqLlREILwyg-yuDPkrIQOfqGVPsFBai-2D9a8,13525
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.10.0.dist-info/METADATA,sha256=WUXtSfS6qvrlA25lazql3DvyS5chyMnBPKyu-l65A_I,13472
+euroeval-15.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.10.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.10.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.10.0.dist-info/RECORD,,
{euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/WHEEL
File without changes
{euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/entry_points.txt
File without changes
{euroeval-15.9.2.dist-info → euroeval-15.10.0.dist-info}/licenses/LICENSE
File without changes