r-llm-evals 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: r-llm-evals
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A utility library to evaluate LLMs.
|
|
5
|
+
Author: Raphaël Bournhonesque
|
|
6
|
+
Author-email: Raphaël Bournhonesque <sub+package@raphaelb.net>
|
|
7
|
+
Requires-Dist: deepdiff>=9.1.0
|
|
8
|
+
Requires-Dist: orjson>=3.11.9
|
|
9
|
+
Requires-Dist: pydantic>=2.13.4
|
|
10
|
+
Requires-Dist: pydantic-ai-slim[openai]>=2.0.0
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "r-llm-evals"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A utility library to evaluate LLMs."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Raphaël Bournhonesque", email = "sub+package@raphaelb.net" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"deepdiff>=9.1.0",
|
|
12
|
+
"orjson>=3.11.9",
|
|
13
|
+
"pydantic>=2.13.4",
|
|
14
|
+
"pydantic-ai-slim[openai]>=2.0.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = ["uv_build>=0.10.3,<0.11.0"]
|
|
19
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from pydantic_ai.providers.openai import OpenAIProvider
|
|
5
|
+
from pydantic_ai.providers.openrouter import OpenRouterProvider
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Compiled once at import time. IGNORECASE so that mixed-case spellings
# (e.g. "Qwen3.5") are recognised as well as all-lowercase ones.
_QWEN_3_5_RE = re.compile(r"qwen-?3[\.\-]5", re.IGNORECASE)


def is_qwen_3_5(model: str) -> bool:
    """Check if the model is a Qwen 3.5 model.

    The name matches when it contains, anywhere and in any letter case,
    "qwen", an optional "-", a "3", a "." or "-" separator, and a "5"
    (for example "qwen3.5", "qwen-3.5" or "Qwen3-5").

    Args:
        model: The model name to check.

    Returns:
        True if the model is a Qwen 3.5 model, False otherwise.
    """
    return _QWEN_3_5_RE.search(model) is not None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_model_provider():
    """Build the provider that matches the OPENAI_BASE_URL envvar.

    When the base URL starts with "https://openrouter.ai", an
    `OpenRouterProvider` is created with the value of the OPENAI_API_KEY
    envvar as its API key. For any other base URL an `OpenAIProvider` is
    created with no arguments.

    Returns:
        The model provider instance.

    Raises:
        ValueError: If OPENAI_BASE_URL envvar is unset or empty.
    """
    endpoint = os.environ.get("OPENAI_BASE_URL")
    if endpoint:
        targets_openrouter = endpoint.startswith("https://openrouter.ai")
        if targets_openrouter:
            api_key = os.environ.get("OPENAI_API_KEY")
            return OpenRouterProvider(api_key=api_key)
        return OpenAIProvider()

    raise ValueError("OPENAI_BASE_URL envvar is required")
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import orjson
|
|
4
|
+
from deepdiff import DeepHash
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from pydantic_ai.capabilities.thinking import ThinkingLevel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ModelOutputCache[BaseModelType: BaseModel | str]:
|
|
10
|
+
def __init__(
|
|
11
|
+
self,
|
|
12
|
+
task_name: str,
|
|
13
|
+
output_type: type[BaseModelType],
|
|
14
|
+
cache_dir: Path | None = None,
|
|
15
|
+
):
|
|
16
|
+
if cache_dir is None:
|
|
17
|
+
cache_dir = Path("~/.cache/llm_evals").expanduser()
|
|
18
|
+
self.cache_dir = cache_dir
|
|
19
|
+
self.task_name = task_name
|
|
20
|
+
self.output_type = output_type
|
|
21
|
+
|
|
22
|
+
def get_query_cache_path(
|
|
23
|
+
self,
|
|
24
|
+
*,
|
|
25
|
+
model: str,
|
|
26
|
+
instructions: str | list,
|
|
27
|
+
output_mode: str | None,
|
|
28
|
+
thinking_effort: ThinkingLevel,
|
|
29
|
+
) -> Path:
|
|
30
|
+
if self.output_type is str:
|
|
31
|
+
json_schema = None
|
|
32
|
+
else:
|
|
33
|
+
json_schema = self.output_type.model_json_schema()
|
|
34
|
+
model = model.replace("/", "_")
|
|
35
|
+
cache_key = (
|
|
36
|
+
model,
|
|
37
|
+
self.task_name,
|
|
38
|
+
instructions,
|
|
39
|
+
json_schema,
|
|
40
|
+
output_mode,
|
|
41
|
+
thinking_effort,
|
|
42
|
+
)
|
|
43
|
+
cache_sha256 = DeepHash(cache_key)[cache_key]
|
|
44
|
+
|
|
45
|
+
# Split the cache sha256 into subdirectories for better file system
|
|
46
|
+
# performance
|
|
47
|
+
cache_sha256_str = str(cache_sha256)
|
|
48
|
+
subdirs = [cache_sha256_str[i : i + 2] for i in range(0, 6, 2)]
|
|
49
|
+
cache_subdir = Path(*subdirs)
|
|
50
|
+
full_cache_dir = self.cache_dir / self.task_name / model / cache_subdir
|
|
51
|
+
return full_cache_dir / f"{cache_sha256}.json"
|
|
52
|
+
|
|
53
|
+
def check_cache(
|
|
54
|
+
self,
|
|
55
|
+
*,
|
|
56
|
+
model: str,
|
|
57
|
+
instructions: str | list,
|
|
58
|
+
output_mode: str | None,
|
|
59
|
+
thinking_effort: ThinkingLevel,
|
|
60
|
+
) -> BaseModelType | None:
|
|
61
|
+
query_cache_path = self.get_query_cache_path(
|
|
62
|
+
model=model,
|
|
63
|
+
instructions=instructions,
|
|
64
|
+
output_mode=output_mode,
|
|
65
|
+
thinking_effort=thinking_effort,
|
|
66
|
+
)
|
|
67
|
+
if query_cache_path.exists():
|
|
68
|
+
output = orjson.loads(query_cache_path.read_bytes())["output"]
|
|
69
|
+
if self.output_type is str:
|
|
70
|
+
return output
|
|
71
|
+
else:
|
|
72
|
+
return self.output_type.model_validate(output)
|
|
73
|
+
return None
|
|
74
|
+
|
|
75
|
+
def save_to_cache(
|
|
76
|
+
self,
|
|
77
|
+
*,
|
|
78
|
+
model: str,
|
|
79
|
+
instructions: str | list,
|
|
80
|
+
output_mode: str | None,
|
|
81
|
+
thinking_effort: ThinkingLevel,
|
|
82
|
+
output: BaseModelType,
|
|
83
|
+
) -> None:
|
|
84
|
+
query_cache_path = self.get_query_cache_path(
|
|
85
|
+
model=model,
|
|
86
|
+
instructions=instructions,
|
|
87
|
+
output_mode=output_mode,
|
|
88
|
+
thinking_effort=thinking_effort,
|
|
89
|
+
)
|
|
90
|
+
query_cache_path.parent.mkdir(parents=True, exist_ok=True)
|
|
91
|
+
|
|
92
|
+
if self.output_type is str:
|
|
93
|
+
_output = output
|
|
94
|
+
json_schema = None
|
|
95
|
+
else:
|
|
96
|
+
_output = output.model_dump()
|
|
97
|
+
json_schema = self.output_type.model_json_schema()
|
|
98
|
+
data = {
|
|
99
|
+
"output": _output,
|
|
100
|
+
"model": model,
|
|
101
|
+
"task_name": self.task_name,
|
|
102
|
+
"thinking_effort": thinking_effort,
|
|
103
|
+
"instructions": instructions,
|
|
104
|
+
"output_mode": output_mode,
|
|
105
|
+
"json_schema": json_schema,
|
|
106
|
+
}
|
|
107
|
+
with query_cache_path.open("wb") as f:
|
|
108
|
+
f.write(orjson.dumps(data))
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from difflib import Differ
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def normalize(s: str) -> str:
    """Return `s` with whitespace, apostrophes and the "œ" ligature normalized.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes a single
    space, the typographic apostrophe "’" becomes "'", "œ" becomes "oe", and
    leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r"\s+", " ", s)
    # One pass over the string for both character substitutions.
    substitutions = str.maketrans({"’": "'", "œ": "oe"})
    return collapsed.translate(substitutions).strip()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def tokenize(s: str, split_chars: str = ".:,;!/()[]{}") -> list[str]:
    """Split `s` into word and punctuation tokens.

    The string is first split on single spaces. For each resulting piece
    longer than one character, any characters from `split_chars` found at
    its start or end are detached into their own one-character tokens, in
    their original order. At least one character is always kept as the
    core token. Pieces of length 0 or 1 are emitted unchanged.
    """
    separators = tuple(split_chars)
    tokens: list[str] = []
    for piece in s.split(" "):
        start, end = 0, len(piece)

        # Detach leading separators, one token per character.
        while end - start > 1 and piece.startswith(separators, start, end):
            tokens.append(piece[start])
            start += 1

        # Find where the trailing separators begin.
        while end - start > 1 and piece.endswith(separators, start, end):
            end -= 1

        tokens.append(piece[start:end])
        # Trailing separators come after the core token, left to right.
        tokens.extend(piece[end:])

    return tokens
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_diff(s1: str, s2: str) -> str:
    """Return the token-level differences between `s1` and `s2`.

    Both strings are tokenized with `tokenize`, and the token sequences are
    compared with `difflib.Differ`. Only the entries that do not start with
    a space (i.e. not the unchanged tokens) are kept. They are returned
    one per line, with any newline inside an entry removed.
    """
    expected_tokens = tokenize(s1)
    actual_tokens = tokenize(s2)

    changed: list[str] = []
    for entry in Differ().compare(expected_tokens, actual_tokens):
        # Entries for tokens common to both inputs start with a space.
        if entry.startswith(" "):
            continue
        changed.append(entry.replace("\n", ""))
    return "\n".join(changed)
|