r-llm-evals 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.3
2
+ Name: r-llm-evals
3
+ Version: 0.1.0
4
+ Summary: A utility library to evaluate LLMs.
5
+ Author: Raphaël Bournhonesque
6
+ Author-email: Raphaël Bournhonesque <sub+package@raphaelb.net>
7
+ Requires-Dist: deepdiff>=9.1.0
8
+ Requires-Dist: orjson>=3.11.9
9
+ Requires-Dist: pydantic>=2.13.4
10
+ Requires-Dist: pydantic-ai-slim[openai]>=2.0.0
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+
File without changes
@@ -0,0 +1,19 @@
1
+ [project]
2
+ name = "r-llm-evals"
3
+ version = "0.1.0"
4
+ description = "A utility library to evaluate LLMs."
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Raphaël Bournhonesque", email = "sub+package@raphaelb.net" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "deepdiff>=9.1.0",
12
+ "orjson>=3.11.9",
13
+ "pydantic>=2.13.4",
14
+ "pydantic-ai-slim[openai]>=2.0.0",
15
+ ]
16
+
17
+ [build-system]
18
+ requires = ["uv_build>=0.10.3,<0.11.0"]
19
+ build-backend = "uv_build"
@@ -0,0 +1,36 @@
1
+ import os
2
+ import re
3
+
4
+ from pydantic_ai.providers.openai import OpenAIProvider
5
+ from pydantic_ai.providers.openrouter import OpenRouterProvider
6
+
7
+
8
# Compiled once at import time instead of on every call. Matches "qwen3.5",
# "qwen-3.5", "qwen3-5" and "qwen-3-5" anywhere in the name, ignoring case.
_QWEN_3_5_RE = re.compile(r"qwen-?3[.\-]5", re.IGNORECASE)


def is_qwen_3_5(model: str) -> bool:
    """Check if the model is a Qwen 3.5 model.

    The check is a case-insensitive substring search, so a provider prefix
    or a size suffix around the family name does not prevent a match
    (e.g. both ``qwen/qwen3.5-27b`` and ``Qwen/Qwen3.5-27B`` match).

    Args:
        model: The model name to check.

    Returns:
        True if the model is a Qwen 3.5 model, False otherwise.
    """
    return _QWEN_3_5_RE.search(model) is not None
19
+
20
+
21
def get_model_provider():
    """Build the provider matching the endpoint given by OPENAI_BASE_URL.

    When the URL starts with ``https://openrouter.ai`` an OpenRouter
    provider is created and handed the value of the OPENAI_API_KEY envvar
    as its API key. Any other URL yields an OpenAI provider constructed
    without arguments.

    Returns:
        The model provider instance.

    Raises:
        ValueError: If OPENAI_BASE_URL envvar is unset or empty.
    """
    endpoint = os.getenv("OPENAI_BASE_URL")
    if endpoint:
        targets_openrouter = endpoint.startswith("https://openrouter.ai")
        if targets_openrouter:
            api_key = os.getenv("OPENAI_API_KEY")
            return OpenRouterProvider(api_key=api_key)
        return OpenAIProvider()
    raise ValueError("OPENAI_BASE_URL envvar is required")
@@ -0,0 +1,108 @@
1
+ from pathlib import Path
2
+
3
+ import orjson
4
+ from deepdiff import DeepHash
5
+ from pydantic import BaseModel
6
+ from pydantic_ai.capabilities.thinking import ThinkingLevel
7
+
8
+
9
+ class ModelOutputCache[BaseModelType: BaseModel | str]:
10
+ def __init__(
11
+ self,
12
+ task_name: str,
13
+ output_type: type[BaseModelType],
14
+ cache_dir: Path | None = None,
15
+ ):
16
+ if cache_dir is None:
17
+ cache_dir = Path("~/.cache/llm_evals").expanduser()
18
+ self.cache_dir = cache_dir
19
+ self.task_name = task_name
20
+ self.output_type = output_type
21
+
22
+ def get_query_cache_path(
23
+ self,
24
+ *,
25
+ model: str,
26
+ instructions: str | list,
27
+ output_mode: str | None,
28
+ thinking_effort: ThinkingLevel,
29
+ ) -> Path:
30
+ if self.output_type is str:
31
+ json_schema = None
32
+ else:
33
+ json_schema = self.output_type.model_json_schema()
34
+ model = model.replace("/", "_")
35
+ cache_key = (
36
+ model,
37
+ self.task_name,
38
+ instructions,
39
+ json_schema,
40
+ output_mode,
41
+ thinking_effort,
42
+ )
43
+ cache_sha256 = DeepHash(cache_key)[cache_key]
44
+
45
+ # Split the cache sha256 into subdirectories for better file system
46
+ # performance
47
+ cache_sha256_str = str(cache_sha256)
48
+ subdirs = [cache_sha256_str[i : i + 2] for i in range(0, 6, 2)]
49
+ cache_subdir = Path(*subdirs)
50
+ full_cache_dir = self.cache_dir / self.task_name / model / cache_subdir
51
+ return full_cache_dir / f"{cache_sha256}.json"
52
+
53
+ def check_cache(
54
+ self,
55
+ *,
56
+ model: str,
57
+ instructions: str | list,
58
+ output_mode: str | None,
59
+ thinking_effort: ThinkingLevel,
60
+ ) -> BaseModelType | None:
61
+ query_cache_path = self.get_query_cache_path(
62
+ model=model,
63
+ instructions=instructions,
64
+ output_mode=output_mode,
65
+ thinking_effort=thinking_effort,
66
+ )
67
+ if query_cache_path.exists():
68
+ output = orjson.loads(query_cache_path.read_bytes())["output"]
69
+ if self.output_type is str:
70
+ return output
71
+ else:
72
+ return self.output_type.model_validate(output)
73
+ return None
74
+
75
+ def save_to_cache(
76
+ self,
77
+ *,
78
+ model: str,
79
+ instructions: str | list,
80
+ output_mode: str | None,
81
+ thinking_effort: ThinkingLevel,
82
+ output: BaseModelType,
83
+ ) -> None:
84
+ query_cache_path = self.get_query_cache_path(
85
+ model=model,
86
+ instructions=instructions,
87
+ output_mode=output_mode,
88
+ thinking_effort=thinking_effort,
89
+ )
90
+ query_cache_path.parent.mkdir(parents=True, exist_ok=True)
91
+
92
+ if self.output_type is str:
93
+ _output = output
94
+ json_schema = None
95
+ else:
96
+ _output = output.model_dump()
97
+ json_schema = self.output_type.model_json_schema()
98
+ data = {
99
+ "output": _output,
100
+ "model": model,
101
+ "task_name": self.task_name,
102
+ "thinking_effort": thinking_effort,
103
+ "instructions": instructions,
104
+ "output_mode": output_mode,
105
+ "json_schema": json_schema,
106
+ }
107
+ with query_cache_path.open("wb") as f:
108
+ f.write(orjson.dumps(data))
@@ -0,0 +1,56 @@
1
+ import re
2
+ from difflib import Differ
3
+
4
+
5
def normalize(s: str) -> str:
    """Return a normalized copy of `s` suitable for comparison.

    Every run of whitespace characters (tabs, newlines, etc.) is collapsed
    into a single space, the typographic apostrophe is replaced by a
    straight one, the "œ" ligature is expanded to "oe", and leading/trailing
    whitespace is removed.
    """
    text = re.sub(r"\s+", " ", s)
    # Applied in this order: quote normalization, then ligature expansion
    substitutions = (("’", "'"), ("œ", "oe"))
    for old, new in substitutions:
        text = text.replace(old, new)
    return text.strip()
15
+
16
+
17
def tokenize(s: str, split_chars: str = ".:,;!/()[]{}") -> list[str]:
    """Tokenize the input string.

    The input string is first split into tokens using whitespace as
    separator (runs of whitespace count as one separator, so no empty token
    is produced), then we look for any chars in `split_chars` at the
    beginning or the end of the token to further split into additional
    tokens. A token is never reduced below one character.

    Args:
        s: The string to tokenize.
        split_chars: Characters that become standalone tokens when they are
            found at the beginning or the end of a whitespace-separated
            token.

    Returns:
        The list of tokens, in order of appearance.
    """
    output: list[str] = []
    for token in s.split():
        # [start, end) delimits the core of the token that remains once the
        # leading and trailing split chars have been peeled off.
        start, end = 0, len(token)

        # Leading split chars are emitted right away, before the core
        while end - start > 1 and token[start] in split_chars:
            output.append(token[start])
            start += 1

        # Trailing split chars are only located here; they must be emitted
        # after the core to keep the original order.
        while end - start > 1 and token[end - 1] in split_chars:
            end -= 1

        output.append(token[start:end])
        output.extend(token[end:])

    return output
44
+
45
+
46
def get_diff(s1: str, s2: str) -> str:
    """Compute the diff between two strings, returning it as a multi-line string.

    Both strings are tokenized with `tokenize`, then the two token sequences
    are compared using `difflib.Differ`. Diff entries starting with a space
    are dropped, and newlines are removed from the remaining entries before
    they are joined, one entry per line.
    """
    changed = []
    for entry in Differ().compare(tokenize(s1), tokenize(s2)):
        # Filter to only entries that differ
        if entry.startswith(" "):
            continue
        changed.append(entry.replace("\n", ""))
    return "\n".join(changed)