llmgrader 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmgrader-1.0.0/LICENSE +59 -0
- llmgrader-1.0.0/PKG-INFO +452 -0
- llmgrader-1.0.0/README.md +399 -0
- llmgrader-1.0.0/llmeval/__init__.py +116 -0
- llmgrader-1.0.0/llmeval/cli/__init__.py +1 -0
- llmgrader-1.0.0/llmeval/cli/main.py +140 -0
- llmgrader-1.0.0/llmeval/dataset.py +150 -0
- llmgrader-1.0.0/llmeval/evaluate.py +222 -0
- llmgrader-1.0.0/llmeval/integrations/__init__.py +1 -0
- llmgrader-1.0.0/llmeval/integrations/crewai.py +72 -0
- llmgrader-1.0.0/llmeval/integrations/langchain.py +99 -0
- llmgrader-1.0.0/llmeval/integrations/llamaindex.py +85 -0
- llmgrader-1.0.0/llmeval/metrics/__init__.py +58 -0
- llmgrader-1.0.0/llmeval/metrics/agentic/__init__.py +6 -0
- llmgrader-1.0.0/llmeval/metrics/agentic/argument_correctness.py +88 -0
- llmgrader-1.0.0/llmeval/metrics/agentic/step_efficiency.py +70 -0
- llmgrader-1.0.0/llmeval/metrics/agentic/task_completion.py +68 -0
- llmgrader-1.0.0/llmeval/metrics/agentic/tool_correctness.py +62 -0
- llmgrader-1.0.0/llmeval/metrics/base.py +139 -0
- llmgrader-1.0.0/llmeval/metrics/conversational/__init__.py +11 -0
- llmgrader-1.0.0/llmeval/metrics/conversational/completeness.py +58 -0
- llmgrader-1.0.0/llmeval/metrics/conversational/knowledge_retention.py +68 -0
- llmgrader-1.0.0/llmeval/metrics/conversational/relevancy.py +64 -0
- llmgrader-1.0.0/llmeval/metrics/conversational/role_adherence.py +60 -0
- llmgrader-1.0.0/llmeval/metrics/custom/__init__.py +4 -0
- llmgrader-1.0.0/llmeval/metrics/custom/dag.py +86 -0
- llmgrader-1.0.0/llmeval/metrics/custom/geval.py +133 -0
- llmgrader-1.0.0/llmeval/metrics/other/__init__.py +4 -0
- llmgrader-1.0.0/llmeval/metrics/other/json_correctness.py +108 -0
- llmgrader-1.0.0/llmeval/metrics/other/summarization.py +74 -0
- llmgrader-1.0.0/llmeval/metrics/rag/__init__.py +13 -0
- llmgrader-1.0.0/llmeval/metrics/rag/answer_relevancy.py +97 -0
- llmgrader-1.0.0/llmeval/metrics/rag/contextual_precision.py +83 -0
- llmgrader-1.0.0/llmeval/metrics/rag/contextual_recall.py +81 -0
- llmgrader-1.0.0/llmeval/metrics/rag/contextual_relevancy.py +61 -0
- llmgrader-1.0.0/llmeval/metrics/rag/faithfulness.py +99 -0
- llmgrader-1.0.0/llmeval/metrics/safety/__init__.py +7 -0
- llmgrader-1.0.0/llmeval/metrics/safety/bias.py +66 -0
- llmgrader-1.0.0/llmeval/metrics/safety/hallucination.py +84 -0
- llmgrader-1.0.0/llmeval/metrics/safety/misuse.py +55 -0
- llmgrader-1.0.0/llmeval/metrics/safety/pii_leakage.py +87 -0
- llmgrader-1.0.0/llmeval/metrics/safety/toxicity.py +61 -0
- llmgrader-1.0.0/llmeval/providers/__init__.py +12 -0
- llmgrader-1.0.0/llmeval/providers/anthropic_provider.py +49 -0
- llmgrader-1.0.0/llmeval/providers/base.py +49 -0
- llmgrader-1.0.0/llmeval/providers/ollama_provider.py +60 -0
- llmgrader-1.0.0/llmeval/providers/openai_provider.py +132 -0
- llmgrader-1.0.0/llmeval/pytest_plugin.py +76 -0
- llmgrader-1.0.0/llmeval/synthesizer.py +181 -0
- llmgrader-1.0.0/llmeval/test_case.py +158 -0
- llmgrader-1.0.0/llmeval/tracing/__init__.py +3 -0
- llmgrader-1.0.0/llmeval/tracing/tracer.py +181 -0
- llmgrader-1.0.0/llmgrader.egg-info/PKG-INFO +452 -0
- llmgrader-1.0.0/llmgrader.egg-info/SOURCES.txt +59 -0
- llmgrader-1.0.0/llmgrader.egg-info/dependency_links.txt +1 -0
- llmgrader-1.0.0/llmgrader.egg-info/entry_points.txt +5 -0
- llmgrader-1.0.0/llmgrader.egg-info/requires.txt +27 -0
- llmgrader-1.0.0/llmgrader.egg-info/top_level.txt +1 -0
- llmgrader-1.0.0/setup.cfg +4 -0
- llmgrader-1.0.0/setup.py +55 -0
- llmgrader-1.0.0/tests/test_core.py +551 -0
llmgrader-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction, and distribution.
|
|
10
|
+
|
|
11
|
+
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner.
|
|
12
|
+
|
|
13
|
+
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity.
|
|
14
|
+
|
|
15
|
+
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
|
|
16
|
+
|
|
17
|
+
"Source" form shall mean the preferred form for making modifications.
|
|
18
|
+
|
|
19
|
+
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form.
|
|
20
|
+
|
|
21
|
+
"Work" shall mean the work of authorship made available under the License.
|
|
22
|
+
|
|
23
|
+
"Derivative Works" shall mean any work that is based on the Work.
|
|
24
|
+
|
|
25
|
+
"Contribution" shall mean any work of authorship submitted to the Licensor for inclusion in the Work.
|
|
26
|
+
|
|
27
|
+
"Contributor" shall mean Licensor and any Legal Entity on behalf of whom a Contribution has been received by the Licensor and included within the Work.
|
|
28
|
+
|
|
29
|
+
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
|
|
30
|
+
|
|
31
|
+
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work.
|
|
32
|
+
|
|
33
|
+
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
|
|
34
|
+
(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
|
|
35
|
+
(b) You must cause any modified files to carry prominent notices stating that You changed the files; and
|
|
36
|
+
(c) You must retain, in all Source forms of the Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work; and
|
|
37
|
+
(d) If the Work includes a "NOTICE" text file, You must include a readable copy of the attribution notices contained within such NOTICE file.
|
|
38
|
+
|
|
39
|
+
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License.
|
|
40
|
+
|
|
41
|
+
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor.
|
|
42
|
+
|
|
43
|
+
7. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING, LICENSOR PROVIDES THE WORK ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
|
|
44
|
+
|
|
45
|
+
8. Limitation of Liability. IN NO EVENT SHALL ANY CONTRIBUTOR BE LIABLE FOR ANY DAMAGES ARISING FROM THIS LICENSE OR OUT OF OR IN CONNECTION WITH THE USE OR INABILITY TO USE THE WORK.
|
|
46
|
+
|
|
47
|
+
Copyright 2024 Mahesh Makvana
|
|
48
|
+
|
|
49
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
50
|
+
you may not use this file except in compliance with the License.
|
|
51
|
+
You may obtain a copy of the License at
|
|
52
|
+
|
|
53
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
54
|
+
|
|
55
|
+
Unless required by applicable law or agreed to in writing, software
|
|
56
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
57
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
58
|
+
See the License for the specific language governing permissions and
|
|
59
|
+
limitations under the License.
|
llmgrader-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llmgrader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Open-source LLM evaluation framework — 50+ research-backed metrics for RAG, agents, safety, and more
|
|
5
|
+
Home-page: https://github.com/maheshmakvana/llmeval
|
|
6
|
+
Author: Mahesh Makvana
|
|
7
|
+
Author-email: maheshmakvana@example.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: openai>=1.0.0
|
|
20
|
+
Requires-Dist: anthropic>=0.20.0
|
|
21
|
+
Requires-Dist: pydantic>=2.0.0
|
|
22
|
+
Requires-Dist: rich>=13.0.0
|
|
23
|
+
Requires-Dist: typer>=0.9.0
|
|
24
|
+
Requires-Dist: pytest>=7.0.0
|
|
25
|
+
Requires-Dist: httpx>=0.24.0
|
|
26
|
+
Requires-Dist: numpy>=1.20.0
|
|
27
|
+
Requires-Dist: tenacity>=8.0.0
|
|
28
|
+
Requires-Dist: jinja2>=3.0.0
|
|
29
|
+
Requires-Dist: colorama>=0.4.6
|
|
30
|
+
Provides-Extra: langchain
|
|
31
|
+
Requires-Dist: langchain>=0.1.0; extra == "langchain"
|
|
32
|
+
Requires-Dist: langchain-openai>=0.1.0; extra == "langchain"
|
|
33
|
+
Provides-Extra: llamaindex
|
|
34
|
+
Requires-Dist: llama-index>=0.10.0; extra == "llamaindex"
|
|
35
|
+
Provides-Extra: ollama
|
|
36
|
+
Requires-Dist: ollama>=0.1.0; extra == "ollama"
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Requires-Dist: langchain>=0.1.0; extra == "all"
|
|
39
|
+
Requires-Dist: langchain-openai>=0.1.0; extra == "all"
|
|
40
|
+
Requires-Dist: llama-index>=0.10.0; extra == "all"
|
|
41
|
+
Requires-Dist: ollama>=0.1.0; extra == "all"
|
|
42
|
+
Dynamic: author
|
|
43
|
+
Dynamic: author-email
|
|
44
|
+
Dynamic: classifier
|
|
45
|
+
Dynamic: description
|
|
46
|
+
Dynamic: description-content-type
|
|
47
|
+
Dynamic: home-page
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
Dynamic: provides-extra
|
|
50
|
+
Dynamic: requires-dist
|
|
51
|
+
Dynamic: requires-python
|
|
52
|
+
Dynamic: summary
|
|
53
|
+
|
|
54
|
+
# llmeval
|
|
55
|
+
|
|
56
|
+
**Open-source LLM evaluation framework** — 50+ research-backed metrics for RAG pipelines, AI agents, safety, and conversational systems. Pytest-native. Provider-agnostic.
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install llmgrader
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Quick Start
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from llmeval import LLMTestCase, assert_test
|
|
68
|
+
from llmeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
|
|
69
|
+
|
|
70
|
+
tc = LLMTestCase(
|
|
71
|
+
input="What is the capital of France?",
|
|
72
|
+
actual_output="The capital of France is Paris.",
|
|
73
|
+
retrieval_context=["France is a country in Western Europe. Its capital is Paris."],
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
assert_test(tc, metrics=[
|
|
77
|
+
AnswerRelevancyMetric(threshold=0.7),
|
|
78
|
+
FaithfulnessMetric(threshold=0.8),
|
|
79
|
+
])
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Features
|
|
85
|
+
|
|
86
|
+
| Category | What it does |
|
|
87
|
+
|---|---|
|
|
88
|
+
| **RAG Evaluation** | Answer relevancy, faithfulness, contextual precision/recall/relevancy |
|
|
89
|
+
| **Custom Metrics** | GEval (LLM-as-judge + CoT), DAG (deterministic decision-tree) |
|
|
90
|
+
| **Safety** | Hallucination, bias, toxicity, PII leakage, misuse detection |
|
|
91
|
+
| **Agent Evaluation** | Task completion, tool correctness, step efficiency, argument correctness |
|
|
92
|
+
| **Conversational** | Relevancy, completeness, role adherence, knowledge retention |
|
|
93
|
+
| **Other** | JSON correctness (with schema), summarization quality |
|
|
94
|
+
| **Pytest Native** | `assert_test()`, fixtures, parametrize helpers |
|
|
95
|
+
| **Tracing** | `@observe` decorator, span/trace trees, component-level evaluation |
|
|
96
|
+
| **Bulk Evaluation** | `evaluate()` with concurrent execution and aggregated reports |
|
|
97
|
+
| **Dataset Tools** | `EvaluationDataset`, versioned JSON storage, CSV import |
|
|
98
|
+
| **Synthesizer** | Auto-generate Goldens from documents (4-step pipeline) |
|
|
99
|
+
| **Providers** | OpenAI, Azure OpenAI, Anthropic, Ollama, custom LLM base class |
|
|
100
|
+
| **Integrations** | LangChain, LlamaIndex, CrewAI |
|
|
101
|
+
| **CLI** | `llmeval test`, `llmeval set-openai`, `llmeval list-metrics` |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Installation
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Core
|
|
109
|
+
pip install llmgrader
|
|
110
|
+
|
|
111
|
+
# With LangChain integration
|
|
112
|
+
pip install "llmgrader[langchain]"
|
|
113
|
+
|
|
114
|
+
# With LlamaIndex
|
|
115
|
+
pip install "llmgrader[llamaindex]"
|
|
116
|
+
|
|
117
|
+
# Everything
|
|
118
|
+
pip install "llmgrader[all]"
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Metrics Reference
|
|
124
|
+
|
|
125
|
+
### RAG Metrics
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from llmeval.metrics import (
|
|
129
|
+
AnswerRelevancyMetric, # Is the answer relevant to the question?
|
|
130
|
+
FaithfulnessMetric, # Are claims grounded in retrieved context?
|
|
131
|
+
ContextualRelevancyMetric, # Are retrieved chunks relevant to the query?
|
|
132
|
+
ContextualPrecisionMetric, # Do relevant chunks rank higher?
|
|
133
|
+
ContextualRecallMetric, # Does context cover expected answer claims?
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
tc = LLMTestCase(
|
|
137
|
+
input="What causes rain?",
|
|
138
|
+
actual_output="Rain is caused by water vapor condensing in clouds.",
|
|
139
|
+
expected_output="Rain is caused by condensation of water vapor.",
|
|
140
|
+
retrieval_context=[
|
|
141
|
+
"The water cycle involves evaporation and condensation.",
|
|
142
|
+
"Rain forms when water vapor cools and condenses around particles.",
|
|
143
|
+
],
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
result = FaithfulnessMetric(threshold=0.8).measure(tc)
|
|
147
|
+
print(result.score, result.reason)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Custom: GEval (LLM-as-Judge)
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from llmeval import GEvalMetric, LLMTestCaseParams
|
|
154
|
+
|
|
155
|
+
metric = GEvalMetric(
|
|
156
|
+
name="Correctness",
|
|
157
|
+
criteria="The output should be factually correct and directly answer the question.",
|
|
158
|
+
evaluation_params=[
|
|
159
|
+
LLMTestCaseParams.INPUT,
|
|
160
|
+
LLMTestCaseParams.ACTUAL_OUTPUT,
|
|
161
|
+
LLMTestCaseParams.EXPECTED_OUTPUT,
|
|
162
|
+
],
|
|
163
|
+
threshold=0.7,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
result = metric.measure(tc)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Custom: DAG (Deterministic)
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from llmeval import DAGMetric
|
|
173
|
+
from llmeval.metrics.custom.dag import DAGNode
|
|
174
|
+
|
|
175
|
+
dag = DAGNode(
|
|
176
|
+
condition=lambda tc: len(tc.actual_output) > 0,
|
|
177
|
+
score_if_false=0.0,
|
|
178
|
+
next_if_true=DAGNode(
|
|
179
|
+
condition=lambda tc: "error" not in tc.actual_output.lower(),
|
|
180
|
+
score_if_true=1.0,
|
|
181
|
+
score_if_false=0.2,
|
|
182
|
+
)
|
|
183
|
+
)
|
|
184
|
+
metric = DAGMetric(name="ResponseQuality", root=dag, threshold=0.5)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Safety Metrics
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from llmeval.metrics import (
|
|
191
|
+
HallucinationMetric, # Detects factual hallucinations vs context
|
|
192
|
+
BiasMetric, # Gender, racial, political, religious bias
|
|
193
|
+
ToxicityMetric, # Hate speech, harassment, harmful content
|
|
194
|
+
PIILeakageMetric, # SSN, email, phone, credit card detection
|
|
195
|
+
MisuseMetric, # Weapons, illegal activity enablement
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
result = BiasMetric(threshold=0.7).measure(tc)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Agentic Metrics
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from llmeval import ToolCall
|
|
205
|
+
from llmeval.metrics import (
|
|
206
|
+
TaskCompletionMetric, # Did the agent accomplish the goal?
|
|
207
|
+
ToolCorrectnessMetric, # Were the right tools called?
|
|
208
|
+
StepEfficiencyMetric, # Were unnecessary steps avoided?
|
|
209
|
+
ArgumentCorrectnessMetric, # Were tool arguments correct?
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
tc = LLMTestCase(
|
|
213
|
+
input="Search for the latest news on AI and summarize it.",
|
|
214
|
+
actual_output="Here is a summary of recent AI news...",
|
|
215
|
+
tools_called=[
|
|
216
|
+
ToolCall(name="web_search", input_parameters={"query": "latest AI news"}),
|
|
217
|
+
ToolCall(name="summarize", input_parameters={"max_length": 200}),
|
|
218
|
+
],
|
|
219
|
+
expected_tools=["web_search", "summarize"],
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
result = ToolCorrectnessMetric(threshold=0.8).measure(tc)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Conversational Metrics
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
from llmeval import ConversationalTestCase, Message
|
|
229
|
+
from llmeval.metrics import (
|
|
230
|
+
ConversationalRelevancyMetric,
|
|
231
|
+
ConversationCompletenessMetric,
|
|
232
|
+
RoleAdherenceMetric,
|
|
233
|
+
KnowledgeRetentionMetric,
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
tc = ConversationalTestCase(
|
|
237
|
+
messages=[
|
|
238
|
+
Message(role="user", content="My name is Alice and I like Python."),
|
|
239
|
+
Message(role="assistant", content="Nice to meet you, Alice! Python is great."),
|
|
240
|
+
Message(role="user", content="What's my name again?"),
|
|
241
|
+
Message(role="assistant", content="Your name is Alice."),
|
|
242
|
+
],
|
|
243
|
+
chatbot_role="A helpful assistant that remembers user preferences.",
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
result = KnowledgeRetentionMetric(threshold=0.7).measure(tc)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Bulk Evaluation
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
from llmeval import evaluate, LLMTestCase
|
|
255
|
+
from llmeval.metrics import AnswerRelevancyMetric, JSONCorrectnessMetric
|
|
256
|
+
|
|
257
|
+
test_cases = [
|
|
258
|
+
LLMTestCase(input="What is 2+2?", actual_output="4"),
|
|
259
|
+
LLMTestCase(input="Capital of Japan?", actual_output="Tokyo"),
|
|
260
|
+
LLMTestCase(input="Return JSON", actual_output='{"status": "ok"}'),
|
|
261
|
+
]
|
|
262
|
+
|
|
263
|
+
result = evaluate(
|
|
264
|
+
test_cases=test_cases,
|
|
265
|
+
metrics=[AnswerRelevancyMetric(), JSONCorrectnessMetric()],
|
|
266
|
+
max_concurrent=4,
|
|
267
|
+
verbose=True,
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
print(f"Pass rate: {result.pass_rate:.1%}")
|
|
271
|
+
print(f"Overall score: {result.overall_score:.3f}")
|
|
272
|
+
result.print_summary()
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Pytest Integration
|
|
278
|
+
|
|
279
|
+
```python
|
|
280
|
+
# test_my_llm.py
|
|
281
|
+
import pytest
|
|
282
|
+
from llmeval import LLMTestCase, assert_test
|
|
283
|
+
from llmeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
|
|
284
|
+
|
|
285
|
+
def test_rag_answer():
|
|
286
|
+
tc = LLMTestCase(
|
|
287
|
+
input="What causes lightning?",
|
|
288
|
+
actual_output=my_rag_pipeline("What causes lightning?"),
|
|
289
|
+
retrieval_context=get_context("lightning"),
|
|
290
|
+
)
|
|
291
|
+
assert_test(tc, metrics=[
|
|
292
|
+
AnswerRelevancyMetric(threshold=0.7),
|
|
293
|
+
FaithfulnessMetric(threshold=0.8),
|
|
294
|
+
])
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# Run with: llmeval test test_my_llm.py
|
|
298
|
+
# Or: pytest test_my_llm.py
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## Tracing & Component-Level Evaluation
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
from llmeval import observe, Tracer, set_tracer, clear_tracer
|
|
307
|
+
|
|
308
|
+
tracer = Tracer()
|
|
309
|
+
set_tracer(tracer)
|
|
310
|
+
trace = tracer.start_trace()
|
|
311
|
+
|
|
312
|
+
@observe(span_type="retriever")
|
|
313
|
+
def retrieve(query: str) -> list:
|
|
314
|
+
return vector_db.search(query)
|
|
315
|
+
|
|
316
|
+
@observe(span_type="llm")
|
|
317
|
+
def generate(context: list, query: str) -> str:
|
|
318
|
+
return llm.generate(f"Context: {context}\nQuestion: {query}")
|
|
319
|
+
|
|
320
|
+
def rag_pipeline(query: str) -> str:
|
|
321
|
+
context = retrieve(query)
|
|
322
|
+
return generate(context, query)
|
|
323
|
+
|
|
324
|
+
answer = rag_pipeline("What is quantum computing?")
|
|
325
|
+
tracer.end_trace()
|
|
326
|
+
clear_tracer()
|
|
327
|
+
|
|
328
|
+
tracer.print_last_trace() # Shows span tree with latencies
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## Dataset Management
|
|
334
|
+
|
|
335
|
+
```python
|
|
336
|
+
from llmeval import EvaluationDataset, Golden
|
|
337
|
+
|
|
338
|
+
# Build a dataset
|
|
339
|
+
ds = EvaluationDataset()
|
|
340
|
+
ds.add_goldens([
|
|
341
|
+
Golden(input="What is AI?", expected_output="Artificial intelligence."),
|
|
342
|
+
Golden(input="Capital of Germany?", expected_output="Berlin"),
|
|
343
|
+
])
|
|
344
|
+
ds.save("my_dataset.json")
|
|
345
|
+
|
|
346
|
+
# Load and use
|
|
347
|
+
ds = EvaluationDataset.load("my_dataset.json")
|
|
348
|
+
test_cases = ds.to_test_cases(generate_fn=my_llm.generate)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## Synthetic Dataset Generation
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from llmeval import Synthesizer
|
|
357
|
+
|
|
358
|
+
synth = Synthesizer()
|
|
359
|
+
|
|
360
|
+
docs = [
|
|
361
|
+
"The Python programming language was created by Guido van Rossum...",
|
|
362
|
+
"Machine learning is a branch of artificial intelligence...",
|
|
363
|
+
]
|
|
364
|
+
|
|
365
|
+
goldens = synth.generate_goldens_from_docs(
|
|
366
|
+
documents=docs,
|
|
367
|
+
max_goldens_per_doc=5,
|
|
368
|
+
filter_questions=True,
|
|
369
|
+
evolve_questions=True,
|
|
370
|
+
generate_expected_outputs=True,
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
print(f"Generated {len(goldens)} golden test cases")
|
|
374
|
+
for g in goldens[:2]:
|
|
375
|
+
print(f"Q: {g.input}")
|
|
376
|
+
print(f"A: {g.expected_output}\n")
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
---
|
|
380
|
+
|
|
381
|
+
## LLM Providers
|
|
382
|
+
|
|
383
|
+
```python
|
|
384
|
+
from llmeval.providers import OpenAIProvider, AnthropicProvider, OllamaProvider
|
|
385
|
+
|
|
386
|
+
# OpenAI
|
|
387
|
+
provider = OpenAIProvider(model="gpt-4o", api_key="sk-...")
|
|
388
|
+
|
|
389
|
+
# Anthropic Claude
|
|
390
|
+
provider = AnthropicProvider(model="claude-sonnet-4-6")
|
|
391
|
+
|
|
392
|
+
# Ollama (local)
|
|
393
|
+
provider = OllamaProvider(model="llama3")
|
|
394
|
+
|
|
395
|
+
# Custom provider
|
|
396
|
+
from llmeval.providers import LLMProvider
|
|
397
|
+
|
|
398
|
+
class MyProvider(LLMProvider):
|
|
399
|
+
def generate(self, prompt: str, **kwargs) -> str:
|
|
400
|
+
return my_llm_api.call(prompt)
|
|
401
|
+
|
|
402
|
+
# Use in any metric
|
|
403
|
+
metric = AnswerRelevancyMetric(model=MyProvider())
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
---
|
|
407
|
+
|
|
408
|
+
## LangChain Integration
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
from langchain_openai import ChatOpenAI
|
|
412
|
+
from llmeval.integrations.langchain import LangChainCallbackHandler, evaluate_chain
|
|
413
|
+
from llmeval.metrics import AnswerRelevancyMetric
|
|
414
|
+
|
|
415
|
+
llm = ChatOpenAI(model="gpt-4o")
|
|
416
|
+
chain = llm # or any LangChain runnable
|
|
417
|
+
|
|
418
|
+
result = evaluate_chain(
|
|
419
|
+
chain=chain,
|
|
420
|
+
inputs=["What is the capital of France?", "Who invented Python?"],
|
|
421
|
+
metrics=[AnswerRelevancyMetric(threshold=0.7)],
|
|
422
|
+
)
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
---
|
|
426
|
+
|
|
427
|
+
## CLI
|
|
428
|
+
|
|
429
|
+
```bash
|
|
430
|
+
# Run evaluation tests
|
|
431
|
+
llmeval test tests/test_llm.py
|
|
432
|
+
llmeval test tests/ -n 4 # 4 parallel workers
|
|
433
|
+
|
|
434
|
+
# Configure providers
|
|
435
|
+
llmeval set-openai --key sk-... --model gpt-4o
|
|
436
|
+
llmeval set-anthropic --key sk-... --model claude-sonnet-4-6
|
|
437
|
+
llmeval set-ollama --model llama3
|
|
438
|
+
|
|
439
|
+
# List all metrics
|
|
440
|
+
llmeval list-metrics
|
|
441
|
+
|
|
442
|
+
# Version
|
|
443
|
+
llmeval version
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
---
|
|
447
|
+
|
|
448
|
+
## License
|
|
449
|
+
|
|
450
|
+
Apache 2.0 — see LICENSE.
|
|
451
|
+
|
|
452
|
+
**Author:** Mahesh Makvana
|