llmgrader 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. llmgrader-1.0.0/LICENSE +59 -0
  2. llmgrader-1.0.0/PKG-INFO +452 -0
  3. llmgrader-1.0.0/README.md +399 -0
  4. llmgrader-1.0.0/llmeval/__init__.py +116 -0
  5. llmgrader-1.0.0/llmeval/cli/__init__.py +1 -0
  6. llmgrader-1.0.0/llmeval/cli/main.py +140 -0
  7. llmgrader-1.0.0/llmeval/dataset.py +150 -0
  8. llmgrader-1.0.0/llmeval/evaluate.py +222 -0
  9. llmgrader-1.0.0/llmeval/integrations/__init__.py +1 -0
  10. llmgrader-1.0.0/llmeval/integrations/crewai.py +72 -0
  11. llmgrader-1.0.0/llmeval/integrations/langchain.py +99 -0
  12. llmgrader-1.0.0/llmeval/integrations/llamaindex.py +85 -0
  13. llmgrader-1.0.0/llmeval/metrics/__init__.py +58 -0
  14. llmgrader-1.0.0/llmeval/metrics/agentic/__init__.py +6 -0
  15. llmgrader-1.0.0/llmeval/metrics/agentic/argument_correctness.py +88 -0
  16. llmgrader-1.0.0/llmeval/metrics/agentic/step_efficiency.py +70 -0
  17. llmgrader-1.0.0/llmeval/metrics/agentic/task_completion.py +68 -0
  18. llmgrader-1.0.0/llmeval/metrics/agentic/tool_correctness.py +62 -0
  19. llmgrader-1.0.0/llmeval/metrics/base.py +139 -0
  20. llmgrader-1.0.0/llmeval/metrics/conversational/__init__.py +11 -0
  21. llmgrader-1.0.0/llmeval/metrics/conversational/completeness.py +58 -0
  22. llmgrader-1.0.0/llmeval/metrics/conversational/knowledge_retention.py +68 -0
  23. llmgrader-1.0.0/llmeval/metrics/conversational/relevancy.py +64 -0
  24. llmgrader-1.0.0/llmeval/metrics/conversational/role_adherence.py +60 -0
  25. llmgrader-1.0.0/llmeval/metrics/custom/__init__.py +4 -0
  26. llmgrader-1.0.0/llmeval/metrics/custom/dag.py +86 -0
  27. llmgrader-1.0.0/llmeval/metrics/custom/geval.py +133 -0
  28. llmgrader-1.0.0/llmeval/metrics/other/__init__.py +4 -0
  29. llmgrader-1.0.0/llmeval/metrics/other/json_correctness.py +108 -0
  30. llmgrader-1.0.0/llmeval/metrics/other/summarization.py +74 -0
  31. llmgrader-1.0.0/llmeval/metrics/rag/__init__.py +13 -0
  32. llmgrader-1.0.0/llmeval/metrics/rag/answer_relevancy.py +97 -0
  33. llmgrader-1.0.0/llmeval/metrics/rag/contextual_precision.py +83 -0
  34. llmgrader-1.0.0/llmeval/metrics/rag/contextual_recall.py +81 -0
  35. llmgrader-1.0.0/llmeval/metrics/rag/contextual_relevancy.py +61 -0
  36. llmgrader-1.0.0/llmeval/metrics/rag/faithfulness.py +99 -0
  37. llmgrader-1.0.0/llmeval/metrics/safety/__init__.py +7 -0
  38. llmgrader-1.0.0/llmeval/metrics/safety/bias.py +66 -0
  39. llmgrader-1.0.0/llmeval/metrics/safety/hallucination.py +84 -0
  40. llmgrader-1.0.0/llmeval/metrics/safety/misuse.py +55 -0
  41. llmgrader-1.0.0/llmeval/metrics/safety/pii_leakage.py +87 -0
  42. llmgrader-1.0.0/llmeval/metrics/safety/toxicity.py +61 -0
  43. llmgrader-1.0.0/llmeval/providers/__init__.py +12 -0
  44. llmgrader-1.0.0/llmeval/providers/anthropic_provider.py +49 -0
  45. llmgrader-1.0.0/llmeval/providers/base.py +49 -0
  46. llmgrader-1.0.0/llmeval/providers/ollama_provider.py +60 -0
  47. llmgrader-1.0.0/llmeval/providers/openai_provider.py +132 -0
  48. llmgrader-1.0.0/llmeval/pytest_plugin.py +76 -0
  49. llmgrader-1.0.0/llmeval/synthesizer.py +181 -0
  50. llmgrader-1.0.0/llmeval/test_case.py +158 -0
  51. llmgrader-1.0.0/llmeval/tracing/__init__.py +3 -0
  52. llmgrader-1.0.0/llmeval/tracing/tracer.py +181 -0
  53. llmgrader-1.0.0/llmgrader.egg-info/PKG-INFO +452 -0
  54. llmgrader-1.0.0/llmgrader.egg-info/SOURCES.txt +59 -0
  55. llmgrader-1.0.0/llmgrader.egg-info/dependency_links.txt +1 -0
  56. llmgrader-1.0.0/llmgrader.egg-info/entry_points.txt +5 -0
  57. llmgrader-1.0.0/llmgrader.egg-info/requires.txt +27 -0
  58. llmgrader-1.0.0/llmgrader.egg-info/top_level.txt +1 -0
  59. llmgrader-1.0.0/setup.cfg +4 -0
  60. llmgrader-1.0.0/setup.py +55 -0
  61. llmgrader-1.0.0/tests/test_core.py +551 -0
@@ -0,0 +1,59 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction, and distribution.
10
+
11
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner.
12
+
13
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity.
14
+
15
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
16
+
17
+ "Source" form shall mean the preferred form for making modifications.
18
+
19
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form.
20
+
21
+ "Work" shall mean the work of authorship made available under the License.
22
+
23
+ "Derivative Works" shall mean any work that is based on the Work.
24
+
25
+ "Contribution" shall mean any work of authorship submitted to the Licensor for inclusion in the Work.
26
+
27
+ "Contributor" shall mean Licensor and any Legal Entity on behalf of whom a Contribution has been received by the Licensor and included within the Work.
28
+
29
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
30
+
31
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work.
32
+
33
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
34
+ (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
35
+ (b) You must cause any modified files to carry prominent notices stating that You changed the files; and
36
+ (c) You must retain, in all Source forms of the Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work; and
37
+ (d) If the Work includes a "NOTICE" text file, You must include a readable copy of the attribution notices contained within such NOTICE file.
38
+
39
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License.
40
+
41
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor.
42
+
43
+ 7. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING, LICENSOR PROVIDES THE WORK ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
44
+
45
+ 8. Limitation of Liability. IN NO EVENT SHALL ANY CONTRIBUTOR BE LIABLE FOR ANY DAMAGES ARISING FROM THIS LICENSE OR OUT OF OR IN CONNECTION WITH THE USE OR INABILITY TO USE THE WORK.
46
+
47
+ Copyright 2024 Mahesh Makvana
48
+
49
+ Licensed under the Apache License, Version 2.0 (the "License");
50
+ you may not use this file except in compliance with the License.
51
+ You may obtain a copy of the License at
52
+
53
+ http://www.apache.org/licenses/LICENSE-2.0
54
+
55
+ Unless required by applicable law or agreed to in writing, software
56
+ distributed under the License is distributed on an "AS IS" BASIS,
57
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
58
+ See the License for the specific language governing permissions and
59
+ limitations under the License.
@@ -0,0 +1,452 @@
1
+ Metadata-Version: 2.4
2
+ Name: llmgrader
3
+ Version: 1.0.0
4
+ Summary: Open-source LLM evaluation framework — 50+ research-backed metrics for RAG, agents, safety, and more
5
+ Home-page: https://github.com/maheshmakvana/llmeval
6
+ Author: Mahesh Makvana
7
+ Author-email: maheshmakvana@example.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: openai>=1.0.0
20
+ Requires-Dist: anthropic>=0.20.0
21
+ Requires-Dist: pydantic>=2.0.0
22
+ Requires-Dist: rich>=13.0.0
23
+ Requires-Dist: typer>=0.9.0
24
+ Requires-Dist: pytest>=7.0.0
25
+ Requires-Dist: httpx>=0.24.0
26
+ Requires-Dist: numpy>=1.20.0
27
+ Requires-Dist: tenacity>=8.0.0
28
+ Requires-Dist: jinja2>=3.0.0
29
+ Requires-Dist: colorama>=0.4.6
30
+ Provides-Extra: langchain
31
+ Requires-Dist: langchain>=0.1.0; extra == "langchain"
32
+ Requires-Dist: langchain-openai>=0.1.0; extra == "langchain"
33
+ Provides-Extra: llamaindex
34
+ Requires-Dist: llama-index>=0.10.0; extra == "llamaindex"
35
+ Provides-Extra: ollama
36
+ Requires-Dist: ollama>=0.1.0; extra == "ollama"
37
+ Provides-Extra: all
38
+ Requires-Dist: langchain>=0.1.0; extra == "all"
39
+ Requires-Dist: langchain-openai>=0.1.0; extra == "all"
40
+ Requires-Dist: llama-index>=0.10.0; extra == "all"
41
+ Requires-Dist: ollama>=0.1.0; extra == "all"
42
+ Dynamic: author
43
+ Dynamic: author-email
44
+ Dynamic: classifier
45
+ Dynamic: description
46
+ Dynamic: description-content-type
47
+ Dynamic: home-page
48
+ Dynamic: license-file
49
+ Dynamic: provides-extra
50
+ Dynamic: requires-dist
51
+ Dynamic: requires-python
52
+ Dynamic: summary
53
+
54
+ # llmeval
55
+
56
+ **Open-source LLM evaluation framework** — 50+ research-backed metrics for RAG pipelines, AI agents, safety, and conversational systems. Pytest-native. Provider-agnostic.
57
+
58
+ ```bash
59
+ pip install llmgrader
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Quick Start
65
+
66
+ ```python
67
+ from llmeval import LLMTestCase, assert_test
68
+ from llmeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
69
+
70
+ tc = LLMTestCase(
71
+ input="What is the capital of France?",
72
+ actual_output="The capital of France is Paris.",
73
+ retrieval_context=["France is a country in Western Europe. Its capital is Paris."],
74
+ )
75
+
76
+ assert_test(tc, metrics=[
77
+ AnswerRelevancyMetric(threshold=0.7),
78
+ FaithfulnessMetric(threshold=0.8),
79
+ ])
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Features
85
+
86
+ | Category | What it does |
87
+ |---|---|
88
+ | **RAG Evaluation** | Answer relevancy, faithfulness, contextual precision/recall/relevancy |
89
+ | **Custom Metrics** | GEval (LLM-as-judge + CoT), DAG (deterministic decision-tree) |
90
+ | **Safety** | Hallucination, bias, toxicity, PII leakage, misuse detection |
91
+ | **Agent Evaluation** | Task completion, tool correctness, step efficiency, argument correctness |
92
+ | **Conversational** | Relevancy, completeness, role adherence, knowledge retention |
93
+ | **Other** | JSON correctness (with schema), summarization quality |
94
+ | **Pytest Native** | `assert_test()`, fixtures, parametrize helpers |
95
+ | **Tracing** | `@observe` decorator, span/trace trees, component-level evaluation |
96
+ | **Bulk Evaluation** | `evaluate()` with concurrent execution and aggregated reports |
97
+ | **Dataset Tools** | `EvaluationDataset`, versioned JSON storage, CSV import |
98
+ | **Synthesizer** | Auto-generate Goldens from documents (4-step pipeline) |
99
+ | **Providers** | OpenAI, Azure OpenAI, Anthropic, Ollama, custom LLM base class |
100
+ | **Integrations** | LangChain, LlamaIndex, CrewAI |
101
+ | **CLI** | `llmeval test`, `llmeval set-openai`, `llmeval list-metrics` |
102
+
103
+ ---
104
+
105
+ ## Installation
106
+
107
+ ```bash
108
+ # Core
109
+ pip install llmgrader
110
+
111
+ # With LangChain integration
112
+ pip install "llmgrader[langchain]"
113
+
114
+ # With LlamaIndex
115
+ pip install "llmgrader[llamaindex]"
116
+
117
+ # Everything
118
+ pip install "llmgrader[all]"
119
+ ```
120
+
121
+ ---
122
+
123
+ ## Metrics Reference
124
+
125
+ ### RAG Metrics
126
+
127
+ ```python
128
+ from llmeval.metrics import (
129
+ AnswerRelevancyMetric, # Is the answer relevant to the question?
130
+ FaithfulnessMetric, # Are claims grounded in retrieved context?
131
+ ContextualRelevancyMetric, # Are retrieved chunks relevant to the query?
132
+ ContextualPrecisionMetric, # Do relevant chunks rank higher?
133
+ ContextualRecallMetric, # Does context cover expected answer claims?
134
+ )
135
+
136
+ tc = LLMTestCase(
137
+ input="What causes rain?",
138
+ actual_output="Rain is caused by water vapor condensing in clouds.",
139
+ expected_output="Rain is caused by condensation of water vapor.",
140
+ retrieval_context=[
141
+ "The water cycle involves evaporation and condensation.",
142
+ "Rain forms when water vapor cools and condenses around particles.",
143
+ ],
144
+ )
145
+
146
+ result = FaithfulnessMetric(threshold=0.8).measure(tc)
147
+ print(result.score, result.reason)
148
+ ```
149
+
150
+ ### Custom: GEval (LLM-as-Judge)
151
+
152
+ ```python
153
+ from llmeval import GEvalMetric, LLMTestCaseParams
154
+
155
+ metric = GEvalMetric(
156
+ name="Correctness",
157
+ criteria="The output should be factually correct and directly answer the question.",
158
+ evaluation_params=[
159
+ LLMTestCaseParams.INPUT,
160
+ LLMTestCaseParams.ACTUAL_OUTPUT,
161
+ LLMTestCaseParams.EXPECTED_OUTPUT,
162
+ ],
163
+ threshold=0.7,
164
+ )
165
+
166
+ result = metric.measure(tc)
167
+ ```
168
+
169
+ ### Custom: DAG (Deterministic)
170
+
171
+ ```python
172
+ from llmeval import DAGMetric
173
+ from llmeval.metrics.custom.dag import DAGNode
174
+
175
+ dag = DAGNode(
176
+ condition=lambda tc: len(tc.actual_output) > 0,
177
+ score_if_false=0.0,
178
+ next_if_true=DAGNode(
179
+ condition=lambda tc: "error" not in tc.actual_output.lower(),
180
+ score_if_true=1.0,
181
+ score_if_false=0.2,
182
+ )
183
+ )
184
+ metric = DAGMetric(name="ResponseQuality", root=dag, threshold=0.5)
185
+ ```
186
+
187
+ ### Safety Metrics
188
+
189
+ ```python
190
+ from llmeval.metrics import (
191
+ HallucinationMetric, # Detects factual hallucinations vs context
192
+ BiasMetric, # Gender, racial, political, religious bias
193
+ ToxicityMetric, # Hate speech, harassment, harmful content
194
+ PIILeakageMetric, # SSN, email, phone, credit card detection
195
+ MisuseMetric, # Weapons, illegal activity enablement
196
+ )
197
+
198
+ result = BiasMetric(threshold=0.7).measure(tc)
199
+ ```
200
+
201
+ ### Agentic Metrics
202
+
203
+ ```python
204
+ from llmeval import ToolCall
205
+ from llmeval.metrics import (
206
+ TaskCompletionMetric, # Did the agent accomplish the goal?
207
+ ToolCorrectnessMetric, # Were the right tools called?
208
+ StepEfficiencyMetric, # Were unnecessary steps avoided?
209
+ ArgumentCorrectnessMetric, # Were tool arguments correct?
210
+ )
211
+
212
+ tc = LLMTestCase(
213
+ input="Search for the latest news on AI and summarize it.",
214
+ actual_output="Here is a summary of recent AI news...",
215
+ tools_called=[
216
+ ToolCall(name="web_search", input_parameters={"query": "latest AI news"}),
217
+ ToolCall(name="summarize", input_parameters={"max_length": 200}),
218
+ ],
219
+ expected_tools=["web_search", "summarize"],
220
+ )
221
+
222
+ result = ToolCorrectnessMetric(threshold=0.8).measure(tc)
223
+ ```
224
+
225
+ ### Conversational Metrics
226
+
227
+ ```python
228
+ from llmeval import ConversationalTestCase, Message
229
+ from llmeval.metrics import (
230
+ ConversationalRelevancyMetric,
231
+ ConversationCompletenessMetric,
232
+ RoleAdherenceMetric,
233
+ KnowledgeRetentionMetric,
234
+ )
235
+
236
+ tc = ConversationalTestCase(
237
+ messages=[
238
+ Message(role="user", content="My name is Alice and I like Python."),
239
+ Message(role="assistant", content="Nice to meet you, Alice! Python is great."),
240
+ Message(role="user", content="What's my name again?"),
241
+ Message(role="assistant", content="Your name is Alice."),
242
+ ],
243
+ chatbot_role="A helpful assistant that remembers user preferences.",
244
+ )
245
+
246
+ result = KnowledgeRetentionMetric(threshold=0.7).measure(tc)
247
+ ```
248
+
249
+ ---
250
+
251
+ ## Bulk Evaluation
252
+
253
+ ```python
254
+ from llmeval import evaluate, LLMTestCase
255
+ from llmeval.metrics import AnswerRelevancyMetric, JSONCorrectnessMetric
256
+
257
+ test_cases = [
258
+ LLMTestCase(input="What is 2+2?", actual_output="4"),
259
+ LLMTestCase(input="Capital of Japan?", actual_output="Tokyo"),
260
+ LLMTestCase(input="Return JSON", actual_output='{"status": "ok"}'),
261
+ ]
262
+
263
+ result = evaluate(
264
+ test_cases=test_cases,
265
+ metrics=[AnswerRelevancyMetric(), JSONCorrectnessMetric()],
266
+ max_concurrent=4,
267
+ verbose=True,
268
+ )
269
+
270
+ print(f"Pass rate: {result.pass_rate:.1%}")
271
+ print(f"Overall score: {result.overall_score:.3f}")
272
+ result.print_summary()
273
+ ```
274
+
275
+ ---
276
+
277
+ ## Pytest Integration
278
+
279
+ ```python
280
+ # test_my_llm.py
281
+ import pytest
282
+ from llmeval import LLMTestCase, assert_test
283
+ from llmeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
284
+
285
+ def test_rag_answer():
286
+ tc = LLMTestCase(
287
+ input="What causes lightning?",
288
+ actual_output=my_rag_pipeline("What causes lightning?"),
289
+ retrieval_context=get_context("lightning"),
290
+ )
291
+ assert_test(tc, metrics=[
292
+ AnswerRelevancyMetric(threshold=0.7),
293
+ FaithfulnessMetric(threshold=0.8),
294
+ ])
295
+
296
+
297
+ # Run with: llmeval test test_my_llm.py
298
+ # Or: pytest test_my_llm.py
299
+ ```
300
+
301
+ ---
302
+
303
+ ## Tracing & Component-Level Evaluation
304
+
305
+ ```python
306
+ from llmeval import observe, Tracer, set_tracer, clear_tracer
307
+
308
+ tracer = Tracer()
309
+ set_tracer(tracer)
310
+ trace = tracer.start_trace()
311
+
312
+ @observe(span_type="retriever")
313
+ def retrieve(query: str) -> list:
314
+ return vector_db.search(query)
315
+
316
+ @observe(span_type="llm")
317
+ def generate(context: list, query: str) -> str:
318
+ return llm.generate(f"Context: {context}\nQuestion: {query}")
319
+
320
+ def rag_pipeline(query: str) -> str:
321
+ context = retrieve(query)
322
+ return generate(context, query)
323
+
324
+ answer = rag_pipeline("What is quantum computing?")
325
+ tracer.end_trace()
326
+ clear_tracer()
327
+
328
+ tracer.print_last_trace() # Shows span tree with latencies
329
+ ```
330
+
331
+ ---
332
+
333
+ ## Dataset Management
334
+
335
+ ```python
336
+ from llmeval import EvaluationDataset, Golden
337
+
338
+ # Build a dataset
339
+ ds = EvaluationDataset()
340
+ ds.add_goldens([
341
+ Golden(input="What is AI?", expected_output="Artificial intelligence."),
342
+ Golden(input="Capital of Germany?", expected_output="Berlin"),
343
+ ])
344
+ ds.save("my_dataset.json")
345
+
346
+ # Load and use
347
+ ds = EvaluationDataset.load("my_dataset.json")
348
+ test_cases = ds.to_test_cases(generate_fn=my_llm.generate)
349
+ ```
350
+
351
+ ---
352
+
353
+ ## Synthetic Dataset Generation
354
+
355
+ ```python
356
+ from llmeval import Synthesizer
357
+
358
+ synth = Synthesizer()
359
+
360
+ docs = [
361
+ "The Python programming language was created by Guido van Rossum...",
362
+ "Machine learning is a branch of artificial intelligence...",
363
+ ]
364
+
365
+ goldens = synth.generate_goldens_from_docs(
366
+ documents=docs,
367
+ max_goldens_per_doc=5,
368
+ filter_questions=True,
369
+ evolve_questions=True,
370
+ generate_expected_outputs=True,
371
+ )
372
+
373
+ print(f"Generated {len(goldens)} golden test cases")
374
+ for g in goldens[:2]:
375
+ print(f"Q: {g.input}")
376
+ print(f"A: {g.expected_output}\n")
377
+ ```
378
+
379
+ ---
380
+
381
+ ## LLM Providers
382
+
383
+ ```python
384
+ from llmeval.providers import OpenAIProvider, AnthropicProvider, OllamaProvider
385
+
386
+ # OpenAI
387
+ provider = OpenAIProvider(model="gpt-4o", api_key="sk-...")
388
+
389
+ # Anthropic Claude
390
+ provider = AnthropicProvider(model="claude-sonnet-4-6")
391
+
392
+ # Ollama (local)
393
+ provider = OllamaProvider(model="llama3")
394
+
395
+ # Custom provider
396
+ from llmeval.providers import LLMProvider
397
+
398
+ class MyProvider(LLMProvider):
399
+ def generate(self, prompt: str, **kwargs) -> str:
400
+ return my_llm_api.call(prompt)
401
+
402
+ # Use in any metric
403
+ metric = AnswerRelevancyMetric(model=MyProvider())
404
+ ```
405
+
406
+ ---
407
+
408
+ ## LangChain Integration
409
+
410
+ ```python
411
+ from langchain_openai import ChatOpenAI
412
+ from llmeval.integrations.langchain import LangChainCallbackHandler, evaluate_chain
413
+ from llmeval.metrics import AnswerRelevancyMetric
414
+
415
+ llm = ChatOpenAI(model="gpt-4o")
416
+ chain = llm # or any LangChain runnable
417
+
418
+ result = evaluate_chain(
419
+ chain=chain,
420
+ inputs=["What is the capital of France?", "Who invented Python?"],
421
+ metrics=[AnswerRelevancyMetric(threshold=0.7)],
422
+ )
423
+ ```
424
+
425
+ ---
426
+
427
+ ## CLI
428
+
429
+ ```bash
430
+ # Run evaluation tests
431
+ llmeval test tests/test_llm.py
432
+ llmeval test tests/ -n 4 # 4 parallel workers
433
+
434
+ # Configure providers
435
+ llmeval set-openai --key sk-... --model gpt-4o
436
+ llmeval set-anthropic --key sk-... --model claude-sonnet-4-6
437
+ llmeval set-ollama --model llama3
438
+
439
+ # List all metrics
440
+ llmeval list-metrics
441
+
442
+ # Version
443
+ llmeval version
444
+ ```
445
+
446
+ ---
447
+
448
+ ## License
449
+
450
+ Apache 2.0 — see LICENSE.
451
+
452
+ **Author:** Mahesh Makvana