ragpill 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragpill-0.1.0/PKG-INFO +191 -0
- ragpill-0.1.0/README.md +179 -0
- ragpill-0.1.0/pyproject.toml +133 -0
- ragpill-0.1.0/src/ragpill/__init__.py +26 -0
- ragpill-0.1.0/src/ragpill/base.py +259 -0
- ragpill-0.1.0/src/ragpill/csv/__init__.py +7 -0
- ragpill-0.1.0/src/ragpill/csv/questions_answers.py +2 -0
- ragpill-0.1.0/src/ragpill/csv/testset.py +403 -0
- ragpill-0.1.0/src/ragpill/evaluators.py +873 -0
- ragpill-0.1.0/src/ragpill/mlflow_helper.py +498 -0
- ragpill-0.1.0/src/ragpill/py.typed +0 -0
- ragpill-0.1.0/src/ragpill/settings.py +35 -0
- ragpill-0.1.0/src/ragpill/utils.py +315 -0
ragpill-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: ragpill
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: This library allows for granular testing of llm-applications based on expert input.
|
|
5
|
+
Author: Joel Gotsch
|
|
6
|
+
Author-email: Joel Gotsch <contact@joelgotsch.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Dist: pydantic-ai>=1.39.1
|
|
9
|
+
Requires-Dist: mlflow>=3.8.1
|
|
10
|
+
Requires-Python: >=3.11
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
<p align="center">
|
|
14
|
+
<img src="docs/assets/ragpill.svg" alt="ragpill logo" width="340">
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<em>Stop believing your chatbot. Take the ragpill.</em>
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<a href="https://github.com/JoelGotsch/ragpill/actions/workflows/ci.yml"><img src="https://img.shields.io/github/actions/workflow/status/JoelGotsch/ragpill/ci.yml?branch=main&label=CI&style=flat-square" alt="CI"></a>
|
|
23
|
+
<a href="https://pypi.org/project/ragpill/"><img src="https://img.shields.io/pypi/v/ragpill?style=flat-square" alt="PyPI version"></a>
|
|
24
|
+
<a href="https://pypi.org/project/ragpill/"><img src="https://img.shields.io/pypi/pyversions/ragpill?style=flat-square" alt="Python versions"></a>
|
|
25
|
+
<a href="https://codecov.io/gh/JoelGotsch/ragpill"><img src="https://img.shields.io/codecov/c/github/JoelGotsch/ragpill?style=flat-square&label=coverage" alt="Codecov"></a>
|
|
26
|
+
<a href="https://github.com/JoelGotsch/ragpill/blob/main/LICENSE"><img src="https://img.shields.io/github/license/JoelGotsch/ragpill?style=flat-square" alt="License: MIT"></a>
|
|
27
|
+
<a href="https://joelgotsch.github.io/ragpill/latest/"><img src="https://img.shields.io/badge/docs-latest-blue?style=flat-square" alt="Documentation"></a>
|
|
28
|
+
<a href="https://github.com/astral-sh/ruff"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json&style=flat-square" alt="Ruff"></a>
|
|
29
|
+
<img src="https://img.shields.io/badge/types-basedpyright-blue?style=flat-square" alt="basedpyright">
|
|
30
|
+
</p>
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
**ragpill** is an evaluation framework for LLM agents and RAG pipelines. Define facts, sources, and tool call expectations — and find out what your AI actually does.
|
|
35
|
+
|
|
36
|
+
## What is RAGPill?
|
|
37
|
+
|
|
38
|
+
RAGPill helps you:
|
|
39
|
+
|
|
40
|
+
- **Create test datasets from CSV files** - Easy collaboration with domain experts
|
|
41
|
+
- **Define custom evaluators** - Add domain-specific knowledge to evaluations
|
|
42
|
+
- **Track results in MLflow** - Full experiment tracking and tracing
|
|
43
|
+
- **Follow best practices** - Opinionated design guides you to robust testing
|
|
44
|
+
|
|
45
|
+
It specializes in "offline" evaluation of LLM-based systems, meaning it's supposed to be part of your CI/CD pipeline or run as scheduled tests, not real-time monitoring.
|
|
46
|
+
|
|
47
|
+
<!--
|
|
48
|
+
## Demo!
|
|
49
|
+
|
|
50
|
+
TODO: this should be a video demo
|
|
51
|
+
Prerequisites:
|
|
52
|
+
- [ragpill installed](docs/getting-started/installation.md)
|
|
53
|
+
- MLflow tracking server running (local or remote) with tracing enabled.
|
|
54
|
+
Locally: `mlflow server --backend-store-uri sqlite:///mlflow.db` or if remote, then configure the env vars properly.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
Produces the following mlflow views:
|
|
61
|
+
|
|
62
|
+
### Metrics View
|
|
63
|
+
[](https://www.youtube.com/watch?v=your-video-id)
|
|
64
|
+
|
|
65
|
+
### Traces Views
|
|
66
|
+
|
|
67
|
+
### Artifacts View
|
|
68
|
+
[](https://www.youtube.com/watch?v=your-video-id)
|
|
69
|
+
|
|
70
|
+
### Comparing runs
|
|
71
|
+
|
|
72
|
+
### What's wrong with other frameworks?
|
|
73
|
+
|
|
74
|
+
- The [pydantic-ai evaluation framework](https://ai.pydantic.dev/evals/) only integrates with the cloud-based Logfire and works poorly with MLflow tracing (even if you go through the hassle of using the MLflow OpenTelemetry endpoint as the Logfire backend, a lot of MLflow features are lost in translation). However, we like the core concepts and type safety of pydantic-ai evals a lot, so we build on top of them.
|
|
75
|
+
- Additionally, it's not straightforward to test, for example, whether a regex pattern is found in retrieved sources or document metadata, which is a common need in retrieval-augmented generation (RAG) systems.
|
|
76
|
+
- [LangSmith evaluation](https://docs.langchain.com/langsmith/evaluation) doesn't support multiple tests per dataset item, nor does it make custom evaluators easy. It also has no MLflow integration.
|
|
77
|
+
-->
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
## Core Philosophy
|
|
82
|
+
|
|
83
|
+
Here we focus a lot on the [LLM Judge evaluator](docs/api/evaluators.md#llmjudge), even though it should be the evaluator of last resort - prefer deterministic evaluators (regex, exact match) whenever possible.
|
|
84
|
+
However, for deterministic tests, there's already a lot of tooling available, like pytest for example (yes, we like the 'code-first' approach).
|
|
85
|
+
|
|
86
|
+
### Expert-Defined Attributes
|
|
87
|
+
|
|
88
|
+
LLM judges usually lack context awareness to judge which discrepancies between chatbot answers and expected answers are relevant - especially in specialized fields like law, engineering, and science where words have precise definitions.
|
|
89
|
+
|
|
90
|
+
**Domain experts should define specific attributes and criteria for evaluation.**
|
|
91
|
+
|
|
92
|
+
### Binary Evaluations
|
|
93
|
+
|
|
94
|
+
We use **boolean pass/fail values only**, not scoring scales (1-10), because:
|
|
95
|
+
|
|
96
|
+
- Scales are arbitrary and often decided by LLMs
|
|
97
|
+
- Binary decisions are more stable and reproducible (although LLMs of course remain probabilistic)
|
|
98
|
+
- Easier to track and reason about over time
|
|
99
|
+
|
|
100
|
+
### Tags and Attributes for Organization
|
|
101
|
+
|
|
102
|
+
Evaluators can have:
|
|
103
|
+
|
|
104
|
+
- **Tags**: Categorical labels for filtering (e.g., `retrieval`, `time-aware-rag`, `basic_logic`)
|
|
105
|
+
- **Attributes**: Key-value metadata for categorization (e.g., `importance: high`, `scope: Phase1`)
|
|
106
|
+
|
|
107
|
+
Metrics are automatically calculated per tag and attribute.
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
## Quick Navigation
|
|
111
|
+
|
|
112
|
+
### Getting Started:
|
|
113
|
+
|
|
114
|
+
- [Installation](docs/getting-started/installation.md)
|
|
115
|
+
- [Quick Start](docs/getting-started/quickstart.md)
|
|
116
|
+
|
|
117
|
+
### Evaluators:
|
|
118
|
+
|
|
119
|
+
## Key Concepts
|
|
120
|
+
|
|
121
|
+
As this library is built on pydantic-ai evals, please have a look at its [core concepts](https://ai.pydantic.dev/evals/core-concepts/) first.
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
### Key Components
|
|
125
|
+
|
|
126
|
+
- **Dataset**: From pydantic-ai, contains test cases with inputs, evaluators, and metadata
|
|
127
|
+
- **Evaluators**: Check if outputs meet criteria (LLMJudge, regex matchers, custom evaluators)
|
|
128
|
+
- **MLflow Integration**: Wraps execution, traces runs, evaluates outputs, uploads results
|
|
129
|
+
|
|
130
|
+
## Features
|
|
131
|
+
|
|
132
|
+
- **Great MLflow Integration**: Traces your agent/function execution to MLflow with evaluations in the native format
|
|
133
|
+
- **CSV/Excel Adapter**: Load test cases from CSV files with evaluator configurations
|
|
134
|
+
- **Flexible Evaluators**: Built-in LLM judges, regex matchers, and easy custom evaluator creation
|
|
135
|
+
- **Metrics per Tags/Attributes**: Automatic metric calculation for each tag and attribute combination
|
|
136
|
+
- **Type Safety**: Built on pydantic-ai with full type safety throughout
|
|
137
|
+
|
|
138
|
+
## [Built-in Evaluators](docs/api/evaluators.md)
|
|
139
|
+
|
|
140
|
+
- [**LLMJudge**](docs/api/evaluators.md#llmjudge): Uses an LLM to judge correctness based on a rubric
|
|
141
|
+
- [**RegexInSourcesEvaluator**](docs/api/evaluators.md#regexinsourcesevaluator): Checks if regex patterns appear in retrieved sources
|
|
142
|
+
- [**RegexInDocumentMetadataEvaluator**](docs/api/evaluators.md#regexindocumentmetadataevaluator): Checks regex in document metadata
|
|
143
|
+
- [**Custom Evaluators**](docs/guide/evaluators.md#creating-custom-evaluators): Inherit from `BaseEvaluator` and implement your logic
|
|
144
|
+
|
|
145
|
+
## Best Practices
|
|
146
|
+
|
|
147
|
+
> [!TIP]
|
|
148
|
+
> **TDD Mindset** — Begin with defining a Test-Set with potential users before even starting to develop the solution. This enables clear expectation management and progress tracking.
|
|
149
|
+
|
|
150
|
+
> [!TIP]
|
|
151
|
+
> **Create Multiple Testsets** — It might make sense for you to have some core tests that run relatively quickly and inexpensively - use these for development. Before deploying to prod, you can run an exhaustive dataset that is integrated into your CI/CD.
|
|
152
|
+
|
|
153
|
+
> [!TIP]
|
|
154
|
+
> **Separate Evaluation Experiments** — Create dedicated MLflow experiments for evaluations. Don't mix evaluation traces with production traces.
|
|
155
|
+
|
|
156
|
+
> [!TIP]
|
|
157
|
+
> **Use Domain Experts** — Have domain experts define evaluation criteria rather than relying solely on generic LLM judges.
|
|
158
|
+
|
|
159
|
+
> [!TIP]
|
|
160
|
+
> **Version Your Tests** — Keep test datasets in version control alongside your code.
|
|
161
|
+
|
|
162
|
+
## Documentation
|
|
163
|
+
|
|
164
|
+
Full documentation is available at [joelgotsch.github.io/ragpill/latest](https://joelgotsch.github.io/ragpill/latest/) including:
|
|
165
|
+
|
|
166
|
+
- **Installation Guide**: Setup instructions
|
|
167
|
+
- **Quickstart Tutorial**: Run your first evaluation
|
|
168
|
+
- **CSV Adapter Guide**: Learn the CSV format and column meanings
|
|
169
|
+
- **Evaluators Guide**: Create custom evaluators
|
|
170
|
+
- **MLflow Integration**: Advanced MLflow usage
|
|
171
|
+
- **API Reference**: Complete API documentation
|
|
172
|
+
|
|
173
|
+
## Roadmap
|
|
174
|
+
|
|
175
|
+
- [x] Adapter for testset from CSV
|
|
176
|
+
- [x] Documentation via mkdocs
|
|
177
|
+
- [x] Evaluators for sources and regex
|
|
178
|
+
- [ ] Repeat Task Evaluations (run task multiple times and evaluate with threshold)
|
|
179
|
+
- [ ] Adapter for task from CSV (upload to mlflow)
|
|
180
|
+
- [ ] Create demo video
|
|
181
|
+
- [ ] CI/CD (tests, build package, publish docs)
|
|
182
|
+
- [x] Global evaluators from CSV (empty input)
|
|
183
|
+
- [ ] Track git-commit hash in experiment
|
|
184
|
+
- [x] Tests with mlflow server
|
|
185
|
+
- [ ] Dependency injection for llm, input_to_key functions
|
|
186
|
+
- [ ] pytest integration
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
## Contributing
|
|
190
|
+
|
|
191
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](docs/development/contributing.md) for guidelines.
|
ragpill-0.1.0/README.md
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="docs/assets/ragpill.svg" alt="ragpill logo" width="340">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<em>Stop believing your chatbot. Take the ragpill.</em>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://github.com/JoelGotsch/ragpill/actions/workflows/ci.yml"><img src="https://img.shields.io/github/actions/workflow/status/JoelGotsch/ragpill/ci.yml?branch=main&label=CI&style=flat-square" alt="CI"></a>
|
|
11
|
+
<a href="https://pypi.org/project/ragpill/"><img src="https://img.shields.io/pypi/v/ragpill?style=flat-square" alt="PyPI version"></a>
|
|
12
|
+
<a href="https://pypi.org/project/ragpill/"><img src="https://img.shields.io/pypi/pyversions/ragpill?style=flat-square" alt="Python versions"></a>
|
|
13
|
+
<a href="https://codecov.io/gh/JoelGotsch/ragpill"><img src="https://img.shields.io/codecov/c/github/JoelGotsch/ragpill?style=flat-square&label=coverage" alt="Codecov"></a>
|
|
14
|
+
<a href="https://github.com/JoelGotsch/ragpill/blob/main/LICENSE"><img src="https://img.shields.io/github/license/JoelGotsch/ragpill?style=flat-square" alt="License: MIT"></a>
|
|
15
|
+
<a href="https://joelgotsch.github.io/ragpill/latest/"><img src="https://img.shields.io/badge/docs-latest-blue?style=flat-square" alt="Documentation"></a>
|
|
16
|
+
<a href="https://github.com/astral-sh/ruff"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json&style=flat-square" alt="Ruff"></a>
|
|
17
|
+
<img src="https://img.shields.io/badge/types-basedpyright-blue?style=flat-square" alt="basedpyright">
|
|
18
|
+
</p>
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
**ragpill** is an evaluation framework for LLM agents and RAG pipelines. Define facts, sources, and tool call expectations — and find out what your AI actually does.
|
|
23
|
+
|
|
24
|
+
## What is RAGPill?
|
|
25
|
+
|
|
26
|
+
RAGPill helps you:
|
|
27
|
+
|
|
28
|
+
- **Create test datasets from CSV files** - Easy collaboration with domain experts
|
|
29
|
+
- **Define custom evaluators** - Add domain-specific knowledge to evaluations
|
|
30
|
+
- **Track results in MLflow** - Full experiment tracking and tracing
|
|
31
|
+
- **Follow best practices** - Opinionated design guides you to robust testing
|
|
32
|
+
|
|
33
|
+
It specializes in "offline" evaluation of LLM-based systems, meaning it's supposed to be part of your CI/CD pipeline or run as scheduled tests, not real-time monitoring.
|
|
34
|
+
|
|
35
|
+
<!--
|
|
36
|
+
## Demo!
|
|
37
|
+
|
|
38
|
+
TODO: this should be a video demo
|
|
39
|
+
Prerequisites:
|
|
40
|
+
- [ragpill installed](docs/getting-started/installation.md)
|
|
41
|
+
- MLflow tracking server running (local or remote) with tracing enabled.
|
|
42
|
+
Locally: `mlflow server --backend-store-uri sqlite:///mlflow.db` or if remote, then configure the env vars properly.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
Produces the following mlflow views:
|
|
49
|
+
|
|
50
|
+
### Metrics View
|
|
51
|
+
[](https://www.youtube.com/watch?v=your-video-id)
|
|
52
|
+
|
|
53
|
+
### Traces Views
|
|
54
|
+
|
|
55
|
+
### Artifacts View
|
|
56
|
+
[](https://www.youtube.com/watch?v=your-video-id)
|
|
57
|
+
|
|
58
|
+
### Comparing runs
|
|
59
|
+
|
|
60
|
+
### What's wrong with other frameworks?
|
|
61
|
+
|
|
62
|
+
- The [pydantic-ai evaluation framework](https://ai.pydantic.dev/evals/) only integrates with the cloud-based Logfire and works poorly with MLflow tracing (even if you go through the hassle of using the MLflow OpenTelemetry endpoint as the Logfire backend, a lot of MLflow features are lost in translation). However, we like the core concepts and type safety of pydantic-ai evals a lot, so we build on top of them.
|
|
63
|
+
- Additionally, it's not straightforward to test, for example, whether a regex pattern is found in retrieved sources or document metadata, which is a common need in retrieval-augmented generation (RAG) systems.
|
|
64
|
+
- [LangSmith evaluation](https://docs.langchain.com/langsmith/evaluation) doesn't support multiple tests per dataset item, nor does it make custom evaluators easy. It also has no MLflow integration.
|
|
65
|
+
-->
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
## Core Philosophy
|
|
70
|
+
|
|
71
|
+
Here we focus a lot on the [LLM Judge evaluator](docs/api/evaluators.md#llmjudge), even though it should be the evaluator of last resort - prefer deterministic evaluators (regex, exact match) whenever possible.
|
|
72
|
+
However, for deterministic tests, there's already a lot of tooling available, like pytest for example (yes, we like the 'code-first' approach).
|
|
73
|
+
|
|
74
|
+
### Expert-Defined Attributes
|
|
75
|
+
|
|
76
|
+
LLM judges usually lack context awareness to judge which discrepancies between chatbot answers and expected answers are relevant - especially in specialized fields like law, engineering, and science where words have precise definitions.
|
|
77
|
+
|
|
78
|
+
**Domain experts should define specific attributes and criteria for evaluation.**
|
|
79
|
+
|
|
80
|
+
### Binary Evaluations
|
|
81
|
+
|
|
82
|
+
We use **boolean pass/fail values only**, not scoring scales (1-10), because:
|
|
83
|
+
|
|
84
|
+
- Scales are arbitrary and often decided by LLMs
|
|
85
|
+
- Binary decisions are more stable and reproducible (although LLMs of course remain probabilistic)
|
|
86
|
+
- Easier to track and reason about over time
|
|
87
|
+
|
|
88
|
+
### Tags and Attributes for Organization
|
|
89
|
+
|
|
90
|
+
Evaluators can have:
|
|
91
|
+
|
|
92
|
+
- **Tags**: Categorical labels for filtering (e.g., `retrieval`, `time-aware-rag`, `basic_logic`)
|
|
93
|
+
- **Attributes**: Key-value metadata for categorization (e.g., `importance: high`, `scope: Phase1`)
|
|
94
|
+
|
|
95
|
+
Metrics are automatically calculated per tag and attribute.
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
## Quick Navigation
|
|
99
|
+
|
|
100
|
+
### Getting Started:
|
|
101
|
+
|
|
102
|
+
- [Installation](docs/getting-started/installation.md)
|
|
103
|
+
- [Quick Start](docs/getting-started/quickstart.md)
|
|
104
|
+
|
|
105
|
+
### Evaluators:
|
|
106
|
+
|
|
107
|
+
## Key Concepts
|
|
108
|
+
|
|
109
|
+
As this library is built on pydantic-ai evals, please have a look at its [core concepts](https://ai.pydantic.dev/evals/core-concepts/) first.
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
### Key Components
|
|
113
|
+
|
|
114
|
+
- **Dataset**: From pydantic-ai, contains test cases with inputs, evaluators, and metadata
|
|
115
|
+
- **Evaluators**: Check if outputs meet criteria (LLMJudge, regex matchers, custom evaluators)
|
|
116
|
+
- **MLflow Integration**: Wraps execution, traces runs, evaluates outputs, uploads results
|
|
117
|
+
|
|
118
|
+
## Features
|
|
119
|
+
|
|
120
|
+
- **Great MLflow Integration**: Traces your agent/function execution to MLflow with evaluations in the native format
|
|
121
|
+
- **CSV/Excel Adapter**: Load test cases from CSV files with evaluator configurations
|
|
122
|
+
- **Flexible Evaluators**: Built-in LLM judges, regex matchers, and easy custom evaluator creation
|
|
123
|
+
- **Metrics per Tags/Attributes**: Automatic metric calculation for each tag and attribute combination
|
|
124
|
+
- **Type Safety**: Built on pydantic-ai with full type safety throughout
|
|
125
|
+
|
|
126
|
+
## [Built-in Evaluators](docs/api/evaluators.md)
|
|
127
|
+
|
|
128
|
+
- [**LLMJudge**](docs/api/evaluators.md#llmjudge): Uses an LLM to judge correctness based on a rubric
|
|
129
|
+
- [**RegexInSourcesEvaluator**](docs/api/evaluators.md#regexinsourcesevaluator): Checks if regex patterns appear in retrieved sources
|
|
130
|
+
- [**RegexInDocumentMetadataEvaluator**](docs/api/evaluators.md#regexindocumentmetadataevaluator): Checks regex in document metadata
|
|
131
|
+
- [**Custom Evaluators**](docs/guide/evaluators.md#creating-custom-evaluators): Inherit from `BaseEvaluator` and implement your logic
|
|
132
|
+
|
|
133
|
+
## Best Practices
|
|
134
|
+
|
|
135
|
+
> [!TIP]
|
|
136
|
+
> **TDD Mindset** — Begin with defining a Test-Set with potential users before even starting to develop the solution. This enables clear expectation management and progress tracking.
|
|
137
|
+
|
|
138
|
+
> [!TIP]
|
|
139
|
+
> **Create Multiple Testsets** — It might make sense for you to have some core tests that run relatively quickly and inexpensively - use these for development. Before deploying to prod, you can run an exhaustive dataset that is integrated into your CI/CD.
|
|
140
|
+
|
|
141
|
+
> [!TIP]
|
|
142
|
+
> **Separate Evaluation Experiments** — Create dedicated MLflow experiments for evaluations. Don't mix evaluation traces with production traces.
|
|
143
|
+
|
|
144
|
+
> [!TIP]
|
|
145
|
+
> **Use Domain Experts** — Have domain experts define evaluation criteria rather than relying solely on generic LLM judges.
|
|
146
|
+
|
|
147
|
+
> [!TIP]
|
|
148
|
+
> **Version Your Tests** — Keep test datasets in version control alongside your code.
|
|
149
|
+
|
|
150
|
+
## Documentation
|
|
151
|
+
|
|
152
|
+
Full documentation is available at [joelgotsch.github.io/ragpill/latest](https://joelgotsch.github.io/ragpill/latest/) including:
|
|
153
|
+
|
|
154
|
+
- **Installation Guide**: Setup instructions
|
|
155
|
+
- **Quickstart Tutorial**: Run your first evaluation
|
|
156
|
+
- **CSV Adapter Guide**: Learn the CSV format and column meanings
|
|
157
|
+
- **Evaluators Guide**: Create custom evaluators
|
|
158
|
+
- **MLflow Integration**: Advanced MLflow usage
|
|
159
|
+
- **API Reference**: Complete API documentation
|
|
160
|
+
|
|
161
|
+
## Roadmap
|
|
162
|
+
|
|
163
|
+
- [x] Adapter for testset from CSV
|
|
164
|
+
- [x] Documentation via mkdocs
|
|
165
|
+
- [x] Evaluators for sources and regex
|
|
166
|
+
- [ ] Repeat Task Evaluations (run task multiple times and evaluate with threshold)
|
|
167
|
+
- [ ] Adapter for task from CSV (upload to mlflow)
|
|
168
|
+
- [ ] Create demo video
|
|
169
|
+
- [ ] CI/CD (tests, build package, publish docs)
|
|
170
|
+
- [x] Global evaluators from CSV (empty input)
|
|
171
|
+
- [ ] Track git-commit hash in experiment
|
|
172
|
+
- [x] Tests with mlflow server
|
|
173
|
+
- [ ] Dependency injection for llm, input_to_key functions
|
|
174
|
+
- [ ] pytest integration
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
## Contributing
|
|
178
|
+
|
|
179
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](docs/development/contributing.md) for guidelines.
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["uv_build>=0.9.22,<0.10.0"]
|
|
3
|
+
build-backend = "uv_build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ragpill"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "This library allows for granular testing of llm-applications based on expert input."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "Joel Gotsch", email = "contact@joelgotsch.com" }]
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
requires-python = ">=3.11"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"pydantic-ai>=1.39.1",
|
|
15
|
+
"mlflow>=3.8.1",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[dependency-groups]
|
|
19
|
+
dev = [
|
|
20
|
+
"basedpyright>=1.31.3",
|
|
21
|
+
"pytest>=8.3.3",
|
|
22
|
+
"pytest-pretty>=1.2.0",
|
|
23
|
+
"isort>=5.13.2",
|
|
24
|
+
"ruff>=0.8.0",
|
|
25
|
+
"anyio[trio]>=4.9.0",
|
|
26
|
+
"pytest-cov>=6.0.0",
|
|
27
|
+
"coverage[toml]>=7.6.0",
|
|
28
|
+
"genbadge[coverage]>=1.1.3",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
docs = [
|
|
32
|
+
"zensical>=0.0.32", # replaces mkdocs + mkdocs-material
|
|
33
|
+
"mkdocstrings[python]>=0.26.0",
|
|
34
|
+
"mkdocs-autorefs>=1.2.0",
|
|
35
|
+
"mike>=2.1.3", # versioning documentation
|
|
36
|
+
"nbconvert>=7.0.0", # pre-convert notebooks to markdown
|
|
37
|
+
"ipywidgets>=8.1.8",
|
|
38
|
+
]
|
|
39
|
+
build_docs = [
|
|
40
|
+
"qdrant-client>=1.12.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
[tool.ruff]
|
|
45
|
+
line-length = 120
|
|
46
|
+
target-version = "py311"
|
|
47
|
+
|
|
48
|
+
[tool.ruff.lint]
|
|
49
|
+
extend-select = ["RUF", "C90", "UP", "I"]
|
|
50
|
+
mccabe = { max-complexity = 19 }
|
|
51
|
+
|
|
52
|
+
[tool.ruff.lint.pydocstyle]
|
|
53
|
+
convention = "google"
|
|
54
|
+
|
|
55
|
+
[tool.ruff.lint.isort]
|
|
56
|
+
combine-as-imports = true
|
|
57
|
+
|
|
58
|
+
[tool.ruff.format]
|
|
59
|
+
quote-style = "double"
|
|
60
|
+
|
|
61
|
+
[tool.basedpyright]
|
|
62
|
+
pythonVersion = "3.11"
|
|
63
|
+
typeCheckingMode = "strict"
|
|
64
|
+
reportUnnecessaryTypeIgnoreComment = true
|
|
65
|
+
reportIgnoreCommentWithoutRule = true
|
|
66
|
+
reportUnusedParameter = true
|
|
67
|
+
reportIncompatibleUnannotatedOverride = true
|
|
68
|
+
reportImplicitAbstractClass = true
|
|
69
|
+
reportMissingTypeStubs = false
|
|
70
|
+
reportMissingModuleSource = false
|
|
71
|
+
include = ["src"]
|
|
72
|
+
|
|
73
|
+
[tool.pytest.ini_options]
|
|
74
|
+
testpaths = ["tests"]
|
|
75
|
+
addopts = [
|
|
76
|
+
"--tb=short",
|
|
77
|
+
"--cov=ragpill",
|
|
78
|
+
"--cov-report=term-missing",
|
|
79
|
+
"--cov-report=xml",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[tool.coverage.run]
|
|
83
|
+
source = ["ragpill"]
|
|
84
|
+
branch = true
|
|
85
|
+
omit = ["tests/*"]
|
|
86
|
+
|
|
87
|
+
[tool.coverage.report]
|
|
88
|
+
show_missing = true
|
|
89
|
+
exclude_lines = [
|
|
90
|
+
"pragma: no cover",
|
|
91
|
+
"if TYPE_CHECKING:",
|
|
92
|
+
"@overload",
|
|
93
|
+
"raise NotImplementedError",
|
|
94
|
+
"\\.\\.\\.",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
[tool.coverage.paths]
|
|
98
|
+
source = ["src/ragpill", "*/site-packages/ragpill"]
|
|
99
|
+
|
|
100
|
+
[tool.tox]
|
|
101
|
+
legacy_tox_ini = """
|
|
102
|
+
[tox]
|
|
103
|
+
requires =
|
|
104
|
+
tox>=4.23
|
|
105
|
+
tox-uv>=1.17
|
|
106
|
+
min_version = 4.0
|
|
107
|
+
env_list =
|
|
108
|
+
lint
|
|
109
|
+
type
|
|
110
|
+
py{311,312,313}-highest
|
|
111
|
+
py{311,312,313}-lowest
|
|
112
|
+
|
|
113
|
+
[testenv]
|
|
114
|
+
runner = uv-venv-runner
|
|
115
|
+
dependency_groups = dev
|
|
116
|
+
commands =
|
|
117
|
+
pytest {posargs}
|
|
118
|
+
|
|
119
|
+
[testenv:py{311,312,313}-lowest]
|
|
120
|
+
uv_resolution = lowest-direct
|
|
121
|
+
|
|
122
|
+
[testenv:lint]
|
|
123
|
+
skip_install = true
|
|
124
|
+
dependency_groups = dev
|
|
125
|
+
commands =
|
|
126
|
+
ruff check src tests
|
|
127
|
+
ruff format --check src tests
|
|
128
|
+
|
|
129
|
+
[testenv:type]
|
|
130
|
+
dependency_groups = dev
|
|
131
|
+
commands =
|
|
132
|
+
basedpyright
|
|
133
|
+
"""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata
|
|
2
|
+
from ragpill.evaluators import (
|
|
3
|
+
HasQuotesEvaluator,
|
|
4
|
+
LiteralQuoteEvaluator,
|
|
5
|
+
LLMJudge,
|
|
6
|
+
RegexInDocumentMetadataEvaluator,
|
|
7
|
+
RegexInOutputEvaluator,
|
|
8
|
+
RegexInSourcesEvaluator,
|
|
9
|
+
)
|
|
10
|
+
from ragpill.mlflow_helper import evaluate_testset_with_mlflow, evaluate_testset_with_mlflow_sync
|
|
11
|
+
from ragpill.utils import merge_settings
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BaseEvaluator",
|
|
15
|
+
"EvaluatorMetadata",
|
|
16
|
+
"HasQuotesEvaluator",
|
|
17
|
+
"LLMJudge",
|
|
18
|
+
"LiteralQuoteEvaluator",
|
|
19
|
+
"RegexInDocumentMetadataEvaluator",
|
|
20
|
+
"RegexInOutputEvaluator",
|
|
21
|
+
"RegexInSourcesEvaluator",
|
|
22
|
+
"TestCaseMetadata",
|
|
23
|
+
"evaluate_testset_with_mlflow",
|
|
24
|
+
"evaluate_testset_with_mlflow_sync",
|
|
25
|
+
"merge_settings",
|
|
26
|
+
]
|