ragbits-evaluate 0.15.0__tar.gz → 1.4.0.dev202601130240__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/.gitignore +13 -0
- ragbits_evaluate-1.4.0.dev202601130240/CHANGELOG.md +240 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/PKG-INFO +8 -6
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/pyproject.toml +2 -2
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/__init__.py +87 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/context.py +118 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/conversation.py +333 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/logger.py +165 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/models.py +37 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/results.py +200 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/scenarios.py +129 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/simulation.py +243 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/cli.py +40 -39
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/config.py +11 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/__init__.py +3 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/base.py +95 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/document_search.py +61 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/exceptions.py +25 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/gaia.py +78 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/human_eval.py +70 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/question_answer.py +56 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +1 -1
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +1 -1
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/evaluator.py +244 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/factories/__init__.py +42 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/metrics/base.py +21 -13
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/metrics/document_search.py +14 -3
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/gaia.py +84 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/hotpot_qa.py +51 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/human_eval.py +105 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/question_answer.py +205 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/optimizer.py +20 -20
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/pipelines/__init__.py +15 -4
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/base.py +64 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/pipelines/document_search.py +39 -19
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/gaia.py +249 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/human_eval.py +323 -0
- ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/question_answer.py +96 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/utils.py +48 -14
- ragbits_evaluate-1.4.0.dev202601130240/tests/cli/test_run_evaluation.py +50 -0
- ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_agent_simulation_context.py +264 -0
- ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_agent_simulation_metrics.py +360 -0
- ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_agent_simulation_results.py +406 -0
- ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_evaluator.py +198 -0
- ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_metrics.py +217 -0
- ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_optimizer.py +124 -0
- ragbits_evaluate-0.15.0/CHANGELOG.md +0 -126
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/config.py +0 -13
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/__init__.py +0 -21
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/base.py +0 -21
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/hf.py +0 -29
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/local.py +0 -45
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/evaluator.py +0 -161
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/factories/__init__.py +0 -44
- ragbits_evaluate-0.15.0/src/ragbits/evaluate/pipelines/base.py +0 -41
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/README.md +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/py.typed +0 -0
|
@@ -8,6 +8,10 @@ venv/
|
|
|
8
8
|
.venv/
|
|
9
9
|
__pycache__/
|
|
10
10
|
**.egg-info/
|
|
11
|
+
.deepeval/
|
|
12
|
+
|
|
13
|
+
# Local cursor rules
|
|
14
|
+
.cursor/rules/local/
|
|
11
15
|
|
|
12
16
|
# Byte-compiled / optimized / DLL files
|
|
13
17
|
__pycache__/
|
|
@@ -101,3 +105,12 @@ qdrant/
|
|
|
101
105
|
.aider*
|
|
102
106
|
|
|
103
107
|
.DS_Store
|
|
108
|
+
node_modules/
|
|
109
|
+
|
|
110
|
+
lazygit
|
|
111
|
+
|
|
112
|
+
lazygit.tar.gz
|
|
113
|
+
|
|
114
|
+
# chat conversation logs
|
|
115
|
+
duet_conversation.log
|
|
116
|
+
worktrees/
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# CHANGELOG
|
|
2
|
+
|
|
3
|
+
## Unreleased
|
|
4
|
+
|
|
5
|
+
- Feat: introduce agent evaluation pipelines and metrics (HotpotQA, HumanEval, GAIA) (#829)
|
|
6
|
+
|
|
7
|
+
- Feat: introduce agent simulation module with utilities for agent-to-agent conversation and evaluation scenarios (#857)
|
|
8
|
+
|
|
9
|
+
- Feat: add structured results to agent simulation with `SimulationResult`, `TurnResult`, `TaskResult`, and `ConversationMetrics` models (#885)
|
|
10
|
+
|
|
11
|
+
- Feat: add generic `DomainContext` and `DataSnapshot` for flexible agent simulation context (#884)
|
|
12
|
+
|
|
13
|
+
- Feat: add metrics collection system for agent simulation (`MetricCollector` protocol, `LatencyMetricCollector`, `TokenUsageMetricCollector`, `ToolUsageMetricCollector`) (#882)
|
|
14
|
+
|
|
15
|
+
## 1.3.0 (2025-09-11)
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- ragbits-core updated to version v1.3.0
|
|
20
|
+
|
|
21
|
+
- Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
|
|
22
|
+
|
|
23
|
+
## 1.2.2 (2025-08-08)
|
|
24
|
+
|
|
25
|
+
### Changed
|
|
26
|
+
|
|
27
|
+
- ragbits-core updated to version v1.2.2
|
|
28
|
+
|
|
29
|
+
## 1.2.1 (2025-08-04)
|
|
30
|
+
|
|
31
|
+
### Changed
|
|
32
|
+
|
|
33
|
+
- ragbits-core updated to version v1.2.1
|
|
34
|
+
|
|
35
|
+
## 1.2.0 (2025-08-01)
|
|
36
|
+
|
|
37
|
+
### Changed
|
|
38
|
+
|
|
39
|
+
- ragbits-core updated to version v1.2.0
|
|
40
|
+
|
|
41
|
+
## 1.1.0 (2025-07-09)
|
|
42
|
+
|
|
43
|
+
### Changed
|
|
44
|
+
|
|
45
|
+
- ragbits-core updated to version v1.1.0
|
|
46
|
+
|
|
47
|
+
- Update qa data loader docstring (#565)
|
|
48
|
+
- Fix deadlock on qa metrics compute (#609)
|
|
49
|
+
- Upgrade distilabel version to 1.5.0 (#682)
|
|
50
|
+
|
|
51
|
+
## 1.0.0 (2025-06-04)
|
|
52
|
+
|
|
53
|
+
### Changed
|
|
54
|
+
|
|
55
|
+
- ragbits-core updated to version v1.0.0
|
|
56
|
+
|
|
57
|
+
## 0.20.1 (2025-06-04)
|
|
58
|
+
|
|
59
|
+
### Changed
|
|
60
|
+
|
|
61
|
+
- ragbits-core updated to version v0.20.1
|
|
62
|
+
|
|
63
|
+
## 0.20.0 (2025-06-03)
|
|
64
|
+
|
|
65
|
+
### Changed
|
|
66
|
+
|
|
67
|
+
- ragbits-core updated to version v0.20.0
|
|
68
|
+
|
|
69
|
+
## 0.19.1 (2025-05-27)
|
|
70
|
+
|
|
71
|
+
### Changed
|
|
72
|
+
|
|
73
|
+
- ragbits-core updated to version v0.19.1
|
|
74
|
+
|
|
75
|
+
## 0.19.0 (2025-05-27)
|
|
76
|
+
|
|
77
|
+
### Changed
|
|
78
|
+
|
|
79
|
+
- ragbits-core updated to version v0.19.0
|
|
80
|
+
|
|
81
|
+
- Add evals for question answering (#577)
|
|
82
|
+
- Add support for slicing dataset (#576)
|
|
83
|
+
- Separate load and map ops in data loaders (#576)
|
|
84
|
+
|
|
85
|
+
## 0.18.0 (2025-05-22)
|
|
86
|
+
|
|
87
|
+
### Changed
|
|
88
|
+
|
|
89
|
+
- ragbits-core updated to version v0.18.0
|
|
90
|
+
|
|
91
|
+
- Add support for custom column names in evaluation dataset (#566)
|
|
92
|
+
- Add support for reference document ids and page numbers in evaluation dataset (#566)
|
|
93
|
+
- BREAKING CHANGE: Adjust eval pipeline interface to batch processing (#555)
|
|
94
|
+
- Rename DocumentMeta create_text_document_from_literal to from_literal (#561)
|
|
95
|
+
- Adjust typing for DocumentSearch (#554)
|
|
96
|
+
|
|
97
|
+
## 0.17.1 (2025-05-09)
|
|
98
|
+
|
|
99
|
+
### Changed
|
|
100
|
+
|
|
101
|
+
- ragbits-core updated to version v0.17.1
|
|
102
|
+
|
|
103
|
+
## 0.17.0 (2025-05-06)
|
|
104
|
+
|
|
105
|
+
### Changed
|
|
106
|
+
|
|
107
|
+
- ragbits-core updated to version v0.17.0
|
|
108
|
+
|
|
109
|
+
- Add tests for ragbits-evaluate package (#390)
|
|
110
|
+
- Integrate sources with dataloaders (#529)
|
|
111
|
+
|
|
112
|
+
## 0.16.0 (2025-04-29)
|
|
113
|
+
|
|
114
|
+
### Changed
|
|
115
|
+
|
|
116
|
+
- ragbits-core updated to version v0.16.0
|
|
117
|
+
|
|
118
|
+
## 0.15.0 (2025-04-28)
|
|
119
|
+
|
|
120
|
+
### Changed
|
|
121
|
+
|
|
122
|
+
- ragbits-core updated to version v0.15.0
|
|
123
|
+
|
|
124
|
+
## 0.14.0 (2025-04-22)
|
|
125
|
+
|
|
126
|
+
### Changed
|
|
127
|
+
|
|
128
|
+
- ragbits-core updated to version v0.14.0
|
|
129
|
+
|
|
130
|
+
- move sources from ragbits-document-search to ragbits-core (#496)
|
|
131
|
+
|
|
132
|
+
## 0.13.0 (2025-04-02)
|
|
133
|
+
|
|
134
|
+
### Changed
|
|
135
|
+
|
|
136
|
+
- ragbits-core updated to version v0.13.0
|
|
137
|
+
|
|
138
|
+
## 0.12.0 (2025-03-25)
|
|
139
|
+
|
|
140
|
+
### Changed
|
|
141
|
+
|
|
142
|
+
- ragbits-core updated to version v0.12.0
|
|
143
|
+
|
|
144
|
+
## 0.11.0 (2025-03-25)
|
|
145
|
+
|
|
146
|
+
### Changed
|
|
147
|
+
|
|
148
|
+
- ragbits-core updated to version v0.11.0
|
|
149
|
+
|
|
150
|
+
## 0.10.2 (2025-03-21)
|
|
151
|
+
|
|
152
|
+
### Changed
|
|
153
|
+
|
|
154
|
+
- ragbits-core updated to version v0.10.2
|
|
155
|
+
|
|
156
|
+
## 0.10.1 (2025-03-19)
|
|
157
|
+
|
|
158
|
+
### Changed
|
|
159
|
+
|
|
160
|
+
- ragbits-core updated to version v0.10.1
|
|
161
|
+
|
|
162
|
+
## 0.10.0 (2025-03-17)
|
|
163
|
+
|
|
164
|
+
### Changed
|
|
165
|
+
|
|
166
|
+
- ragbits-core updated to version v0.10.0
|
|
167
|
+
|
|
168
|
+
- Compatibility with the new Vector Store interface from ragbits-core (#288)
|
|
169
|
+
- chore: fix typo in README.
|
|
170
|
+
- fix typos in doc strings
|
|
171
|
+
|
|
172
|
+
## 0.9.0 (2025-02-25)
|
|
173
|
+
|
|
174
|
+
### Changed
|
|
175
|
+
|
|
176
|
+
- ragbits-core updated to version v0.9.0
|
|
177
|
+
- Add CLI for document search evaluation (#356)
|
|
178
|
+
- Add local data loader (#334).
|
|
179
|
+
|
|
180
|
+
## 0.8.0 (2025-01-29)
|
|
181
|
+
|
|
182
|
+
### Changed
|
|
183
|
+
|
|
184
|
+
- ragbits-core updated to version v0.8.0
|
|
185
|
+
|
|
186
|
+
## 0.7.0 (2025-01-21)
|
|
187
|
+
|
|
188
|
+
### Added
|
|
189
|
+
|
|
190
|
+
- Simplified interface to document-search evaluation (#258).
|
|
191
|
+
|
|
192
|
+
### Changed
|
|
193
|
+
|
|
194
|
+
- ragbits-core updated to version v0.7.0
|
|
195
|
+
|
|
196
|
+
## 0.6.0 (2024-12-27)
|
|
197
|
+
|
|
198
|
+
### Changed
|
|
199
|
+
|
|
200
|
+
- ragbits-core updated to version v0.6.0
|
|
201
|
+
|
|
202
|
+
## 0.5.1 (2024-12-09)
|
|
203
|
+
|
|
204
|
+
### Changed
|
|
205
|
+
|
|
206
|
+
- ragbits-core updated to version v0.5.1
|
|
207
|
+
- document search evaluation now returns all Element types, rather than only TextElements (#241).
|
|
208
|
+
|
|
209
|
+
## 0.5.0 (2024-12-05)
|
|
210
|
+
|
|
211
|
+
### Changed
|
|
212
|
+
|
|
213
|
+
- ragbits-core updated to version v0.5.0
|
|
214
|
+
|
|
215
|
+
## 0.4.0 (2024-11-27)
|
|
216
|
+
|
|
217
|
+
### Added
|
|
218
|
+
|
|
219
|
+
- Introduced optimization with optuna (#177).
|
|
220
|
+
- Add synthetic data generation pipeline (#165).
|
|
221
|
+
|
|
222
|
+
### Changed
|
|
223
|
+
|
|
224
|
+
- ragbits-core updated to version v0.4.0
|
|
225
|
+
|
|
226
|
+
## 0.3.0 (2024-11-06)
|
|
227
|
+
|
|
228
|
+
### Changed
|
|
229
|
+
|
|
230
|
+
- ragbits-core updated to version v0.3.0
|
|
231
|
+
|
|
232
|
+
## 0.2.0 (2024-10-23)
|
|
233
|
+
|
|
234
|
+
- Initial release of the package.
|
|
235
|
+
- Evaluation pipeline framework with capability to define evaluators & metrics.
|
|
236
|
+
- Evaluation pipeline for `ragbits-document-search`.
|
|
237
|
+
|
|
238
|
+
### Changed
|
|
239
|
+
|
|
240
|
+
- ragbits-core updated to version v0.2.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragbits-evaluate
|
|
3
|
-
Version: 0.15.0
|
|
3
|
+
Version: 1.4.0.dev202601130240
|
|
4
4
|
Summary: Evaluation module for Ragbits components
|
|
5
5
|
Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
|
|
6
6
|
Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
|
|
@@ -22,11 +22,13 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
23
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
24
|
Requires-Python: >=3.10
|
|
25
|
-
Requires-Dist:
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist:
|
|
28
|
-
Requires-Dist:
|
|
29
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: datasets<4.0.0,>=3.0.1
|
|
26
|
+
Requires-Dist: deepeval<3.0.0,>=2.0.0
|
|
27
|
+
Requires-Dist: distilabel<2.0.0,>=1.5.0
|
|
28
|
+
Requires-Dist: hydra-core<2.0.0,>=1.3.2
|
|
29
|
+
Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
|
|
30
|
+
Requires-Dist: optuna<5.0.0,>=4.0.0
|
|
31
|
+
Requires-Dist: ragbits-core==1.4.0.dev202601130240
|
|
30
32
|
Provides-Extra: relari
|
|
31
33
|
Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
|
|
32
34
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "ragbits-evaluate"
|
|
3
|
-
version = "0.15.0"
|
|
3
|
+
version = "1.4.0.dev202601130240"
|
|
4
4
|
description = "Evaluation module for Ragbits components"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -32,7 +32,7 @@ classifiers = [
|
|
|
32
32
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
33
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
34
34
|
]
|
|
35
|
-
dependencies = ["hydra-core
|
|
35
|
+
dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0.dev202601130240", "deepeval>=2.0.0,<3.0.0"]
|
|
36
36
|
|
|
37
37
|
[project.urls]
|
|
38
38
|
"Homepage" = "https://github.com/deepsense-ai/ragbits"
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Agent simulation utilities for evaluation scenarios.
|
|
2
|
+
|
|
3
|
+
This module uses lazy imports for components that require optional dependencies
|
|
4
|
+
(ragbits-agents, ragbits-chat) to allow importing result models independently.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
# Import context, metrics, and result models eagerly - they have no external dependencies
|
|
10
|
+
from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
|
|
11
|
+
from ragbits.evaluate.agent_simulation.metrics import (
|
|
12
|
+
CompositeMetricCollector,
|
|
13
|
+
LatencyMetricCollector,
|
|
14
|
+
MetricCollector,
|
|
15
|
+
TokenUsageMetricCollector,
|
|
16
|
+
ToolUsageMetricCollector,
|
|
17
|
+
)
|
|
18
|
+
from ragbits.evaluate.agent_simulation.results import (
|
|
19
|
+
ConversationMetrics,
|
|
20
|
+
SimulationResult,
|
|
21
|
+
SimulationStatus,
|
|
22
|
+
TaskResult,
|
|
23
|
+
TurnResult,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from ragbits.evaluate.agent_simulation.conversation import run_simulation
|
|
28
|
+
from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
|
|
29
|
+
from ragbits.evaluate.agent_simulation.logger import ConversationLogger
|
|
30
|
+
from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task, Turn
|
|
31
|
+
from ragbits.evaluate.agent_simulation.scenarios import load_personalities, load_scenarios
|
|
32
|
+
from ragbits.evaluate.agent_simulation.simulation import GoalChecker, SimulatedUser
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"CompositeMetricCollector",
|
|
36
|
+
"ConversationLogger",
|
|
37
|
+
"ConversationMetrics",
|
|
38
|
+
"DataSnapshot",
|
|
39
|
+
"DeepEvalEvaluator",
|
|
40
|
+
"DomainContext",
|
|
41
|
+
"GoalChecker",
|
|
42
|
+
"LatencyMetricCollector",
|
|
43
|
+
"MetricCollector",
|
|
44
|
+
"Personality",
|
|
45
|
+
"Scenario",
|
|
46
|
+
"SimulatedUser",
|
|
47
|
+
"SimulationResult",
|
|
48
|
+
"SimulationStatus",
|
|
49
|
+
"Task",
|
|
50
|
+
"TaskResult",
|
|
51
|
+
"TokenUsageMetricCollector",
|
|
52
|
+
"ToolUsageMetricCollector",
|
|
53
|
+
"Turn",
|
|
54
|
+
"TurnResult",
|
|
55
|
+
"load_personalities",
|
|
56
|
+
"load_scenarios",
|
|
57
|
+
"run_simulation",
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def __getattr__(name: str) -> object:
|
|
62
|
+
"""Lazy import for components with optional dependencies."""
|
|
63
|
+
if name == "run_simulation":
|
|
64
|
+
from ragbits.evaluate.agent_simulation.conversation import run_simulation
|
|
65
|
+
|
|
66
|
+
return run_simulation
|
|
67
|
+
if name == "DeepEvalEvaluator":
|
|
68
|
+
from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
|
|
69
|
+
|
|
70
|
+
return DeepEvalEvaluator
|
|
71
|
+
if name == "ConversationLogger":
|
|
72
|
+
from ragbits.evaluate.agent_simulation.logger import ConversationLogger
|
|
73
|
+
|
|
74
|
+
return ConversationLogger
|
|
75
|
+
if name in ("Personality", "Scenario", "Task", "Turn"):
|
|
76
|
+
from ragbits.evaluate.agent_simulation import models
|
|
77
|
+
|
|
78
|
+
return getattr(models, name)
|
|
79
|
+
if name in ("load_personalities", "load_scenarios"):
|
|
80
|
+
from ragbits.evaluate.agent_simulation import scenarios
|
|
81
|
+
|
|
82
|
+
return getattr(scenarios, name)
|
|
83
|
+
if name in ("GoalChecker", "SimulatedUser"):
|
|
84
|
+
from ragbits.evaluate.agent_simulation import simulation
|
|
85
|
+
|
|
86
|
+
return getattr(simulation, name)
|
|
87
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Context models for agent simulation scenarios."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
DEFAULT_MAX_ITEMS_IN_PROMPT = 15
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class DomainContext:
    """Domain-specific context for goal checking and simulation.

    Provides additional context to the GoalChecker to avoid false negatives
    from value interpretation differences or missing domain knowledge.

    The context is intentionally generic - use the `metadata` field for any
    domain-specific information that doesn't fit the standard fields.

    Example:
        >>> context = DomainContext(
        ...     domain_type="customer_support",
        ...     locale="en_US",
        ...     metadata={"ticket_statuses": ["open", "pending", "resolved"]},
        ... )
        >>> result = await goal_checker.is_task_achieved(task, history, context=context)
    """

    domain_type: str
    """Type of domain (e.g., "customer_support", "booking", "search", "qa")."""

    locale: str = "en_US"
    """Locale for language and formatting (e.g., "en_US", "de_DE")."""

    metadata: dict[str, Any] = field(default_factory=dict)
    """Arbitrary domain-specific metadata for goal checking context."""

    def format_for_prompt(self, max_items: int = DEFAULT_MAX_ITEMS_IN_PROMPT) -> str:
        """Format context for inclusion in LLM prompts.

        Args:
            max_items: Maximum number of elements shown for any list-valued
                metadata entry before it is truncated. Negative values are
                treated as 0. Defaults to ``DEFAULT_MAX_ITEMS_IN_PROMPT``, which
                matches the previous hard-coded behavior.

        Returns:
            Formatted string suitable for prompt injection.
        """
        # A negative limit would slice from the end of the list and produce a
        # wrong "more" count, so clamp it.
        max_items = max(max_items, 0)

        parts = [
            f"Domain: {self.domain_type}",
            f"Locale: {self.locale}",
        ]

        if self.metadata:
            parts.append("Additional context:")
            for key, value in self.metadata.items():
                # Only lists are truncated; every other value is rendered as-is.
                if isinstance(value, list) and len(value) > max_items:
                    truncated = value[:max_items]
                    parts.append(f" {key}: {truncated} ... and {len(value) - max_items} more")
                else:
                    parts.append(f" {key}: {value}")

        return "\n".join(parts)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class DataSnapshot:
    """Sample of available data to ground simulated user requests.

    Provides the simulated user with knowledge of what data actually exists,
    preventing unrealistic requests for non-existent entities.

    The snapshot is intentionally generic - store any domain-specific data
    in the `entities` dict with descriptive keys.

    Example:
        >>> snapshot = DataSnapshot(
        ...     entities={
        ...         "available_topics": ["billing", "technical", "returns"],
        ...         "sample_users": [{"id": "u1", "name": "John"}],
        ...     },
        ...     description="Customer support knowledge base",
        ... )
        >>> # SimulatedUser will only reference items from this data
    """

    entities: dict[str, list[Any]] = field(default_factory=dict)
    """Named collections of available entities (e.g., {"users": [...], "documents": [...]})."""

    description: str = ""
    """Optional description of the data snapshot for context."""

    def format_for_prompt(self, max_items: int = DEFAULT_MAX_ITEMS_IN_PROMPT) -> str:
        """Format data snapshot for inclusion in LLM prompts.

        Args:
            max_items: Maximum number of items to include per entity type.
                Negative values are treated as 0.

        Returns:
            Formatted string suitable for prompt injection.
        """
        # A negative limit would slice from the end of the list and produce a
        # wrong "more" count, so clamp it.
        max_items = max(max_items, 0)

        parts = []

        if self.description:
            parts.append(f"Context: {self.description}")

        for entity_name, entity_list in self.entities.items():
            if not entity_list:
                continue

            truncated = entity_list[:max_items]
            # Dicts with a 'name' key are shown by that name, everything else by
            # str(). The name itself goes through str() too, because str.join
            # raises TypeError on non-string elements (e.g. {"name": 42}).
            formatted_items = [
                str(item["name"]) if isinstance(item, dict) and "name" in item else str(item)
                for item in truncated
            ]

            parts.append(f"{entity_name}: {', '.join(formatted_items)}")
            if len(entity_list) > max_items:
                parts.append(f" ... and {len(entity_list) - max_items} more")

        return "\n".join(parts)
|