ragbits-evaluate 0.15.0__tar.gz → 1.4.0.dev202601130240__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/.gitignore +13 -0
  2. ragbits_evaluate-1.4.0.dev202601130240/CHANGELOG.md +240 -0
  3. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/PKG-INFO +8 -6
  4. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/pyproject.toml +2 -2
  5. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/__init__.py +87 -0
  6. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/context.py +118 -0
  7. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/conversation.py +333 -0
  8. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
  9. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/logger.py +165 -0
  10. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
  11. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
  12. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
  13. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/models.py +37 -0
  14. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/results.py +200 -0
  15. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/scenarios.py +129 -0
  16. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/agent_simulation/simulation.py +243 -0
  17. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/cli.py +40 -39
  18. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/config.py +11 -0
  19. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/__init__.py +3 -0
  20. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/base.py +95 -0
  21. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/document_search.py +61 -0
  22. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/exceptions.py +25 -0
  23. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/gaia.py +78 -0
  24. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
  25. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/human_eval.py +70 -0
  26. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/dataloaders/question_answer.py +56 -0
  27. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +1 -1
  28. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +1 -1
  29. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/evaluator.py +244 -0
  30. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/factories/__init__.py +42 -0
  31. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/metrics/base.py +21 -13
  32. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/metrics/document_search.py +14 -3
  33. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/gaia.py +84 -0
  34. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/hotpot_qa.py +51 -0
  35. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/human_eval.py +105 -0
  36. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/metrics/question_answer.py +205 -0
  37. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/optimizer.py +20 -20
  38. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/pipelines/__init__.py +15 -4
  39. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/base.py +64 -0
  40. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/pipelines/document_search.py +39 -19
  41. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/gaia.py +249 -0
  42. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
  43. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/human_eval.py +323 -0
  44. ragbits_evaluate-1.4.0.dev202601130240/src/ragbits/evaluate/pipelines/question_answer.py +96 -0
  45. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/utils.py +48 -14
  46. ragbits_evaluate-1.4.0.dev202601130240/tests/cli/test_run_evaluation.py +50 -0
  47. ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_agent_simulation_context.py +264 -0
  48. ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_agent_simulation_metrics.py +360 -0
  49. ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_agent_simulation_results.py +406 -0
  50. ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_evaluator.py +198 -0
  51. ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_metrics.py +217 -0
  52. ragbits_evaluate-1.4.0.dev202601130240/tests/unit/test_optimizer.py +124 -0
  53. ragbits_evaluate-0.15.0/CHANGELOG.md +0 -126
  54. ragbits_evaluate-0.15.0/src/ragbits/evaluate/config.py +0 -13
  55. ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/__init__.py +0 -21
  56. ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/base.py +0 -21
  57. ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/hf.py +0 -29
  58. ragbits_evaluate-0.15.0/src/ragbits/evaluate/dataloaders/local.py +0 -45
  59. ragbits_evaluate-0.15.0/src/ragbits/evaluate/evaluator.py +0 -161
  60. ragbits_evaluate-0.15.0/src/ragbits/evaluate/factories/__init__.py +0 -44
  61. ragbits_evaluate-0.15.0/src/ragbits/evaluate/pipelines/base.py +0 -41
  62. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/README.md +0 -0
  63. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/__init__.py +0 -0
  64. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
  65. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
  66. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  67. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
  68. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
  69. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  70. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
  71. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  72. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
  73. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
  74. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  75. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
  76. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/metrics/__init__.py +0 -0
  77. {ragbits_evaluate-0.15.0 → ragbits_evaluate-1.4.0.dev202601130240}/src/ragbits/evaluate/py.typed +0 -0
@@ -8,6 +8,10 @@ venv/
8
8
  .venv/
9
9
  __pycache__/
10
10
  **.egg-info/
11
+ .deepeval/
12
+
13
+ # Local cursor rules
14
+ .cursor/rules/local/
11
15
 
12
16
  # Byte-compiled / optimized / DLL files
13
17
  __pycache__/
@@ -101,3 +105,12 @@ qdrant/
101
105
  .aider*
102
106
 
103
107
  .DS_Store
108
+ node_modules/
109
+
110
+ lazygit
111
+
112
+ lazygit.tar.gz
113
+
114
+ # chat conversation logs
115
+ duet_conversation.log
116
+ worktrees/
@@ -0,0 +1,240 @@
1
+ # CHANGELOG
2
+
3
+ ## Unreleased
4
+
5
+ - Feat: introduce agent evaluation pipelines and metrics (HotpotQA, HumanEval, GAIA) (#829)
6
+
7
+ - Feat: introduce agent simulation module with utilities for agent-to-agent conversation and evaluation scenarios (#857)
8
+
9
+ - Feat: add structured results to agent simulation with `SimulationResult`, `TurnResult`, `TaskResult`, and `ConversationMetrics` models (#885)
10
+
11
+ - Feat: add generic `DomainContext` and `DataSnapshot` for flexible agent simulation context (#884)
12
+
13
+ - Feat: add metrics collection system for agent simulation (`MetricCollector` protocol, `LatencyMetricCollector`, `TokenUsageMetricCollector`, `ToolUsageMetricCollector`) (#882)
14
+
15
+ ## 1.3.0 (2025-09-11)
16
+
17
+ ### Changed
18
+
19
+ - ragbits-core updated to version v1.3.0
20
+
21
+ - Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
22
+
23
+ ## 1.2.2 (2025-08-08)
24
+
25
+ ### Changed
26
+
27
+ - ragbits-core updated to version v1.2.2
28
+
29
+ ## 1.2.1 (2025-08-04)
30
+
31
+ ### Changed
32
+
33
+ - ragbits-core updated to version v1.2.1
34
+
35
+ ## 1.2.0 (2025-08-01)
36
+
37
+ ### Changed
38
+
39
+ - ragbits-core updated to version v1.2.0
40
+
41
+ ## 1.1.0 (2025-07-09)
42
+
43
+ ### Changed
44
+
45
+ - ragbits-core updated to version v1.1.0
46
+
47
+ - Update qa data loader docstring (#565)
48
+ - Fix deadlock on qa metrics compute (#609)
49
+ - Upgrade distilabel version to 1.5.0 (#682)
50
+
51
+ ## 1.0.0 (2025-06-04)
52
+
53
+ ### Changed
54
+
55
+ - ragbits-core updated to version v1.0.0
56
+
57
+ ## 0.20.1 (2025-06-04)
58
+
59
+ ### Changed
60
+
61
+ - ragbits-core updated to version v0.20.1
62
+
63
+ ## 0.20.0 (2025-06-03)
64
+
65
+ ### Changed
66
+
67
+ - ragbits-core updated to version v0.20.0
68
+
69
+ ## 0.19.1 (2025-05-27)
70
+
71
+ ### Changed
72
+
73
+ - ragbits-core updated to version v0.19.1
74
+
75
+ ## 0.19.0 (2025-05-27)
76
+
77
+ ### Changed
78
+
79
+ - ragbits-core updated to version v0.19.0
80
+
81
+ - Add evals for question answering (#577)
82
+ - Add support for slicing dataset (#576)
83
+ - Separate load and map ops in data loaders (#576)
84
+
85
+ ## 0.18.0 (2025-05-22)
86
+
87
+ ### Changed
88
+
89
+ - ragbits-core updated to version v0.18.0
90
+
91
+ - Add support for custom column names in evaluation dataset (#566)
92
+ - Add support for reference document ids and page numbers in evaluation dataset (#566)
93
+ - BREAKING CHANGE: Adjust eval pipeline interface to batch processing (#555)
94
+ - Rename DocumentMeta create_text_document_from_literal to from_literal (#561)
95
+ - Adjust typing for DocumentSearch (#554)
96
+
97
+ ## 0.17.1 (2025-05-09)
98
+
99
+ ### Changed
100
+
101
+ - ragbits-core updated to version v0.17.1
102
+
103
+ ## 0.17.0 (2025-05-06)
104
+
105
+ ### Changed
106
+
107
+ - ragbits-core updated to version v0.17.0
108
+
109
+ - Add tests for ragbits-evaluate package (#390)
110
+ - Integrate sources with dataloaders (#529)
111
+
112
+ ## 0.16.0 (2025-04-29)
113
+
114
+ ### Changed
115
+
116
+ - ragbits-core updated to version v0.16.0
117
+
118
+ ## 0.15.0 (2025-04-28)
119
+
120
+ ### Changed
121
+
122
+ - ragbits-core updated to version v0.15.0
123
+
124
+ ## 0.14.0 (2025-04-22)
125
+
126
+ ### Changed
127
+
128
+ - ragbits-core updated to version v0.14.0
129
+
130
+ - move sources from ragbits-document-search to ragbits-core (#496)
131
+
132
+ ## 0.13.0 (2025-04-02)
133
+
134
+ ### Changed
135
+
136
+ - ragbits-core updated to version v0.13.0
137
+
138
+ ## 0.12.0 (2025-03-25)
139
+
140
+ ### Changed
141
+
142
+ - ragbits-core updated to version v0.12.0
143
+
144
+ ## 0.11.0 (2025-03-25)
145
+
146
+ ### Changed
147
+
148
+ - ragbits-core updated to version v0.11.0
149
+
150
+ ## 0.10.2 (2025-03-21)
151
+
152
+ ### Changed
153
+
154
+ - ragbits-core updated to version v0.10.2
155
+
156
+ ## 0.10.1 (2025-03-19)
157
+
158
+ ### Changed
159
+
160
+ - ragbits-core updated to version v0.10.1
161
+
162
+ ## 0.10.0 (2025-03-17)
163
+
164
+ ### Changed
165
+
166
+ - ragbits-core updated to version v0.10.0
167
+
168
+ - Compatibility with the new Vector Store interface from ragbits-core (#288)
169
+ - chore: fix typo in README.
170
+ - fix typos in doc strings
171
+
172
+ ## 0.9.0 (2025-02-25)
173
+
174
+ ### Changed
175
+
176
+ - ragbits-core updated to version v0.9.0
177
+ - Add cli for document search evaluation (#356)
178
+ - Add local data loader (#334).
179
+
180
+ ## 0.8.0 (2025-01-29)
181
+
182
+ ### Changed
183
+
184
+ - ragbits-core updated to version v0.8.0
185
+
186
+ ## 0.7.0 (2025-01-21)
187
+
188
+ ### Added
189
+
190
+ - Simplified interface to document-search evaluation (#258).
191
+
192
+ ### Changed
193
+
194
+ - ragbits-core updated to version v0.7.0
195
+
196
+ ## 0.6.0 (2024-12-27)
197
+
198
+ ### Changed
199
+
200
+ - ragbits-core updated to version v0.6.0
201
+
202
+ ## 0.5.1 (2024-12-09)
203
+
204
+ ### Changed
205
+
206
+ - ragbits-core updated to version v0.5.1
207
+ - document search evaluation now returns all Element types, rather than only TextElements (#241).
208
+
209
+ ## 0.5.0 (2024-12-05)
210
+
211
+ ### Changed
212
+
213
+ - ragbits-core updated to version v0.5.0
214
+
215
+ ## 0.4.0 (2024-11-27)
216
+
217
+ ### Added
218
+
219
+ - Introduced optimization with optuna (#177).
220
+ - Add synthetic data generation pipeline (#165).
221
+
222
+ ### Changed
223
+
224
+ - ragbits-core updated to version v0.4.0
225
+
226
+ ## 0.3.0 (2024-11-06)
227
+
228
+ ### Changed
229
+
230
+ - ragbits-core updated to version v0.3.0
231
+
232
+ ## 0.2.0 (2024-10-23)
233
+
234
+ - Initial release of the package.
235
+ - Evaluation pipeline framework with capability to define evaluators & metrics.
236
+ - Evaluation pipeline for `ragbits-document-search`.
237
+
238
+ ### Changed
239
+
240
+ - ragbits-core updated to version v0.2.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragbits-evaluate
3
- Version: 0.15.0
3
+ Version: 1.4.0.dev202601130240
4
4
  Summary: Evaluation module for Ragbits components
5
5
  Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
6
6
  Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -22,11 +22,13 @@ Classifier: Programming Language :: Python :: 3.13
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
24
  Requires-Python: >=3.10
25
- Requires-Dist: distilabel==1.4.1
26
- Requires-Dist: hydra-core~=1.3.2
27
- Requires-Dist: neptune[optuna]~=1.12.0
28
- Requires-Dist: optuna==4.0.0
29
- Requires-Dist: ragbits-core==0.15.0
25
+ Requires-Dist: datasets<4.0.0,>=3.0.1
26
+ Requires-Dist: deepeval<3.0.0,>=2.0.0
27
+ Requires-Dist: distilabel<2.0.0,>=1.5.0
28
+ Requires-Dist: hydra-core<2.0.0,>=1.3.2
29
+ Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
30
+ Requires-Dist: optuna<5.0.0,>=4.0.0
31
+ Requires-Dist: ragbits-core==1.4.0.dev202601130240
30
32
  Provides-Extra: relari
31
33
  Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
32
34
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ragbits-evaluate"
3
- version = "0.15.0"
3
+ version = "1.4.0.dev202601130240"
4
4
  description = "Evaluation module for Ragbits components"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
32
32
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
33
33
  "Topic :: Software Development :: Libraries :: Python Modules",
34
34
  ]
35
- dependencies = ["hydra-core~=1.3.2", "neptune[optuna]~=1.12.0", "optuna==4.0.0", "distilabel==1.4.1", "ragbits-core==0.15.0"]
35
+ dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0.dev202601130240", "deepeval>=2.0.0,<3.0.0"]
36
36
 
37
37
  [project.urls]
38
38
  "Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -0,0 +1,87 @@
1
+ """Agent simulation utilities for evaluation scenarios.
2
+
3
+ This module uses lazy imports for components that require optional dependencies
4
+ (ragbits-agents, ragbits-chat) to allow importing result models independently.
5
+ """
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ # Import context, metrics, and result models eagerly - they have no external dependencies
10
+ from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
11
+ from ragbits.evaluate.agent_simulation.metrics import (
12
+ CompositeMetricCollector,
13
+ LatencyMetricCollector,
14
+ MetricCollector,
15
+ TokenUsageMetricCollector,
16
+ ToolUsageMetricCollector,
17
+ )
18
+ from ragbits.evaluate.agent_simulation.results import (
19
+ ConversationMetrics,
20
+ SimulationResult,
21
+ SimulationStatus,
22
+ TaskResult,
23
+ TurnResult,
24
+ )
25
+
26
+ if TYPE_CHECKING:
27
+ from ragbits.evaluate.agent_simulation.conversation import run_simulation
28
+ from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
29
+ from ragbits.evaluate.agent_simulation.logger import ConversationLogger
30
+ from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task, Turn
31
+ from ragbits.evaluate.agent_simulation.scenarios import load_personalities, load_scenarios
32
+ from ragbits.evaluate.agent_simulation.simulation import GoalChecker, SimulatedUser
33
+
34
+ __all__ = [
35
+ "CompositeMetricCollector",
36
+ "ConversationLogger",
37
+ "ConversationMetrics",
38
+ "DataSnapshot",
39
+ "DeepEvalEvaluator",
40
+ "DomainContext",
41
+ "GoalChecker",
42
+ "LatencyMetricCollector",
43
+ "MetricCollector",
44
+ "Personality",
45
+ "Scenario",
46
+ "SimulatedUser",
47
+ "SimulationResult",
48
+ "SimulationStatus",
49
+ "Task",
50
+ "TaskResult",
51
+ "TokenUsageMetricCollector",
52
+ "ToolUsageMetricCollector",
53
+ "Turn",
54
+ "TurnResult",
55
+ "load_personalities",
56
+ "load_scenarios",
57
+ "run_simulation",
58
+ ]
59
+
60
+
61
+ def __getattr__(name: str) -> object:
62
+ """Lazy import for components with optional dependencies."""
63
+ if name == "run_simulation":
64
+ from ragbits.evaluate.agent_simulation.conversation import run_simulation
65
+
66
+ return run_simulation
67
+ if name == "DeepEvalEvaluator":
68
+ from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
69
+
70
+ return DeepEvalEvaluator
71
+ if name == "ConversationLogger":
72
+ from ragbits.evaluate.agent_simulation.logger import ConversationLogger
73
+
74
+ return ConversationLogger
75
+ if name in ("Personality", "Scenario", "Task", "Turn"):
76
+ from ragbits.evaluate.agent_simulation import models
77
+
78
+ return getattr(models, name)
79
+ if name in ("load_personalities", "load_scenarios"):
80
+ from ragbits.evaluate.agent_simulation import scenarios
81
+
82
+ return getattr(scenarios, name)
83
+ if name in ("GoalChecker", "SimulatedUser"):
84
+ from ragbits.evaluate.agent_simulation import simulation
85
+
86
+ return getattr(simulation, name)
87
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,118 @@
1
+ """Context models for agent simulation scenarios."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+ DEFAULT_MAX_ITEMS_IN_PROMPT = 15
7
+
8
+
9
+ @dataclass
10
+ class DomainContext:
11
+ """Domain-specific context for goal checking and simulation.
12
+
13
+ Provides additional context to the GoalChecker to avoid false negatives
14
+ from value interpretation differences or missing domain knowledge.
15
+
16
+ The context is intentionally generic - use the `metadata` field for any
17
+ domain-specific information that doesn't fit the standard fields.
18
+
19
+ Example:
20
+ >>> context = DomainContext(
21
+ ... domain_type="customer_support",
22
+ ... locale="en_US",
23
+ ... metadata={"ticket_statuses": ["open", "pending", "resolved"]},
24
+ ... )
25
+ >>> result = await goal_checker.is_task_achieved(task, history, context=context)
26
+ """
27
+
28
+ domain_type: str
29
+ """Type of domain (e.g., "customer_support", "booking", "search", "qa")."""
30
+
31
+ locale: str = "en_US"
32
+ """Locale for language and formatting (e.g., "en_US", "de_DE")."""
33
+
34
+ metadata: dict[str, Any] = field(default_factory=dict)
35
+ """Arbitrary domain-specific metadata for goal checking context."""
36
+
37
+ def format_for_prompt(self) -> str:
38
+ """Format context for inclusion in LLM prompts.
39
+
40
+ Returns:
41
+ Formatted string suitable for prompt injection.
42
+ """
43
+ parts = [
44
+ f"Domain: {self.domain_type}",
45
+ f"Locale: {self.locale}",
46
+ ]
47
+
48
+ if self.metadata:
49
+ parts.append("Additional context:")
50
+ for key, value in self.metadata.items():
51
+ if isinstance(value, list) and len(value) > DEFAULT_MAX_ITEMS_IN_PROMPT:
52
+ truncated = value[:DEFAULT_MAX_ITEMS_IN_PROMPT]
53
+ parts.append(f" {key}: {truncated} ... and {len(value) - DEFAULT_MAX_ITEMS_IN_PROMPT} more")
54
+ else:
55
+ parts.append(f" {key}: {value}")
56
+
57
+ return "\n".join(parts)
58
+
59
+
60
+ @dataclass
61
+ class DataSnapshot:
62
+ """Sample of available data to ground simulated user requests.
63
+
64
+ Provides the simulated user with knowledge of what data actually exists,
65
+ preventing unrealistic requests for non-existent entities.
66
+
67
+ The snapshot is intentionally generic - store any domain-specific data
68
+ in the `entities` dict with descriptive keys.
69
+
70
+ Example:
71
+ >>> snapshot = DataSnapshot(
72
+ ... entities={
73
+ ... "available_topics": ["billing", "technical", "returns"],
74
+ ... "sample_users": [{"id": "u1", "name": "John"}],
75
+ ... },
76
+ ... description="Customer support knowledge base",
77
+ ... )
78
+ >>> # SimulatedUser will only reference items from this data
79
+ """
80
+
81
+ entities: dict[str, list[Any]] = field(default_factory=dict)
82
+ """Named collections of available entities (e.g., {"users": [...], "documents": [...]})."""
83
+
84
+ description: str = ""
85
+ """Optional description of the data snapshot for context."""
86
+
87
+ def format_for_prompt(self, max_items: int = DEFAULT_MAX_ITEMS_IN_PROMPT) -> str:
88
+ """Format data snapshot for inclusion in LLM prompts.
89
+
90
+ Args:
91
+ max_items: Maximum number of items to include per entity type.
92
+
93
+ Returns:
94
+ Formatted string suitable for prompt injection.
95
+ """
96
+ parts = []
97
+
98
+ if self.description:
99
+ parts.append(f"Context: {self.description}")
100
+
101
+ for entity_name, entity_list in self.entities.items():
102
+ if not entity_list:
103
+ continue
104
+
105
+ truncated = entity_list[:max_items]
106
+ # Format items - if dicts with 'name', use that; otherwise str()
107
+ formatted_items = []
108
+ for item in truncated:
109
+ if isinstance(item, dict) and "name" in item:
110
+ formatted_items.append(item["name"])
111
+ else:
112
+ formatted_items.append(str(item))
113
+
114
+ parts.append(f"{entity_name}: {', '.join(formatted_items)}")
115
+ if len(entity_list) > max_items:
116
+ parts.append(f" ... and {len(entity_list) - max_items} more")
117
+
118
+ return "\n".join(parts)