ragbits-evaluate 0.0.8.dev23005__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/__init__.py +0 -0
- ragbits/evaluate/agent_simulation/__init__.py +122 -0
- ragbits/evaluate/agent_simulation/context.py +140 -0
- ragbits/evaluate/agent_simulation/conversation.py +515 -0
- ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
- ragbits/evaluate/agent_simulation/logger.py +165 -0
- ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
- ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
- ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
- ragbits/evaluate/agent_simulation/models.py +37 -0
- ragbits/evaluate/agent_simulation/results.py +200 -0
- ragbits/evaluate/agent_simulation/scenarios.py +129 -0
- ragbits/evaluate/agent_simulation/simulation.py +245 -0
- ragbits/evaluate/cli.py +150 -0
- ragbits/evaluate/config.py +11 -0
- ragbits/evaluate/dataloaders/__init__.py +3 -0
- ragbits/evaluate/dataloaders/base.py +95 -0
- ragbits/evaluate/dataloaders/document_search.py +61 -0
- ragbits/evaluate/dataloaders/exceptions.py +25 -0
- ragbits/evaluate/dataloaders/gaia.py +78 -0
- ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
- ragbits/evaluate/dataloaders/human_eval.py +70 -0
- ragbits/evaluate/dataloaders/question_answer.py +56 -0
- ragbits/evaluate/dataset_generator/__init__.py +0 -0
- ragbits/evaluate/dataset_generator/pipeline.py +141 -0
- ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +21 -0
- ragbits/evaluate/dataset_generator/prompts/qa.py +83 -0
- ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +67 -0
- ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- ragbits/evaluate/dataset_generator/tasks/filter/base.py +43 -0
- ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +34 -0
- ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +66 -0
- ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +96 -0
- ragbits/evaluate/dataset_generator/utils.py +43 -0
- ragbits/evaluate/evaluator.py +244 -0
- ragbits/evaluate/factories/__init__.py +42 -0
- ragbits/evaluate/metrics/__init__.py +3 -0
- ragbits/evaluate/metrics/base.py +89 -0
- ragbits/evaluate/metrics/document_search.py +90 -0
- ragbits/evaluate/metrics/gaia.py +84 -0
- ragbits/evaluate/metrics/hotpot_qa.py +51 -0
- ragbits/evaluate/metrics/human_eval.py +105 -0
- ragbits/evaluate/metrics/question_answer.py +205 -0
- ragbits/evaluate/optimizer.py +210 -0
- ragbits/evaluate/pipelines/__init__.py +37 -0
- ragbits/evaluate/pipelines/base.py +64 -0
- ragbits/evaluate/pipelines/document_search.py +106 -0
- ragbits/evaluate/pipelines/gaia.py +249 -0
- ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
- ragbits/evaluate/pipelines/human_eval.py +323 -0
- ragbits/evaluate/pipelines/question_answer.py +96 -0
- ragbits/evaluate/py.typed +0 -0
- ragbits/evaluate/utils.py +160 -0
- ragbits_evaluate-0.0.8.dev23005.dist-info/METADATA +58 -0
- ragbits_evaluate-0.0.8.dev23005.dist-info/RECORD +59 -0
- ragbits_evaluate-0.0.8.dev23005.dist-info/WHEEL +4 -0
|
File without changes
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Agent simulation utilities for evaluation scenarios.
|
|
2
|
+
|
|
3
|
+
This module uses lazy imports for components that require optional dependencies
|
|
4
|
+
(ragbits-agents, ragbits-chat) to allow importing result models independently.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
# Import context, metrics, and result models eagerly - they have no external dependencies
|
|
10
|
+
# Adapters are re-exported from ragbits.chat.adapters for convenience
|
|
11
|
+
from ragbits.chat.adapters import (
|
|
12
|
+
AdapterContext,
|
|
13
|
+
AdapterPipeline,
|
|
14
|
+
BaseAdapter,
|
|
15
|
+
ChatResponseAdapter,
|
|
16
|
+
FilterAdapter,
|
|
17
|
+
ResponseAdapter,
|
|
18
|
+
TextAccumulatorAdapter,
|
|
19
|
+
ToolCallAccumulatorAdapter,
|
|
20
|
+
ToolResultTextAdapter,
|
|
21
|
+
UsageAggregatorAdapter,
|
|
22
|
+
)
|
|
23
|
+
from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
|
|
24
|
+
from ragbits.evaluate.agent_simulation.metrics import (
|
|
25
|
+
CompositeMetricCollector,
|
|
26
|
+
LatencyMetricCollector,
|
|
27
|
+
MetricCollector,
|
|
28
|
+
TokenUsageMetricCollector,
|
|
29
|
+
ToolUsageMetricCollector,
|
|
30
|
+
)
|
|
31
|
+
from ragbits.evaluate.agent_simulation.results import (
|
|
32
|
+
ConversationMetrics,
|
|
33
|
+
SimulationResult,
|
|
34
|
+
SimulationStatus,
|
|
35
|
+
TaskResult,
|
|
36
|
+
TurnResult,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
from ragbits.evaluate.agent_simulation.conversation import (
|
|
41
|
+
run_scenario_matrix,
|
|
42
|
+
run_simulation,
|
|
43
|
+
run_simulations_concurrent,
|
|
44
|
+
)
|
|
45
|
+
from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
|
|
46
|
+
from ragbits.evaluate.agent_simulation.logger import ConversationLogger
|
|
47
|
+
from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task, Turn
|
|
48
|
+
from ragbits.evaluate.agent_simulation.scenarios import load_personalities, load_scenarios
|
|
49
|
+
from ragbits.evaluate.agent_simulation.simulation import GoalChecker, SimulatedUser
|
|
50
|
+
|
|
51
|
+
# Public API of the agent_simulation package. Names under the
# "lazy loaded" headings are not imported at module load time; they are
# resolved on first attribute access by the module-level __getattr__
# below (PEP 562), because their modules need optional dependencies.
__all__ = [
    # Adapters
    "AdapterContext",
    "AdapterPipeline",
    "BaseAdapter",
    "ChatResponseAdapter",
    "FilterAdapter",
    "ResponseAdapter",
    "TextAccumulatorAdapter",
    "ToolCallAccumulatorAdapter",
    "ToolResultTextAdapter",
    "UsageAggregatorAdapter",
    # Metrics
    "CompositeMetricCollector",
    "LatencyMetricCollector",
    "MetricCollector",
    "TokenUsageMetricCollector",
    "ToolUsageMetricCollector",
    # Context
    "DataSnapshot",
    "DomainContext",
    # Results
    "ConversationMetrics",
    "SimulationResult",
    "SimulationStatus",
    "TaskResult",
    "TurnResult",
    # Components (lazy loaded)
    "ConversationLogger",
    "DeepEvalEvaluator",
    "GoalChecker",
    "Personality",
    "Scenario",
    "SimulatedUser",
    "Task",
    "Turn",
    # Functions (lazy loaded)
    "load_personalities",
    "load_scenarios",
    "run_scenario_matrix",
    "run_simulation",
    "run_simulations_concurrent",
]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def __getattr__(name: str) -> object:
    """Resolve lazily-exported names on first attribute access (PEP 562).

    Components whose modules need optional dependencies (ragbits-agents,
    ragbits-chat) are imported only when actually requested, so the result
    models exported eagerly above stay importable without those extras.

    Raises:
        AttributeError: If ``name`` is not a lazily-exported attribute.
    """
    # Maps each lazily-exported name to the submodule that defines it.
    lazy_exports = {
        "run_simulation": "conversation",
        "run_simulations_concurrent": "conversation",
        "run_scenario_matrix": "conversation",
        "DeepEvalEvaluator": "deepeval_evaluator",
        "ConversationLogger": "logger",
        "Personality": "models",
        "Scenario": "models",
        "Task": "models",
        "Turn": "models",
        "load_personalities": "scenarios",
        "load_scenarios": "scenarios",
        "GoalChecker": "simulation",
        "SimulatedUser": "simulation",
    }
    if name not in lazy_exports:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # __import__ with a non-empty fromlist returns the submodule itself,
    # equivalent to "from ragbits.evaluate.agent_simulation import <sub>".
    module = __import__(
        f"ragbits.evaluate.agent_simulation.{lazy_exports[name]}",
        fromlist=[name],
    )
    return getattr(module, name)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Context models for agent simulation scenarios."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
# Caps on how many items each listing contributes to an LLM prompt.
MAX_MERCHANTS_IN_PROMPT = 5
MAX_PRODUCTS_IN_PROMPT = 10
MAX_CATEGORIES_IN_PROMPT = 10
DEFAULT_MAX_ITEMS_IN_PROMPT = 15


@dataclass
class DomainContext:
    """Domain-specific context for goal checking and simulation.

    Supplies the GoalChecker with currency, locale, and business knowledge
    so it does not report false negatives caused by currency confusion,
    locale differences, or missing domain facts.

    Example:
        >>> context = DomainContext(
        ...     domain_type="retail", currency="SAR", locale="ar_SA", business_rules={"prices_include_vat": True}
        ... )
        >>> result = await goal_checker.is_task_achieved(task, history, context=context)
    """

    # Type of domain: "food", "retail", "travel", "groceries", etc.
    domain_type: str
    # Currency code used to interpret prices (e.g. "USD", "SAR", "EUR").
    currency: str = "USD"
    # Locale controlling language and formatting (e.g. "en_US", "ar_SA").
    locale: str = "en_US"
    # Merchants/vendors known to exist, used for validation.
    available_merchants: list[dict[str, Any]] = field(default_factory=list)
    # Products known to exist, used for validation.
    available_products: list[dict[str, Any]] = field(default_factory=list)
    # Domain-specific business rules (e.g. {"min_order": 50, "delivery_fee": 10}).
    business_rules: dict[str, Any] = field(default_factory=dict)

    def format_for_prompt(self) -> str:
        """Render this context as plain text for injection into an LLM prompt.

        Returns:
            Newline-separated summary: domain, currency, locale, business
            rules, then capped previews of merchants and products with an
            overflow note when a list is truncated.
        """
        lines = [
            f"Domain: {self.domain_type}",
            f"Currency: {self.currency}",
            f"Locale: {self.locale}",
        ]

        if self.business_rules:
            rendered_rules = ", ".join(f"{key}={value}" for key, value in self.business_rules.items())
            lines.append(f"Business Rules: {rendered_rules}")

        # Both listings share the same shape: a capped name preview plus an
        # overflow note; each entry's "name" key is used, falling back to str().
        listings = (
            ("Available Merchants", self.available_merchants, MAX_MERCHANTS_IN_PROMPT),
            ("Sample Products", self.available_products, MAX_PRODUCTS_IN_PROMPT),
        )
        for label, entries, cap in listings:
            if not entries:
                continue
            preview = ", ".join(entry.get("name", str(entry)) for entry in entries[:cap])
            lines.append(f"{label}: {preview}")
            if len(entries) > cap:
                lines.append(f" ... and {len(entries) - cap} more")

        return "\n".join(lines)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class DataSnapshot:
    """Sample of available data used to ground simulated user requests.

    Gives the simulated user knowledge of what actually exists, so it does
    not make unrealistic requests (e.g. asking for "sushi" when only
    burgers are available).

    Example:
        >>> snapshot = DataSnapshot(
        ...     merchants=[{"name": "Burger House"}],
        ...     sample_products=[{"name": "Classic Burger"}, {"name": "Fries"}],
        ...     categories=["burgers", "sides", "drinks"],
        ... )
        >>> # SimulatedUser will only request items from this data
    """

    # Merchants/restaurants/stores present in the dataset.
    merchants: list[dict[str, Any]] = field(default_factory=list)
    # Products available for ordering/browsing.
    sample_products: list[dict[str, Any]] = field(default_factory=list)
    # Product/service categories present in the dataset.
    categories: list[str] = field(default_factory=list)
    # Highest price in the dataset (for realistic price constraints).
    max_price: float | None = None
    # Lowest price in the dataset.
    min_price: float | None = None

    def format_for_prompt(self, max_items: int = DEFAULT_MAX_ITEMS_IN_PROMPT) -> str:
        """Render the snapshot as plain text for injection into an LLM prompt.

        Args:
            max_items: Cap on merchants and products shown per listing
                (categories are capped by MAX_CATEGORIES_IN_PROMPT instead).

        Returns:
            Newline-separated summary of merchants, products, categories,
            and price range; truncated lists carry an overflow note.
        """
        lines: list[str] = []

        if self.merchants:
            preview = ", ".join(entry.get("name", str(entry)) for entry in self.merchants[:max_items])
            lines.append(f"Available merchants: {preview}")
            if len(self.merchants) > max_items:
                lines.append(f" ... and {len(self.merchants) - max_items} more")

        if self.sample_products:
            preview = ", ".join(entry.get("name", str(entry)) for entry in self.sample_products[:max_items])
            lines.append(f"Sample products: {preview}")
            if len(self.sample_products) > max_items:
                lines.append(f" ... and {len(self.sample_products) - max_items} more")

        if self.categories:
            preview = ", ".join(self.categories[:MAX_CATEGORIES_IN_PROMPT])
            lines.append(f"Categories: {preview}")
            if len(self.categories) > MAX_CATEGORIES_IN_PROMPT:
                lines.append(f" ... and {len(self.categories) - MAX_CATEGORIES_IN_PROMPT} more")

        if self.max_price is not None:
            # A missing minimum is rendered as 0; "x or 0" also maps 0.0 to 0,
            # which produces the same rendered bound.
            lines.append(f"Price range: {self.min_price or 0} - {self.max_price}")

        return "\n".join(lines)
|