ragbits-evaluate 0.0.8.dev23005__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59):
  1. ragbits/evaluate/__init__.py +0 -0
  2. ragbits/evaluate/agent_simulation/__init__.py +122 -0
  3. ragbits/evaluate/agent_simulation/context.py +140 -0
  4. ragbits/evaluate/agent_simulation/conversation.py +515 -0
  5. ragbits/evaluate/agent_simulation/deepeval_evaluator.py +92 -0
  6. ragbits/evaluate/agent_simulation/logger.py +165 -0
  7. ragbits/evaluate/agent_simulation/metrics/__init__.py +19 -0
  8. ragbits/evaluate/agent_simulation/metrics/builtin.py +221 -0
  9. ragbits/evaluate/agent_simulation/metrics/collectors.py +142 -0
  10. ragbits/evaluate/agent_simulation/models.py +37 -0
  11. ragbits/evaluate/agent_simulation/results.py +200 -0
  12. ragbits/evaluate/agent_simulation/scenarios.py +129 -0
  13. ragbits/evaluate/agent_simulation/simulation.py +245 -0
  14. ragbits/evaluate/cli.py +150 -0
  15. ragbits/evaluate/config.py +11 -0
  16. ragbits/evaluate/dataloaders/__init__.py +3 -0
  17. ragbits/evaluate/dataloaders/base.py +95 -0
  18. ragbits/evaluate/dataloaders/document_search.py +61 -0
  19. ragbits/evaluate/dataloaders/exceptions.py +25 -0
  20. ragbits/evaluate/dataloaders/gaia.py +78 -0
  21. ragbits/evaluate/dataloaders/hotpot_qa.py +95 -0
  22. ragbits/evaluate/dataloaders/human_eval.py +70 -0
  23. ragbits/evaluate/dataloaders/question_answer.py +56 -0
  24. ragbits/evaluate/dataset_generator/__init__.py +0 -0
  25. ragbits/evaluate/dataset_generator/pipeline.py +141 -0
  26. ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  27. ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +21 -0
  28. ragbits/evaluate/dataset_generator/prompts/qa.py +83 -0
  29. ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  30. ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +67 -0
  31. ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  32. ragbits/evaluate/dataset_generator/tasks/filter/base.py +43 -0
  33. ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +34 -0
  34. ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  35. ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +66 -0
  36. ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +96 -0
  37. ragbits/evaluate/dataset_generator/utils.py +43 -0
  38. ragbits/evaluate/evaluator.py +244 -0
  39. ragbits/evaluate/factories/__init__.py +42 -0
  40. ragbits/evaluate/metrics/__init__.py +3 -0
  41. ragbits/evaluate/metrics/base.py +89 -0
  42. ragbits/evaluate/metrics/document_search.py +90 -0
  43. ragbits/evaluate/metrics/gaia.py +84 -0
  44. ragbits/evaluate/metrics/hotpot_qa.py +51 -0
  45. ragbits/evaluate/metrics/human_eval.py +105 -0
  46. ragbits/evaluate/metrics/question_answer.py +205 -0
  47. ragbits/evaluate/optimizer.py +210 -0
  48. ragbits/evaluate/pipelines/__init__.py +37 -0
  49. ragbits/evaluate/pipelines/base.py +64 -0
  50. ragbits/evaluate/pipelines/document_search.py +106 -0
  51. ragbits/evaluate/pipelines/gaia.py +249 -0
  52. ragbits/evaluate/pipelines/hotpot_qa.py +342 -0
  53. ragbits/evaluate/pipelines/human_eval.py +323 -0
  54. ragbits/evaluate/pipelines/question_answer.py +96 -0
  55. ragbits/evaluate/py.typed +0 -0
  56. ragbits/evaluate/utils.py +160 -0
  57. ragbits_evaluate-0.0.8.dev23005.dist-info/METADATA +58 -0
  58. ragbits_evaluate-0.0.8.dev23005.dist-info/RECORD +59 -0
  59. ragbits_evaluate-0.0.8.dev23005.dist-info/WHEEL +4 -0
File without changes
@@ -0,0 +1,122 @@
1
+ """Agent simulation utilities for evaluation scenarios.
2
+
3
+ This module uses lazy imports for components that require optional dependencies
4
+ (ragbits-agents, ragbits-chat) to allow importing result models independently.
5
+ """
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ # Import context, metrics, and result models eagerly - they have no external dependencies
10
+ # Adapters are re-exported from ragbits.chat.adapters for convenience
11
+ from ragbits.chat.adapters import (
12
+ AdapterContext,
13
+ AdapterPipeline,
14
+ BaseAdapter,
15
+ ChatResponseAdapter,
16
+ FilterAdapter,
17
+ ResponseAdapter,
18
+ TextAccumulatorAdapter,
19
+ ToolCallAccumulatorAdapter,
20
+ ToolResultTextAdapter,
21
+ UsageAggregatorAdapter,
22
+ )
23
+ from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
24
+ from ragbits.evaluate.agent_simulation.metrics import (
25
+ CompositeMetricCollector,
26
+ LatencyMetricCollector,
27
+ MetricCollector,
28
+ TokenUsageMetricCollector,
29
+ ToolUsageMetricCollector,
30
+ )
31
+ from ragbits.evaluate.agent_simulation.results import (
32
+ ConversationMetrics,
33
+ SimulationResult,
34
+ SimulationStatus,
35
+ TaskResult,
36
+ TurnResult,
37
+ )
38
+
39
+ if TYPE_CHECKING:
40
+ from ragbits.evaluate.agent_simulation.conversation import (
41
+ run_scenario_matrix,
42
+ run_simulation,
43
+ run_simulations_concurrent,
44
+ )
45
+ from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
46
+ from ragbits.evaluate.agent_simulation.logger import ConversationLogger
47
+ from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task, Turn
48
+ from ragbits.evaluate.agent_simulation.scenarios import load_personalities, load_scenarios
49
+ from ragbits.evaluate.agent_simulation.simulation import GoalChecker, SimulatedUser
50
+
51
+ __all__ = [
52
+ # Adapters
53
+ "AdapterContext",
54
+ "AdapterPipeline",
55
+ "BaseAdapter",
56
+ "ChatResponseAdapter",
57
+ "FilterAdapter",
58
+ "ResponseAdapter",
59
+ "TextAccumulatorAdapter",
60
+ "ToolCallAccumulatorAdapter",
61
+ "ToolResultTextAdapter",
62
+ "UsageAggregatorAdapter",
63
+ # Metrics
64
+ "CompositeMetricCollector",
65
+ "LatencyMetricCollector",
66
+ "MetricCollector",
67
+ "TokenUsageMetricCollector",
68
+ "ToolUsageMetricCollector",
69
+ # Context
70
+ "DataSnapshot",
71
+ "DomainContext",
72
+ # Results
73
+ "ConversationMetrics",
74
+ "SimulationResult",
75
+ "SimulationStatus",
76
+ "TaskResult",
77
+ "TurnResult",
78
+ # Components (lazy loaded)
79
+ "ConversationLogger",
80
+ "DeepEvalEvaluator",
81
+ "GoalChecker",
82
+ "Personality",
83
+ "Scenario",
84
+ "SimulatedUser",
85
+ "Task",
86
+ "Turn",
87
+ # Functions (lazy loaded)
88
+ "load_personalities",
89
+ "load_scenarios",
90
+ "run_scenario_matrix",
91
+ "run_simulation",
92
+ "run_simulations_concurrent",
93
+ ]
94
+
95
+
96
+ def __getattr__(name: str) -> object:
97
+ """Lazy import for components with optional dependencies."""
98
+ if name in ("run_simulation", "run_simulations_concurrent", "run_scenario_matrix"):
99
+ from ragbits.evaluate.agent_simulation import conversation
100
+
101
+ return getattr(conversation, name)
102
+ if name == "DeepEvalEvaluator":
103
+ from ragbits.evaluate.agent_simulation.deepeval_evaluator import DeepEvalEvaluator
104
+
105
+ return DeepEvalEvaluator
106
+ if name == "ConversationLogger":
107
+ from ragbits.evaluate.agent_simulation.logger import ConversationLogger
108
+
109
+ return ConversationLogger
110
+ if name in ("Personality", "Scenario", "Task", "Turn"):
111
+ from ragbits.evaluate.agent_simulation import models
112
+
113
+ return getattr(models, name)
114
+ if name in ("load_personalities", "load_scenarios"):
115
+ from ragbits.evaluate.agent_simulation import scenarios
116
+
117
+ return getattr(scenarios, name)
118
+ if name in ("GoalChecker", "SimulatedUser"):
119
+ from ragbits.evaluate.agent_simulation import simulation
120
+
121
+ return getattr(simulation, name)
122
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,140 @@
1
+ """Context models for agent simulation scenarios."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+ MAX_MERCHANTS_IN_PROMPT = 5
7
+ MAX_PRODUCTS_IN_PROMPT = 10
8
+ MAX_CATEGORIES_IN_PROMPT = 10
9
+ DEFAULT_MAX_ITEMS_IN_PROMPT = 15
10
+
11
+
12
+ @dataclass
13
+ class DomainContext:
14
+ """Domain-specific context for goal checking and simulation.
15
+
16
+ Provides additional context to the GoalChecker to avoid false negatives
17
+ from currency confusion, locale differences, or missing domain knowledge.
18
+
19
+ Example:
20
+ >>> context = DomainContext(
21
+ ... domain_type="retail", currency="SAR", locale="ar_SA", business_rules={"prices_include_vat": True}
22
+ ... )
23
+ >>> result = await goal_checker.is_task_achieved(task, history, context=context)
24
+ """
25
+
26
+ domain_type: str
27
+ """Type of domain: "food", "retail", "travel", "groceries", etc."""
28
+
29
+ currency: str = "USD"
30
+ """Currency code for price interpretation (e.g., "USD", "SAR", "EUR")."""
31
+
32
+ locale: str = "en_US"
33
+ """Locale for language and formatting (e.g., "en_US", "ar_SA")."""
34
+
35
+ available_merchants: list[dict[str, Any]] = field(default_factory=list)
36
+ """List of available merchants/vendors for validation."""
37
+
38
+ available_products: list[dict[str, Any]] = field(default_factory=list)
39
+ """List of available products for validation."""
40
+
41
+ business_rules: dict[str, Any] = field(default_factory=dict)
42
+ """Domain-specific business rules (e.g., {"min_order": 50, "delivery_fee": 10})."""
43
+
44
+ def format_for_prompt(self) -> str:
45
+ """Format context for inclusion in LLM prompts.
46
+
47
+ Returns:
48
+ Formatted string suitable for prompt injection.
49
+ """
50
+ parts = [
51
+ f"Domain: {self.domain_type}",
52
+ f"Currency: {self.currency}",
53
+ f"Locale: {self.locale}",
54
+ ]
55
+
56
+ if self.business_rules:
57
+ rules_str = ", ".join(f"{k}={v}" for k, v in self.business_rules.items())
58
+ parts.append(f"Business Rules: {rules_str}")
59
+
60
+ if self.available_merchants:
61
+ merchant_names = [m.get("name", str(m)) for m in self.available_merchants[:MAX_MERCHANTS_IN_PROMPT]]
62
+ parts.append(f"Available Merchants: {', '.join(merchant_names)}")
63
+ if len(self.available_merchants) > MAX_MERCHANTS_IN_PROMPT:
64
+ parts.append(f" ... and {len(self.available_merchants) - MAX_MERCHANTS_IN_PROMPT} more")
65
+
66
+ if self.available_products:
67
+ product_names = [p.get("name", str(p)) for p in self.available_products[:MAX_PRODUCTS_IN_PROMPT]]
68
+ parts.append(f"Sample Products: {', '.join(product_names)}")
69
+ if len(self.available_products) > MAX_PRODUCTS_IN_PROMPT:
70
+ parts.append(f" ... and {len(self.available_products) - MAX_PRODUCTS_IN_PROMPT} more")
71
+
72
+ return "\n".join(parts)
73
+
74
+
75
+ @dataclass
76
+ class DataSnapshot:
77
+ """Sample of available data to ground simulated user requests.
78
+
79
+ Provides the simulated user with knowledge of what data actually exists,
80
+ preventing unrealistic requests (e.g., asking for "sushi" when only burgers are available).
81
+
82
+ Example:
83
+ >>> snapshot = DataSnapshot(
84
+ ... merchants=[{"name": "Burger House"}],
85
+ ... sample_products=[{"name": "Classic Burger"}, {"name": "Fries"}],
86
+ ... categories=["burgers", "sides", "drinks"],
87
+ ... )
88
+ >>> # SimulatedUser will only request items from this data
89
+ """
90
+
91
+ merchants: list[dict[str, Any]] = field(default_factory=list)
92
+ """List of available merchants/restaurants/stores."""
93
+
94
+ sample_products: list[dict[str, Any]] = field(default_factory=list)
95
+ """Sample products available for ordering/browsing."""
96
+
97
+ categories: list[str] = field(default_factory=list)
98
+ """Available product/service categories."""
99
+
100
+ max_price: float | None = None
101
+ """Maximum price in the dataset (for realistic price constraints)."""
102
+
103
+ min_price: float | None = None
104
+ """Minimum price in the dataset."""
105
+
106
+ def format_for_prompt(self, max_items: int = DEFAULT_MAX_ITEMS_IN_PROMPT) -> str:
107
+ """Format data snapshot for inclusion in LLM prompts.
108
+
109
+ Args:
110
+ max_items: Maximum number of items to include per category.
111
+
112
+ Returns:
113
+ Formatted string suitable for prompt injection.
114
+ """
115
+ parts = []
116
+
117
+ if self.merchants:
118
+ merchants_str = ", ".join(m.get("name", str(m)) for m in self.merchants[:max_items])
119
+ parts.append(f"Available merchants: {merchants_str}")
120
+ if len(self.merchants) > max_items:
121
+ parts.append(f" ... and {len(self.merchants) - max_items} more")
122
+
123
+ if self.sample_products:
124
+ products = self.sample_products[:max_items]
125
+ products_str = ", ".join(p.get("name", str(p)) for p in products)
126
+ parts.append(f"Sample products: {products_str}")
127
+ if len(self.sample_products) > max_items:
128
+ parts.append(f" ... and {len(self.sample_products) - max_items} more")
129
+
130
+ if self.categories:
131
+ categories_str = ", ".join(self.categories[:MAX_CATEGORIES_IN_PROMPT])
132
+ parts.append(f"Categories: {categories_str}")
133
+ if len(self.categories) > MAX_CATEGORIES_IN_PROMPT:
134
+ parts.append(f" ... and {len(self.categories) - MAX_CATEGORIES_IN_PROMPT} more")
135
+
136
+ if self.max_price is not None:
137
+ min_price = self.min_price or 0
138
+ parts.append(f"Price range: {min_price} - {self.max_price}")
139
+
140
+ return "\n".join(parts)