ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
|
@@ -2,47 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
5
7
|
from typing import TYPE_CHECKING
|
|
6
8
|
|
|
7
|
-
from
|
|
8
|
-
|
|
9
|
+
from ragbits.agents.tool import ToolCallResult
|
|
9
10
|
from ragbits.core.llms import LiteLLM
|
|
10
|
-
from ragbits.core.prompt import Prompt
|
|
11
11
|
from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task, Turn
|
|
12
12
|
|
|
13
13
|
if TYPE_CHECKING:
|
|
14
|
-
from ragbits.evaluate.agent_simulation.context import DataSnapshot
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class SimulatedUserPromptInput(BaseModel):
|
|
18
|
-
"""Input for the simulated user prompt."""
|
|
19
|
-
|
|
20
|
-
scenario_name: str
|
|
21
|
-
task_context: str
|
|
22
|
-
personality_instruction: str
|
|
23
|
-
grounding_block: str
|
|
24
|
-
history_block: str
|
|
25
|
-
current_task: str
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class SimulatedUserPrompt(Prompt[SimulatedUserPromptInput, str]):
|
|
29
|
-
"""Prompt for generating simulated user messages."""
|
|
30
|
-
|
|
31
|
-
system_prompt = """
|
|
32
|
-
You are simulating a concise human user in a terminal chat.
|
|
33
|
-
Scenario: {{ scenario_name }}
|
|
34
|
-
{{ task_context }}{{ personality_instruction }}{{ grounding_block }}
|
|
35
|
-
Given the assistant's last reply and the conversation so far,
|
|
36
|
-
write ONLY the next user message to work on the current task. Be specific and brief.
|
|
37
|
-
"""
|
|
38
|
-
|
|
39
|
-
user_prompt = """
|
|
40
|
-
[CONVERSATION]
|
|
41
|
-
{{ history_block }}
|
|
42
|
-
|
|
43
|
-
[TASK]
|
|
44
|
-
Write the next USER message now (follow task: {{ current_task }}):
|
|
45
|
-
"""
|
|
14
|
+
from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
|
|
46
15
|
|
|
47
16
|
|
|
48
17
|
class SimulatedUser:
|
|
@@ -95,52 +64,177 @@ class SimulatedUser:
|
|
|
95
64
|
if current_task is None:
|
|
96
65
|
return "Thank you, all tasks are completed."
|
|
97
66
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
personality_instruction=self._build_personality_instruction(),
|
|
103
|
-
grounding_block=self._build_grounding_block(),
|
|
104
|
-
history_block=self._build_history_block(history),
|
|
105
|
-
current_task=current_task.task,
|
|
106
|
-
)
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
response = await self.llm.generate(prompt)
|
|
110
|
-
return response.strip()
|
|
67
|
+
history_text = []
|
|
68
|
+
for t in history:
|
|
69
|
+
history_text.append(f"User: {t.user}\nAssistant: {t.assistant}")
|
|
70
|
+
history_block = "\n\n".join(history_text) if history_text else "(no prior messages)"
|
|
111
71
|
|
|
112
|
-
def _build_task_context(self, current_task: Task) -> str:
|
|
113
|
-
"""Build the task context string."""
|
|
114
72
|
task_context = f"Current task: {current_task.task}"
|
|
115
73
|
if self.current_task_idx > 0:
|
|
116
74
|
completed_tasks = ", ".join([t.task for t in self.scenario.tasks[: self.current_task_idx]])
|
|
117
75
|
task_context += f"\nCompleted tasks: {completed_tasks}"
|
|
118
|
-
return task_context
|
|
119
76
|
|
|
120
|
-
|
|
121
|
-
"""Build the personality instruction string."""
|
|
77
|
+
personality_instruction = ""
|
|
122
78
|
if self.personality:
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
79
|
+
personality_instruction = f"\n\nPersonality: {self.personality.description}"
|
|
80
|
+
|
|
81
|
+
# Build data grounding block if snapshot is provided
|
|
82
|
+
grounding_block = ""
|
|
83
|
+
if self.data_snapshot:
|
|
84
|
+
grounding_block = (
|
|
85
|
+
"\n\n[AVAILABLE DATA]\n"
|
|
86
|
+
f"{self.data_snapshot.format_for_prompt()}\n\n"
|
|
87
|
+
"IMPORTANT: Only reference items that exist in the AVAILABLE DATA above. "
|
|
88
|
+
"Do not ask for entities that are not listed."
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
prompt = (
|
|
92
|
+
"[SYSTEM]\n"
|
|
93
|
+
"You are simulating a concise human user in a terminal chat. "
|
|
94
|
+
f"Scenario: {self.scenario.name}\n"
|
|
95
|
+
f"{task_context}{personality_instruction}{grounding_block}\n"
|
|
96
|
+
"Given the assistant's last reply and the conversation so far, "
|
|
97
|
+
"write ONLY the next user message to work on the current task. Be specific and brief.\n\n"
|
|
98
|
+
"[CONVERSATION]\n"
|
|
99
|
+
f"{history_block}\n\n"
|
|
100
|
+
"[TASK]\nWrite the next USER message now:"
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
response = await self.llm.generate(prompt=prompt)
|
|
104
|
+
return response.strip()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class GoalChecker:
|
|
108
|
+
"""A lightweight judge model that decides whether the current task has been achieved.
|
|
109
|
+
|
|
110
|
+
It inspects the conversation so far and checks if the task matches the expected result.
|
|
111
|
+
Supports optional domain context for accurate evaluation in specific domains.
|
|
112
|
+
"""
|
|
113
|
+
|
|
114
|
+
def __init__(self, llm: LiteLLM, scenario: Scenario) -> None:
|
|
115
|
+
self.llm = llm
|
|
116
|
+
self.scenario = scenario
|
|
117
|
+
|
|
118
|
+
async def is_task_achieved(
|
|
119
|
+
self,
|
|
120
|
+
current_task: Task,
|
|
121
|
+
history: list[Turn],
|
|
122
|
+
context: DomainContext | None = None,
|
|
123
|
+
) -> tuple[bool, str]:
|
|
124
|
+
"""Check if the current task has been completed based on the conversation history.
|
|
125
|
+
|
|
126
|
+
Args:
|
|
127
|
+
current_task: The task to check completion for.
|
|
128
|
+
history: List of conversation turns so far.
|
|
129
|
+
context: Optional domain context for accurate evaluation (e.g., currency, locale).
|
|
130
|
+
|
|
131
|
+
Returns:
|
|
132
|
+
Tuple of (is_completed, reason).
|
|
135
133
|
"""
|
|
134
|
+
history_text = []
|
|
135
|
+
for t in history:
|
|
136
|
+
history_text.append(f"User: {t.user}\nAssistant: {t.assistant}")
|
|
137
|
+
history_block = "\n\n".join(history_text) if history_text else "(no prior messages)"
|
|
138
|
+
|
|
139
|
+
# Build context block if provided
|
|
140
|
+
context_block = ""
|
|
141
|
+
if context:
|
|
142
|
+
context_block = (
|
|
143
|
+
"\n[IMPORTANT CONTEXT]\n"
|
|
144
|
+
f"{context.format_for_prompt()}\n\n"
|
|
145
|
+
"When evaluating task completion, consider the domain context above "
|
|
146
|
+
f"and use {context.locale} locale conventions.\n\n"
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
prompt = (
|
|
150
|
+
"[SYSTEM]\n"
|
|
151
|
+
"You are a strict task-completion judge for a user-assistant conversation. "
|
|
152
|
+
"Decide if the assistant has fulfilled the current task.\n"
|
|
153
|
+
f"Current task: {current_task.task}\n"
|
|
154
|
+
f"Expected result: {current_task.expected_result}\n"
|
|
155
|
+
f"{context_block}"
|
|
156
|
+
"Respond with a concise JSON object ONLY, no extra text, with fields:\n"
|
|
157
|
+
'{"done": true|false, "reason": "short reason"}\n\n'
|
|
158
|
+
"[CONVERSATION]\n"
|
|
159
|
+
f"{history_block}\n\n"
|
|
160
|
+
"[TASK]\nReturn the JSON now:"
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
response = await self.llm.generate(prompt=prompt)
|
|
164
|
+
text = response.strip()
|
|
165
|
+
# Be robust to slight deviations by attempting a minimal parse
|
|
166
|
+
done = False
|
|
167
|
+
reason = ""
|
|
168
|
+
|
|
169
|
+
if not text:
|
|
170
|
+
return False, "Empty response from goal checker"
|
|
171
|
+
|
|
172
|
+
# Try to extract JSON from markdown code blocks if present
|
|
173
|
+
code_block_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
174
|
+
if code_block_match:
|
|
175
|
+
text = code_block_match.group(1)
|
|
176
|
+
|
|
177
|
+
# Try to find JSON object in the text
|
|
178
|
+
json_match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL)
|
|
179
|
+
if json_match:
|
|
180
|
+
text = json_match.group(0)
|
|
181
|
+
|
|
182
|
+
try:
|
|
183
|
+
data = json.loads(text)
|
|
184
|
+
done = bool(data.get("done", False))
|
|
185
|
+
reason = str(data.get("reason", "")).strip()
|
|
186
|
+
except json.JSONDecodeError:
|
|
187
|
+
# If JSON parsing fails, try to infer from response text
|
|
188
|
+
reason = f"Failed to parse JSON response: {text[:100]}"
|
|
189
|
+
# Heuristic: if response contains "done" or "completed" or "true", assume done
|
|
190
|
+
text_lower = text.lower()
|
|
191
|
+
if any(word in text_lower for word in ["done", "completed", "true", "yes", "success"]):
|
|
192
|
+
done = True
|
|
193
|
+
elif any(word in text_lower for word in ["not done", "incomplete", "false", "no", "failed"]):
|
|
194
|
+
done = False
|
|
195
|
+
|
|
196
|
+
return done, reason
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class ToolUsageChecker:
|
|
200
|
+
"""A simple comparator that verifies whether the agent used the expected tools.
|
|
201
|
+
|
|
202
|
+
It checks if all expected tools from the task were called during the conversation turn.
|
|
203
|
+
"""
|
|
204
|
+
|
|
205
|
+
def __init__(self, scenario: Scenario) -> None:
|
|
206
|
+
self.scenario = scenario
|
|
207
|
+
|
|
208
|
+
def check_tool_usage(self, current_task: Task, tool_calls: list[ToolCallResult]) -> tuple[bool, str]: # noqa: PLR6301
|
|
209
|
+
"""Check if the expected tools were used for the current task.
|
|
210
|
+
|
|
211
|
+
Args:
|
|
212
|
+
current_task: The current task being evaluated
|
|
213
|
+
tool_calls: List of tool calls made during this turn
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
Tuple of (success: bool, reason: str)
|
|
217
|
+
"""
|
|
218
|
+
if not current_task.expected_tools:
|
|
219
|
+
return True, "No expected tools specified"
|
|
220
|
+
|
|
221
|
+
if not tool_calls:
|
|
222
|
+
return False, "No tools were called, but tools were expected"
|
|
223
|
+
|
|
224
|
+
# Get the names of tools that were actually called
|
|
225
|
+
called_tool_names = [tc.name for tc in tool_calls]
|
|
226
|
+
expected_tool_names = current_task.expected_tools
|
|
227
|
+
|
|
228
|
+
# Check if all expected tools were used
|
|
229
|
+
missing_tools = [tool for tool in expected_tool_names if tool not in called_tool_names]
|
|
230
|
+
|
|
231
|
+
if missing_tools:
|
|
232
|
+
return (
|
|
233
|
+
False,
|
|
234
|
+
f"Expected tools not used: {', '.join(missing_tools)}. Tools called: {', '.join(called_tool_names)}",
|
|
235
|
+
)
|
|
136
236
|
|
|
137
|
-
|
|
138
|
-
def _build_history_block(history: list[Turn]) -> str:
|
|
139
|
-
"""Build the conversation history block string."""
|
|
140
|
-
if not history:
|
|
141
|
-
return "(no prior messages)"
|
|
142
|
-
history_text = [f"User: {t.user}\nAssistant: {t.assistant}" for t in history]
|
|
143
|
-
return "\n\n".join(history_text)
|
|
237
|
+
return True, f"All expected tools used: {', '.join(called_tool_names)}"
|
|
144
238
|
|
|
145
239
|
|
|
146
240
|
def build_llm(model_name: str | None, default_model: str, api_key: str) -> LiteLLM:
|
|
@@ -4,14 +4,6 @@ from asyncio import AbstractEventLoop
|
|
|
4
4
|
from itertools import chain
|
|
5
5
|
from typing import Generic, TypeVar
|
|
6
6
|
|
|
7
|
-
from continuous_eval.llm_factory import LLMInterface
|
|
8
|
-
from continuous_eval.metrics.base import LLMBasedMetric
|
|
9
|
-
from continuous_eval.metrics.generation.text import (
|
|
10
|
-
LLMBasedAnswerCorrectness,
|
|
11
|
-
LLMBasedAnswerRelevance,
|
|
12
|
-
LLMBasedFaithfulness,
|
|
13
|
-
LLMBasedStyleConsistency,
|
|
14
|
-
)
|
|
15
7
|
from typing_extensions import Self
|
|
16
8
|
|
|
17
9
|
from ragbits.agents.types import QuestionAnswerPromptOutputT
|
|
@@ -20,6 +12,31 @@ from ragbits.core.utils.helpers import batched
|
|
|
20
12
|
from ragbits.evaluate.metrics.base import Metric
|
|
21
13
|
from ragbits.evaluate.pipelines.question_answer import QuestionAnswerResult
|
|
22
14
|
|
|
15
|
+
try:
|
|
16
|
+
from continuous_eval.llm_factory import LLMInterface
|
|
17
|
+
from continuous_eval.metrics.base import LLMBasedMetric
|
|
18
|
+
from continuous_eval.metrics.generation.text import (
|
|
19
|
+
LLMBasedAnswerCorrectness,
|
|
20
|
+
LLMBasedAnswerRelevance,
|
|
21
|
+
LLMBasedFaithfulness,
|
|
22
|
+
LLMBasedStyleConsistency,
|
|
23
|
+
)
|
|
24
|
+
except ModuleNotFoundError:
|
|
25
|
+
from continuous_eval.llms.base import LLMInterface
|
|
26
|
+
from continuous_eval.metrics import Metric as LLMBasedMetric
|
|
27
|
+
from continuous_eval.metrics.generation.text import (
|
|
28
|
+
AnswerCorrectness as LLMBasedAnswerCorrectness,
|
|
29
|
+
)
|
|
30
|
+
from continuous_eval.metrics.generation.text import (
|
|
31
|
+
AnswerRelevance as LLMBasedAnswerRelevance,
|
|
32
|
+
)
|
|
33
|
+
from continuous_eval.metrics.generation.text import (
|
|
34
|
+
Faithfulness as LLMBasedFaithfulness,
|
|
35
|
+
)
|
|
36
|
+
from continuous_eval.metrics.generation.text import (
|
|
37
|
+
StyleConsistency as LLMBasedStyleConsistency,
|
|
38
|
+
)
|
|
39
|
+
|
|
23
40
|
MetricT = TypeVar("MetricT", bound=LLMBasedMetric)
|
|
24
41
|
|
|
25
42
|
|
{ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragbits-evaluate
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.4.0.dev202602030301
|
|
4
4
|
Summary: Evaluation module for Ragbits components
|
|
5
5
|
Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
|
|
6
6
|
Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
|
|
@@ -28,13 +28,9 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
|
|
|
28
28
|
Requires-Dist: hydra-core<2.0.0,>=1.3.2
|
|
29
29
|
Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
|
|
30
30
|
Requires-Dist: optuna<5.0.0,>=4.0.0
|
|
31
|
-
Requires-Dist: ragbits-core==
|
|
32
|
-
Provides-Extra: postgres
|
|
33
|
-
Requires-Dist: ragbits-core[postgres]==1.4.0.dev202512110238; extra == 'postgres'
|
|
31
|
+
Requires-Dist: ragbits-core==1.4.0.dev202602030301
|
|
34
32
|
Provides-Extra: relari
|
|
35
33
|
Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
|
|
36
|
-
Provides-Extra: sqlite
|
|
37
|
-
Requires-Dist: ragbits-core[sqlite]==1.4.0.dev202512110238; extra == 'sqlite'
|
|
38
34
|
Description-Content-Type: text/markdown
|
|
39
35
|
|
|
40
36
|
# Ragbits Evaluate
|
{ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD
RENAMED
|
@@ -1,29 +1,22 @@
|
|
|
1
1
|
ragbits/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
ragbits/evaluate/api.py,sha256=OtM60L4WNAuPMvN1pE3vmWwq7ORD1YuYWI3ETsnpGk0,26223
|
|
3
|
-
ragbits/evaluate/api_types.py,sha256=yDGkHf6ACeytK5NETYKKgaAW0aRaMmRoSib70ZvHvxM,8678
|
|
4
2
|
ragbits/evaluate/cli.py,sha256=vP8l2DyNXpR6jQP83wXKP_RRnGjEXjKnTVBg9RPbDKo,4505
|
|
5
3
|
ragbits/evaluate/config.py,sha256=2WSmbVxyQi893L2FSjRFQoXkWZp1GetcNmR2GCDe0tA,339
|
|
6
4
|
ragbits/evaluate/evaluator.py,sha256=-VcO61r340lt6KWTjkl8DdHmU78WygBP7wfYLT2hu9k,8319
|
|
7
|
-
ragbits/evaluate/execution_manager.py,sha256=elZ4iUhpvFazdLJPNyK5hWuPJmUFfFQNTwDHoy8uyus,15561
|
|
8
5
|
ragbits/evaluate/optimizer.py,sha256=RqYgoiCIEhjXO0HEN6uwldblHyoPuT3qUdncuHPZgCg,8485
|
|
9
6
|
ragbits/evaluate/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
7
|
ragbits/evaluate/utils.py,sha256=w-hbvKRHI9tEva9wKDTVla0Wm2eCHT2MxVkof27Sqfw,4831
|
|
11
|
-
ragbits/evaluate/agent_simulation/__init__.py,sha256=
|
|
12
|
-
ragbits/evaluate/agent_simulation/checkers.py,sha256=oo-KzuPhq3Ur9B2ZIgs5MLEMpmgG0vdBD0SIba-I_-o,21452
|
|
8
|
+
ragbits/evaluate/agent_simulation/__init__.py,sha256=Cf867LFsLLnDRwDO8rGFQFxvrLemR8qkSqkk5zYyxT0,2971
|
|
13
9
|
ragbits/evaluate/agent_simulation/context.py,sha256=_gZcRpKtHOXoyR2LsMYWH2w9Wm7h2bozuSlNKsZC7ng,4196
|
|
14
|
-
ragbits/evaluate/agent_simulation/conversation.py,sha256=
|
|
10
|
+
ragbits/evaluate/agent_simulation/conversation.py,sha256=nQr3EHlaBu066rFdWXacyPNXtte6PEDoJnVXjymEifQ,14087
|
|
15
11
|
ragbits/evaluate/agent_simulation/deepeval_evaluator.py,sha256=WDtrApfLS1W5-mB86Yg2xlFgPYVzLtvJV8wCiJs5y2w,3453
|
|
16
|
-
ragbits/evaluate/agent_simulation/
|
|
17
|
-
ragbits/evaluate/agent_simulation/
|
|
18
|
-
ragbits/evaluate/agent_simulation/
|
|
19
|
-
ragbits/evaluate/agent_simulation/
|
|
20
|
-
ragbits/evaluate/agent_simulation/
|
|
21
|
-
ragbits/evaluate/agent_simulation/
|
|
22
|
-
ragbits/evaluate/agent_simulation/
|
|
23
|
-
ragbits/evaluate/agent_simulation/metrics/
|
|
24
|
-
ragbits/evaluate/agent_simulation/metrics/builtin.py,sha256=U3kFLeglQ64KPDOdUBx0VVhGOXoyCIwahxg-305GeXU,8086
|
|
25
|
-
ragbits/evaluate/agent_simulation/metrics/collectors.py,sha256=eZSVV9KKxjbDzFyaIiUw1Vo-40W3y5tB3K_4-RFZ7Ww,5782
|
|
26
|
-
ragbits/evaluate/agent_simulation/metrics/deepeval.py,sha256=iEoWF5y3_intueG4uJsC_AEGrCrb-V03vaAZGIiS00A,10956
|
|
12
|
+
ragbits/evaluate/agent_simulation/logger.py,sha256=47lxIRM-DMvP3Cw4XGUVHSNxqjGOk9ozSgxCY6DOIIQ,6948
|
|
13
|
+
ragbits/evaluate/agent_simulation/models.py,sha256=tOjEzK2Iv1_Nu-jT-soORgd15oqJ2zb8g3EXN7ZKn0c,719
|
|
14
|
+
ragbits/evaluate/agent_simulation/results.py,sha256=EJcpJq-86NYPcY8KMNsmxH9aLXTWGoXZY5aig1fbdP4,6925
|
|
15
|
+
ragbits/evaluate/agent_simulation/scenarios.py,sha256=FU0Z9eZUA7yMAZyAUJ3e5SxZR6XwWdZrbEsLz_GHbhI,4364
|
|
16
|
+
ragbits/evaluate/agent_simulation/simulation.py,sha256=GiKutdl-RRIcL_p9sVxvGYCHGCDS1Rz3oQcwLt9rdi0,9633
|
|
17
|
+
ragbits/evaluate/agent_simulation/metrics/__init__.py,sha256=3Gtix1JRAGyLe6JHQTGNr2gIrvEsErY7ys2UkPwdBVU,499
|
|
18
|
+
ragbits/evaluate/agent_simulation/metrics/builtin.py,sha256=ZSK_cBpk1J_bnDes8mbjqw6cW4oml5BAGM7_EOgx6CE,7481
|
|
19
|
+
ragbits/evaluate/agent_simulation/metrics/collectors.py,sha256=fTTYrYimyF0i6KsWagiU12XV0n7AeNaLYqy1n_faWdI,4796
|
|
27
20
|
ragbits/evaluate/dataloaders/__init__.py,sha256=UFJFjmvi3GUQFsx6A5sYD01HH2f7TXcHRW2VNM1pmIA,83
|
|
28
21
|
ragbits/evaluate/dataloaders/base.py,sha256=x8rEl5utNOziF_9urL0grkqoXwMgaDWYSM5akw3Kt9Y,3213
|
|
29
22
|
ragbits/evaluate/dataloaders/document_search.py,sha256=c9Bc4ZtFEKAiG9B70JFiBZlZDkBSGNWFRKabF7PMTU0,2495
|
|
@@ -53,7 +46,7 @@ ragbits/evaluate/metrics/document_search.py,sha256=MfvMwEPenqiJdKYuW6WLvmtMch9ZV
|
|
|
53
46
|
ragbits/evaluate/metrics/gaia.py,sha256=Q1oZPVAxRsQkyctJLE95fsGNewmDhzTJ5vjNoXvu10E,3086
|
|
54
47
|
ragbits/evaluate/metrics/hotpot_qa.py,sha256=Tw4gKDbua60fbE7BbxKV08-yp0PbQKTHlnk87GULNe8,1776
|
|
55
48
|
ragbits/evaluate/metrics/human_eval.py,sha256=ud4G-xaMi0f1tkzYb1V2uSgYDF-ymnKxiN6CdOWGZqU,4285
|
|
56
|
-
ragbits/evaluate/metrics/question_answer.py,sha256=
|
|
49
|
+
ragbits/evaluate/metrics/question_answer.py,sha256=4rbJ9z_eAyk_5mOeQGyaP_rqodS8MZZltwmPmDitb_4,7722
|
|
57
50
|
ragbits/evaluate/pipelines/__init__.py,sha256=PZ2477OqOV622QMC-3iwW5ThC-nYRS9KBe_nlyas3Zs,1573
|
|
58
51
|
ragbits/evaluate/pipelines/base.py,sha256=QV3fjPnbJjeCgcbt8yV1Ho3BamEUc3wSca3MAzaBlV0,1739
|
|
59
52
|
ragbits/evaluate/pipelines/document_search.py,sha256=tgk-I21eshdBbWVsuNa1zWK_fWuDNXhhMCn1_Fdu_Ko,3840
|
|
@@ -61,10 +54,6 @@ ragbits/evaluate/pipelines/gaia.py,sha256=DkVAlNI-a9chQGPyOFtjrXWGoPSWogrSfSxxgx
|
|
|
61
54
|
ragbits/evaluate/pipelines/hotpot_qa.py,sha256=eHDQ7e_Pa1YRWkc-7oxarYHnZKKvEv_Q8gCBBvo_iss,13629
|
|
62
55
|
ragbits/evaluate/pipelines/human_eval.py,sha256=o2q3O3-OcdRBESwRFUKVU2dQt0TIMDXqxFitdKEw_fw,12406
|
|
63
56
|
ragbits/evaluate/pipelines/question_answer.py,sha256=3CYVHDLnOy4z7kgYPMluiJ8POulHo-w3PEiqvqsF4Dc,2797
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
ragbits/evaluate/stores/kv.py,sha256=WsmowNtTtvXf2Jjt5pSqAGbFt64KdFq4UXL9v5JnPpQ,21624
|
|
68
|
-
ragbits_evaluate-0.0.30rc1.dist-info/METADATA,sha256=__OeAgqfwoHZuN3LqDXP4I7DEC-_12enk3UY_Y0lfOM,2553
|
|
69
|
-
ragbits_evaluate-0.0.30rc1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
70
|
-
ragbits_evaluate-0.0.30rc1.dist-info/RECORD,,
|
|
57
|
+
ragbits_evaluate-1.4.0.dev202602030301.dist-info/METADATA,sha256=XQ4uwmwNQTu4DUrfFrAdZLMiyRn_a1YlSkkiEuIcIZw,2368
|
|
58
|
+
ragbits_evaluate-1.4.0.dev202602030301.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
59
|
+
ragbits_evaluate-1.4.0.dev202602030301.dist-info/RECORD,,
|