aiqtoolkit 1.2.0.dev0__py3-none-any.whl → 1.2.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release (flagged by the registry scanner).
- aiq/agent/base.py +170 -8
- aiq/agent/dual_node.py +1 -1
- aiq/agent/react_agent/agent.py +146 -112
- aiq/agent/react_agent/prompt.py +1 -6
- aiq/agent/react_agent/register.py +36 -35
- aiq/agent/rewoo_agent/agent.py +36 -35
- aiq/agent/rewoo_agent/register.py +2 -2
- aiq/agent/tool_calling_agent/agent.py +3 -7
- aiq/agent/tool_calling_agent/register.py +1 -1
- aiq/authentication/__init__.py +14 -0
- aiq/authentication/api_key/__init__.py +14 -0
- aiq/authentication/api_key/api_key_auth_provider.py +92 -0
- aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
- aiq/authentication/api_key/register.py +26 -0
- aiq/authentication/exceptions/__init__.py +14 -0
- aiq/authentication/exceptions/api_key_exceptions.py +38 -0
- aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
- aiq/authentication/exceptions/call_back_exceptions.py +38 -0
- aiq/authentication/exceptions/request_exceptions.py +54 -0
- aiq/authentication/http_basic_auth/__init__.py +0 -0
- aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
- aiq/authentication/http_basic_auth/register.py +30 -0
- aiq/authentication/interfaces.py +93 -0
- aiq/authentication/oauth2/__init__.py +14 -0
- aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
- aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
- aiq/authentication/oauth2/register.py +25 -0
- aiq/authentication/register.py +21 -0
- aiq/builder/builder.py +64 -2
- aiq/builder/component_utils.py +16 -3
- aiq/builder/context.py +37 -0
- aiq/builder/eval_builder.py +43 -2
- aiq/builder/function.py +44 -12
- aiq/builder/function_base.py +1 -1
- aiq/builder/intermediate_step_manager.py +6 -8
- aiq/builder/user_interaction_manager.py +3 -0
- aiq/builder/workflow.py +23 -18
- aiq/builder/workflow_builder.py +421 -61
- aiq/cli/commands/info/list_mcp.py +103 -16
- aiq/cli/commands/sizing/__init__.py +14 -0
- aiq/cli/commands/sizing/calc.py +294 -0
- aiq/cli/commands/sizing/sizing.py +27 -0
- aiq/cli/commands/start.py +2 -1
- aiq/cli/entrypoint.py +2 -0
- aiq/cli/register_workflow.py +80 -0
- aiq/cli/type_registry.py +151 -30
- aiq/data_models/api_server.py +124 -12
- aiq/data_models/authentication.py +231 -0
- aiq/data_models/common.py +35 -7
- aiq/data_models/component.py +17 -9
- aiq/data_models/component_ref.py +33 -0
- aiq/data_models/config.py +60 -3
- aiq/data_models/dataset_handler.py +2 -1
- aiq/data_models/embedder.py +1 -0
- aiq/data_models/evaluate.py +23 -0
- aiq/data_models/function_dependencies.py +8 -0
- aiq/data_models/interactive.py +10 -1
- aiq/data_models/intermediate_step.py +38 -5
- aiq/data_models/its_strategy.py +30 -0
- aiq/data_models/llm.py +1 -0
- aiq/data_models/memory.py +1 -0
- aiq/data_models/object_store.py +44 -0
- aiq/data_models/profiler.py +1 -0
- aiq/data_models/retry_mixin.py +35 -0
- aiq/data_models/span.py +187 -0
- aiq/data_models/telemetry_exporter.py +2 -2
- aiq/embedder/nim_embedder.py +2 -1
- aiq/embedder/openai_embedder.py +2 -1
- aiq/eval/config.py +19 -1
- aiq/eval/dataset_handler/dataset_handler.py +87 -2
- aiq/eval/evaluate.py +208 -27
- aiq/eval/evaluator/base_evaluator.py +73 -0
- aiq/eval/evaluator/evaluator_model.py +1 -0
- aiq/eval/intermediate_step_adapter.py +11 -5
- aiq/eval/rag_evaluator/evaluate.py +55 -15
- aiq/eval/rag_evaluator/register.py +6 -1
- aiq/eval/remote_workflow.py +7 -2
- aiq/eval/runners/__init__.py +14 -0
- aiq/eval/runners/config.py +39 -0
- aiq/eval/runners/multi_eval_runner.py +54 -0
- aiq/eval/trajectory_evaluator/evaluate.py +22 -65
- aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
- aiq/eval/tunable_rag_evaluator/register.py +2 -0
- aiq/eval/usage_stats.py +41 -0
- aiq/eval/utils/output_uploader.py +10 -1
- aiq/eval/utils/weave_eval.py +184 -0
- aiq/experimental/__init__.py +0 -0
- aiq/experimental/decorators/__init__.py +0 -0
- aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
- aiq/experimental/inference_time_scaling/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
- aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
- aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
- aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
- aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
- aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
- aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
- aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
- aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
- aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
- aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
- aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
- aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
- aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
- aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
- aiq/experimental/inference_time_scaling/register.py +36 -0
- aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
- aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
- aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
- aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
- aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
- aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
- aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
- aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
- aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
- aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
- aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
- aiq/front_ends/console/authentication_flow_handler.py +233 -0
- aiq/front_ends/console/console_front_end_plugin.py +11 -2
- aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
- aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
- aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
- aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
- aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
- aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
- aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
- aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
- aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
- aiq/front_ends/fastapi/job_store.py +47 -25
- aiq/front_ends/fastapi/main.py +2 -0
- aiq/front_ends/fastapi/message_handler.py +108 -89
- aiq/front_ends/fastapi/step_adaptor.py +2 -1
- aiq/llm/aws_bedrock_llm.py +57 -0
- aiq/llm/nim_llm.py +2 -1
- aiq/llm/openai_llm.py +3 -2
- aiq/llm/register.py +1 -0
- aiq/meta/pypi.md +12 -12
- aiq/object_store/__init__.py +20 -0
- aiq/object_store/in_memory_object_store.py +74 -0
- aiq/object_store/interfaces.py +84 -0
- aiq/object_store/models.py +36 -0
- aiq/object_store/register.py +20 -0
- aiq/observability/__init__.py +14 -0
- aiq/observability/exporter/__init__.py +14 -0
- aiq/observability/exporter/base_exporter.py +449 -0
- aiq/observability/exporter/exporter.py +78 -0
- aiq/observability/exporter/file_exporter.py +33 -0
- aiq/observability/exporter/processing_exporter.py +269 -0
- aiq/observability/exporter/raw_exporter.py +52 -0
- aiq/observability/exporter/span_exporter.py +264 -0
- aiq/observability/exporter_manager.py +335 -0
- aiq/observability/mixin/__init__.py +14 -0
- aiq/observability/mixin/batch_config_mixin.py +26 -0
- aiq/observability/mixin/collector_config_mixin.py +23 -0
- aiq/observability/mixin/file_mixin.py +288 -0
- aiq/observability/mixin/file_mode.py +23 -0
- aiq/observability/mixin/resource_conflict_mixin.py +134 -0
- aiq/observability/mixin/serialize_mixin.py +61 -0
- aiq/observability/mixin/type_introspection_mixin.py +183 -0
- aiq/observability/processor/__init__.py +14 -0
- aiq/observability/processor/batching_processor.py +316 -0
- aiq/observability/processor/intermediate_step_serializer.py +28 -0
- aiq/observability/processor/processor.py +68 -0
- aiq/observability/register.py +36 -39
- aiq/observability/utils/__init__.py +14 -0
- aiq/observability/utils/dict_utils.py +236 -0
- aiq/observability/utils/time_utils.py +31 -0
- aiq/profiler/calc/__init__.py +14 -0
- aiq/profiler/calc/calc_runner.py +623 -0
- aiq/profiler/calc/calculations.py +288 -0
- aiq/profiler/calc/data_models.py +176 -0
- aiq/profiler/calc/plot.py +345 -0
- aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
- aiq/profiler/data_models.py +24 -0
- aiq/profiler/inference_metrics_model.py +3 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
- aiq/profiler/inference_optimization/data_models.py +2 -2
- aiq/profiler/inference_optimization/llm_metrics.py +2 -2
- aiq/profiler/profile_runner.py +61 -21
- aiq/runtime/loader.py +9 -3
- aiq/runtime/runner.py +23 -9
- aiq/runtime/session.py +25 -7
- aiq/runtime/user_metadata.py +2 -3
- aiq/tool/chat_completion.py +74 -0
- aiq/tool/code_execution/README.md +152 -0
- aiq/tool/code_execution/code_sandbox.py +151 -72
- aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
- aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
- aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
- aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
- aiq/tool/code_execution/register.py +7 -3
- aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
- aiq/tool/mcp/exceptions.py +142 -0
- aiq/tool/mcp/mcp_client.py +41 -6
- aiq/tool/mcp/mcp_tool.py +3 -2
- aiq/tool/register.py +1 -0
- aiq/tool/server_tools.py +6 -3
- aiq/utils/exception_handlers/automatic_retries.py +289 -0
- aiq/utils/exception_handlers/mcp.py +211 -0
- aiq/utils/io/model_processing.py +28 -0
- aiq/utils/log_utils.py +37 -0
- aiq/utils/string_utils.py +38 -0
- aiq/utils/type_converter.py +18 -2
- aiq/utils/type_utils.py +87 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/METADATA +53 -21
- aiqtoolkit-1.2.0rc2.dist-info/RECORD +436 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/WHEEL +1 -1
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/entry_points.txt +3 -0
- aiq/front_ends/fastapi/websocket.py +0 -148
- aiq/observability/async_otel_listener.py +0 -429
- aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE.md +0 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/top_level.txt +0 -0
aiq/eval/evaluate.py
CHANGED
@@ -18,18 +18,25 @@ import logging
 import shutil
 from pathlib import Path
 from typing import Any
+from uuid import uuid4
 
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from aiq.data_models.evaluate import EvalConfig
+from aiq.data_models.evaluate import JobEvictionPolicy
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.config import EvaluationRunOutput
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
 from aiq.eval.evaluator.evaluator_model import EvalInput
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
 from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.usage_stats import UsageStats
+from aiq.eval.usage_stats import UsageStatsItem
+from aiq.eval.usage_stats import UsageStatsLLM
 from aiq.eval.utils.output_uploader import OutputUploader
+from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
+from aiq.profiler.data_models import ProfilerResults
 from aiq.runtime.session import AIQSessionManager
 
 logger = logging.getLogger(__name__)
@@ -52,7 +59,7 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
 
         # Helpers
         self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
-
+        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
         # Metadata
         self.eval_input: EvalInput | None = None
         self.workflow_interrupted: bool = False
@@ -60,12 +67,68 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         # evaluation_results is list of tuples (evaluator_name, EvalOutput)
         self.evaluation_results: list[tuple[str, EvalOutput]] = []
 
+        # usage stats
+        self.usage_stats: UsageStats = UsageStats()
+
         # workflow output file
         self.workflow_output_file: Path | None = None
 
         # evaluation output files
        self.evaluator_output_files: list[Path] = []
 
+    def _compute_usage_stats(self, item: EvalInputItem):
+        """Compute usage stats for a single item using the intermediate steps"""
+        # get the prompt and completion tokens from the intermediate steps
+        from aiq.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
+        steps = [IntermediatePropertyAdaptor.from_intermediate_step(step) for step in item.trajectory]
+        usage_stats_per_llm = {}
+        total_tokens = 0
+        for step in steps:
+            if step.event_type == "LLM_END":
+                llm_name = step.llm_name
+                if llm_name not in usage_stats_per_llm:
+                    usage_stats_per_llm[llm_name] = UsageStatsLLM()
+                usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
+                usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
+                usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
+                total_tokens += step.token_usage.total_tokens
+
+        # find min and max event timestamps
+        if item.trajectory:
+            min_timestamp = min(step.event_timestamp for step in item.trajectory)
+            max_timestamp = max(step.event_timestamp for step in item.trajectory)
+            runtime = max_timestamp - min_timestamp
+        else:
+            min_timestamp = 0.0
+            max_timestamp = 0.0
+            runtime = 0.0
+
+        # find llm latency by calculating p95 of all llm calls
+        llm_latencies = []
+        previous_llm_start_time = None
+        for step in steps:
+            if step.event_type == "LLM_START":
+                previous_llm_start_time = step.event_timestamp
+            elif step.event_type == "LLM_END" and previous_llm_start_time is not None:
+                llm_latencies.append(step.event_timestamp - previous_llm_start_time)
+                previous_llm_start_time = None
+
+        # Calculate p95 LLM latency (or 0 if no LLM calls)
+        if llm_latencies:
+            import numpy as np
+            llm_latency = float(np.percentile(llm_latencies, 95))
+        else:
+            llm_latency = 0.0
+
+        # add the usage stats to the usage stats dict
+        self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
+                                                                     runtime=runtime,
+                                                                     total_tokens=total_tokens,
+                                                                     min_timestamp=min_timestamp,
+                                                                     max_timestamp=max_timestamp,
+                                                                     llm_latency=llm_latency)
+        return self.usage_stats.usage_stats_items[item.id]
+
     async def run_workflow_local(self, session_manager: AIQSessionManager):
         '''
         Launch the workflow with the specified questions and extract the output using the jsonpath
@@ -84,15 +147,19 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
                 return "", []
 
             async with session_manager.run(item.input_obj) as runner:
+                if not session_manager.workflow.has_single_output:
+                    # raise an error if the workflow has multiple outputs
+                    raise NotImplementedError("Multiple outputs are not supported")
+
+                runner_result = None
+                intermediate_future = None
+
                 try:
+
                     # Start usage stats and intermediate steps collection in parallel
                     intermediate_future = pull_intermediate()
-
-
-                        base_output = await runner.result()
-                    else:
-                        # raise an error if the workflow has multiple outputs
-                        raise NotImplementedError("Multiple outputs are not supported")
+                    runner_result = runner.result()
+                    base_output = await runner_result
                     intermediate_steps = await intermediate_future
                 except NotImplementedError as e:
                     # raise original error
@@ -101,6 +168,13 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
                     logger.exception("Failed to run the workflow: %s", e, exc_info=True)
                     # stop processing if a workflow error occurs
                    self.workflow_interrupted = True
+
+                    # Cancel any coroutines that are still running, avoiding a warning about unawaited coroutines
+                    # (typically one of these two is what raised the exception and the other is still running)
+                    for coro in (runner_result, intermediate_future):
+                        if coro is not None:
+                            asyncio.ensure_future(coro).cancel()
+
                     stop_event.set()
                     return
 
@@ -124,6 +198,10 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
 
             item.output_obj = output
             item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
+            usage_stats_item = self._compute_usage_stats(item)
+
+            self.weave_eval.log_prediction(item, output)
+            await self.weave_eval.log_usage_stats(item, usage_stats_item)
 
         async def wrapped_run(item: EvalInputItem) -> None:
             await run_one(item)
@@ -145,15 +223,19 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         from aiq.eval.remote_workflow import EvaluationRemoteWorkflowHandler
         handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
         await handler.run_workflow_remote(self.eval_input)
+        for item in self.eval_input.eval_input_items:
+            usage_stats_item = self._compute_usage_stats(item)
+            self.weave_eval.log_prediction(item, item.output_obj)
+            await self.weave_eval.log_usage_stats(item, usage_stats_item)
 
-    async def profile_workflow(self):
+    async def profile_workflow(self) -> ProfilerResults:
         """
         Profile a dataset
         """
 
         if not self.eval_config.general.profiler:
             logger.info("Profiler is not enabled. Skipping profiling.")
-            return
+            return ProfilerResults()
 
         from aiq.profiler.profile_runner import ProfilerRunner
 
@@ -161,18 +243,70 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         for input_item in self.eval_input.eval_input_items:
             all_stats.append(input_item.trajectory)
 
-        profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
+        profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
+                                         self.eval_config.general.output_dir,
+                                         write_output=self.config.write_output)
 
-        await profiler_runner.run(all_stats)
+        return await profiler_runner.run(all_stats)
 
     def cleanup_output_directory(self):
         '''Remove contents of the output directory if it exists'''
-
-
-
-
+        output_config = self.eval_config.general.output
+        output_dir = output_config.dir
+
+        if not (output_config and output_dir.exists()):
+            return
+
+        # If cleanup is true, remove the entire directory and we are done
+        if output_config.cleanup:
+            logger.info("Cleaning up entire output directory: %s", output_config.dir)
+            shutil.rmtree(output_config.dir)
+            return
+
+        if output_config.job_management.max_jobs == 0:
+            # No eviction policy
+            return
+
+        base_dir = output_dir / "jobs"
+        if not base_dir.exists():
+            return
 
-
+        # Get all subdirectories, which represent individual job runs
+        job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
+        if len(job_dirs) <= output_config.job_management.max_jobs:
+            return
+
+        # Determine sort key based on eviction_policy, defaulting to creation time
+        if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:
+
+            def sort_key(x):
+                return x.stat().st_mtime
+
+            logger.info("Using last modified time for job eviction policy.")
+        else:
+
+            def sort_key(x):
+                return x.stat().st_ctime
+
+            logger.info("Using creation time for job eviction policy.")
+
+        # Sort directories (oldest first)
+        job_dirs.sort(key=sort_key)
+        num_to_delete = len(job_dirs) - output_config.job_management.max_jobs
+
+        logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
+                    len(job_dirs),
+                    output_config.job_management.max_jobs,
+                    num_to_delete)
+
+        for dir_to_delete in job_dirs[:num_to_delete]:
+            try:
+                logger.info("Deleting old job directory: %s", dir_to_delete)
+                shutil.rmtree(dir_to_delete)
+            except Exception as e:
+                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
+
+    def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
         workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
         workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
 
@@ -198,6 +332,11 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
             self.evaluator_output_files.append(output_file)
             logger.info("Evaluation results written to %s", output_file)
 
+    def publish_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
+        """Publish the output"""
+        if self.config.write_output:
+            self.write_output(dataset_handler, profiler_results)
+
         if self.workflow_interrupted:
             # Issue a warning if the workflow was not completed on all datasets
             msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
@@ -205,11 +344,15 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
                    "`eval` with the --skip_completed_entries flag.")
             logger.warning(msg)
 
+        self.weave_eval.log_summary(self.usage_stats, self.evaluation_results, profiler_results)
+
     async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
         """Run a single evaluator and store its results."""
         try:
             eval_output = await evaluator.evaluate_fn(self.eval_input)
             self.evaluation_results.append((evaluator_name, eval_output))
+
+            await self.weave_eval.alog_score(eval_output, evaluator_name)
         except Exception as e:
             logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
 
@@ -226,6 +369,9 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         except Exception as e:
             logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
             raise
+        finally:
+            # Finish prediction loggers in Weave
+            await self.weave_eval.afinish_loggers()
 
     def apply_overrides(self):
         from aiq.cli.cli_utils.config_override import load_and_override_config
@@ -241,6 +387,16 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         config = validate_schema(config_dict, AIQConfig)
         return config
 
+    def _get_workflow_alias(self, workflow_type: str | None = None):
+        """Get the workflow alias for displaying in evaluation UI."""
+        if self.eval_config.general.workflow_alias:
+            return self.eval_config.general.workflow_alias
+
+        if not workflow_type or workflow_type == "EmptyFunctionConfig":
+            return "aiqtoolkit-eval"
+
+        return workflow_type
+
     async def run_and_evaluate(self,
                                session_manager: AIQSessionManager | None = None,
                                job_id: str | None = None) -> EvaluationRunOutput:
@@ -258,12 +414,19 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         else:
             config = load_config(self.config.config_file)
             self.eval_config = config.eval
-
+        workflow_alias = self._get_workflow_alias(config.workflow.type)
+        logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)
 
         # Cleanup the output directory
-        if self.eval_config.general.output
+        if self.eval_config.general.output:
             self.cleanup_output_directory()
 
+        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
+        if (self.eval_config.general.output
+                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
+            job_id = "job_" + str(uuid4())
+            logger.info("Generated job ID for output directory: %s", job_id)
+
         # If a job id is provided keep the data per-job
         if job_id:
             self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
@@ -281,7 +444,11 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
                 workflow_interrupted=self.workflow_interrupted,
             )
 
-        dataset_handler = DatasetHandler(dataset_config=dataset_config,
+        dataset_handler = DatasetHandler(dataset_config=dataset_config,
+                                         reps=self.config.reps,
+                                         concurrency=self.eval_config.general.max_concurrency,
+                                         num_passes=self.config.num_passes,
+                                         adjust_dataset_size=self.config.adjust_dataset_size)
         self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
         if not self.eval_input.eval_input_items:
             logger.info("Dataset is empty. Nothing to evaluate.")
@@ -293,6 +460,10 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
 
         # Run workflow and evaluate
         async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
+            # Initialize Weave integration
+            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
+
+            # Run workflow
            if self.config.endpoint:
                 await self.run_workflow_remote()
             else:
@@ -307,10 +478,18 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
             await self.run_evaluators(evaluators)
 
             # Profile the workflow
-            await self.profile_workflow()
+            profiler_results = await self.profile_workflow()
+
+            # compute total runtime
+            if self.usage_stats.usage_stats_items:
+                self.usage_stats.total_runtime = max(self.usage_stats.usage_stats_items.values(),
+                                                     key=lambda x: x.max_timestamp).max_timestamp - \
+                    min(self.usage_stats.usage_stats_items.values(), key=lambda x: x.min_timestamp).min_timestamp
+            else:
+                self.usage_stats.total_runtime = 0.0
 
-            #
-            self.
+            # Publish the results
+            self.publish_output(dataset_handler, profiler_results)
 
             # Run custom scripts and upload evaluation outputs to S3
             if self.eval_config.general.output:
@@ -318,8 +497,10 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
                 output_uploader.run_custom_scripts()
                 await output_uploader.upload_directory()
 
-        return EvaluationRunOutput(
-
-
-
-
+        return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                   evaluator_output_files=self.evaluator_output_files,
+                                   workflow_interrupted=self.workflow_interrupted,
+                                   eval_input=self.eval_input,
+                                   evaluation_results=self.evaluation_results,
+                                   usage_stats=self.usage_stats,
+                                   profiler_results=profiler_results)
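The hunks above expand `EvaluationRunOutput` with per-run usage statistics and profiler results. Below is a minimal sketch of how a caller might read those new fields once `run_and_evaluate()` has returned; the field names come from the diff, while the `summarize` helper itself is hypothetical.

```python
from aiq.eval.config import EvaluationRunOutput


def summarize(output: EvaluationRunOutput) -> None:
    """Hypothetical helper: print the new usage/profiling fields returned by run_and_evaluate()."""
    # Total wall-clock runtime across all evaluated items (set in run_and_evaluate above).
    print(f"total runtime: {output.usage_stats.total_runtime:.2f}s")

    # Per-item token counts and p95 LLM latency, keyed by EvalInputItem.id (see _compute_usage_stats).
    for item_id, stats in output.usage_stats.usage_stats_items.items():
        print(f"  item {item_id}: {stats.total_tokens} tokens, p95 LLM latency {stats.llm_latency:.2f}s")

    # evaluation_results is a list of (evaluator_name, EvalOutput) tuples.
    for evaluator_name, eval_output in output.evaluation_results:
        print(f"  {evaluator_name}: average score {eval_output.average_score}")
```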
aiq/eval/evaluator/base_evaluator.py
ADDED
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from abc import ABC
+from abc import abstractmethod
+
+from tqdm import tqdm
+
+from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
+from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.evaluator.evaluator_model import EvalOutputItem
+from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
+
+
+class BaseEvaluator(ABC):
+    """
+    Base class for custom evaluators.
+
+    Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
+    single EvalInputItem.
+    """
+
+    def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
+        self.max_concurrency = max_concurrency
+        self.semaphore = asyncio.Semaphore(max_concurrency)
+        self.tqdm_desc = tqdm_desc
+
+    @abstractmethod
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Each evaluator must implement this for item-level evaluation"""
+        pass
+
+    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+        pbar = None
+        try:
+            tqdm_position = TqdmPositionRegistry.claim()
+            pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)
+
+            async def wrapped(item):
+                async with self.semaphore:
+                    try:
+                        output_item = await self.evaluate_item(item)
+                        pbar.update(1)
+                        return output_item
+                    except Exception as e:
+                        # If the evaluator fails, return an error item with a score of 0.0
+                        pbar.update(1)
+                        return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})
+
+            output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
+        finally:
+            pbar.close()
+            TqdmPositionRegistry.release(tqdm_position)
+
+        # Compute average if possible
+        numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
+        avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
+
+        return EvalOutput(average_score=avg_score, eval_output_items=output_items)
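The new `BaseEvaluator` handles concurrency, progress bars and error scoring, leaving subclasses to implement only `evaluate_item`. A minimal sketch of a custom evaluator built on it is shown below; the exact-match scoring logic is illustrative and not part of the package, and the import path assumes the module follows the new file's location (aiq/eval/evaluator/base_evaluator.py).

```python
from aiq.eval.evaluator.base_evaluator import BaseEvaluator
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutputItem


class ExactMatchEvaluator(BaseEvaluator):
    """Illustrative evaluator: scores 1.0 when the workflow output equals the expected output."""

    def __init__(self, max_concurrency: int = 4):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Exact match")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        # item.output_obj and item.expected_output_obj are the same fields the RAG evaluator reads below.
        matched = str(item.output_obj).strip() == str(item.expected_output_obj).strip()
        return EvalOutputItem(id=item.id,
                              score=1.0 if matched else 0.0,
                              reasoning={"expected": str(item.expected_output_obj)})
```

The inherited `evaluate(eval_input)` coroutine then fans item-level evaluation out under the configured semaphore and returns an `EvalOutput` with the rounded average score.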
aiq/eval/intermediate_step_adapter.py
CHANGED
@@ -79,15 +79,21 @@ class IntermediateStepAdapter:
         for step in steps:
             if step.event_type == IntermediateStepType.LLM_END:
                 last_llm_end_step = step
+                action = self.get_agent_action_single(step, "")
+                agent_actions.append(action)
             else:
                 action = self.get_agent_action_single(step, last_llm_end_step)
                 agent_actions.append(action)
 
         return agent_actions
 
-    def get_context(self, intermediate_steps: list[IntermediateStep]
+    def get_context(self, intermediate_steps: list[IntermediateStep],
+                    event_filter: list[IntermediateStepType]) -> list[str]:
         """Grab the output of all the tools and return them as retrieved context."""
-
-
-
+        count = 0
+        agent_actions = []
+        for step in intermediate_steps:
+            if step.event_type in event_filter and step.data and step.data.output:
+                agent_actions.append(f"**Step {count}**\n{str(step.data.output)}")
+                count += 1
+        return agent_actions
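With the new signature, callers of `get_context` pass the event types that should be treated as retrieved context explicitly. A small usage sketch, where the empty `trajectory` stands in for `EvalInputItem.trajectory`:

```python
from aiq.data_models.intermediate_step import IntermediateStepType
from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter

adapter = IntermediateStepAdapter()
trajectory = []  # stand-in for EvalInputItem.trajectory (a list[IntermediateStep])

# Only the outputs of the listed event types are collected as context strings.
contexts = adapter.get_context(trajectory, [IntermediateStepType.TOOL_END, IntermediateStepType.LLM_END])
print(contexts)  # [] for an empty trajectory
```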
aiq/eval/rag_evaluator/evaluate.py
CHANGED
@@ -14,8 +14,10 @@
 # limitations under the License.
 
 import logging
+import math
 from collections.abc import Sequence
 
+from pydantic import BaseModel
 from ragas import EvaluationDataset
 from ragas import SingleTurnSample
 from ragas.dataset_schema import EvaluationResult
@@ -23,7 +25,9 @@ from ragas.llms import LangchainLLMWrapper
 from ragas.metrics import Metric
 from tqdm import tqdm
 
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
 from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
 from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
@@ -33,21 +37,45 @@ logger = logging.getLogger(__name__)
 
 class RAGEvaluator:
 
-    def __init__(self,
+    def __init__(self,
+                 evaluator_llm: LangchainLLMWrapper,
+                 metrics: Sequence[Metric],
+                 max_concurrency=8,
+                 input_obj_field: str | None = None):
         self.evaluator_llm = evaluator_llm
         self.metrics = metrics
+        self.max_concurrency = max_concurrency
+        self.input_obj_field = input_obj_field
 
-
-
+    def extract_input_obj(self, item: EvalInputItem) -> str:
+        """Extracts the input object from EvalInputItem based on the configured input_obj_field."""
+        input_obj = item.input_obj
+        if isinstance(input_obj, BaseModel):
+            if self.input_obj_field and hasattr(input_obj, self.input_obj_field):
+                # If input_obj_field is specified, return the value of that field
+                return str(getattr(input_obj, self.input_obj_field, ""))
+
+            # If no input_obj_field is specified, return the string representation of the model
+            return input_obj.model_dump_json()
+
+        if isinstance(input_obj, dict):
+            # If input_obj is a dict, return the JSON string representation
+            if self.input_obj_field and self.input_obj_field in input_obj:
+                # If input_obj_field is specified, return the value of that field
+                return str(input_obj[self.input_obj_field])
+
+        return str(input_obj)  # Fallback to string representation of the dict
+
+    def eval_input_to_ragas(self, eval_input: EvalInput) -> EvaluationDataset:
         """Converts EvalInput into a Ragas-compatible EvaluationDataset."""
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
-
+        event_filter = [IntermediateStepType.TOOL_END, IntermediateStepType.LLM_END, IntermediateStepType.CUSTOM_END]
         samples = []
 
         intermediate_step_adapter = IntermediateStepAdapter()
         for item in eval_input.eval_input_items:
             # Extract required fields from EvalInputItem
-            user_input = item
+            user_input = self.extract_input_obj(item)  # Extract input object as string
             reference = item.expected_output_obj  # Reference correct answer
             response = item.output_obj  # Model's generated response
 
@@ -55,7 +83,7 @@ class RAGEvaluator:
             reference_contexts = [""]  # Default to empty context
             # implement context extraction from expected_trajectory
 
-            retrieved_contexts = intermediate_step_adapter.get_context(item.trajectory)
+            retrieved_contexts = intermediate_step_adapter.get_context(item.trajectory, event_filter)
             # implement context extraction from expected_trajectory
 
             # Create a SingleTurnSample
@@ -78,19 +106,29 @@ class RAGEvaluator:
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
         scores: list[dict[str, float]] = results_dataset.scores
+
+        # If Ragas returned no scores, return empty output to avoid downstream errors
         if not scores:
-            logger.
+            logger.warning("Ragas returned empty score list")
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
-
-
+        def _nan_to_zero(v: float | None) -> float:
+            """Convert NaN or None to 0.0 for safe arithmetic/serialization."""
+            return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v
+
+        # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0
+        scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
+        first_metric_name = list(scores_dict.keys())[0] if scores_dict else None
 
-        # Compute the average of each metric
-        average_scores = {
+        # Compute the average of each metric, guarding against empty lists
+        average_scores = {
+            metric: (sum(values) / len(values) if values else 0.0)
+            for metric, values in scores_dict.items()
+        }
 
-
-        first_avg_score
-
+        first_avg_score = average_scores.get(list(scores_dict.keys())[0], 0.0)
+        if isinstance(first_avg_score, float) and math.isnan(first_avg_score):
+            first_avg_score = 0.0
 
         df = results_dataset.to_pandas()
         # Get id from eval_input if df size matches number of eval_input_items
@@ -103,7 +141,7 @@ class RAGEvaluator:
         eval_output_items = [
             EvalOutputItem(
                 id=ids[i],
-                score=getattr(row, first_metric_name, 0.0),
+                score=_nan_to_zero(getattr(row, first_metric_name, 0.0) if first_metric_name else 0.0),
                 reasoning={
                     key:
                     getattr(row, key, None)  # Use getattr to safely access attributes
@@ -116,6 +154,7 @@ class RAGEvaluator:
     async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
         """Run Ragas metrics evaluation on the provided EvalInput"""
         from ragas import evaluate as ragas_evaluate
+        from ragas.run_config import RunConfig
 
         ragas_dataset = self.eval_input_to_ragas(eval_input)
         tqdm_position = TqdmPositionRegistry.claim()
@@ -126,6 +165,7 @@ class RAGEvaluator:
                 metrics=self.metrics,
                 show_progress=True,
                 llm=self.evaluator_llm,
+                run_config=RunConfig(max_workers=self.max_concurrency),
                 _pbar=pbar)
         except Exception as e:
             # On exception we still continue with other evaluators. Log and return an avg_score of 0.0
aiq/eval/rag_evaluator/register.py
CHANGED
@@ -47,6 +47,8 @@ class RagasEvaluatorConfig(EvaluatorBaseConfig, name="ragas"):
     # Ragas metric
     metric: str | dict[str, RagasMetricConfig] = Field(default="AnswerAccuracy",
                                                        description="RAGAS metric callable with optional 'kwargs:'")
+    input_obj_field: str | None = Field(
+        default=None, description="The field in the input object that contains the content to evaluate.")
 
     @model_validator(mode="before")
     @classmethod
@@ -133,6 +135,9 @@ async def register_ragas_evaluator(config: RagasEvaluatorConfig, builder: EvalBu
        metrics.append(metric_callable(**kwargs))
 
     # Create the RAG evaluator
-    _evaluator = RAGEvaluator(evaluator_llm=llm,
+    _evaluator = RAGEvaluator(evaluator_llm=llm,
+                              metrics=metrics,
+                              max_concurrency=builder.get_max_concurrency(),
+                              input_obj_field=config.input_obj_field) if metrics else None
 
     yield EvaluatorInfo(config=config, evaluate_fn=evaluate_fn, description="Evaluator for RAGAS metrics")
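The new `input_obj_field` option lets the RAG evaluator pull the user input from a single field of a structured input object instead of serializing the whole thing. A hedged illustration of `extract_input_obj` in isolation follows; the stand-in item and the `None` LLM wrapper are shortcuts for the example, not how the evaluator is normally constructed.

```python
from aiq.eval.rag_evaluator.evaluate import RAGEvaluator

# evaluator_llm is only stored by __init__, so None is enough to exercise extract_input_obj here;
# in real use it is a LangchainLLMWrapper supplied by the builder (see register.py above).
evaluator = RAGEvaluator(evaluator_llm=None, metrics=[], input_obj_field="question")


class _Item:
    """Stand-in for EvalInputItem; only the input_obj attribute is needed by extract_input_obj."""
    input_obj = {"question": "What does the code execution sandbox do?", "metadata": {"source": "docs"}}


print(evaluator.extract_input_obj(_Item()))  # -> "What does the code execution sandbox do?"
```

In a workflow's eval config, the same option appears as `input_obj_field` under the `ragas` evaluator section, matching the `RagasEvaluatorConfig` field added above.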