aiqtoolkit 1.2.0.dev0__py3-none-any.whl → 1.2.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiqtoolkit might be problematic.

Files changed (220)
  1. aiq/agent/base.py +170 -8
  2. aiq/agent/dual_node.py +1 -1
  3. aiq/agent/react_agent/agent.py +146 -112
  4. aiq/agent/react_agent/prompt.py +1 -6
  5. aiq/agent/react_agent/register.py +36 -35
  6. aiq/agent/rewoo_agent/agent.py +36 -35
  7. aiq/agent/rewoo_agent/register.py +2 -2
  8. aiq/agent/tool_calling_agent/agent.py +3 -7
  9. aiq/agent/tool_calling_agent/register.py +1 -1
  10. aiq/authentication/__init__.py +14 -0
  11. aiq/authentication/api_key/__init__.py +14 -0
  12. aiq/authentication/api_key/api_key_auth_provider.py +92 -0
  13. aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
  14. aiq/authentication/api_key/register.py +26 -0
  15. aiq/authentication/exceptions/__init__.py +14 -0
  16. aiq/authentication/exceptions/api_key_exceptions.py +38 -0
  17. aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
  18. aiq/authentication/exceptions/call_back_exceptions.py +38 -0
  19. aiq/authentication/exceptions/request_exceptions.py +54 -0
  20. aiq/authentication/http_basic_auth/__init__.py +0 -0
  21. aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  22. aiq/authentication/http_basic_auth/register.py +30 -0
  23. aiq/authentication/interfaces.py +93 -0
  24. aiq/authentication/oauth2/__init__.py +14 -0
  25. aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  26. aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  27. aiq/authentication/oauth2/register.py +25 -0
  28. aiq/authentication/register.py +21 -0
  29. aiq/builder/builder.py +64 -2
  30. aiq/builder/component_utils.py +16 -3
  31. aiq/builder/context.py +37 -0
  32. aiq/builder/eval_builder.py +43 -2
  33. aiq/builder/function.py +44 -12
  34. aiq/builder/function_base.py +1 -1
  35. aiq/builder/intermediate_step_manager.py +6 -8
  36. aiq/builder/user_interaction_manager.py +3 -0
  37. aiq/builder/workflow.py +23 -18
  38. aiq/builder/workflow_builder.py +421 -61
  39. aiq/cli/commands/info/list_mcp.py +103 -16
  40. aiq/cli/commands/sizing/__init__.py +14 -0
  41. aiq/cli/commands/sizing/calc.py +294 -0
  42. aiq/cli/commands/sizing/sizing.py +27 -0
  43. aiq/cli/commands/start.py +2 -1
  44. aiq/cli/entrypoint.py +2 -0
  45. aiq/cli/register_workflow.py +80 -0
  46. aiq/cli/type_registry.py +151 -30
  47. aiq/data_models/api_server.py +124 -12
  48. aiq/data_models/authentication.py +231 -0
  49. aiq/data_models/common.py +35 -7
  50. aiq/data_models/component.py +17 -9
  51. aiq/data_models/component_ref.py +33 -0
  52. aiq/data_models/config.py +60 -3
  53. aiq/data_models/dataset_handler.py +2 -1
  54. aiq/data_models/embedder.py +1 -0
  55. aiq/data_models/evaluate.py +23 -0
  56. aiq/data_models/function_dependencies.py +8 -0
  57. aiq/data_models/interactive.py +10 -1
  58. aiq/data_models/intermediate_step.py +38 -5
  59. aiq/data_models/its_strategy.py +30 -0
  60. aiq/data_models/llm.py +1 -0
  61. aiq/data_models/memory.py +1 -0
  62. aiq/data_models/object_store.py +44 -0
  63. aiq/data_models/profiler.py +1 -0
  64. aiq/data_models/retry_mixin.py +35 -0
  65. aiq/data_models/span.py +187 -0
  66. aiq/data_models/telemetry_exporter.py +2 -2
  67. aiq/embedder/nim_embedder.py +2 -1
  68. aiq/embedder/openai_embedder.py +2 -1
  69. aiq/eval/config.py +19 -1
  70. aiq/eval/dataset_handler/dataset_handler.py +87 -2
  71. aiq/eval/evaluate.py +208 -27
  72. aiq/eval/evaluator/base_evaluator.py +73 -0
  73. aiq/eval/evaluator/evaluator_model.py +1 -0
  74. aiq/eval/intermediate_step_adapter.py +11 -5
  75. aiq/eval/rag_evaluator/evaluate.py +55 -15
  76. aiq/eval/rag_evaluator/register.py +6 -1
  77. aiq/eval/remote_workflow.py +7 -2
  78. aiq/eval/runners/__init__.py +14 -0
  79. aiq/eval/runners/config.py +39 -0
  80. aiq/eval/runners/multi_eval_runner.py +54 -0
  81. aiq/eval/trajectory_evaluator/evaluate.py +22 -65
  82. aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
  83. aiq/eval/tunable_rag_evaluator/register.py +2 -0
  84. aiq/eval/usage_stats.py +41 -0
  85. aiq/eval/utils/output_uploader.py +10 -1
  86. aiq/eval/utils/weave_eval.py +184 -0
  87. aiq/experimental/__init__.py +0 -0
  88. aiq/experimental/decorators/__init__.py +0 -0
  89. aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
  90. aiq/experimental/inference_time_scaling/__init__.py +0 -0
  91. aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
  92. aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
  93. aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
  94. aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
  95. aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
  96. aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
  97. aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
  98. aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
  99. aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
  100. aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
  101. aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
  102. aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
  103. aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
  104. aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
  105. aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
  106. aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
  107. aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
  108. aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
  109. aiq/experimental/inference_time_scaling/register.py +36 -0
  110. aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
  111. aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
  112. aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
  113. aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
  114. aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
  115. aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
  116. aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
  117. aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
  118. aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
  119. aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
  120. aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
  121. aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
  122. aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
  123. aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
  124. aiq/front_ends/console/authentication_flow_handler.py +233 -0
  125. aiq/front_ends/console/console_front_end_plugin.py +11 -2
  126. aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  127. aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  128. aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  129. aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
  130. aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  131. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
  132. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
  133. aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
  134. aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  135. aiq/front_ends/fastapi/job_store.py +47 -25
  136. aiq/front_ends/fastapi/main.py +2 -0
  137. aiq/front_ends/fastapi/message_handler.py +108 -89
  138. aiq/front_ends/fastapi/step_adaptor.py +2 -1
  139. aiq/llm/aws_bedrock_llm.py +57 -0
  140. aiq/llm/nim_llm.py +2 -1
  141. aiq/llm/openai_llm.py +3 -2
  142. aiq/llm/register.py +1 -0
  143. aiq/meta/pypi.md +12 -12
  144. aiq/object_store/__init__.py +20 -0
  145. aiq/object_store/in_memory_object_store.py +74 -0
  146. aiq/object_store/interfaces.py +84 -0
  147. aiq/object_store/models.py +36 -0
  148. aiq/object_store/register.py +20 -0
  149. aiq/observability/__init__.py +14 -0
  150. aiq/observability/exporter/__init__.py +14 -0
  151. aiq/observability/exporter/base_exporter.py +449 -0
  152. aiq/observability/exporter/exporter.py +78 -0
  153. aiq/observability/exporter/file_exporter.py +33 -0
  154. aiq/observability/exporter/processing_exporter.py +269 -0
  155. aiq/observability/exporter/raw_exporter.py +52 -0
  156. aiq/observability/exporter/span_exporter.py +264 -0
  157. aiq/observability/exporter_manager.py +335 -0
  158. aiq/observability/mixin/__init__.py +14 -0
  159. aiq/observability/mixin/batch_config_mixin.py +26 -0
  160. aiq/observability/mixin/collector_config_mixin.py +23 -0
  161. aiq/observability/mixin/file_mixin.py +288 -0
  162. aiq/observability/mixin/file_mode.py +23 -0
  163. aiq/observability/mixin/resource_conflict_mixin.py +134 -0
  164. aiq/observability/mixin/serialize_mixin.py +61 -0
  165. aiq/observability/mixin/type_introspection_mixin.py +183 -0
  166. aiq/observability/processor/__init__.py +14 -0
  167. aiq/observability/processor/batching_processor.py +316 -0
  168. aiq/observability/processor/intermediate_step_serializer.py +28 -0
  169. aiq/observability/processor/processor.py +68 -0
  170. aiq/observability/register.py +36 -39
  171. aiq/observability/utils/__init__.py +14 -0
  172. aiq/observability/utils/dict_utils.py +236 -0
  173. aiq/observability/utils/time_utils.py +31 -0
  174. aiq/profiler/calc/__init__.py +14 -0
  175. aiq/profiler/calc/calc_runner.py +623 -0
  176. aiq/profiler/calc/calculations.py +288 -0
  177. aiq/profiler/calc/data_models.py +176 -0
  178. aiq/profiler/calc/plot.py +345 -0
  179. aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
  180. aiq/profiler/data_models.py +24 -0
  181. aiq/profiler/inference_metrics_model.py +3 -0
  182. aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
  183. aiq/profiler/inference_optimization/data_models.py +2 -2
  184. aiq/profiler/inference_optimization/llm_metrics.py +2 -2
  185. aiq/profiler/profile_runner.py +61 -21
  186. aiq/runtime/loader.py +9 -3
  187. aiq/runtime/runner.py +23 -9
  188. aiq/runtime/session.py +25 -7
  189. aiq/runtime/user_metadata.py +2 -3
  190. aiq/tool/chat_completion.py +74 -0
  191. aiq/tool/code_execution/README.md +152 -0
  192. aiq/tool/code_execution/code_sandbox.py +151 -72
  193. aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
  194. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
  195. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
  196. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
  197. aiq/tool/code_execution/register.py +7 -3
  198. aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
  199. aiq/tool/mcp/exceptions.py +142 -0
  200. aiq/tool/mcp/mcp_client.py +41 -6
  201. aiq/tool/mcp/mcp_tool.py +3 -2
  202. aiq/tool/register.py +1 -0
  203. aiq/tool/server_tools.py +6 -3
  204. aiq/utils/exception_handlers/automatic_retries.py +289 -0
  205. aiq/utils/exception_handlers/mcp.py +211 -0
  206. aiq/utils/io/model_processing.py +28 -0
  207. aiq/utils/log_utils.py +37 -0
  208. aiq/utils/string_utils.py +38 -0
  209. aiq/utils/type_converter.py +18 -2
  210. aiq/utils/type_utils.py +87 -0
  211. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/METADATA +53 -21
  212. aiqtoolkit-1.2.0rc1.dist-info/RECORD +436 -0
  213. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/WHEEL +1 -1
  214. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/entry_points.txt +3 -0
  215. aiq/front_ends/fastapi/websocket.py +0 -148
  216. aiq/observability/async_otel_listener.py +0 -429
  217. aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
  218. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  219. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/licenses/LICENSE.md +0 -0
  220. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/top_level.txt +0 -0
aiq/eval/evaluate.py CHANGED
@@ -18,18 +18,25 @@ import logging
 import shutil
 from pathlib import Path
 from typing import Any
+from uuid import uuid4
 
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from aiq.data_models.evaluate import EvalConfig
+from aiq.data_models.evaluate import JobEvictionPolicy
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.config import EvaluationRunOutput
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
 from aiq.eval.evaluator.evaluator_model import EvalInput
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
 from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.usage_stats import UsageStats
+from aiq.eval.usage_stats import UsageStatsItem
+from aiq.eval.usage_stats import UsageStatsLLM
 from aiq.eval.utils.output_uploader import OutputUploader
+from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
+from aiq.profiler.data_models import ProfilerResults
 from aiq.runtime.session import AIQSessionManager
 
 logger = logging.getLogger(__name__)
@@ -52,7 +59,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
         # Helpers
         self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
-
+        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
         # Metadata
         self.eval_input: EvalInput | None = None
         self.workflow_interrupted: bool = False
@@ -60,12 +67,68 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
         # evaluation_results is list of tuples (evaluator_name, EvalOutput)
         self.evaluation_results: list[tuple[str, EvalOutput]] = []
 
+        # usage stats
+        self.usage_stats: UsageStats = UsageStats()
+
         # workflow output file
         self.workflow_output_file: Path | None = None
 
         # evaluation output files
         self.evaluator_output_files: list[Path] = []
 
+    def _compute_usage_stats(self, item: EvalInputItem):
+        """Compute usage stats for a single item using the intermediate steps"""
+        # get the prompt and completion tokens from the intermediate steps
+        from aiq.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
+        steps = [IntermediatePropertyAdaptor.from_intermediate_step(step) for step in item.trajectory]
+        usage_stats_per_llm = {}
+        total_tokens = 0
+        for step in steps:
+            if step.event_type == "LLM_END":
+                llm_name = step.llm_name
+                if llm_name not in usage_stats_per_llm:
+                    usage_stats_per_llm[llm_name] = UsageStatsLLM()
+                usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
+                usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
+                usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
+                total_tokens += step.token_usage.total_tokens
+
+        # find min and max event timestamps
+        if item.trajectory:
+            min_timestamp = min(step.event_timestamp for step in item.trajectory)
+            max_timestamp = max(step.event_timestamp for step in item.trajectory)
+            runtime = max_timestamp - min_timestamp
+        else:
+            min_timestamp = 0.0
+            max_timestamp = 0.0
+            runtime = 0.0
+
+        # find llm latency by calculating p95 of all llm calls
+        llm_latencies = []
+        previous_llm_start_time = None
+        for step in steps:
+            if step.event_type == "LLM_START":
+                previous_llm_start_time = step.event_timestamp
+            elif step.event_type == "LLM_END" and previous_llm_start_time is not None:
+                llm_latencies.append(step.event_timestamp - previous_llm_start_time)
+                previous_llm_start_time = None
+
+        # Calculate p95 LLM latency (or 0 if no LLM calls)
+        if llm_latencies:
+            import numpy as np
+            llm_latency = float(np.percentile(llm_latencies, 95))
+        else:
+            llm_latency = 0.0
+
+        # add the usage stats to the usage stats dict
+        self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
+                                                                     runtime=runtime,
+                                                                     total_tokens=total_tokens,
+                                                                     min_timestamp=min_timestamp,
+                                                                     max_timestamp=max_timestamp,
+                                                                     llm_latency=llm_latency)
+        return self.usage_stats.usage_stats_items[item.id]
+
     async def run_workflow_local(self, session_manager: AIQSessionManager):
         '''
         Launch the workflow with the specified questions and extract the output using the jsonpath
@@ -84,15 +147,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                 return "", []
 
            async with session_manager.run(item.input_obj) as runner:
+                if not session_manager.workflow.has_single_output:
+                    # raise an error if the workflow has multiple outputs
+                    raise NotImplementedError("Multiple outputs are not supported")
+
+                runner_result = None
+                intermediate_future = None
+
                try:
+
                    # Start usage stats and intermediate steps collection in parallel
                    intermediate_future = pull_intermediate()
-
-                    if session_manager.workflow.has_single_output:
-                        base_output = await runner.result()
-                    else:
-                        # raise an error if the workflow has multiple outputs
-                        raise NotImplementedError("Multiple outputs are not supported")
+                    runner_result = runner.result()
+                    base_output = await runner_result
                    intermediate_steps = await intermediate_future
                except NotImplementedError as e:
                    # raise original error
@@ -101,6 +168,13 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                    logger.exception("Failed to run the workflow: %s", e, exc_info=True)
                    # stop processing if a workflow error occurs
                    self.workflow_interrupted = True
+
+                    # Cancel any coroutines that are still running, avoiding a warning about unawaited coroutines
+                    # (typically one of these two is what raised the exception and the other is still running)
+                    for coro in (runner_result, intermediate_future):
+                        if coro is not None:
+                            asyncio.ensure_future(coro).cancel()
+
                    stop_event.set()
                    return
 
@@ -124,6 +198,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
            item.output_obj = output
            item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
+            usage_stats_item = self._compute_usage_stats(item)
+
+            self.weave_eval.log_prediction(item, output)
+            await self.weave_eval.log_usage_stats(item, usage_stats_item)
 
        async def wrapped_run(item: EvalInputItem) -> None:
            await run_one(item)
@@ -145,15 +223,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        from aiq.eval.remote_workflow import EvaluationRemoteWorkflowHandler
        handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
        await handler.run_workflow_remote(self.eval_input)
+        for item in self.eval_input.eval_input_items:
+            usage_stats_item = self._compute_usage_stats(item)
+            self.weave_eval.log_prediction(item, item.output_obj)
+            await self.weave_eval.log_usage_stats(item, usage_stats_item)
 
-    async def profile_workflow(self):
+    async def profile_workflow(self) -> ProfilerResults:
        """
        Profile a dataset
        """
 
        if not self.eval_config.general.profiler:
            logger.info("Profiler is not enabled. Skipping profiling.")
-            return
+            return ProfilerResults()
 
        from aiq.profiler.profile_runner import ProfilerRunner
 
@@ -161,18 +243,70 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        for input_item in self.eval_input.eval_input_items:
            all_stats.append(input_item.trajectory)
 
-        profiler_runner = ProfilerRunner(self.eval_config.general.profiler, self.eval_config.general.output_dir)
+        profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
+                                         self.eval_config.general.output_dir,
+                                         write_output=self.config.write_output)
 
-        await profiler_runner.run(all_stats)
+        return await profiler_runner.run(all_stats)
 
    def cleanup_output_directory(self):
        '''Remove contents of the output directory if it exists'''
-        if self.eval_config.general.output and self.eval_config.general.output.dir and \
-            self.eval_config.general.output.dir.exists():
-            logger.info("Cleaning up output directory %s", self.eval_config.general.output.dir)
-            shutil.rmtree(self.eval_config.general.output.dir)
+        output_config = self.eval_config.general.output
+        output_dir = output_config.dir
+
+        if not (output_config and output_dir.exists()):
+            return
+
+        # If cleanup is true, remove the entire directory and we are done
+        if output_config.cleanup:
+            logger.info("Cleaning up entire output directory: %s", output_config.dir)
+            shutil.rmtree(output_config.dir)
+            return
+
+        if output_config.job_management.max_jobs == 0:
+            # No eviction policy
+            return
+
+        base_dir = output_dir / "jobs"
+        if not base_dir.exists():
+            return
 
-    def write_output(self, dataset_handler: DatasetHandler):
+        # Get all subdirectories, which represent individual job runs
+        job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
+        if len(job_dirs) <= output_config.job_management.max_jobs:
+            return
+
+        # Determine sort key based on eviction_policy, defaulting to creation time
+        if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:
+
+            def sort_key(x):
+                return x.stat().st_mtime
+
+            logger.info("Using last modified time for job eviction policy.")
+        else:
+
+            def sort_key(x):
+                return x.stat().st_ctime
+
+            logger.info("Using creation time for job eviction policy.")
+
+        # Sort directories (oldest first)
+        job_dirs.sort(key=sort_key)
+        num_to_delete = len(job_dirs) - output_config.job_management.max_jobs
+
+        logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
+                    len(job_dirs),
+                    output_config.job_management.max_jobs,
+                    num_to_delete)
+
+        for dir_to_delete in job_dirs[:num_to_delete]:
+            try:
+                logger.info("Deleting old job directory: %s", dir_to_delete)
+                shutil.rmtree(dir_to_delete)
+            except Exception as e:
+                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
+
+    def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
        workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
        workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
 
@@ -198,6 +332,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
            self.evaluator_output_files.append(output_file)
            logger.info("Evaluation results written to %s", output_file)
 
+    def publish_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
+        """Publish the output"""
+        if self.config.write_output:
+            self.write_output(dataset_handler, profiler_results)
+
        if self.workflow_interrupted:
            # Issue a warning if the workflow was not completed on all datasets
            msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
@@ -205,11 +344,15 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                   "`eval` with the --skip_completed_entries flag.")
            logger.warning(msg)
 
+        self.weave_eval.log_summary(self.usage_stats, self.evaluation_results, profiler_results)
+
    async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
        """Run a single evaluator and store its results."""
        try:
            eval_output = await evaluator.evaluate_fn(self.eval_input)
            self.evaluation_results.append((evaluator_name, eval_output))
+
+            await self.weave_eval.alog_score(eval_output, evaluator_name)
        except Exception as e:
            logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
 
@@ -226,6 +369,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        except Exception as e:
            logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
            raise
+        finally:
+            # Finish prediction loggers in Weave
+            await self.weave_eval.afinish_loggers()
 
    def apply_overrides(self):
        from aiq.cli.cli_utils.config_override import load_and_override_config
@@ -241,6 +387,16 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        config = validate_schema(config_dict, AIQConfig)
        return config
 
+    def _get_workflow_alias(self, workflow_type: str | None = None):
+        """Get the workflow alias for displaying in evaluation UI."""
+        if self.eval_config.general.workflow_alias:
+            return self.eval_config.general.workflow_alias
+
+        if not workflow_type or workflow_type == "EmptyFunctionConfig":
+            return "aiqtoolkit-eval"
+
+        return workflow_type
+
    async def run_and_evaluate(self,
                               session_manager: AIQSessionManager | None = None,
                               job_id: str | None = None) -> EvaluationRunOutput:
@@ -258,12 +414,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        else:
            config = load_config(self.config.config_file)
            self.eval_config = config.eval
-        logger.debug("Loaded evaluation configuration: %s", self.eval_config)
+        workflow_alias = self._get_workflow_alias(config.workflow.type)
+        logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)
 
        # Cleanup the output directory
-        if self.eval_config.general.output and self.eval_config.general.output.cleanup:
+        if self.eval_config.general.output:
            self.cleanup_output_directory()
 
+        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
+        if (self.eval_config.general.output
+                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
+            job_id = "job_" + str(uuid4())
+            logger.info("Generated job ID for output directory: %s", job_id)
+
        # If a job id is provided keep the data per-job
        if job_id:
            self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
@@ -281,7 +444,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                workflow_interrupted=self.workflow_interrupted,
            )
 
-        dataset_handler = DatasetHandler(dataset_config=dataset_config, reps=self.config.reps)
+        dataset_handler = DatasetHandler(dataset_config=dataset_config,
+                                         reps=self.config.reps,
+                                         concurrency=self.eval_config.general.max_concurrency,
+                                         num_passes=self.config.num_passes,
+                                         adjust_dataset_size=self.config.adjust_dataset_size)
        self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
        if not self.eval_input.eval_input_items:
            logger.info("Dataset is empty. Nothing to evaluate.")
@@ -293,6 +460,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
        # Run workflow and evaluate
        async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
+            # Initialize Weave integration
+            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
+
+            # Run workflow
            if self.config.endpoint:
                await self.run_workflow_remote()
            else:
@@ -307,10 +478,18 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
            await self.run_evaluators(evaluators)
 
            # Profile the workflow
-            await self.profile_workflow()
+            profiler_results = await self.profile_workflow()
+
+            # compute total runtime
+            if self.usage_stats.usage_stats_items:
+                self.usage_stats.total_runtime = max(self.usage_stats.usage_stats_items.values(),
+                                                     key=lambda x: x.max_timestamp).max_timestamp - \
+                    min(self.usage_stats.usage_stats_items.values(), key=lambda x: x.min_timestamp).min_timestamp
+            else:
+                self.usage_stats.total_runtime = 0.0
 
-            # Write the results to the output directory
-            self.write_output(dataset_handler)
+            # Publish the results
+            self.publish_output(dataset_handler, profiler_results)
 
            # Run custom scripts and upload evaluation outputs to S3
            if self.eval_config.general.output:
@@ -318,8 +497,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                output_uploader.run_custom_scripts()
                await output_uploader.upload_directory()
 
-        return EvaluationRunOutput(
-            workflow_output_file=self.workflow_output_file,
-            evaluator_output_files=self.evaluator_output_files,
-            workflow_interrupted=self.workflow_interrupted,
-        )
+        return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                   evaluator_output_files=self.evaluator_output_files,
+                                   workflow_interrupted=self.workflow_interrupted,
+                                   eval_input=self.eval_input,
+                                   evaluation_results=self.evaluation_results,
+                                   usage_stats=self.usage_stats,
+                                   profiler_results=profiler_results)
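The new _compute_usage_stats helper above pairs each LLM_START with the following LLM_END, takes the 95th percentile of the gaps as the per-item LLM latency, and derives the item runtime from the earliest and latest event timestamps. A minimal standalone sketch of that arithmetic, using made-up (event_type, timestamp) tuples rather than real IntermediateStep objects:

import numpy as np

# Illustrative event stream; the values are not taken from the diff
events = [("LLM_START", 0.0), ("LLM_END", 1.2), ("TOOL_END", 1.5), ("LLM_START", 2.0), ("LLM_END", 2.8)]

llm_latencies = []
start = None
for event_type, timestamp in events:
    if event_type == "LLM_START":
        start = timestamp
    elif event_type == "LLM_END" and start is not None:
        llm_latencies.append(timestamp - start)
        start = None

# p95 latency across LLM calls, or 0.0 if there were none
llm_latency = float(np.percentile(llm_latencies, 95)) if llm_latencies else 0.0
# item runtime = span between the earliest and latest event timestamps
runtime = max(t for _, t in events) - min(t for _, t in events) if events else 0.0
print(llm_latency, runtime)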
aiq/eval/evaluator/base_evaluator.py ADDED
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from abc import ABC
+from abc import abstractmethod
+
+from tqdm import tqdm
+
+from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
+from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.evaluator.evaluator_model import EvalOutputItem
+from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
+
+
+class BaseEvaluator(ABC):
+    """
+    Base class for custom evaluators.
+
+    Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
+    single EvalInputItem.
+    """
+
+    def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
+        self.max_concurrency = max_concurrency
+        self.semaphore = asyncio.Semaphore(max_concurrency)
+        self.tqdm_desc = tqdm_desc
+
+    @abstractmethod
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Each evaluator must implement this for item-level evaluation"""
+        pass
+
+    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+        pbar = None
+        try:
+            tqdm_position = TqdmPositionRegistry.claim()
+            pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)
+
+            async def wrapped(item):
+                async with self.semaphore:
+                    try:
+                        output_item = await self.evaluate_item(item)
+                        pbar.update(1)
+                        return output_item
+                    except Exception as e:
+                        # If the evaluator fails, return an error item with a score of 0.0
+                        pbar.update(1)
+                        return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})
+
+            output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
+        finally:
+            pbar.close()
+            TqdmPositionRegistry.release(tqdm_position)
+
+        # Compute average if possible
+        numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
+        avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
+
+        return EvalOutput(average_score=avg_score, eval_output_items=output_items)
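A minimal sketch of how a plugin might build on the new BaseEvaluator; the ExactMatchEvaluator name and its scoring rule are hypothetical, not part of the toolkit:

from aiq.eval.evaluator.base_evaluator import BaseEvaluator
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutputItem


class ExactMatchEvaluator(BaseEvaluator):
    """Scores 1.0 when the generated output matches the expected output, else 0.0."""

    def __init__(self, max_concurrency: int = 4):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Exact match")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        matched = str(item.output_obj).strip() == str(item.expected_output_obj).strip()
        return EvalOutputItem(id=item.id,
                              score=1.0 if matched else 0.0,
                              reasoning={"expected": str(item.expected_output_obj)})

The concurrency limiting, progress bar, per-item error handling, and average-score computation all come from BaseEvaluator.evaluate, so a subclass only supplies the item-level logic.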
aiq/eval/evaluator/evaluator_model.py CHANGED
@@ -27,6 +27,7 @@ class EvalInputItem(BaseModel):
     output_obj: typing.Any
     expected_trajectory: list[IntermediateStep]
     trajectory: list[IntermediateStep]
+    full_dataset_entry: typing.Any
 
 
 class EvalInput(BaseModel):
aiq/eval/intermediate_step_adapter.py CHANGED
@@ -79,15 +79,21 @@ class IntermediateStepAdapter:
         for step in steps:
             if step.event_type == IntermediateStepType.LLM_END:
                 last_llm_end_step = step
+                action = self.get_agent_action_single(step, "")
+                agent_actions.append(action)
             else:
                 action = self.get_agent_action_single(step, last_llm_end_step)
                 agent_actions.append(action)
 
         return agent_actions
 
-    def get_context(self, intermediate_steps: list[IntermediateStep]) -> list[str]:
+    def get_context(self, intermediate_steps: list[IntermediateStep],
+                    event_filter: list[IntermediateStepType]) -> list[str]:
         """Grab the output of all the tools and return them as retrieved context."""
-        return [
-            str(step.data.output) for step in intermediate_steps
-            if step.event_type == IntermediateStepType.TOOL_END and step.data and step.data.output
-        ]
+        count = 0
+        agent_actions = []
+        for step in intermediate_steps:
+            if step.event_type in event_filter and step.data and step.data.output:
+                agent_actions.append(f"**Step {count}**\n{str(step.data.output)}")
+                count += 1
+        return agent_actions
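A simplified sketch of the behavior change in get_context: previously only TOOL_END outputs were collected, whereas now any step whose event type appears in the caller-supplied filter contributes a numbered context entry. The lightweight Step stand-in below is illustrative, not the real IntermediateStep model:

from dataclasses import dataclass


@dataclass
class Step:
    event_type: str
    output: str | None


def get_context(steps: list[Step], event_filter: list[str]) -> list[str]:
    context = []
    count = 0
    for step in steps:
        if step.event_type in event_filter and step.output:
            context.append(f"**Step {count}**\n{step.output}")
            count += 1
    return context


steps = [Step("TOOL_END", "search results"), Step("LLM_END", "draft answer"), Step("LLM_START", None)]
print(get_context(steps, ["TOOL_END", "LLM_END", "CUSTOM_END"]))  # both outputs are included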
aiq/eval/rag_evaluator/evaluate.py CHANGED
@@ -14,8 +14,10 @@
 # limitations under the License.
 
 import logging
+import math
 from collections.abc import Sequence
 
+from pydantic import BaseModel
 from ragas import EvaluationDataset
 from ragas import SingleTurnSample
 from ragas.dataset_schema import EvaluationResult
@@ -23,7 +25,9 @@ from ragas.llms import LangchainLLMWrapper
 from ragas.metrics import Metric
 from tqdm import tqdm
 
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
 from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
 from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
@@ -33,21 +37,45 @@ logger = logging.getLogger(__name__)
 
 class RAGEvaluator:
 
-    def __init__(self, evaluator_llm: LangchainLLMWrapper, metrics: Sequence[Metric]):
+    def __init__(self,
+                 evaluator_llm: LangchainLLMWrapper,
+                 metrics: Sequence[Metric],
+                 max_concurrency=8,
+                 input_obj_field: str | None = None):
         self.evaluator_llm = evaluator_llm
         self.metrics = metrics
+        self.max_concurrency = max_concurrency
+        self.input_obj_field = input_obj_field
 
-    @staticmethod
-    def eval_input_to_ragas(eval_input: EvalInput) -> EvaluationDataset:
+    def extract_input_obj(self, item: EvalInputItem) -> str:
+        """Extracts the input object from EvalInputItem based on the configured input_obj_field."""
+        input_obj = item.input_obj
+        if isinstance(input_obj, BaseModel):
+            if self.input_obj_field and hasattr(input_obj, self.input_obj_field):
+                # If input_obj_field is specified, return the value of that field
+                return str(getattr(input_obj, self.input_obj_field, ""))
+
+            # If no input_obj_field is specified, return the string representation of the model
+            return input_obj.model_dump_json()
+
+        if isinstance(input_obj, dict):
+            # If input_obj is a dict, return the JSON string representation
+            if self.input_obj_field and self.input_obj_field in input_obj:
+                # If input_obj_field is specified, return the value of that field
+                return str(input_obj[self.input_obj_field])
+
+        return str(input_obj)  # Fallback to string representation of the dict
+
+    def eval_input_to_ragas(self, eval_input: EvalInput) -> EvaluationDataset:
         """Converts EvalInput into a Ragas-compatible EvaluationDataset."""
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
-
+        event_filter = [IntermediateStepType.TOOL_END, IntermediateStepType.LLM_END, IntermediateStepType.CUSTOM_END]
         samples = []
 
         intermediate_step_adapter = IntermediateStepAdapter()
         for item in eval_input.eval_input_items:
             # Extract required fields from EvalInputItem
-            user_input = item.input_obj  # Assumes input_obj is a string (modify if needed)
+            user_input = self.extract_input_obj(item)  # Extract input object as string
             reference = item.expected_output_obj  # Reference correct answer
             response = item.output_obj  # Model's generated response
 
@@ -55,7 +83,7 @@ class RAGEvaluator:
             reference_contexts = [""]  # Default to empty context
             # implement context extraction from expected_trajectory
 
-            retrieved_contexts = intermediate_step_adapter.get_context(item.trajectory)
+            retrieved_contexts = intermediate_step_adapter.get_context(item.trajectory, event_filter)
             # implement context extraction from expected_trajectory
 
             # Create a SingleTurnSample
@@ -78,19 +106,29 @@ class RAGEvaluator:
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
         scores: list[dict[str, float]] = results_dataset.scores
+
+        # If Ragas returned no scores, return empty output to avoid downstream errors
         if not scores:
-            logger.error("Ragas returned empty score list")
+            logger.warning("Ragas returned empty score list")
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
-        # Convert from list of dicts to dict of lists
-        scores_dict = {metric: [score[metric] for score in scores] for metric in scores[0]}
+        def _nan_to_zero(v: float | None) -> float:
+            """Convert NaN or None to 0.0 for safe arithmetic/serialization."""
+            return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v
+
+        # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0
+        scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
+        first_metric_name = list(scores_dict.keys())[0] if scores_dict else None
 
-        # Compute the average of each metric
-        average_scores = {metric: sum(values) / len(values) for metric, values in scores_dict.items()}
+        # Compute the average of each metric, guarding against empty lists
+        average_scores = {
+            metric: (sum(values) / len(values) if values else 0.0)
+            for metric, values in scores_dict.items()
+        }
 
-        # Extract the first (and only) metric's average score
-        first_avg_score = next(iter(average_scores.values()))
-        first_metric_name = list(scores_dict.keys())[0]
+        first_avg_score = average_scores.get(list(scores_dict.keys())[0], 0.0)
+        if isinstance(first_avg_score, float) and math.isnan(first_avg_score):
+            first_avg_score = 0.0
 
         df = results_dataset.to_pandas()
         # Get id from eval_input if df size matches number of eval_input_items
@@ -103,7 +141,7 @@ class RAGEvaluator:
         eval_output_items = [
             EvalOutputItem(
                 id=ids[i],
-                score=getattr(row, first_metric_name, 0.0),
+                score=_nan_to_zero(getattr(row, first_metric_name, 0.0) if first_metric_name else 0.0),
                 reasoning={
                     key:
                     getattr(row, key, None)  # Use getattr to safely access attributes
@@ -116,6 +154,7 @@ class RAGEvaluator:
     async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
         """Run Ragas metrics evaluation on the provided EvalInput"""
         from ragas import evaluate as ragas_evaluate
+        from ragas.run_config import RunConfig
 
         ragas_dataset = self.eval_input_to_ragas(eval_input)
         tqdm_position = TqdmPositionRegistry.claim()
@@ -126,6 +165,7 @@ class RAGEvaluator:
                                      metrics=self.metrics,
                                      show_progress=True,
                                      llm=self.evaluator_llm,
+                                     run_config=RunConfig(max_workers=self.max_concurrency),
                                      _pbar=pbar)
         except Exception as e:
             # On exception we still continue with other evaluators. Log and return an avg_score of 0.0
aiq/eval/rag_evaluator/register.py CHANGED
@@ -47,6 +47,8 @@ class RagasEvaluatorConfig(EvaluatorBaseConfig, name="ragas"):
     # Ragas metric
     metric: str | dict[str, RagasMetricConfig] = Field(default="AnswerAccuracy",
                                                        description="RAGAS metric callable with optional 'kwargs:'")
+    input_obj_field: str | None = Field(
+        default=None, description="The field in the input object that contains the content to evaluate.")
 
     @model_validator(mode="before")
     @classmethod
@@ -133,6 +135,9 @@ async def register_ragas_evaluator(config: RagasEvaluatorConfig, builder: EvalBu
         metrics.append(metric_callable(**kwargs))
 
     # Create the RAG evaluator
-    _evaluator = RAGEvaluator(evaluator_llm=llm, metrics=metrics) if metrics else None
+    _evaluator = RAGEvaluator(evaluator_llm=llm,
+                              metrics=metrics,
+                              max_concurrency=builder.get_max_concurrency(),
+                              input_obj_field=config.input_obj_field) if metrics else None
 
     yield EvaluatorInfo(config=config, evaluate_fn=evaluate_fn, description="Evaluator for RAGAS metrics")
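A simplified sketch of what the new input_obj_field option does inside RAGEvaluator.extract_input_obj: when the workflow input is a Pydantic model or a dict, only the named field is forwarded to Ragas as the user input. The ChatRequest model and the "question" field below are illustrative assumptions, not part of the toolkit:

from pydantic import BaseModel


class ChatRequest(BaseModel):
    question: str
    metadata: dict = {}


def extract_input_obj(input_obj, input_obj_field: str | None = None) -> str:
    # Mirrors the selection logic added in aiq/eval/rag_evaluator/evaluate.py
    if isinstance(input_obj, BaseModel):
        if input_obj_field and hasattr(input_obj, input_obj_field):
            return str(getattr(input_obj, input_obj_field, ""))
        return input_obj.model_dump_json()
    if isinstance(input_obj, dict) and input_obj_field and input_obj_field in input_obj:
        return str(input_obj[input_obj_field])
    return str(input_obj)


print(extract_input_obj(ChatRequest(question="What changed in 1.2.0rc1?"), "question"))
print(extract_input_obj({"question": "What changed in 1.2.0rc1?"}, "question"))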