aiqtoolkit 1.2.0.dev0__py3-none-any.whl → 1.2.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiqtoolkit might be problematic.

Files changed (220)
  1. aiq/agent/base.py +170 -8
  2. aiq/agent/dual_node.py +1 -1
  3. aiq/agent/react_agent/agent.py +146 -112
  4. aiq/agent/react_agent/prompt.py +1 -6
  5. aiq/agent/react_agent/register.py +36 -35
  6. aiq/agent/rewoo_agent/agent.py +36 -35
  7. aiq/agent/rewoo_agent/register.py +2 -2
  8. aiq/agent/tool_calling_agent/agent.py +3 -7
  9. aiq/agent/tool_calling_agent/register.py +1 -1
  10. aiq/authentication/__init__.py +14 -0
  11. aiq/authentication/api_key/__init__.py +14 -0
  12. aiq/authentication/api_key/api_key_auth_provider.py +92 -0
  13. aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
  14. aiq/authentication/api_key/register.py +26 -0
  15. aiq/authentication/exceptions/__init__.py +14 -0
  16. aiq/authentication/exceptions/api_key_exceptions.py +38 -0
  17. aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
  18. aiq/authentication/exceptions/call_back_exceptions.py +38 -0
  19. aiq/authentication/exceptions/request_exceptions.py +54 -0
  20. aiq/authentication/http_basic_auth/__init__.py +0 -0
  21. aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  22. aiq/authentication/http_basic_auth/register.py +30 -0
  23. aiq/authentication/interfaces.py +93 -0
  24. aiq/authentication/oauth2/__init__.py +14 -0
  25. aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  26. aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  27. aiq/authentication/oauth2/register.py +25 -0
  28. aiq/authentication/register.py +21 -0
  29. aiq/builder/builder.py +64 -2
  30. aiq/builder/component_utils.py +16 -3
  31. aiq/builder/context.py +37 -0
  32. aiq/builder/eval_builder.py +43 -2
  33. aiq/builder/function.py +44 -12
  34. aiq/builder/function_base.py +1 -1
  35. aiq/builder/intermediate_step_manager.py +6 -8
  36. aiq/builder/user_interaction_manager.py +3 -0
  37. aiq/builder/workflow.py +23 -18
  38. aiq/builder/workflow_builder.py +421 -61
  39. aiq/cli/commands/info/list_mcp.py +103 -16
  40. aiq/cli/commands/sizing/__init__.py +14 -0
  41. aiq/cli/commands/sizing/calc.py +294 -0
  42. aiq/cli/commands/sizing/sizing.py +27 -0
  43. aiq/cli/commands/start.py +2 -1
  44. aiq/cli/entrypoint.py +2 -0
  45. aiq/cli/register_workflow.py +80 -0
  46. aiq/cli/type_registry.py +151 -30
  47. aiq/data_models/api_server.py +124 -12
  48. aiq/data_models/authentication.py +231 -0
  49. aiq/data_models/common.py +35 -7
  50. aiq/data_models/component.py +17 -9
  51. aiq/data_models/component_ref.py +33 -0
  52. aiq/data_models/config.py +60 -3
  53. aiq/data_models/dataset_handler.py +2 -1
  54. aiq/data_models/embedder.py +1 -0
  55. aiq/data_models/evaluate.py +23 -0
  56. aiq/data_models/function_dependencies.py +8 -0
  57. aiq/data_models/interactive.py +10 -1
  58. aiq/data_models/intermediate_step.py +38 -5
  59. aiq/data_models/its_strategy.py +30 -0
  60. aiq/data_models/llm.py +1 -0
  61. aiq/data_models/memory.py +1 -0
  62. aiq/data_models/object_store.py +44 -0
  63. aiq/data_models/profiler.py +1 -0
  64. aiq/data_models/retry_mixin.py +35 -0
  65. aiq/data_models/span.py +187 -0
  66. aiq/data_models/telemetry_exporter.py +2 -2
  67. aiq/embedder/nim_embedder.py +2 -1
  68. aiq/embedder/openai_embedder.py +2 -1
  69. aiq/eval/config.py +19 -1
  70. aiq/eval/dataset_handler/dataset_handler.py +87 -2
  71. aiq/eval/evaluate.py +208 -27
  72. aiq/eval/evaluator/base_evaluator.py +73 -0
  73. aiq/eval/evaluator/evaluator_model.py +1 -0
  74. aiq/eval/intermediate_step_adapter.py +11 -5
  75. aiq/eval/rag_evaluator/evaluate.py +55 -15
  76. aiq/eval/rag_evaluator/register.py +6 -1
  77. aiq/eval/remote_workflow.py +7 -2
  78. aiq/eval/runners/__init__.py +14 -0
  79. aiq/eval/runners/config.py +39 -0
  80. aiq/eval/runners/multi_eval_runner.py +54 -0
  81. aiq/eval/trajectory_evaluator/evaluate.py +22 -65
  82. aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
  83. aiq/eval/tunable_rag_evaluator/register.py +2 -0
  84. aiq/eval/usage_stats.py +41 -0
  85. aiq/eval/utils/output_uploader.py +10 -1
  86. aiq/eval/utils/weave_eval.py +184 -0
  87. aiq/experimental/__init__.py +0 -0
  88. aiq/experimental/decorators/__init__.py +0 -0
  89. aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
  90. aiq/experimental/inference_time_scaling/__init__.py +0 -0
  91. aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
  92. aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
  93. aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
  94. aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
  95. aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
  96. aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
  97. aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
  98. aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
  99. aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
  100. aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
  101. aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
  102. aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
  103. aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
  104. aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
  105. aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
  106. aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
  107. aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
  108. aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
  109. aiq/experimental/inference_time_scaling/register.py +36 -0
  110. aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
  111. aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
  112. aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
  113. aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
  114. aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
  115. aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
  116. aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
  117. aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
  118. aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
  119. aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
  120. aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
  121. aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
  122. aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
  123. aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
  124. aiq/front_ends/console/authentication_flow_handler.py +233 -0
  125. aiq/front_ends/console/console_front_end_plugin.py +11 -2
  126. aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  127. aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  128. aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  129. aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
  130. aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  131. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
  132. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
  133. aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
  134. aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  135. aiq/front_ends/fastapi/job_store.py +47 -25
  136. aiq/front_ends/fastapi/main.py +2 -0
  137. aiq/front_ends/fastapi/message_handler.py +108 -89
  138. aiq/front_ends/fastapi/step_adaptor.py +2 -1
  139. aiq/llm/aws_bedrock_llm.py +57 -0
  140. aiq/llm/nim_llm.py +2 -1
  141. aiq/llm/openai_llm.py +3 -2
  142. aiq/llm/register.py +1 -0
  143. aiq/meta/pypi.md +12 -12
  144. aiq/object_store/__init__.py +20 -0
  145. aiq/object_store/in_memory_object_store.py +74 -0
  146. aiq/object_store/interfaces.py +84 -0
  147. aiq/object_store/models.py +36 -0
  148. aiq/object_store/register.py +20 -0
  149. aiq/observability/__init__.py +14 -0
  150. aiq/observability/exporter/__init__.py +14 -0
  151. aiq/observability/exporter/base_exporter.py +449 -0
  152. aiq/observability/exporter/exporter.py +78 -0
  153. aiq/observability/exporter/file_exporter.py +33 -0
  154. aiq/observability/exporter/processing_exporter.py +269 -0
  155. aiq/observability/exporter/raw_exporter.py +52 -0
  156. aiq/observability/exporter/span_exporter.py +264 -0
  157. aiq/observability/exporter_manager.py +335 -0
  158. aiq/observability/mixin/__init__.py +14 -0
  159. aiq/observability/mixin/batch_config_mixin.py +26 -0
  160. aiq/observability/mixin/collector_config_mixin.py +23 -0
  161. aiq/observability/mixin/file_mixin.py +288 -0
  162. aiq/observability/mixin/file_mode.py +23 -0
  163. aiq/observability/mixin/resource_conflict_mixin.py +134 -0
  164. aiq/observability/mixin/serialize_mixin.py +61 -0
  165. aiq/observability/mixin/type_introspection_mixin.py +183 -0
  166. aiq/observability/processor/__init__.py +14 -0
  167. aiq/observability/processor/batching_processor.py +316 -0
  168. aiq/observability/processor/intermediate_step_serializer.py +28 -0
  169. aiq/observability/processor/processor.py +68 -0
  170. aiq/observability/register.py +36 -39
  171. aiq/observability/utils/__init__.py +14 -0
  172. aiq/observability/utils/dict_utils.py +236 -0
  173. aiq/observability/utils/time_utils.py +31 -0
  174. aiq/profiler/calc/__init__.py +14 -0
  175. aiq/profiler/calc/calc_runner.py +623 -0
  176. aiq/profiler/calc/calculations.py +288 -0
  177. aiq/profiler/calc/data_models.py +176 -0
  178. aiq/profiler/calc/plot.py +345 -0
  179. aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
  180. aiq/profiler/data_models.py +24 -0
  181. aiq/profiler/inference_metrics_model.py +3 -0
  182. aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
  183. aiq/profiler/inference_optimization/data_models.py +2 -2
  184. aiq/profiler/inference_optimization/llm_metrics.py +2 -2
  185. aiq/profiler/profile_runner.py +61 -21
  186. aiq/runtime/loader.py +9 -3
  187. aiq/runtime/runner.py +23 -9
  188. aiq/runtime/session.py +25 -7
  189. aiq/runtime/user_metadata.py +2 -3
  190. aiq/tool/chat_completion.py +74 -0
  191. aiq/tool/code_execution/README.md +152 -0
  192. aiq/tool/code_execution/code_sandbox.py +151 -72
  193. aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
  194. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
  195. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
  196. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
  197. aiq/tool/code_execution/register.py +7 -3
  198. aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
  199. aiq/tool/mcp/exceptions.py +142 -0
  200. aiq/tool/mcp/mcp_client.py +41 -6
  201. aiq/tool/mcp/mcp_tool.py +3 -2
  202. aiq/tool/register.py +1 -0
  203. aiq/tool/server_tools.py +6 -3
  204. aiq/utils/exception_handlers/automatic_retries.py +289 -0
  205. aiq/utils/exception_handlers/mcp.py +211 -0
  206. aiq/utils/io/model_processing.py +28 -0
  207. aiq/utils/log_utils.py +37 -0
  208. aiq/utils/string_utils.py +38 -0
  209. aiq/utils/type_converter.py +18 -2
  210. aiq/utils/type_utils.py +87 -0
  211. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/METADATA +53 -21
  212. aiqtoolkit-1.2.0rc1.dist-info/RECORD +436 -0
  213. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/WHEEL +1 -1
  214. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/entry_points.txt +3 -0
  215. aiq/front_ends/fastapi/websocket.py +0 -148
  216. aiq/observability/async_otel_listener.py +0 -429
  217. aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
  218. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  219. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/licenses/LICENSE.md +0 -0
  220. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/top_level.txt +0 -0
aiq/eval/evaluate.py CHANGED
@@ -18,18 +18,25 @@ import logging
 import shutil
 from pathlib import Path
 from typing import Any
+from uuid import uuid4
 
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from aiq.data_models.evaluate import EvalConfig
+from aiq.data_models.evaluate import JobEvictionPolicy
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.config import EvaluationRunOutput
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
 from aiq.eval.evaluator.evaluator_model import EvalInput
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
 from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.usage_stats import UsageStats
+from aiq.eval.usage_stats import UsageStatsItem
+from aiq.eval.usage_stats import UsageStatsLLM
 from aiq.eval.utils.output_uploader import OutputUploader
+from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
+from aiq.profiler.data_models import ProfilerResults
 from aiq.runtime.session import AIQSessionManager
 
 logger = logging.getLogger(__name__)
@@ -52,7 +59,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
         # Helpers
         self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
-
+        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
         # Metadata
         self.eval_input: EvalInput | None = None
         self.workflow_interrupted: bool = False
@@ -60,12 +67,68 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
         # evaluation_results is list of tuples (evaluator_name, EvalOutput)
         self.evaluation_results: list[tuple[str, EvalOutput]] = []
 
+        # usage stats
+        self.usage_stats: UsageStats = UsageStats()
+
         # workflow output file
         self.workflow_output_file: Path | None = None
 
         # evaluation output files
         self.evaluator_output_files: list[Path] = []
 
+    def _compute_usage_stats(self, item: EvalInputItem):
+        """Compute usage stats for a single item using the intermediate steps"""
+        # get the prompt and completion tokens from the intermediate steps
+        from aiq.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
+        steps = [IntermediatePropertyAdaptor.from_intermediate_step(step) for step in item.trajectory]
+        usage_stats_per_llm = {}
+        total_tokens = 0
+        for step in steps:
+            if step.event_type == "LLM_END":
+                llm_name = step.llm_name
+                if llm_name not in usage_stats_per_llm:
+                    usage_stats_per_llm[llm_name] = UsageStatsLLM()
+                usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
+                usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
+                usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
+                total_tokens += step.token_usage.total_tokens
+
+        # find min and max event timestamps
+        if item.trajectory:
+            min_timestamp = min(step.event_timestamp for step in item.trajectory)
+            max_timestamp = max(step.event_timestamp for step in item.trajectory)
+            runtime = max_timestamp - min_timestamp
+        else:
+            min_timestamp = 0.0
+            max_timestamp = 0.0
+            runtime = 0.0
+
+        # find llm latency by calculating p95 of all llm calls
+        llm_latencies = []
+        previous_llm_start_time = None
+        for step in steps:
+            if step.event_type == "LLM_START":
+                previous_llm_start_time = step.event_timestamp
+            elif step.event_type == "LLM_END" and previous_llm_start_time is not None:
+                llm_latencies.append(step.event_timestamp - previous_llm_start_time)
+                previous_llm_start_time = None
+
+        # Calculate p95 LLM latency (or 0 if no LLM calls)
+        if llm_latencies:
+            import numpy as np
+            llm_latency = float(np.percentile(llm_latencies, 95))
+        else:
+            llm_latency = 0.0
+
+        # add the usage stats to the usage stats dict
+        self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
+                                                                     runtime=runtime,
+                                                                     total_tokens=total_tokens,
+                                                                     min_timestamp=min_timestamp,
+                                                                     max_timestamp=max_timestamp,
+                                                                     llm_latency=llm_latency)
+        return self.usage_stats.usage_stats_items[item.id]
+
     async def run_workflow_local(self, session_manager: AIQSessionManager):
         '''
         Launch the workflow with the specified questions and extract the output using the jsonpath
@@ -84,15 +147,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                 return "", []
 
            async with session_manager.run(item.input_obj) as runner:
+                if not session_manager.workflow.has_single_output:
+                    # raise an error if the workflow has multiple outputs
+                    raise NotImplementedError("Multiple outputs are not supported")
+
+                runner_result = None
+                intermediate_future = None
+
                try:
+
                    # Start usage stats and intermediate steps collection in parallel
                    intermediate_future = pull_intermediate()
-
-                    if session_manager.workflow.has_single_output:
-                        base_output = await runner.result()
-                    else:
-                        # raise an error if the workflow has multiple outputs
-                        raise NotImplementedError("Multiple outputs are not supported")
+                    runner_result = runner.result()
+                    base_output = await runner_result
                    intermediate_steps = await intermediate_future
                except NotImplementedError as e:
                    # raise original error
@@ -101,6 +168,13 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                    logger.exception("Failed to run the workflow: %s", e, exc_info=True)
                    # stop processing if a workflow error occurs
                    self.workflow_interrupted = True
+
+                    # Cancel any coroutines that are still running, avoiding a warning about unawaited coroutines
+                    # (typically one of these two is what raised the exception and the other is still running)
+                    for coro in (runner_result, intermediate_future):
+                        if coro is not None:
+                            asyncio.ensure_future(coro).cancel()
+
                    stop_event.set()
                    return
 
@@ -124,6 +198,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
            item.output_obj = output
            item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
+            usage_stats_item = self._compute_usage_stats(item)
+
+            self.weave_eval.log_prediction(item, output)
+            await self.weave_eval.log_usage_stats(item, usage_stats_item)
 
        async def wrapped_run(item: EvalInputItem) -> None:
            await run_one(item)
@@ -145,15 +223,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        from aiq.eval.remote_workflow import EvaluationRemoteWorkflowHandler
        handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
        await handler.run_workflow_remote(self.eval_input)
+        for item in self.eval_input.eval_input_items:
+            usage_stats_item = self._compute_usage_stats(item)
+            self.weave_eval.log_prediction(item, item.output_obj)
+            await self.weave_eval.log_usage_stats(item, usage_stats_item)
 
-    async def profile_workflow(self):
+    async def profile_workflow(self) -> ProfilerResults:
        """
        Profile a dataset
        """
 
        if not self.eval_config.general.profiler:
            logger.info("Profiler is not enabled. Skipping profiling.")
-            return
+            return ProfilerResults()
 
        from aiq.profiler.profile_runner import ProfilerRunner
 
@@ -161,18 +243,70 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        for input_item in self.eval_input.eval_input_items:
            all_stats.append(input_item.trajectory)
 
-        profiler_runner = ProfilerRunner(self.eval_config.general.profiler, self.eval_config.general.output_dir)
+        profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
+                                         self.eval_config.general.output_dir,
+                                         write_output=self.config.write_output)
 
-        await profiler_runner.run(all_stats)
+        return await profiler_runner.run(all_stats)
 
    def cleanup_output_directory(self):
        '''Remove contents of the output directory if it exists'''
-        if self.eval_config.general.output and self.eval_config.general.output.dir and \
-            self.eval_config.general.output.dir.exists():
-            logger.info("Cleaning up output directory %s", self.eval_config.general.output.dir)
-            shutil.rmtree(self.eval_config.general.output.dir)
+        output_config = self.eval_config.general.output
+        output_dir = output_config.dir
+
+        if not (output_config and output_dir.exists()):
+            return
+
+        # If cleanup is true, remove the entire directory and we are done
+        if output_config.cleanup:
+            logger.info("Cleaning up entire output directory: %s", output_config.dir)
+            shutil.rmtree(output_config.dir)
+            return
+
+        if output_config.job_management.max_jobs == 0:
+            # No eviction policy
+            return
+
+        base_dir = output_dir / "jobs"
+        if not base_dir.exists():
+            return
 
-    def write_output(self, dataset_handler: DatasetHandler):
+        # Get all subdirectories, which represent individual job runs
+        job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
+        if len(job_dirs) <= output_config.job_management.max_jobs:
+            return
+
+        # Determine sort key based on eviction_policy, defaulting to creation time
+        if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:
+
+            def sort_key(x):
+                return x.stat().st_mtime
+
+            logger.info("Using last modified time for job eviction policy.")
+        else:
+
+            def sort_key(x):
+                return x.stat().st_ctime
+
+            logger.info("Using creation time for job eviction policy.")
+
+        # Sort directories (oldest first)
+        job_dirs.sort(key=sort_key)
+        num_to_delete = len(job_dirs) - output_config.job_management.max_jobs
+
+        logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
+                    len(job_dirs),
+                    output_config.job_management.max_jobs,
+                    num_to_delete)
+
+        for dir_to_delete in job_dirs[:num_to_delete]:
+            try:
+                logger.info("Deleting old job directory: %s", dir_to_delete)
+                shutil.rmtree(dir_to_delete)
+            except Exception as e:
+                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
+
+    def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
        workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
        workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
 
@@ -198,6 +332,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
            self.evaluator_output_files.append(output_file)
            logger.info("Evaluation results written to %s", output_file)
 
+    def publish_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
+        """Publish the output"""
+        if self.config.write_output:
+            self.write_output(dataset_handler, profiler_results)
+
        if self.workflow_interrupted:
            # Issue a warning if the workflow was not completed on all datasets
            msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
@@ -205,11 +344,15 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                   "`eval` with the --skip_completed_entries flag.")
            logger.warning(msg)
 
+        self.weave_eval.log_summary(self.usage_stats, self.evaluation_results, profiler_results)
+
    async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
        """Run a single evaluator and store its results."""
        try:
            eval_output = await evaluator.evaluate_fn(self.eval_input)
            self.evaluation_results.append((evaluator_name, eval_output))
+
+            await self.weave_eval.alog_score(eval_output, evaluator_name)
        except Exception as e:
            logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
 
@@ -226,6 +369,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        except Exception as e:
            logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
            raise
+        finally:
+            # Finish prediction loggers in Weave
+            await self.weave_eval.afinish_loggers()
 
    def apply_overrides(self):
        from aiq.cli.cli_utils.config_override import load_and_override_config
@@ -241,6 +387,16 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        config = validate_schema(config_dict, AIQConfig)
        return config
 
+    def _get_workflow_alias(self, workflow_type: str | None = None):
+        """Get the workflow alias for displaying in evaluation UI."""
+        if self.eval_config.general.workflow_alias:
+            return self.eval_config.general.workflow_alias
+
+        if not workflow_type or workflow_type == "EmptyFunctionConfig":
+            return "aiqtoolkit-eval"
+
+        return workflow_type
+
    async def run_and_evaluate(self,
                               session_manager: AIQSessionManager | None = None,
                               job_id: str | None = None) -> EvaluationRunOutput:
@@ -258,12 +414,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
        else:
            config = load_config(self.config.config_file)
            self.eval_config = config.eval
-        logger.debug("Loaded evaluation configuration: %s", self.eval_config)
+        workflow_alias = self._get_workflow_alias(config.workflow.type)
+        logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)
 
        # Cleanup the output directory
-        if self.eval_config.general.output and self.eval_config.general.output.cleanup:
+        if self.eval_config.general.output:
            self.cleanup_output_directory()
 
+        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
+        if (self.eval_config.general.output
+                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
+            job_id = "job_" + str(uuid4())
+            logger.info("Generated job ID for output directory: %s", job_id)
+
        # If a job id is provided keep the data per-job
        if job_id:
            self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
@@ -281,7 +444,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                workflow_interrupted=self.workflow_interrupted,
            )
 
-        dataset_handler = DatasetHandler(dataset_config=dataset_config, reps=self.config.reps)
+        dataset_handler = DatasetHandler(dataset_config=dataset_config,
+                                         reps=self.config.reps,
+                                         concurrency=self.eval_config.general.max_concurrency,
+                                         num_passes=self.config.num_passes,
+                                         adjust_dataset_size=self.config.adjust_dataset_size)
        self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
        if not self.eval_input.eval_input_items:
            logger.info("Dataset is empty. Nothing to evaluate.")
@@ -293,6 +460,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
        # Run workflow and evaluate
        async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
+            # Initialize Weave integration
+            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
+
+            # Run workflow
            if self.config.endpoint:
                await self.run_workflow_remote()
            else:
@@ -307,10 +478,18 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
            await self.run_evaluators(evaluators)
 
            # Profile the workflow
-            await self.profile_workflow()
+            profiler_results = await self.profile_workflow()
+
+            # compute total runtime
+            if self.usage_stats.usage_stats_items:
+                self.usage_stats.total_runtime = max(self.usage_stats.usage_stats_items.values(),
+                                                     key=lambda x: x.max_timestamp).max_timestamp - \
+                    min(self.usage_stats.usage_stats_items.values(), key=lambda x: x.min_timestamp).min_timestamp
+            else:
+                self.usage_stats.total_runtime = 0.0
 
-            # Write the results to the output directory
-            self.write_output(dataset_handler)
+            # Publish the results
+            self.publish_output(dataset_handler, profiler_results)
 
            # Run custom scripts and upload evaluation outputs to S3
            if self.eval_config.general.output:
@@ -318,8 +497,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                output_uploader.run_custom_scripts()
                await output_uploader.upload_directory()
 
-        return EvaluationRunOutput(
-            workflow_output_file=self.workflow_output_file,
-            evaluator_output_files=self.evaluator_output_files,
-            workflow_interrupted=self.workflow_interrupted,
-        )
+        return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                   evaluator_output_files=self.evaluator_output_files,
+                                   workflow_interrupted=self.workflow_interrupted,
+                                   eval_input=self.eval_input,
+                                   evaluation_results=self.evaluation_results,
+                                   usage_stats=self.usage_stats,
+                                   profiler_results=profiler_results)
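The new _compute_usage_stats helper above pairs each LLM_START with the following LLM_END, takes the 95th percentile of the gaps as the per-item LLM latency, and derives the item runtime from the earliest and latest event timestamps. A minimal standalone sketch of that arithmetic, using made-up (event_type, timestamp) tuples rather than real IntermediateStep objects:

import numpy as np

# Illustrative event stream; the values are not taken from the diff
events = [("LLM_START", 0.0), ("LLM_END", 1.2), ("TOOL_END", 1.5), ("LLM_START", 2.0), ("LLM_END", 2.8)]

llm_latencies = []
start = None
for event_type, timestamp in events:
    if event_type == "LLM_START":
        start = timestamp
    elif event_type == "LLM_END" and start is not None:
        llm_latencies.append(timestamp - start)
        start = None

# p95 latency across LLM calls, or 0.0 if there were none
llm_latency = float(np.percentile(llm_latencies, 95)) if llm_latencies else 0.0
# item runtime = span between the earliest and latest event timestamps
runtime = max(t for _, t in events) - min(t for _, t in events) if events else 0.0
print(llm_latency, runtime)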
aiq/eval/evaluator/base_evaluator.py ADDED
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from abc import ABC
+from abc import abstractmethod
+
+from tqdm import tqdm
+
+from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
+from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.evaluator.evaluator_model import EvalOutputItem
+from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
+
+
+class BaseEvaluator(ABC):
+    """
+    Base class for custom evaluators.
+
+    Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
+    single EvalInputItem.
+    """
+
+    def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
+        self.max_concurrency = max_concurrency
+        self.semaphore = asyncio.Semaphore(max_concurrency)
+        self.tqdm_desc = tqdm_desc
+
+    @abstractmethod
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Each evaluator must implement this for item-level evaluation"""
+        pass
+
+    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+        pbar = None
+        try:
+            tqdm_position = TqdmPositionRegistry.claim()
+            pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)
+
+            async def wrapped(item):
+                async with self.semaphore:
+                    try:
+                        output_item = await self.evaluate_item(item)
+                        pbar.update(1)
+                        return output_item
+                    except Exception as e:
+                        # If the evaluator fails, return an error item with a score of 0.0
+                        pbar.update(1)
+                        return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})
+
+            output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
+        finally:
+            pbar.close()
+            TqdmPositionRegistry.release(tqdm_position)
+
+        # Compute average if possible
+        numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
+        avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
+
+        return EvalOutput(average_score=avg_score, eval_output_items=output_items)
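A minimal sketch of how a plugin might build on the new BaseEvaluator; the ExactMatchEvaluator name and its scoring rule are hypothetical, not part of the toolkit:

from aiq.eval.evaluator.base_evaluator import BaseEvaluator
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutputItem


class ExactMatchEvaluator(BaseEvaluator):
    """Scores 1.0 when the generated output matches the expected output, else 0.0."""

    def __init__(self, max_concurrency: int = 4):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Exact match")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        matched = str(item.output_obj).strip() == str(item.expected_output_obj).strip()
        return EvalOutputItem(id=item.id,
                              score=1.0 if matched else 0.0,
                              reasoning={"expected": str(item.expected_output_obj)})

The concurrency limiting, progress bar, per-item error handling, and average-score computation all come from BaseEvaluator.evaluate, so a subclass only supplies the item-level logic.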
aiq/eval/evaluator/evaluator_model.py CHANGED
@@ -27,6 +27,7 @@ class EvalInputItem(BaseModel):
     output_obj: typing.Any
     expected_trajectory: list[IntermediateStep]
     trajectory: list[IntermediateStep]
+    full_dataset_entry: typing.Any
 
 
 class EvalInput(BaseModel):
aiq/eval/intermediate_step_adapter.py CHANGED
@@ -79,15 +79,21 @@ class IntermediateStepAdapter:
         for step in steps:
             if step.event_type == IntermediateStepType.LLM_END:
                 last_llm_end_step = step
+                action = self.get_agent_action_single(step, "")
+                agent_actions.append(action)
             else:
                 action = self.get_agent_action_single(step, last_llm_end_step)
                 agent_actions.append(action)
 
         return agent_actions
 
-    def get_context(self, intermediate_steps: list[IntermediateStep]) -> list[str]:
+    def get_context(self, intermediate_steps: list[IntermediateStep],
+                    event_filter: list[IntermediateStepType]) -> list[str]:
         """Grab the output of all the tools and return them as retrieved context."""
-        return [
-            str(step.data.output) for step in intermediate_steps
-            if step.event_type == IntermediateStepType.TOOL_END and step.data and step.data.output
-        ]
+        count = 0
+        agent_actions = []
+        for step in intermediate_steps:
+            if step.event_type in event_filter and step.data and step.data.output:
+                agent_actions.append(f"**Step {count}**\n{str(step.data.output)}")
+                count += 1
+        return agent_actions
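A simplified sketch of the behavior change in get_context: previously only TOOL_END outputs were collected, whereas now any step whose event type appears in the caller-supplied filter contributes a numbered context entry. The lightweight Step stand-in below is illustrative, not the real IntermediateStep model:

from dataclasses import dataclass


@dataclass
class Step:
    event_type: str
    output: str | None


def get_context(steps: list[Step], event_filter: list[str]) -> list[str]:
    context = []
    count = 0
    for step in steps:
        if step.event_type in event_filter and step.output:
            context.append(f"**Step {count}**\n{step.output}")
            count += 1
    return context


steps = [Step("TOOL_END", "search results"), Step("LLM_END", "draft answer"), Step("LLM_START", None)]
print(get_context(steps, ["TOOL_END", "LLM_END", "CUSTOM_END"]))  # both outputs are included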
aiq/eval/rag_evaluator/evaluate.py CHANGED
@@ -14,8 +14,10 @@
 # limitations under the License.
 
 import logging
+import math
 from collections.abc import Sequence
 
+from pydantic import BaseModel
 from ragas import EvaluationDataset
 from ragas import SingleTurnSample
 from ragas.dataset_schema import EvaluationResult
@@ -23,7 +25,9 @@ from ragas.llms import LangchainLLMWrapper
 from ragas.metrics import Metric
 from tqdm import tqdm
 
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
 from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
 from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
@@ -33,21 +37,45 @@ logger = logging.getLogger(__name__)
 
 class RAGEvaluator:
 
-    def __init__(self, evaluator_llm: LangchainLLMWrapper, metrics: Sequence[Metric]):
+    def __init__(self,
+                 evaluator_llm: LangchainLLMWrapper,
+                 metrics: Sequence[Metric],
+                 max_concurrency=8,
+                 input_obj_field: str | None = None):
         self.evaluator_llm = evaluator_llm
         self.metrics = metrics
+        self.max_concurrency = max_concurrency
+        self.input_obj_field = input_obj_field
 
-    @staticmethod
-    def eval_input_to_ragas(eval_input: EvalInput) -> EvaluationDataset:
+    def extract_input_obj(self, item: EvalInputItem) -> str:
+        """Extracts the input object from EvalInputItem based on the configured input_obj_field."""
+        input_obj = item.input_obj
+        if isinstance(input_obj, BaseModel):
+            if self.input_obj_field and hasattr(input_obj, self.input_obj_field):
+                # If input_obj_field is specified, return the value of that field
+                return str(getattr(input_obj, self.input_obj_field, ""))
+
+            # If no input_obj_field is specified, return the string representation of the model
+            return input_obj.model_dump_json()
+
+        if isinstance(input_obj, dict):
+            # If input_obj is a dict, return the JSON string representation
+            if self.input_obj_field and self.input_obj_field in input_obj:
+                # If input_obj_field is specified, return the value of that field
+                return str(input_obj[self.input_obj_field])
+
+        return str(input_obj)  # Fallback to string representation of the dict
+
+    def eval_input_to_ragas(self, eval_input: EvalInput) -> EvaluationDataset:
         """Converts EvalInput into a Ragas-compatible EvaluationDataset."""
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
-
+        event_filter = [IntermediateStepType.TOOL_END, IntermediateStepType.LLM_END, IntermediateStepType.CUSTOM_END]
         samples = []
 
         intermediate_step_adapter = IntermediateStepAdapter()
         for item in eval_input.eval_input_items:
             # Extract required fields from EvalInputItem
-            user_input = item.input_obj  # Assumes input_obj is a string (modify if needed)
+            user_input = self.extract_input_obj(item)  # Extract input object as string
             reference = item.expected_output_obj  # Reference correct answer
             response = item.output_obj  # Model's generated response
 
@@ -55,7 +83,7 @@ class RAGEvaluator:
             reference_contexts = [""]  # Default to empty context
             # implement context extraction from expected_trajectory
 
-            retrieved_contexts = intermediate_step_adapter.get_context(item.trajectory)
+            retrieved_contexts = intermediate_step_adapter.get_context(item.trajectory, event_filter)
             # implement context extraction from expected_trajectory
 
             # Create a SingleTurnSample
@@ -78,19 +106,29 @@ class RAGEvaluator:
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
         scores: list[dict[str, float]] = results_dataset.scores
+
+        # If Ragas returned no scores, return empty output to avoid downstream errors
         if not scores:
-            logger.error("Ragas returned empty score list")
+            logger.warning("Ragas returned empty score list")
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
-        # Convert from list of dicts to dict of lists
-        scores_dict = {metric: [score[metric] for score in scores] for metric in scores[0]}
+        def _nan_to_zero(v: float | None) -> float:
+            """Convert NaN or None to 0.0 for safe arithmetic/serialization."""
+            return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v
+
+        # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0
+        scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
+        first_metric_name = list(scores_dict.keys())[0] if scores_dict else None
 
-        # Compute the average of each metric
-        average_scores = {metric: sum(values) / len(values) for metric, values in scores_dict.items()}
+        # Compute the average of each metric, guarding against empty lists
+        average_scores = {
+            metric: (sum(values) / len(values) if values else 0.0)
+            for metric, values in scores_dict.items()
+        }
 
-        # Extract the first (and only) metric's average score
-        first_avg_score = next(iter(average_scores.values()))
-        first_metric_name = list(scores_dict.keys())[0]
+        first_avg_score = average_scores.get(list(scores_dict.keys())[0], 0.0)
+        if isinstance(first_avg_score, float) and math.isnan(first_avg_score):
+            first_avg_score = 0.0
 
         df = results_dataset.to_pandas()
         # Get id from eval_input if df size matches number of eval_input_items
@@ -103,7 +141,7 @@ class RAGEvaluator:
         eval_output_items = [
             EvalOutputItem(
                 id=ids[i],
-                score=getattr(row, first_metric_name, 0.0),
+                score=_nan_to_zero(getattr(row, first_metric_name, 0.0) if first_metric_name else 0.0),
                 reasoning={
                     key:
                     getattr(row, key, None)  # Use getattr to safely access attributes
@@ -116,6 +154,7 @@ class RAGEvaluator:
     async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
         """Run Ragas metrics evaluation on the provided EvalInput"""
         from ragas import evaluate as ragas_evaluate
+        from ragas.run_config import RunConfig
 
         ragas_dataset = self.eval_input_to_ragas(eval_input)
         tqdm_position = TqdmPositionRegistry.claim()
@@ -126,6 +165,7 @@ class RAGEvaluator:
                                      metrics=self.metrics,
                                      show_progress=True,
                                      llm=self.evaluator_llm,
+                                     run_config=RunConfig(max_workers=self.max_concurrency),
                                      _pbar=pbar)
         except Exception as e:
             # On exception we still continue with other evaluators. Log and return an avg_score of 0.0
aiq/eval/rag_evaluator/register.py CHANGED
@@ -47,6 +47,8 @@ class RagasEvaluatorConfig(EvaluatorBaseConfig, name="ragas"):
     # Ragas metric
     metric: str | dict[str, RagasMetricConfig] = Field(default="AnswerAccuracy",
                                                        description="RAGAS metric callable with optional 'kwargs:'")
+    input_obj_field: str | None = Field(
+        default=None, description="The field in the input object that contains the content to evaluate.")
 
     @model_validator(mode="before")
     @classmethod
@@ -133,6 +135,9 @@ async def register_ragas_evaluator(config: RagasEvaluatorConfig, builder: EvalBu
         metrics.append(metric_callable(**kwargs))
 
     # Create the RAG evaluator
-    _evaluator = RAGEvaluator(evaluator_llm=llm, metrics=metrics) if metrics else None
+    _evaluator = RAGEvaluator(evaluator_llm=llm,
+                              metrics=metrics,
+                              max_concurrency=builder.get_max_concurrency(),
+                              input_obj_field=config.input_obj_field) if metrics else None
 
     yield EvaluatorInfo(config=config, evaluate_fn=evaluate_fn, description="Evaluator for RAGAS metrics")
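A simplified sketch of what the new input_obj_field option does inside RAGEvaluator.extract_input_obj: when the workflow input is a Pydantic model or a dict, only the named field is forwarded to Ragas as the user input. The ChatRequest model and the "question" field below are illustrative assumptions, not part of the toolkit:

from pydantic import BaseModel


class ChatRequest(BaseModel):
    question: str
    metadata: dict = {}


def extract_input_obj(input_obj, input_obj_field: str | None = None) -> str:
    # Mirrors the selection logic added in aiq/eval/rag_evaluator/evaluate.py
    if isinstance(input_obj, BaseModel):
        if input_obj_field and hasattr(input_obj, input_obj_field):
            return str(getattr(input_obj, input_obj_field, ""))
        return input_obj.model_dump_json()
    if isinstance(input_obj, dict) and input_obj_field and input_obj_field in input_obj:
        return str(input_obj[input_obj_field])
    return str(input_obj)


print(extract_input_obj(ChatRequest(question="What changed in 1.2.0rc1?"), "question"))
print(extract_input_obj({"question": "What changed in 1.2.0rc1?"}, "question"))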