aiqtoolkit 1.2.0.dev0-py3-none-any.whl → 1.2.0rc2-py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.

Note: this version of aiqtoolkit has been flagged as a potentially problematic release.

Files changed (220)
  1. aiq/agent/base.py +170 -8
  2. aiq/agent/dual_node.py +1 -1
  3. aiq/agent/react_agent/agent.py +146 -112
  4. aiq/agent/react_agent/prompt.py +1 -6
  5. aiq/agent/react_agent/register.py +36 -35
  6. aiq/agent/rewoo_agent/agent.py +36 -35
  7. aiq/agent/rewoo_agent/register.py +2 -2
  8. aiq/agent/tool_calling_agent/agent.py +3 -7
  9. aiq/agent/tool_calling_agent/register.py +1 -1
  10. aiq/authentication/__init__.py +14 -0
  11. aiq/authentication/api_key/__init__.py +14 -0
  12. aiq/authentication/api_key/api_key_auth_provider.py +92 -0
  13. aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
  14. aiq/authentication/api_key/register.py +26 -0
  15. aiq/authentication/exceptions/__init__.py +14 -0
  16. aiq/authentication/exceptions/api_key_exceptions.py +38 -0
  17. aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
  18. aiq/authentication/exceptions/call_back_exceptions.py +38 -0
  19. aiq/authentication/exceptions/request_exceptions.py +54 -0
  20. aiq/authentication/http_basic_auth/__init__.py +0 -0
  21. aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  22. aiq/authentication/http_basic_auth/register.py +30 -0
  23. aiq/authentication/interfaces.py +93 -0
  24. aiq/authentication/oauth2/__init__.py +14 -0
  25. aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  26. aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  27. aiq/authentication/oauth2/register.py +25 -0
  28. aiq/authentication/register.py +21 -0
  29. aiq/builder/builder.py +64 -2
  30. aiq/builder/component_utils.py +16 -3
  31. aiq/builder/context.py +37 -0
  32. aiq/builder/eval_builder.py +43 -2
  33. aiq/builder/function.py +44 -12
  34. aiq/builder/function_base.py +1 -1
  35. aiq/builder/intermediate_step_manager.py +6 -8
  36. aiq/builder/user_interaction_manager.py +3 -0
  37. aiq/builder/workflow.py +23 -18
  38. aiq/builder/workflow_builder.py +421 -61
  39. aiq/cli/commands/info/list_mcp.py +103 -16
  40. aiq/cli/commands/sizing/__init__.py +14 -0
  41. aiq/cli/commands/sizing/calc.py +294 -0
  42. aiq/cli/commands/sizing/sizing.py +27 -0
  43. aiq/cli/commands/start.py +2 -1
  44. aiq/cli/entrypoint.py +2 -0
  45. aiq/cli/register_workflow.py +80 -0
  46. aiq/cli/type_registry.py +151 -30
  47. aiq/data_models/api_server.py +124 -12
  48. aiq/data_models/authentication.py +231 -0
  49. aiq/data_models/common.py +35 -7
  50. aiq/data_models/component.py +17 -9
  51. aiq/data_models/component_ref.py +33 -0
  52. aiq/data_models/config.py +60 -3
  53. aiq/data_models/dataset_handler.py +2 -1
  54. aiq/data_models/embedder.py +1 -0
  55. aiq/data_models/evaluate.py +23 -0
  56. aiq/data_models/function_dependencies.py +8 -0
  57. aiq/data_models/interactive.py +10 -1
  58. aiq/data_models/intermediate_step.py +38 -5
  59. aiq/data_models/its_strategy.py +30 -0
  60. aiq/data_models/llm.py +1 -0
  61. aiq/data_models/memory.py +1 -0
  62. aiq/data_models/object_store.py +44 -0
  63. aiq/data_models/profiler.py +1 -0
  64. aiq/data_models/retry_mixin.py +35 -0
  65. aiq/data_models/span.py +187 -0
  66. aiq/data_models/telemetry_exporter.py +2 -2
  67. aiq/embedder/nim_embedder.py +2 -1
  68. aiq/embedder/openai_embedder.py +2 -1
  69. aiq/eval/config.py +19 -1
  70. aiq/eval/dataset_handler/dataset_handler.py +87 -2
  71. aiq/eval/evaluate.py +208 -27
  72. aiq/eval/evaluator/base_evaluator.py +73 -0
  73. aiq/eval/evaluator/evaluator_model.py +1 -0
  74. aiq/eval/intermediate_step_adapter.py +11 -5
  75. aiq/eval/rag_evaluator/evaluate.py +55 -15
  76. aiq/eval/rag_evaluator/register.py +6 -1
  77. aiq/eval/remote_workflow.py +7 -2
  78. aiq/eval/runners/__init__.py +14 -0
  79. aiq/eval/runners/config.py +39 -0
  80. aiq/eval/runners/multi_eval_runner.py +54 -0
  81. aiq/eval/trajectory_evaluator/evaluate.py +22 -65
  82. aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
  83. aiq/eval/tunable_rag_evaluator/register.py +2 -0
  84. aiq/eval/usage_stats.py +41 -0
  85. aiq/eval/utils/output_uploader.py +10 -1
  86. aiq/eval/utils/weave_eval.py +184 -0
  87. aiq/experimental/__init__.py +0 -0
  88. aiq/experimental/decorators/__init__.py +0 -0
  89. aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
  90. aiq/experimental/inference_time_scaling/__init__.py +0 -0
  91. aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
  92. aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
  93. aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
  94. aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
  95. aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
  96. aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
  97. aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
  98. aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
  99. aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
  100. aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
  101. aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
  102. aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
  103. aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
  104. aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
  105. aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
  106. aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
  107. aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
  108. aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
  109. aiq/experimental/inference_time_scaling/register.py +36 -0
  110. aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
  111. aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
  112. aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
  113. aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
  114. aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
  115. aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
  116. aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
  117. aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
  118. aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
  119. aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
  120. aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
  121. aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
  122. aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
  123. aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
  124. aiq/front_ends/console/authentication_flow_handler.py +233 -0
  125. aiq/front_ends/console/console_front_end_plugin.py +11 -2
  126. aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  127. aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  128. aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  129. aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
  130. aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  131. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
  132. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
  133. aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
  134. aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  135. aiq/front_ends/fastapi/job_store.py +47 -25
  136. aiq/front_ends/fastapi/main.py +2 -0
  137. aiq/front_ends/fastapi/message_handler.py +108 -89
  138. aiq/front_ends/fastapi/step_adaptor.py +2 -1
  139. aiq/llm/aws_bedrock_llm.py +57 -0
  140. aiq/llm/nim_llm.py +2 -1
  141. aiq/llm/openai_llm.py +3 -2
  142. aiq/llm/register.py +1 -0
  143. aiq/meta/pypi.md +12 -12
  144. aiq/object_store/__init__.py +20 -0
  145. aiq/object_store/in_memory_object_store.py +74 -0
  146. aiq/object_store/interfaces.py +84 -0
  147. aiq/object_store/models.py +36 -0
  148. aiq/object_store/register.py +20 -0
  149. aiq/observability/__init__.py +14 -0
  150. aiq/observability/exporter/__init__.py +14 -0
  151. aiq/observability/exporter/base_exporter.py +449 -0
  152. aiq/observability/exporter/exporter.py +78 -0
  153. aiq/observability/exporter/file_exporter.py +33 -0
  154. aiq/observability/exporter/processing_exporter.py +269 -0
  155. aiq/observability/exporter/raw_exporter.py +52 -0
  156. aiq/observability/exporter/span_exporter.py +264 -0
  157. aiq/observability/exporter_manager.py +335 -0
  158. aiq/observability/mixin/__init__.py +14 -0
  159. aiq/observability/mixin/batch_config_mixin.py +26 -0
  160. aiq/observability/mixin/collector_config_mixin.py +23 -0
  161. aiq/observability/mixin/file_mixin.py +288 -0
  162. aiq/observability/mixin/file_mode.py +23 -0
  163. aiq/observability/mixin/resource_conflict_mixin.py +134 -0
  164. aiq/observability/mixin/serialize_mixin.py +61 -0
  165. aiq/observability/mixin/type_introspection_mixin.py +183 -0
  166. aiq/observability/processor/__init__.py +14 -0
  167. aiq/observability/processor/batching_processor.py +316 -0
  168. aiq/observability/processor/intermediate_step_serializer.py +28 -0
  169. aiq/observability/processor/processor.py +68 -0
  170. aiq/observability/register.py +36 -39
  171. aiq/observability/utils/__init__.py +14 -0
  172. aiq/observability/utils/dict_utils.py +236 -0
  173. aiq/observability/utils/time_utils.py +31 -0
  174. aiq/profiler/calc/__init__.py +14 -0
  175. aiq/profiler/calc/calc_runner.py +623 -0
  176. aiq/profiler/calc/calculations.py +288 -0
  177. aiq/profiler/calc/data_models.py +176 -0
  178. aiq/profiler/calc/plot.py +345 -0
  179. aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
  180. aiq/profiler/data_models.py +24 -0
  181. aiq/profiler/inference_metrics_model.py +3 -0
  182. aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
  183. aiq/profiler/inference_optimization/data_models.py +2 -2
  184. aiq/profiler/inference_optimization/llm_metrics.py +2 -2
  185. aiq/profiler/profile_runner.py +61 -21
  186. aiq/runtime/loader.py +9 -3
  187. aiq/runtime/runner.py +23 -9
  188. aiq/runtime/session.py +25 -7
  189. aiq/runtime/user_metadata.py +2 -3
  190. aiq/tool/chat_completion.py +74 -0
  191. aiq/tool/code_execution/README.md +152 -0
  192. aiq/tool/code_execution/code_sandbox.py +151 -72
  193. aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
  194. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
  195. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
  196. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
  197. aiq/tool/code_execution/register.py +7 -3
  198. aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
  199. aiq/tool/mcp/exceptions.py +142 -0
  200. aiq/tool/mcp/mcp_client.py +41 -6
  201. aiq/tool/mcp/mcp_tool.py +3 -2
  202. aiq/tool/register.py +1 -0
  203. aiq/tool/server_tools.py +6 -3
  204. aiq/utils/exception_handlers/automatic_retries.py +289 -0
  205. aiq/utils/exception_handlers/mcp.py +211 -0
  206. aiq/utils/io/model_processing.py +28 -0
  207. aiq/utils/log_utils.py +37 -0
  208. aiq/utils/string_utils.py +38 -0
  209. aiq/utils/type_converter.py +18 -2
  210. aiq/utils/type_utils.py +87 -0
  211. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/METADATA +53 -21
  212. aiqtoolkit-1.2.0rc2.dist-info/RECORD +436 -0
  213. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/WHEEL +1 -1
  214. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/entry_points.txt +3 -0
  215. aiq/front_ends/fastapi/websocket.py +0 -148
  216. aiq/observability/async_otel_listener.py +0 -429
  217. aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
  218. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  219. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  220. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/top_level.txt +0 -0
aiq/eval/remote_workflow.py

@@ -24,6 +24,7 @@ from tqdm import tqdm
 from aiq.data_models.api_server import AIQResponseIntermediateStep
 from aiq.data_models.intermediate_step import IntermediateStep
 from aiq.data_models.intermediate_step import IntermediateStepPayload
+from aiq.data_models.invocation_node import InvocationNode
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.evaluator.evaluator_model import EvalInput
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
@@ -81,8 +82,12 @@ class EvaluationRemoteWorkflowHandler:
                 step_data = json.loads(line[len(INTERMEDIATE_DATA_PREFIX):])
                 response_intermediate = AIQResponseIntermediateStep.model_validate(step_data)
                 # The payload is expected to be IntermediateStepPayload
-                intermediate_step = IntermediateStep(
-                    payload=IntermediateStepPayload.model_validate_json(response_intermediate.payload))
+                payload = IntermediateStepPayload.model_validate_json(response_intermediate.payload)
+                intermediate_step = IntermediateStep(parent_id="remote",
+                                                     function_ancestry=InvocationNode(
+                                                         function_name=payload.name or "remote_function",
+                                                         function_id=payload.UUID or "remote_function_id"),
+                                                     payload=payload)
                 intermediate_steps.append(intermediate_step)
             except (json.JSONDecodeError, ValidationError) as e:
                 logger.error("Failed to parse intermediate step: %s", e)
aiq/eval/runners/__init__.py

@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
aiq/eval/runners/config.py

@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+
+from pydantic import BaseModel
+
+from aiq.eval.config import EvaluationRunConfig
+from aiq.eval.config import EvaluationRunOutput
+
+
+class MultiEvaluationRunConfig(BaseModel):
+    """
+    Parameters used for a multi-evaluation run.
+    This includes a dict of configs. The key is an id of any type.
+    Each pass loads the config, applies the overrides and runs to completion
+    before the next pass starts.
+    """
+    configs: dict[typing.Any, EvaluationRunConfig]
+
+
+class MultiEvaluationRunOutput(BaseModel):
+    """
+    Output of a multi-evaluation run.
+    The results per-pass are accumulated in the evaluation_run_outputs dict.
+    """
+    evaluation_run_outputs: dict[typing.Any, EvaluationRunOutput]
aiq/eval/runners/multi_eval_runner.py

@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import typing
+
+from aiq.eval.config import EvaluationRunConfig
+from aiq.eval.config import EvaluationRunOutput
+from aiq.eval.evaluate import EvaluationRun
+from aiq.eval.runners.config import MultiEvaluationRunConfig
+
+
+class MultiEvaluationRunner:
+    """
+    Run a multi-evaluation run.
+    """
+
+    def __init__(self, config: MultiEvaluationRunConfig):
+        """
+        Initialize a multi-evaluation run.
+        """
+        self.config = config
+        self.evaluation_run_outputs: dict[typing.Any, EvaluationRunOutput] = {}
+
+    async def run_all(self):
+        """
+        Run all evaluations defined by the overrides.
+        """
+        for id, config in self.config.configs.items():
+            output = await self.run_single_evaluation(id, config)
+            self.evaluation_run_outputs[id] = output
+
+        return self.evaluation_run_outputs
+
+    async def run_single_evaluation(self, id: typing.Any, config: EvaluationRunConfig) -> EvaluationRunOutput:
+        """
+        Run a single evaluation and return the output.
+        """
+        # copy the config in case the caller is using the same config for multiple evaluations
+        config_copy = copy.deepcopy(config)
+        evaluation_run = EvaluationRun(config_copy)
+        return await evaluation_run.run_and_evaluate()
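
For context, a minimal usage sketch of the new multi-evaluation runner (not part of the diff). Only MultiEvaluationRunConfig, MultiEvaluationRunner, run_all and the id-keyed outputs dict come from the code above; the EvaluationRunConfig keyword argument shown here (config_file) is an assumption.

import asyncio

from aiq.eval.config import EvaluationRunConfig
from aiq.eval.runners.config import MultiEvaluationRunConfig
from aiq.eval.runners.multi_eval_runner import MultiEvaluationRunner


async def main():
    # One EvaluationRunConfig per pass, keyed by an id of any type.
    configs = {
        "baseline": EvaluationRunConfig(config_file="configs/eval_baseline.yml"),    # hypothetical kwargs
        "candidate": EvaluationRunConfig(config_file="configs/eval_candidate.yml"),  # hypothetical kwargs
    }
    runner = MultiEvaluationRunner(MultiEvaluationRunConfig(configs=configs))
    outputs = await runner.run_all()  # dict of EvaluationRunOutput keyed by the same ids
    for run_id, output in outputs.items():
        print(run_id, output)


asyncio.run(main())

Each pass runs to completion before the next starts (the runner awaits run_single_evaluation inside the loop), and each config is deep-copied so the same EvaluationRunConfig object can be reused across passes.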
aiq/eval/trajectory_evaluator/evaluate.py

@@ -13,24 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
 import logging
 
 from langchain.evaluation import TrajectoryEvalChain
 from langchain_core.language_models import BaseChatModel
 from langchain_core.tools import BaseTool
-from tqdm import tqdm
 
-from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
 
-class TrajectoryEvaluator:
+class TrajectoryEvaluator(BaseEvaluator):
 
     def __init__(
         self,
@@ -38,11 +34,9 @@ class TrajectoryEvaluator:
         tools: list[BaseTool] | None = None,
         max_concurrency: int = 8,
     ):
-
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Trajectory")
         self.llm = llm
         self.tools = tools
-        self.max_concurrency = max_concurrency
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
        # Initialize trajectory evaluation chain
         self.traj_eval_chain = TrajectoryEvalChain.from_llm(llm=self.llm,
                                                             tools=self.tools,
@@ -50,69 +44,32 @@
                                                             requires_reference=True)
         logger.debug("Trajectory evaluation chain initialized.")
 
-    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
         """
-        Evaluates the agent trajectories using trajectory evaluation chain.
+        Evaluate a single EvalInputItem and return an EvalOutputItem.
         """
-
-        num_records = len(eval_input.eval_input_items)
-        logger.info("Running trajectory evaluation with %d records", num_records)
         from aiq.data_models.intermediate_step import IntermediateStepType
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
 
         intermediate_step_adapter = IntermediateStepAdapter()
         event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END]
 
-        async def process_item(item: EvalInputItem) -> tuple[float, dict]:
-            """
-            Evaluate a single EvalInputItem asynchronously and return a tuple of-
-            1. score
-            2. reasoning for the score
-            """
-            question = item.input_obj
-            generated_answer = item.output_obj
-            agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
-            try:
-                eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
-                    input=question,
-                    agent_trajectory=agent_trajectory,
-                    prediction=generated_answer,
-                )
-            except Exception as e:
-                logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
-                return 0.0, f"Error evaluating trajectory: {e}"
-
-            reasoning = {
-                "reasoning": eval_result["reasoning"],
-                "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
-            }
-            return eval_result["score"], reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            async with self.semaphore:
-                result = await process_item(item)
-                pbar.update(1)
-                return result
+        question = item.input_obj
+        generated_answer = item.output_obj
+        agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
 
-        # Execute all evaluations asynchronously
         try:
-            tqdm_position = TqdmPositionRegistry.claim()
-            pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating Trajectory", position=tqdm_position)
-            results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-        finally:
-            pbar.close()
-            TqdmPositionRegistry.release(tqdm_position)
-
-        # Extract scores and reasonings
-        sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-        # Compute average score
-        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-        # Construct EvalOutputItems
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
-
-        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+            eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
+                input=question,
+                agent_trajectory=agent_trajectory,
+                prediction=generated_answer,
+            )
+        except Exception as e:
+            logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
+            return EvalOutputItem(id=item.id, score=0.0, reasoning=f"Error evaluating trajectory: {e}")
+
+        reasoning = {
+            "reasoning": eval_result["reasoning"],
+            "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
+        }
+        return EvalOutputItem(id=item.id, score=eval_result["score"], reasoning=reasoning)
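
Both this evaluator and the tunable RAG evaluator below now delegate concurrency control, progress reporting and score averaging to the new BaseEvaluator (aiq/eval/evaluator/base_evaluator.py, +73 lines, not excerpted here). The following is only a rough sketch of the contract implied by the refactor; every name other than evaluate_item, max_concurrency and tqdm_desc is an assumption.

import asyncio
from abc import ABC, abstractmethod

from tqdm import tqdm


class BaseEvaluatorSketch(ABC):
    """Approximation only; the real BaseEvaluator may differ in names and return types."""

    def __init__(self, max_concurrency: int = 8, tqdm_desc: str = "Evaluating"):
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.tqdm_desc = tqdm_desc

    @abstractmethod
    async def evaluate_item(self, item):
        """Subclasses score one item and return an EvalOutputItem-like object."""

    async def evaluate(self, eval_input):
        """Fan evaluate_item out over all items with bounded concurrency and a progress bar."""
        pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc)

        async def bounded(item):
            async with self.semaphore:
                output = await self.evaluate_item(item)
                pbar.update(1)
                return output

        try:
            output_items = await asyncio.gather(*(bounded(i) for i in eval_input.eval_input_items))
        finally:
            pbar.close()

        scores = [o.score for o in output_items]
        average = round(sum(scores) / len(scores), 2) if scores else 0.0
        return average, output_items

This mirrors the semaphore, tqdm and average-score logic deleted from both evaluators in this release; the real base class presumably assembles an EvalOutput rather than returning a tuple.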
aiq/eval/tunable_rag_evaluator/evaluate.py

@@ -15,19 +15,19 @@
 
 import asyncio
 import logging
+from typing import Callable
 
 from langchain.output_parsers import ResponseSchema
 from langchain.output_parsers import StructuredOutputParser
 from langchain.schema import HumanMessage
 from langchain.schema import SystemMessage
 from langchain_core.language_models import BaseChatModel
+from langchain_core.runnables import RunnableLambda
 from tqdm import tqdm
 
-from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -69,195 +69,177 @@ def evaluation_prompt(judge_llm_prompt: str,
     return EVAL_PROMPT if not default_scoring else DEFAULT_EVAL_PROMPT
 
 
-class TunableRagEvaluator:
+def runnable_with_retries(original_fn: Callable, llm_retry_control_params: dict | None = None):
+    runnable = RunnableLambda(original_fn)
+
+    if llm_retry_control_params is None:
+        llm_retry_control_params = {
+            "stop_after_attempt": 3, "initial_backoff_delay_seconds": 1, "has_exponential_jitter": True
+        }
+
+    if llm_retry_control_params["has_exponential_jitter"] is None:
+        llm_retry_control_params["has_exponential_jitter"] = True
+    if llm_retry_control_params["stop_after_attempt"] is None:
+        llm_retry_control_params["stop_after_attempt"] = 3
+    if llm_retry_control_params["initial_backoff_delay_seconds"] is None:
+        llm_retry_control_params["initial_backoff_delay_seconds"] = 1
+
+    # Add retry logic with exponential backoff and jitter
+    return runnable.with_retry(
+        retry_if_exception_type=(Exception, ),  # Retry on any error
+        wait_exponential_jitter=llm_retry_control_params["has_exponential_jitter"],  # Add jitter to exponential backoff
+        stop_after_attempt=llm_retry_control_params["stop_after_attempt"],
+        exponential_jitter_params={"initial": llm_retry_control_params["initial_backoff_delay_seconds"]
+                                   }  # Optional: set initial backoff (seconds)
+    )
+
+
+class TunableRagEvaluator(BaseEvaluator):
     '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''
 
     def __init__(self,
                  llm: BaseChatModel,
                  judge_llm_prompt: str,
+                 llm_retry_control_params: dict | None,
                  max_concurrency: int,
                  default_scoring: bool,
                  default_score_weights: dict):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
         self.llm = llm
-        self.max_concurrency = max_concurrency
         self.judge_llm_prompt = judge_llm_prompt
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
+        self.llm_retry_control_params = llm_retry_control_params
         self.default_scoring = default_scoring
         # Use user-provided weights if available; otherwise, set equal weights for each score
         self.default_score_weights = default_score_weights if default_score_weights else {
             "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
         }
 
-    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
-        '''Evaluate function'''
-
-        async def process_item(item):
-            """Compute RAG evaluation for an individual item"""
-            question = item.input_obj
-            answer_description = item.expected_output_obj
-            generated_answer = item.output_obj
-
-            # Call judge LLM to generate score
-            score = 0.0
-
-            default_evaluation_schema = [
-                ResponseSchema(
-                    name="coverage_score",
-                    description=
-                    "Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
-                    type="float"),
-                ResponseSchema(
-                    name="correctness_score",
-                    description=
-                    "Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
-                    type="float"),
-                ResponseSchema(name="relevance_score",
-                               description="Score for the relevance of the generated answer to the question. Ex. 0.5",
-                               type="float"),
-                ResponseSchema(
-                    name="reasoning",
-                    description=
-                    "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                    type="string"),
-            ]
-
-            custom_evaluation_schema = [
-                ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-                ResponseSchema(
-                    name="reasoning",
-                    description=
-                    "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                    type="string"),
-            ]
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Compute RAG evaluation for an individual item and return EvalOutputItem"""
+        question = item.input_obj
+        answer_description = item.expected_output_obj
+        generated_answer = item.output_obj
+
+        # Call judge LLM to generate score
+        score = 0.0
+
+        default_evaluation_schema = [
+            ResponseSchema(
+                name="coverage_score",
+                description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(
+                name="correctness_score",
+                description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(name="relevance_score",
                           description="Score for the relevance of the generated answer to the question. Ex. 0.5",
                           type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                type="string"),
+        ]
 
-            if self.default_scoring:
-                evaluation_schema = default_evaluation_schema
-            else:
-                evaluation_schema = custom_evaluation_schema
+        custom_evaluation_schema = [
+            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                type="string"),
+        ]
 
-            llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
-            format_instructions = llm_input_response_parser.get_format_instructions()
+        if self.default_scoring:
+            evaluation_schema = default_evaluation_schema
+        else:
+            evaluation_schema = custom_evaluation_schema
 
-            eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
-                                            question=question,
-                                            answer_description=answer_description,
-                                            generated_answer=generated_answer,
-                                            format_instructions=format_instructions,
-                                            default_scoring=self.default_scoring)
+        llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+        format_instructions = llm_input_response_parser.get_format_instructions()
 
-            messages = [
-                SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)
-            ]
+        eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                        question=question,
+                                        answer_description=answer_description,
+                                        generated_answer=generated_answer,
+                                        format_instructions=format_instructions,
+                                        default_scoring=self.default_scoring)
 
-            response = await self.llm.ainvoke(messages)
+        messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
 
-            # Initialize default values to handle service errors
-            coverage_score = 0.0
-            correctness_score = 0.0
-            relevance_score = 0.0
-            reasoning = "Error in evaluator from parsing judge LLM response."
+        response = await runnable_with_retries(self.llm.ainvoke, self.llm_retry_control_params).ainvoke(messages)
 
-            try:
-                parsed_response = llm_input_response_parser.parse(response.content)
-                if self.default_scoring:
-                    try:
-                        coverage_score = parsed_response["coverage_score"]
-                        correctness_score = parsed_response["correctness_score"]
-                        relevance_score = parsed_response["relevance_score"]
-                        reasoning = parsed_response["reasoning"]
-                    except KeyError as e:
-                        logger.error("Missing required keys in default scoring response: %s",
                                     ", ".join(str(arg) for arg in e.args))
-                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-                    coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-                    correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-                    relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-                    # Calculate score
-                    total_weight = coverage_weight + correctness_weight + relevance_weight
-                    coverage_weight = coverage_weight / total_weight
-                    correctness_weight = correctness_weight / total_weight
-                    relevance_weight = relevance_weight / total_weight
-
-                    if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                        logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                        coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                        correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
                                                                    relevance_weight)
-                        relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-                    score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
                              relevance_weight * relevance_score)
-
-                else:
-                    try:
-                        score = parsed_response["score"]
-                        reasoning = parsed_response["reasoning"]
-                    except KeyError as e:
-                        logger.error("Missing required keys in custom scoring response: %s",
                                     ", ".join(str(arg) for arg in e.args))
-                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                        raise
-            except (KeyError, ValueError) as e:
-                logger.error("Error parsing judge LLM response: %s", e)
-                score = 0.0
-                reasoning = "Error in evaluator from parsing judge LLM response."
+        # Initialize default values to handle service errors
+        coverage_score = 0.0
+        correctness_score = 0.0
+        relevance_score = 0.0
+        reasoning = "Error in evaluator from parsing judge LLM response."
 
+        try:
+            parsed_response = llm_input_response_parser.parse(response.content)
             if self.default_scoring:
-                reasoning = {
-                    "question": question,
-                    "answer_description": answer_description,
-                    "generated_answer": generated_answer,
-                    "score_breakdown": {
-                        "coverage_score": coverage_score,
-                        "correctness_score": correctness_score,
-                        "relevance_score": relevance_score,
-                    },
-                    "reasoning": reasoning,
-                }
-            else:
-                reasoning = {
-                    "question": question,
-                    "answer_description": answer_description,
-                    "generated_answer": generated_answer,
-                    "reasoning": reasoning
-                }
-
-            return score, reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            """
-            Process an item asynchronously and update the progress bar.
-            Use the semaphore to limit the number of concurrent items.
-            """
-            async with self.semaphore:
-                result = await process_item(item)
-                # Update the progress bar
-                pbar.update(1)
-                return result
+                try:
+                    coverage_score = parsed_response["coverage_score"]
+                    correctness_score = parsed_response["correctness_score"]
+                    relevance_score = parsed_response["relevance_score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in default scoring response: %s",
                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+                coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+                correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+                relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+                # Calculate score
+                total_weight = coverage_weight + correctness_weight + relevance_weight
+                coverage_weight = coverage_weight / total_weight
+                correctness_weight = correctness_weight / total_weight
+                relevance_weight = relevance_weight / total_weight
+
+                if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                    logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                    coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
                          relevance_weight * relevance_score)
 
-        try:
-            # Claim a tqdm position to display the progress bar
-            tqdm_position = TqdmPositionRegistry.claim()
-            # Create a progress bar
-            pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating RAG", position=tqdm_position)
-            # Process items concurrently with a limit on concurrency
-            results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-        finally:
-            pbar.close()
-            TqdmPositionRegistry.release(tqdm_position)
-
-        # Extract scores and reasonings
-        sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-        # Compute average score
-        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-        # Construct EvalOutputItems
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
+            else:
+                try:
+                    score = parsed_response["score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in custom scoring response: %s",
                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                    raise
+        except (KeyError, ValueError) as e:
+            logger.error("Error parsing judge LLM response: %s", e)
+            score = 0.0
+            reasoning = "Error in evaluator from parsing judge LLM response."
 
-        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+        if self.default_scoring:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "score_breakdown": {
+                    "coverage_score": coverage_score,
+                    "correctness_score": correctness_score,
+                    "relevance_score": relevance_score,
+                },
+                "reasoning": reasoning,
+            }
+        else:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "reasoning": reasoning
+            }
+
+        return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
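
Aside (not part of the diff): a minimal sketch of the retry policy runnable_with_retries builds, assuming the langchain_core Runnable.with_retry keyword arguments used above behave as shown; flaky_judge is a hypothetical stand-in for self.llm.ainvoke.

import asyncio

from langchain_core.runnables import RunnableLambda


async def flaky_judge(prompt: str) -> str:
    # Hypothetical stand-in for llm.ainvoke; imagine transient API errors being raised here.
    return f"scored: {prompt}"


# Same policy shape as runnable_with_retries: up to 3 attempts,
# exponential backoff with jitter, 1 s initial delay.
retrying_judge = RunnableLambda(flaky_judge).with_retry(
    retry_if_exception_type=(Exception, ),
    wait_exponential_jitter=True,              # has_exponential_jitter
    stop_after_attempt=3,                      # stop_after_attempt
    exponential_jitter_params={"initial": 1},  # initial_backoff_delay_seconds
)

print(asyncio.run(retrying_judge.ainvoke("Is the answer grounded?")))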
aiq/eval/tunable_rag_evaluator/register.py

@@ -26,6 +26,7 @@ from aiq.data_models.evaluator import EvaluatorBaseConfig
 class TunableRagEvaluatorConfig(EvaluatorBaseConfig, name="tunable_rag_evaluator"):
     '''Configuration for tunable RAG evaluator'''
     llm_name: LLMRef = Field(description="Name of the judge LLM")
+    llm_retry_control_params: dict | None = Field(description="Parameters to control LLM retry behavior", default=None)
     judge_llm_prompt: str = Field(description="LLM prompt for the judge LLM")
     default_scoring: bool = Field(description="Whether to use default scoring", default=False)
     default_score_weights: dict = Field(
@@ -43,6 +44,7 @@ async def register_tunable_rag_evaluator(config: TunableRagEvaluatorConfig, buil
     llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
     evaluator = TunableRagEvaluator(llm,
                                     config.judge_llm_prompt,
+                                    config.llm_retry_control_params,
                                     builder.get_max_concurrency(),
                                     config.default_scoring,
                                     config.default_score_weights)
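
For reference (not part of the diff), this is the shape the new llm_retry_control_params field accepts. The key names come from runnable_with_retries in the evaluator above; keys explicitly set to None fall back to the defaults shown there (3 attempts, 1 second initial backoff, jitter enabled).

# All three keys are read by runnable_with_retries; omit the field entirely
# (default=None) to get the built-in defaults.
llm_retry_control_params = {
    "stop_after_attempt": 5,             # retry the judge LLM up to 5 times
    "initial_backoff_delay_seconds": 2,  # first backoff delay, in seconds
    "has_exponential_jitter": True,      # add jitter to the exponential backoff
}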