ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,1254 @@
1
+ from typing import Any, Dict, Optional, List
2
+ import logging
3
+ import json
4
+ import asyncio
5
+ import re
6
+ from datetime import datetime
7
+
8
+ try:
9
+ from langgraph.prebuilt import create_react_agent
10
+ from langchain_core.messages import HumanMessage, SystemMessage
11
+ from langchain_core.tools import tool
12
+ from langchain_experimental.tools import PythonREPLTool
13
+
14
+ LANGGRAPH_AVAILABLE = True
15
+ except ImportError:
16
+ LANGGRAPH_AVAILABLE = False
17
+
18
+ # Create a dummy tool decorator for when LangGraph is not available
19
def tool(func):
    """
    Pass-through replacement for the LangChain ``tool`` decorator.

    Defined only when the optional agent dependencies failed to import, so
    that bare ``@tool`` usages still resolve to a callable.

    Args:
        func: The function being decorated.

    Returns:
        ``func`` itself, unchanged and unwrapped.
    """
    return func
21
+
22
+
23
+ from .base import BaseComparator
24
+ from ..types import (
25
+ ParameterComparisonResult,
26
+ ComparisonStrategy,
27
+ ParameterStatus,
28
+ )
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ class CodeAgentComparator(BaseComparator):
34
+ """
35
+ Code Agent-based comparator using LangGraph for detailed tool call analysis.
36
+
37
+ This comparator creates a code agent that can:
38
+ - Write Python code to analyze tool call differences
39
+ - Use context from conversation history and tool specifications
40
+ - Perform complex logical comparisons beyond simple string/semantic matching
41
+ - Handle edge cases through programmatic analysis
42
+ """
43
+
44
def __init__(self, config, llm_client=None):
    """
    Set up the comparator and try to build its LangGraph code agent.

    Args:
        config: Comparator configuration, forwarded unchanged to
            ``BaseComparator.__init__``.
        llm_client: Optional LLM client backing the agent. When it is
            missing or falsy, ``_initialize_agent`` logs a warning and
            ``self.agent`` stays ``None``.

    Raises:
        ImportError: If the optional agent dependencies could not be
            imported when this module was loaded.
    """
    super().__init__(config)

    if not LANGGRAPH_AVAILABLE:
        # The availability flag is also cleared when langchain_experimental
        # (the source of PythonREPLTool) is missing, so the install hint has
        # to name that package as well.
        raise ImportError(
            "LangGraph is required for CodeAgentComparator. "
            "Install with: pip install langgraph langchain-core langchain "
            "langchain-experimental"
        )

    self.llm_client = llm_client
    self.agent = None
    self._initialize_agent()
56
+
57
def _initialize_agent(self):
    """
    Initialize the LangGraph code agent.

    Builds two helper tools (``analyze_values`` and
    ``compare_json_structures``), combines them with ``PythonREPLTool`` and
    hands them to ``create_react_agent`` together with the model returned by
    ``self._adapt_llm_client()``. The resulting agent is stored in
    ``self.agent``.

    Returns early, after logging a warning, when ``self.llm_client`` is
    falsy. Any exception raised during setup is logged and leaves
    ``self.agent`` set to ``None``; nothing is re-raised.
    """
    if not self.llm_client:
        logger.warning("No LLM client provided for CodeAgentComparator")
        return

    try:
        # Create tools for the agent - using tool functions directly
        from langchain_core.tools import tool

        # NOTE: the docstrings of the two @tool functions below are kept
        # verbatim; presumably the decorator exposes them as the tool
        # descriptions - confirm before rewording them.
        @tool
        def analyze_values(
            predicted_value: str,
            ground_truth_value: str,
            param_type: str = "string",
        ) -> str:
            """
            Analyze two parameter values for comparison.

            Args:
                predicted_value: The predicted parameter value
                ground_truth_value: The ground truth parameter value
                param_type: The expected parameter type

            Returns:
                JSON string with analysis results
            """

            # Parse JSON strings if needed
            def safe_parse(value):
                # Non-string inputs are passed through untouched.
                if not isinstance(value, str):
                    return value
                try:
                    return json.loads(value)
                except (ValueError, RecursionError):
                    # Not valid JSON (json.JSONDecodeError is a ValueError)
                    # or nested too deeply to decode: keep the raw text.
                    # Deliberately narrower than a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    return value

            pred_parsed = safe_parse(predicted_value)
            gt_parsed = safe_parse(ground_truth_value)

            # Type analysis
            pred_type = type(pred_parsed).__name__
            gt_type = type(gt_parsed).__name__

            # Exact match check
            exact_match = pred_parsed == gt_parsed

            # Type compatibility check: identical types, two numeric types,
            # or a string on one side and a scalar on the other.
            type_compatible = pred_type == gt_type or (
                (pred_type in ["int", "float"] and gt_type in ["int", "float"])
                or (pred_type == "str" and gt_type in ["int", "float", "bool"])
                or (gt_type == "str" and pred_type in ["int", "float", "bool"])
            )

            # Semantic equivalence check. It is only evaluated when the
            # values are not exactly equal, so it stays False on an exact
            # match.
            semantic_equivalent = False
            if not exact_match:
                # String representations
                pred_str = str(pred_parsed).lower().strip()
                gt_str = str(gt_parsed).lower().strip()

                # Common equivalences
                equivalences = [
                    (pred_str == gt_str),
                    (
                        pred_str in ["true", "1", "yes", "on"]
                        and gt_str in ["true", "1", "yes", "on"]
                    ),
                    (
                        pred_str in ["false", "0", "no", "off"]
                        and gt_str in ["false", "0", "no", "off"]
                    ),
                    (
                        pred_str.replace(" ", "") == gt_str.replace(" ", "")
                    ),  # Whitespace differences
                ]

                semantic_equivalent = any(equivalences)

            result = {
                "exact_match": exact_match,
                "semantic_equivalent": semantic_equivalent,
                "type_compatible": type_compatible,
                "predicted_type": pred_type,
                "ground_truth_type": gt_type,
                "predicted_parsed": pred_parsed,
                "ground_truth_parsed": gt_parsed,
            }

            return json.dumps(result)

        @tool
        def compare_json_structures(obj1: str, obj2: str) -> str:
            """
            Compare two JSON structures for deep equality.

            Args:
                obj1: First JSON object as string
                obj2: Second JSON object as string

            Returns:
                JSON string with comparison results
            """
            try:
                parsed1 = json.loads(obj1) if isinstance(obj1, str) else obj1
                parsed2 = json.loads(obj2) if isinstance(obj2, str) else obj2

                def deep_compare(a, b, path=""):
                    # Exact type identity on purpose: an int never matches
                    # a float or a bool here.
                    if type(a) is not type(b):
                        return {
                            "match": False,
                            "reason": f"Type mismatch at {path}: {type(a)} vs {type(b)}",
                        }

                    if isinstance(a, dict):
                        if set(a.keys()) != set(b.keys()):
                            return {
                                "match": False,
                                "reason": f"Key mismatch at {path}: {set(a.keys())} vs {set(b.keys())}",
                            }

                        # Stop at the first differing entry.
                        for key in a:
                            result = deep_compare(a[key], b[key], f"{path}.{key}")
                            if not result["match"]:
                                return result

                        return {"match": True, "reason": "Deep equality"}

                    elif isinstance(a, list):
                        if len(a) != len(b):
                            return {
                                "match": False,
                                "reason": f"List length mismatch at {path}: {len(a)} vs {len(b)}",
                            }

                        # Lists are compared position by position.
                        for i, (item_a, item_b) in enumerate(zip(a, b)):
                            result = deep_compare(item_a, item_b, f"{path}[{i}]")
                            if not result["match"]:
                                return result

                        return {"match": True, "reason": "Deep equality"}

                    else:
                        match = a == b
                        return {
                            "match": match,
                            "reason": (
                                "Direct comparison"
                                if match
                                else f"Value mismatch: {a} != {b}"
                            ),
                        }

                result = deep_compare(parsed1, parsed2)
                return json.dumps(result)

            except Exception as e:
                # Any failure (invalid JSON included) is reported back to
                # the agent as a non-match instead of raising.
                return json.dumps(
                    {"match": False, "reason": f"JSON parsing error: {str(e)}"}
                )

        # Create tools for the agent
        # NOTE(review): PythonREPLTool runs whatever Python the model writes
        # inside this process. Confirm that is acceptable for the
        # deployment, or sandbox it, before using untrusted inputs.
        tools = [PythonREPLTool(), analyze_values, compare_json_structures]

        # Create the React agent with code execution capabilities
        self.agent = create_react_agent(model=self._adapt_llm_client(), tools=tools)

    except Exception as e:
        # Best effort: a failed setup leaves the comparator without an agent.
        logger.error(f"Failed to initialize code agent: {e}")
        self.agent = None
230
+
231
def _adapt_llm_client(self):
    """Wrap ``self.llm_client`` in a LangChain ``Runnable``.

    The adapter flattens the incoming messages into one newline-joined prompt,
    calls ``llm_client.generate`` and returns the reply as an ``AIMessage``.
    Failures are returned as an ``AIMessage`` whose content starts with
    ``"Error: "`` rather than raised.

    Returns:
        A ``Runnable`` instance usable as the ``model`` of a LangGraph agent.
    """
    from langchain_core.messages import AIMessage
    from langchain_core.runnables import Runnable

    class RunnableAdapter(Runnable):
        """Minimal ``Runnable`` facade over the project LLM client."""

        def __init__(self, llm_client):
            self.llm_client = llm_client

        def invoke(self, input_data, config=None, **kwargs):
            # ``**kwargs`` mirrors ``Runnable.invoke`` so extra keyword
            # arguments from the framework do not raise ``TypeError``.
            if isinstance(input_data, dict) and "messages" in input_data:
                messages = input_data["messages"]
            elif isinstance(input_data, list):
                messages = input_data
            else:
                messages = [input_data]

            # Flatten to text; message content may be a non-string (e.g. a
            # list of parts), which ``str.join`` would reject.
            prompt_parts = []
            for msg in messages:
                content = msg.content if hasattr(msg, "content") else msg
                prompt_parts.append(
                    content if isinstance(content, str) else str(content)
                )
            prompt = "\n".join(prompt_parts)

            try:
                if hasattr(self.llm_client, "generate"):
                    # ValidatingLLMClient requires an output schema.
                    from llmevalkit.llm.output_parser import ValidatingLLMClient

                    if isinstance(self.llm_client, ValidatingLLMClient):
                        response = self.llm_client.generate(
                            prompt=prompt, schema=str
                        )
                    else:
                        response = self.llm_client.generate(prompt=prompt)
                else:
                    response = "LLM client not available"

                return AIMessage(content=str(response))
            except Exception as e:
                return AIMessage(content=f"Error: {str(e)}")

        def bind_tools(self, tools, **kwargs):
            """Bind tools to the model (required by LangGraph).

            The tools are ignored: the adapter is returned unchanged.
            """
            return self

        def with_structured_output(self, schema, **kwargs):
            """Support structured output (optional for LangGraph).

            The schema is ignored: the adapter is returned unchanged.
            """
            return self

    return RunnableAdapter(self.llm_client)
289
+
290
+ def _get_system_prompt(self) -> str:
291
+ """Get the system prompt for the code agent."""
292
+ return """You are an expert code agent with access to a Python REPL tool for executing code.
293
+
294
+ CRITICAL INSTRUCTIONS:
295
+ 1. When you need to execute Python code, you MUST use the "Python REPL" tool
296
+ 2. Do NOT just write code - you must CALL THE TOOL to execute it
297
+ 3. The tool is called "Python REPL" - use it to run any Python code
298
+ 4. After tool execution, use the actual results in your analysis
299
+
300
+ AVAILABLE TOOLS:
301
+ - Python REPL: Execute Python code and get real results
302
+
303
+ Your task is to compare tool calls using executed Python analysis.
304
+
305
+ WORKFLOW:
306
+ 1. Use the Python REPL tool to execute comparison code
307
+ 2. Get the actual execution results
308
+ 3. Use those results to determine equivalence
309
+ 4. Return structured comparison results
310
+
311
+ EXAMPLE TOOL USAGE:
312
+ To execute code, you would call the Python REPL tool with your code, NOT just display it.
313
+
314
+ Remember: ALWAYS use the Python REPL tool for code execution!"""
315
+
316
def compare_parameter(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
) -> ParameterComparisonResult:
    """Compare a single parameter using code agent analysis.

    Args:
        param_name: Name of the parameter being compared.
        predicted_value: Value from the predicted tool call.
        ground_truth_value: Value from the ground-truth tool call.
        context: Optional dict; the keys ``parameter_definition`` and
            ``parameter_status`` are read from it.

    Returns:
        A ``ParameterComparisonResult``. When no agent is available, or the
        agent run fails for any reason, the result of
        ``self._fallback_comparison`` is returned instead.
    """
    if not self.agent:
        # Fallback to basic comparison if agent not available
        return self._fallback_comparison(
            param_name, predicted_value, ground_truth_value, context
        )

    try:
        context = context or {}
        param_def = context.get("parameter_definition", {})
        param_status = context.get("parameter_status", ParameterStatus.BOTH_PRESENT)
        status_text = (
            param_status.value if hasattr(param_status, "value") else param_status
        )

        # Non-serializable values raise here and are handled by the fallback.
        predicted_json = json.dumps(predicted_value)
        ground_truth_json = json.dumps(ground_truth_value)

        # The code template decodes the JSON text with json.loads: raw JSON
        # (true/false/null) is not valid Python source. ``!r`` produces safely
        # quoted Python string literals.
        prompt = f"""
Analyze parameter '{param_name}' by writing and EXECUTING Python code.

PARAMETER DATA:
- Name: {param_name}
- Predicted Value: {predicted_json}
- Ground Truth Value: {ground_truth_json}
- Parameter Type: {param_def.get('type', 'unknown')}
- Required: {param_def.get('required', False)}
- Default: {param_def.get('default', 'None')}
- Status: {status_text}

TASK: Write Python code to analyze these values AND EXECUTE IT to get results.

Consider:
1. Exact equality
2. Type compatibility (string "1500.00" vs float 1500.0)
3. Semantic equivalence (string "true" vs boolean True)
4. Missing value handling
5. Parameter definition context

EXECUTE your analysis code and print the JSON result:
```python
import json

# The actual parameter values
predicted_value = json.loads({predicted_json!r})
ground_truth_value = json.loads({ground_truth_json!r})
param_name = {param_name!r}

def analyze_parameter(pred_val, gt_val, param_name):
    # Your analysis logic here
    # Return JSON with: score, is_match, explanation, confidence, parameter_analysis
    pass

result = analyze_parameter(predicted_value, ground_truth_value, param_name)
print(json.dumps(result, indent=2))
```

Execute this and provide the JSON output!
"""

        # Run the agent and read its last message.
        messages = [HumanMessage(content=prompt)]
        result = self.agent.invoke({"messages": messages})
        final_message = result["messages"][-1].content

        analysis_result = self._extract_json_from_response(final_message)

        return ParameterComparisonResult(
            parameter_name=param_name,
            predicted_value=predicted_value,
            ground_truth_value=ground_truth_value,
            predicted_resolved_value=predicted_value,
            ground_truth_resolved_value=ground_truth_value,
            parameter_status=param_status,
            comparison_strategy=ComparisonStrategy.CODE_AGENT,
            score=analysis_result.get("score", 0.5),
            explanation=analysis_result.get(
                "explanation", "Code agent analysis completed"
            ),
            is_match=analysis_result.get("is_match", False),
            confidence=analysis_result.get("confidence", 0.8),
            metadata={
                "code_agent_analysis": analysis_result,
                "parameter_analysis": analysis_result.get("parameter_analysis", {}),
            },
        )

    except Exception as e:
        logger.error(f"Code agent comparison failed: {e}")
        return self._fallback_comparison(
            param_name, predicted_value, ground_truth_value, context
        )
415
+
416
def compare_tool_calls(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    custom_instructions: Optional[str] = None,
) -> Dict[str, Any]:
    """Compare two complete tool calls using code agent analysis.

    The direct comparison (``self._direct_code_comparison``) is tried first and
    its result is returned when its strategy is ``"code_agent_direct"``.
    Otherwise the LangGraph agent is prompted; if no agent exists or the agent
    run fails, ``self._fallback_tool_call_comparison`` is used.

    Args:
        predicted_call: Predicted tool call.
        ground_truth_call: Ground-truth tool call.
        conversation_history: Optional messages with ``role``/``content``;
            the last five are added to the agent prompt.
        tool_specs: Optional tool specifications; the one whose ``name``
            matches the called function is added to the agent prompt.
        custom_instructions: Optional extra comparison instructions.

    Returns:
        A dict with ``overall_score``, ``is_match``, ``explanation``,
        ``confidence``, ``comparison_strategy`` and analysis details.
    """
    # Try direct code generation and execution first (more reliable)
    try:
        direct_result = self._direct_code_comparison(
            predicted_call, ground_truth_call, custom_instructions
        )
        if direct_result.get("comparison_strategy") == "code_agent_direct":
            return direct_result
    except Exception as e:
        logger.warning(
            f"Direct code comparison failed, trying LangGraph agent: {e}"
        )

    # Fall back to LangGraph agent approach
    if not self.agent:
        return self._fallback_tool_call_comparison(
            predicted_call, ground_truth_call
        )

    try:
        # Last 5 messages for context
        conversation_context = ""
        if conversation_history:
            conversation_context = "\n".join(
                f"{msg.get('role', 'unknown')}: {msg.get('content', '')}"
                for msg in conversation_history[-5:]
            )

        tool_specification = ""
        function_name = predicted_call.get("name") or ground_truth_call.get("name")
        if tool_specs and function_name:
            for spec in tool_specs:
                if spec.get("name") == function_name:
                    tool_specification = json.dumps(spec, indent=2)
                    break

        # Optional context sections. Previously these inputs were computed or
        # accepted but never reached the prompt. With none supplied the
        # prompt keeps its original shape.
        extra_sections = ""
        if conversation_context:
            extra_sections += (
                f"\nCONVERSATION CONTEXT (most recent messages):\n{conversation_context}\n"
            )
        if tool_specification:
            extra_sections += f"\nTOOL SPECIFICATION:\n{tool_specification}\n"
        if custom_instructions:
            extra_sections += f"\nCUSTOM INSTRUCTIONS:\n{custom_instructions}\n"

        predicted_json = json.dumps(predicted_call)
        ground_truth_json = json.dumps(ground_truth_call)

        # The embedded code decodes the calls with json.loads because raw JSON
        # (true/false/null) is not valid Python source. The overall score is
        # parenthesized so the function-name term and the parameter average are
        # actually averaged.
        prompt = f"""
CRITICAL: Use the Python REPL tool to execute analysis code. Do not just show code - RUN it using the tool!

Compare these tool calls by EXECUTING Python code:

PREDICTED: {json.dumps(predicted_call, indent=2)}
GROUND TRUTH: {json.dumps(ground_truth_call, indent=2)}
{extra_sections}
TASK: Use the Python REPL tool to execute this analysis:

```python
import json

predicted_call = json.loads({predicted_json!r})
ground_truth_call = json.loads({ground_truth_json!r})

def analyze_tool_calls(pred, gt):
    func_match = pred.get("name") == gt.get("name")
    pred_args = pred.get("arguments", {{}})
    gt_args = gt.get("arguments", {{}})

    param_scores = []
    param_details = {{}}
    all_params = set(pred_args.keys()) | set(gt_args.keys())

    for param in all_params:
        pred_val = pred_args.get(param)
        gt_val = gt_args.get(param)

        if pred_val == gt_val:
            score = 1.0
        elif str(pred_val).lower() == str(gt_val).lower():
            score = 1.0
        elif (isinstance(pred_val, str) and isinstance(gt_val, (int, float)) and
              pred_val.replace('.', '').replace('-', '').isdigit() and
              float(pred_val) == float(gt_val)):
            score = 1.0
        elif (pred_val in ['true', 'True', True] and gt_val in ['true', 'True', True]) or \\
             (pred_val in ['false', 'False', False] and gt_val in ['false', 'False', False]):
            score = 1.0
        else:
            score = 0.0

        param_scores.append(score)
        param_details[param] = {{"score": score, "pred": pred_val, "gt": gt_val}}

    param_avg = sum(param_scores) / len(param_scores) if param_scores else 1.0
    overall_score = ((1.0 if func_match else 0.0) + param_avg) / 2.0
    is_match = func_match and param_avg == 1.0

    return {{
        "score": overall_score,
        "is_match": is_match,
        "explanation": f"Function match: {{func_match}}, Param avg: {{param_avg:.3f}}",
        "confidence": 0.95,
        "function_analysis": {{"name_match": func_match}},
        "parameter_analysis": param_details,
        "contextual_insights": {{"analysis_complete": True}}
    }}

result = analyze_tool_calls(predicted_call, ground_truth_call)
print("ANALYSIS_RESULT:", json.dumps(result, indent=2))
```

EXECUTE this code using the Python REPL tool and return the results!
"""

        # Run the agent
        messages = [HumanMessage(content=prompt)]
        result = self.agent.invoke({"messages": messages})

        # Extract analysis result
        final_message = result["messages"][-1].content
        analysis_result = self._extract_json_from_response(final_message)

        # Structure the result
        return {
            "overall_score": analysis_result.get("score", 0.5),
            "is_match": analysis_result.get("is_match", False),
            "explanation": analysis_result.get(
                "explanation", "Code agent tool call analysis completed"
            ),
            "confidence": analysis_result.get("confidence", 0.8),
            "comparison_strategy": "code_agent",
            "function_analysis": analysis_result.get("function_analysis", {}),
            "parameter_analysis": analysis_result.get("parameter_analysis", {}),
            "contextual_insights": analysis_result.get("contextual_insights", {}),
            "code_agent_metadata": {
                "full_analysis": analysis_result,
                "agent_response": final_message,
                "timestamp": datetime.now().isoformat(),
            },
        }

    except Exception as e:
        logger.error(f"Code agent tool call comparison failed: {e}")
        return self._fallback_tool_call_comparison(
            predicted_call, ground_truth_call
        )
565
+
566
async def compare_tool_calls_async(
    self,
    predicted_call: Dict[str, Any],
    ground_truth_call: Dict[str, Any],
    conversation_history: Optional[List[Dict[str, str]]] = None,
    tool_specs: Optional[List[Dict[str, Any]]] = None,
    custom_instructions: Optional[str] = None,
) -> Dict[str, Any]:
    """Async version of tool call comparison.

    Runs the synchronous ``compare_tool_calls`` in the event loop's default
    executor and returns its result; arguments are forwarded unchanged.
    """
    # For now, run the sync version in an executor
    # In a full implementation, you'd make the agent calls async
    # get_running_loop() is the supported way to obtain the loop inside a coroutine.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        self.compare_tool_calls,
        predicted_call,
        ground_truth_call,
        conversation_history,
        tool_specs,
        custom_instructions,
    )
587
+
588
def compare_function_name(
    self,
    predicted_name: str,
    ground_truth_name: str,
    context: Optional[Dict[str, Any]] = None,
) -> float:
    """Compare function names using code agent analysis.

    Args:
        predicted_name: Function name from the predicted call.
        ground_truth_name: Function name from the ground-truth call.
        context: Accepted for interface compatibility; not used here.

    Returns:
        A score in ``[0.0, 1.0]``. Without an agent, when no score can be
        read from the agent's reply, or when the agent run fails, this is
        ``1.0`` for an exact name match and ``0.0`` otherwise.
    """
    import re

    exact_score = 1.0 if predicted_name == ground_truth_name else 0.0

    if not self.agent:
        return exact_score

    try:
        prompt = f"""
Compare these function names: '{predicted_name}' vs '{ground_truth_name}'

Write Python code to analyze:
1. Exact match
2. Case sensitivity differences
3. Underscore vs camelCase patterns
4. Common abbreviations or variations
5. Semantic similarity

Return a float score between 0.0-1.0.
"""

        messages = [HumanMessage(content=prompt)]
        result = self.agent.invoke({"messages": messages})

        # Look for a numeric score in the response. The pattern captures one
        # well-formed number, so a sentence-ending period ("score: 0.85.")
        # no longer makes float() fail.
        final_message = result["messages"][-1].content
        score_match = re.search(
            r"score[:\s=]+(\d+(?:\.\d+)?|\.\d+)", final_message.lower()
        )
        if score_match:
            # Clamp: the documented contract is a score between 0.0 and 1.0.
            return min(1.0, max(0.0, float(score_match.group(1))))

        # Fallback
        return exact_score

    except Exception as e:
        logger.error(f"Code agent function name comparison failed: {e}")
        return exact_score
631
+
632
+ def _extract_json_from_response(self, response: str) -> Dict[str, Any]:
633
+ """Extract JSON object from agent response, prioritizing executed code output."""
634
+ import re
635
+
636
+ # Look for specific result patterns
637
+ patterns = [
638
+ r"ANALYSIS_RESULT:\s*(\{.*?\})",
639
+ r"FINAL_RESULT:\s*(\{.*?\})",
640
+ r"Result:\s*(\{.*?\})",
641
+ ]
642
+
643
+ for pattern in patterns:
644
+ match = re.search(pattern, response, re.DOTALL)
645
+ if match:
646
+ try:
647
+ return json.loads(match.group(1))
648
+ except json.JSONDecodeError:
649
+ continue
650
+
651
+ # Look for Python execution output (lines starting with execution results)
652
+ lines = response.split("\n")
653
+ for line in lines:
654
+ line = line.strip()
655
+ if line.startswith("{") and '"score"' in line:
656
+ try:
657
+ # Try to parse this line as JSON
658
+ return json.loads(line)
659
+ except json.JSONDecodeError:
660
+ continue
661
+
662
+ # Look for JSON output after code execution
663
+ json_candidates = []
664
+
665
+ # Look for lines that start with { and seem to be JSON
666
+ for i, line in enumerate(lines):
667
+ stripped = line.strip()
668
+ if stripped.startswith("{"):
669
+ # Try to find the complete JSON object
670
+ json_text = stripped
671
+ j = i + 1
672
+ brace_count = json_text.count("{") - json_text.count("}")
673
+
674
+ while j < len(lines) and brace_count > 0:
675
+ json_text += "\n" + lines[j]
676
+ brace_count = json_text.count("{") - json_text.count("}")
677
+ j += 1
678
+
679
+ if brace_count == 0:
680
+ json_candidates.append(json_text)
681
+
682
+ # Try to parse JSON candidates (prioritize later ones as they're likely execution results)
683
+ for candidate in reversed(json_candidates):
684
+ try:
685
+ parsed = json.loads(candidate)
686
+ # Validate it has the expected structure
687
+ if isinstance(parsed, dict) and any(
688
+ key in parsed for key in ["score", "is_match", "result"]
689
+ ):
690
+ return parsed
691
+ except json.JSONDecodeError:
692
+ continue
693
+
694
+ # Fallback: look for any JSON objects with regex
695
+ json_pattern = r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}"
696
+ matches = re.findall(json_pattern, response, re.DOTALL)
697
+
698
+ for match in reversed(matches): # Try latest matches first
699
+ try:
700
+ parsed = json.loads(match)
701
+ if isinstance(parsed, dict):
702
+ return parsed
703
+ except json.JSONDecodeError:
704
+ continue
705
+
706
+ # Final fallback - extract key information with regex
707
+ score_match = re.search(
708
+ r'["\']?score["\']?\s*[:=]\s*([0-9.]+)', response.lower()
709
+ )
710
+ match_pattern = re.search(
711
+ r'["\']?is_match["\']?\s*[:=]\s*(true|false)', response.lower()
712
+ )
713
+ explanation_match = re.search(
714
+ r'["\']?explanation["\']?\s*[:=]\s*["\']([^"\']+)["\']',
715
+ response,
716
+ re.IGNORECASE,
717
+ )
718
+
719
+ return {
720
+ "score": float(score_match.group(1)) if score_match else 0.5,
721
+ "is_match": (
722
+ match_pattern.group(1).lower() == "true" if match_pattern else False
723
+ ),
724
+ "explanation": (
725
+ explanation_match.group(1)
726
+ if explanation_match
727
+ else "Analysis extracted from agent response"
728
+ ),
729
+ "confidence": 0.7,
730
+ "parameter_analysis": {},
731
+ "function_analysis": {},
732
+ "contextual_insights": {},
733
+ }
734
+
735
def _fallback_comparison(
    self,
    param_name: str,
    predicted_value: Any,
    ground_truth_value: Any,
    context: Optional[Dict[str, Any]] = None,
) -> ParameterComparisonResult:
    """Exact-equality comparison used when the code agent cannot be used.

    Two ``None`` values match; exactly one ``None`` is a mismatch; otherwise
    the values match only when ``==`` holds. The status is read from
    ``context["parameter_status"]`` and defaults to ``BOTH_PRESENT``.
    """
    status = (context or {}).get("parameter_status", ParameterStatus.BOTH_PRESENT)

    # Equality is evaluated first, exactly as the result is later reported.
    matched = predicted_value == ground_truth_value
    points = 1.0 if matched else 0.0

    pred_missing = predicted_value is None
    gt_missing = ground_truth_value is None

    if pred_missing and gt_missing:
        matched, points = True, 1.0
        detail = "Both values are None"
    elif pred_missing or gt_missing:
        matched, points = False, 0.0
        detail = f"One value is None: predicted={predicted_value}, ground_truth={ground_truth_value}"
    elif matched:
        detail = f"Exact match: {predicted_value}"
    else:
        detail = f"Values differ: {predicted_value} != {ground_truth_value}"

    return ParameterComparisonResult(
        parameter_name=param_name,
        predicted_value=predicted_value,
        ground_truth_value=ground_truth_value,
        predicted_resolved_value=predicted_value,
        ground_truth_resolved_value=ground_truth_value,
        parameter_status=status,
        comparison_strategy=ComparisonStrategy.CODE_AGENT,
        score=points,
        explanation=f"Fallback comparison (code agent unavailable): {detail}",
        is_match=matched,
        confidence=0.9 if matched else 0.8,
        error_type=None if matched else "code_agent_unavailable",
    )
782
+
783
+ def _fallback_tool_call_comparison(
784
+ self,
785
+ predicted_call: Dict[str, Any],
786
+ ground_truth_call: Dict[str, Any],
787
+ ) -> Dict[str, Any]:
788
+ """Fallback tool call comparison when code agent is not available."""
789
+
790
+ # Basic comparison
791
+ function_match = predicted_call.get("name") == ground_truth_call.get("name")
792
+ args_match = predicted_call.get("arguments") == ground_truth_call.get(
793
+ "arguments"
794
+ )
795
+
796
+ overall_score = (
797
+ 1.0 if (function_match and args_match) else 0.5 if function_match else 0.0
798
+ )
799
+
800
+ return {
801
+ "overall_score": overall_score,
802
+ "is_match": function_match and args_match,
803
+ "explanation": "Fallback comparison - code agent unavailable",
804
+ "confidence": 0.8,
805
+ "comparison_strategy": "code_agent_fallback",
806
+ "function_analysis": {"score": 1.0 if function_match else 0.0},
807
+ "parameter_analysis": {"score": 1.0 if args_match else 0.0},
808
+ "contextual_insights": {},
809
+ }
810
+
811
+ def _extract_python_code(self, response: str) -> str:
812
+ """Extract Python code from LLM response."""
813
+ import re
814
+
815
+ # Look for code blocks
816
+ code_patterns = [
817
+ r"```python\s*(.*?)\s*```",
818
+ r"```\s*(.*?)\s*```",
819
+ r"<\|python_tag\|>(.*?)(?=<\||$)",
820
+ ]
821
+
822
+ for pattern in code_patterns:
823
+ matches = re.findall(pattern, response, re.DOTALL)
824
+ if matches:
825
+ return matches[0].strip()
826
+
827
+ # If no code blocks found, try to extract code heuristically
828
+ lines = response.split("\n")
829
+ code_lines = []
830
+ in_code = False
831
+
832
+ for line in lines:
833
+ if (
834
+ "import " in line
835
+ or "def " in line
836
+ or line.strip().startswith("predicted_call")
837
+ ):
838
+ in_code = True
839
+
840
+ if in_code:
841
+ code_lines.append(line)
842
+
843
+ # Stop if we see explanatory text after code
844
+ if (
845
+ in_code
846
+ and line.strip()
847
+ and not any(
848
+ keyword in line
849
+ for keyword in [
850
+ "import",
851
+ "def",
852
+ "=",
853
+ "return",
854
+ "print",
855
+ "if",
856
+ "elif",
857
+ "else",
858
+ "for",
859
+ "while",
860
+ "try",
861
+ "except",
862
+ "#",
863
+ ]
864
+ )
865
+ ):
866
+ break
867
+
868
+ return "\n".join(code_lines) if code_lines else ""
869
+
870
+ def _execute_python_code(
871
+ self,
872
+ code: str,
873
+ predicted_call: Dict[str, Any],
874
+ ground_truth_call: Dict[str, Any],
875
+ ) -> Optional[Dict[str, Any]]:
876
+ """Safely execute Python code and return results."""
877
+ try:
878
+ # Use a less restrictive but still safe execution environment
879
+ exec_globals = {
880
+ "json": json,
881
+ "predicted_call": predicted_call,
882
+ "ground_truth_call": ground_truth_call,
883
+ }
884
+ exec_locals = {}
885
+
886
+ # Execute the code with default builtins (safer than completely restricting)
887
+ exec(code, exec_globals, exec_locals)
888
+
889
+ # Look for result in various formats
890
+ result_candidates = [
891
+ exec_locals.get("result"),
892
+ exec_locals.get("analysis_result"),
893
+ exec_locals.get("comparison_result"),
894
+ ]
895
+
896
+ for candidate in result_candidates:
897
+ if isinstance(candidate, dict) and "score" in candidate:
898
+ return candidate
899
+
900
+ # If no explicit result variable, try to find a function and call it
901
+ for name, obj in exec_locals.items():
902
+ if callable(obj) and name.startswith(("analyze", "compare")):
903
+ try:
904
+ result = obj(predicted_call, ground_truth_call)
905
+ if isinstance(result, dict):
906
+ return result
907
+ except Exception:
908
+ continue
909
+
910
+ return None
911
+
912
+ except Exception as e:
913
+ logger.error(f"Code execution failed: {e}")
914
+ return None
915
+
916
+ def _direct_code_comparison(
917
+ self,
918
+ predicted_call: Dict[str, Any],
919
+ ground_truth_call: Dict[str, Any],
920
+ custom_instructions: Optional[str] = None,
921
+ ) -> Dict[str, Any]:
922
+ """Direct code generation and execution approach."""
923
+
924
+ try:
925
+ # Use a simpler approach with hardcoded comparison logic
926
+ return self._execute_hardcoded_comparison(
927
+ predicted_call, ground_truth_call, custom_instructions
928
+ )
929
+
930
+ except Exception as e:
931
+ logger.error(f"Direct code comparison failed: {e}")
932
+
933
+ # Fall back to basic comparison
934
+ return self._fallback_tool_call_comparison(predicted_call, ground_truth_call)
935
+
936
+ def _execute_hardcoded_comparison(
937
+ self,
938
+ predicted_call: Dict[str, Any],
939
+ ground_truth_call: Dict[str, Any],
940
+ custom_instructions: Optional[str] = None,
941
+ ) -> Dict[str, Any]:
942
+ """Execute hardcoded comparison logic that handles type conversions."""
943
+
944
+ try:
945
+ # Extract function info
946
+ pred_func = predicted_call.get("function", {})
947
+ gt_func = ground_truth_call.get("function", {})
948
+
949
+ func_match = pred_func.get("name") == gt_func.get("name")
950
+ pred_args = pred_func.get("arguments", {})
951
+ gt_args = gt_func.get("arguments", {})
952
+
953
+ param_scores = []
954
+ param_details = {}
955
+ all_params = set(pred_args.keys()) | set(gt_args.keys())
956
+
957
+ for param in all_params:
958
+ pred_val = pred_args.get(param)
959
+ gt_val = gt_args.get(param)
960
+
961
+ # Handle custom instructions for specific comparisons
962
+ score = self._compare_values_with_conversion(
963
+ pred_val, gt_val, param, custom_instructions
964
+ )
965
+
966
+ param_scores.append(score)
967
+ param_details[param] = {
968
+ "score": score,
969
+ "pred": pred_val,
970
+ "gt": gt_val,
971
+ "match": score >= 0.9,
972
+ "custom_instructions_applied": custom_instructions is not None,
973
+ }
974
+
975
+ param_avg = sum(param_scores) / len(param_scores) if param_scores else 1.0
976
+ overall_score = (1.0 if func_match else 0.0) * 0.3 + param_avg * 0.7
977
+ is_match = func_match and param_avg >= 0.9
978
+
979
+ return {
980
+ "overall_score": overall_score,
981
+ "is_match": is_match,
982
+ "explanation": f"Function match: {func_match}, Param avg: {param_avg:.3f}",
983
+ "confidence": 0.95,
984
+ "comparison_strategy": "code_agent_direct",
985
+ "function_analysis": {"name_match": func_match},
986
+ "parameter_analysis": param_details,
987
+ "contextual_insights": {"direct_execution": True},
988
+ }
989
+
990
+ except Exception as e:
991
+ logger.error(f"Hardcoded comparison failed: {e}")
992
+ return self._fallback_tool_call_comparison(
993
+ predicted_call, ground_truth_call
994
+ )
995
+
996
+ def _compare_values_with_conversion(
997
+ self,
998
+ pred_val: Any,
999
+ gt_val: Any,
1000
+ param_name: str = None,
1001
+ custom_instructions: str = None,
1002
+ ) -> float:
1003
+ """Compare two values with intelligent type conversion and custom instructions."""
1004
+
1005
+ # Apply custom instructions if provided
1006
+ if custom_instructions and param_name:
1007
+ custom_score = self._apply_custom_instructions(
1008
+ pred_val, gt_val, param_name, custom_instructions
1009
+ )
1010
+ if custom_score is not None:
1011
+ return custom_score
1012
+
1013
+ # Exact match
1014
+ if pred_val == gt_val:
1015
+ return 1.0
1016
+
1017
+ # Handle None values
1018
+ if pred_val is None or gt_val is None:
1019
+ return 0.0
1020
+
1021
+ # String comparison (case insensitive)
1022
+ if str(pred_val).lower() == str(gt_val).lower():
1023
+ return 1.0
1024
+
1025
+ # Numeric conversion
1026
+ try:
1027
+ if (
1028
+ isinstance(pred_val, str)
1029
+ and isinstance(gt_val, (int, float))
1030
+ and pred_val.replace(".", "").replace("-", "").isdigit()
1031
+ ):
1032
+ if float(pred_val) == float(gt_val):
1033
+ return 1.0
1034
+ except (ValueError, TypeError):
1035
+ pass
1036
+
1037
+ try:
1038
+ if (
1039
+ isinstance(gt_val, str)
1040
+ and isinstance(pred_val, (int, float))
1041
+ and gt_val.replace(".", "").replace("-", "").isdigit()
1042
+ ):
1043
+ if float(gt_val) == float(pred_val):
1044
+ return 1.0
1045
+ except (ValueError, TypeError):
1046
+ pass
1047
+
1048
+ # Boolean conversion
1049
+ pred_bool = self._convert_to_bool(pred_val)
1050
+ gt_bool = self._convert_to_bool(gt_val)
1051
+ if pred_bool is not None and gt_bool is not None:
1052
+ return 1.0 if pred_bool == gt_bool else 0.0
1053
+
1054
+ # Partial string match for semantic similarity
1055
+ pred_str = str(pred_val).lower()
1056
+ gt_str = str(gt_val).lower()
1057
+
1058
+ # Check if one is contained in the other (for cases like "Delta" vs "Delta Airlines")
1059
+ if pred_str in gt_str or gt_str in pred_str:
1060
+ return 0.8
1061
+
1062
+ # Check for common abbreviations
1063
+ abbreviations = {
1064
+ "nyc": "new york city",
1065
+ "ny": "new york",
1066
+ "la": "los angeles",
1067
+ "sf": "san francisco",
1068
+ "dc": "washington dc",
1069
+ }
1070
+
1071
+ for abbrev, full in abbreviations.items():
1072
+ if (pred_str == abbrev and gt_str == full) or (
1073
+ pred_str == full and gt_str == abbrev
1074
+ ):
1075
+ return 0.9
1076
+
1077
+ return 0.0
1078
+
1079
+ def _convert_to_bool(self, value: Any) -> bool:
1080
+ """Convert various representations to boolean."""
1081
+ if isinstance(value, bool):
1082
+ return value
1083
+ if isinstance(value, str):
1084
+ lower_val = value.lower()
1085
+ if lower_val in ["true", "1", "yes", "on"]:
1086
+ return True
1087
+ elif lower_val in ["false", "0", "no", "off"]:
1088
+ return False
1089
+ if isinstance(value, (int, float)):
1090
+ return bool(value)
1091
+ return None
1092
+
1093
+ def _extract_python_code(self, response: str) -> str:
1094
+ """Extract Python code from LLM response."""
1095
+
1096
+ # Look for code blocks
1097
+ code_patterns = [
1098
+ r"```python\s*(.*?)\s*```",
1099
+ r"```\s*(.*?)\s*```",
1100
+ r"<\|python_tag\|>(.*?)(?=<\||$)",
1101
+ ]
1102
+
1103
+ for pattern in code_patterns:
1104
+ matches = re.findall(pattern, response, re.DOTALL)
1105
+ if matches:
1106
+ return matches[0].strip()
1107
+
1108
+ # If no code blocks found, try to extract code heuristically
1109
+ lines = response.split("\n")
1110
+ code_lines = []
1111
+ in_code = False
1112
+
1113
+ for line in lines:
1114
+ if (
1115
+ "import " in line
1116
+ or "def " in line
1117
+ or line.strip().startswith("predicted_call")
1118
+ ):
1119
+ in_code = True
1120
+
1121
+ if in_code:
1122
+ code_lines.append(line)
1123
+
1124
+ # Stop if we see explanatory text after code
1125
+ if (
1126
+ in_code
1127
+ and line.strip()
1128
+ and not any(
1129
+ keyword in line
1130
+ for keyword in [
1131
+ "import",
1132
+ "def",
1133
+ "=",
1134
+ "return",
1135
+ "print",
1136
+ "if",
1137
+ "elif",
1138
+ "else",
1139
+ "for",
1140
+ "while",
1141
+ "try",
1142
+ "except",
1143
+ "#",
1144
+ ]
1145
+ )
1146
+ ):
1147
+ break
1148
+
1149
+ return "\n".join(code_lines) if code_lines else ""
1150
+
1151
+ def _apply_custom_instructions(
1152
+ self, pred_val: Any, gt_val: Any, param_name: str, custom_instructions: str
1153
+ ) -> Optional[float]:
1154
+ """Apply custom instructions for specific parameter comparisons."""
1155
+ try:
1156
+ # Execute Python code to apply custom instructions
1157
+ import datetime
1158
+
1159
+ # Try to import dateutil, but don't fail if not available
1160
+ try:
1161
+ from dateutil import parser as date_parser
1162
+ except ImportError:
1163
+ date_parser = None
1164
+
1165
+ # Create a local namespace for safe execution
1166
+ local_vars = {
1167
+ "pred_val": pred_val,
1168
+ "gt_val": gt_val,
1169
+ "param_name": param_name,
1170
+ "custom_instructions": custom_instructions,
1171
+ "datetime": datetime,
1172
+ "date_parser": date_parser,
1173
+ "str": str,
1174
+ "int": int,
1175
+ "float": float,
1176
+ "bool": bool,
1177
+ "len": len,
1178
+ "abs": abs,
1179
+ "min": min,
1180
+ "max": max,
1181
+ }
1182
+
1183
+ # Example custom instruction patterns
1184
+ if (
1185
+ "yesterday" in custom_instructions.lower()
1186
+ and "date" in param_name.lower()
1187
+ ):
1188
+ # Handle date comparisons where "yesterday" might be represented differently
1189
+ code = """
1190
+ import datetime
1191
+ from datetime import timedelta
1192
+
1193
+ def check_yesterday_equivalence(pred_val, gt_val):
1194
+ try:
1195
+ # Parse dates from various formats
1196
+ pred_date = None
1197
+ gt_date = None
1198
+
1199
+ # Try to parse predicted value as date
1200
+ if isinstance(pred_val, str):
1201
+ try:
1202
+ if date_parser:
1203
+ pred_date = date_parser.parse(pred_val).date()
1204
+ else:
1205
+ # Simple parsing for YYYY-MM-DD format
1206
+ pred_date = datetime.datetime.strptime(pred_val, "%Y-%m-%d").date()
1207
+ except:
1208
+ pass
1209
+
1210
+ # Try to parse ground truth value as date
1211
+ if isinstance(gt_val, str):
1212
+ try:
1213
+ if date_parser:
1214
+ gt_date = date_parser.parse(gt_val).date()
1215
+ else:
1216
+ # Simple parsing for YYYY-MM-DD format
1217
+ gt_date = datetime.datetime.strptime(gt_val, "%Y-%m-%d").date()
1218
+ except:
1219
+ pass
1220
+
1221
+ # Check if either date represents yesterday
1222
+ yesterday = datetime.date.today() - timedelta(days=1)
1223
+
1224
+ # If both are valid dates and both represent yesterday, they match
1225
+ if pred_date and gt_date:
1226
+ if pred_date == yesterday and gt_date == yesterday:
1227
+ return 1.0
1228
+ elif pred_date == gt_date:
1229
+ return 1.0
1230
+ else:
1231
+ return 0.0
1232
+
1233
+ # If one is yesterday and the other matches yesterday, they match
1234
+ if pred_date == yesterday or gt_date == yesterday:
1235
+ return 1.0
1236
+
1237
+ return None # Fall back to normal comparison
1238
+ except Exception as e:
1239
+ return None # Fall back to normal comparison
1240
+
1241
+ result = check_yesterday_equivalence(pred_val, gt_val)
1242
+ """
1243
+
1244
+ exec(code, {}, local_vars)
1245
+ return local_vars.get("result")
1246
+
1247
+ # Add more custom instruction patterns here
1248
+ # For example, handling relative time expressions, currency formats, etc.
1249
+
1250
+ return None # No custom instruction matched
1251
+
1252
+ except Exception as e:
1253
+ logger.warning(f"Custom instructions execution failed: {e}")
1254
+ return None # Fall back to normal comparison