ibm-watsonx-gov 1.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. ibm_watsonx_gov/__init__.py +8 -0
  2. ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
  3. ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
  4. ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
  5. ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
  6. ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
  7. ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
  8. ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
  9. ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
  10. ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
  11. ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
  12. ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
  13. ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
  14. ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
  15. ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
  16. ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
  17. ibm_watsonx_gov/clients/__init__.py +0 -0
  18. ibm_watsonx_gov/clients/api_client.py +99 -0
  19. ibm_watsonx_gov/clients/segment_client.py +46 -0
  20. ibm_watsonx_gov/clients/usage_client.cp313-win_amd64.pyd +0 -0
  21. ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
  22. ibm_watsonx_gov/config/__init__.py +14 -0
  23. ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
  24. ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
  25. ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
  26. ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
  27. ibm_watsonx_gov/entities/__init__.py +8 -0
  28. ibm_watsonx_gov/entities/agentic_app.py +209 -0
  29. ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
  30. ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
  31. ibm_watsonx_gov/entities/ai_experiment.py +419 -0
  32. ibm_watsonx_gov/entities/base_classes.py +134 -0
  33. ibm_watsonx_gov/entities/container.py +54 -0
  34. ibm_watsonx_gov/entities/credentials.py +633 -0
  35. ibm_watsonx_gov/entities/criteria.py +508 -0
  36. ibm_watsonx_gov/entities/enums.py +274 -0
  37. ibm_watsonx_gov/entities/evaluation_result.py +444 -0
  38. ibm_watsonx_gov/entities/foundation_model.py +490 -0
  39. ibm_watsonx_gov/entities/llm_judge.py +44 -0
  40. ibm_watsonx_gov/entities/locale.py +17 -0
  41. ibm_watsonx_gov/entities/mapping.py +49 -0
  42. ibm_watsonx_gov/entities/metric.py +211 -0
  43. ibm_watsonx_gov/entities/metric_threshold.py +36 -0
  44. ibm_watsonx_gov/entities/model_provider.py +329 -0
  45. ibm_watsonx_gov/entities/model_risk_result.py +43 -0
  46. ibm_watsonx_gov/entities/monitor.py +71 -0
  47. ibm_watsonx_gov/entities/prompt_setup.py +40 -0
  48. ibm_watsonx_gov/entities/state.py +22 -0
  49. ibm_watsonx_gov/entities/utils.py +99 -0
  50. ibm_watsonx_gov/evaluators/__init__.py +26 -0
  51. ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
  52. ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
  53. ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
  54. ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
  55. ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cp313-win_amd64.pyd +0 -0
  56. ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cp313-win_amd64.pyd +0 -0
  57. ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
  58. ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
  59. ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
  60. ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
  61. ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
  62. ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
  63. ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
  64. ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
  65. ibm_watsonx_gov/metrics/__init__.py +74 -0
  66. ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
  67. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
  68. ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
  69. ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
  70. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
  71. ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
  72. ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
  73. ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
  74. ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
  75. ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
  76. ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
  77. ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
  78. ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
  79. ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
  80. ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
  81. ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
  82. ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
  83. ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
  84. ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
  85. ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
  86. ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
  87. ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
  88. ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
  89. ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
  90. ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
  91. ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
  92. ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
  93. ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
  94. ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
  95. ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
  96. ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
  97. ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
  98. ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
  99. ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
  100. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
  101. ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
  102. ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
  103. ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
  104. ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
  105. ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
  106. ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
  107. ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
  108. ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
  109. ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
  110. ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
  111. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
  112. ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
  113. ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
  114. ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
  115. ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
  116. ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
  117. ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
  118. ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
  119. ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
  120. ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
  121. ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
  122. ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
  123. ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
  124. ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
  125. ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
  126. ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
  127. ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
  128. ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
  129. ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
  130. ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
  131. ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
  132. ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
  133. ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
  134. ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
  135. ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
  136. ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
  137. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
  138. ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
  139. ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
  140. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
  141. ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
  142. ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
  143. ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
  144. ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
  145. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
  146. ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
  147. ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
  148. ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
  149. ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
  150. ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
  151. ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
  152. ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
  153. ibm_watsonx_gov/metrics/status/__init__.py +0 -0
  154. ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
  155. ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
  156. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
  157. ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
  158. ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
  159. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
  160. ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
  161. ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
  162. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
  163. ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
  164. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
  165. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
  166. ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
  167. ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
  168. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
  169. ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
  170. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
  171. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
  172. ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
  173. ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
  174. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
  175. ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
  176. ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
  177. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
  178. ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
  179. ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
  180. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
  181. ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
  182. ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
  183. ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
  184. ibm_watsonx_gov/metrics/utils.py +440 -0
  185. ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
  186. ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
  187. ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
  188. ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
  189. ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
  190. ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
  191. ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
  192. ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
  193. ibm_watsonx_gov/providers/__init__.py +8 -0
  194. ibm_watsonx_gov/providers/detectors_provider.cp313-win_amd64.pyd +0 -0
  195. ibm_watsonx_gov/providers/detectors_provider.py +415 -0
  196. ibm_watsonx_gov/providers/eval_assist_provider.cp313-win_amd64.pyd +0 -0
  197. ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
  198. ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
  199. ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
  200. ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
  201. ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
  202. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
  203. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
  204. ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
  205. ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
  206. ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
  207. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
  208. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
  209. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
  210. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
  211. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
  212. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
  213. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
  214. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
  215. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
  216. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
  217. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
  218. ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
  219. ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
  220. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
  221. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
  222. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
  223. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
  224. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
  225. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  226. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
  227. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
  228. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
  229. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  230. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
  231. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
  232. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
  233. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
  234. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
  235. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
  236. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
  237. ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
  238. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
  239. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
  240. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
  241. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
  242. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
  243. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
  244. ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
  245. ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
  246. ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
  247. ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
  248. ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
  249. ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
  250. ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
  251. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
  252. ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
  253. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
  254. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
  255. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
  256. ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
  257. ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
  258. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
  259. ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
  260. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
  261. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
  262. ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
  263. ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
  264. ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
  265. ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
  266. ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
  267. ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
  268. ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
  269. ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
  270. ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
  271. ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
  272. ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
  273. ibm_watsonx_gov/providers/unitxt_provider.cp313-win_amd64.pyd +0 -0
  274. ibm_watsonx_gov/tools/__init__.py +10 -0
  275. ibm_watsonx_gov/tools/clients/__init__.py +11 -0
  276. ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
  277. ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
  278. ibm_watsonx_gov/tools/core/__init__.py +8 -0
  279. ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
  280. ibm_watsonx_gov/tools/entities/__init__.py +8 -0
  281. ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
  282. ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
  283. ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
  284. ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
  285. ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
  286. ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
  287. ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
  288. ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
  289. ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
  290. ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
  291. ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
  292. ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
  293. ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
  294. ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
  295. ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
  296. ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
  297. ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
  298. ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
  299. ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
  300. ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
  301. ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
  302. ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
  303. ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
  304. ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
  305. ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
  306. ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
  307. ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
  308. ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
  309. ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
  310. ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
  311. ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
  312. ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
  313. ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
  314. ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
  315. ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
  316. ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
  317. ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
  318. ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
  319. ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
  320. ibm_watsonx_gov/tools/utils/__init__.py +14 -0
  321. ibm_watsonx_gov/tools/utils/constants.py +69 -0
  322. ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
  323. ibm_watsonx_gov/tools/utils/environment.py +108 -0
  324. ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
  325. ibm_watsonx_gov/tools/utils/platform_url_mapping.cp313-win_amd64.pyd +0 -0
  326. ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
  327. ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
  328. ibm_watsonx_gov/traces/__init__.py +8 -0
  329. ibm_watsonx_gov/traces/span_exporter.py +195 -0
  330. ibm_watsonx_gov/traces/span_node.py +251 -0
  331. ibm_watsonx_gov/traces/span_util.py +153 -0
  332. ibm_watsonx_gov/traces/trace_utils.py +1074 -0
  333. ibm_watsonx_gov/utils/__init__.py +8 -0
  334. ibm_watsonx_gov/utils/aggregation_util.py +346 -0
  335. ibm_watsonx_gov/utils/async_util.py +62 -0
  336. ibm_watsonx_gov/utils/authenticator.py +144 -0
  337. ibm_watsonx_gov/utils/constants.py +15 -0
  338. ibm_watsonx_gov/utils/errors.py +40 -0
  339. ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
  340. ibm_watsonx_gov/utils/insights_generator.py +1285 -0
  341. ibm_watsonx_gov/utils/python_utils.py +425 -0
  342. ibm_watsonx_gov/utils/rest_util.py +73 -0
  343. ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
  344. ibm_watsonx_gov/utils/singleton_meta.py +25 -0
  345. ibm_watsonx_gov/utils/url_mapping.cp313-win_amd64.pyd +0 -0
  346. ibm_watsonx_gov/utils/validation_util.py +126 -0
  347. ibm_watsonx_gov/visualizations/__init__.py +13 -0
  348. ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
  349. ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
  350. ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
  351. ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
  352. ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
  353. ibm_watsonx_gov-1.3.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,525 @@
1
+ # ----------------------------------------------------------------------------------------------------
2
+ # IBM Confidential
3
+ # Licensed Materials - Property of IBM
4
+ # 5737-H76, 5900-A3Q
5
+ # © Copyright IBM Corp. 2025 All Rights Reserved.
6
+ # US Government Users Restricted Rights - Use, duplication or disclosure restricted by
7
+ # GSA ADPSchedule Contract with IBM Corp.
8
+ # ----------------------------------------------------------------------------------------------------
9
+ import random
10
+ import re
11
+ from collections import defaultdict
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ from typing import Any, Dict, List, TypedDict, Union
14
+
15
+ import pandas as pd
16
+
17
+ from ibm_watsonx_gov.metrics.llm_validation.llm_validation_constants import max_eval_text_for_synthesis, \
18
+ min_recurrent_evaluation_issues
19
+ from ibm_watsonx_gov.metrics.llm_validation.llm_validation_prompts import \
20
+ map_shortcomings_system_prompt, map_shortcomings_human_prompt, \
21
+ recurrent_issues_synthesis_human_prompt, recurrent_issues_synthesis_system_prompt, full_response_eval_human_prompt, \
22
+ full_response_eval_system_prompt, summarization_system_prompt, summarization_human_prompt, \
23
+ shortcomings_clustering_system_prompt, shortcomings_clustering_human_prompt
24
+
25
+ try:
26
+ from langchain.schema import HumanMessage, SystemMessage
27
+ from langgraph.graph import END, StateGraph
28
+ from langchain_ibm import ChatWatsonx
29
+ except:
30
+ pass
31
+
32
+ from tqdm.auto import tqdm
33
+ from ibm_watsonx_gov.metrics.llm_validation.evaluation_criteria import (
34
+ EvaluationCriteria, get_default_evaluation_criteria)
35
+
36
+
37
class State(TypedDict):
    """State dictionary passed between the nodes of the evaluation graph."""

    # Input text that was given to the model under evaluation.
    model_input: str
    # Output text produced by the model under evaluation.
    model_output: str
    # Free-text evaluation produced by the judge LLM (score line removed).
    evaluation_text: str
    # Score parsed from the judge response: a float in [0, 1], or None when
    # no valid score could be extracted.
    evaluation_score: Union[float, None]
    # Summary of evaluation_text produced by the judge LLM.
    evaluation_summary: str
    # Chat model used as judge; the nodes only call llm.invoke(messages)
    # and read .content from the result.
    llm: Any
    # Criteria used to build the evaluation system prompt. _evaluate_row
    # passes None when the caller did not supply any criteria.
    evaluation_criteria: Union[EvaluationCriteria, None]
45
+
46
+
47
+
48
+
49
+ # --- Helper Functions ---
50
def parse_evaluation_response(response_content):
    """Split a judge-LLM response into evaluation text and a numeric score.

    The score is looked up in two steps:

    1. An explicit ``Evaluation score: X`` line (case-insensitive). When the
       value lies in [0, 1] it is used and the line is removed from the text.
    2. Otherwise, the last standalone number in [0, 1] found in the response.
       Only that single occurrence is removed from the text. Digits that are
       part of a larger number (e.g. the "0" in "2.0" or the "1" in "1.5")
       are not treated as scores.

    Args:
        response_content: Raw response text; ``None`` is treated as empty.

    Returns:
        tuple: ``(text, score)`` where ``text`` is the cleaned evaluation text
        and ``score`` is a float in [0, 1], or ``None`` if none was found
        (a warning is printed in that case).
    """
    text = (response_content or "").strip()
    score = None

    # Attempt to find a score line like "Evaluation score: X.Y"
    score_line = re.search(
        r"Evaluation score:\s*(\d+(?:\.\d+)?)", text, re.IGNORECASE)
    if score_line:
        value = float(score_line.group(1))
        if 0 <= value <= 1:
            score = value
            # Remove the score line with the same case-insensitivity used to
            # find it, so a lower-case line does not linger in the text.
            text = re.sub(
                r"(\n|^)\s*Evaluation score:\s*\d+(?:\.\d+)?\s*(\n|$)",
                "\n", text, flags=re.IGNORECASE).strip()

    # Fallback: take the last standalone number between 0 and 1; the score
    # is usually stated at the end of the response.
    if score is None:
        candidates = list(re.finditer(
            r"(?<![\w.])(0(?:\.\d+)?|1(?:\.0+)?)(?![\w]|\.\d)", text))
        if candidates:
            last = candidates[-1]
            score = float(last.group(1))
            # Cut out only the matched occurrence; a global str.replace would
            # also strip the same digits from unrelated numbers and words.
            text = (text[:last.start()] + text[last.end():]).strip()

    if score is None:
        print(f"Warning: Could not extract valid score from evaluation response: {response_content}")

    # Clean up common artifacts if needed
    text = text.replace("--- Begin Evaluation ---", "").replace("Textual Evaluation:", "").strip()

    return text, score
88
+
89
+
90
def generate_llm_response(llm: Any, system_prompt: str, human_prompt: str) -> str:
    """Send a system/human prompt pair to the LLM and return the reply text.

    Any exception raised while building the messages or invoking the model is
    not propagated; it is returned as a string starting with ``"LLM Error: "``.

    Args:
        llm: Chat model exposing ``invoke(messages)``.
        system_prompt: Content of the system message (converted with ``str``).
        human_prompt: Content of the human message (converted with ``str``).

    Returns:
        The ``content`` of the model reply, or an ``"LLM Error: ..."`` string.
    """
    try:
        system_message = SystemMessage(content=str(system_prompt))
        human_message = HumanMessage(content=str(human_prompt))
        reply = llm.invoke([system_message, human_message])
        return reply.content
    except Exception as err:
        return f"LLM Error: {err}"
101
+
102
+
103
def run_func_in_threads(func, input_list, max_workers=10, error_prefix="Error: ", progress_desc="Processing tasks"):
    """Call ``func(*args)`` for every argument tuple, using a thread pool.

    Results are returned in the same order as ``input_list``. A call that
    raises is reported as a one-element list ``["<prefix>: <exception>"]`` in
    its result slot instead of aborting the whole run; this applies to the
    single-input shortcut as well as to the threaded path.

    Args:
        func: Callable invoked once per entry of ``input_list``.
        input_list: Sequence of argument tuples, unpacked into ``func``.
        max_workers: Size of the thread pool.
        error_prefix: Text placed before the exception message. Trailing
            colons/spaces are ignored so the default renders as
            ``"Error: <message>"`` rather than ``"Error: : <message>"``.
        progress_desc: Description shown on the progress bar.

    Returns:
        list: One result (or error entry) per input, in input order.
    """
    if not input_list:
        return []

    prefix = error_prefix.rstrip(": ")

    def _guarded_call(args):
        # Convert a failure into an error entry so one bad input cannot
        # discard the results of all the others.
        try:
            return func(*args)
        except Exception as e:
            return [f"{prefix}: {e}"]

    # A single task does not need a pool or a progress bar.
    if len(input_list) == 1:
        return [_guarded_call(input_list[0])]

    results = [None] * len(input_list)
    with ThreadPoolExecutor(max_workers) as executor:
        future_to_input_idx = {executor.submit(_guarded_call, args): idx
                               for idx, args in enumerate(input_list)}
        # Futures complete out of order; the index map restores input order.
        for future in tqdm(as_completed(future_to_input_idx), total=len(input_list), desc=progress_desc):
            results[future_to_input_idx[future]] = future.result()

    return results
119
+
120
+
121
+ # --- Node Functions ---
122
def evaluate_response_node(state: State) -> Dict[str, Any]:
    """Evaluate the model output with the judge LLM.

    Builds the full-response evaluation prompts from the state, asks the LLM
    for an evaluation and parses the reply into text and score.

    Args:
        state: Graph state; reads ``model_input``, ``model_output``, ``llm``
            and, optionally, ``evaluation_criteria``.

    Returns:
        dict: ``evaluation_text`` and ``evaluation_score`` state updates.
    """
    # The key may be present with a None value (the default passed by
    # _evaluate_row), so a dict.get() default alone is not enough here.
    criteria = state.get("evaluation_criteria")
    if criteria is None:
        criteria = get_default_evaluation_criteria()
    evaluation_criteria_str = criteria.to_str()

    system_prompt = full_response_eval_system_prompt.format(
        evaluation_criteria=evaluation_criteria_str
    )
    human_prompt = full_response_eval_human_prompt.format(
        model_input=state['model_input'],
        model_output=state['model_output'],
    )
    evaluation_response = generate_llm_response(
        state["llm"], system_prompt, human_prompt)
    evaluation_text, score = parse_evaluation_response(evaluation_response)
    return {"evaluation_text": evaluation_text, "evaluation_score": score}
138
+
139
+
140
def summarize_evaluation_node(state: State) -> Dict[str, str]:
    """Produce a summary of the evaluation text held in the state.

    When the evaluation text is empty/missing or contains the marker
    "LLM Error", no LLM call is made and a fixed placeholder is returned.

    Args:
        state: Graph state; reads ``evaluation_text`` and ``llm``.

    Returns:
        dict: The ``evaluation_summary`` state update.
    """
    evaluation_text = state.get("evaluation_text")
    usable = bool(evaluation_text) and "LLM Error" not in evaluation_text

    if usable:
        system_prompt = summarization_system_prompt
        human_prompt = summarization_human_prompt.format(
            evaluation_text=evaluation_text)
        reply = generate_llm_response(
            state["llm"], system_prompt, human_prompt)
        return {"evaluation_summary": reply.strip()}

    return {"evaluation_summary": "Evaluation text missing or contains error."}
150
+
151
+
152
+ # --- Graph Definition ---
153
def get_evaluation_graph():
    """Build and compile the two-step evaluation workflow.

    The graph runs ``evaluate_response`` first, then ``summarize_evaluation``,
    and then ends.

    Returns:
        The compiled graph returned by ``StateGraph.compile()``.
    """
    graph = StateGraph(State)

    # Register the nodes in execution order.
    nodes = (
        ("evaluate_response", evaluate_response_node),
        ("summarize_evaluation", summarize_evaluation_node),
    )
    for node_name, node_func in nodes:
        graph.add_node(node_name, node_func)

    # Linear flow: evaluate -> summarize -> END.
    graph.set_entry_point("evaluate_response")
    graph.add_edge("evaluate_response", "summarize_evaluation")
    graph.add_edge("summarize_evaluation", END)

    return graph.compile()
163
+
164
+
165
+ # --- Evaluate single records ---
166
+ def _evaluate_row(row: pd.Series, app: Any, llm: Any, input_col: str, output_col: str,
167
+ text_col: str, score_col: str, summary_col: str, evaluation_criteria: EvaluationCriteria = None) -> pd.Series:
168
+ """
169
+ Helper function to evaluate a single row using the pre-compiled graph.
170
+ To be used with df.apply().
171
+ """
172
+ model_input = row.get(input_col)
173
+ model_output = row.get(output_col)
174
+
175
+ if not model_input or not model_output:
176
+ return pd.Series({
177
+ text_col: "",
178
+ score_col: None,
179
+ summary_col: ""
180
+ })
181
+
182
+ initial_state: State = {
183
+ "model_input": str(model_input),
184
+ "model_output": str(model_output),
185
+ "llm": llm,
186
+ "evaluation_text": "",
187
+ "evaluation_score": None,
188
+ "evaluation_summary": "",
189
+ "evaluation_criteria": evaluation_criteria,
190
+ }
191
+
192
+ try:
193
+ final_state = app.invoke(initial_state)
194
+ return pd.Series({
195
+ text_col: final_state.get("evaluation_text", "Error: Text not generated"),
196
+ # Default to None
197
+ score_col: final_state.get("evaluation_score", None),
198
+ summary_col: final_state.get(
199
+ "evaluation_summary", "Error: Summary not generated")
200
+ })
201
+ except Exception as e:
202
+ return pd.Series({
203
+ text_col: f"Error during processing: {e}",
204
+ score_col: None, # Error -> None score
205
+ summary_col: f"Error during processing: {e}"
206
+ })
207
+
208
+
209
def get_num_workers_for_llm(llm):
    """Return the worker-thread count to use for ``llm``.

    ``ChatWatsonx`` instances get 3 workers; every other LLM gets 15.
    """
    return 3 if isinstance(llm, ChatWatsonx) else 15
213
+
214
def llm_validation_per_record(
    df: pd.DataFrame,
    llm: Any,
    input_col: str,
    output_col: str,
    text_col: str = 'evaluation_text',
    score_col: str = 'evaluation_score',
    summary_col: str = 'evaluation_summary',
    evaluation_criteria: EvaluationCriteria | None = None
) -> pd.DataFrame:
    """
    Evaluates model responses in a DataFrame using a pre-compiled LangGraph.

    Rows are evaluated concurrently and the three result columns are written
    back by row position, so the frame may carry any index (non-default,
    non-contiguous or duplicated labels) and may be empty.

    Args:
        df: The Pandas DataFrame to process. It is modified in place.
        llm: An initialized LangChain compatible LLM instance.
        input_col: Name of the column containing the model input text.
        output_col: Name of the column containing the model output text.
        text_col: Name for the new column for the full evaluation text.
        score_col: Name for the new column for the extracted evaluation score.
        summary_col: Name for the new column for the evaluation summary.
        evaluation_criteria: Optional evaluation criteria forwarded unchanged
            to every row evaluation.

    Returns:
        The original DataFrame with the new evaluation columns added.

    Raises:
        ValueError: If ``df`` is not a DataFrame, ``llm`` is falsy, or either
            the input or output column is missing.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input 'df' must be a Pandas DataFrame.")
    if not llm:
        raise ValueError("LLM instance must be provided.")
    if input_col not in df.columns:
        raise ValueError(f"Input column '{input_col}' not found in DataFrame.")
    if output_col not in df.columns:
        raise ValueError(
            f"Output column '{output_col}' not found in DataFrame.")

    app = get_evaluation_graph()
    inputs = [
        [row, app, llm, input_col, output_col, text_col,
         score_col, summary_col, evaluation_criteria]
        for _, row in df.iterrows()
    ]

    results = run_func_in_threads(
        _evaluate_row, inputs, max_workers=get_num_workers_for_llm(llm),
        progress_desc="Evaluating single records")

    texts, scores, summaries = [], [], []
    for outcome in results:
        if isinstance(outcome, pd.Series):
            texts.append(outcome.get(text_col, ""))
            scores.append(outcome.get(score_col))
            summaries.append(outcome.get(summary_col, ""))
        else:
            # The thread runner replaces the result of a worker that raised
            # with a one-item list holding the error message.
            message = str(outcome[0]) if outcome else "Error during processing"
            texts.append(message)
            scores.append(None)
            summaries.append(message)

    # Assign plain lists, i.e. by position. Assigning a Series through
    # ``df.loc`` aligns on index labels, which silently yields NaN (or
    # misplaced values) whenever ``df`` does not use the default 0..n-1 index.
    df[text_col] = texts
    df[score_col] = scores
    df[summary_col] = summaries

    return df
267
+
268
+
269
+ # --- Aggregation over records ---
270
def generate_issues_and_map_to_records(summaries_list: List[str],
                                       llm: Any) -> Dict[str, List[int]]:
    """
    Derive recurring issues from record-level summaries and map them back.

    With no summaries there are no issues; with exactly one summary that
    summary itself is used as the issue list; otherwise the issues are
    synthesized by the LLM. Issue lists longer than
    ``min_recurrent_evaluation_issues`` are passed through clustering before
    being mapped to records.

    Args:
        summaries_list: The list of record level summaries
        llm: An initialized LangChain compatible LLM instance.
    Returns:
        A dictionary mapping from each identified recurring issue to a list of
        corresponding indices in summaries_list. When issue generation reports
        an error, the dictionary maps that error message to an empty list.
    """
    summary_count = len(summaries_list)
    single_record = summary_count == 1

    if summary_count > 1:
        issues = find_recurrent_evaluation_issues(summaries_list, llm)
    else:
        issues = summaries_list if single_record else []

    # Surface a generation error to the caller as the only key.
    if is_issues_list_error(issues):
        return {issues[0]: []}

    if len(issues) > min_recurrent_evaluation_issues:
        issues = cluster_similar_issues(issues, llm)

    # Surface a clustering error the same way.
    if is_issues_list_error(issues):
        return {issues[0]: []}

    # The issues were taken from the one and only summary, so each of them
    # necessarily applies to record 0.
    if single_record:
        return {issue: [0] for issue in issues}

    return map_issues_to_records(summaries_list, llm, issues)
304
+
305
def parse_shortcoming_list_response(response_content: str) -> List[str]:
    """Parse an LLM response expected to contain a Python list of strings.

    The first bracketed list of double-quoted strings is preferred. When none
    is present, every bracketed span is tried as a Python literal, which also
    accepts single-quoted lists and an explicit empty list ``[]`` (the model
    reporting that there are no issues).

    Args:
        response_content: Raw text returned by the LLM.

    Returns:
        The stripped, non-empty strings of the list (possibly an empty list),
        or a one-item list holding an "Error during issue summarization"
        message when no list can be found or parsing fails.
    """
    import ast  # local import: only the fallback parser below needs it

    try:
        # Find the list within the response
        list_match = re.search(
            r'\[\s*(".*?"(?:\s*,\s*".*?")*)\s*\]', response_content, re.DOTALL)
        if list_match:
            quoted = re.findall(r'"(.*?)"', list_match.group(1))
            return [s.strip() for s in quoted if s.strip()]

        # Fallback: accept any bracketed span that is a literal list of
        # strings; spans such as "[1]" in surrounding prose are skipped.
        for candidate in re.finditer(r'\[.*?\]', response_content, re.DOTALL):
            try:
                parsed = ast.literal_eval(candidate.group(0))
            except (ValueError, SyntaxError):
                continue
            if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
                return [item.strip() for item in parsed if item.strip()]

        return ["Error during issue summarization: no issues found"]
    except Exception as e:
        return [f"Error during issue summarization: {e}"]
322
+
323
+
324
def get_summaries_for_synthesis_as_text(summaries_list: List[str]) -> str:
    """Join the usable summaries into one text block for issue synthesis.

    Falsy entries and entries flagged by ``is_summary_error`` are dropped.
    When more than ``max_eval_text_for_synthesis`` summaries remain, a random
    sample of that size is used instead.

    Returns:
        The remaining summaries joined by a ``---`` separator line, or an
        empty string when none remain.
    """
    usable = []
    for raw_summary in summaries_list:
        if not raw_summary:
            continue
        summary_text = str(raw_summary)
        if is_summary_error(summary_text):
            continue
        usable.append(summary_text)

    if not usable:
        return ""

    # Sample texts if there are too many
    if len(usable) > max_eval_text_for_synthesis:
        usable = random.sample(usable, max_eval_text_for_synthesis)
    return "\n---\n".join(usable)
338
+
339
+
340
def cluster_similar_issues(issues_list: List[str], llm: Any) -> List[str]:
    """
    Ask the LLM to cluster an initial list of issues into a consolidated list.

    Args:
        issues_list: The initial list of issues
        llm: An initialized LangChain compatible LLM instance.

    Returns:
        The issue list parsed from the LLM's clustering response. Falls back
        to the original ``issues_list`` when the call raises, or when the
        parsed list is empty or an error list.
    """
    system_prompt = shortcomings_clustering_system_prompt
    human_prompt = shortcomings_clustering_human_prompt.format(
        recurring_issues_list=issues_list)
    try:
        llm_answer = generate_llm_response(llm, system_prompt, human_prompt)
        clustered = parse_shortcoming_list_response(llm_answer)
    except Exception:
        # Clustering is best effort; keep the unclustered issues.
        return issues_list

    if clustered and not is_issues_list_error(clustered):
        return clustered
    # if clustering failed - fallback to original issues list
    return issues_list
365
+
366
+
367
+
368
def find_recurrent_evaluation_issues(
    summaries_list: List[str],
    llm: Any,
) -> List[str]:
    """
    Ask the LLM to synthesize recurring issues from record-level summaries.

    Args:
        summaries_list: The list of record level summaries
        llm: An initialized LangChain compatible LLM instance.

    Returns:
        The list of issues parsed from the LLM response.
        An empty list when no usable summary text could be assembled.
        A one-item list containing an error message if the LLM call or the
        parsing raises.
    """
    try:
        joined_summaries = get_summaries_for_synthesis_as_text(summaries_list)
    except Exception:
        # Failing to assemble the text is treated as having nothing to analyze.
        joined_summaries = ""

    if not joined_summaries:
        return []

    system_prompt = recurrent_issues_synthesis_system_prompt
    human_prompt = recurrent_issues_synthesis_human_prompt.format(
        concatenated_evaluation_text=joined_summaries)

    try:
        llm_answer = generate_llm_response(llm, system_prompt, human_prompt)
        return parse_shortcoming_list_response(llm_answer)
    except Exception as exc:
        return [f"Error during issue summarization: {exc}"]
402
+
403
+
404
def analyze_shortcomings_llm(eval_text, llm, shortcomings):
    """
    Use LLM to analyze evaluation text for shortcomings.

    Args:
        eval_text: The evaluation text of a single record.
        llm: LLM instance passed to ``generate_llm_response``.
        shortcomings: The list of shortcomings to look for.

    Returns:
        A list with one integer per shortcoming (expected to be 0 or 1) in the
        order of ``shortcomings``. An empty list when ``eval_text`` is missing,
        not a string, or starts with "Evaluation text missing". A one-item
        list holding an "Error in issues selection" message when the LLM call
        fails or its answer has the wrong number of values.
    """
    # Test for missing/non-text input before calling a str method on it, so
    # that None or NaN summaries are skipped instead of raising.
    if (
        not isinstance(eval_text, str)
        or not eval_text
        or eval_text.startswith("Evaluation text missing")
    ):
        return []

    # Create numbered list of shortcomings for the prompt
    shortcomings_list = "\n".join(
        [f"{i + 1}. {s}" for i, s in enumerate(shortcomings)])
    num_shortcomings = len(shortcomings)

    system_prompt = map_shortcomings_system_prompt.format(num_shortcomings=num_shortcomings)
    human_prompt = map_shortcomings_human_prompt.format(shortcomings_list=shortcomings_list,
                                                        num_shortcomings=num_shortcomings,
                                                        eval_text=eval_text)
    try:
        response = generate_llm_response(
            llm, system_prompt, human_prompt).strip()

        # Extract the list from the response if needed. The closing bracket is
        # searched for after the opening one, so a stray "]" earlier in the
        # text cannot produce an empty slice.
        start = response.find('[')
        end = response.find(']', start + 1) if start != -1 else -1
        if end != -1:
            response = response[start:end + 1].strip("[]")

        binary_values = response.split(',')

        if len(binary_values) == num_shortcomings:
            return [int(value.strip()) for value in binary_values]

        return ["Error in issues selection: bad response format"]

    except Exception as e:
        return [f"Error in issues selection: {e}"]
440
+
441
+
442
def is_summary_error(summary):
    """Tell whether a record summary is missing or marks a failed evaluation.

    A summary counts as an error when it is falsy, starts with the
    "Evaluation text missing or contains error." placeholder, or contains
    the text "LLM Error".
    """
    if not summary:
        return True
    if summary.startswith("Evaluation text missing or contains error."):
        return True
    return "LLM Error" in summary
444
+
445
+
446
def is_issues_list_error(issues_list):
    """Tell whether an issues list is really a single error message.

    True only for a one-item list whose item contains
    "Error during issue summarization".
    """
    return (
        len(issues_list) == 1
        and "Error during issue summarization" in issues_list[0]
    )
451
+
452
+
453
def map_issues_to_records(summaries_list: List[str], llm, issues_list: List[str] | None = None) -> Dict[str, List[int]]:
    """
    Map each record to the relevant issues from issues_list.

    Args:
        summaries_list: The list of record level summaries
        llm: An initialized LangChain compatible LLM instance.
        issues_list: The list of common recurring issues
    Returns:
        A dictionary mapping each recurring issue to the indices of the
        matching summaries in summaries_list, ordered by number of matching
        records (most frequent first; ties keep their ``issues_list`` order).
        An empty dictionary when there are no issues. When ``issues_list`` is
        an error list, a dictionary mapping the error message to an empty list.
    """
    if not issues_list:
        return {}

    # return the error
    if is_issues_list_error(issues_list):
        return {issues_list[0]: []}

    input_list = [[record_summary, llm, issues_list]
                  for record_summary in summaries_list]
    results = run_func_in_threads(analyze_shortcomings_llm, input_list,
                                  max_workers=get_num_workers_for_llm(llm), error_prefix="Error in issues selection",
                                  progress_desc="Mapping issues to records")

    issue_to_matching_record_ids = {issue: [] for issue in issues_list}
    for rec_i, detected_flags in enumerate(results):
        if not detected_flags:
            continue
        if len(detected_flags) == 1 and isinstance(detected_flags[0], str):
            # A per-record failure is reported as a one-item list holding an
            # error message. It names no issue, so it maps to nothing; using
            # the message as a key would raise KeyError.
            continue
        # zip() tolerates a flag list shorter than issues_list rather than
        # raising IndexError.
        for issue, flag in zip(issues_list, detected_flags):
            if flag == 1:
                issue_to_matching_record_ids[issue].append(rec_i)

    # Most frequent issue first; sorted() is stable, so ties keep input order.
    ranked = sorted(issue_to_matching_record_ids.items(),
                    key=lambda item: len(item[1]), reverse=True)
    return dict(ranked)
505
+
506
+
507
def reverse_mapping(mapping):
    """
    Invert a mapping of keys to index lists.

    Each key is represented by its position in the iteration order of
    ``mapping``, producing: index -> list of key positions.

    Args:
        mapping (dict): Mapping from keys to list of indices.

    Returns:
        dict: Mapping from index to list of key positions (ints). Keys whose
        index list is empty contribute nothing.
    """
    inverted = {}
    for position, key in enumerate(mapping):
        for index in mapping[key]:
            inverted.setdefault(index, []).append(position)
    return inverted