ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
|
@@ -0,0 +1,1074 @@
|
|
|
1
|
+
# ----------------------------------------------------------------------------------------------------
|
|
2
|
+
# IBM Confidential
|
|
3
|
+
# Licensed Materials - Property of IBM
|
|
4
|
+
# 5737-H76, 5900-A3Q
|
|
5
|
+
# © Copyright IBM Corp. 2025 All Rights Reserved.
|
|
6
|
+
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
|
|
7
|
+
# GSA ADPSchedule Contract with IBM Corp.
|
|
8
|
+
# ----------------------------------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import uuid
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
from jsonpath_ng import parse as parse_jsonpath
|
|
18
|
+
|
|
19
|
+
from ibm_watsonx_gov.clients.api_client import APIClient
|
|
20
|
+
from ibm_watsonx_gov.config.agentic_ai_configuration import \
|
|
21
|
+
AgenticAIConfiguration
|
|
22
|
+
from ibm_watsonx_gov.entities.agentic_app import (AgenticApp,
|
|
23
|
+
MetricsConfiguration, Node)
|
|
24
|
+
from ibm_watsonx_gov.entities.enums import (EvaluatorFields, MessageStatus,
|
|
25
|
+
MetricGroup)
|
|
26
|
+
from ibm_watsonx_gov.entities.evaluation_result import (AgentMetricResult,
|
|
27
|
+
MessageData,
|
|
28
|
+
MetricMapping,
|
|
29
|
+
MetricsMappingData,
|
|
30
|
+
NodeData)
|
|
31
|
+
from ibm_watsonx_gov.entities.foundation_model import FoundationModelInfo
|
|
32
|
+
from ibm_watsonx_gov.entities.metric import Mapping, MappingItem
|
|
33
|
+
from ibm_watsonx_gov.entities.utils import \
|
|
34
|
+
build_configuration_from_metric_mappings
|
|
35
|
+
from ibm_watsonx_gov.evaluators.impl.evaluate_metrics_impl import \
|
|
36
|
+
_evaluate_metrics_async
|
|
37
|
+
from ibm_watsonx_gov.metrics.utils import (COST_METADATA, ONE_M,
|
|
38
|
+
TARGETED_USAGE_TRACE_NAMES,
|
|
39
|
+
mapping_to_df)
|
|
40
|
+
from ibm_watsonx_gov.traces.span_node import SpanNode
|
|
41
|
+
from ibm_watsonx_gov.traces.span_util import (get_attributes,
|
|
42
|
+
get_span_nodes_from_json)
|
|
43
|
+
from ibm_watsonx_gov.utils.async_util import (gather_with_concurrency,
|
|
44
|
+
run_in_event_loop)
|
|
45
|
+
from ibm_watsonx_gov.utils.python_utils import add_if_unique
|
|
46
|
+
|
|
47
|
+
# The OpenTelemetry protobuf bindings are an optional dependency: when they are
# unavailable, Span/Status simply stay undefined and trace processing features
# that need them will fail at use time instead of import time.
try:
    from opentelemetry.proto.trace.v1.trace_pb2 import Span, Status
except Exception:
    # Narrowed from a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt. `Exception` still tolerates ImportError as well as
    # protobuf version-mismatch errors raised during module import.
    pass
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Map OTEL protobuf span status codes onto the SDK's message-level statuses.
# NOTE(review): relies on the optional opentelemetry import above having
# succeeded; if it failed, this lookup raises NameError at module import.
STATUS_MAP = {
    Status.STATUS_CODE_OK: MessageStatus.SUCCESSFUL,
    Status.STATUS_CODE_ERROR: MessageStatus.FAILURE,
    Status.STATUS_CODE_UNSET: MessageStatus.UNKNOWN
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class TraceUtils:
|
|
61
|
+
|
|
62
|
+
@staticmethod
def build_span_trees(spans: list[dict], agentic_app: AgenticApp | None = None) -> List[SpanNode]:
    """Assemble flat span dicts into parent/child trees and return the roots.

    Spans whose parent cannot be located (orphans) are kept as roots so
    that no span is ever silently dropped.
    """
    # Index every span node; keys come from get_span_nodes_from_json and
    # combine trace_id with span_id, so identical span ids from different
    # traces cannot collide.
    nodes_by_key: dict[bytes, SpanNode] = {}
    for raw_span in spans:
        nodes_by_key.update(get_span_nodes_from_json(raw_span, agentic_app))

    roots: list[SpanNode] = []
    for candidate in nodes_by_key.values():
        parent_span_id = candidate.span.parent_span_id
        if not parent_span_id:
            # A span without a parent id is a genuine root.
            roots.append(candidate)
            continue
        parent = nodes_by_key.get(candidate.span.trace_id + parent_span_id)
        if parent:
            parent.add_child(candidate)
        else:
            # Orphan span: its parent never arrived — treat it as a root.
            roots.append(candidate)

    return roots
|
|
86
|
+
|
|
87
|
+
@staticmethod
|
|
88
|
+
def convert_array_value(array_obj: Dict) -> List:
|
|
89
|
+
"""Convert OTEL array value to Python list"""
|
|
90
|
+
return [
|
|
91
|
+
item.get("stringValue")
|
|
92
|
+
or int(item.get("intValue", ""))
|
|
93
|
+
or float(item.get("doubleValue", ""))
|
|
94
|
+
or bool(item.get("boolValue", ""))
|
|
95
|
+
for item in array_obj.get("values", [])
|
|
96
|
+
]
|
|
97
|
+
|
|
98
|
+
@staticmethod
|
|
99
|
+
def stream_trace_data(file_path: Path) -> Generator:
|
|
100
|
+
"""Generator that yields spans one at a time."""
|
|
101
|
+
with open(file_path) as f:
|
|
102
|
+
for line in f:
|
|
103
|
+
try:
|
|
104
|
+
yield json.loads(line)
|
|
105
|
+
except json.JSONDecodeError as e:
|
|
106
|
+
print(f"Failed to parse line: {line}\nError: {e}")
|
|
107
|
+
|
|
108
|
+
@staticmethod
def __extract_usage_meta_data(attributes: dict) -> dict:
    """
    Extract meta data required to calculate usage metrics from spans
    """
    model = attributes.get("gen_ai.request.model")
    if not model:
        # No model recorded on this span: nothing usage-related to report.
        return {}

    prompt_tokens = attributes.get("gen_ai.usage.prompt_tokens", 0)
    completion_tokens = attributes.get("gen_ai.usage.completion_tokens", 0)
    return {
        "cost": {
            "model": model,
            "total_prompt_tokens": prompt_tokens,
            "total_completion_tokens": completion_tokens,
            "total_tokens": attributes.get("llm.usage.total_tokens", 0),
        },
        "input_token_count": prompt_tokens,
        "output_token_count": completion_tokens,
    }
|
|
132
|
+
|
|
133
|
+
@staticmethod
def calculate_cost(usage_data: List[dict]) -> float:
    """Calculate the total cost for the given list of usage records.

    Each record must carry ``model``, ``total_prompt_tokens`` and
    ``total_completion_tokens``. Pricing in COST_METADATA is expressed
    per one million tokens (ONE_M).

    Records whose model has no pricing entry are now skipped instead of
    zeroing the whole total: the previous ``except KeyError: return 0``
    silently discarded cost already accumulated for known models (and
    returned int 0 rather than a float).
    """
    total_cost = 0.0

    for record in usage_data:
        model = record["model"].lower()

        model_pricing = COST_METADATA.get(model)
        if model_pricing is None:
            # Pricing not available for this model; skip it rather than
            # aborting the aggregation for every other record.
            continue

        # Calculate costs (per 1M tokens)
        input_cost = (record["total_prompt_tokens"] / ONE_M) * model_pricing["input"]
        output_cost = (record["total_completion_tokens"] / ONE_M) * model_pricing["output"]
        total_cost += input_cost + output_cost

    return total_cost
|
|
157
|
+
|
|
158
|
+
@staticmethod
async def compute_metrics_from_trace_async(span_tree: SpanNode, api_client: APIClient = None, **kwargs) -> tuple[list[AgentMetricResult], list[Node], list]:
    """Compute message-level and node-level metrics for one span tree.

    Returns a tuple of (metric results, nodes, edges). Edges are always
    returned empty from this method; callers presumably populate them
    elsewhere — TODO confirm.
    """
    metric_results, edges = [], []

    # Add Interaction level metrics
    metric_results.extend(await TraceUtils.__compute_message_level_metrics(
        span_tree, api_client, **kwargs))

    # Add node level metrics result
    node_metric_results, nodes_list, experiment_run_metadata = await TraceUtils.__compute_node_level_metrics(
        span_tree, api_client, **kwargs)
    metric_results.extend(node_metric_results)

    # Attach the set of foundation models observed during the run to the
    # node that used them (set -> list for serialization).
    for node in nodes_list:
        if node.name in experiment_run_metadata:
            node.foundation_models = list(
                experiment_run_metadata[node.name]["foundation_models"])

    return metric_results, nodes_list, edges
|
|
177
|
+
|
|
178
|
+
@staticmethod
def compute_metrics_from_trace(span_tree: SpanNode, api_client: APIClient = None) -> tuple[list[AgentMetricResult], list[Node], list]:
    """Synchronous wrapper: run the async trace-metric computation on an event loop."""
    async_impl = TraceUtils.compute_metrics_from_trace_async
    return run_in_event_loop(async_impl, span_tree, api_client)
|
|
183
|
+
|
|
184
|
+
@staticmethod
async def __compute_node_level_metrics(span_tree: SpanNode, api_client: APIClient | None, **kwargs):
    """
    Traverse the span tree iteratively and compute node level metric results.

    For every node (a parent span plus all of its child spans) this method:
      - parses the node's inputs/outputs from the traceloop attributes,
      - gathers metric configurations coming from decorators and from the
        span tree's node configuration,
      - evaluates the configured metrics concurrently,
      - adds a per-node latency metric and any metric results recorded as
        span events during graph invocation,
      - accumulates usage metadata for message-level usage metrics.

    Args:
        span_tree: Root span node of the trace.
        api_client: Optional API client forwarded to metric evaluation.
        **kwargs: Extra evaluator options; ``max_concurrency`` (default 10)
            bounds concurrent metric evaluations.

    Returns:
        Tuple of (metric results, discovered nodes, experiment run metadata
        keyed by node name).

    Raises:
        Exception: if a traceloop input/output attribute cannot be parsed
            as JSON.
    """
    metric_results = []
    trace_metadata = defaultdict(list)
    experiment_run_metadata = defaultdict(lambda: defaultdict(set))
    nodes_list = []
    node_stack = list(span_tree.children)
    child_stack = list()
    node_execution_count = {}
    while node_stack or child_stack:
        # A node's parent span is processed first; its children are then
        # drained from child_stack before the next parent is taken.
        is_parent = not child_stack
        node = child_stack.pop() if child_stack else node_stack.pop()
        if is_parent:
            parent_span: Span = node.span
            node_name, metrics_config_from_decorators, code_id, events, execution_order = None, [], "", [], None
            data = {}
        span: Span = node.span
        attributes = get_attributes(span.attributes)
        if is_parent:
            node_name = attributes.get("traceloop.entity.name")
            code_id = attributes.get("gen_ai.runnable.code_id")
            execution_order = int(attributes.get("traceloop.association.properties.langgraph_step")) if attributes.get(
                "traceloop.association.properties.langgraph_step") else None
            # Merge the parsed "inputs"/"outputs" payloads into one data dict
            # used as the evaluation input for this node's metrics.
            for key in ("traceloop.entity.input", "traceloop.entity.output"):
                try:
                    attr_value = attributes.get(key)
                    content = attr_value if isinstance(
                        attr_value, dict) else json.loads(attr_value)

                    inputs_outputs = content.get(
                        "inputs" if key.endswith("input") else "outputs")
                    if isinstance(inputs_outputs, str):
                        inputs_outputs = json.loads(inputs_outputs)
                    if data:
                        data.update(inputs_outputs)
                    else:
                        data = inputs_outputs
                except (json.JSONDecodeError, AttributeError) as e:
                    raise Exception(
                        "Unable to parse json string") from e

        if attributes.get("wxgov.config.metrics"):
            metrics_config_from_decorators.append(
                json.loads(attributes.get("wxgov.config.metrics")))

        if span.events:
            events.extend(span.events)

        # Skip anonymous spans and the synthetic langgraph __start__ node.
        if (not node_name) or (node_name == "__start__"):
            continue

        if span.name in TARGETED_USAGE_TRACE_NAMES:
            # Extract required details to calculate usage metrics from each span
            for k, v in TraceUtils.__extract_usage_meta_data(attributes).items():
                trace_metadata[k].append(v)

        for k, v in TraceUtils.__get_run_metadata_from_span(attributes).items():
            experiment_run_metadata[node_name][k].add(v)

        child_stack.extend(node.children)

        # All of this node's spans have been visited: evaluate its metrics.
        if not child_stack:
            metrics_to_compute, all_metrics_config = TraceUtils.__get_metrics_to_compute(
                span_tree.get_nodes_configuration(), node_name, metrics_config_from_decorators)

            add_if_unique(Node(name=node_name, func_name=code_id.split(":")[-1] if code_id else node_name, metrics_configurations=all_metrics_config), nodes_list,
                          ["name", "func_name"])

            # FIX: previously this doubled the stored count on every repeat
            # execution (`count += count`), yielding 1, 2, 4, 8, ... instead
            # of a true execution count. Increment by one, matching
            # _finalize_node_processing.
            node_execution_count[node_name] = node_execution_count.get(
                node_name, 0) + 1

            coros = []
            for mc in metrics_to_compute:
                coros.append(_evaluate_metrics_async(
                    configuration=mc.configuration,
                    data=data,
                    metrics=mc.metrics,
                    metric_groups=mc.metric_groups,
                    api_client=api_client,
                    **kwargs))

            results = await gather_with_concurrency(coros, max_concurrency=kwargs.get("max_concurrency", 10))
            for metric_result in results:
                for mr in metric_result.to_dict():
                    node_result = {
                        "applies_to": "node",
                        "message_id": span_tree.get_message_id(),
                        "node_name": node_name,
                        "conversation_id": span_tree.get_conversation_id(),
                        "execution_count": node_execution_count.get(node_name),
                        "execution_order": execution_order,
                        **mr
                    }
                    metric_results.append(AgentMetricResult(**node_result))

            # Add node latency metric result (span duration in seconds).
            metric_results.append(AgentMetricResult(name="latency",
                                                    display_name="Latency",
                                                    value=(int(
                                                        parent_span.end_time_unix_nano) - int(parent_span.start_time_unix_nano))/1e9,
                                                    group=MetricGroup.PERFORMANCE,
                                                    applies_to="node",
                                                    message_id=span_tree.get_message_id(),
                                                    conversation_id=span_tree.get_conversation_id(),
                                                    node_name=node_name,
                                                    execution_count=node_execution_count.get(
                                                        node_name),
                                                    execution_order=execution_order))

            # Get the node level metrics computed online during graph invocation from events
            metric_results.extend(TraceUtils.__get_metrics_results_from_events(
                events=events,
                message_id=span_tree.get_message_id(),
                conversation_id=span_tree.get_conversation_id(),
                node_name=node_name,
                execution_count=node_execution_count.get(node_name),
                execution_order=execution_order))

    # Message-level usage metrics aggregated over the whole trace.
    metric_results.extend(
        TraceUtils.__compute_usage_metrics_from_trace_metadata(trace_metadata, span_tree.get_message_id(), span_tree.get_conversation_id()))

    return metric_results, nodes_list, experiment_run_metadata
|
|
313
|
+
|
|
314
|
+
@staticmethod
async def __compute_message_level_metrics(span_tree: SpanNode, api_client: APIClient | None, **kwargs) -> list[AgentMetricResult]:
    """
    Compute interaction (message) level metric results for a trace.

    Always emits a "duration" metric from the root span's wall time.
    When the span tree carries an agentic app definition, additionally
    evaluates the app's configured metrics against the message-level
    input/output data extracted via the default mapping.

    Args:
        span_tree: Root span node of the trace.
        api_client: Optional API client forwarded to metric evaluation.
        **kwargs: Extra options forwarded to the metric evaluators.

    Returns:
        List of message-level ``AgentMetricResult`` objects.
    """
    metric_results = []
    span = span_tree.span
    # Root span duration in seconds (nanosecond timestamps).
    metric_results.append(AgentMetricResult(name="duration",
                                            display_name="Duration",
                                            value=(int(
                                                span.end_time_unix_nano) - int(span.start_time_unix_nano))/1000000000,
                                            group=MetricGroup.PERFORMANCE,
                                            applies_to="message",
                                            message_id=span_tree.get_message_id(),
                                            conversation_id=span_tree.get_conversation_id()))

    # Without an agentic app configuration there is nothing more to evaluate.
    if not span_tree.agentic_app:
        return metric_results

    data = TraceUtils.__get_data_from_default_mapping(span_tree)

    metric_result = await _evaluate_metrics_async(configuration=span_tree.agentic_app.metrics_configuration.configuration,
                                                  data=data,
                                                  metrics=span_tree.agentic_app.metrics_configuration.metrics,
                                                  metric_groups=span_tree.agentic_app.metrics_configuration.metric_groups,
                                                  api_client=api_client,
                                                  **kwargs)
    metric_result = metric_result.to_dict()
    for mr in metric_result:
        # Tag every evaluated metric with the message context.
        node_result = {
            "applies_to": "message",
            "message_id": span_tree.get_message_id(),
            "conversation_id": span_tree.get_conversation_id(),
            **mr
        }

        metric_results.append(AgentMetricResult(**node_result))

    return metric_results
|
|
350
|
+
|
|
351
|
+
@staticmethod
def __get_data_from_default_mapping(span_tree: SpanNode) -> Dict[str, Any]:
    """
    Build the message-level evaluation data dict from the root span's
    default traceloop input/output attributes.

    Extracts ``input_text`` (last HUMAN message) and ``generated_text``
    (last AI message) from langchain-style ``messages`` payloads when
    present; otherwise merges the raw inputs/outputs dicts as-is. Also
    records the message status under the evaluator's status field.

    Args:
        span_tree: Root span node of the trace.

    Returns:
        Dict of evaluation fields for message-level metrics.
    """
    data = {}

    span = span_tree.span
    attrs = get_attributes(
        span.attributes, ["traceloop.entity.input", "traceloop.entity.output"])
    inputs = attrs.get("traceloop.entity.input", "{}")
    # The attribute may be a JSON string or an already-parsed dict.
    if isinstance(inputs, str):
        inputs = json.loads(inputs).get("inputs", {})
    elif isinstance(inputs, dict):
        inputs = inputs.get("inputs", {})

    if "messages" in inputs:
        # Take the most recent HUMAN message as the user input text.
        for message in reversed(inputs["messages"]):
            if "kwargs" in message and "type" in message["kwargs"] and message["kwargs"]["type"].upper() == "HUMAN":
                data["input_text"] = message["kwargs"]["content"]
                break
    else:
        data.update(inputs)

    outputs = attrs.get("traceloop.entity.output", "{}")
    if isinstance(outputs, str):
        outputs = json.loads(outputs).get("outputs", {})
    elif isinstance(outputs, dict):
        outputs = outputs.get("outputs", {})

    if "messages" in outputs:
        # The messages is a list depicting the history of messages with the agent.
        # It need NOT be the whole list of messages in the conversation though.
        # We will traverse the list from the end to find the human input of the message,
        # and the AI output.

        # If there was no input_text so far, find first human message
        if "input_text" not in data:
            for message in reversed(outputs["messages"]):
                if "kwargs" in message and "type" in message["kwargs"] and message["kwargs"]["type"].upper() == "HUMAN":
                    data["input_text"] = message["kwargs"]["content"]
                    break

        # Find last AI message
        for message in reversed(outputs["messages"]):
            if "kwargs" in message and "type" in message["kwargs"] and message["kwargs"]["type"].upper() == "AI":
                data["generated_text"] = message["kwargs"]["content"]
                break
    else:
        data.update(outputs)

    # Record message status under the evaluator's canonical status field name.
    mapping = EvaluatorFields.get_default_fields_mapping()
    data.update(
        {mapping[EvaluatorFields.STATUS_FIELD]: span_tree.get_message_status()})

    return data
|
|
404
|
+
|
|
405
|
+
@staticmethod
def __get_metrics_to_compute(nodes_config, node_name, metrics_configurations):
    """
    Collect the metric configurations that apply to a node.

    Combines configurations registered for the node in ``nodes_config``
    with configurations supplied by decorators. Decorator configurations
    marked ``compute_real_time == "false"`` are queued for computation
    here; the rest were already computed online.

    Returns:
        Tuple of (configs to compute now, all configs for the node).
    """
    to_compute = []
    all_configs = []

    node_configs = nodes_config.get(node_name)
    if node_configs:
        for cfg in node_configs:
            rebuilt = MetricsConfiguration(configuration=cfg.configuration,
                                           metrics=cfg.metrics,
                                           metric_groups=cfg.metric_groups)
            to_compute.append(rebuilt)
            all_configs.append(rebuilt)

    for cfg in metrics_configurations:
        validated = MetricsConfiguration.model_validate(
            cfg.get("metrics_configuration"))

        all_configs.append(validated)
        # Only offline ("false") decorator metrics still need computing.
        if cfg.get("compute_real_time") == "false":
            to_compute.append(validated)

    return to_compute, all_configs
|
|
427
|
+
|
|
428
|
+
@staticmethod
def __get_metrics_results_from_events(events, message_id, conversation_id, node_name, execution_count, execution_order):
    """
    Build metric results from span events recorded during graph invocation.

    Scans event attributes for the ``attr_wxgov.result.metric`` key, parses
    its JSON payload, and annotates each result with the node/message
    context before wrapping it in an ``AgentMetricResult``.
    """
    if not events:
        return []

    context = {
        "node_name": node_name,
        "message_id": message_id,
        "conversation_id": conversation_id,
        "execution_count": execution_count,
        "execution_order": execution_order
    }

    collected = []
    for event in events:
        for attr in event.attributes:
            if attr.key != "attr_wxgov.result.metric":
                continue
            payload = attr.value.string_value
            if not payload:
                continue
            record = json.loads(payload)
            record.update(context)
            collected.append(AgentMetricResult(**record))

    return collected
|
|
450
|
+
|
|
451
|
+
@staticmethod
def __compute_usage_metrics_from_trace_metadata(trace_metadata: dict, message_id: str, conversation_id: str) -> list:
    """
    Aggregate usage metrics (cost, token counts) collected from spans into
    message-level metric results.

    Only the known usage keys ("cost", "input_token_count",
    "output_token_count") are emitted; any other entry is ignored.
    """
    usage_results = []

    for metric_name, samples in trace_metadata.items():
        if metric_name == "cost":
            value = TraceUtils.calculate_cost(samples)
        elif metric_name in ("input_token_count", "output_token_count"):
            value = sum(samples)
        else:
            # Unknown usage entries carry no metric definition here.
            continue

        usage_results.append(AgentMetricResult(**{
            "name": metric_name,
            "value": value,
            "display_name": metric_name,
            "message_id": message_id,
            "applies_to": "message",
            "conversation_id": conversation_id,
            "group": MetricGroup.USAGE.value
        }))

    return usage_results
|
|
477
|
+
|
|
478
|
+
@staticmethod
def __get_run_metadata_from_span(attributes: dict) -> dict:
    """
    Extract run-specific metadata from span attributes.

    Currently captures the foundation model involved in the run (model
    name, provider, request type). Returns an empty dict when the span
    does not record a model request.
    """
    model_name = attributes.get("gen_ai.request.model")
    if not model_name:
        return {}

    provider = attributes.get(
        "traceloop.association.properties.ls_provider", attributes.get("gen_ai.system"))
    request_type = attributes.get("llm.request.type")
    return {
        "foundation_models": FoundationModelInfo(
            model_name=model_name, provider=provider, type=request_type
        )
    }
|
|
497
|
+
|
|
498
|
+
@staticmethod
async def __process_span_and_extract_data(span_tree: SpanNode,
                                          metric_mappings: List[MetricMapping],
                                          target_component_mapping: List[MappingItem],
                                          message_io_mapping: Optional[Mapping],
                                          **kwargs) -> Tuple[MessageData, Dict[str, List[NodeData]], MetricsMappingData, Dict[str, Node], Dict]:
    """
    Extract and process span tree data to generate metrics, node information, and mapping data.

    This method traverses a span tree extracting:
    - Node information and I/O data
    - Experiment run metadata
    - Metric mapping data
    - Application I/O data

    Args:
        span_tree: Root span node of the trace.
        metric_mappings: Metric definitions, optionally with span/attribute
            mappings describing where their raw data lives.
        target_component_mapping: Extra mapping items for target components.
        message_io_mapping: Optional mapping overriding where the message
            input/output is read from on the root span.
        **kwargs: Unused here; accepted for interface compatibility.

    Returns:
        Tuple of (message data, per-node I/O data, metric mapping data,
        discovered nodes, experiment run metadata).
    """
    root_span = span_tree.span
    conversation_id = str(span_tree.get_conversation_id())
    message_id = str(span_tree.get_message_id())

    app_io_start_time = TraceUtils._timestamp_to_iso(
        root_span.start_time_unix_nano)
    app_io_end_time = TraceUtils._timestamp_to_iso(
        root_span.end_time_unix_nano)

    app_io_data = TraceUtils._extract_app_io_from_attributes(
        root_span.attributes, message_io_mapping)

    # Initialize data structures
    experiment_run_metadata = defaultdict(lambda: defaultdict(set))
    nodes_list = []
    node_execution_count = {}
    nodes_data: Dict[str, List[NodeData]] = {}

    # Build quick index for span name to mapping items lookup
    span_mapping_items = defaultdict(list)
    # metrics_with_mapping[name] is True when the metric supplies its own
    # span mapping; False means default extraction paths must be used.
    metrics_with_mapping = dict()
    for metric_mapping in metric_mappings:
        metrics_with_mapping[metric_mapping.name] = False
        if metric_mapping.mapping:
            metrics_with_mapping[metric_mapping.name] = True
            for mapping_item in metric_mapping.mapping.items:
                if mapping_item.span_name and (mapping_item not in span_mapping_items[mapping_item.span_name]):
                    span_mapping_items[mapping_item.span_name].append(
                        mapping_item)

    for mapping_item in target_component_mapping:
        if mapping_item.span_name:
            span_mapping_items[mapping_item.span_name].append(
                mapping_item)

    metric_map_data = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list)))

    # Process span tree using iterative DFS
    TraceUtils._process_span_tree(
        span_tree=span_tree,
        root_span=root_span,
        conversation_id=conversation_id,
        message_id=message_id,
        app_io_data=app_io_data,
        span_mapping_items=span_mapping_items,
        experiment_run_metadata=experiment_run_metadata,
        nodes_list=nodes_list,
        node_execution_count=node_execution_count,
        nodes_data=nodes_data,
        metric_map_data=metric_map_data,
        metrics_with_mapping=metrics_with_mapping,
    )

    # Prepare message data
    messages_data = MessageData(
        message_id=message_id,
        message_timestamp=app_io_end_time,
        conversation_id=conversation_id,
        start_time=app_io_start_time,
        end_time=app_io_end_time,
        input=TraceUtils._string_to_bytes(app_io_data["input"]),
        output=TraceUtils._string_to_bytes(app_io_data["output"]),
        # Executions beyond the first, summed over all nodes.
        num_loops=sum(node_execution_count.values()) -
        len(node_execution_count)
    )

    metric_mapping_data = MetricsMappingData(
        message_id=message_id,
        metric_mappings=metric_mappings,
        data=metric_map_data
    )

    return (
        messages_data,
        nodes_data,
        metric_mapping_data,
        nodes_list,
        experiment_run_metadata,
    )
|
|
593
|
+
|
|
594
|
+
@staticmethod
|
|
595
|
+
def _timestamp_to_iso(timestamp_ns: int) -> str:
|
|
596
|
+
"""Convert nanosecond timestamp to ISO format string."""
|
|
597
|
+
return datetime.fromtimestamp(timestamp_ns / 1e9).isoformat()
|
|
598
|
+
|
|
599
|
+
@staticmethod
|
|
600
|
+
def _iso_to_timestamp(iso_str: str) -> int:
|
|
601
|
+
"""Convert ISO format string to nanosecond timestamp."""
|
|
602
|
+
dt = datetime.fromisoformat(iso_str)
|
|
603
|
+
return int(dt.timestamp() * 1e9)
|
|
604
|
+
|
|
605
|
+
@staticmethod
def _extract_app_io_from_attributes(attributes: List, message_io_mapping: Optional[Mapping]) -> Dict[str, Optional[str]]:
    """
    Extract application input and output from span attributes.

    By default the input/output is read from the traceloop entity
    attributes; a ``message_io_mapping`` may override the attribute names
    and supply JSONPath expressions to drill into the payloads.

    Args:
        attributes: Span attribute list (key / string value pairs).
        message_io_mapping: Optional mapping with "input"/"output" items.

    Returns:
        Dict with "input" and "output" keys; values are the extracted
        strings or ``None`` when not found.
    """
    app_input = None
    app_output = None
    input_key = "traceloop.entity.input"
    output_key = "traceloop.entity.output"
    input_json_path, output_json_path = None, None

    # If message_io_mapping is provided, use it to extract the input and output from the attributes
    if message_io_mapping is not None:
        for item in message_io_mapping.items:
            if item.type_ == "input":
                input_key = item.attribute_name if item.attribute_name else input_key
                input_json_path = item.json_path
            elif item.type_ == "output":
                output_key = item.attribute_name if item.attribute_name else output_key
                output_json_path = item.json_path

    for attribute in attributes:
        att_key = attribute.key
        att_val = attribute.value.string_value

        if att_key == input_key:
            if input_json_path:
                # Drill into the JSON payload via the mapped JSONPath.
                app_input = TraceUtils._extract_with_jsonpath(
                    json.loads(att_val), input_json_path)
            else:
                app_input = TraceUtils._safe_json_dumps(att_val)
        elif att_key == output_key:
            if output_json_path:
                app_output = TraceUtils._extract_with_jsonpath(
                    json.loads(att_val), output_json_path)
            else:
                app_output = TraceUtils._safe_json_dumps(att_val)

    return {"input": app_input, "output": app_output}
|
|
644
|
+
|
|
645
|
+
@staticmethod
|
|
646
|
+
def _safe_json_dumps(value: str) -> str:
|
|
647
|
+
"""
|
|
648
|
+
Safely JSON dump a string value only if it's not already JSON-formatted.
|
|
649
|
+
"""
|
|
650
|
+
if value and '\\"' not in value:
|
|
651
|
+
try:
|
|
652
|
+
return json.dumps(value)
|
|
653
|
+
except (TypeError, ValueError):
|
|
654
|
+
return value
|
|
655
|
+
return value
|
|
656
|
+
|
|
657
|
+
@staticmethod
|
|
658
|
+
def _string_to_bytes(text: Optional[str]) -> Optional[bytes]:
|
|
659
|
+
"""Convert string to bytes if not None."""
|
|
660
|
+
return bytes(text, "utf-8") if text is not None else None
|
|
661
|
+
|
|
662
|
+
@staticmethod
def _process_span_tree(span_tree: SpanNode, root_span: Span, conversation_id: str, message_id: str,
                       app_io_data: Dict, span_mapping_items: defaultdict[str, list[MappingItem]], experiment_run_metadata: defaultdict[str, defaultdict[str, set]],
                       nodes_list: List[Node], node_execution_count: Dict[str, int], nodes_data: Dict[str, List[NodeData]], metric_map_data: defaultdict,
                       metrics_with_mapping: dict) -> None:
    """
    Process the span tree using iterative depth-first search in correct order.

    Mutates the passed-in collections in place:
      - ``metric_map_data``: raw values for mapped/default metrics,
      - ``experiment_run_metadata``: per-node run metadata (e.g. models),
      - ``nodes_list`` / ``nodes_data`` / ``node_execution_count``: node
        records filled when each node's subtree is fully processed,
      - ``app_io_data``: message input backfilled from the ``__start__``
        node when not already extracted from the root span.

    Returns:
        None; all results are accumulated in the mutable arguments.
    """
    current_parent_context = TraceUtils._initialize_parent_context(
        span_tree)
    root_span_status = root_span.status.code

    # Process root span attributes for message I/O data
    TraceUtils._process_span_attributes(
        current_span=root_span,
        is_parent=True,
        parent_context=current_parent_context,
        span_mapping_items=span_mapping_items,
        metric_map_data=metric_map_data,
        experiment_run_metadata=experiment_run_metadata,
        metrics_with_mapping=metrics_with_mapping,
    )

    # Reverse the initial children to process in correct order
    node_stack: List[SpanNode] = list(reversed(span_tree.children))
    child_stack: List[SpanNode] = []
    while node_stack or child_stack:
        # A fresh parent starts whenever the child stack is drained.
        is_parent = not child_stack
        node = child_stack.pop() if child_stack else node_stack.pop()
        current_span = node.span

        if not current_span.name:
            # No data to extract from current span
            continue
        if is_parent:
            current_parent_context = TraceUtils._initialize_parent_context(
                node)

        # Process span attributes for node I/O data and metric mappings
        TraceUtils._process_span_attributes(
            current_span=current_span,
            is_parent=is_parent,
            parent_context=current_parent_context,
            span_mapping_items=span_mapping_items,
            metric_map_data=metric_map_data,
            experiment_run_metadata=experiment_run_metadata,
            metrics_with_mapping=metrics_with_mapping,
        )

        if current_parent_context.get("name") == "__start__":
            if app_io_data["input"] is None:
                # Reading the application input from `__start__` node
                app_io_data["input"] = current_parent_context["input"]
            # No data to extract from current span
            continue

        # Add children to stack for processing
        child_stack.extend(node.children)

        # All node span process completed when all children are processed
        if not child_stack:
            TraceUtils._finalize_node_processing(
                parent_context=current_parent_context,
                conversation_id=conversation_id,
                message_id=message_id,
                node_execution_count=node_execution_count,
                nodes_list=nodes_list,
                nodes_data=nodes_data,
            )

    # If status is extracted from default paths
    if metrics_with_mapping.get("status") is False:
        # Once process all child spans, finalize the message status
        metric_map_data["status"] = metric_map_data["status"] if metric_map_data["status"] else STATUS_MAP[root_span_status]
|
|
736
|
+
|
|
737
|
+
@staticmethod
def _initialize_parent_context(node: SpanNode) -> Dict:
    """
    Build a fresh mutable context dict for a parent span node.

    The context carries the span itself, a new transaction id, placeholder
    slots filled in later by attribute processing (name, I/O, execution
    order, metrics config, code id), and the span's start/end times as
    ISO strings.
    """
    span = node.span
    context = {
        "span": span,
        "txn_id": str(uuid.uuid4()),
        "execution_order": None,
        "name": None,
        "input": None,
        "output": None,
        "metrics_config": [],
        "code_id": "",
    }
    context["start_time"] = TraceUtils._timestamp_to_iso(
        span.start_time_unix_nano)
    context["end_time"] = TraceUtils._timestamp_to_iso(
        span.end_time_unix_nano)
    return context
|
|
755
|
+
|
|
756
|
+
@staticmethod
def _process_span_attributes(current_span: Span, is_parent: bool, parent_context: Dict, span_mapping_items: defaultdict[str, list[MappingItem]], metric_map_data: defaultdict,
                             experiment_run_metadata: defaultdict, metrics_with_mapping: dict
                             ) -> None:
    """
    Process attributes of the current span for I/O data and metric mappings.

    Every span is processed so foundation-model details are always
    captured. For parent spans the parent context is populated and, when
    duration/latency metrics have no explicit mapping, the span's
    start/end times are recorded as default extraction data. Spans with
    configured mapping items have their mapped values extracted, and
    usage (cost/token) metadata is aggregated for known LLM span names.
    Mutates ``parent_context``, ``metric_map_data`` and
    ``experiment_run_metadata`` in place.
    """
    has_metric_mapping = current_span.name in span_mapping_items
    attributes = get_attributes(current_span.attributes)

    if is_parent:
        TraceUtils._process_parent_attribute(
            attributes, parent_context)
        # Extract required details to calculate duration metrics from each parent span
        if any(
            metrics_with_mapping.get(metric) is False
            for metric in ("duration", "latency")
        ):
            # Process only non `__start__` span
            if "__start__" not in current_span.name:
                # Initialize span start end time
                if current_span.name not in metric_map_data:
                    metric_map_data[current_span.name]["start_time"] = []
                    metric_map_data[current_span.name]["end_time"] = []

                metric_map_data[current_span.name]["start_time"].append(TraceUtils._iso_to_timestamp(
                    parent_context["start_time"]))
                metric_map_data[current_span.name]["end_time"].append(TraceUtils._iso_to_timestamp(
                    parent_context["end_time"]))

    if has_metric_mapping:
        TraceUtils._process_metric_mapping(
            current_span.name, attributes,
            span_mapping_items[current_span.name],
            metric_map_data,
        )

    # Extract required details to calculate usage and duration metrics from each span, in case mapping is not provided in metric configuration
    if current_span.name in TARGETED_USAGE_TRACE_NAMES:
        cost_meta_data = TraceUtils.__extract_usage_meta_data(
            attributes)["cost"]
        # Aggregate total input and output token
        model_key = cost_meta_data["model"]
        inner_map = metric_map_data.get(current_span.name)
        if inner_map and model_key in inner_map:
            # Roll previously accumulated token totals into this sample.
            prev_cost_meta_data = metric_map_data[current_span.name][model_key]["model_usage_details"]
            cost_meta_data["total_prompt_tokens"] += prev_cost_meta_data.get(
                "total_prompt_tokens", 0)
            cost_meta_data["total_completion_tokens"] += prev_cost_meta_data.get(
                "total_completion_tokens", 0)

        # Cost
        if metrics_with_mapping.get("cost") is False:
            metric_map_data[current_span.name][model_key]["model_usage_details"] = cost_meta_data
        # Token count
        if metrics_with_mapping.get("input_token_count") is False:
            metric_map_data[current_span.name][model_key]["prompt_tokens_count"] = cost_meta_data["total_prompt_tokens"]
        if metrics_with_mapping.get("output_token_count") is False:
            metric_map_data[current_span.name][model_key]["completion_tokens_count"] = cost_meta_data["total_completion_tokens"]

    # Extract FM details to store it node details
    for k, v in TraceUtils.__get_run_metadata_from_span(attributes).items():
        experiment_run_metadata[parent_context.get("name")][k].add(v)

    # Extract failed status if any
    if metrics_with_mapping.get("status") is False:
        if current_span.status.code == Status.STATUS_CODE_ERROR:
            metric_map_data["status"] = MessageStatus.FAILURE
|
|
825
|
+
|
|
826
|
+
@staticmethod
def _process_parent_attribute(attributes: dict, parent_context: Dict) -> None:
    """
    Populate a parent node's context from its span attributes.

    Fills in the node name, code id, langgraph execution order (when
    present), and the JSON-encoded input/output payloads. Mutates
    ``parent_context`` in place.
    """
    parent_context["name"] = attributes.get("traceloop.entity.name")
    parent_context["code_id"] = attributes.get("gen_ai.runnable.code_id")

    langgraph_step = attributes.get(
        "traceloop.association.properties.langgraph_step")
    parent_context["execution_order"] = int(
        langgraph_step) if langgraph_step else None

    parent_context["input"] = TraceUtils._safe_json_dumps(
        attributes.get("traceloop.entity.input"))
    parent_context["output"] = TraceUtils._safe_json_dumps(
        attributes.get("traceloop.entity.output"))
|
|
839
|
+
|
|
840
|
+
@staticmethod
def _process_metric_mapping(span_name: str, attribute: dict, mapping_items: List[MappingItem], metric_map_data: defaultdict
                            ) -> None:
    """
    Extract mapped metric values from a span's attributes.

    For each mapping item, the attribute value is recursively JSON-parsed
    and optionally narrowed with a JSONPath expression; on parse failure
    the raw attribute value is kept. Target-component items overwrite
    their slot, all others accumulate in a list. Mutates
    ``metric_map_data`` in place.
    """
    for item in mapping_items:
        raw_value = attribute.get(item.attribute_name)
        try:
            parsed = TraceUtils._parse_nested_json_fields(raw_value)
            if item.json_path:
                extracted = TraceUtils._extract_with_jsonpath(
                    parsed, item.json_path)
            else:
                extracted = parsed
        except (json.JSONDecodeError, AttributeError):
            # Fallback to the raw string value if JSON parsing fails.
            extracted = raw_value

        slot = metric_map_data[span_name][item.attribute_name]
        if item.type_ == "target_component":
            slot[item.json_path] = extracted
        else:
            slot[item.json_path].append(extracted)
|
|
864
|
+
|
|
865
|
+
@staticmethod
|
|
866
|
+
def _parse_nested_json_fields(content) -> Dict:
|
|
867
|
+
"""
|
|
868
|
+
Recursively parse a value that might be a JSON string.
|
|
869
|
+
"""
|
|
870
|
+
if isinstance(content, str):
|
|
871
|
+
try:
|
|
872
|
+
# Try to parse as JSON
|
|
873
|
+
parsed = json.loads(content)
|
|
874
|
+
# Recursively parse the result in case it contains more JSON strings
|
|
875
|
+
return TraceUtils._parse_nested_json_fields(parsed)
|
|
876
|
+
except (json.JSONDecodeError, ValueError):
|
|
877
|
+
# Not a JSON string, return as-is
|
|
878
|
+
return content
|
|
879
|
+
elif isinstance(content, dict):
|
|
880
|
+
# Recursively parse all values in the dictionary
|
|
881
|
+
return {k: TraceUtils._parse_nested_json_fields(v) for k, v in content.items()}
|
|
882
|
+
elif isinstance(content, list):
|
|
883
|
+
# Recursively parse all items in the list
|
|
884
|
+
return [TraceUtils._parse_nested_json_fields(item) for item in content]
|
|
885
|
+
else:
|
|
886
|
+
# Return other types as-is (int, float, bool, None, etc.)
|
|
887
|
+
return content
|
|
888
|
+
|
|
889
|
+
@staticmethod
def _extract_with_jsonpath(content: Dict, json_path: str) -> Any:
    """
    Evaluate a JSONPath expression against *content*.

    Returns the single match, a list of matches when there are several,
    or ``None`` when nothing matches or evaluation fails.
    """
    try:
        expression = parse_jsonpath(json_path)
        found = [match.value for match in expression.find(content)]
    except Exception:
        return None

    if not found:
        return None
    return found[0] if len(found) == 1 else found
|
|
903
|
+
|
|
@staticmethod
def _finalize_node_processing(parent_context: Dict, conversation_id: str, message_id: str, node_execution_count: Dict[str, int],
                              nodes_list: List[Node], nodes_data: Dict[str, List[NodeData]]) -> None:
    """
    Finalize processing for a completed node.

    Bumps the node's execution counter, registers the node (deduplicated
    on name/func_name) in ``nodes_list``, and appends the node's I/O
    record for this execution to ``nodes_data``.
    """
    node_name = parent_context["name"]

    # Bump the per-node execution counter
    node_execution_count[node_name] = node_execution_count.get(node_name, 0) + 1

    # Derive the function name from the code id ("<path>:<func>"),
    # falling back to the node name when no code id was captured.
    code_id = parent_context["code_id"]
    func_name = code_id.split(":")[-1] if code_id else node_name

    # Register the node once per unique (name, func_name) pair
    add_if_unique(
        Node(name=node_name, func_name=func_name),
        nodes_list,
        ["name", "func_name"]
    )

    # Append this execution's I/O record for the node
    node_records = nodes_data.setdefault(node_name, [])
    node_records.append(NodeData(
        message_id=message_id,
        message_timestamp=parent_context["end_time"],
        conversation_id=conversation_id,
        node_name=node_name,
        start_time=parent_context["start_time"],
        end_time=parent_context["end_time"],
        input=TraceUtils._string_to_bytes(parent_context["input"]),
        output=TraceUtils._string_to_bytes(parent_context["output"]),
        execution_order=parent_context["execution_order"],
        execution_count=node_execution_count[node_name],
        node_txn_id=parent_context["txn_id"],
        node_txn_timestamp=parent_context["end_time"]
    ))
946
|
+
|
|
@staticmethod
async def __compute_metrics_from_maps(metrics_configuration: MetricsConfiguration,
                                      mapping_data: Dict,
                                      api_client: APIClient,
                                      message_id: str,
                                      conversation_id: str,
                                      message_timestamp: str,
                                      nodes_data: Dict[str, List[NodeData]],
                                      **kwargs) -> List[AgentMetricResult]:
    """
    Process all configured metrics by:
    1. Extracting required data from mapping data
    2. Computing metrics asynchronously

    Args:
        metrics_configuration: Metrics to compute and their mappings.
        mapping_data: Per-span extracted attribute values, keyed by
            span name -> attribute name -> json path.
        api_client: Client used by the metric evaluators.
        message_id: Id attached to every result row.
        conversation_id: Conversation id attached to every result row.
        message_timestamp: Timestamp attached to every result row.
        nodes_data: Per-node execution records (one entry per execution).
        **kwargs: Passed through to the evaluators; ``max_concurrency``
            (default 10) caps concurrent evaluations.

    Returns:
        One AgentMetricResult per computed metric value.
    """
    metric_results = []
    coros = []
    # Bookkeeping per scheduled coroutine, keyed 1-based to align with
    # enumerate(results, start=1) below. Plain dict values suffice; the
    # original factory-less inner defaultdict added nothing.
    execution_map = defaultdict(dict)
    metric_count = 0
    msg_data = mapping_to_df(mapping_data)
    for metric in metrics_configuration.metrics:
        # Resolve the metric's target component: either a value looked up
        # from the extracted mapping data, or a literal value.
        target_component = None
        if metric.target_component:
            if metric.target_component.type == "mapping":
                target_component = mapping_data[metric.target_component.value.span_name][
                    metric.target_component.value.attribute_name][metric.target_component.value.json_path]
            else:
                target_component = metric.target_component.value
        configuration = AgenticAIConfiguration(
            **build_configuration_from_metric_mappings(metric, target_component))
        if metric.applies_to == "message":
            coros.append(_evaluate_metrics_async(
                configuration=configuration,
                data=msg_data,
                metrics=[metric],
                api_client=api_client,
                **kwargs))
            metric_count += 1
            execution_map[metric_count]["applies_to"] = metric.applies_to
        else:  # Node level
            node_data_list = nodes_data.get(target_component)
            if node_data_list is None:
                # Skip this metric if the target component doesn't exist in nodes_data
                continue
            # Schedule one evaluation per recorded execution of the node.
            for i in range(len(node_data_list)):
                coros.append(_evaluate_metrics_async(
                    configuration=configuration,
                    # Extract data specific to execution order <i>
                    data=mapping_to_df(mapping_data, i),
                    metrics=[metric],
                    api_client=api_client,
                    **kwargs))
                metric_count += 1
                execution_map[metric_count]["target_component"] = target_component
                execution_map[metric_count]["applies_to"] = metric.applies_to
                execution_map[metric_count]["execution_count"] = node_data_list[i].execution_count
                execution_map[metric_count]["execution_order"] = node_data_list[i].execution_order

    results = await gather_with_concurrency(coros, max_concurrency=kwargs.get("max_concurrency", 10))
    for i, result in enumerate(results, start=1):
        for mr in result.to_dict():
            # Use a distinct name (`record`) — the original rebound
            # `result` here while still iterating result.to_dict(),
            # which worked only because the iterator was already built.
            record = {
                "applies_to": execution_map[i].get("applies_to"),
                "message_id": message_id,
                "conversation_id": conversation_id,
                "message_timestamp": message_timestamp,
                **mr
            }
            if execution_map[i].get("target_component"):
                record.update({
                    "node_name": execution_map[i].get("target_component"),
                    "execution_count": execution_map[i].get("execution_count"),
                    "execution_order": execution_map[i].get("execution_order"),
                })
            metric_results.append(AgentMetricResult(**record))

    return metric_results
1023
|
+
|
|
@staticmethod
async def compute_metrics_from_trace_async_v2(span_tree: SpanNode,
                                              metrics_configuration: MetricsConfiguration,
                                              message_io_mapping: Mapping | None = None,
                                              api_client: APIClient | None = None,
                                              **kwargs
                                              ) -> Tuple[List[AgentMetricResult], MessageData, List[NodeData], MetricsMappingData, List[Node]]:
    """
    Process span tree data to compute comprehensive metrics and extract execution artifacts.

    End-to-end pipeline:
    1. Extract and process raw data from the span traces
    2. Compute metrics from the extracted trace data
    3. Calculate additional metrics based on mapping configurations

    Returns a tuple of (metric results, message data, flattened node data
    records, metric mapping data, unique nodes).
    """
    # Assuming both the message and node level mappings are available in
    # `agentic_app.metrics_configuration`: collect per-metric mappings and
    # any mapping-typed target components up front.
    metric_mappings = []
    target_component_mapping = []
    for metric in metrics_configuration.metrics:
        metric_mappings.append(MetricMapping(
            name=metric.name,
            method=metric.method,
            applies_to=metric.applies_to,
            mapping=metric.mapping))
        if metric.target_component and metric.target_component.type == "mapping":
            target_component_mapping.append(metric.target_component.value)

    # Extract and process core data components from span tree
    (message_data,
     nodes_data,
     metric_mapping_data,
     nodes,
     experiment_run_metadata) = await TraceUtils.__process_span_and_extract_data(span_tree,
                                                                                metric_mappings,
                                                                                target_component_mapping,
                                                                                message_io_mapping,
                                                                                **kwargs)

    # Compute metrics using mapping configurations
    metric_results = await TraceUtils.__compute_metrics_from_maps(
        metrics_configuration=metrics_configuration,
        mapping_data=metric_mapping_data.data,
        api_client=api_client,
        message_id=message_data.message_id,
        conversation_id=message_data.conversation_id,
        message_timestamp=message_data.message_timestamp,
        nodes_data=nodes_data,
        **kwargs)

    # Add foundation model details to node
    for node in nodes:
        if node.name in experiment_run_metadata:
            node.foundation_models = list(
                experiment_run_metadata[node.name]["foundation_models"])

    # Flatten the per-node record lists into a single list for the caller.
    flattened_node_data = [record
                           for records in nodes_data.values()
                           for record in records]
    return metric_results, message_data, flattened_node_data, metric_mapping_data, nodes