agentevals-cli 0.7.0__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/PKG-INFO +1 -1
  2. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/README.md +4 -1
  3. agentevals_cli-0.7.1/examples/zero-code-examples/pydantic-ai/requirements.txt +5 -0
  4. agentevals_cli-0.7.1/examples/zero-code-examples/pydantic-ai/run.py +105 -0
  5. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/pyproject.toml +1 -1
  6. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/app.py +1 -2
  7. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/models.py +10 -0
  8. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/routes.py +158 -2
  9. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/config.py +35 -18
  10. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/otlp.py +55 -13
  11. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/runner.py +59 -28
  12. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_live_agents.py +60 -0
  13. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_api.py +221 -0
  14. agentevals_cli-0.7.1/tests/test_otlp_loader.py +454 -0
  15. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/uv.lock +1 -1
  16. agentevals_cli-0.7.0/tests/test_otlp_loader.py +0 -210
  17. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/eval/SKILL.md +0 -0
  18. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/eval/evals/evals.json +0 -0
  19. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/inspect/SKILL.md +0 -0
  20. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/inspect/evals/evals.json +0 -0
  21. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.dockerignore +0 -0
  22. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  23. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  24. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  25. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/workflows/ci.yml +0 -0
  26. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  27. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/workflows/release.yml +0 -0
  28. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.gitignore +0 -0
  29. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.mcp.json +0 -0
  30. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/CONTRIBUTING.md +0 -0
  31. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/DEVELOPMENT.md +0 -0
  32. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/Dockerfile +0 -0
  33. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/LICENSE +0 -0
  34. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/Makefile +0 -0
  35. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/README.md +0 -0
  36. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/Chart.yaml +0 -0
  37. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/NOTES.txt +0 -0
  38. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/_helpers.tpl +0 -0
  39. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/deployment.yaml +0 -0
  40. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/service.yaml +0 -0
  41. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  42. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/values.yaml +0 -0
  43. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/assets/logo-color-on-transparent.svg +0 -0
  44. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/assets/logo-color.png +0 -0
  45. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/assets/logo-dark-on-transparent.svg +0 -0
  46. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/custom-evaluators.md +0 -0
  47. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/eval-set-format.md +0 -0
  48. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/otel-compatibility.md +0 -0
  49. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/streaming.md +0 -0
  50. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/custom_evaluators/eval_config.yaml +0 -0
  51. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/custom_evaluators/response_quality.py +0 -0
  52. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
  53. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/README.md +0 -0
  54. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/agent.py +0 -0
  55. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/eval_set.json +0 -0
  56. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/main.py +0 -0
  57. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/test_streaming.py +0 -0
  58. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/kubernetes/README.md +0 -0
  59. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/README.md +0 -0
  60. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/agent.py +0 -0
  61. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/eval_set.json +0 -0
  62. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/main.py +0 -0
  63. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/requirements.txt +0 -0
  64. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/test_streaming.py +0 -0
  65. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/async_example.py +0 -0
  66. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/context_manager_example.py +0 -0
  67. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/decorator_example.py +0 -0
  68. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/requirements.txt +0 -0
  69. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/agent.py +0 -0
  70. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/eval_set.json +0 -0
  71. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/main.py +0 -0
  72. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/requirements.txt +0 -0
  73. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
  74. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/adk/run.py +0 -0
  75. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  76. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/langchain/run.py +0 -0
  77. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  78. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/ollama/run.py +0 -0
  79. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  80. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/openai-agents/run.py +0 -0
  81. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
  82. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/strands/run.py +0 -0
  83. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/flake.lock +0 -0
  84. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/flake.nix +0 -0
  85. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/README.md +0 -0
  86. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  87. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  88. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  89. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  90. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/eval_set_helm.json +0 -0
  91. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
  92. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/evalset_k8s_2026-02-20.json +0 -0
  93. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/helm.json +0 -0
  94. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/helm_2.json +0 -0
  95. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/helm_3.json +0 -0
  96. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/k8s.json +0 -0
  97. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/__init__.py +0 -0
  98. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_protocol.py +0 -0
  99. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/assets/index-7YPfPT4N.js +0 -0
  100. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  101. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/index.html +0 -0
  102. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/logo.svg +0 -0
  103. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/vite.svg +0 -0
  104. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/__init__.py +0 -0
  105. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/debug_routes.py +0 -0
  106. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/dependencies.py +0 -0
  107. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_app.py +0 -0
  108. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_grpc.py +0 -0
  109. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_processing.py +0 -0
  110. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_routes.py +0 -0
  111. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/streaming_routes.py +0 -0
  112. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/builtin_metrics.py +0 -0
  113. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/cli.py +0 -0
  114. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/converter.py +0 -0
  115. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/custom_evaluators.py +0 -0
  116. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/eval_config_loader.py +0 -0
  117. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/__init__.py +0 -0
  118. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/resolver.py +0 -0
  119. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/sources.py +0 -0
  120. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/templates.py +0 -0
  121. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/venv.py +0 -0
  122. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/extraction.py +0 -0
  123. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/genai_converter.py +0 -0
  124. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/__init__.py +0 -0
  125. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/base.py +0 -0
  126. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/jaeger.py +0 -0
  127. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/mcp_server.py +0 -0
  128. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/openai_eval_backend.py +0 -0
  129. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/output.py +0 -0
  130. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/sdk.py +0 -0
  131. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/__init__.py +0 -0
  132. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/incremental_processor.py +0 -0
  133. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/processor.py +0 -0
  134. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/session.py +0 -0
  135. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/ws_server.py +0 -0
  136. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/trace_attrs.py +0 -0
  137. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/trace_metrics.py +0 -0
  138. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/__init__.py +0 -0
  139. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/genai_messages.py +0 -0
  140. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/log_buffer.py +0 -0
  141. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/log_enrichment.py +0 -0
  142. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/__init__.py +0 -0
  143. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/conftest.py +0 -0
  144. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_evaluation_pipeline.py +0 -0
  145. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  146. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_session_grouping.py +0 -0
  147. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_timing_stress.py +0 -0
  148. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_cli.py +0 -0
  149. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_converter.py +0 -0
  150. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_extraction.py +0 -0
  151. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_genai_converter.py +0 -0
  152. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_jaeger_loader.py +0 -0
  153. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_log_enrichment.py +0 -0
  154. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_mcp_server.py +0 -0
  155. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_otlp_receiver.py +0 -0
  156. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_output.py +0 -0
  157. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_protocol.py +0 -0
  158. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_runner.py +0 -0
  159. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_sdk.py +0 -0
  160. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_trace_metrics.py +0 -0
  161. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/.gitignore +0 -0
  162. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/README.md +0 -0
  163. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/eslint.config.js +0 -0
  164. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/index.html +0 -0
  165. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/package-lock.json +0 -0
  166. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/package.json +0 -0
  167. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/public/logo.svg +0 -0
  168. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/public/vite.svg +0 -0
  169. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/App.css +0 -0
  170. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/App.tsx +0 -0
  171. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/api/client.ts +0 -0
  172. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/assets/react.svg +0 -0
  173. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  174. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  175. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  176. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  177. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  178. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/BuilderView.tsx +0 -0
  179. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  180. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  181. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  182. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
  183. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  184. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  185. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/index.ts +0 -0
  186. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  187. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  188. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  189. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  190. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  191. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  192. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  193. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  194. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/DataSection.tsx +0 -0
  195. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  196. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  197. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorView.tsx +0 -0
  198. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  199. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  200. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  201. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  202. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  203. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  204. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  205. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  206. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  207. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  208. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  209. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
  210. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  211. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  212. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
  213. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/MetricSelector.tsx +0 -0
  214. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  215. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  216. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/UploadView.tsx +0 -0
  217. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  218. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/config.ts +0 -0
  219. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/context/TraceContext.tsx +0 -0
  220. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/context/TraceProvider.tsx +0 -0
  221. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/index.css +0 -0
  222. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/console-capture.ts +0 -0
  223. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/evalset-builder.ts +0 -0
  224. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/network-capture.ts +0 -0
  225. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-helpers.ts +0 -0
  226. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-loader.ts +0 -0
  227. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-metadata.ts +0 -0
  228. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-patcher.ts +0 -0
  229. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/types.ts +0 -0
  230. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/utils.ts +0 -0
  231. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/main.tsx +0 -0
  232. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/tsconfig.app.json +0 -0
  233. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/tsconfig.json +0 -0
  234. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/tsconfig.node.json +0 -0
  235. {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/vite.config.ts +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.7.0
3
+ Version: 0.7.1
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -29,6 +29,7 @@ agentevals accepts OTLP/HTTP on port 4318 (`http/protobuf` and `http/json`) and
29
29
  | [zero-code-examples/ollama/](./zero-code-examples/ollama/) | LangChain | Ollama |
30
30
  | [zero-code-examples/strands/](./zero-code-examples/strands/) | Strands | OpenAI |
31
31
  | [zero-code-examples/adk/](./zero-code-examples/adk/) | Google ADK | Gemini |
32
+ | [zero-code-examples/pydantic-ai/](./zero-code-examples/pydantic-ai/) | Pydantic AI | OpenAI |
32
33
 
33
34
  This approach works with any framework that has OTel instrumentation: LangChain, Strands, Google ADK, etc. If your framework already emits OTel spans, you only need to add `OTLPSpanExporter` (and `OTLPLogExporter` if it uses GenAI log-based content delivery).
34
35
 
@@ -103,6 +104,7 @@ Detection checks for `gen_ai.request.model` / `gen_ai.input.messages` (GenAI sem
103
104
  | [zero-code-examples/ollama/](./zero-code-examples/ollama/) | LangChain | Ollama | GenAI semconv (logs) | Standard OTLP export |
104
105
  | [zero-code-examples/strands/](./zero-code-examples/strands/) | Strands | OpenAI | GenAI semconv (events*) | Standard OTLP export |
105
106
  | [zero-code-examples/adk/](./zero-code-examples/adk/) | Google ADK | Gemini | ADK built-in | Standard OTLP export |
107
+ | [zero-code-examples/pydantic-ai/](./zero-code-examples/pydantic-ai/) | Pydantic AI | OpenAI | GenAI semconv (span attrs) | Standard OTLP export |
106
108
  | [langchain_agent](./langchain_agent/) | LangChain | OpenAI | GenAI semconv (logs) | SDK WebSocket |
107
109
  | [strands_agent](./strands_agent/) | Strands | OpenAI | GenAI semconv (events*) | SDK WebSocket |
108
110
  | [dice_agent](./dice_agent/) | Google ADK | Gemini | ADK built-in | SDK WebSocket |
@@ -217,6 +219,7 @@ python examples/zero-code-examples/langchain/run.py
217
219
  python examples/zero-code-examples/ollama/run.py
218
220
  python examples/zero-code-examples/strands/run.py
219
221
  python examples/zero-code-examples/adk/run.py
222
+ python examples/zero-code-examples/pydantic-ai/run.py
220
223
 
221
224
  # SDK examples:
222
225
  python examples/sdk_example/context_manager_example.py
@@ -232,7 +235,7 @@ python examples/strands_agent/main.py
232
235
  Traces stream to the dev server in real-time. Evaluation runs automatically when the session completes.
233
236
 
234
237
  See each example's README for prerequisites and detailed instructions:
235
- - [zero-code-examples/](./zero-code-examples/) (LangChain + Strands, standard OTLP)
238
+ - [zero-code-examples/](./zero-code-examples/) (LangChain, Strands, ADK, OpenAI Agents, Pydantic AI — standard OTLP)
236
239
  - [dice_agent/README.md](./dice_agent/README.md) (Google ADK + Gemini)
237
240
  - [langchain_agent/README.md](./langchain_agent/README.md) (LangChain + OpenAI, SDK)
238
241
  - [strands_agent/](./strands_agent/) (Strands + OpenAI, SDK)
@@ -0,0 +1,5 @@
1
+ pydantic-ai>=1.81.0
2
+
3
+ opentelemetry-sdk>=1.36.0
4
+ opentelemetry-exporter-otlp-proto-http>=1.36.0
5
+ python-dotenv>=1.0.0
@@ -0,0 +1,105 @@
1
+ """Run a dice-rolling Pydantic AI agent with OTLP export — no agentevals SDK.
2
+
3
+ Demonstrates zero-code integration: any OTel-instrumented agent streams
4
+ traces to agentevals by pointing the OTLP exporter at the receiver.
5
+
6
+ Pydantic AI has built-in OTel support via Agent.instrument_all(). By default
7
+ it uses version 2 of the GenAI semconv format, storing message content in span
8
+ attributes — only a TracerProvider is needed.
9
+ No separate instrumentation library is needed.
10
+
11
+ Prerequisites:
12
+ 1. pip install -r requirements.txt
13
+ 2. agentevals serve --dev
14
+ 3. export OPENAI_API_KEY="your-key-here"
15
+
16
+ Usage:
17
+ python examples/zero-code-examples/pydantic-ai/run.py
18
+ """
19
+
20
+ import os
21
+ import random
22
+
23
+ from dotenv import load_dotenv
24
+ from opentelemetry import trace
25
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
26
+ from opentelemetry.sdk.resources import Resource
27
+ from opentelemetry.sdk.trace import TracerProvider
28
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
29
+ from pydantic_ai import Agent
30
+
31
+ load_dotenv(override=True)
32
+
33
+
34
+ def roll_die(sides: int) -> int:
35
+ """Roll a die with the given number of sides and return the result."""
36
+ return random.randint(1, sides)
37
+
38
+
39
+ def check_prime(number: int) -> bool:
40
+ """Return True if the number is prime, False otherwise."""
41
+ if number < 2:
42
+ return False
43
+ for i in range(2, int(number**0.5) + 1):
44
+ if number % i == 0:
45
+ return False
46
+ return True
47
+
48
+
49
+ def main():
50
+ if not os.getenv("OPENAI_API_KEY"):
51
+ print("OPENAI_API_KEY not set.")
52
+ return
53
+
54
+ endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
55
+ print(f"OTLP endpoint: {endpoint}")
56
+
57
+ os.environ.setdefault(
58
+ "OTEL_RESOURCE_ATTRIBUTES",
59
+ "agentevals.eval_set_id=pydantic_ai_eval,agentevals.session_name=pydantic-ai-zero-code",
60
+ )
61
+
62
+ resource = Resource.create()
63
+
64
+ tracer_provider = TracerProvider(resource=resource)
65
+ tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(), schedule_delay_millis=1000))
66
+ trace.set_tracer_provider(tracer_provider)
67
+
68
+ # Enable Pydantic AI's built-in OTel instrumentation. This one call
69
+ # wires up all agents globally — no framework-specific instrumentor
70
+ # library (like opentelemetry-instrumentation-openai-v2) is needed.
71
+ Agent.instrument_all()
72
+
73
+ agent = Agent(
74
+ "openai:gpt-4o-mini",
75
+ instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.",
76
+ )
77
+ agent.tool_plain(roll_die)
78
+ agent.tool_plain(check_prime)
79
+
80
+ test_queries = [
81
+ "Hi! Can you help me?",
82
+ "Roll a 20-sided die for me",
83
+ "Is the number you rolled prime?",
84
+ ]
85
+
86
+ message_history = []
87
+
88
+ try:
89
+ for i, query in enumerate(test_queries, 1):
90
+ print(f"\n[{i}/{len(test_queries)}] User: {query}")
91
+
92
+ result = agent.run_sync(query, message_history=message_history)
93
+
94
+ print(f" Agent: {result.output}")
95
+
96
+ # Pass the full message history forward for multi-turn conversation.
97
+ message_history = result.all_messages()
98
+ finally:
99
+ print()
100
+ tracer_provider.force_flush()
101
+ print("All traces flushed to OTLP receiver.")
102
+
103
+
104
+ if __name__ == "__main__":
105
+ main()
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.7.0"
7
+ version = "0.7.1"
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -10,8 +10,7 @@ from contextlib import asynccontextmanager
10
10
  from pathlib import Path
11
11
  from typing import TYPE_CHECKING
12
12
 
13
- from fastapi import FastAPI, Request
14
- from fastapi import WebSocket
13
+ from fastapi import FastAPI, Request, WebSocket
15
14
  from fastapi.middleware.cors import CORSMiddleware
16
15
  from fastapi.responses import StreamingResponse
17
16
 
@@ -11,6 +11,8 @@ from typing import Any, Generic, TypeVar
11
11
  from pydantic import BaseModel, ConfigDict, Field
12
12
  from pydantic.alias_generators import to_camel
13
13
 
14
+ from ..config import EvalParams
15
+
14
16
  T = TypeVar("T")
15
17
 
16
18
 
@@ -134,6 +136,14 @@ class ConvertTracesData(CamelModel):
134
136
  traces: list[TraceConversionEntry]
135
137
 
136
138
 
139
+ class EvaluateJsonRequest(CamelModel):
140
+ """Request body for JSON-based trace evaluation (``POST /evaluate/json``)."""
141
+
142
+ traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
143
+ config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
144
+ eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
145
+
146
+
137
147
  # ---------------------------------------------------------------------------
138
148
  # SSE evaluation event models
139
149
  # ---------------------------------------------------------------------------
@@ -11,7 +11,7 @@ import shutil
11
11
  import tempfile
12
12
  from typing import Any
13
13
 
14
- from fastapi import APIRouter, File, Form, HTTPException, UploadFile
14
+ from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
15
15
  from fastapi.responses import StreamingResponse
16
16
  from pydantic.alias_generators import to_camel
17
17
 
@@ -27,13 +27,22 @@ from ..config import (
27
27
  )
28
28
  from ..converter import convert_traces
29
29
  from ..extraction import get_extractor
30
- from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
30
+ from ..loader.otlp import OtlpJsonLoader
31
+ from ..runner import (
32
+ RunResult,
33
+ get_loader,
34
+ load_eval_set,
35
+ load_eval_set_from_dict,
36
+ run_evaluation,
37
+ run_evaluation_from_traces,
38
+ )
31
39
  from ..trace_metrics import extract_performance_metrics, extract_trace_metadata
32
40
  from .models import (
33
41
  ApiKeyStatus,
34
42
  ConfigData,
35
43
  ConvertTracesData,
36
44
  EvalSetValidation,
45
+ EvaluateJsonRequest,
37
46
  HealthData,
38
47
  MetricInfo,
39
48
  SSEDoneEvent,
@@ -61,6 +70,8 @@ def _camel_keys(obj: Any) -> Any:
61
70
 
62
71
  router = APIRouter()
63
72
 
73
+ _MAX_JSON_BODY_BYTES = 50 * 1024 * 1024 # 50 MB (multipart endpoints allow 10 MB per file)
74
+
64
75
  _TYPE_TO_MODEL = {
65
76
  "builtin": BuiltinMetricDef,
66
77
  "code": CodeEvaluatorDef,
@@ -729,3 +740,148 @@ async def evaluate_traces_stream(
729
740
  "Connection": "keep-alive",
730
741
  },
731
742
  )
743
+
744
+
745
def _parse_json_request(request: EvaluateJsonRequest):
    """Parse traces and the optional eval set from an ``EvaluateJsonRequest``.

    Args:
        request: Request body carrying the OTLP JSON export in ``traces`` and,
            optionally, an eval set dict in ``eval_set``.

    Returns:
        A ``(traces, eval_set)`` tuple. ``eval_set`` is ``None`` when the
        request carries no eval set (``None`` or an empty dict).

    Raises:
        HTTPException: 400 when the OTLP payload is rejected by the loader,
            is structurally malformed, yields no traces, or when the eval
            set cannot be loaded.
    """
    try:
        traces = OtlpJsonLoader().load_from_dict(request.traces)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except (AttributeError, KeyError, TypeError) as exc:
        # The payload is caller-supplied: a wrongly shaped export (for example
        # a non-list ``resourceSpans``) is a client error, not a server fault.
        raise HTTPException(status_code=400, detail=f"Malformed OTLP JSON: {exc!r}") from exc

    if not traces:
        raise HTTPException(status_code=400, detail="No traces found in OTLP JSON")

    eval_set = None
    if request.eval_set:
        try:
            eval_set = load_eval_set_from_dict(request.eval_set)
        except Exception as exc:
            raise HTTPException(status_code=400, detail=f"Invalid eval set: {exc}") from exc

    return traces, eval_set
766
+
767
+
768
def _check_json_body_size(raw_request: Request):
    """Reject requests whose declared ``Content-Length`` exceeds the JSON limit.

    Only the declared header is inspected; a request without a
    ``Content-Length`` header passes this check.

    NOTE(review): the endpoints that call this also declare the body model as
    a parameter, so the body has presumably been read and parsed before this
    check runs -- confirm whether an earlier (middleware-level) limit is needed.

    Args:
        raw_request: Incoming request whose headers are examined.

    Raises:
        HTTPException: 400 when ``Content-Length`` is not a non-negative
            integer; 413 when it exceeds ``_MAX_JSON_BODY_BYTES``.
    """
    declared = raw_request.headers.get("content-length")
    if declared is None:
        return

    try:
        content_length = int(declared)
    except ValueError:
        # A garbage header must not surface as an unhandled 500.
        raise HTTPException(status_code=400, detail="Invalid Content-Length header") from None

    if content_length < 0:
        raise HTTPException(status_code=400, detail="Invalid Content-Length header")

    if content_length > _MAX_JSON_BODY_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"Request body exceeds {_MAX_JSON_BODY_BYTES // (1024 * 1024)}MB limit",
        )
775
+
776
+
777
def _sse_error(message: str) -> str:
    """Format *message* as one SSE ``data:`` frame wrapping an ``SSEErrorEvent``."""
    serialized = SSEErrorEvent(error=message).model_dump_json(by_alias=True)
    return f"data: {serialized}\n\n"
779
+
780
+
781
@router.post("/evaluate/json", response_model=StandardResponse[RunResult])
async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Request):
    """Evaluate OTLP JSON traces supplied directly in the request body.

    The declared body size is checked and the payload parsed before the
    evaluation starts; both steps raise ``HTTPException`` themselves. Any
    failure during the evaluation or while building the response is logged
    and reported as a 500.
    """
    _check_json_body_size(raw_request)
    parsed_traces, parsed_eval_set = _parse_json_request(request)

    try:
        run_result = await run_evaluation_from_traces(
            traces=parsed_traces,
            config=request.config,
            eval_set=parsed_eval_set,
        )
        response_data = _camel_keys(run_result.model_dump(by_alias=True))
        return StandardResponse(data=response_data)
    except Exception as exc:
        logger.exception("JSON evaluation failed")
        raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc
797
+
798
+
799
@router.post("/evaluate/json/stream")
async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request: Request):
    """Evaluate OTLP JSON traces with real-time progress via SSE.

    The stream emits, in order: one ``performance_metrics`` event per trace
    whose metrics could be extracted, then progress / per-trace progress
    events while the evaluation runs, and finally either a done event with
    the full result or an error event.

    Raises:
        HTTPException: from the declared body-size check, before streaming
            starts. Parsing and evaluation errors are reported in-band as an
            SSE error event instead.
    """
    _check_json_body_size(raw_request)

    async def event_generator():
        try:
            try:
                traces, eval_set = _parse_json_request(request)
            except HTTPException as exc:
                # Parsing happens inside the generator, so invalid input is
                # reported as an SSE error event rather than an HTTP status.
                yield _sse_error(str(exc.detail))
                return

            # Early per-trace metrics are best-effort: a failure for one trace
            # is logged and does not abort the stream.
            for trace in traces:
                try:
                    extractor = get_extractor(trace)
                    perf_metrics = _camel_keys(extract_performance_metrics(trace, extractor))
                    trace_metadata = _camel_keys(extract_trace_metadata(trace, extractor))
                    evt = SSEPerformanceMetricsEvent(
                        trace_id=trace.trace_id,
                        performance_metrics=perf_metrics,
                        trace_metadata=trace_metadata,
                    )
                    yield f"event: performance_metrics\ndata: {evt.model_dump_json(by_alias=True)}\n\n"
                except Exception as e:
                    logger.error("Failed to extract early performance metrics: %s", e)

            # The evaluation runs as a background task and reports back through
            # this queue as (tag, payload) pairs.
            queue: asyncio.Queue = asyncio.Queue()

            async def progress_callback(message: str):
                await queue.put(("progress", message))

            async def trace_progress_callback(trace_result):
                await queue.put(("trace_progress", trace_result))

            async def run_with_progress():
                try:
                    result = await run_evaluation_from_traces(
                        traces=traces,
                        config=request.config,
                        eval_set=eval_set,
                        progress_callback=progress_callback,
                        trace_progress_callback=trace_progress_callback,
                    )
                except Exception as exc:
                    # Without this, a failed evaluation would never enqueue a
                    # terminal message and the consumer loop below would block
                    # on queue.get() forever.
                    await queue.put(("error", exc))
                else:
                    await queue.put(("done", result))

            eval_task = asyncio.create_task(run_with_progress())

            try:
                while True:
                    tag, payload = await queue.get()

                    if tag == "done":
                        evt = SSEDoneEvent(
                            result=_camel_keys(payload.model_dump(by_alias=True)),
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                        break
                    elif tag == "error":
                        # Re-raise so the outer handler logs it and emits the
                        # SSE error event.
                        raise payload
                    elif tag == "trace_progress":
                        evt = SSETraceProgressEvent(
                            trace_progress=SSETraceProgress(
                                trace_id=payload.trace_id,
                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
                            )
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                    elif tag == "progress":
                        evt = SSEProgressEvent(message=payload)
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
            finally:
                # If the consumer stops early, do not leave the evaluation
                # running in the background.
                if not eval_task.done():
                    eval_task.cancel()
                    try:
                        await eval_task
                    except asyncio.CancelledError:
                        pass

        except Exception as exc:
            logger.exception("JSON evaluation stream failed")
            yield _sse_error(str(exc))

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
@@ -5,7 +5,8 @@ from __future__ import annotations
5
5
  from pathlib import Path
6
6
  from typing import Annotated, Any, Literal
7
7
 
8
- from pydantic import BaseModel, Field, field_validator
8
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
9
+ from pydantic.alias_generators import to_camel
9
10
 
10
11
 
11
12
  class BuiltinMetricDef(BaseModel):
@@ -99,13 +100,14 @@ CustomEvaluatorDef = Annotated[
99
100
  ]
100
101
 
101
102
 
102
- class EvalRunConfig(BaseModel):
103
- trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")
103
+ class EvalParams(BaseModel):
104
+ """Evaluation parameters independent of how traces are provided.
104
105
 
105
- eval_set_file: str | None = Field(
106
- default=None,
107
- description="Path to a golden eval set JSON file (ADK EvalSet format).",
108
- )
106
+ Used by ``run_evaluation_from_traces`` for programmatic / API-driven
107
+ evaluation. ``EvalRunConfig`` inherits from this and adds file I/O fields.
108
+ """
109
+
110
+ model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
109
111
 
110
112
  metrics: list[str] = Field(
111
113
  default_factory=lambda: ["tool_trajectory_avg_score"],
@@ -117,11 +119,6 @@ class EvalRunConfig(BaseModel):
117
119
  description="Custom evaluator definitions.",
118
120
  )
119
121
 
120
- trace_format: str = Field(
121
- default="jaeger-json",
122
- description="Format of the trace files (jaeger-json or otlp-json).",
123
- )
124
-
125
122
  judge_model: str | None = Field(
126
123
  default=None,
127
124
  description="LLM model for judge-based metrics.",
@@ -129,7 +126,9 @@ class EvalRunConfig(BaseModel):
129
126
 
130
127
  threshold: float | None = Field(
131
128
  default=None,
132
- description="Score threshold for pass/fail.",
129
+ ge=0,
130
+ le=1,
131
+ description="Score threshold for pass/fail (0.0 to 1.0).",
133
132
  )
134
133
 
135
134
  trajectory_match_type: str | None = Field(
@@ -145,17 +144,35 @@ class EvalRunConfig(BaseModel):
145
144
  raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
146
145
  return v.upper() if v is not None else v
147
146
 
148
- output_format: str = Field(
149
- default="table",
150
- description="Output format: 'table', 'json', or 'summary'.",
151
- )
152
-
153
147
  max_concurrent_traces: int = Field(
154
148
  default=10,
149
+ ge=1,
155
150
  description="Maximum number of traces to evaluate concurrently.",
156
151
  )
157
152
 
158
153
  max_concurrent_evals: int = Field(
159
154
  default=5,
155
+ ge=1,
160
156
  description="Maximum number of concurrent metric evaluations (LLM API calls).",
161
157
  )
158
+
159
+
160
class EvalRunConfig(EvalParams):
    """Configuration for evaluation runs whose traces are read from files.

    Adds the file inputs and the output format to the parameters inherited
    from ``EvalParams``.
    """

    # Required: the trace files to evaluate.
    trace_files: list[str] = Field(
        description="Paths to trace files (Jaeger JSON or OTLP JSON).",
    )

    # Optional golden eval set; defaults to None.
    eval_set_file: str | None = Field(
        description="Path to a golden eval set JSON file (ADK EvalSet format).",
        default=None,
    )

    # Format of the files listed in ``trace_files``.
    trace_format: str = Field(
        description="Format of the trace files (jaeger-json or otlp-json).",
        default="jaeger-json",
    )

    # Output format selector.
    output_format: str = Field(
        description="Output format: 'table', 'json', or 'summary'.",
        default="table",
    )
@@ -56,6 +56,12 @@ class OtlpJsonLoader(TraceLoader):
56
56
  logger.info("Loaded %d trace(s) from %s", len(traces), source)
57
57
  return traces
58
58
 
59
def load_from_dict(self, data: dict) -> list[Trace]:
    """Build traces from an already-parsed OTLP JSON export.

    Raises:
        ValueError: if *data* has no top-level ``resourceSpans`` key.
    """
    if "resourceSpans" in data:
        return self._parse_otlp_export(data)
    raise ValueError("Expected OTLP JSON with 'resourceSpans' key")
64
+
59
65
  def _parse_otlp_export(self, data: dict) -> list[Trace]:
60
66
  """Parse full OTLP export structure with resourceSpans."""
61
67
  all_spans = []
@@ -122,23 +128,40 @@ class OtlpJsonLoader(TraceLoader):
122
128
  Some SDKs (e.g. Strands) store message content in span events rather
123
129
  than span attributes. This promotes those values so the converter can
124
130
  find them via normal attribute lookups.
131
+
132
+ Accepts events in OTLP array format or flat/nested dict format.
125
133
  """
126
134
  for event in span_data.get("events", []):
127
- for attr in event.get("attributes", []):
128
- key = attr.get("key", "")
129
- if key in self._GENAI_EVENT_KEYS and key not in attributes:
130
- value_obj = attr.get("value", {})
131
- if "stringValue" in value_obj:
132
- attributes[key] = value_obj["stringValue"]
133
-
134
- def _extract_attributes(self, attrs_list: list[dict]) -> dict:
135
- """Convert OTLP attributes array to flat dict.
136
-
137
- OTLP attributes are [{key, value: {stringValue|intValue|...}}]
138
- We flatten to {key: value} for easier use.
135
+ event_attrs = event.get("attributes", [])
136
+ if isinstance(event_attrs, dict):
137
+ flat = self._flatten_nested_dict(event_attrs)
138
+ for key in self._GENAI_EVENT_KEYS:
139
+ if key in flat and key not in attributes:
140
+ attributes[key] = flat[key]
141
+ else:
142
+ for attr in event_attrs:
143
+ key = attr.get("key", "")
144
+ if key in self._GENAI_EVENT_KEYS and key not in attributes:
145
+ value_obj = attr.get("value", {})
146
+ if "stringValue" in value_obj:
147
+ attributes[key] = value_obj["stringValue"]
148
+
149
+ def _extract_attributes(self, attrs) -> dict:
150
+ """Convert attributes to a flat ``{key: value}`` dict.
151
+
152
+ Accepts three formats:
153
+ 1. OTLP array: ``[{key, value: {stringValue|intValue|...}}]``
154
+ 2. Flat dict: ``{"gen_ai.operation.name": "chat"}``
155
+ 3. Nested dict (ClickHouse JSON column): ``{"gen_ai": {"operation": {"name": "chat"}}}``
156
+
157
+ Formats 2 and 3 are auto-detected by checking whether *attrs* is a dict.
158
+ Nested dicts are recursively flattened to dot-notation keys.
139
159
  """
160
+ if isinstance(attrs, dict):
161
+ return self._flatten_nested_dict(attrs)
162
+
140
163
  result = {}
141
- for attr in attrs_list:
164
+ for attr in attrs:
142
165
  key = attr.get("key", "")
143
166
  value_obj = attr.get("value", {})
144
167
 
@@ -157,6 +180,25 @@ class OtlpJsonLoader(TraceLoader):
157
180
 
158
181
  return result
159
182
 
183
+ @staticmethod
184
+ def _flatten_nested_dict(d: dict, prefix: str = "") -> dict:
185
+ """Recursively flatten a nested dict to dot-notation keys.
186
+
187
+ ``{"gen_ai": {"operation": {"name": "chat"}}}``
188
+ becomes ``{"gen_ai.operation.name": "chat"}``.
189
+
190
+ Already-flat keys (e.g. ``{"service.name": "agent"}``) pass through
191
+ unchanged.
192
+ """
193
+ result = {}
194
+ for key, value in d.items():
195
+ full_key = f"{prefix}{key}" if not prefix else f"{prefix}.{key}"
196
+ if isinstance(value, dict):
197
+ result.update(OtlpJsonLoader._flatten_nested_dict(value, full_key))
198
+ else:
199
+ result[full_key] = value
200
+ return result
201
+
160
202
  def _build_traces(self, all_spans: list[Span]) -> list[Trace]:
161
203
  """Group spans by trace_id and build parent-child relationships."""
162
204
  traces_by_id: dict[str, list[Span]] = {}