agentevals-cli 0.9.0__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/PKG-INFO +1 -1
  2. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/custom-evaluators.md +20 -0
  3. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/eval_config.yaml +1 -0
  4. agentevals_cli-0.9.1/examples/custom_evaluators/eval_config_openai_eval.yaml +18 -0
  5. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/pyproject.toml +1 -1
  6. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/config.py +15 -7
  7. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/converter.py +19 -15
  8. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/extraction.py +38 -8
  9. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/openai_eval_backend.py +40 -19
  10. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_converter.py +131 -0
  11. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_extraction.py +50 -0
  12. agentevals_cli-0.9.1/tests/test_openai_eval_backend.py +116 -0
  13. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/uv.lock +1 -1
  14. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/eval/SKILL.md +0 -0
  15. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/eval/evals/evals.json +0 -0
  16. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/inspect/SKILL.md +0 -0
  17. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/inspect/evals/evals.json +0 -0
  18. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.dockerignore +0 -0
  19. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  20. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  21. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  22. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/workflows/ci.yml +0 -0
  23. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  24. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/workflows/release.yml +0 -0
  25. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.gitignore +0 -0
  26. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.mcp.json +0 -0
  27. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/CONTRIBUTING.md +0 -0
  28. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/DEVELOPMENT.md +0 -0
  29. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/Dockerfile +0 -0
  30. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/LICENSE +0 -0
  31. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/Makefile +0 -0
  32. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/README.md +0 -0
  33. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/Chart.yaml +0 -0
  34. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/NOTES.txt +0 -0
  35. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/_helpers.tpl +0 -0
  36. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/deployment.yaml +0 -0
  37. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  38. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/postgresql.yaml +0 -0
  39. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/service.yaml +0 -0
  40. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  41. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/values.yaml +0 -0
  42. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/assets/logo-color-on-transparent.svg +0 -0
  43. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/assets/logo-color.png +0 -0
  44. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/assets/logo-dark-on-transparent.svg +0 -0
  45. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/eval-set-format.md +0 -0
  46. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/otel-compatibility.md +0 -0
  47. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/streaming.md +0 -0
  48. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/README.md +0 -0
  49. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/response_quality.py +0 -0
  50. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
  51. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/README.md +0 -0
  52. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  53. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  54. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/pyproject.toml +0 -0
  55. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/README.md +0 -0
  56. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/agent.py +0 -0
  57. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/eval_set.json +0 -0
  58. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/main.py +0 -0
  59. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/test_streaming.py +0 -0
  60. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/kubernetes/README.md +0 -0
  61. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/README.md +0 -0
  62. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/agent.py +0 -0
  63. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/eval_set.json +0 -0
  64. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/main.py +0 -0
  65. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/requirements.txt +0 -0
  66. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/test_streaming.py +0 -0
  67. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/async_example.py +0 -0
  68. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/context_manager_example.py +0 -0
  69. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/decorator_example.py +0 -0
  70. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/requirements.txt +0 -0
  71. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/agent.py +0 -0
  72. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/eval_set.json +0 -0
  73. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/main.py +0 -0
  74. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/requirements.txt +0 -0
  75. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
  76. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/adk/run.py +0 -0
  77. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  78. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/langchain/run.py +0 -0
  79. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  80. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/ollama/run.py +0 -0
  81. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  82. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/openai-agents/run.py +0 -0
  83. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  84. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  85. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
  86. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/strands/run.py +0 -0
  87. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/flake.lock +0 -0
  88. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/flake.nix +0 -0
  89. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/README.md +0 -0
  90. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  91. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  92. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  93. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  94. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/eval_set_helm.json +0 -0
  95. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
  96. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/evalset_k8s_2026-02-20.json +0 -0
  97. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/helm.json +0 -0
  98. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/helm_2.json +0 -0
  99. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/helm_3.json +0 -0
  100. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/k8s.json +0 -0
  101. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/tempo_export_with_batches.json +0 -0
  102. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/__init__.py +0 -0
  103. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_protocol.py +0 -0
  104. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  105. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/assets/index-f8LUVQc3.js +0 -0
  106. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/index.html +0 -0
  107. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/logo.svg +0 -0
  108. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/vite.svg +0 -0
  109. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/__init__.py +0 -0
  110. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/app.py +0 -0
  111. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/debug_routes.py +0 -0
  112. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/dependencies.py +0 -0
  113. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/models.py +0 -0
  114. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_app.py +0 -0
  115. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_grpc.py +0 -0
  116. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_processing.py +0 -0
  117. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_routes.py +0 -0
  118. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/routes.py +0 -0
  119. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/runs_routes.py +0 -0
  120. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/streaming_routes.py +0 -0
  121. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/builtin_metrics.py +0 -0
  122. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/cli.py +0 -0
  123. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/custom_evaluators.py +0 -0
  124. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/eval_config_loader.py +0 -0
  125. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/__init__.py +0 -0
  126. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/resolver.py +0 -0
  127. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/sources.py +0 -0
  128. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/templates.py +0 -0
  129. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/venv.py +0 -0
  130. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/genai_converter.py +0 -0
  131. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/__init__.py +0 -0
  132. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/auto.py +0 -0
  133. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/base.py +0 -0
  134. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/jaeger.py +0 -0
  135. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/otlp.py +0 -0
  136. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/mcp_server.py +0 -0
  137. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/output.py +0 -0
  138. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/__init__.py +0 -0
  139. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/fetcher.py +0 -0
  140. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/result_builder.py +0 -0
  141. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/service.py +0 -0
  142. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/sinks.py +0 -0
  143. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/worker.py +0 -0
  144. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/runner.py +0 -0
  145. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/sdk.py +0 -0
  146. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/__init__.py +0 -0
  147. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/config.py +0 -0
  148. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/models.py +0 -0
  149. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/__init__.py +0 -0
  150. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  151. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  152. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrator.py +0 -0
  153. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/pool.py +0 -0
  154. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/__init__.py +0 -0
  155. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/memory.py +0 -0
  156. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/postgres.py +0 -0
  157. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/__init__.py +0 -0
  158. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/incremental_processor.py +0 -0
  159. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/processor.py +0 -0
  160. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/session.py +0 -0
  161. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/ws_server.py +0 -0
  162. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/trace_attrs.py +0 -0
  163. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/trace_metrics.py +0 -0
  164. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/__init__.py +0 -0
  165. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/genai_messages.py +0 -0
  166. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/log_buffer.py +0 -0
  167. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/log_enrichment.py +0 -0
  168. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/api/__init__.py +0 -0
  169. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/api/test_evaluate_persistence.py +0 -0
  170. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/api/test_runs_routes.py +0 -0
  171. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/__init__.py +0 -0
  172. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/conftest.py +0 -0
  173. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_evaluation_pipeline.py +0 -0
  174. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_live_agents.py +0 -0
  175. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  176. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_session_grouping.py +0 -0
  177. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_timing_stress.py +0 -0
  178. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/__init__.py +0 -0
  179. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_fetcher.py +0 -0
  180. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_result_builder.py +0 -0
  181. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_service.py +0 -0
  182. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_sinks.py +0 -0
  183. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/__init__.py +0 -0
  184. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_config.py +0 -0
  185. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_memory_repos.py +0 -0
  186. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_migrator.py +0 -0
  187. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_models.py +0 -0
  188. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_api.py +0 -0
  189. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_cli.py +0 -0
  190. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_eval_config_loader.py +0 -0
  191. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_genai_converter.py +0 -0
  192. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_jaeger_loader.py +0 -0
  193. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_loader_auto.py +0 -0
  194. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_log_enrichment.py +0 -0
  195. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_mcp_server.py +0 -0
  196. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_otlp_loader.py +0 -0
  197. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_otlp_receiver.py +0 -0
  198. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_output.py +0 -0
  199. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_protocol.py +0 -0
  200. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_runner.py +0 -0
  201. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_sdk.py +0 -0
  202. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_trace_metrics.py +0 -0
  203. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/.gitignore +0 -0
  204. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/README.md +0 -0
  205. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/eslint.config.js +0 -0
  206. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/index.html +0 -0
  207. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/package-lock.json +0 -0
  208. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/package.json +0 -0
  209. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/public/logo.svg +0 -0
  210. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/public/vite.svg +0 -0
  211. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/App.css +0 -0
  212. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/App.tsx +0 -0
  213. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/api/client.ts +0 -0
  214. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/assets/react.svg +0 -0
  215. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  216. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  217. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  218. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  219. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  220. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/BuilderView.tsx +0 -0
  221. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  222. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  223. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  224. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
  225. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  226. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  227. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/index.ts +0 -0
  228. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  229. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  230. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  231. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  232. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  233. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  234. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  235. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  236. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/DataSection.tsx +0 -0
  237. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  238. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  239. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorView.tsx +0 -0
  240. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  241. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  242. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  243. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  244. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  245. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  246. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  247. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  248. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  249. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  250. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  251. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
  252. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  253. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  254. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
  255. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/MetricSelector.tsx +0 -0
  256. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  257. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  258. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/UploadView.tsx +0 -0
  259. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  260. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/config.ts +0 -0
  261. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/context/TraceContext.tsx +0 -0
  262. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/context/TraceProvider.tsx +0 -0
  263. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/index.css +0 -0
  264. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/console-capture.ts +0 -0
  265. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/eval-config.ts +0 -0
  266. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/evalset-builder.ts +0 -0
  267. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/network-capture.ts +0 -0
  268. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-helpers.ts +0 -0
  269. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-loader.ts +0 -0
  270. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-metadata.ts +0 -0
  271. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-patcher.ts +0 -0
  272. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/types.ts +0 -0
  273. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/utils.ts +0 -0
  274. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/main.tsx +0 -0
  275. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/tsconfig.app.json +0 -0
  276. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/tsconfig.json +0 -0
  277. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/tsconfig.node.json +0 -0
  278. {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/vite.config.ts +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.9.0
3
+ Version: 0.9.1
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -317,6 +317,26 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317
317
  | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318
318
  | `rouge_l` | Longest common subsequence overlap (F-measure) |
319
319
 
320
+ ### Label Model Grader
321
+
322
+ Scores responses without a golden set. The model reads each response and assigns a label from a fixed list. Passing labels are defined in the config.
323
+
324
+ ```yaml
325
+ evaluators:
326
+ - name: quality_check
327
+ type: openai_eval
328
+ grader:
329
+ type: label_model
330
+ model: gpt-4o-mini
331
+ input:
332
+ - role: user
333
+ content: "Rate this response: {{ item.actual_response }}"
334
+ labels: [good, bad]
335
+ passing_labels: [good]
336
+ ```
337
+
338
+ The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`.
339
+
320
340
  ### How it works
321
341
 
322
342
  Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
@@ -32,3 +32,4 @@ evaluators:
32
32
  ref: evaluators/random_evaluator/random_evaluator.py
33
33
  threshold: 0.110
34
34
  executor: local
35
+
@@ -0,0 +1,18 @@
1
+ # Eval config using OpenAI Evals API graders.
2
+ # Requires OPENAI_API_KEY to be set.
3
+ #
4
+ # Run with:
5
+ # agentevals run samples/helm.json \
6
+ # --config examples/custom_evaluators/eval_config_openai_eval.yaml
7
+
8
+ evaluators:
9
+ - name: quality_check
10
+ type: openai_eval
11
+ grader:
12
+ type: label_model
13
+ model: gpt-4o-mini
14
+ input:
15
+ - role: user
16
+ content: "Rate this response: {{ item.actual_response }}"
17
+ labels: [good, bad]
18
+ passing_labels: [good]
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.9.0"
7
+ version = "0.9.1"
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -100,13 +100,21 @@ class OpenAIEvalDef(BaseModel):
100
100
  @classmethod
101
101
  def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
102
102
  grader_type = v.get("type")
103
- if grader_type != "text_similarity":
104
- raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
105
- metric = v.get("evaluation_metric")
106
- if not metric:
107
- raise ValueError("'evaluation_metric' is required for text_similarity grader")
108
- if metric not in _VALID_SIMILARITY_METRICS:
109
- raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
103
+ if grader_type == "text_similarity":
104
+ metric = v.get("evaluation_metric")
105
+ if not metric:
106
+ raise ValueError("'evaluation_metric' is required for text_similarity grader")
107
+ if metric not in _VALID_SIMILARITY_METRICS:
108
+ raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
109
+ elif grader_type == "label_model":
110
+ for field in ("model", "input", "labels", "passing_labels"):
111
+ if not v.get(field):
112
+ raise ValueError(f"'{field}' is required for label_model grader")
113
+ invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
114
+ if invalid:
115
+ raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
116
+ else:
117
+ raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
110
118
  return v
111
119
 
112
120
 
@@ -23,6 +23,7 @@ from .extraction import (
23
23
  extract_tool_call_from_span,
24
24
  extract_tool_result_from_span,
25
25
  extract_user_text_from_attrs,
26
+ find_adk_llm_spans_in,
26
27
  get_extractor,
27
28
  has_adk_descendant,
28
29
  is_adk_scope,
@@ -127,15 +128,18 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
127
128
 
128
129
 
129
130
  def _convert_invoke_span(invoke_span: Span) -> Invocation:
130
- call_llm_spans = _find_children_by_op(invoke_span, "call_llm")
131
- if not call_llm_spans:
132
- raise ValueError(f"invoke_agent span {invoke_span.span_id} has no child call_llm spans")
131
+ llm_spans = find_adk_llm_spans_in(invoke_span)
132
+ if not llm_spans:
133
+ raise ValueError(
134
+ f"invoke_agent span {invoke_span.span_id} has no converter-compatible ADK LLM descendants; "
135
+ "expected call_llm or ADK generate_content spans"
136
+ )
133
137
 
134
138
  tool_spans = _find_children_by_op(invoke_span, "execute_tool")
135
139
 
136
- user_content = _extract_user_content(call_llm_spans[0])
137
- final_response = _extract_final_response(call_llm_spans[-1])
138
- tool_uses, tool_responses = _extract_tool_trajectory(call_llm_spans, tool_spans)
140
+ user_content = _extract_user_content(llm_spans[0])
141
+ final_response = _extract_final_response(llm_spans[-1])
142
+ tool_uses, tool_responses = _extract_tool_trajectory(llm_spans, tool_spans)
139
143
 
140
144
  intermediate_data = IntermediateData(
141
145
  tool_uses=tool_uses,
@@ -177,7 +181,7 @@ def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
177
181
  )
178
182
  llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
179
183
  llm_request = parse_json(llm_request_raw)
180
- for content_dict in llm_request.get("contents", []):
184
+ for content_dict in llm_request.get("contents", llm_request.get("Contents", [])):
181
185
  if content_dict.get("role") == "user":
182
186
  return _content_from_dict(content_dict)
183
187
  raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")
@@ -193,7 +197,7 @@ def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
193
197
  )
194
198
  llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
195
199
  llm_response = parse_json(llm_response_raw)
196
- content_dict = llm_response.get("content", {})
200
+ content_dict = llm_response.get("content", llm_response.get("Content", {}))
197
201
  if not content_dict:
198
202
  raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")
199
203
  logger.warning(
@@ -263,12 +267,12 @@ def _extract_function_calls_from_llm_response(
263
267
  llm_response_raw = call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
264
268
  llm_response = parse_json(llm_response_raw)
265
269
 
266
- content_dict = llm_response.get("content", {})
270
+ content_dict = llm_response.get("content", llm_response.get("Content", {}))
267
271
  parts = content_dict.get("parts", [])
268
272
 
269
273
  calls = []
270
274
  for part in parts:
271
- fc_dict = part.get("function_call")
275
+ fc_dict = part.get("function_call", part.get("functionCall"))
272
276
  if fc_dict:
273
277
  calls.append(
274
278
  genai_types.FunctionCall(
@@ -288,9 +292,9 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
288
292
  parts: list[genai_types.Part] = []
289
293
  for p in parts_dicts:
290
294
  if "text" in p:
291
- parts.append(genai_types.Part(text=p["text"]))
292
- elif "function_call" in p:
293
- fc = p["function_call"]
295
+ parts.append(genai_types.Part(text=p.get("text")))
296
+ elif "function_call" in p or "functionCall" in p:
297
+ fc = p.get("function_call", p.get("functionCall"))
294
298
  parts.append(
295
299
  genai_types.Part(
296
300
  function_call=genai_types.FunctionCall(
@@ -300,8 +304,8 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
300
304
  )
301
305
  )
302
306
  )
303
- elif "function_response" in p:
304
- fr = p["function_response"]
307
+ elif "function_response" in p or "functionResponse" in p:
308
+ fr = p.get("function_response", p.get("functionResponse"))
305
309
  parts.append(
306
310
  genai_types.Part(
307
311
  function_response=genai_types.FunctionResponse(
@@ -69,14 +69,15 @@ def extract_user_text_from_attrs(attrs: dict[str, Any]) -> str | None:
69
69
  if llm_request_raw:
70
70
  llm_request = parse_json(llm_request_raw)
71
71
  if isinstance(llm_request, dict):
72
- for content_dict in reversed(llm_request.get("contents", [])):
72
+ contents = llm_request.get("contents", llm_request.get("Contents", []))
73
+ for content_dict in reversed(contents):
73
74
  if content_dict.get("role") != "user":
74
75
  continue
75
76
  parts = content_dict.get("parts", [])
76
77
  text_parts = [p for p in parts if "text" in p]
77
78
  if text_parts:
78
79
  return " ".join(p["text"] for p in text_parts)
79
- for content_dict in llm_request.get("contents", []):
80
+ for content_dict in contents:
80
81
  if content_dict.get("role") == "user":
81
82
  parts = content_dict.get("parts", [])
82
83
  if parts:
@@ -101,7 +102,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
101
102
  if llm_response_raw:
102
103
  llm_response = parse_json(llm_response_raw)
103
104
  if isinstance(llm_response, dict):
104
- content_dict = llm_response.get("content", {})
105
+ content_dict = llm_response.get("content", llm_response.get("Content", {}))
105
106
  if content_dict:
106
107
  parts_dicts = content_dict.get("parts", [])
107
108
  text_parts = [p for p in parts_dicts if "text" in p]
@@ -392,6 +393,38 @@ def is_adk_scope(span: Span) -> bool:
392
393
  return False
393
394
 
394
395
 
396
+ def is_adk_generate_content_llm_span(span: Span) -> bool:
397
+ if not (span.operation_name.startswith("generate_content") or span.get_tag(OTEL_GENAI_OP) == "generate_content"):
398
+ return False
399
+ return bool(span.get_tag(ADK_LLM_REQUEST) or span.get_tag(ADK_LLM_RESPONSE))
400
+
401
+
402
+ def is_adk_llm_span(span: Span) -> bool:
403
+ return span.operation_name.startswith("call_llm") or is_adk_generate_content_llm_span(span)
404
+
405
+
406
+ def find_adk_llm_spans_in(root: Span) -> list[Span]:
407
+ call_llm_spans: list[Span] = []
408
+ generate_content_spans: list[Span] = []
409
+
410
+ def collect(span: Span) -> None:
411
+ if span.operation_name.startswith("call_llm"):
412
+ call_llm_spans.append(span)
413
+ elif is_adk_generate_content_llm_span(span):
414
+ generate_content_spans.append(span)
415
+
416
+ _walk_descendants(root, collect)
417
+ call_llm_spans.sort(key=lambda s: s.start_time)
418
+ generate_content_spans.sort(key=lambda s: s.start_time)
419
+ return call_llm_spans or generate_content_spans
420
+
421
+
422
+ def _walk_descendants(span: Span, visit) -> None:
423
+ for child in span.children:
424
+ visit(child)
425
+ _walk_descendants(child, visit)
426
+
427
+
395
428
  def is_llm_span(span: Span) -> bool:
396
429
  return span.get_tag(OTEL_GENAI_REQUEST_MODEL) is not None
397
430
 
@@ -477,10 +510,7 @@ class AdkExtractor:
477
510
  return matches
478
511
 
479
512
  def find_llm_spans_in(self, root: Span) -> list[Span]:
480
- results: list[Span] = []
481
- self._walk(root, lambda s: s.operation_name.startswith("call_llm"), results)
482
- results.sort(key=lambda s: s.start_time)
483
- return results
513
+ return find_adk_llm_spans_in(root)
484
514
 
485
515
  def find_tool_spans_in(self, root: Span) -> list[Span]:
486
516
  results: list[Span] = []
@@ -493,7 +523,7 @@ class AdkExtractor:
493
523
  return None
494
524
  if span.operation_name.startswith("invoke_agent"):
495
525
  return "invocation"
496
- if span.operation_name.startswith("call_llm"):
526
+ if is_adk_llm_span(span):
497
527
  return "llm"
498
528
  if span.operation_name.startswith("execute_tool"):
499
529
  return "tool"
@@ -31,6 +31,12 @@ _TEXT_PAIR_SCHEMA = {
31
31
  "required": ["actual_response", "expected_response"],
32
32
  }
33
33
 
34
+ _ACTUAL_ONLY_SCHEMA = {
35
+ "type": "object",
36
+ "properties": {"actual_response": {"type": "string"}},
37
+ "required": ["actual_response"],
38
+ }
39
+
34
40
 
35
41
  def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
36
42
  """Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,33 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
51
57
  "pass_threshold": evaluator_def.threshold,
52
58
  }
53
59
 
60
+ if grader_type == "label_model":
61
+ return {
62
+ "type": "label_model",
63
+ "name": evaluator_def.name,
64
+ "model": grader["model"],
65
+ "input": grader["input"],
66
+ "labels": grader["labels"],
67
+ "passing_labels": grader["passing_labels"],
68
+ }
69
+
54
70
  raise ValueError(f"Unsupported grader type: {grader_type}")
55
71
 
56
72
 
57
73
  def _build_jsonl_items(
58
74
  actual_invocations: list[Invocation],
59
75
  expected_invocations: list[Invocation],
76
+ include_expected: bool = True,
60
77
  ) -> list[dict[str, Any]]:
61
78
  items = []
62
79
  for i, actual_inv in enumerate(actual_invocations):
63
- actual_text = _content_to_text(actual_inv.final_response)
64
- if i < len(expected_invocations):
65
- expected_text = _content_to_text(expected_invocations[i].final_response)
66
- else:
67
- expected_text = ""
68
- items.append(
69
- {
70
- "item": {
71
- "actual_response": actual_text,
72
- "expected_response": expected_text,
73
- }
74
- }
75
- )
80
+ entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
81
+ if include_expected:
82
+ expected_text = (
83
+ _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
84
+ )
85
+ entry["expected_response"] = expected_text
86
+ items.append({"item": entry})
76
87
  return items
77
88
 
78
89
 
@@ -111,13 +122,17 @@ async def evaluate_openai_eval(
111
122
  error="OPENAI_API_KEY environment variable is not set.",
112
123
  )
113
124
 
114
- if expected_invocations is None:
125
+ grader_type = evaluator_def.grader["type"]
126
+
127
+ if grader_type == "text_similarity" and expected_invocations is None:
115
128
  return MetricResult(
116
129
  metric_name=evaluator_def.name,
117
130
  error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
118
131
  )
119
132
 
120
- items = _build_jsonl_items(actual_invocations, expected_invocations)
133
+ items = _build_jsonl_items(
134
+ actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model")
135
+ )
121
136
  if not items:
122
137
  return MetricResult(
123
138
  metric_name=evaluator_def.name,
@@ -130,12 +145,13 @@ async def evaluate_openai_eval(
130
145
  try:
131
146
  client = await asyncio.to_thread(_get_openai_client)
132
147
 
148
+ item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
133
149
  eval_obj = await asyncio.to_thread(
134
150
  client.evals.create,
135
- name=f"agentevals-{evaluator_def.name}",
151
+ name=f"agentevals-openai-{evaluator_def.name}",
136
152
  data_source_config={
137
153
  "type": "custom",
138
- "item_schema": _TEXT_PAIR_SCHEMA,
154
+ "item_schema": item_schema,
139
155
  "include_sample_schema": False,
140
156
  },
141
157
  testing_criteria=[testing_criteria],
@@ -146,7 +162,7 @@ async def evaluate_openai_eval(
146
162
  run = await asyncio.to_thread(
147
163
  client.evals.runs.create,
148
164
  eval_id=eval_id,
149
- name=f"agentevals-run-{evaluator_def.name}",
165
+ name=f"agentevals-openai-run-{evaluator_def.name}",
150
166
  data_source={
151
167
  "type": "jsonl",
152
168
  "source": {
@@ -225,12 +241,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
225
241
  total = result_counts.total if result_counts else 0
226
242
  eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
227
243
 
244
+ grader = evaluator_def.grader
228
245
  details: dict[str, Any] = {
229
246
  "openai_eval_id": eval_id,
230
247
  "openai_run_id": run_id,
231
- "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
232
248
  "result_counts": {"passed": passed, "failed": failed, "total": total},
233
249
  }
250
+ if grader["type"] == "text_similarity":
251
+ details["evaluation_metric"] = grader.get("evaluation_metric")
252
+ elif grader["type"] == "label_model":
253
+ details["model"] = grader.get("model")
254
+ details["passing_labels"] = grader.get("passing_labels")
234
255
  per_criteria = getattr(run, "per_testing_criteria_results", None)
235
256
  if per_criteria:
236
257
  details["per_testing_criteria"] = [
@@ -186,6 +186,108 @@ class TestConverter:
186
186
  assert len(results) == 2
187
187
  assert all(r.trace_id == "t1" for r in results)
188
188
 
189
+ def test_convert_adk_generate_content_llm_spans(self):
190
+ invoke = Span(
191
+ trace_id="t-gc",
192
+ span_id="invoke1",
193
+ parent_span_id=None,
194
+ operation_name="invoke_agent query_agent",
195
+ start_time=1000,
196
+ duration=10000,
197
+ tags={"gen_ai.operation.name": "invoke_agent"},
198
+ )
199
+ llm_1 = Span(
200
+ trace_id="t-gc",
201
+ span_id="llm1",
202
+ parent_span_id="invoke1",
203
+ operation_name="generate_content mockllm-deterministic",
204
+ start_time=2000,
205
+ duration=1000,
206
+ tags={
207
+ "gen_ai.operation.name": "generate_content",
208
+ "gcp.vertex.agent.llm_request": json.dumps(
209
+ {"Contents": [{"role": "user", "parts": [{"text": "inspect pods"}]}]}
210
+ ),
211
+ "gcp.vertex.agent.llm_response": json.dumps(
212
+ {"Content": {"role": "model", "parts": [{"text": "Calling tools."}]}}
213
+ ),
214
+ },
215
+ )
216
+ tool_1 = Span(
217
+ trace_id="t-gc",
218
+ span_id="tool1",
219
+ parent_span_id="invoke1",
220
+ operation_name="execute_tool list_pods",
221
+ start_time=3000,
222
+ duration=500,
223
+ tags={
224
+ "gen_ai.tool.name": "list_pods",
225
+ "gen_ai.tool.call.id": "call_1",
226
+ "gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
227
+ "gcp.vertex.agent.tool_response": json.dumps({"pods": []}),
228
+ },
229
+ )
230
+ llm_2 = Span(
231
+ trace_id="t-gc",
232
+ span_id="llm2",
233
+ parent_span_id="invoke1",
234
+ operation_name="generate_content mockllm-deterministic",
235
+ start_time=4000,
236
+ duration=1000,
237
+ tags={
238
+ "gen_ai.operation.name": "generate_content",
239
+ "gcp.vertex.agent.llm_request": json.dumps({"contents": []}),
240
+ "gcp.vertex.agent.llm_response": json.dumps(
241
+ {
242
+ "Content": {
243
+ "role": "model",
244
+ "parts": [
245
+ {
246
+ "functionCall": {
247
+ "name": "summarize_pods",
248
+ "args": {"namespace": "default"},
249
+ "id": "call_final",
250
+ }
251
+ }
252
+ ],
253
+ }
254
+ }
255
+ ),
256
+ },
257
+ )
258
+ tool_2 = Span(
259
+ trace_id="t-gc",
260
+ span_id="tool2",
261
+ parent_span_id="invoke1",
262
+ operation_name="execute_tool get_events",
263
+ start_time=5000,
264
+ duration=500,
265
+ tags={
266
+ "gen_ai.tool.name": "get_events",
267
+ "gen_ai.tool.call.id": "call_2",
268
+ "gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
269
+ "gcp.vertex.agent.tool_response": json.dumps({"events": []}),
270
+ },
271
+ )
272
+ invoke.children.extend([llm_1, tool_1, llm_2, tool_2])
273
+ trace = Trace(
274
+ trace_id="t-gc",
275
+ root_spans=[invoke],
276
+ all_spans=[invoke, llm_1, tool_1, llm_2, tool_2],
277
+ )
278
+
279
+ result = convert_trace(trace)
280
+
281
+ assert result.warnings == []
282
+ assert len(result.invocations) == 1
283
+ inv = result.invocations[0]
284
+ assert inv.user_content.parts[0].text == "inspect pods"
285
+ final_call = inv.final_response.parts[0].function_call
286
+ assert final_call.name == "summarize_pods"
287
+ assert final_call.args == {"namespace": "default"}
288
+ assert final_call.id == "call_final"
289
+ assert [t.name for t in inv.intermediate_data.tool_uses] == ["list_pods", "get_events"]
290
+
189
291
  def test_no_invoke_agent_warns(self):
190
292
  trace = Trace(
191
293
  trace_id="empty",
@@ -207,6 +309,35 @@ class TestConverter:
207
309
  assert len(result.warnings) == 1
208
310
  assert "no invoke_agent" in result.warnings[0]
209
311
 
312
+ def test_no_llm_descendants_warns_with_compatible_shapes(self):
313
+ invoke = Span(
314
+ trace_id="no-llm",
315
+ span_id="invoke-no-llm",
316
+ parent_span_id=None,
317
+ operation_name="invoke_agent test_agent",
318
+ start_time=1000,
319
+ duration=1000,
320
+ tags={
321
+ "otel.scope.name": "gcp.vertex.agent",
322
+ "gen_ai.operation.name": "invoke_agent",
323
+ },
324
+ )
325
+ trace = Trace(
326
+ trace_id="no-llm",
327
+ root_spans=[invoke],
328
+ all_spans=[invoke],
329
+ )
330
+
331
+ result = convert_trace(trace)
332
+
333
+ assert result.invocations == []
334
+ assert len(result.warnings) == 1
335
+ warning = result.warnings[0]
336
+ assert "invoke-no-llm" in warning
337
+ assert "no converter-compatible ADK LLM descendants" in warning
338
+ assert "call_llm" in warning
339
+ assert "ADK generate_content" in warning
340
+
210
341
  def test_no_tool_spans_fallback_to_llm_response(self):
211
342
  """When no execute_tool spans exist, function_calls should be
212
343
  extracted from call_llm responses instead."""
@@ -107,6 +107,18 @@ class TestExtractUserText:
107
107
  }
108
108
  assert extract_user_text_from_attrs(attrs) == "Second"
109
109
 
110
+ def test_adk_llm_request_outer_contents_pascalcase(self):
111
+ attrs = {
112
+ ADK_LLM_REQUEST: json.dumps(
113
+ {
114
+ "Contents": [
115
+ {"role": "user", "parts": [{"text": "Outer PascalCase only"}]},
116
+ ]
117
+ }
118
+ )
119
+ }
120
+ assert extract_user_text_from_attrs(attrs) == "Outer PascalCase only"
121
+
110
122
  def test_genai_content_based(self):
111
123
  attrs = {
112
124
  OTEL_GENAI_INPUT_MESSAGES: json.dumps(
@@ -170,6 +182,10 @@ class TestExtractAgentResponse:
170
182
  attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"text": "ADK response"}]}})}
171
183
  assert extract_agent_response_from_attrs(attrs) == "ADK response"
172
184
 
185
+ def test_adk_llm_response_outer_content_pascalcase(self):
186
+ attrs = {ADK_LLM_RESPONSE: json.dumps({"Content": {"parts": [{"text": "Outer Content only"}]}})}
187
+ assert extract_agent_response_from_attrs(attrs) == "Outer Content only"
188
+
173
189
  def test_genai_content_based(self):
174
190
  attrs = {
175
191
  OTEL_GENAI_OUTPUT_MESSAGES: json.dumps(
@@ -519,6 +535,39 @@ class TestAdkExtractorSpanFinding:
519
535
  ext = AdkExtractor()
520
536
  assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
521
537
 
538
+ def test_find_llm_spans_in_falls_back_to_adk_generate_content(self):
539
+ child_llm = _span(
540
+ op="generate_content mockllm-deterministic",
541
+ tags={ADK_LLM_REQUEST: "{}"},
542
+ span_id="llm1",
543
+ )
544
+ child_tool = _span(op="execute_tool search", span_id="tool1")
545
+ root = _span(op="invoke_agent a", children=[child_llm, child_tool])
546
+ ext = AdkExtractor()
547
+ assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
548
+
549
+ def test_find_llm_spans_in_ignores_provider_generate_content_without_adk_payload(self):
550
+ child_llm = _span(
551
+ op="generate_content gpt-4",
552
+ tags={OTEL_GENAI_REQUEST_MODEL: "gpt-4"},
553
+ span_id="llm1",
554
+ )
555
+ root = _span(op="invoke_agent a", children=[child_llm])
556
+ ext = AdkExtractor()
557
+ assert ext.find_llm_spans_in(root) == []
558
+
559
+ def test_find_llm_spans_in_prefers_call_llm_over_generate_content(self):
560
+ call_llm = _span(op="call_llm gemini", span_id="llm1", start_time=20)
561
+ generate_content = _span(
562
+ op="generate_content gemini",
563
+ tags={ADK_LLM_REQUEST: "{}"},
564
+ span_id="llm2",
565
+ start_time=10,
566
+ )
567
+ root = _span(op="invoke_agent a", children=[generate_content, call_llm])
568
+ ext = AdkExtractor()
569
+ assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
570
+
522
571
  def test_find_tool_spans_in(self):
523
572
  child_llm = _span(op="call_llm gemini", span_id="llm1")
524
573
  child_tool = _span(op="execute_tool search", span_id="tool1")
@@ -530,6 +579,7 @@ class TestAdkExtractorSpanFinding:
530
579
  ext = AdkExtractor()
531
580
  assert ext.classify_span(_span(op="invoke_agent a", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "invocation"
532
581
  assert ext.classify_span(_span(op="call_llm", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "llm"
582
+ assert ext.classify_span(_span(op="generate_content", tags={ADK_LLM_REQUEST: "{}"})) == "llm"
533
583
  assert ext.classify_span(_span(op="execute_tool x", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "tool"
534
584
  assert ext.classify_span(_span(op="random")) is None
535
585