agentevals-cli 0.6.4__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/PKG-INFO +16 -37
  2. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/README.md +14 -35
  3. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/otel-compatibility.md +104 -10
  4. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/README.md +4 -1
  5. agentevals_cli-0.7.1/examples/zero-code-examples/pydantic-ai/requirements.txt +5 -0
  6. agentevals_cli-0.7.1/examples/zero-code-examples/pydantic-ai/run.py +105 -0
  7. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/flake.lock +0 -21
  8. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/flake.nix +21 -13
  9. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/pyproject.toml +14 -2
  10. agentevals_cli-0.6.4/src/agentevals/_static/assets/index-X7q-J7YQ.js → agentevals_cli-0.7.1/src/agentevals/_static/assets/index-7YPfPT4N.js +12 -12
  11. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/_static/index.html +1 -1
  12. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/app.py +2 -5
  13. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/models.py +10 -0
  14. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/routes.py +191 -2
  15. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/builtin_metrics.py +123 -1
  16. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/config.py +35 -18
  17. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/extraction.py +3 -3
  18. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/loader/otlp.py +55 -13
  19. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/output.py +77 -19
  20. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/runner.py +98 -31
  21. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/trace_metrics.py +60 -14
  22. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/test_live_agents.py +60 -0
  23. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_api.py +315 -5
  24. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_extraction.py +2 -2
  25. agentevals_cli-0.7.1/tests/test_otlp_loader.py +454 -0
  26. agentevals_cli-0.7.1/tests/test_trace_metrics.py +519 -0
  27. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/upload/MetricSelector.tsx +46 -31
  28. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/types.ts +31 -0
  29. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/uv.lock +764 -27
  30. agentevals_cli-0.6.4/tests/test_otlp_loader.py +0 -210
  31. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.claude/skills/eval/SKILL.md +0 -0
  32. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.claude/skills/eval/evals/evals.json +0 -0
  33. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.claude/skills/inspect/SKILL.md +0 -0
  34. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.claude/skills/inspect/evals/evals.json +0 -0
  35. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.dockerignore +0 -0
  36. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  37. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  38. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  39. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.github/workflows/ci.yml +0 -0
  40. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  41. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.github/workflows/release.yml +0 -0
  42. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.gitignore +0 -0
  43. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/.mcp.json +0 -0
  44. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/CONTRIBUTING.md +0 -0
  45. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/DEVELOPMENT.md +0 -0
  46. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/Dockerfile +0 -0
  47. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/LICENSE +0 -0
  48. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/Makefile +0 -0
  49. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/Chart.yaml +0 -0
  50. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/templates/NOTES.txt +0 -0
  51. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/templates/_helpers.tpl +0 -0
  52. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/templates/deployment.yaml +0 -0
  53. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/templates/service.yaml +0 -0
  54. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  55. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/charts/agentevals/values.yaml +0 -0
  56. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/assets/logo-color-on-transparent.svg +0 -0
  57. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/assets/logo-color.png +0 -0
  58. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/assets/logo-dark-on-transparent.svg +0 -0
  59. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/custom-evaluators.md +0 -0
  60. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/eval-set-format.md +0 -0
  61. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/docs/streaming.md +0 -0
  62. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/custom_evaluators/eval_config.yaml +0 -0
  63. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/custom_evaluators/response_quality.py +0 -0
  64. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
  65. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/dice_agent/README.md +0 -0
  66. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/dice_agent/agent.py +0 -0
  67. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/dice_agent/eval_set.json +0 -0
  68. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/dice_agent/main.py +0 -0
  69. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/dice_agent/test_streaming.py +0 -0
  70. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/kubernetes/README.md +0 -0
  71. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/langchain_agent/README.md +0 -0
  72. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/langchain_agent/agent.py +0 -0
  73. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/langchain_agent/eval_set.json +0 -0
  74. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/langchain_agent/main.py +0 -0
  75. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/langchain_agent/requirements.txt +0 -0
  76. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/langchain_agent/test_streaming.py +0 -0
  77. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/sdk_example/async_example.py +0 -0
  78. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/sdk_example/context_manager_example.py +0 -0
  79. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/sdk_example/decorator_example.py +0 -0
  80. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/sdk_example/requirements.txt +0 -0
  81. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/strands_agent/agent.py +0 -0
  82. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/strands_agent/eval_set.json +0 -0
  83. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/strands_agent/main.py +0 -0
  84. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/strands_agent/requirements.txt +0 -0
  85. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
  86. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/adk/run.py +0 -0
  87. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  88. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/langchain/run.py +0 -0
  89. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  90. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/ollama/run.py +0 -0
  91. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  92. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/openai-agents/run.py +0 -0
  93. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
  94. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/examples/zero-code-examples/strands/run.py +0 -0
  95. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/README.md +0 -0
  96. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  97. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  98. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  99. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  100. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/eval_set_helm.json +0 -0
  101. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
  102. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/evalset_k8s_2026-02-20.json +0 -0
  103. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/helm.json +0 -0
  104. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/helm_2.json +0 -0
  105. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/helm_3.json +0 -0
  106. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/samples/k8s.json +0 -0
  107. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/__init__.py +0 -0
  108. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/_protocol.py +0 -0
  109. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  110. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/_static/logo.svg +0 -0
  111. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/_static/vite.svg +0 -0
  112. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/__init__.py +0 -0
  113. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/debug_routes.py +0 -0
  114. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/dependencies.py +0 -0
  115. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_app.py +0 -0
  116. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_grpc.py +0 -0
  117. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_processing.py +0 -0
  118. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_routes.py +0 -0
  119. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/api/streaming_routes.py +0 -0
  120. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/cli.py +0 -0
  121. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/converter.py +0 -0
  122. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/custom_evaluators.py +0 -0
  123. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/eval_config_loader.py +0 -0
  124. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/evaluator/__init__.py +0 -0
  125. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/evaluator/resolver.py +0 -0
  126. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/evaluator/sources.py +0 -0
  127. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/evaluator/templates.py +0 -0
  128. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/evaluator/venv.py +0 -0
  129. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/genai_converter.py +0 -0
  130. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/loader/__init__.py +0 -0
  131. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/loader/base.py +0 -0
  132. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/loader/jaeger.py +0 -0
  133. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/mcp_server.py +0 -0
  134. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/openai_eval_backend.py +0 -0
  135. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/sdk.py +0 -0
  136. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/streaming/__init__.py +0 -0
  137. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/streaming/incremental_processor.py +0 -0
  138. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/streaming/processor.py +0 -0
  139. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/streaming/session.py +0 -0
  140. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/streaming/ws_server.py +0 -0
  141. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/trace_attrs.py +0 -0
  142. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/utils/__init__.py +0 -0
  143. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/utils/genai_messages.py +0 -0
  144. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/utils/log_buffer.py +0 -0
  145. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/src/agentevals/utils/log_enrichment.py +0 -0
  146. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/__init__.py +0 -0
  147. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/conftest.py +0 -0
  148. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/test_evaluation_pipeline.py +0 -0
  149. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  150. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/test_session_grouping.py +0 -0
  151. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/integration/test_timing_stress.py +0 -0
  152. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_cli.py +0 -0
  153. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_converter.py +0 -0
  154. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_genai_converter.py +0 -0
  155. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_jaeger_loader.py +0 -0
  156. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_log_enrichment.py +0 -0
  157. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_mcp_server.py +0 -0
  158. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_otlp_receiver.py +0 -0
  159. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_output.py +0 -0
  160. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_protocol.py +0 -0
  161. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_runner.py +0 -0
  162. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/tests/test_sdk.py +0 -0
  163. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/.gitignore +0 -0
  164. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/README.md +0 -0
  165. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/eslint.config.js +0 -0
  166. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/index.html +0 -0
  167. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/package-lock.json +0 -0
  168. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/package.json +0 -0
  169. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/public/logo.svg +0 -0
  170. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/public/vite.svg +0 -0
  171. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/App.css +0 -0
  172. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/App.tsx +0 -0
  173. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/api/client.ts +0 -0
  174. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/assets/react.svg +0 -0
  175. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  176. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  177. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  178. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  179. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  180. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/BuilderView.tsx +0 -0
  181. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  182. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  183. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  184. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
  185. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  186. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  187. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/builder/index.ts +0 -0
  188. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  189. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  190. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  191. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  192. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  193. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  194. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  195. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  196. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/DataSection.tsx +0 -0
  197. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  198. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  199. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorView.tsx +0 -0
  200. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  201. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  202. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  203. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  204. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  205. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  206. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  207. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  208. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  209. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  210. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  211. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
  212. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  213. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  214. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
  215. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  216. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  217. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/upload/UploadView.tsx +0 -0
  218. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  219. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/config.ts +0 -0
  220. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/context/TraceContext.tsx +0 -0
  221. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/context/TraceProvider.tsx +0 -0
  222. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/index.css +0 -0
  223. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/console-capture.ts +0 -0
  224. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/evalset-builder.ts +0 -0
  225. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/network-capture.ts +0 -0
  226. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/trace-helpers.ts +0 -0
  227. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/trace-loader.ts +0 -0
  228. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/trace-metadata.ts +0 -0
  229. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/trace-patcher.ts +0 -0
  230. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/lib/utils.ts +0 -0
  231. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/src/main.tsx +0 -0
  232. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/tsconfig.app.json +0 -0
  233. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/tsconfig.json +0 -0
  234. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/tsconfig.node.json +0 -0
  235. {agentevals_cli-0.6.4 → agentevals_cli-0.7.1}/ui/vite.config.ts +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.6.4
3
+ Version: 0.7.1
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
7
7
  Requires-Dist: click>=8.0
8
8
  Requires-Dist: fastapi>=0.115.0
9
- Requires-Dist: google-adk[eval]>=1.25.0
9
+ Requires-Dist: google-adk[eval]>=1.30.0
10
10
  Requires-Dist: httpx>=0.27.0
11
11
  Requires-Dist: opentelemetry-proto>=1.36.0
12
12
  Requires-Dist: python-dotenv>=1.0.0
@@ -59,34 +59,16 @@ agentevals scores performance and inference quality from OpenTelemetry traces. N
59
59
 
60
60
  ## What is agentevals?
61
61
 
62
- agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want. No re-runs, no guesswork.
62
+ agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want without re-executing or burning extra tokens.
63
63
 
64
64
  It works with any OTel-instrumented framework (LangChain, Strands, Google ADK, OpenAI Agents SDK, and others), supports Jaeger JSON and native OTLP trace formats, and ships with built-in evaluators, custom evaluator support, and LLM-based judges.
65
65
 
66
- - **CLI** for scripting and CI pipelines
67
- - **Web UI** for visual inspection and local developer experience
68
- - **Kubernetes and OTel support** so you can deploy right next to your agents; works natively in your OpenTelemetry pipeline
69
- - **MCP server** so MCP clients can run evaluations from a conversation
70
-
71
- ## Why agentevals?
72
-
73
- Most evaluation tools require you to **re-execute your agent** for every test, burning tokens, time, and money on duplicate LLM calls. agentevals takes a different approach:
74
-
75
66
  - **No re-execution**: score agents from existing traces without replaying expensive LLM calls
76
- - **Framework-agnostic**: works with any agent framework that emits OpenTelemetry spans
77
67
  - **Golden eval sets**: compare actual behavior against defined expected behaviors for deterministic pass/fail gating
78
68
  - **Custom evaluators**: write scoring logic in Python, JavaScript, or any language, or offload scoring to OpenAI Eval API
79
69
  - **CI/CD ready**: gate deployments on quality thresholds directly in your pipeline
80
70
  - **Local-first**: no cloud dependency required; everything runs on your machine
81
-
82
- ## How It Works
83
-
84
- agentevals follows three simple steps:
85
-
86
- 1. **Collect traces**: Instrument your agent with OpenTelemetry (or export traces from your tracing backend). Point the OTLP exporter at the agentevals receiver, or load trace files directly.
87
- 2. **Define eval sets**: Create golden evaluation sets that describe expected agent behavior: which tools should be called, in what order, and what the output should look like.
88
- 3. **Run evaluations**: Use the CLI, Web UI, or MCP server to score traces against your eval sets. Get per-metric scores, pass/fail results, and detailed span-level breakdowns.
89
-
71
+ - **Multiple interfaces**: CLI for scripting and CI, Web UI for visual inspection, MCP server for conversational evaluation, Helm chart for Kubernetes environments
90
72
 
91
73
  > [!IMPORTANT]
92
74
  > This project is under active development. Expect breaking changes.
@@ -95,7 +77,7 @@ agentevals follows three simple steps:
95
77
 
96
78
  - [Installation](#installation)
97
79
  - [Quick Start](#quick-start)
98
- - [Integration](#integration)
80
+ - [Use-cases and Integrations](#use-cases-and-integrations)
99
81
  - [CLI](#cli)
100
82
  - [Custom Evaluators](#custom-evaluators)
101
83
  - [Web UI](#web-ui)
@@ -194,14 +176,14 @@ agentevals serve
194
176
  # opens http://localhost:8001
195
177
  ```
196
178
 
197
- You can also point any OTel-instrumented agent directly at the built-in receiver (`OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318`). The UI streams tool calls, inputs, and outputs live as your agent runs. For production setups, the same receiver slots into a Kubernetes OTel Collector pipeline as an exporter destination. See [Integration](#integration) and the [Kubernetes example](examples/kubernetes/README.md) for walkthroughs.
179
+ You can also point any OTel-instrumented agent directly at the built-in receiver (`OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318`). The UI streams tool calls, inputs, and outputs live as your agent runs. For production setups, the same receiver slots into a Kubernetes OTel Collector pipeline as an exporter destination. See [Use-cases and Integrations](#use-cases-and-integrations) and the [Kubernetes example](examples/kubernetes/README.md) for walkthroughs.
198
180
 
199
181
  **Next steps:**
200
182
 
201
183
  - `agentevals evaluator list` to see all built-in and community evaluators
202
184
  - [Custom Evaluators](#custom-evaluators) to write your own scoring logic
203
185
 
204
- ## Use-cases and integrations
186
+ ## Use-cases and Integrations
205
187
 
206
188
  ### Zero-Code (Recommended)
207
189
 
@@ -243,7 +225,7 @@ with app.session(eval_set_id="my-eval"):
243
225
 
244
226
  Requires `pip install "agentevals-cli[streaming]"`. See [examples/sdk_example/](examples/sdk_example/) for framework-specific patterns.
245
227
 
246
- ## CLI for local testing, and CI pipelines
228
+ ## CLI
247
229
 
248
230
  ```bash
249
231
  # Multiple traces, JSON output
@@ -306,12 +288,13 @@ A `Dockerfile` is included at the project root. The image bundles the API, web U
306
288
 
307
289
  ```bash
308
290
  docker build -t agentevals .
309
- docker run -p 8001:8001 -p 4318:4318 agentevals
291
+ docker run -p 8001:8001 -p 4317:4317 -p 4318:4318 agentevals
310
292
  ```
311
293
 
312
294
  | Port | Purpose |
313
295
  |------|---------|
314
296
  | 8001 | Web UI and REST API |
297
+ | 4317 | OTLP gRPC receiver (traces and logs) |
315
298
  | 4318 | OTLP HTTP receiver (traces and logs) |
316
299
  | 8080 | MCP (Streamable HTTP) |
317
300
 
@@ -389,7 +372,7 @@ See [DEVELOPMENT.md](DEVELOPMENT.md) for build tiers, Makefile targets, and Nix
389
372
 
390
373
  **Do I need a database or any infrastructure to run agentevals?**
391
374
 
392
- No. agentevals is a single `pip install` with no database, no message queue, and no external services. The CLI evaluates trace files directly from disk. The web UI and live streaming use in-memory session state. You can go from zero to scored traces in under a minute.
375
+ No. agentevals is a single `pip install` with no database, no message queue, and no external services. The CLI evaluates trace files directly from disk. The web UI and live streaming use in-memory session state.
393
376
 
394
377
  **Does the CLI require a running server?**
395
378
 
@@ -397,23 +380,19 @@ No. `agentevals run` evaluates trace files entirely offline. The server (`agente
397
380
 
398
381
  **Can I use agentevals in CI/CD?**
399
382
 
400
- Yes. The CLI is designed for pipeline use: pass trace files and an eval set, set a threshold, and let the exit code gate your deployment. Combine it with `--output json` for machine-readable results. No server process needed.
383
+ Yes. Pass trace files and an eval set, set a threshold, and let the exit code gate your deployment. Combine with `--output json` for machine-readable results. No server process needed.
401
384
 
402
385
  **What if I switch agent frameworks?**
403
386
 
404
- Because agentevals uses OpenTelemetry as its universal interface, switching frameworks (e.g., from LangChain to Strands, or from ADK to OpenAI Agents) does not require changing your evaluation setup. As long as your new framework emits OTel spans, the same eval sets and metrics work as before.
387
+ Because agentevals uses OpenTelemetry as its universal interface, switching frameworks does not require changing your evaluation setup. As long as your new framework emits OTel spans, the same eval sets and metrics work as before.
405
388
 
406
389
  **Can I write evaluators in my own language?**
407
390
 
408
- Yes. A custom evaluator is any program that reads JSON from stdin and writes a score to stdout. Python and JavaScript have first-class scaffolding support (`agentevals evaluator init`), but any language works. If your evaluator has a `requirements.txt`, agentevals manages a cached virtual environment automatically.
391
+ Yes. A custom evaluator is any program that reads JSON from stdin and writes a score to stdout. Python and JavaScript have first-class scaffolding support (`agentevals evaluator init`), but any language works.
409
392
 
410
393
  **Can I plug agentevals into an existing OTel pipeline?**
411
394
 
412
- Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this exact pattern.
413
-
414
- **Can I deploy agentevals on Kubernetes?**
415
-
416
- Yes. A Dockerfile and a [Helm chart](charts/agentevals/) are included. A single pod exposes the web UI (8001), OTLP receiver (4318), and MCP server (8080). See the [Kubernetes example](examples/kubernetes/README.md) for a full walkthrough deploying agentevals alongside kagent and an OTel Collector.
395
+ Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this pattern.
417
396
 
418
397
  **How does this compare to ADK's evaluations?**
419
398
 
@@ -425,7 +404,7 @@ However, if you're iterating on your agents locally, you can point your agents t
425
404
 
426
405
  AgentCore's evaluation integration (via `strands-agents-evals`) also couples agent execution with evaluation. It re-invokes the agent for each test case, converts the resulting OTel spans to AWS's ADOT format, and scores them against 4 built-in evaluators (Helpfulness, Accuracy, Harmfulness, Relevance) via a cloud API call. This means you need an AWS account, valid credentials, and network access for every evaluation.
427
406
 
428
- agentevals takes a different approach: it scores pre-recorded traces locally without re-running anything. It works with standard Jaeger JSON and OTLP formats from any framework, supports open-ended metrics (tool trajectory matching, LLM-based judges, custom scorers), and ships with a CLI, web UI, and MCP server. No cloud dependency required, though we do include all ADK's GCP-based evals as of now.
407
+ agentevals scores pre-recorded traces locally without re-running anything. It works with standard Jaeger JSON and OTLP formats from any framework, supports open-ended metrics (tool trajectory matching, LLM-based judges, custom scorers), and ships with a CLI, web UI, and MCP server. No cloud dependency required, though we do include all ADK's GCP-based evals as of now.
429
408
 
430
409
  **How does this compare to LangSmith?**
431
410
 
@@ -33,34 +33,16 @@ agentevals scores performance and inference quality from OpenTelemetry traces. N
33
33
 
34
34
  ## What is agentevals?
35
35
 
36
- agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want. No re-runs, no guesswork.
36
+ agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want without re-executing or burning extra tokens.
37
37
 
38
38
  It works with any OTel-instrumented framework (LangChain, Strands, Google ADK, OpenAI Agents SDK, and others), supports Jaeger JSON and native OTLP trace formats, and ships with built-in evaluators, custom evaluator support, and LLM-based judges.
39
39
 
40
- - **CLI** for scripting and CI pipelines
41
- - **Web UI** for visual inspection and local developer experience
42
- - **Kubernetes and OTel support** so you can deploy right next to your agents; works natively in your OpenTelemetry pipeline
43
- - **MCP server** so MCP clients can run evaluations from a conversation
44
-
45
- ## Why agentevals?
46
-
47
- Most evaluation tools require you to **re-execute your agent** for every test, burning tokens, time, and money on duplicate LLM calls. agentevals takes a different approach:
48
-
49
40
  - **No re-execution**: score agents from existing traces without replaying expensive LLM calls
50
- - **Framework-agnostic**: works with any agent framework that emits OpenTelemetry spans
51
41
  - **Golden eval sets**: compare actual behavior against defined expected behaviors for deterministic pass/fail gating
52
42
  - **Custom evaluators**: write scoring logic in Python, JavaScript, or any language, or offload scoring to OpenAI Eval API
53
43
  - **CI/CD ready**: gate deployments on quality thresholds directly in your pipeline
54
44
  - **Local-first**: no cloud dependency required; everything runs on your machine
55
-
56
- ## How It Works
57
-
58
- agentevals follows three simple steps:
59
-
60
- 1. **Collect traces**: Instrument your agent with OpenTelemetry (or export traces from your tracing backend). Point the OTLP exporter at the agentevals receiver, or load trace files directly.
61
- 2. **Define eval sets**: Create golden evaluation sets that describe expected agent behavior: which tools should be called, in what order, and what the output should look like.
62
- 3. **Run evaluations**: Use the CLI, Web UI, or MCP server to score traces against your eval sets. Get per-metric scores, pass/fail results, and detailed span-level breakdowns.
63
-
45
+ - **Multiple interfaces**: CLI for scripting and CI, Web UI for visual inspection, MCP server for conversational evaluation, Helm chart for Kubernetes environments
64
46
 
65
47
  > [!IMPORTANT]
66
48
  > This project is under active development. Expect breaking changes.
@@ -69,7 +51,7 @@ agentevals follows three simple steps:
69
51
 
70
52
  - [Installation](#installation)
71
53
  - [Quick Start](#quick-start)
72
- - [Integration](#integration)
54
+ - [Use-cases and Integrations](#use-cases-and-integrations)
73
55
  - [CLI](#cli)
74
56
  - [Custom Evaluators](#custom-evaluators)
75
57
  - [Web UI](#web-ui)
@@ -168,14 +150,14 @@ agentevals serve
168
150
  # opens http://localhost:8001
169
151
  ```
170
152
 
171
- You can also point any OTel-instrumented agent directly at the built-in receiver (`OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318`). The UI streams tool calls, inputs, and outputs live as your agent runs. For production setups, the same receiver slots into a Kubernetes OTel Collector pipeline as an exporter destination. See [Integration](#integration) and the [Kubernetes example](examples/kubernetes/README.md) for walkthroughs.
153
+ You can also point any OTel-instrumented agent directly at the built-in receiver (`OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318`). The UI streams tool calls, inputs, and outputs live as your agent runs. For production setups, the same receiver slots into a Kubernetes OTel Collector pipeline as an exporter destination. See [Use-cases and Integrations](#use-cases-and-integrations) and the [Kubernetes example](examples/kubernetes/README.md) for walkthroughs.
172
154
 
173
155
  **Next steps:**
174
156
 
175
157
  - `agentevals evaluator list` to see all built-in and community evaluators
176
158
  - [Custom Evaluators](#custom-evaluators) to write your own scoring logic
177
159
 
178
- ## Use-cases and integrations
160
+ ## Use-cases and Integrations
179
161
 
180
162
  ### Zero-Code (Recommended)
181
163
 
@@ -217,7 +199,7 @@ with app.session(eval_set_id="my-eval"):
217
199
 
218
200
  Requires `pip install "agentevals-cli[streaming]"`. See [examples/sdk_example/](examples/sdk_example/) for framework-specific patterns.
219
201
 
220
- ## CLI for local testing, and CI pipelines
202
+ ## CLI
221
203
 
222
204
  ```bash
223
205
  # Multiple traces, JSON output
@@ -280,12 +262,13 @@ A `Dockerfile` is included at the project root. The image bundles the API, web U
280
262
 
281
263
  ```bash
282
264
  docker build -t agentevals .
283
- docker run -p 8001:8001 -p 4318:4318 agentevals
265
+ docker run -p 8001:8001 -p 4317:4317 -p 4318:4318 agentevals
284
266
  ```
285
267
 
286
268
  | Port | Purpose |
287
269
  |------|---------|
288
270
  | 8001 | Web UI and REST API |
271
+ | 4317 | OTLP gRPC receiver (traces and logs) |
289
272
  | 4318 | OTLP HTTP receiver (traces and logs) |
290
273
  | 8080 | MCP (Streamable HTTP) |
291
274
 
@@ -363,7 +346,7 @@ See [DEVELOPMENT.md](DEVELOPMENT.md) for build tiers, Makefile targets, and Nix
363
346
 
364
347
  **Do I need a database or any infrastructure to run agentevals?**
365
348
 
366
- No. agentevals is a single `pip install` with no database, no message queue, and no external services. The CLI evaluates trace files directly from disk. The web UI and live streaming use in-memory session state. You can go from zero to scored traces in under a minute.
349
+ No. agentevals is a single `pip install` with no database, no message queue, and no external services. The CLI evaluates trace files directly from disk. The web UI and live streaming use in-memory session state.
367
350
 
368
351
  **Does the CLI require a running server?**
369
352
 
@@ -371,23 +354,19 @@ No. `agentevals run` evaluates trace files entirely offline. The server (`agente
371
354
 
372
355
  **Can I use agentevals in CI/CD?**
373
356
 
374
- Yes. The CLI is designed for pipeline use: pass trace files and an eval set, set a threshold, and let the exit code gate your deployment. Combine it with `--output json` for machine-readable results. No server process needed.
357
+ Yes. Pass trace files and an eval set, set a threshold, and let the exit code gate your deployment. Combine with `--output json` for machine-readable results. No server process needed.
375
358
 
376
359
  **What if I switch agent frameworks?**
377
360
 
378
- Because agentevals uses OpenTelemetry as its universal interface, switching frameworks (e.g., from LangChain to Strands, or from ADK to OpenAI Agents) does not require changing your evaluation setup. As long as your new framework emits OTel spans, the same eval sets and metrics work as before.
361
+ Because agentevals uses OpenTelemetry as its universal interface, switching frameworks does not require changing your evaluation setup. As long as your new framework emits OTel spans, the same eval sets and metrics work as before.
379
362
 
380
363
  **Can I write evaluators in my own language?**
381
364
 
382
- Yes. A custom evaluator is any program that reads JSON from stdin and writes a score to stdout. Python and JavaScript have first-class scaffolding support (`agentevals evaluator init`), but any language works. If your evaluator has a `requirements.txt`, agentevals manages a cached virtual environment automatically.
365
+ Yes. A custom evaluator is any program that reads JSON from stdin and writes a score to stdout. Python and JavaScript have first-class scaffolding support (`agentevals evaluator init`), but any language works.
383
366
 
384
367
  **Can I plug agentevals into an existing OTel pipeline?**
385
368
 
386
- Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this exact pattern.
387
-
388
- **Can I deploy agentevals on Kubernetes?**
389
-
390
- Yes. A Dockerfile and a [Helm chart](charts/agentevals/) are included. A single pod exposes the web UI (8001), OTLP receiver (4318), and MCP server (8080). See the [Kubernetes example](examples/kubernetes/README.md) for a full walkthrough deploying agentevals alongside kagent and an OTel Collector.
369
+ Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this pattern.
391
370
 
392
371
  **How does this compare to ADK's evaluations?**
393
372
 
@@ -399,7 +378,7 @@ However, if you're iterating on your agents locally, you can point your agents t
399
378
 
400
379
  AgentCore's evaluation integration (via `strands-agents-evals`) also couples agent execution with evaluation. It re-invokes the agent for each test case, converts the resulting OTel spans to AWS's ADOT format, and scores them against 4 built-in evaluators (Helpfulness, Accuracy, Harmfulness, Relevance) via a cloud API call. This means you need an AWS account, valid credentials, and network access for every evaluation.
401
380
 
402
- agentevals takes a different approach: it scores pre-recorded traces locally without re-running anything. It works with standard Jaeger JSON and OTLP formats from any framework, supports open-ended metrics (tool trajectory matching, LLM-based judges, custom scorers), and ships with a CLI, web UI, and MCP server. No cloud dependency required, though we do include all ADK's GCP-based evals as of now.
381
+ agentevals scores pre-recorded traces locally without re-running anything. It works with standard Jaeger JSON and OTLP formats from any framework, supports open-ended metrics (tool trajectory matching, LLM-based judges, custom scorers), and ships with a CLI, web UI, and MCP server. No cloud dependency required, though we do include all ADK's GCP-based evals as of now.
403
382
 
404
383
  **How does this compare to LangSmith?**
405
384
 
@@ -8,7 +8,9 @@ agentevals consumes OpenTelemetry traces to evaluate AI agents. This document co
8
8
 
9
9
  The [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) define standard span attributes for LLM interactions. agentevals auto-detects this format when spans contain `gen_ai.request.model` or `gen_ai.input.messages`.
10
10
 
11
- Supported attributes:
11
+ This format works with LangChain, Strands, OpenAI instrumentation, Anthropic instrumentation, and any framework that follows the GenAI semantic conventions.
12
+
13
+ #### Core attributes
12
14
 
13
15
  | Attribute | Description |
14
16
  |-----------|-------------|
@@ -18,9 +20,51 @@ Supported attributes:
18
20
  | `gen_ai.response.finish_reasons` | Why the model stopped generating |
19
21
  | `gen_ai.usage.input_tokens` | Input token count |
20
22
  | `gen_ai.usage.output_tokens` | Output token count |
21
- | `gen_ai.system` | AI system identifier (e.g. `openai`, `anthropic`) |
22
23
 
23
- This format works with LangChain, Strands, OpenAI instrumentation, Anthropic instrumentation, and any framework that follows the GenAI semantic conventions.
24
+ #### Provider and response metadata (v1.37.0+)
25
+
26
+ | Attribute | Description |
27
+ |-----------|-------------|
28
+ | `gen_ai.provider.name` | LLM provider (e.g. `openai`, `anthropic`). Replaces the deprecated `gen_ai.system`. |
29
+ | `gen_ai.response.model` | Model name returned in the response |
30
+ | `gen_ai.response.id` | Unique response identifier |
31
+
32
+ #### Request parameters (v1.40.0+)
33
+
34
+ | Attribute | Description |
35
+ |-----------|-------------|
36
+ | `gen_ai.request.temperature` | Temperature sampling parameter |
37
+ | `gen_ai.request.max_tokens` | Maximum output tokens limit |
38
+ | `gen_ai.request.top_p` | Top-P (nucleus) sampling parameter |
39
+ | `gen_ai.request.top_k` | Top-K sampling parameter |
40
+
41
+ #### Cache token usage
42
+
43
+ | Attribute | Description |
44
+ |-----------|-------------|
45
+ | `gen_ai.usage.cache_creation.input_tokens` | Tokens spent creating a prompt cache entry |
46
+ | `gen_ai.usage.cache_read.input_tokens` | Tokens served from an existing cache entry |
47
+
48
+ These are relevant for providers that support prompt caching (Anthropic, OpenAI). agentevals aggregates these across LLM spans and displays them in the performance summary.
49
+
50
+ #### Agent and tool metadata (v1.31.0+)
51
+
52
+ | Attribute | Description |
53
+ |-----------|-------------|
54
+ | `gen_ai.agent.id` | Unique agent identifier |
55
+ | `gen_ai.agent.description` | Agent description |
56
+ | `gen_ai.tool.description` | Tool description |
57
+ | `gen_ai.tool.type` | Tool type classification |
58
+
59
+ #### Opt-in attributes (v1.37.0+)
60
+
61
+ These may contain large payloads and are typically gated behind instrumentation flags:
62
+
63
+ | Attribute | Description |
64
+ |-----------|-------------|
65
+ | `gen_ai.system_instructions` | System prompt text |
66
+ | `gen_ai.tool.definitions` | Tool schema definitions (JSON) |
67
+ | `gen_ai.output.type` | Classification of output content |
24
68
 
25
69
  ### Google ADK (framework-native)
26
70
 
@@ -30,9 +74,33 @@ Google ADK emits spans under the `gcp.vertex.agent` OTel scope with proprietary
30
74
 
31
75
  Format detection is automatic. When a trace contains both ADK and GenAI attributes, ADK takes priority because it provides richer structured data. The detection logic lives in `src/agentevals/converter.py` (`get_extractor()`).
32
76
 
77
+ ## Message Formats
78
+
79
+ GenAI message content (`gen_ai.input.messages`, `gen_ai.output.messages`) can use two JSON schemas. agentevals supports both and normalizes them internally.
80
+
81
+ ### Content-based format
82
+
83
+ Used by OpenAI and LangChain instrumentors (v2):
84
+
85
+ ```json
86
+ {"role": "user", "content": "Hello"}
87
+ {"role": "assistant", "content": "...", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"city\": \"NYC\"}"}}]}
88
+ ```
89
+
90
+ ### Parts-based format (v1.36.0+)
91
+
92
+ Used by newer instrumentors that follow the GenAI semconv parts schema:
93
+
94
+ ```json
95
+ {"role": "user", "parts": [{"type": "text", "content": "Hello"}]}
96
+ {"role": "assistant", "parts": [{"type": "tool_call", "name": "get_weather", "arguments": {"city": "NYC"}}]}
97
+ ```
98
+
99
+ Both formats are auto-detected per message. Tool calls are normalized to `{name, id, arguments}` regardless of source format.
100
+
33
101
  ## Message Content Delivery
34
102
 
35
- GenAI message content (`gen_ai.input.messages`, `gen_ai.output.messages`) can arrive through three mechanisms. agentevals supports all of them:
103
+ GenAI message content can arrive through three mechanisms. agentevals supports all of them:
36
104
 
37
105
  ### 1. Span attributes (simplest)
38
106
 
@@ -80,18 +148,44 @@ If you maintain an OTel-instrumented agent framework and want to align with the
80
148
 
81
149
  ## OTLP Receiver
82
150
 
83
- agentevals runs:
151
+ agentevals runs two OTLP receivers:
152
+
153
+ - **gRPC** on port 4317 (standard OTLP gRPC port, configurable via `--otlp-grpc-port`)
154
+ - **HTTP** on port 4318 (standard OTLP HTTP port)
84
155
 
85
- - OTLP HTTP receiver on port 4318 (standard OTLP HTTP port)
86
- - OTLP gRPC receiver on port 4317 (standard OTLP gRPC port).
156
+ Both accept traces and logs and feed into the same session manager.
87
157
 
88
- OTLP HTTP accepts:
158
+ ### OTLP HTTP
89
159
 
90
160
  | Endpoint | Content Types |
91
161
  |----------|--------------|
92
162
  | `/v1/traces` | `application/json`, `application/x-protobuf` |
93
163
  | `/v1/logs` | `application/json`, `application/x-protobuf` |
94
164
 
95
- Point OTLP/HTTP exporters at `http://localhost:4318`.
96
- Point OTLP/gRPC exporters at `localhost:4317` with `OTEL_EXPORTER_OTLP_PROTOCOL=grpc`.
165
+ ### OTLP gRPC
166
+
167
+ Implements the standard `TraceService/Export` and `LogsService/Export` RPCs. Configuration:
168
+
169
+ | Setting | Default |
170
+ |---------|---------|
171
+ | Max message size | 8 MB |
172
+ | Max concurrent RPCs | 32 |
173
+ | Compression | gzip |
174
+ | TLS | off (insecure) |
175
+
176
+ ### Client configuration
177
+
178
+ For HTTP exporters:
179
+
180
+ ```bash
181
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
182
+ ```
183
+
184
+ For gRPC exporters:
185
+
186
+ ```bash
187
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
188
+ export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
189
+ ```
190
+
97
191
  Traces and logs stream into agentevals automatically. See [examples/README.md](../examples/README.md) for zero-code setup instructions.
@@ -29,6 +29,7 @@ agentevals accepts OTLP/HTTP on port 4318 (`http/protobuf` and `http/json`) and
29
29
  | [zero-code-examples/ollama/](./zero-code-examples/ollama/) | LangChain | Ollama |
30
30
  | [zero-code-examples/strands/](./zero-code-examples/strands/) | Strands | OpenAI |
31
31
  | [zero-code-examples/adk/](./zero-code-examples/adk/) | Google ADK | Gemini |
32
+ | [zero-code-examples/pydantic-ai/](./zero-code-examples/pydantic-ai/) | Pydantic AI | OpenAI |
32
33
 
33
34
  This approach works with any framework that has OTel instrumentation: LangChain, Strands, Google ADK, etc. If your framework already emits OTel spans, you only need to add `OTLPSpanExporter` (and `OTLPLogExporter` if it uses GenAI log-based content delivery).
34
35
 
@@ -103,6 +104,7 @@ Detection checks for `gen_ai.request.model` / `gen_ai.input.messages` (GenAI sem
103
104
  | [zero-code-examples/ollama/](./zero-code-examples/ollama/) | LangChain | Ollama | GenAI semconv (logs) | Standard OTLP export |
104
105
  | [zero-code-examples/strands/](./zero-code-examples/strands/) | Strands | OpenAI | GenAI semconv (events*) | Standard OTLP export |
105
106
  | [zero-code-examples/adk/](./zero-code-examples/adk/) | Google ADK | Gemini | ADK built-in | Standard OTLP export |
107
+ | [zero-code-examples/pydantic-ai/](./zero-code-examples/pydantic-ai/) | Pydantic AI | OpenAI | GenAI semconv (span attrs) | Standard OTLP export |
106
108
  | [langchain_agent](./langchain_agent/) | LangChain | OpenAI | GenAI semconv (logs) | SDK WebSocket |
107
109
  | [strands_agent](./strands_agent/) | Strands | OpenAI | GenAI semconv (events*) | SDK WebSocket |
108
110
  | [dice_agent](./dice_agent/) | Google ADK | Gemini | ADK built-in | SDK WebSocket |
@@ -217,6 +219,7 @@ python examples/zero-code-examples/langchain/run.py
217
219
  python examples/zero-code-examples/ollama/run.py
218
220
  python examples/zero-code-examples/strands/run.py
219
221
  python examples/zero-code-examples/adk/run.py
222
+ python examples/zero-code-examples/pydantic-ai/run.py
220
223
 
221
224
  # SDK examples:
222
225
  python examples/sdk_example/context_manager_example.py
@@ -232,7 +235,7 @@ python examples/strands_agent/main.py
232
235
  Traces stream to the dev server in real-time. Evaluation runs automatically when the session completes.
233
236
 
234
237
  See each example's README for prerequisites and detailed instructions:
235
- - [zero-code-examples/](./zero-code-examples/) (LangChain + Strands, standard OTLP)
238
+ - [zero-code-examples/](./zero-code-examples/) (LangChain, Strands, ADK, OpenAI Agents, Pydantic AI — standard OTLP)
236
239
  - [dice_agent/README.md](./dice_agent/README.md) (Google ADK + Gemini)
237
240
  - [langchain_agent/README.md](./langchain_agent/README.md) (LangChain + OpenAI, SDK)
238
241
  - [strands_agent/](./strands_agent/) (Strands + OpenAI, SDK)
@@ -0,0 +1,5 @@
1
+ pydantic-ai>=1.81.0
2
+
3
+ opentelemetry-sdk>=1.36.0
4
+ opentelemetry-exporter-otlp-proto-http>=1.36.0
5
+ python-dotenv>=1.0.0
@@ -0,0 +1,105 @@
1
+ """Run a dice-rolling Pydantic AI agent with OTLP export — no agentevals SDK.
2
+
3
+ Demonstrates zero-code integration: any OTel-instrumented agent streams
4
+ traces to agentevals by pointing the OTLP exporter at the receiver.
5
+
6
+ Pydantic AI has built-in OTel support via Agent.instrument_all(). By default
7
+ it uses version 2 of the GenAI semconv format, storing message content in span
8
+ attributes — only a TracerProvider is needed.
9
+ No separate instrumentation library is needed.
10
+
11
+ Prerequisites:
12
+ 1. pip install -r requirements.txt
13
+ 2. agentevals serve --dev
14
+ 3. export OPENAI_API_KEY="your-key-here"
15
+
16
+ Usage:
17
+ python examples/zero-code-examples/pydantic-ai/run.py
18
+ """
19
+
20
+ import os
21
+ import random
22
+
23
+ from dotenv import load_dotenv
24
+ from opentelemetry import trace
25
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
26
+ from opentelemetry.sdk.resources import Resource
27
+ from opentelemetry.sdk.trace import TracerProvider
28
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
29
+ from pydantic_ai import Agent
30
+
31
+ load_dotenv(override=True)
32
+
33
+
34
def roll_die(sides: int) -> int:
    """Simulate one roll of a fair die with *sides* faces.

    Returns a uniformly distributed integer from 1 to ``sides`` inclusive.
    """
    # randrange's upper bound is exclusive, so this covers 1..sides inclusive,
    # exactly matching randint(1, sides).
    return random.randrange(1, sides + 1)
37
+
38
+
39
def check_prime(number: int) -> bool:
    """Return True if *number* is prime, False otherwise.

    Uses trial division: any composite number must have a divisor no
    larger than its square root, so candidates beyond that are skipped.
    """
    if number < 2:
        # 0, 1, and negatives are not prime by definition.
        return False
    limit = int(number**0.5) + 1
    return all(number % candidate != 0 for candidate in range(2, limit))
47
+
48
+
49
def main():
    """Run the demo agent and stream its traces to the agentevals OTLP receiver."""
    # Fail fast with a readable hint instead of erroring deep inside the
    # first model call.
    if not os.getenv("OPENAI_API_KEY"):
        print("OPENAI_API_KEY not set.")
        return

    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
    print(f"OTLP endpoint: {endpoint}")

    # Tag all exported spans so agentevals groups them into a named eval set
    # and session. setdefault preserves any value the user already exported.
    # NOTE: this must run before Resource.create(), which reads the env var.
    os.environ.setdefault(
        "OTEL_RESOURCE_ATTRIBUTES",
        "agentevals.eval_set_id=pydantic_ai_eval,agentevals.session_name=pydantic-ai-zero-code",
    )

    resource = Resource.create()

    # schedule_delay_millis=1000 flushes span batches quickly so traces show
    # up in the UI near real-time instead of after the default delay.
    tracer_provider = TracerProvider(resource=resource)
    tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(), schedule_delay_millis=1000))
    trace.set_tracer_provider(tracer_provider)

    # Enable Pydantic AI's built-in OTel instrumentation. This one call
    # wires up all agents globally — no framework-specific instrumentor
    # library (like opentelemetry-instrumentation-openai-v2) is needed.
    Agent.instrument_all()

    agent = Agent(
        "openai:gpt-4o-mini",
        instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.",
    )
    # tool_plain registers plain functions as tools (no run-context argument).
    agent.tool_plain(roll_die)
    agent.tool_plain(check_prime)

    # The third query only makes sense with conversation memory, exercising
    # the multi-turn message_history path below.
    test_queries = [
        "Hi! Can you help me?",
        "Roll a 20-sided die for me",
        "Is the number you rolled prime?",
    ]

    message_history = []

    try:
        for i, query in enumerate(test_queries, 1):
            print(f"\n[{i}/{len(test_queries)}] User: {query}")

            result = agent.run_sync(query, message_history=message_history)

            print(f" Agent: {result.output}")

            # Pass the full message history forward for multi-turn conversation.
            message_history = result.all_messages()
    finally:
        # Flush even on failure so any spans already recorded still reach
        # the receiver before the process exits.
        print()
        tracer_provider.force_flush()
        print("All traces flushed to OTLP receiver.")
102
+
103
+
104
+ if __name__ == "__main__":
105
+ main()
@@ -1,25 +1,5 @@
1
1
  {
2
2
  "nodes": {
3
- "devshell": {
4
- "inputs": {
5
- "nixpkgs": [
6
- "nixpkgs"
7
- ]
8
- },
9
- "locked": {
10
- "lastModified": 1768818222,
11
- "narHash": "sha256-460jc0+CZfyaO8+w8JNtlClB2n4ui1RbHfPTLkpwhU8=",
12
- "owner": "numtide",
13
- "repo": "devshell",
14
- "rev": "255a2b1725a20d060f566e4755dbf571bbbb5f76",
15
- "type": "github"
16
- },
17
- "original": {
18
- "owner": "numtide",
19
- "repo": "devshell",
20
- "type": "github"
21
- }
22
- },
23
3
  "flake-utils": {
24
4
  "inputs": {
25
5
  "systems": "systems"
@@ -102,7 +82,6 @@
102
82
  },
103
83
  "root": {
104
84
  "inputs": {
105
- "devshell": "devshell",
106
85
  "flake-utils": "flake-utils",
107
86
  "nixpkgs": "nixpkgs",
108
87
  "pyproject-build-systems": "pyproject-build-systems",