agentevals-cli 0.8.4__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/DEVELOPMENT.md +1 -1
  2. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/PKG-INFO +2 -2
  3. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/README.md +1 -1
  4. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/custom-evaluators.md +20 -0
  5. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/eval-set-format.md +3 -3
  6. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_evaluators/eval_config.yaml +1 -0
  7. agentevals_cli-0.9.1/examples/custom_evaluators/eval_config_openai_eval.yaml +18 -0
  8. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/dice_agent/README.md +1 -1
  9. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/kubernetes/README.md +2 -2
  10. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/pyproject.toml +1 -1
  11. agentevals_cli-0.8.4/src/agentevals/_static/assets/index-Cl6S2lcn.js → agentevals_cli-0.9.1/src/agentevals/_static/assets/index-f8LUVQc3.js +63 -63
  12. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/_static/index.html +1 -1
  13. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/routes.py +28 -87
  14. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/runs_routes.py +2 -0
  15. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/streaming_routes.py +7 -9
  16. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/cli.py +42 -70
  17. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/config.py +100 -44
  18. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/converter.py +19 -15
  19. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/custom_evaluators.py +12 -1
  20. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/eval_config_loader.py +34 -59
  21. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/extraction.py +38 -8
  22. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/mcp_server.py +55 -22
  23. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/openai_eval_backend.py +40 -19
  24. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/run/result_builder.py +3 -3
  25. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/run/service.py +10 -0
  26. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/runner.py +5 -31
  27. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/api/test_evaluate_persistence.py +6 -6
  28. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/api/test_runs_routes.py +9 -0
  29. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/run/test_result_builder.py +6 -8
  30. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/run/test_service.py +2 -2
  31. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_api.py +29 -10
  32. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_cli.py +78 -0
  33. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_converter.py +131 -0
  34. agentevals_cli-0.9.1/tests/test_eval_config_loader.py +76 -0
  35. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_extraction.py +50 -0
  36. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_mcp_server.py +60 -0
  37. agentevals_cli-0.9.1/tests/test_openai_eval_backend.py +116 -0
  38. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_runner.py +48 -13
  39. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/api/client.ts +8 -1
  40. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/bug-report/BugReportModal.tsx +11 -1
  41. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/DashboardView.tsx +1 -1
  42. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/TraceTable.tsx +3 -3
  43. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/ComparisonPanel.tsx +3 -3
  44. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorHeader.tsx +1 -1
  45. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorView.tsx +1 -1
  46. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/MetricResultsSection.tsx +3 -3
  47. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +4 -4
  48. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/upload/MetricSelector.tsx +22 -10
  49. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/upload/UploadView.tsx +5 -5
  50. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/context/TraceContext.tsx +2 -2
  51. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/context/TraceProvider.tsx +17 -14
  52. agentevals_cli-0.9.1/ui/src/lib/eval-config.ts +25 -0
  53. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/types.ts +8 -5
  54. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/uv.lock +1 -1
  55. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.claude/skills/eval/SKILL.md +0 -0
  56. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.claude/skills/eval/evals/evals.json +0 -0
  57. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.claude/skills/inspect/SKILL.md +0 -0
  58. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.claude/skills/inspect/evals/evals.json +0 -0
  59. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.dockerignore +0 -0
  60. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  61. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  62. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  63. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.github/workflows/ci.yml +0 -0
  64. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  65. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.github/workflows/release.yml +0 -0
  66. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.gitignore +0 -0
  67. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/.mcp.json +0 -0
  68. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/CONTRIBUTING.md +0 -0
  69. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/Dockerfile +0 -0
  70. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/LICENSE +0 -0
  71. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/Makefile +0 -0
  72. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/Chart.yaml +0 -0
  73. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/NOTES.txt +0 -0
  74. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/_helpers.tpl +0 -0
  75. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/deployment.yaml +0 -0
  76. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  77. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/postgresql.yaml +0 -0
  78. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/service.yaml +0 -0
  79. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  80. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/charts/agentevals/values.yaml +0 -0
  81. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/assets/logo-color-on-transparent.svg +0 -0
  82. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/assets/logo-color.png +0 -0
  83. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/assets/logo-dark-on-transparent.svg +0 -0
  84. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/otel-compatibility.md +0 -0
  85. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/docs/streaming.md +0 -0
  86. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/README.md +0 -0
  87. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_evaluators/response_quality.py +0 -0
  88. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
  89. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_sink/README.md +0 -0
  90. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  91. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  92. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/custom_sink/pyproject.toml +0 -0
  93. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/dice_agent/agent.py +0 -0
  94. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/dice_agent/eval_set.json +0 -0
  95. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/dice_agent/main.py +0 -0
  96. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/dice_agent/test_streaming.py +0 -0
  97. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/langchain_agent/README.md +0 -0
  98. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/langchain_agent/agent.py +0 -0
  99. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/langchain_agent/eval_set.json +0 -0
  100. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/langchain_agent/main.py +0 -0
  101. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/langchain_agent/requirements.txt +0 -0
  102. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/langchain_agent/test_streaming.py +0 -0
  103. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/sdk_example/async_example.py +0 -0
  104. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/sdk_example/context_manager_example.py +0 -0
  105. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/sdk_example/decorator_example.py +0 -0
  106. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/sdk_example/requirements.txt +0 -0
  107. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/strands_agent/agent.py +0 -0
  108. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/strands_agent/eval_set.json +0 -0
  109. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/strands_agent/main.py +0 -0
  110. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/strands_agent/requirements.txt +0 -0
  111. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
  112. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/adk/run.py +0 -0
  113. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  114. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/langchain/run.py +0 -0
  115. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  116. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/ollama/run.py +0 -0
  117. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  118. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/openai-agents/run.py +0 -0
  119. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  120. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  121. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
  122. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/examples/zero-code-examples/strands/run.py +0 -0
  123. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/flake.lock +0 -0
  124. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/flake.nix +0 -0
  125. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/README.md +0 -0
  126. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  127. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  128. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  129. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  130. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/eval_set_helm.json +0 -0
  131. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
  132. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/evalset_k8s_2026-02-20.json +0 -0
  133. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/helm.json +0 -0
  134. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/helm_2.json +0 -0
  135. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/helm_3.json +0 -0
  136. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/k8s.json +0 -0
  137. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/samples/tempo_export_with_batches.json +0 -0
  138. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/__init__.py +0 -0
  139. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/_protocol.py +0 -0
  140. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  141. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/_static/logo.svg +0 -0
  142. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/_static/vite.svg +0 -0
  143. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/__init__.py +0 -0
  144. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/app.py +0 -0
  145. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/debug_routes.py +0 -0
  146. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/dependencies.py +0 -0
  147. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/models.py +0 -0
  148. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_app.py +0 -0
  149. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_grpc.py +0 -0
  150. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_processing.py +0 -0
  151. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_routes.py +0 -0
  152. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/builtin_metrics.py +0 -0
  153. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/evaluator/__init__.py +0 -0
  154. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/evaluator/resolver.py +0 -0
  155. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/evaluator/sources.py +0 -0
  156. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/evaluator/templates.py +0 -0
  157. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/evaluator/venv.py +0 -0
  158. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/genai_converter.py +0 -0
  159. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/loader/__init__.py +0 -0
  160. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/loader/auto.py +0 -0
  161. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/loader/base.py +0 -0
  162. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/loader/jaeger.py +0 -0
  163. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/loader/otlp.py +0 -0
  164. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/output.py +0 -0
  165. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/run/__init__.py +0 -0
  166. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/run/fetcher.py +0 -0
  167. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/run/sinks.py +0 -0
  168. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/run/worker.py +0 -0
  169. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/sdk.py +0 -0
  170. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/__init__.py +0 -0
  171. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/config.py +0 -0
  172. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/models.py +0 -0
  173. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/__init__.py +0 -0
  174. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  175. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  176. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrator.py +0 -0
  177. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/pool.py +0 -0
  178. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/__init__.py +0 -0
  179. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/memory.py +0 -0
  180. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/postgres.py +0 -0
  181. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/streaming/__init__.py +0 -0
  182. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/streaming/incremental_processor.py +0 -0
  183. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/streaming/processor.py +0 -0
  184. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/streaming/session.py +0 -0
  185. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/streaming/ws_server.py +0 -0
  186. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/trace_attrs.py +0 -0
  187. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/trace_metrics.py +0 -0
  188. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/utils/__init__.py +0 -0
  189. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/utils/genai_messages.py +0 -0
  190. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/utils/log_buffer.py +0 -0
  191. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/src/agentevals/utils/log_enrichment.py +0 -0
  192. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/api/__init__.py +0 -0
  193. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/__init__.py +0 -0
  194. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/conftest.py +0 -0
  195. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/test_evaluation_pipeline.py +0 -0
  196. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/test_live_agents.py +0 -0
  197. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  198. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/test_session_grouping.py +0 -0
  199. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/integration/test_timing_stress.py +0 -0
  200. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/run/__init__.py +0 -0
  201. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/run/test_fetcher.py +0 -0
  202. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/run/test_sinks.py +0 -0
  203. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/storage/__init__.py +0 -0
  204. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/storage/test_config.py +0 -0
  205. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/storage/test_memory_repos.py +0 -0
  206. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/storage/test_migrator.py +0 -0
  207. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/storage/test_models.py +0 -0
  208. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_genai_converter.py +0 -0
  209. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_jaeger_loader.py +0 -0
  210. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_loader_auto.py +0 -0
  211. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_log_enrichment.py +0 -0
  212. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_otlp_loader.py +0 -0
  213. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_otlp_receiver.py +0 -0
  214. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_output.py +0 -0
  215. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_protocol.py +0 -0
  216. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_sdk.py +0 -0
  217. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/tests/test_trace_metrics.py +0 -0
  218. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/.gitignore +0 -0
  219. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/README.md +0 -0
  220. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/eslint.config.js +0 -0
  221. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/index.html +0 -0
  222. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/package-lock.json +0 -0
  223. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/package.json +0 -0
  224. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/public/logo.svg +0 -0
  225. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/public/vite.svg +0 -0
  226. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/App.css +0 -0
  227. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/App.tsx +0 -0
  228. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/assets/react.svg +0 -0
  229. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  230. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  231. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  232. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  233. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/BuilderView.tsx +0 -0
  234. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  235. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  236. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  237. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
  238. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  239. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  240. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/builder/index.ts +0 -0
  241. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  242. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  243. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  244. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  245. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  246. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/DataSection.tsx +0 -0
  247. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  248. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  249. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  250. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  251. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  252. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  253. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  254. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  255. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  256. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  257. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
  258. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  259. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  260. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
  261. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  262. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  263. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  264. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/config.ts +0 -0
  265. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/index.css +0 -0
  266. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/console-capture.ts +0 -0
  267. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/evalset-builder.ts +0 -0
  268. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/network-capture.ts +0 -0
  269. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/trace-helpers.ts +0 -0
  270. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/trace-loader.ts +0 -0
  271. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/trace-metadata.ts +0 -0
  272. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/trace-patcher.ts +0 -0
  273. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/lib/utils.ts +0 -0
  274. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/src/main.tsx +0 -0
  275. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/tsconfig.app.json +0 -0
  276. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/tsconfig.json +0 -0
  277. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/tsconfig.node.json +0 -0
  278. {agentevals_cli-0.8.4 → agentevals_cli-0.9.1}/ui/vite.config.ts +0 -0
@@ -50,7 +50,7 @@ Once running, submit a run with:
50
50
  ```bash
51
51
  curl -X POST http://localhost:8001/api/runs \
52
52
  -H 'content-type: application/json' \
53
- -d '{"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {...}}, "evalConfig": {"metrics": ["tool_trajectory_avg_score"]}}}'
53
+ -d '{"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {...}}, "evalConfig": {"evaluators": [{"name": "tool_trajectory_avg_score", "type": "builtin"}]}}}'
54
54
  ```
55
55
 
56
56
  Then poll `GET /api/runs/{runId}` and `GET /api/runs/{runId}/results`. Without `storage.backend=postgres`, the `/api/runs` endpoints return 503 with a hint pointing at the env var.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.8.4
3
+ Version: 0.9.1
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -278,7 +278,7 @@ See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protoc
278
278
  agentevals serve # bundled UI on http://localhost:8001
279
279
  ```
280
280
 
281
- Upload traces and eval sets, select metrics, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
281
+ Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
282
282
 
283
283
  Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running. The OTLP receiver on port 4318 serves its own docs at `http://localhost:4318/docs`.
284
284
 
@@ -250,7 +250,7 @@ See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protoc
250
250
  agentevals serve # bundled UI on http://localhost:8001
251
251
  ```
252
252
 
253
- Upload traces and eval sets, select metrics, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
253
+ Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
254
254
 
255
255
  Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running. The OTLP receiver on port 4318 serves its own docs at `http://localhost:4318/docs`.
256
256
 
@@ -317,6 +317,26 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317
317
  | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318
318
  | `rouge_l` | Longest common subsequence overlap (F-measure) |
319
319
 
320
+ ### Label Model Grader
321
+
322
+ Scores responses without a golden set. The model reads each response and assigns a label from a fixed list. Passing labels are defined in the config.
323
+
324
+ ```yaml
325
+ evaluators:
326
+   - name: quality_check
327
+     type: openai_eval
328
+     grader:
329
+       type: label_model
330
+       model: gpt-4o-mini
331
+       input:
332
+         - role: user
333
+           content: "Rate this response: {{ item.actual_response }}"
334
+       labels: [good, bad]
335
+       passing_labels: [good]
336
+ ```
337
+
338
+ The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`.
339
+
320
340
  ### How it works
321
341
 
322
342
  Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
@@ -1,6 +1,6 @@
1
1
  # Eval Set Format
2
2
 
3
- An eval set is a JSON file containing golden reference data that metrics compare agent traces against. It follows the [Google ADK `EvalSet`](https://github.com/google/adk-python/blob/main/src/google/adk/evaluation/eval_set.py) schema, which means eval sets are portable between agentevals and ADK tooling.
3
+ An eval set is a JSON file containing golden reference data that evaluators compare agent traces against. It follows the [Google ADK `EvalSet`](https://github.com/google/adk-python/blob/main/src/google/adk/evaluation/eval_set.py) schema, which means eval sets are portable between agentevals and ADK tooling.
4
4
 
5
5
  Most users will not need to author eval sets by hand. The web UI can generate them from live sessions (mark a session as golden, and the server builds the eval set automatically). This document is for users who want to create or edit eval sets directly, whether for CLI usage, CI pipelines, or version-controlled test suites.
6
6
 
@@ -203,9 +203,9 @@ The `parts` array can contain text, function calls, or function responses. Most
203
203
 
204
204
  Each `FunctionCall` has `name`, `args`, and `id`. Each `FunctionResponse` has `name`, `response`, and `id`. Match `id` values between calls and responses to pair them.
205
205
 
206
- ## Which Metrics Use Eval Sets
206
+ ## Which Evaluators Use Eval Sets
207
207
 
208
- Not all metrics require an eval set. Use `agentevals list-metrics` to see which do:
208
+ Not all evaluators require an eval set. Use `agentevals evaluator list --source builtin` to see which built-in evaluators do:
209
209
 
210
210
  | Metric | Needs Eval Set | What It Reads |
211
211
  |---|---|---|
@@ -32,3 +32,4 @@ evaluators:
32
32
  ref: evaluators/random_evaluator/random_evaluator.py
33
33
  threshold: 0.110
34
34
  executor: local
35
+
@@ -0,0 +1,18 @@
1
+ # Eval config using OpenAI Evals API graders.
2
+ # Requires OPENAI_API_KEY to be set.
3
+ #
4
+ # Run with:
5
+ # agentevals run samples/helm.json \
6
+ # --config examples/custom_evaluators/eval_config_openai_eval.yaml
7
+
8
+ evaluators:
9
+   - name: quality_check
10
+     type: openai_eval
11
+     grader:
12
+       type: label_model
13
+       model: gpt-4o-mini
14
+       input:
15
+         - role: user
16
+           content: "Rate this response: {{ item.actual_response }}"
17
+       labels: [good, bad]
18
+       passing_labels: [good]
@@ -149,7 +149,7 @@ Update `main.py` to test the new functionality.
149
149
  **After agent completes:**
150
150
  - Status changes to "EVALUATED"
151
151
  - Evaluation results appear as colored badges
152
- - Each metric shows: name and score (e.g., "tool_trajectory_avg_score: 1.00")
152
+ - Each evaluator result shows: name and score (e.g., "tool_trajectory_avg_score: 1.00")
153
153
 
154
154
  **Multiple runs:**
155
155
  - Each run creates a new session with model name in ID
@@ -221,7 +221,7 @@ This captures the GPT-5 session's tool trajectory and final responses as the gol
221
221
  2. Select both sessions (the `gpt-4.1-mini` session and the `gpt-5` session)
222
222
  3. Click **Evaluate**
223
223
  4. Select the `helm-agent-comparison` eval set
224
- 5. Choose the metrics:
224
+ 5. Choose the evaluators:
225
225
  - **tool_trajectory_avg_score**: Did the agent call the correct tools in the correct order?
226
226
  - **response_match_score**: Did the agent produce responses consistent with the golden reference?
227
227
  6. Run the evaluation
@@ -241,7 +241,7 @@ Compare the two sessions in the results table:
241
241
 
242
242
  <img width="1914" height="1154" alt="image" src="https://github.com/user-attachments/assets/5939a8d4-3775-4cf1-9cf2-d3b6b4afd582" />
243
243
 
244
- You can also click an individual conversation and see a breakdown of each evaluators.
244
+ You can also click an individual conversation and see a breakdown of each evaluator.
245
245
 
246
246
  <img width="1916" height="1348" alt="image" src="https://github.com/user-attachments/assets/984b3d29-8018-4fcb-9036-bb7c6e97d9ff" />
247
247
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.8.4"
7
+ version = "0.9.1"
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"