agentevals-cli 0.9.0__tar.gz → 0.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.github/workflows/ci.yml +4 -0
  2. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.github/workflows/publish-evaluator-sdk.yml +3 -1
  3. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.github/workflows/release.yml +2 -6
  4. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/Makefile +1 -1
  5. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/PKG-INFO +1 -1
  6. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/custom-evaluators.md +20 -0
  7. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_evaluators/eval_config.yaml +1 -0
  8. agentevals_cli-0.9.2/examples/custom_evaluators/eval_config_openai_eval.yaml +18 -0
  9. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/pyproject.toml +5 -2
  10. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/__init__.py +1 -1
  11. agentevals_cli-0.9.0/src/agentevals/_static/assets/index-f8LUVQc3.js → agentevals_cli-0.9.2/src/agentevals/_static/assets/index-RIquRPno.js +1 -1
  12. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/_static/index.html +1 -1
  13. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/config.py +15 -7
  14. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/converter.py +19 -15
  15. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/extraction.py +38 -8
  16. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/openai_eval_backend.py +40 -19
  17. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_cli.py +18 -0
  18. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_converter.py +131 -0
  19. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_extraction.py +50 -0
  20. agentevals_cli-0.9.2/tests/test_openai_eval_backend.py +116 -0
  21. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/package-lock.json +10 -10
  22. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/sidebar/Sidebar.tsx +1 -1
  23. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/uv.lock +1 -2
  24. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.claude/skills/eval/SKILL.md +0 -0
  25. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.claude/skills/eval/evals/evals.json +0 -0
  26. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.claude/skills/inspect/SKILL.md +0 -0
  27. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.claude/skills/inspect/evals/evals.json +0 -0
  28. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.dockerignore +0 -0
  29. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  30. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  31. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  32. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.gitignore +0 -0
  33. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/.mcp.json +0 -0
  34. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/CONTRIBUTING.md +0 -0
  35. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/DEVELOPMENT.md +0 -0
  36. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/Dockerfile +0 -0
  37. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/LICENSE +0 -0
  38. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/README.md +0 -0
  39. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/Chart.yaml +0 -0
  40. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/NOTES.txt +0 -0
  41. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/_helpers.tpl +0 -0
  42. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/deployment.yaml +0 -0
  43. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  44. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/postgresql.yaml +0 -0
  45. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/service.yaml +0 -0
  46. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  47. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/charts/agentevals/values.yaml +0 -0
  48. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/assets/logo-color-on-transparent.svg +0 -0
  49. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/assets/logo-color.png +0 -0
  50. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/assets/logo-dark-on-transparent.svg +0 -0
  51. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/eval-set-format.md +0 -0
  52. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/otel-compatibility.md +0 -0
  53. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/docs/streaming.md +0 -0
  54. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/README.md +0 -0
  55. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_evaluators/response_quality.py +0 -0
  56. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_evaluators/tool_call_checker.py +0 -0
  57. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_sink/README.md +0 -0
  58. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  59. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  60. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/custom_sink/pyproject.toml +0 -0
  61. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/dice_agent/README.md +0 -0
  62. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/dice_agent/agent.py +0 -0
  63. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/dice_agent/eval_set.json +0 -0
  64. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/dice_agent/main.py +0 -0
  65. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/dice_agent/test_streaming.py +0 -0
  66. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/kubernetes/README.md +0 -0
  67. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/langchain_agent/README.md +0 -0
  68. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/langchain_agent/agent.py +0 -0
  69. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/langchain_agent/eval_set.json +0 -0
  70. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/langchain_agent/main.py +0 -0
  71. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/langchain_agent/requirements.txt +0 -0
  72. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/langchain_agent/test_streaming.py +0 -0
  73. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/sdk_example/async_example.py +0 -0
  74. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/sdk_example/context_manager_example.py +0 -0
  75. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/sdk_example/decorator_example.py +0 -0
  76. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/sdk_example/requirements.txt +0 -0
  77. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/strands_agent/agent.py +0 -0
  78. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/strands_agent/eval_set.json +0 -0
  79. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/strands_agent/main.py +0 -0
  80. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/strands_agent/requirements.txt +0 -0
  81. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/adk/requirements.txt +0 -0
  82. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/adk/run.py +0 -0
  83. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  84. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/langchain/run.py +0 -0
  85. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  86. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/ollama/run.py +0 -0
  87. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  88. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/openai-agents/run.py +0 -0
  89. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  90. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  91. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/strands/requirements.txt +0 -0
  92. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/examples/zero-code-examples/strands/run.py +0 -0
  93. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/flake.lock +0 -0
  94. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/flake.nix +0 -0
  95. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/packages/evaluator-sdk-py/README.md +0 -0
  96. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  97. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  98. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  99. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  100. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/eval_set_helm.json +0 -0
  101. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/evalset_helm_3_2026-02-23.json +0 -0
  102. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/evalset_k8s_2026-02-20.json +0 -0
  103. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/helm.json +0 -0
  104. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/helm_2.json +0 -0
  105. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/helm_3.json +0 -0
  106. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/k8s.json +0 -0
  107. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/samples/tempo_export_with_batches.json +0 -0
  108. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/_protocol.py +0 -0
  109. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  110. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/_static/logo.svg +0 -0
  111. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/_static/vite.svg +0 -0
  112. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/__init__.py +0 -0
  113. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/app.py +0 -0
  114. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/debug_routes.py +0 -0
  115. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/dependencies.py +0 -0
  116. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/models.py +0 -0
  117. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/otlp_app.py +0 -0
  118. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/otlp_grpc.py +0 -0
  119. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/otlp_processing.py +0 -0
  120. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/otlp_routes.py +0 -0
  121. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/routes.py +0 -0
  122. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/runs_routes.py +0 -0
  123. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/api/streaming_routes.py +0 -0
  124. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/builtin_metrics.py +0 -0
  125. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/cli.py +0 -0
  126. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/custom_evaluators.py +0 -0
  127. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/eval_config_loader.py +0 -0
  128. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/evaluator/__init__.py +0 -0
  129. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/evaluator/resolver.py +0 -0
  130. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/evaluator/sources.py +0 -0
  131. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/evaluator/templates.py +0 -0
  132. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/evaluator/venv.py +0 -0
  133. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/genai_converter.py +0 -0
  134. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/loader/__init__.py +0 -0
  135. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/loader/auto.py +0 -0
  136. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/loader/base.py +0 -0
  137. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/loader/jaeger.py +0 -0
  138. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/loader/otlp.py +0 -0
  139. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/mcp_server.py +0 -0
  140. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/output.py +0 -0
  141. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/run/__init__.py +0 -0
  142. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/run/fetcher.py +0 -0
  143. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/run/result_builder.py +0 -0
  144. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/run/service.py +0 -0
  145. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/run/sinks.py +0 -0
  146. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/run/worker.py +0 -0
  147. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/runner.py +0 -0
  148. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/sdk.py +0 -0
  149. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/__init__.py +0 -0
  150. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/config.py +0 -0
  151. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/models.py +0 -0
  152. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/postgres/__init__.py +0 -0
  153. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  154. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  155. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/postgres/migrator.py +0 -0
  156. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/postgres/pool.py +0 -0
  157. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/repos/__init__.py +0 -0
  158. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/repos/memory.py +0 -0
  159. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/storage/repos/postgres.py +0 -0
  160. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/streaming/__init__.py +0 -0
  161. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/streaming/incremental_processor.py +0 -0
  162. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/streaming/processor.py +0 -0
  163. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/streaming/session.py +0 -0
  164. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/streaming/ws_server.py +0 -0
  165. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/trace_attrs.py +0 -0
  166. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/trace_metrics.py +0 -0
  167. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/utils/__init__.py +0 -0
  168. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/utils/genai_messages.py +0 -0
  169. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/utils/log_buffer.py +0 -0
  170. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/src/agentevals/utils/log_enrichment.py +0 -0
  171. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/api/__init__.py +0 -0
  172. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/api/test_evaluate_persistence.py +0 -0
  173. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/api/test_runs_routes.py +0 -0
  174. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/__init__.py +0 -0
  175. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/conftest.py +0 -0
  176. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/test_evaluation_pipeline.py +0 -0
  177. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/test_live_agents.py +0 -0
  178. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  179. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/test_session_grouping.py +0 -0
  180. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/integration/test_timing_stress.py +0 -0
  181. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/run/__init__.py +0 -0
  182. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/run/test_fetcher.py +0 -0
  183. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/run/test_result_builder.py +0 -0
  184. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/run/test_service.py +0 -0
  185. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/run/test_sinks.py +0 -0
  186. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/storage/__init__.py +0 -0
  187. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/storage/test_config.py +0 -0
  188. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/storage/test_memory_repos.py +0 -0
  189. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/storage/test_migrator.py +0 -0
  190. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/storage/test_models.py +0 -0
  191. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_api.py +0 -0
  192. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_eval_config_loader.py +0 -0
  193. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_genai_converter.py +0 -0
  194. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_jaeger_loader.py +0 -0
  195. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_loader_auto.py +0 -0
  196. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_log_enrichment.py +0 -0
  197. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_mcp_server.py +0 -0
  198. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_otlp_loader.py +0 -0
  199. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_otlp_receiver.py +0 -0
  200. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_output.py +0 -0
  201. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_protocol.py +0 -0
  202. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_runner.py +0 -0
  203. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_sdk.py +0 -0
  204. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/tests/test_trace_metrics.py +0 -0
  205. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/.gitignore +0 -0
  206. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/README.md +0 -0
  207. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/eslint.config.js +0 -0
  208. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/index.html +0 -0
  209. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/package.json +0 -0
  210. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/public/logo.svg +0 -0
  211. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/public/vite.svg +0 -0
  212. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/App.css +0 -0
  213. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/App.tsx +0 -0
  214. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/api/client.ts +0 -0
  215. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/assets/react.svg +0 -0
  216. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  217. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  218. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  219. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  220. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  221. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/BuilderView.tsx +0 -0
  222. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  223. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  224. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  225. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/JsonPreview.tsx +0 -0
  226. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  227. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  228. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/builder/index.ts +0 -0
  229. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  230. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  231. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  232. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  233. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  234. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  235. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  236. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  237. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/DataSection.tsx +0 -0
  238. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  239. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  240. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/InspectorView.tsx +0 -0
  241. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  242. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  243. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  244. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  245. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  246. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  247. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  248. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  249. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  250. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  251. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/streaming/SessionCard.tsx +0 -0
  252. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  253. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  254. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/upload/FileDropZone.tsx +0 -0
  255. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/upload/MetricSelector.tsx +0 -0
  256. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  257. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  258. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/upload/UploadView.tsx +0 -0
  259. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  260. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/config.ts +0 -0
  261. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/context/TraceContext.tsx +0 -0
  262. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/context/TraceProvider.tsx +0 -0
  263. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/index.css +0 -0
  264. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/console-capture.ts +0 -0
  265. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/eval-config.ts +0 -0
  266. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/evalset-builder.ts +0 -0
  267. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/network-capture.ts +0 -0
  268. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/trace-helpers.ts +0 -0
  269. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/trace-loader.ts +0 -0
  270. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/trace-metadata.ts +0 -0
  271. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/trace-patcher.ts +0 -0
  272. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/types.ts +0 -0
  273. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/lib/utils.ts +0 -0
  274. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/src/main.tsx +0 -0
  275. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/tsconfig.app.json +0 -0
  276. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/tsconfig.json +0 -0
  277. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/tsconfig.node.json +0 -0
  278. {agentevals_cli-0.9.0 → agentevals_cli-0.9.2}/ui/vite.config.ts +0 -0
@@ -22,6 +22,8 @@ jobs:
22
22
  runs-on: ubuntu-latest
23
23
  steps:
24
24
  - uses: actions/checkout@v6
25
+ with:
26
+ fetch-depth: 0
25
27
 
26
28
  - uses: astral-sh/setup-uv@v7
27
29
  with:
@@ -46,6 +48,8 @@ jobs:
46
48
  python-version: ["3.11", "3.12", "3.13"]
47
49
  steps:
48
50
  - uses: actions/checkout@v6
51
+ with:
52
+ fetch-depth: 0
49
53
 
50
54
  - uses: astral-sh/setup-uv@v7
51
55
  with:
@@ -18,7 +18,9 @@ jobs:
18
18
  runs-on: ubuntu-latest
19
19
  steps:
20
20
  - name: 'Checkout GitHub Action'
21
- uses: actions/checkout@main
21
+ uses: actions/checkout@v6
22
+ with:
23
+ fetch-depth: 0
22
24
 
23
25
  - name: Install uv
24
26
  uses: astral-sh/setup-uv@v6
@@ -20,6 +20,7 @@ jobs:
20
20
  - uses: actions/checkout@v6
21
21
  with:
22
22
  ref: ${{ github.event.inputs.tag || github.ref_name }}
23
+ fetch-depth: 0
23
24
 
24
25
  - uses: astral-sh/setup-uv@v7
25
26
  with:
@@ -31,9 +32,6 @@ jobs:
31
32
  cache: npm
32
33
  cache-dependency-path: ui/package-lock.json
33
34
 
34
- - name: Set version from tag
35
- run: uv version "${{ github.event.inputs.tag || github.ref_name }}" --package agentevals-cli
36
-
37
35
  - name: Build core and bundled wheels
38
36
  run: make release
39
37
 
@@ -51,6 +49,7 @@ jobs:
51
49
  - uses: actions/checkout@v6
52
50
  with:
53
51
  ref: ${{ github.event.inputs.tag || github.ref_name }}
52
+ fetch-depth: 0
54
53
 
55
54
  - uses: astral-sh/setup-uv@v7
56
55
  with:
@@ -65,11 +64,8 @@ jobs:
65
64
  # Same bundle as `make release` / `build-bundle`: wheel must include ui/dist in src/agentevals/_static
66
65
  # (see [tool.hatch.build] artifacts in pyproject.toml).
67
66
  - name: Release Python package (wheel + sdist with bundled UI)
68
- env:
69
- VERSION: ${{ github.event.inputs.tag || github.ref_name }}
70
67
  run: |
71
68
  uv sync --package agentevals-cli --all-extras
72
- uv version "$VERSION" --package agentevals-cli
73
69
 
74
70
  make build-ui
75
71
  rm -rf src/agentevals/_static
@@ -1,4 +1,4 @@
1
- VERSION := $(shell grep '^version' pyproject.toml | cut -d'"' -f2)
1
+ VERSION := $(shell uv run --with hatch hatch version 2>/dev/null)
2
2
  WHEEL := dist/agentevals_cli-$(VERSION)-py3-none-any.whl
3
3
 
4
4
  DOCKER_REGISTRY ?= soloio
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.9.0
3
+ Version: 0.9.2
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -317,6 +317,26 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317
317
  | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318
318
  | `rouge_l` | Longest common subsequence overlap (F-measure) |
319
319
 
320
+ ### Label Model Grader
321
+
322
+ Scores responses without a golden set. The model reads each response and assigns a label from a fixed list. Passing labels are defined in the config.
323
+
324
+ ```yaml
325
+ evaluators:
326
+ - name: quality_check
327
+ type: openai_eval
328
+ grader:
329
+ type: label_model
330
+ model: gpt-4o-mini
331
+ input:
332
+ - role: user
333
+ content: "Rate this response: {{ item.actual_response }}"
334
+ labels: [good, bad]
335
+ passing_labels: [good]
336
+ ```
337
+
338
+ The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`.
339
+
320
340
  ### How it works
321
341
 
322
342
  Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
@@ -32,3 +32,4 @@ evaluators:
32
32
  ref: evaluators/random_evaluator/random_evaluator.py
33
33
  threshold: 0.110
34
34
  executor: local
35
+
@@ -0,0 +1,18 @@
1
+ # Eval config using OpenAI Evals API graders.
2
+ # Requires OPENAI_API_KEY to be set.
3
+ #
4
+ # Run with:
5
+ # agentevals run samples/helm.json \
6
+ # --config examples/custom_evaluators/eval_config_openai_eval.yaml
7
+
8
+ evaluators:
9
+ - name: quality_check
10
+ type: openai_eval
11
+ grader:
12
+ type: label_model
13
+ model: gpt-4o-mini
14
+ input:
15
+ - role: user
16
+ content: "Rate this response: {{ item.actual_response }}"
17
+ labels: [good, bad]
18
+ passing_labels: [good]
@@ -1,10 +1,10 @@
1
1
  [build-system]
2
- requires = ["hatchling"]
2
+ requires = ["hatchling", "hatch-vcs"]
3
3
  build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.9.0"
7
+ dynamic = ["version"]
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -40,6 +40,9 @@ postgres = [
40
40
  [project.scripts]
41
41
  agentevals = "agentevals.cli:main"
42
42
 
43
+ [tool.hatch.version]
44
+ source = "vcs"
45
+
43
46
  [tool.hatch.build]
44
47
  artifacts = ["src/agentevals/_static/**"]
45
48
 
@@ -3,7 +3,7 @@
3
3
  from importlib.metadata import PackageNotFoundError, version
4
4
 
5
5
  try:
6
- __version__ = version("agentevals")
6
+ __version__ = version("agentevals-cli")
7
7
  except PackageNotFoundError:
8
8
  __version__ = "0.0.0-dev"
9
9