agentevals-cli 0.5.3__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/PKG-INFO +59 -5
  2. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/README.md +56 -4
  3. agentevals_cli-0.6.0/docs/assets/logo-color-on-transparent.svg +13 -0
  4. agentevals_cli-0.6.0/docs/assets/logo-dark-on-transparent.svg +13 -0
  5. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/docs/custom-evaluators.md +82 -35
  6. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/custom_evaluators/eval_config.yaml +0 -1
  7. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/pyproject.toml +4 -1
  8. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/routes.py +2 -0
  9. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/config.py +41 -1
  10. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/custom_evaluators.py +45 -11
  11. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/eval_config_loader.py +3 -1
  12. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/evaluator/sources.py +23 -3
  13. agentevals_cli-0.6.0/src/agentevals/evaluator/venv.py +119 -0
  14. agentevals_cli-0.6.0/src/agentevals/openai_eval_backend.py +246 -0
  15. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/output.py +21 -4
  16. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/runner.py +6 -0
  17. agentevals_cli-0.6.0/tests/test_output.py +112 -0
  18. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_runner.py +4 -0
  19. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/uv.lock +6 -2
  20. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.claude/skills/eval/SKILL.md +0 -0
  21. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.claude/skills/eval/evals/evals.json +0 -0
  22. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.claude/skills/inspect/SKILL.md +0 -0
  23. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.claude/skills/inspect/evals/evals.json +0 -0
  24. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  25. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  26. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  27. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.github/workflows/ci.yml +0 -0
  28. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  29. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.github/workflows/release.yml +0 -0
  30. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.gitignore +0 -0
  31. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/.mcp.json +0 -0
  32. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/CONTRIBUTING.md +0 -0
  33. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/DEVELOPMENT.md +0 -0
  34. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/LICENSE +0 -0
  35. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/Makefile +0 -0
  36. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/docs/assets/logo-color.png +0 -0
  37. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/docs/eval-set-format.md +0 -0
  38. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/docs/otel-compatibility.md +0 -0
  39. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/docs/streaming.md +0 -0
  40. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/README.md +0 -0
  41. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/custom_evaluators/response_quality.py +0 -0
  42. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/custom_evaluators/tool_call_checker.py +0 -0
  43. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/dice_agent/README.md +0 -0
  44. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/dice_agent/agent.py +0 -0
  45. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/dice_agent/eval_set.json +0 -0
  46. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/dice_agent/main.py +0 -0
  47. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/dice_agent/test_streaming.py +0 -0
  48. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/langchain_agent/README.md +0 -0
  49. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/langchain_agent/agent.py +0 -0
  50. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/langchain_agent/eval_set.json +0 -0
  51. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/langchain_agent/main.py +0 -0
  52. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/langchain_agent/requirements.txt +0 -0
  53. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/langchain_agent/test_streaming.py +0 -0
  54. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/sdk_example/async_example.py +0 -0
  55. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/sdk_example/context_manager_example.py +0 -0
  56. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/sdk_example/decorator_example.py +0 -0
  57. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/sdk_example/requirements.txt +0 -0
  58. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/strands_agent/agent.py +0 -0
  59. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/strands_agent/eval_set.json +0 -0
  60. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/strands_agent/main.py +0 -0
  61. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/strands_agent/requirements.txt +0 -0
  62. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/zero-code-examples/adk/requirements.txt +0 -0
  63. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/zero-code-examples/adk/run.py +0 -0
  64. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  65. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/zero-code-examples/langchain/run.py +0 -0
  66. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/zero-code-examples/strands/requirements.txt +0 -0
  67. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/examples/zero-code-examples/strands/run.py +0 -0
  68. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/flake.lock +0 -0
  69. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/flake.nix +0 -0
  70. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/packages/evaluator-sdk-py/README.md +0 -0
  71. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  72. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  73. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  74. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  75. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/eval_set_helm.json +0 -0
  76. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/evalset_helm_3_2026-02-23.json +0 -0
  77. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/evalset_k8s_2026-02-20.json +0 -0
  78. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/helm.json +0 -0
  79. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/helm_2.json +0 -0
  80. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/helm_3.json +0 -0
  81. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/samples/k8s.json +0 -0
  82. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/__init__.py +0 -0
  83. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/_protocol.py +0 -0
  84. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  85. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/_static/assets/index-Dz2NgC8m.js +0 -0
  86. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/_static/index.html +0 -0
  87. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/_static/logo.svg +0 -0
  88. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/_static/vite.svg +0 -0
  89. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/__init__.py +0 -0
  90. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/app.py +0 -0
  91. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/debug_routes.py +0 -0
  92. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/models.py +0 -0
  93. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/otlp_app.py +0 -0
  94. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/otlp_routes.py +0 -0
  95. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/api/streaming_routes.py +0 -0
  96. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/builtin_metrics.py +0 -0
  97. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/cli.py +0 -0
  98. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/converter.py +0 -0
  99. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/evaluator/__init__.py +0 -0
  100. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/evaluator/resolver.py +0 -0
  101. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/evaluator/templates.py +0 -0
  102. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/extraction.py +0 -0
  103. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/genai_converter.py +0 -0
  104. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/loader/__init__.py +0 -0
  105. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/loader/base.py +0 -0
  106. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/loader/jaeger.py +0 -0
  107. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/loader/otlp.py +0 -0
  108. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/mcp_server.py +0 -0
  109. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/sdk.py +0 -0
  110. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/streaming/__init__.py +0 -0
  111. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/streaming/incremental_processor.py +0 -0
  112. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/streaming/processor.py +0 -0
  113. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/streaming/session.py +0 -0
  114. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/streaming/ws_server.py +0 -0
  115. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/trace_attrs.py +0 -0
  116. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/trace_metrics.py +0 -0
  117. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/utils/__init__.py +0 -0
  118. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/utils/genai_messages.py +0 -0
  119. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/utils/log_buffer.py +0 -0
  120. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/src/agentevals/utils/log_enrichment.py +0 -0
  121. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/integration/__init__.py +0 -0
  122. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/integration/conftest.py +0 -0
  123. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/integration/test_evaluation_pipeline.py +0 -0
  124. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/integration/test_live_agents.py +0 -0
  125. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/integration/test_session_grouping.py +0 -0
  126. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/integration/test_timing_stress.py +0 -0
  127. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_api.py +0 -0
  128. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_converter.py +0 -0
  129. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_extraction.py +0 -0
  130. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_genai_converter.py +0 -0
  131. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_jaeger_loader.py +0 -0
  132. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_log_enrichment.py +0 -0
  133. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_otlp_loader.py +0 -0
  134. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_otlp_receiver.py +0 -0
  135. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_protocol.py +0 -0
  136. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/tests/test_sdk.py +0 -0
  137. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/.gitignore +0 -0
  138. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/README.md +0 -0
  139. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/eslint.config.js +0 -0
  140. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/index.html +0 -0
  141. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/package-lock.json +0 -0
  142. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/package.json +0 -0
  143. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/public/logo.svg +0 -0
  144. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/public/vite.svg +0 -0
  145. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/App.css +0 -0
  146. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/App.tsx +0 -0
  147. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/api/client.ts +0 -0
  148. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/assets/react.svg +0 -0
  149. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  150. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  151. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  152. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  153. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  154. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/BuilderView.tsx +0 -0
  155. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  156. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  157. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  158. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/JsonPreview.tsx +0 -0
  159. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  160. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  161. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/builder/index.ts +0 -0
  162. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  163. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  164. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  165. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  166. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  167. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  168. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  169. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  170. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/DataSection.tsx +0 -0
  171. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  172. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  173. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/InspectorView.tsx +0 -0
  174. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  175. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  176. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  177. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  178. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  179. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  180. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  181. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  182. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  183. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  184. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  185. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/streaming/SessionCard.tsx +0 -0
  186. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  187. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  188. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/upload/FileDropZone.tsx +0 -0
  189. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/upload/MetricSelector.tsx +0 -0
  190. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  191. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  192. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/upload/UploadView.tsx +0 -0
  193. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  194. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/config.ts +0 -0
  195. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/context/TraceContext.tsx +0 -0
  196. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/context/TraceProvider.tsx +0 -0
  197. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/index.css +0 -0
  198. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/console-capture.ts +0 -0
  199. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/evalset-builder.ts +0 -0
  200. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/network-capture.ts +0 -0
  201. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/trace-converter.ts +0 -0
  202. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/trace-loader.ts +0 -0
  203. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/trace-metadata.ts +0 -0
  204. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/trace-patcher.ts +0 -0
  205. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/types.ts +0 -0
  206. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/lib/utils.ts +0 -0
  207. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/src/main.tsx +0 -0
  208. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/tsconfig.app.json +0 -0
  209. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/tsconfig.json +0 -0
  210. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/tsconfig.node.json +0 -0
  211. {agentevals_cli-0.5.3 → agentevals_cli-0.6.0}/ui/vite.config.ts +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.5.3
3
+ Version: 0.6.0
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -17,23 +17,76 @@ Requires-Dist: uvicorn[standard]>=0.32.0
17
17
  Provides-Extra: live
18
18
  Requires-Dist: httpx>=0.27.0; extra == 'live'
19
19
  Requires-Dist: mcp>=1.26.0; extra == 'live'
20
+ Provides-Extra: openai
21
+ Requires-Dist: openai>=2.0; extra == 'openai'
20
22
  Provides-Extra: streaming
21
23
  Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'streaming'
22
24
  Requires-Dist: websockets>=12.0; extra == 'streaming'
23
25
  Description-Content-Type: text/markdown
24
26
 
25
27
  <p align="center">
26
- <img src="docs/assets/logo-color.png" alt="agentevals" width="420" />
28
+ <picture>
29
+ <source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-color-on-transparent.svg">
30
+ <source media="(prefers-color-scheme: light)" srcset="docs/assets/logo-dark-on-transparent.svg">
31
+ <img src="docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
32
+ </picture>
27
33
  </p>
28
34
 
29
- `agentevals` evaluates AI agent behavior from OpenTelemetry traces, without re-running the agent. Record once, score as many times as you want.
35
+ <h1 align="center">Ship Agents Reliably</h1>
30
36
 
31
- Works with any OTel-instrumented framework (LangChain, Strands, Google ADK, and others). Supports Jaeger JSON and OTLP trace formats, built-in and custom evaluators, and LLM-based judges.
37
+ <p align="center">
38
+ Benchmark your agents before they hit production.<br>
39
+ agentevals scores performance and inference quality from OpenTelemetry traces — no re-runs, no guesswork.
40
+ </p>
41
+
42
+ <p align="center">
43
+ <a href="https://github.com/agentevals-dev/agentevals/stargazers"><img src="https://img.shields.io/github/stars/agentevals-dev/agentevals?style=social" alt="GitHub Stars"></a>
44
+ &nbsp;
45
+ <a href="https://discord.gg/cpveEn8Ah2"><img src="https://img.shields.io/discord/1435836734666707190?label=Discord&logo=discord&logoColor=white&color=5865F2" alt="Discord"></a>
46
+ &nbsp;
47
+ <a href="https://github.com/agentevals-dev/agentevals/releases"><img src="https://img.shields.io/github/v/release/agentevals-dev/agentevals?label=Release" alt="Release"></a>
48
+ &nbsp;
49
+ <a href="https://github.com/agentevals-dev/agentevals/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache%202.0-green.svg" alt="License"></a>
50
+ &nbsp;
51
+ <a href="https://pypi.org/project/agentevals-cli/"><img src="https://img.shields.io/pypi/v/agentevals-cli?label=PyPI&color=blue" alt="PyPI"></a>
52
+ </p>
53
+
54
+ <p align="center">
55
+ <a href="#installation">Install</a> · <a href="#quick-start">Quick Start</a> · <a href="https://github.com/agentevals-dev/agentevals/releases">Releases</a> · <a href="CONTRIBUTING.md">Contributing</a> · <a href="https://discord.gg/cpveEn8Ah2">Discord</a>
56
+ </p>
57
+
58
+ ---
59
+
60
+ ## What is agentevals?
61
+
62
+ agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want — no re-runs, no guesswork.
63
+
64
+ It works with any OTel-instrumented framework (LangChain, Strands, Google ADK, and others), supports Jaeger JSON and OTLP trace formats, and ships with built-in evaluators, custom evaluator support, and LLM-based judges.
32
65
 
33
66
  - **CLI** for scripting and CI pipelines
34
67
  - **Web UI** for visual inspection and local developer experience
35
68
  - **MCP server** so MCP clients can run evaluations from a conversation
36
69
 
70
+ ## Why agentevals?
71
+
72
+ Most evaluation tools require you to **re-execute your agent** for every test — burning tokens, time, and money on duplicate LLM calls. agentevals takes a different approach:
73
+
74
+ - **No re-execution** — score agents from existing traces without replaying expensive LLM calls
75
+ - **Framework-agnostic** — works with any agent framework that emits OpenTelemetry spans
76
+ - **Golden eval sets** — compare actual behavior against defined expected behaviors for deterministic pass/fail gating
77
+ - **Custom evaluators** — write scoring logic in Python, JavaScript, or any language
78
+ - **CI/CD ready** — gate deployments on quality thresholds directly in your pipeline
79
+ - **Local-first** — no cloud dependency required; everything runs on your machine
80
+
81
+ ## How It Works
82
+
83
+ agentevals follows three simple steps:
84
+
85
+ 1. **Collect traces** — Instrument your agent with OpenTelemetry (or export traces from your tracing backend). Point the OTLP exporter at the agentevals receiver, or load trace files directly.
86
+ 2. **Define eval sets** — Create golden evaluation sets that describe expected agent behavior: which tools should be called, in what order, and what the output should look like.
87
+ 3. **Run evaluations** — Use the CLI, Web UI, or MCP server to score traces against your eval sets. Get per-metric scores, pass/fail results, and detailed span-level breakdowns.
88
+
89
+
37
90
  > [!IMPORTANT]
38
91
  > This project is under active development. Expect breaking changes.
39
92
 
@@ -64,6 +117,7 @@ Optional extras:
64
117
 
65
118
  ```bash
66
119
  pip install "agentevals-cli[live]" # MCP server support
120
+ pip install "agentevals-cli[openai]" # OpenAI Evals API graders
67
121
  ```
68
122
 
69
123
  **GitHub [releases](../../releases)** also ship **core** wheels (CLI and API only) and **bundle** wheels (with the embedded UI) if you need a specific version or offline `pip install ./path/to.whl`.
@@ -188,7 +242,7 @@ evaluators:
188
242
  agentevals run trace.json --config eval_config.yaml --eval-set eval_set.json
189
243
  ```
190
244
 
191
- Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
245
+ Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. You can also delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) using `type: openai_eval` (requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY`). See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
192
246
 
193
247
  ## Web UI
194
248
 
@@ -1,15 +1,66 @@
1
1
  <p align="center">
2
- <img src="docs/assets/logo-color.png" alt="agentevals" width="420" />
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-color-on-transparent.svg">
4
+ <source media="(prefers-color-scheme: light)" srcset="docs/assets/logo-dark-on-transparent.svg">
5
+ <img src="docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
6
+ </picture>
3
7
  </p>
4
8
 
5
- `agentevals` evaluates AI agent behavior from OpenTelemetry traces, without re-running the agent. Record once, score as many times as you want.
9
+ <h1 align="center">Ship Agents Reliably</h1>
6
10
 
7
- Works with any OTel-instrumented framework (LangChain, Strands, Google ADK, and others). Supports Jaeger JSON and OTLP trace formats, built-in and custom evaluators, and LLM-based judges.
11
+ <p align="center">
12
+ Benchmark your agents before they hit production.<br>
13
+ agentevals scores performance and inference quality from OpenTelemetry traces — no re-runs, no guesswork.
14
+ </p>
15
+
16
+ <p align="center">
17
+ <a href="https://github.com/agentevals-dev/agentevals/stargazers"><img src="https://img.shields.io/github/stars/agentevals-dev/agentevals?style=social" alt="GitHub Stars"></a>
18
+ &nbsp;
19
+ <a href="https://discord.gg/cpveEn8Ah2"><img src="https://img.shields.io/discord/1435836734666707190?label=Discord&logo=discord&logoColor=white&color=5865F2" alt="Discord"></a>
20
+ &nbsp;
21
+ <a href="https://github.com/agentevals-dev/agentevals/releases"><img src="https://img.shields.io/github/v/release/agentevals-dev/agentevals?label=Release" alt="Release"></a>
22
+ &nbsp;
23
+ <a href="https://github.com/agentevals-dev/agentevals/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache%202.0-green.svg" alt="License"></a>
24
+ &nbsp;
25
+ <a href="https://pypi.org/project/agentevals-cli/"><img src="https://img.shields.io/pypi/v/agentevals-cli?label=PyPI&color=blue" alt="PyPI"></a>
26
+ </p>
27
+
28
+ <p align="center">
29
+ <a href="#installation">Install</a> · <a href="#quick-start">Quick Start</a> · <a href="https://github.com/agentevals-dev/agentevals/releases">Releases</a> · <a href="CONTRIBUTING.md">Contributing</a> · <a href="https://discord.gg/cpveEn8Ah2">Discord</a>
30
+ </p>
31
+
32
+ ---
33
+
34
+ ## What is agentevals?
35
+
36
+ agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want — no re-runs, no guesswork.
37
+
38
+ It works with any OTel-instrumented framework (LangChain, Strands, Google ADK, and others), supports Jaeger JSON and OTLP trace formats, and ships with built-in evaluators, custom evaluator support, and LLM-based judges.
8
39
 
9
40
  - **CLI** for scripting and CI pipelines
10
41
  - **Web UI** for visual inspection and local developer experience
11
42
  - **MCP server** so MCP clients can run evaluations from a conversation
12
43
 
44
+ ## Why agentevals?
45
+
46
+ Most evaluation tools require you to **re-execute your agent** for every test — burning tokens, time, and money on duplicate LLM calls. agentevals takes a different approach:
47
+
48
+ - **No re-execution** — score agents from existing traces without replaying expensive LLM calls
49
+ - **Framework-agnostic** — works with any agent framework that emits OpenTelemetry spans
50
+ - **Golden eval sets** — compare actual behavior against defined expected behaviors for deterministic pass/fail gating
51
+ - **Custom evaluators** — write scoring logic in Python, JavaScript, or any language
52
+ - **CI/CD ready** — gate deployments on quality thresholds directly in your pipeline
53
+ - **Local-first** — no cloud dependency required; everything runs on your machine
54
+
55
+ ## How It Works
56
+
57
+ agentevals follows three simple steps:
58
+
59
+ 1. **Collect traces** — Instrument your agent with OpenTelemetry (or export traces from your tracing backend). Point the OTLP exporter at the agentevals receiver, or load trace files directly.
60
+ 2. **Define eval sets** — Create golden evaluation sets that describe expected agent behavior: which tools should be called, in what order, and what the output should look like.
61
+ 3. **Run evaluations** — Use the CLI, Web UI, or MCP server to score traces against your eval sets. Get per-metric scores, pass/fail results, and detailed span-level breakdowns.
62
+
63
+
13
64
  > [!IMPORTANT]
14
65
  > This project is under active development. Expect breaking changes.
15
66
 
@@ -40,6 +91,7 @@ Optional extras:
40
91
 
41
92
  ```bash
42
93
  pip install "agentevals-cli[live]" # MCP server support
94
+ pip install "agentevals-cli[openai]" # OpenAI Evals API graders
43
95
  ```
44
96
 
45
97
  **GitHub [releases](../../releases)** also ship **core** wheels (CLI and API only) and **bundle** wheels (with the embedded UI) if you need a specific version or offline `pip install ./path/to.whl`.
@@ -164,7 +216,7 @@ evaluators:
164
216
  agentevals run trace.json --config eval_config.yaml --eval-set eval_set.json
165
217
  ```
166
218
 
167
- Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
219
+ Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. You can also delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) using `type: openai_eval` (requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY`). See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
168
220
 
169
221
  ## Web UI
170
222
 
@@ -0,0 +1,13 @@
1
+ <svg width="3302" height="1066" viewBox="0 0 3302 1066" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <path d="M518.695 264C560.958 264 595.207 298.274 595.207 340.537C595.207 382.8 560.958 417.048 518.695 417.048C454.983 417.048 403.305 468.548 403 532.184V533.304C403.306 596.94 454.983 648.438 518.695 648.438H518.722C560.985 648.439 595.232 682.687 595.232 724.95C595.232 767.213 560.984 801.461 518.722 801.461C476.459 801.461 442.21 767.213 442.21 724.95V724.67C442.057 661.008 390.482 609.408 326.795 609.255H326.515C284.252 609.255 250.004 575.006 250.004 532.743C250.004 490.48 284.252 456.232 326.515 456.232H326.642C390.431 456.156 442.108 404.453 442.185 340.664V340.512C442.185 298.249 476.432 264 518.695 264ZM492.436 469.353C527.452 454.848 567.596 471.476 582.101 506.492C596.605 541.508 579.976 581.653 544.96 596.157C509.944 610.661 469.8 594.033 455.296 559.017C440.792 524.001 457.42 483.857 492.436 469.353Z" fill="#8023C3"/>
3
+ <path d="M1029.16 401.476V655.084H982.736L976.321 616.93C956.253 644.357 928.75 658.054 893.878 658.054C870.849 658.054 850.254 652.839 832.16 642.443C814.066 632.046 799.887 617.029 789.688 597.359C779.49 577.721 774.391 554.683 774.391 528.247C774.391 501.81 779.589 479.796 789.952 460.125C800.315 440.487 814.56 425.305 832.654 414.546C850.748 403.819 871.178 398.439 893.878 398.439C912.301 398.439 928.454 401.839 942.271 408.605C956.089 415.371 967.274 424.711 975.861 436.593V401.41H1029.19L1029.16 401.476ZM902.76 612.97C924.802 612.97 942.567 605.214 956.089 589.701C969.577 574.189 976.321 554.023 976.321 529.27C976.321 504.516 969.577 483.294 956.089 467.617C942.6 451.94 924.802 444.085 902.76 444.085C880.718 444.085 862.92 451.94 849.432 467.617C835.944 483.294 829.199 503.526 829.199 528.28C829.199 553.033 835.944 573.76 849.432 589.47C862.92 605.148 880.685 613.003 902.76 613.003V612.97Z" fill="white"/>
4
+ <path d="M2723.45 401.476V655.084H2677.03L2670.61 616.93C2650.54 644.357 2623.04 658.054 2588.17 658.054C2565.14 658.054 2544.54 652.839 2526.45 642.443C2508.36 632.046 2494.18 617.029 2483.98 597.359C2473.78 577.721 2468.68 554.683 2468.68 528.247C2468.68 501.81 2473.88 479.796 2484.24 460.125C2494.6 440.487 2508.85 425.305 2526.94 414.546C2545.04 403.819 2565.47 398.439 2588.17 398.439C2606.59 398.439 2622.74 401.839 2636.56 408.605C2650.38 415.371 2661.56 424.711 2670.15 436.593V401.41H2723.48L2723.45 401.476ZM2597.05 612.97C2619.09 612.97 2636.86 605.214 2650.38 589.701C2663.87 574.189 2670.61 554.023 2670.61 529.27C2670.61 504.516 2663.87 483.294 2650.38 467.617C2636.89 451.94 2619.09 444.085 2597.05 444.085C2575.01 444.085 2557.21 451.94 2543.72 467.617C2530.23 483.294 2523.49 503.526 2523.49 528.28C2523.49 553.033 2530.23 573.76 2543.72 589.47C2557.21 605.148 2574.97 613.003 2597.05 613.003V612.97Z" fill="white"/>
5
+ <path d="M1308.72 401.47V644.682C1308.72 680.36 1298.19 707.985 1277.14 727.655C1256.08 747.293 1223.48 757.129 1179.36 757.129C1145.11 757.129 1117.32 749.439 1095.93 734.091C1074.51 718.744 1062.67 697.027 1060.37 668.94H1114.68C1117.97 683.132 1125.54 694.123 1137.38 701.879C1149.23 709.635 1164.52 713.529 1183.31 713.529C1231.7 713.529 1255.88 689.898 1255.88 642.701V614.482C1237.46 642.206 1209.96 656.101 1173.44 656.101C1150.41 656.101 1129.82 650.887 1111.72 640.49C1093.63 630.094 1079.45 615.242 1069.25 595.901C1059.05 576.593 1053.95 553.721 1053.95 527.284C1053.95 500.847 1059.15 479.394 1069.51 459.922C1079.88 440.449 1094.12 425.333 1112.22 414.606C1130.31 403.88 1150.74 398.5 1173.44 398.5C1192.52 398.5 1209 402.296 1222.82 409.887C1236.64 417.478 1247.82 427.907 1256.41 441.109L1262.33 401.47H1308.75H1308.72ZM1182.32 610.984C1204.36 610.984 1222.13 603.294 1235.65 587.947C1249.14 572.6 1255.88 552.698 1255.88 528.274C1255.88 503.851 1249.14 482.794 1235.65 467.084C1222.16 451.406 1204.36 443.551 1182.32 443.551C1160.28 443.551 1142.48 451.307 1128.99 466.82C1115.51 482.332 1108.76 502.498 1108.76 527.251C1108.76 552.005 1115.51 572.17 1128.99 587.683C1142.48 603.195 1160.25 610.951 1182.32 610.951V610.984Z" fill="white"/>
6
+ <path d="M1324.01 528.769C1324.01 502.696 1329.21 479.823 1339.57 460.153C1349.94 440.515 1364.41 425.333 1383.03 414.573C1401.62 403.847 1422.94 398.467 1446.99 398.467C1471.03 398.467 1492.81 403.417 1511.43 413.319C1530.02 423.22 1544.66 437.28 1555.39 455.433C1566.08 473.585 1571.61 494.906 1571.93 519.33C1571.93 525.931 1571.44 532.697 1570.45 539.628H1379.87V542.598C1381.19 564.711 1388.1 582.237 1400.6 595.109C1413.1 607.98 1429.71 614.416 1450.47 614.416C1466.92 614.416 1480.74 610.555 1491.96 602.766C1503.14 595.01 1510.55 584.019 1514.16 569.827H1567.49C1562.89 595.571 1550.45 616.727 1530.22 633.229C1509.99 649.731 1484.72 657.982 1454.42 657.982C1428.07 657.982 1405.14 652.636 1385.53 641.876C1365.96 631.15 1350.79 616.034 1340.1 596.561C1329.41 577.088 1324.04 554.447 1324.04 528.703L1324.01 528.769ZM1517.55 500.517C1515.25 482.035 1507.91 467.579 1495.58 457.182C1483.24 446.786 1467.68 441.571 1448.93 441.571C1431.49 441.571 1416.42 446.951 1403.76 457.677C1391.09 468.404 1383.76 482.695 1381.78 500.517H1517.55Z" fill="white"/>
7
+ <path d="M1587.2 401.47H1633.61L1639.54 434.673C1658.62 410.58 1685.63 398.5 1720.5 398.5C1735.3 398.5 1748.96 400.711 1761.49 405.2C1773.99 409.656 1784.78 416.686 1793.83 426.257C1802.88 435.828 1809.88 447.974 1814.82 462.661C1819.75 477.348 1822.22 494.94 1822.22 515.402V655.078H1768.4V518.373C1768.4 494.28 1763.3 475.929 1753.1 463.387C1742.9 450.845 1727.93 444.574 1708.16 444.574C1687.11 444.574 1670.56 451.935 1658.55 466.622C1646.54 481.309 1640.52 501.541 1640.52 527.317V655.111H1587.2V401.47Z" fill="white"/>
8
+ <path d="M1845.75 401.47V330.609H1899.57V401.437H1960.3V448.502H1899.57V580.752C1899.57 590.653 1901.54 597.683 1905.49 601.809C1909.44 605.934 1916.18 608.014 1925.72 608.014H1966.22V655.078H1914.87C1890.85 655.078 1873.31 649.566 1862.29 638.477C1851.27 627.42 1845.75 609.994 1845.75 586.23V448.535" fill="white"/>
9
+ <path d="M1976.12 528.769C1976.12 502.696 1981.32 479.823 1991.69 460.153C2002.05 440.515 2016.52 425.333 2035.14 414.573C2053.73 403.847 2075.05 398.467 2099.1 398.467C2123.15 398.467 2144.93 403.417 2163.51 413.319C2182.1 423.22 2196.77 437.28 2207.47 455.433C2218.16 473.585 2223.69 494.906 2224.01 519.33C2224.01 525.931 2223.52 532.697 2222.53 539.628H2031.95V542.598C2033.27 564.711 2040.18 582.237 2052.68 595.109C2065.18 607.98 2081.83 614.416 2102.55 614.416C2119 614.416 2132.85 610.555 2144.04 602.766C2155.22 595.01 2162.63 584.019 2166.24 569.827H2219.57C2214.97 595.571 2202.53 616.727 2182.3 633.229C2162.07 649.731 2136.8 657.982 2106.5 657.982C2080.15 657.982 2057.19 652.636 2037.61 641.876C2018.04 631.15 2002.87 616.034 1992.18 596.561C1981.49 577.088 1976.12 554.447 1976.12 528.703V528.769ZM2169.67 500.517C2167.36 482.035 2160.03 467.579 2147.69 457.182C2135.35 446.786 2119.79 441.571 2101.04 441.571C2083.6 441.571 2068.54 446.951 2055.87 457.677C2043.2 468.404 2035.87 482.695 2033.89 500.517H2169.67Z" fill="white"/>
10
+ <path d="M2216.86 401.475H2274.14L2343.75 597.621L2412.38 401.475H2468.67L2375.33 655.082H2310.16L2216.83 401.475H2216.86Z" fill="white"/>
11
+ <path d="M2754.43 308.332H2807.76V655.079H2754.43V308.332Z" fill="white"/>
12
+ <path d="M2882.6 571.352C2883.59 584.554 2889.77 595.379 2901.12 603.796C2912.47 612.212 2927.21 616.436 2945.31 616.436C2961.43 616.436 2974.52 613.4 2984.55 607.261C2994.59 601.155 2999.62 592.97 2999.62 582.739C2999.62 574.157 2997.32 567.721 2992.71 563.431C2988.11 559.14 2981.92 556.104 2974.19 554.256C2966.46 552.44 2954.52 550.559 2938.4 548.546C2916.36 545.905 2898.16 542.341 2883.85 537.885C2869.54 533.43 2857.99 526.334 2849.28 516.597C2840.56 506.861 2836.18 493.725 2836.18 477.223C2836.18 461.711 2840.56 447.915 2849.28 435.868C2857.99 423.821 2870 414.481 2885.33 407.88C2900.63 401.279 2918 397.979 2937.41 397.979C2969.32 397.979 2995.25 405.075 3015.18 419.267C3035.09 433.459 3045.88 453.459 3047.52 479.203H2995.67C2994.36 467.651 2988.6 458.146 2978.4 450.72C2968.2 443.294 2955.37 439.564 2939.88 439.564C2924.38 439.564 2911.92 442.535 2902.34 448.476C2892.8 454.416 2888.03 462.503 2888.03 472.734C2888.03 480.325 2890.4 486.035 2895.2 489.83C2899.97 493.626 2905.99 496.266 2913.23 497.752C2920.47 499.237 2932.15 500.986 2948.3 502.966C2970.01 505.277 2988.31 508.841 3003.11 513.627C3017.91 518.413 3029.76 526.004 3038.67 536.4C3047.56 546.797 3052 560.923 3052 578.745C3052 594.587 3047.39 608.548 3038.18 620.595C3028.97 632.642 3016.27 641.883 3000.15 648.319C2984.03 654.755 2965.9 657.989 2945.83 657.989C2911.92 657.989 2884.51 650.299 2863.62 634.952C2842.73 619.605 2831.94 598.383 2831.28 571.286H2882.64L2882.6 571.352Z" fill="white"/>
13
+ </svg>
@@ -0,0 +1,13 @@
1
+ <svg width="3302" height="1066" viewBox="0 0 3302 1066" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <path d="M518.695 264C560.958 264 595.207 298.274 595.207 340.537C595.207 382.8 560.958 417.048 518.695 417.048C454.983 417.048 403.305 468.548 403 532.184V533.304C403.306 596.94 454.983 648.438 518.695 648.438H518.722C560.985 648.439 595.232 682.687 595.232 724.95C595.232 767.213 560.984 801.461 518.722 801.461C476.459 801.461 442.21 767.213 442.21 724.95V724.67C442.057 661.008 390.482 609.408 326.795 609.255H326.515C284.252 609.255 250.004 575.006 250.004 532.743C250.004 490.48 284.252 456.232 326.515 456.232H326.642C390.431 456.156 442.108 404.453 442.185 340.664V340.512C442.185 298.249 476.432 264 518.695 264ZM492.436 469.353C527.452 454.848 567.596 471.476 582.101 506.492C596.605 541.508 579.976 581.653 544.96 596.157C509.944 610.661 469.8 594.033 455.296 559.017C440.792 524.001 457.42 483.857 492.436 469.353Z" fill="#151927"/>
3
+ <path d="M1029.16 401.476V655.084H982.736L976.321 616.93C956.253 644.357 928.75 658.054 893.878 658.054C870.849 658.054 850.254 652.839 832.16 642.443C814.066 632.046 799.887 617.029 789.688 597.359C779.49 577.721 774.391 554.683 774.391 528.247C774.391 501.81 779.589 479.796 789.952 460.125C800.315 440.487 814.56 425.305 832.654 414.546C850.748 403.819 871.178 398.439 893.878 398.439C912.301 398.439 928.454 401.839 942.271 408.605C956.089 415.371 967.274 424.711 975.861 436.593V401.41H1029.19L1029.16 401.476ZM902.76 612.97C924.802 612.97 942.567 605.214 956.089 589.701C969.577 574.189 976.321 554.023 976.321 529.27C976.321 504.516 969.577 483.294 956.089 467.617C942.6 451.94 924.802 444.085 902.76 444.085C880.718 444.085 862.92 451.94 849.432 467.617C835.944 483.294 829.199 503.526 829.199 528.28C829.199 553.033 835.944 573.76 849.432 589.47C862.92 605.148 880.685 613.003 902.76 613.003V612.97Z" fill="#151927"/>
4
+ <path d="M2723.45 401.476V655.084H2677.03L2670.61 616.93C2650.54 644.357 2623.04 658.054 2588.17 658.054C2565.14 658.054 2544.54 652.839 2526.45 642.443C2508.36 632.046 2494.18 617.029 2483.98 597.359C2473.78 577.721 2468.68 554.683 2468.68 528.247C2468.68 501.81 2473.88 479.796 2484.24 460.125C2494.6 440.487 2508.85 425.305 2526.94 414.546C2545.04 403.819 2565.47 398.439 2588.17 398.439C2606.59 398.439 2622.74 401.839 2636.56 408.605C2650.38 415.371 2661.56 424.711 2670.15 436.593V401.41H2723.48L2723.45 401.476ZM2597.05 612.97C2619.09 612.97 2636.86 605.214 2650.38 589.701C2663.87 574.189 2670.61 554.023 2670.61 529.27C2670.61 504.516 2663.87 483.294 2650.38 467.617C2636.89 451.94 2619.09 444.085 2597.05 444.085C2575.01 444.085 2557.21 451.94 2543.72 467.617C2530.23 483.294 2523.49 503.526 2523.49 528.28C2523.49 553.033 2530.23 573.76 2543.72 589.47C2557.21 605.148 2574.97 613.003 2597.05 613.003V612.97Z" fill="#151927"/>
5
+ <path d="M1308.72 401.47V644.682C1308.72 680.36 1298.19 707.985 1277.14 727.655C1256.08 747.293 1223.48 757.129 1179.36 757.129C1145.11 757.129 1117.32 749.439 1095.93 734.091C1074.51 718.744 1062.67 697.027 1060.37 668.94H1114.68C1117.97 683.132 1125.54 694.123 1137.38 701.879C1149.23 709.635 1164.52 713.529 1183.31 713.529C1231.7 713.529 1255.88 689.898 1255.88 642.701V614.482C1237.46 642.206 1209.96 656.101 1173.44 656.101C1150.41 656.101 1129.82 650.887 1111.72 640.49C1093.63 630.094 1079.45 615.242 1069.25 595.901C1059.05 576.593 1053.95 553.721 1053.95 527.284C1053.95 500.847 1059.15 479.394 1069.51 459.922C1079.88 440.449 1094.12 425.333 1112.22 414.606C1130.31 403.88 1150.74 398.5 1173.44 398.5C1192.52 398.5 1209 402.296 1222.82 409.887C1236.64 417.478 1247.82 427.907 1256.41 441.109L1262.33 401.47H1308.75H1308.72ZM1182.32 610.984C1204.36 610.984 1222.13 603.294 1235.65 587.947C1249.14 572.6 1255.88 552.698 1255.88 528.274C1255.88 503.851 1249.14 482.794 1235.65 467.084C1222.16 451.406 1204.36 443.551 1182.32 443.551C1160.28 443.551 1142.48 451.307 1128.99 466.82C1115.51 482.332 1108.76 502.498 1108.76 527.251C1108.76 552.005 1115.51 572.17 1128.99 587.683C1142.48 603.195 1160.25 610.951 1182.32 610.951V610.984Z" fill="#151927"/>
6
+ <path d="M1324.01 528.769C1324.01 502.696 1329.21 479.823 1339.57 460.153C1349.94 440.515 1364.41 425.333 1383.03 414.573C1401.62 403.847 1422.94 398.467 1446.99 398.467C1471.03 398.467 1492.81 403.417 1511.43 413.319C1530.02 423.22 1544.66 437.28 1555.39 455.433C1566.08 473.585 1571.61 494.906 1571.93 519.33C1571.93 525.931 1571.44 532.697 1570.45 539.628H1379.87V542.598C1381.19 564.711 1388.1 582.237 1400.6 595.109C1413.1 607.98 1429.71 614.416 1450.47 614.416C1466.92 614.416 1480.74 610.555 1491.96 602.766C1503.14 595.01 1510.55 584.019 1514.16 569.827H1567.49C1562.89 595.571 1550.45 616.727 1530.22 633.229C1509.99 649.731 1484.72 657.982 1454.42 657.982C1428.07 657.982 1405.14 652.636 1385.53 641.876C1365.96 631.15 1350.79 616.034 1340.1 596.561C1329.41 577.088 1324.04 554.447 1324.04 528.703L1324.01 528.769ZM1517.55 500.517C1515.25 482.035 1507.91 467.579 1495.58 457.182C1483.24 446.786 1467.68 441.571 1448.93 441.571C1431.49 441.571 1416.42 446.951 1403.76 457.677C1391.09 468.404 1383.76 482.695 1381.78 500.517H1517.55Z" fill="#151927"/>
7
+ <path d="M1587.2 401.47H1633.61L1639.54 434.673C1658.62 410.58 1685.63 398.5 1720.5 398.5C1735.3 398.5 1748.96 400.711 1761.49 405.2C1773.99 409.656 1784.78 416.686 1793.83 426.257C1802.88 435.828 1809.88 447.974 1814.82 462.661C1819.75 477.348 1822.22 494.94 1822.22 515.402V655.078H1768.4V518.373C1768.4 494.28 1763.3 475.929 1753.1 463.387C1742.9 450.845 1727.93 444.574 1708.16 444.574C1687.11 444.574 1670.56 451.935 1658.55 466.622C1646.54 481.309 1640.52 501.541 1640.52 527.317V655.111H1587.2V401.47Z" fill="#151927"/>
8
+ <path d="M1845.75 401.47V330.609H1899.57V401.437H1960.3V448.502H1899.57V580.752C1899.57 590.653 1901.54 597.683 1905.49 601.809C1909.44 605.934 1916.18 608.014 1925.72 608.014H1966.22V655.078H1914.87C1890.85 655.078 1873.31 649.566 1862.29 638.477C1851.27 627.42 1845.75 609.994 1845.75 586.23V448.535" fill="#151927"/>
9
+ <path d="M1976.12 528.769C1976.12 502.696 1981.32 479.823 1991.69 460.153C2002.05 440.515 2016.52 425.333 2035.14 414.573C2053.73 403.847 2075.05 398.467 2099.1 398.467C2123.15 398.467 2144.93 403.417 2163.51 413.319C2182.1 423.22 2196.77 437.28 2207.47 455.433C2218.16 473.585 2223.69 494.906 2224.01 519.33C2224.01 525.931 2223.52 532.697 2222.53 539.628H2031.95V542.598C2033.27 564.711 2040.18 582.237 2052.68 595.109C2065.18 607.98 2081.83 614.416 2102.55 614.416C2119 614.416 2132.85 610.555 2144.04 602.766C2155.22 595.01 2162.63 584.019 2166.24 569.827H2219.57C2214.97 595.571 2202.53 616.727 2182.3 633.229C2162.07 649.731 2136.8 657.982 2106.5 657.982C2080.15 657.982 2057.19 652.636 2037.61 641.876C2018.04 631.15 2002.87 616.034 1992.18 596.561C1981.49 577.088 1976.12 554.447 1976.12 528.703V528.769ZM2169.67 500.517C2167.36 482.035 2160.03 467.579 2147.69 457.182C2135.35 446.786 2119.79 441.571 2101.04 441.571C2083.6 441.571 2068.54 446.951 2055.87 457.677C2043.2 468.404 2035.87 482.695 2033.89 500.517H2169.67Z" fill="#151927"/>
10
+ <path d="M2216.86 401.475H2274.14L2343.75 597.621L2412.38 401.475H2468.67L2375.33 655.082H2310.16L2216.83 401.475H2216.86Z" fill="#151927"/>
11
+ <path d="M2754.43 308.332H2807.76V655.079H2754.43V308.332Z" fill="#151927"/>
12
+ <path d="M2882.6 571.352C2883.59 584.554 2889.77 595.379 2901.12 603.796C2912.47 612.212 2927.21 616.436 2945.31 616.436C2961.43 616.436 2974.52 613.4 2984.55 607.261C2994.59 601.155 2999.62 592.97 2999.62 582.739C2999.62 574.157 2997.32 567.721 2992.71 563.431C2988.11 559.14 2981.92 556.104 2974.19 554.256C2966.46 552.44 2954.52 550.559 2938.4 548.546C2916.36 545.905 2898.16 542.341 2883.85 537.885C2869.54 533.43 2857.99 526.334 2849.28 516.597C2840.56 506.861 2836.18 493.725 2836.18 477.223C2836.18 461.711 2840.56 447.915 2849.28 435.868C2857.99 423.821 2870 414.481 2885.33 407.88C2900.63 401.279 2918 397.979 2937.41 397.979C2969.32 397.979 2995.25 405.075 3015.18 419.267C3035.09 433.459 3045.88 453.459 3047.52 479.203H2995.67C2994.36 467.651 2988.6 458.146 2978.4 450.72C2968.2 443.294 2955.37 439.564 2939.88 439.564C2924.38 439.564 2911.92 442.535 2902.34 448.476C2892.8 454.416 2888.03 462.503 2888.03 472.734C2888.03 480.325 2890.4 486.035 2895.2 489.83C2899.97 493.626 2905.99 496.266 2913.23 497.752C2920.47 499.237 2932.15 500.986 2948.3 502.966C2970.01 505.277 2988.31 508.841 3003.11 513.627C3017.91 518.413 3029.76 526.004 3038.67 536.4C3047.56 546.797 3052 560.923 3052 578.745C3052 594.587 3047.39 608.548 3038.18 620.595C3028.97 632.642 3016.27 641.883 3000.15 648.319C2984.03 654.755 2965.9 657.989 2945.83 657.989C2911.92 657.989 2884.51 650.299 2863.62 634.952C2842.73 619.605 2831.94 598.383 2831.28 571.286H2882.64L2882.6 571.352Z" fill="#151927"/>
13
+ </svg>
@@ -85,7 +85,9 @@ agentevals run traces/my_trace.json \
85
85
 
86
86
  ## Eval Config Reference
87
87
 
88
- Each evaluator entry in the `evaluators` list uses the following fields:
88
+ Each evaluator entry in the `evaluators` list uses the following fields. The `type` field determines which other fields are valid.
89
+
90
+ ### `type: code` (local scripts)
89
91
 
90
92
  | Field | Required | Default | Description |
91
93
  |---|---|---|---|
@@ -96,6 +98,16 @@ Each evaluator entry in the `evaluators` list uses the following fields:
96
98
  | `timeout` | no | `30` | Subprocess timeout in seconds |
97
99
  | `config` | no | `{}` | Arbitrary key-value pairs passed to the evaluator |
98
100
 
101
+ ### `type: openai_eval` (OpenAI Evals API)
102
+
103
+ | Field | Required | Default | Description |
104
+ |---|---|---|---|
105
+ | `name` | yes | | Unique name for the evaluator (used in output) |
106
+ | `type` | yes | | `openai_eval` for OpenAI Evals API graders |
107
+ | `threshold` | no | `0.5` | Maps to `pass_threshold` in the OpenAI grader |
108
+ | `timeout` | no | `120` | Max seconds to wait for the OpenAI eval run |
109
+ | `grader` | yes | | OpenAI grader config (see [OpenAI Evals Graders](#openai-evals-api-graders)) |
110
+
99
111
  ## Protocol
100
112
 
101
113
  Every code-based evaluator (`type: code` or `type: remote`) — regardless of language — communicates via the same JSON protocol over stdin/stdout.
@@ -275,6 +287,40 @@ evaluators:
275
287
 
276
288
  Remote evaluators are cached in `~/.cache/agentevals/evaluators/`. To force a re-download, delete the cached file.
277
289
 
290
+ ## OpenAI Evals API Graders
291
+
292
+ You can delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) instead of running scoring logic locally. To use it, install the optional extra with `pip install "agentevals-cli[openai]"` and set `OPENAI_API_KEY`.
293
+
294
+ ### Text Similarity Grader
295
+
296
+ Compares the agent's response against a golden reference using text similarity metrics. Requires an eval set.
297
+
298
+ ```yaml
299
+ evaluators:
300
+ - name: response_similarity
301
+ type: openai_eval
302
+ threshold: 0.8
303
+ grader:
304
+ type: text_similarity
305
+ evaluation_metric: fuzzy_match
306
+ ```
307
+
308
+ The `grader.evaluation_metric` field selects the similarity algorithm:
309
+
310
+ | Metric | Description |
311
+ |---|---|
312
+ | `fuzzy_match` | Approximate string matching using edit distance |
313
+ | `bleu` | N-gram overlap score, commonly used for translation quality |
314
+ | `gleu` | Google's variant of BLEU with sentence-level scoring |
315
+ | `meteor` | Alignment-based metric considering synonyms and paraphrases |
316
+ | `cosine` | Cosine similarity on vectorized text |
317
+ | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318
+ | `rouge_l` | Longest common subsequence overlap (F-measure) |
319
+
320
+ ### How it works
321
+
322
+ Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
323
+
278
324
  ### Configuring the GitHub source
279
325
 
280
326
  By default, evaluators are fetched from the official community repository. Override with environment variables:
@@ -303,42 +349,43 @@ The community repo uses per-evaluator manifests. A CI workflow compiles all `eva
303
349
  Custom evaluators use a layered architecture designed for extensibility.
304
350
 
305
351
  ```
306
- ┌─────────────────────────────────────────┐
307
- │ Eval Config (YAML)
308
- │ type: code | remote
309
- └──────────────┬──────────────────────────┘
310
-
311
-
312
- ┌─────────────────────────────────────────┐
313
- │ EvaluatorResolver │
314
- Downloads remote → local cache │
315
- (passthrough for type: code)
316
- └──────────────┬──────────────────────────┘
317
-
318
-
319
- ┌─────────────────────────────────────────┐
320
- │ CustomEvaluatorRunner │
321
- │ ADK Evaluator adapter │
322
- Invocation ↔ EvalInput/EvalResult
323
- └──────────────┬──────────────────────────┘
324
-
325
-
326
- ┌─────────────────────────────────────────┐
327
- │ EvaluatorBackend (ABC) — executor factory │
328
- │ async run(EvalInput) → EvalResult │
329
- ├─────────────────────────────────────────┤
330
- │ "local" → SubprocessBackend
331
- │ "docker" → DockerBackend (future)
332
- └──────────────┬──────────────────────────┘
333
-
334
-
335
- ┌─────────────────────────────────────────┐
336
- │ Runtime registry
337
- │ PythonRuntime (.py)
338
- │ NodeRuntime (.js, .ts)
339
- └─────────────────────────────────────────┘
352
+ ┌─────────────────────────────────────────────┐
353
+ │ Eval Config (YAML)
354
+ │ type: code | remote | openai_eval
355
+ └──────────────┬─────────────┬────────────────┘
356
+
357
+ code/remote openai_eval
358
+ │ │
359
+ ▼ ▼
360
+ ┌──────────────────────┐ ┌──────────────────────┐
361
+ EvaluatorResolver │ │ OpenAI Evals API
362
+ │ remote → local │ │ create eval + run │
363
+ (passthrough: code) │ │ poll → get results │
364
+ └──────────┬───────────┘ └──────────────────────┘
365
+
366
+
367
+ ┌──────────────────────────┐
368
+ CustomEvaluatorRunner
369
+ │ ADK Evaluator adapter │
370
+ Invocation ↔ EvalInput │
371
+ └──────────┬───────────────┘
372
+
373
+
374
+ ┌──────────────────────────┐
375
+ │ EvaluatorBackend (ABC) │
376
+ │ "local" → Subprocess
377
+ │ "docker" → (future)
378
+ └──────────┬───────────────┘
379
+
380
+
381
+ ┌──────────────────────────┐
382
+ │ Runtime registry
383
+ │ PythonRuntime (.py)
384
+ │ NodeRuntime (.js, .ts)
385
+ └──────────────────────────┘
340
386
  ```
341
387
 
388
+ - **`type: openai_eval`** takes a separate path: it calls the OpenAI Evals API directly (create eval, create run, poll, collect results) and returns a `MetricResult`. It does not go through the subprocess/backend stack.
342
389
  - **`EvaluatorSource`** is the registry abstraction. Implementations (`BuiltinEvaluatorSource`, `GitHubEvaluatorSource`) list and fetch evaluators from different registries.
343
390
  - **`EvaluatorResolver`** downloads remote evaluators and converts `RemoteEvaluatorDef` to `CodeEvaluatorDef` with a local cached path.
344
391
  - **`EvaluatorBackend`** is the execution abstraction. The `executor` field in config selects which factory to use (`"local"` → `SubprocessBackend`). New executors (e.g. `DockerBackend`) register via `register_executor()`.
@@ -32,4 +32,3 @@ evaluators:
32
32
  ref: evaluators/random_evaluator/random_evaluator.py
33
33
  threshold: 0.110
34
34
  executor: local
35
-
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.5.3"
7
+ version = "0.6.0"
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -30,6 +30,9 @@ streaming = [
30
30
  "opentelemetry-sdk>=1.20.0",
31
31
  "websockets>=12.0",
32
32
  ]
33
+ openai = [
34
+ "openai>=2.0",
35
+ ]
33
36
 
34
37
  [project.scripts]
35
38
  agentevals = "agentevals.cli:main"
@@ -22,6 +22,7 @@ from ..config import (
22
22
  CodeEvaluatorDef,
23
23
  CustomEvaluatorDef,
24
24
  EvalRunConfig,
25
+ OpenAIEvalDef,
25
26
  )
26
27
  from ..extraction import get_extractor
27
28
  from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
@@ -58,6 +59,7 @@ router = APIRouter()
58
59
  _TYPE_TO_MODEL = {
59
60
  "builtin": BuiltinMetricDef,
60
61
  "code": CodeEvaluatorDef,
62
+ "openai_eval": OpenAIEvalDef,
61
63
  }
62
64
 
63
65
 
@@ -53,8 +53,48 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
53
53
  ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
54
54
 
55
55
 
56
+ _VALID_SIMILARITY_METRICS = frozenset(
57
+ {
58
+ "fuzzy_match",
59
+ "bleu",
60
+ "gleu",
61
+ "meteor",
62
+ "cosine",
63
+ "rouge_1",
64
+ "rouge_2",
65
+ "rouge_3",
66
+ "rouge_4",
67
+ "rouge_5",
68
+ "rouge_l",
69
+ }
70
+ )
71
+
72
+
73
+ class OpenAIEvalDef(BaseModel):
74
+ """An evaluator that delegates grading to the OpenAI Evals API."""
75
+
76
+ type: Literal["openai_eval"] = "openai_eval"
77
+ name: str
78
+ threshold: float = 0.5
79
+ timeout: int = Field(default=120, description="Max seconds to wait for the OpenAI eval run to complete.")
80
+ grader: dict[str, Any] = Field(description="OpenAI grader config passed to testing_criteria.")
81
+
82
+ @field_validator("grader")
83
+ @classmethod
84
+ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
85
+ grader_type = v.get("type")
86
+ if grader_type != "text_similarity":
87
+ raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
88
+ metric = v.get("evaluation_metric")
89
+ if not metric:
90
+ raise ValueError("'evaluation_metric' is required for text_similarity grader")
91
+ if metric not in _VALID_SIMILARITY_METRICS:
92
+ raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
93
+ return v
94
+
95
+
56
96
  CustomEvaluatorDef = Annotated[
57
- BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef,
97
+ BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef,
58
98
  Field(discriminator="type"),
59
99
  ]
60
100
 
@@ -81,6 +81,9 @@ class Runtime(abc.ABC):
81
81
 
82
82
 
83
83
  class PythonRuntime(Runtime):
84
+ def __init__(self, python_path: Path | None = None):
85
+ self._exe = str(python_path) if python_path else sys.executable
86
+
84
87
  @property
85
88
  def name(self) -> str:
86
89
  return "Python"
@@ -90,13 +93,16 @@ class PythonRuntime(Runtime):
90
93
  return (".py",)
91
94
 
92
95
  def build_command(self, path: Path) -> list[str]:
93
- return [sys.executable, str(path)]
96
+ return [self._exe, str(path)]
94
97
 
95
98
  def is_available(self) -> bool:
96
99
  return True
97
100
 
98
101
 
99
102
  class NodeRuntime(Runtime):
103
+ def __init__(self) -> None:
104
+ self._exe = shutil.which("node")
105
+
100
106
  @property
101
107
  def name(self) -> str:
102
108
  return "Node.js"
@@ -106,10 +112,12 @@ class NodeRuntime(Runtime):
106
112
  return (".js", ".ts")
107
113
 
108
114
  def build_command(self, path: Path) -> list[str]:
109
- node = shutil.which("node")
110
- if not node:
115
+ if not self._exe:
111
116
  raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
112
- return [node, str(path)]
117
+ return [self._exe, str(path)]
118
+
119
+ def is_available(self) -> bool:
120
+ return self._exe is not None
113
121
 
114
122
 
115
123
  _RUNTIMES: list[Runtime] = [
@@ -203,12 +211,13 @@ class SubprocessBackend(EvaluatorBackend):
203
211
  """Runs a local code file (.py, .js, .ts, …) as a subprocess.
204
212
 
205
213
  The correct interpreter is resolved from the file extension via the
206
- :data:`_RUNTIMES` registry.
214
+ :data:`_RUNTIMES` registry. Pass a pre-configured *runtime* to override
215
+ the default (e.g. a :class:`PythonRuntime` with a venv interpreter).
207
216
  """
208
217
 
209
- def __init__(self, path: Path, timeout: int = 30):
218
+ def __init__(self, path: Path, timeout: int = 30, runtime: Runtime | None = None):
210
219
  self._path = path.resolve()
211
- self._runtime = _resolve_runtime(self._path)
220
+ self._runtime = runtime or _resolve_runtime(self._path)
212
221
  self._timeout = timeout
213
222
 
214
223
  if not self._path.exists():
@@ -223,7 +232,7 @@ class SubprocessBackend(EvaluatorBackend):
223
232
  # Executor factory
224
233
  # ---------------------------------------------------------------------------
225
234
 
226
- _EXECUTOR_FACTORIES: dict[str, Callable[[Path, int], EvaluatorBackend]] = {
235
+ _EXECUTOR_FACTORIES: dict[str, Callable[..., EvaluatorBackend]] = {
227
236
  "local": lambda path, timeout: SubprocessBackend(path, timeout),
228
237
  }
229
238
 
@@ -236,7 +245,7 @@ def create_executor(executor_name: str, path: Path, timeout: int = 30) -> Evalua
236
245
  return factory(path, timeout)
237
246
 
238
247
 
239
- def register_executor(name: str, factory: Callable[[Path, int], EvaluatorBackend]) -> None:
248
+ def register_executor(name: str, factory: Callable[..., EvaluatorBackend]) -> None:
240
249
  """Register a new executor factory (e.g. for Docker support)."""
241
250
  _EXECUTOR_FACTORIES[name] = factory
242
251
 
@@ -416,16 +425,41 @@ async def evaluate_custom_evaluator(
416
425
  """
417
426
  import inspect as _inspect
418
427
 
419
- from .config import CodeEvaluatorDef, RemoteEvaluatorDef
428
+ from .config import CodeEvaluatorDef, OpenAIEvalDef, RemoteEvaluatorDef
420
429
  from .runner import MetricResult
421
430
 
431
+ if isinstance(evaluator_def, OpenAIEvalDef):
432
+ from .openai_eval_backend import evaluate_openai_eval
433
+
434
+ return await evaluate_openai_eval(evaluator_def, actual_invocations, expected_invocations)
435
+
422
436
  if isinstance(evaluator_def, RemoteEvaluatorDef):
423
437
  from .evaluator.resolver import get_default_resolver
424
438
 
425
439
  evaluator_def = await get_default_resolver().resolve(evaluator_def)
426
440
 
427
441
  if isinstance(evaluator_def, CodeEvaluatorDef):
428
- backend = create_executor(evaluator_def.executor, Path(evaluator_def.path), evaluator_def.timeout)
442
+ evaluator_path = Path(evaluator_def.path)
443
+
444
+ runtime: Runtime | None = None
445
+ if evaluator_path.suffix == ".py":
446
+ from .evaluator.venv import ensure_venv_async
447
+
448
+ try:
449
+ venv_python = await ensure_venv_async(evaluator_path)
450
+ except Exception as exc:
451
+ logger.error("Failed to set up venv for '%s': %s", evaluator_def.name, exc)
452
+ return MetricResult(
453
+ metric_name=evaluator_def.name,
454
+ error=f"Dependency installation failed: {exc}",
455
+ )
456
+ if venv_python:
457
+ runtime = PythonRuntime(python_path=venv_python)
458
+
459
+ if runtime is not None:
460
+ backend = SubprocessBackend(evaluator_path, evaluator_def.timeout, runtime=runtime)
461
+ else:
462
+ backend = create_executor(evaluator_def.executor, evaluator_path, evaluator_def.timeout)
429
463
  else:
430
464
  raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")
431
465
 
@@ -13,6 +13,7 @@ from .config import (
13
13
  CodeEvaluatorDef,
14
14
  CustomEvaluatorDef,
15
15
  EvalRunConfig,
16
+ OpenAIEvalDef,
16
17
  RemoteEvaluatorDef,
17
18
  )
18
19
 
@@ -22,6 +23,7 @@ _TYPE_TO_MODEL = {
22
23
  "builtin": BuiltinMetricDef,
23
24
  "code": CodeEvaluatorDef,
24
25
  "remote": RemoteEvaluatorDef,
26
+ "openai_eval": OpenAIEvalDef,
25
27
  }
26
28
 
27
29
 
@@ -42,7 +44,7 @@ def _parse_evaluator_entry(entry: dict[str, Any]) -> tuple[str | None, CustomEva
42
44
 
43
45
  evaluator_type = entry.get("type")
44
46
  if not evaluator_type:
45
- raise ValueError(f"Evaluator entry '{name}' must have a 'type' field (builtin, code, or remote)")
47
+ raise ValueError(f"Evaluator entry '{name}' must have a 'type' field ({', '.join(_TYPE_TO_MODEL)})")
46
48
 
47
49
  if evaluator_type not in _TYPE_TO_MODEL:
48
50
  raise ValueError(