judgeval 0.15.0__tar.gz → 0.16.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. judgeval-0.16.1/CONTRIBUTING.md +10 -0
  2. judgeval-0.16.1/PKG-INFO +266 -0
  3. judgeval-0.16.1/README.md +239 -0
  4. judgeval-0.16.1/assets/custom_scorer_online_abm.png +0 -0
  5. judgeval-0.16.1/assets/logo_darkmode.svg +7 -0
  6. judgeval-0.16.1/assets/logo_lightmode.svg +7 -0
  7. judgeval-0.16.1/assets/quickstart_trajectory_ss.png +0 -0
  8. {judgeval-0.15.0 → judgeval-0.16.1}/pyproject.toml +1 -1
  9. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/api/__init__.py +4 -18
  10. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/api/api_types.py +18 -2
  11. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/judgment_types.py +18 -2
  12. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/logger.py +1 -1
  13. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/__init__.py +10 -7
  14. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/keys.py +7 -3
  15. judgeval-0.16.1/src/judgeval/tracer/llm/__init__.py +7 -0
  16. judgeval-0.16.1/src/judgeval/tracer/llm/config.py +110 -0
  17. judgeval-0.16.1/src/judgeval/tracer/llm/constants.py +10 -0
  18. judgeval-0.16.1/src/judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  19. judgeval-0.16.1/src/judgeval/tracer/llm/llm_anthropic/wrapper.py +611 -0
  20. judgeval-0.16.1/src/judgeval/tracer/llm/llm_google/__init__.py +0 -0
  21. judgeval-0.16.1/src/judgeval/tracer/llm/llm_google/config.py +24 -0
  22. judgeval-0.16.1/src/judgeval/tracer/llm/llm_google/wrapper.py +426 -0
  23. judgeval-0.16.1/src/judgeval/tracer/llm/llm_groq/__init__.py +0 -0
  24. judgeval-0.16.1/src/judgeval/tracer/llm/llm_groq/config.py +23 -0
  25. judgeval-0.16.1/src/judgeval/tracer/llm/llm_groq/wrapper.py +477 -0
  26. judgeval-0.16.1/src/judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  27. judgeval-0.16.1/src/judgeval/tracer/llm/llm_openai/wrapper.py +637 -0
  28. judgeval-0.16.1/src/judgeval/tracer/llm/llm_together/__init__.py +0 -0
  29. judgeval-0.16.1/src/judgeval/tracer/llm/llm_together/config.py +23 -0
  30. judgeval-0.16.1/src/judgeval/tracer/llm/llm_together/wrapper.py +478 -0
  31. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/llm/providers.py +5 -5
  32. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/processors/__init__.py +1 -1
  33. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/trainer/console.py +1 -1
  34. judgeval-0.16.1/src/judgeval/utils/decorators/__init__.py +0 -0
  35. judgeval-0.16.1/src/judgeval/utils/decorators/dont_throw.py +21 -0
  36. judgeval-0.15.0/src/judgeval/utils/decorators.py → judgeval-0.16.1/src/judgeval/utils/decorators/use_once.py +0 -11
  37. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/meta.py +1 -1
  38. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/version_check.py +1 -1
  39. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/version.py +1 -1
  40. judgeval-0.15.0/PKG-INFO +0 -158
  41. judgeval-0.15.0/README.md +0 -131
  42. judgeval-0.15.0/assets/logo-dark.svg +0 -23
  43. judgeval-0.15.0/assets/logo-light.svg +0 -18
  44. judgeval-0.15.0/assets/new_darkmode.svg +0 -29
  45. judgeval-0.15.0/assets/new_lightmode.svg +0 -34
  46. judgeval-0.15.0/src/judgeval/tracer/llm/__init__.py +0 -1232
  47. judgeval-0.15.0/src/judgeval/tracer/llm/google/__init__.py +0 -21
  48. judgeval-0.15.0/src/judgeval/tracer/llm/groq/__init__.py +0 -20
  49. judgeval-0.15.0/src/judgeval/tracer/llm/together/__init__.py +0 -20
  50. {judgeval-0.15.0 → judgeval-0.16.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  51. {judgeval-0.15.0 → judgeval-0.16.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  52. {judgeval-0.15.0 → judgeval-0.16.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  53. {judgeval-0.15.0 → judgeval-0.16.1}/.github/pull_request_template.md +0 -0
  54. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/blocked-pr.yaml +0 -0
  55. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/ci.yaml +0 -0
  56. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/claude-code-review.yml +0 -0
  57. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/claude.yml +0 -0
  58. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/lint.yaml +0 -0
  59. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/merge-branch-check.yaml +0 -0
  60. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/mypy.yaml +0 -0
  61. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  62. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/release.yaml +0 -0
  63. {judgeval-0.15.0 → judgeval-0.16.1}/.github/workflows/validate-branch.yaml +0 -0
  64. {judgeval-0.15.0 → judgeval-0.16.1}/.gitignore +0 -0
  65. {judgeval-0.15.0 → judgeval-0.16.1}/.pre-commit-config.yaml +0 -0
  66. {judgeval-0.15.0 → judgeval-0.16.1}/LICENSE.md +0 -0
  67. {judgeval-0.15.0 → judgeval-0.16.1}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  68. {judgeval-0.15.0 → judgeval-0.16.1}/assets/agent.gif +0 -0
  69. {judgeval-0.15.0 → judgeval-0.16.1}/assets/agent_trace_example.png +0 -0
  70. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/company.jpg +0 -0
  71. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/company_banner.jpg +0 -0
  72. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/darkmode.svg +0 -0
  73. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/full_logo.png +0 -0
  74. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/icon.png +0 -0
  75. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/lightmode.svg +0 -0
  76. {judgeval-0.15.0 → judgeval-0.16.1}/assets/brand/white_background.png +0 -0
  77. {judgeval-0.15.0 → judgeval-0.16.1}/assets/data.gif +0 -0
  78. {judgeval-0.15.0 → judgeval-0.16.1}/assets/dataset_clustering_screenshot.png +0 -0
  79. {judgeval-0.15.0 → judgeval-0.16.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
  80. {judgeval-0.15.0 → judgeval-0.16.1}/assets/datasets_preview_screenshot.png +0 -0
  81. {judgeval-0.15.0 → judgeval-0.16.1}/assets/document.gif +0 -0
  82. {judgeval-0.15.0 → judgeval-0.16.1}/assets/error_analysis_dashboard.png +0 -0
  83. {judgeval-0.15.0 → judgeval-0.16.1}/assets/errors.png +0 -0
  84. {judgeval-0.15.0 → judgeval-0.16.1}/assets/experiments_dashboard_screenshot.png +0 -0
  85. {judgeval-0.15.0 → judgeval-0.16.1}/assets/experiments_page.png +0 -0
  86. {judgeval-0.15.0 → judgeval-0.16.1}/assets/experiments_pagev2.png +0 -0
  87. {judgeval-0.15.0 → judgeval-0.16.1}/assets/monitoring_screenshot.png +0 -0
  88. {judgeval-0.15.0 → judgeval-0.16.1}/assets/online_eval.png +0 -0
  89. {judgeval-0.15.0 → judgeval-0.16.1}/assets/product_shot.png +0 -0
  90. {judgeval-0.15.0 → judgeval-0.16.1}/assets/test.png +0 -0
  91. {judgeval-0.15.0 → judgeval-0.16.1}/assets/tests.png +0 -0
  92. {judgeval-0.15.0 → judgeval-0.16.1}/assets/trace.gif +0 -0
  93. {judgeval-0.15.0 → judgeval-0.16.1}/assets/trace_demo.png +0 -0
  94. {judgeval-0.15.0 → judgeval-0.16.1}/assets/trace_screenshot.png +0 -0
  95. {judgeval-0.15.0 → judgeval-0.16.1}/assets/trace_screenshot_old.png +0 -0
  96. {judgeval-0.15.0 → judgeval-0.16.1}/pytest.ini +0 -0
  97. {judgeval-0.15.0 → judgeval-0.16.1}/scripts/api_generator.py +0 -0
  98. {judgeval-0.15.0 → judgeval-0.16.1}/scripts/openapi_transform.py +0 -0
  99. {judgeval-0.15.0 → judgeval-0.16.1}/scripts/update_types.sh +0 -0
  100. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/__init__.py +0 -0
  101. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/cli.py +0 -0
  102. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/constants.py +0 -0
  103. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/__init__.py +0 -0
  104. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/evaluation_run.py +0 -0
  105. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/example.py +0 -0
  106. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/result.py +0 -0
  107. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/scorer_data.py +0 -0
  108. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  109. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  110. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/data/trace.py +0 -0
  111. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/dataset/__init__.py +0 -0
  112. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/env.py +0 -0
  113. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/evaluation/__init__.py +0 -0
  114. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/exceptions.py +0 -0
  115. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/integrations/langgraph/__init__.py +0 -0
  116. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/integrations/openlit/__init__.py +0 -0
  117. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/judges/__init__.py +0 -0
  118. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/judges/base_judge.py +0 -0
  119. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/judges/litellm_judge.py +0 -0
  120. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/judges/together_judge.py +0 -0
  121. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/judges/utils.py +0 -0
  122. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/__init__.py +0 -0
  123. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/agent_scorer.py +0 -0
  124. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/api_scorer.py +0 -0
  125. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/base_scorer.py +0 -0
  126. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/example_scorer.py +0 -0
  127. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/exceptions.py +0 -0
  128. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  129. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  130. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  131. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  132. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  133. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  134. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
  135. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/score.py +0 -0
  136. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/scorers/utils.py +0 -0
  137. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/constants.py +0 -0
  138. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/exporters/__init__.py +0 -0
  139. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/exporters/s3.py +0 -0
  140. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/exporters/store.py +0 -0
  141. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/exporters/utils.py +0 -0
  142. /judgeval-0.15.0/src/judgeval/tracer/llm/anthropic/__init__.py → /judgeval-0.16.1/src/judgeval/tracer/llm/llm_anthropic/config.py +0 -0
  143. /judgeval-0.15.0/src/judgeval/tracer/llm/openai/__init__.py → /judgeval-0.16.1/src/judgeval/tracer/llm/llm_openai/config.py +0 -0
  144. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/local_eval_queue.py +0 -0
  145. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/managers.py +0 -0
  146. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/tracer/utils.py +0 -0
  147. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/trainer/__init__.py +0 -0
  148. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/trainer/config.py +0 -0
  149. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/trainer/trainable_model.py +0 -0
  150. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/trainer/trainer.py +0 -0
  151. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/async_utils.py +0 -0
  152. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/file_utils.py +0 -0
  153. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/guards.py +0 -0
  154. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/serialize.py +0 -0
  155. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/testing.py +0 -0
  156. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/utils/url.py +0 -0
  157. {judgeval-0.15.0 → judgeval-0.16.1}/src/judgeval/warnings.py +0 -0
  158. {judgeval-0.15.0 → judgeval-0.16.1}/update_version.py +0 -0
  159. {judgeval-0.15.0 → judgeval-0.16.1}/uv.lock +0 -0
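
The listing above shows that 0.16.1 splits the old `utils/decorators.py` into a package containing `use_once.py` plus a new `dont_throw.py`, and breaks the monolithic `tracer/llm/__init__.py` into per-provider `llm_*` packages. The `dont_throw.py` implementation is not shown in this diff; as a rough illustration only, a dont_throw-style decorator is usually a small wrapper that logs and swallows exceptions so instrumentation never crashes the host application. A minimal sketch under that assumption:

```python
import functools
import logging

logger = logging.getLogger(__name__)


def dont_throw(func):
    """Illustrative sketch only: call func, but log and swallow any exception.

    The real judgeval implementation in utils/decorators/dont_throw.py may differ.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            logger.debug("Suppressed exception in %s", func.__name__, exc_info=True)
            return None

    return wrapper
```
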
@@ -0,0 +1,10 @@
1
+ # Contribute to Judgeval
2
+
3
+ There are many ways to contribute to Judgeval:
4
+
5
+ - Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
6
+ - Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
7
+ - Speak or write about Judgment and let us know!
8
+
9
+ <!-- Contributors collage -->
10
+ [![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
@@ -0,0 +1,266 @@
1
+ Metadata-Version: 2.4
2
+ Name: judgeval
3
+ Version: 0.16.1
4
+ Summary: Judgeval Package
5
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
7
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
8
+ License-Expression: Apache-2.0
9
+ License-File: LICENSE.md
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.10
13
+ Requires-Dist: boto3>=1.40.11
14
+ Requires-Dist: click<8.2.0
15
+ Requires-Dist: dotenv
16
+ Requires-Dist: httpx>=0.28.1
17
+ Requires-Dist: litellm<1.75.0
18
+ Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
19
+ Requires-Dist: opentelemetry-sdk>=1.36.0
20
+ Requires-Dist: orjson>=3.9.0
21
+ Requires-Dist: typer>=0.9.0
22
+ Provides-Extra: s3
23
+ Requires-Dist: boto3>=1.40.11; extra == 's3'
24
+ Provides-Extra: trainer
25
+ Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
26
+ Description-Content-Type: text/markdown
27
+
28
+ <div align="center">
29
+
30
+ <a href="https://judgmentlabs.ai/">
31
+ <picture>
32
+ <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
33
+ <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
34
+ </picture>
35
+ </a>
36
+
37
+ <br>
38
+
39
+ ## Agent Behavior Monitoring (ABM)
40
+
41
+ Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
42
+
43
+ [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
44
+ [![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
45
+ [![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
46
+
47
+
48
+ [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
49
+ [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
50
+
51
+ </div>
52
+
53
+
55
+
56
+ ## [NEW] 🎆 Agent Reinforcement Learning
57
+
58
+ Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
59
+
60
+ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
61
+
62
+ ```python
63
+ await trainer.train(
64
+ agent_function=your_agent_function, # entry point to your agent
65
+ scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
66
+ prompts=training_prompts, # Tasks
67
+ rft_provider="fireworks"
68
+ )
69
+ ```
70
+
71
+ **That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
72
+
73
+ 👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
74
+
75
+
76
+ You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
77
+
78
+
79
+ ## Judgeval Overview
80
+
81
+ Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).
82
+
83
+ Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
84
+
85
+ ## 📚 Cookbooks
86
+
87
+ | Try Out | Notebook | Description |
88
+ |:---------|:-----|:------------|
89
+ | RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
90
+ | Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
91
+ | Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
92
+ | Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
93
+
94
+ You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
95
+
96
+ You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
97
+
98
+ ## Why Judgeval?
99
+
100
+ 🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
101
+
102
+ ⚙️ **Custom Evaluators**: You aren't restricted to monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate with our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
103
+
104
+ 🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
105
+
106
+ 📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
107
+ <!-- Add link to Bucketing docs once we have it -->
108
+ <!--
109
+ TODO: Once we have trainer code docs, plug in here
110
+ -->
111
+
112
+ 🧪 **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
113
+
114
+ <!--
115
+ Use this once we have AI PM features:
116
+
117
+ **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
118
+
119
+ -->
120
+
121
+ ## 🛠️ Quickstart
122
+
123
+ Get started with Judgeval by installing our SDK using pip:
124
+
125
+ ```bash
126
+ pip install judgeval
127
+ ```
128
+
129
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
130
+
131
+ ```bash
132
+ export JUDGMENT_API_KEY=...
133
+ export JUDGMENT_ORG_ID=...
134
+ ```
135
+
136
+ **If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
137
+
138
+ ### Start monitoring with Judgeval
139
+
140
+ ```python
141
+ from judgeval.tracer import Tracer, wrap
142
+ from judgeval.data import Example
143
+ from judgeval.scorers import AnswerRelevancyScorer
144
+ from openai import OpenAI
145
+
146
+
147
+ judgment = Tracer(project_name="default_project")
148
+ client = wrap(OpenAI()) # tracks all LLM calls
149
+
150
+ @judgment.observe(span_type="tool")
151
+ def format_question(question: str) -> str:
152
+ # dummy tool
153
+ return f"Question : {question}"
154
+
155
+ @judgment.observe(span_type="function")
156
+ def run_agent(prompt: str) -> str:
157
+ task = format_question(prompt)
158
+ response = client.chat.completions.create(
159
+ model="gpt-5-mini",
160
+ messages=[{"role": "user", "content": task}]
161
+ )
162
+
163
+ judgment.async_evaluate( # trigger online monitoring
164
+ scorer=AnswerRelevancyScorer(threshold=0.5), # swap with any scorer
165
+ example=Example(input=task, actual_output=response), # customize to your data
166
+ model="gpt-5",
167
+ )
168
+ return response.choices[0].message.content
169
+
170
+ run_agent("What is the capital of the United States?")
171
+ ```
172
+
173
+ Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
174
+
175
+ ![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
176
+
177
+
178
+ ### Customizable Scorers Over Agent Behavior
179
+
180
+ Judgeval's strongest suit is the full customization it gives you over the scorers you run online monitoring with. You aren't restricted to single-prompt LLM judges or prefab scorers - if you can express your scorer
181
+ in Python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
182
+
183
+
184
+ First, create a behavior scorer in a file called `helpfulness_scorer.py`:
185
+
186
+ ```python
187
+ from judgeval.data import Example
188
+ from judgeval.scorers.example_scorer import ExampleScorer
189
+
190
+ # Define custom example class
191
+ class QuestionAnswer(Example):
192
+ question: str
193
+ answer: str
194
+
195
+ # Define a server-hosted custom scorer
196
+ class HelpfulnessScorer(ExampleScorer):
197
+ name: str = "Helpfulness Scorer"
198
+ server_hosted: bool = True # Enable server hosting
199
+ async def a_score_example(self, example: QuestionAnswer):
200
+ # Custom scoring logic for agent behavior
201
+ # Can be an arbitrary combination of code and LLM calls
202
+ if len(example.answer) > 10 and "?" not in example.answer:
203
+ self.reason = "Answer is detailed and provides helpful information"
204
+ return 1.0
205
+ else:
206
+ self.reason = "Answer is too brief or unclear"
207
+ return 0.0
208
+ ```
209
+
210
+ Then deploy your scorer to Judgment's infrastructure:
211
+
212
+ ```bash
213
+ echo "pydantic" > requirements.txt
214
+ uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
215
+ ```
216
+
217
+ Now you can instrument your agent with monitoring and online evaluation:
218
+
219
+ ```python
220
+ from judgeval.tracer import Tracer, wrap
221
+ from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
222
+ from openai import OpenAI
223
+
224
+ judgment = Tracer(project_name="default_project")
225
+ client = wrap(OpenAI()) # tracks all LLM calls
226
+
227
+ @judgment.observe(span_type="tool")
228
+ def format_task(question: str) -> str: # replace with your prompt engineering
229
+ return f"Please answer the following question: {question}"
230
+
231
+ @judgment.observe(span_type="tool")
232
+ def answer_question(prompt: str) -> str: # replace with your LLM system calls
233
+ response = client.chat.completions.create(
234
+ model="gpt-5-mini",
235
+ messages=[{"role": "user", "content": prompt}]
236
+ )
237
+ return response.choices[0].message.content
238
+
239
+ @judgment.observe(span_type="function")
240
+ def run_agent(question: str) -> str:
241
+ task = format_task(question)
242
+ answer = answer_question(task)
243
+
244
+ # Add online evaluation with server-hosted scorer
245
+ judgment.async_evaluate(
246
+ scorer=HelpfulnessScorer(),
247
+ example=QuestionAnswer(question=question, answer=answer),
248
+ sampling_rate=0.9 # Evaluate 90% of agent runs
249
+ )
250
+
251
+ return answer
252
+
253
+ if __name__ == "__main__":
254
+ result = run_agent("What is the capital of the United States?")
255
+ print(result)
256
+ ```
257
+
258
+ Congratulations! Your online eval result should look like this:
259
+
260
+ ![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
261
+
262
+ You can now run any online scorer in secure Firecracker microVMs with no latency impact on your applications.
263
+
264
+ ---
265
+
266
+ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
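
The trainer snippet in the README above passes `scorers=[RewardScorer()]` without defining that class. Below is a minimal sketch of what such a reward scorer could look like, assuming it follows the same `ExampleScorer` pattern as the `HelpfulnessScorer` shown later in the README; the `WikiRaceResult` fields and the scoring rule are hypothetical:

```python
from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer


class WikiRaceResult(Example):
    # Hypothetical example type describing one training rollout's outcome
    target_page: str
    final_page: str
    num_hops: int


class RewardScorer(ExampleScorer):
    name: str = "Reward Scorer"

    async def a_score_example(self, example: WikiRaceResult):
        # Hypothetical reward: full credit for reaching the target page,
        # discounted by the number of hops taken
        if example.final_page == example.target_page:
            self.reason = f"Reached target in {example.num_hops} hops"
            return max(0.1, 1.0 - 0.05 * example.num_hops)
        self.reason = "Did not reach the target page"
        return 0.0
```
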
@@ -0,0 +1,239 @@
1
+ <div align="center">
2
+
3
+ <a href="https://judgmentlabs.ai/">
4
+ <picture>
5
+ <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
6
+ <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
7
+ </picture>
8
+ </a>
9
+
10
+ <br>
11
+
12
+ ## Agent Behavior Monitoring (ABM)
13
+
14
+ Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
15
+
16
+ [![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
17
+ [![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
18
+ [![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
19
+
20
+
21
+ [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
22
+ [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
23
+
24
+ </div>
25
+
26
+
28
+
29
+ ## [NEW] 🎆 Agent Reinforcement Learning
30
+
31
+ Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
32
+
33
+ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
34
+
35
+ ```python
36
+ await trainer.train(
37
+ agent_function=your_agent_function, # entry point to your agent
38
+ scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
39
+ prompts=training_prompts, # Tasks
40
+ rft_provider="fireworks"
41
+ )
42
+ ```
43
+
44
+ **That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
45
+
46
+ 👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
47
+
48
+
49
+ You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
50
+
51
+
52
+ ## Judgeval Overview
53
+
54
+ Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).
55
+
56
+ Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
57
+
58
+ ## 📚 Cookbooks
59
+
60
+ | Try Out | Notebook | Description |
61
+ |:---------|:-----|:------------|
62
+ | RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
63
+ | Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
64
+ | Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
65
+ | Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
66
+
67
+ You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
68
+
69
+ You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
70
+
71
+ ## Why Judgeval?
72
+
73
+ 🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
74
+
75
+ ⚙️ **Custom Evaluators**: You aren't restricted to monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate with our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
76
+
77
+ 🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
78
+
79
+ 📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
80
+ <!-- Add link to Bucketing docs once we have it -->
81
+ <!--
82
+ TODO: Once we have trainer code docs, plug in here
83
+ -->
84
+
85
+ 🧪 **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
86
+
87
+ <!--
88
+ Use this once we have AI PM features:
89
+
90
+ **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
91
+
92
+ -->
93
+
94
+ ## 🛠️ Quickstart
95
+
96
+ Get started with Judgeval by installing our SDK using pip:
97
+
98
+ ```bash
99
+ pip install judgeval
100
+ ```
101
+
102
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
103
+
104
+ ```bash
105
+ export JUDGMENT_API_KEY=...
106
+ export JUDGMENT_ORG_ID=...
107
+ ```
108
+
109
+ **If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
110
+
111
+ ### Start monitoring with Judgeval
112
+
113
+ ```python
114
+ from judgeval.tracer import Tracer, wrap
115
+ from judgeval.data import Example
116
+ from judgeval.scorers import AnswerRelevancyScorer
117
+ from openai import OpenAI
118
+
119
+
120
+ judgment = Tracer(project_name="default_project")
121
+ client = wrap(OpenAI()) # tracks all LLM calls
122
+
123
+ @judgment.observe(span_type="tool")
124
+ def format_question(question: str) -> str:
125
+ # dummy tool
126
+ return f"Question : {question}"
127
+
128
+ @judgment.observe(span_type="function")
129
+ def run_agent(prompt: str) -> str:
130
+ task = format_question(prompt)
131
+ response = client.chat.completions.create(
132
+ model="gpt-5-mini",
133
+ messages=[{"role": "user", "content": task}]
134
+ )
135
+
136
+ judgment.async_evaluate( # trigger online monitoring
137
+ scorer=AnswerRelevancyScorer(threshold=0.5), # swap with any scorer
138
+ example=Example(input=task, actual_output=response), # customize to your data
139
+ model="gpt-5",
140
+ )
141
+ return response.choices[0].message.content
142
+
143
+ run_agent("What is the capital of the United States?")
144
+ ```
145
+
146
+ Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
147
+
148
+ ![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
149
+
150
+
151
+ ### Customizable Scorers Over Agent Behavior
152
+
153
+ Judgeval's strongest suit is the full customization it gives you over the scorers you run online monitoring with. You aren't restricted to single-prompt LLM judges or prefab scorers - if you can express your scorer
154
+ in Python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
155
+
156
+
157
+ First, create a behavior scorer in a file called `helpfulness_scorer.py`:
158
+
159
+ ```python
160
+ from judgeval.data import Example
161
+ from judgeval.scorers.example_scorer import ExampleScorer
162
+
163
+ # Define custom example class
164
+ class QuestionAnswer(Example):
165
+ question: str
166
+ answer: str
167
+
168
+ # Define a server-hosted custom scorer
169
+ class HelpfulnessScorer(ExampleScorer):
170
+ name: str = "Helpfulness Scorer"
171
+ server_hosted: bool = True # Enable server hosting
172
+ async def a_score_example(self, example: QuestionAnswer):
173
+ # Custom scoring logic for agent behavior
174
+ # Can be an arbitrary combination of code and LLM calls
175
+ if len(example.answer) > 10 and "?" not in example.answer:
176
+ self.reason = "Answer is detailed and provides helpful information"
177
+ return 1.0
178
+ else:
179
+ self.reason = "Answer is too brief or unclear"
180
+ return 0.0
181
+ ```
182
+
183
+ Then deploy your scorer to Judgment's infrastructure:
184
+
185
+ ```bash
186
+ echo "pydantic" > requirements.txt
187
+ uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
188
+ ```
189
+
190
+ Now you can instrument your agent with monitoring and online evaluation:
191
+
192
+ ```python
193
+ from judgeval.tracer import Tracer, wrap
194
+ from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
195
+ from openai import OpenAI
196
+
197
+ judgment = Tracer(project_name="default_project")
198
+ client = wrap(OpenAI()) # tracks all LLM calls
199
+
200
+ @judgment.observe(span_type="tool")
201
+ def format_task(question: str) -> str: # replace with your prompt engineering
202
+ return f"Please answer the following question: {question}"
203
+
204
+ @judgment.observe(span_type="tool")
205
+ def answer_question(prompt: str) -> str: # replace with your LLM system calls
206
+ response = client.chat.completions.create(
207
+ model="gpt-5-mini",
208
+ messages=[{"role": "user", "content": prompt}]
209
+ )
210
+ return response.choices[0].message.content
211
+
212
+ @judgment.observe(span_type="function")
213
+ def run_agent(question: str) -> str:
214
+ task = format_task(question)
215
+ answer = answer_question(task)
216
+
217
+ # Add online evaluation with server-hosted scorer
218
+ judgment.async_evaluate(
219
+ scorer=HelpfulnessScorer(),
220
+ example=QuestionAnswer(question=question, answer=answer),
221
+ sampling_rate=0.9 # Evaluate 90% of agent runs
222
+ )
223
+
224
+ return answer
225
+
226
+ if __name__ == "__main__":
227
+ result = run_agent("What is the capital of the United States?")
228
+ print(result)
229
+ ```
230
+
231
+ Congratulations! Your online eval result should look like this:
232
+
233
+ ![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
234
+
235
+ You can now run any online scorer in secure Firecracker microVMs with no latency impact on your applications.
236
+
237
+ ---
238
+
239
+ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
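
The files-changed table also shows the tracer gaining per-provider wrapper packages (`llm_openai`, `llm_anthropic`, `llm_google`, `llm_groq`, `llm_together`), while the README only demonstrates `wrap(OpenAI())`. Assuming `wrap()` dispatches on client type for the other providers in the same way (not confirmed by this diff), Anthropic usage would look roughly like the sketch below; the model name is just an example:

```python
from anthropic import Anthropic
from judgeval.tracer import Tracer, wrap

judgment = Tracer(project_name="default_project")

# Assumption: wrap() recognizes Anthropic clients via the new llm_anthropic wrapper,
# just as it recognizes OpenAI clients in the quickstart above.
client = wrap(Anthropic())


@judgment.observe(span_type="function")
def ask(question: str) -> str:
    response = client.messages.create(
        model="claude-3-5-sonnet-latest",
        max_tokens=256,
        messages=[{"role": "user", "content": question}],
    )
    return response.content[0].text


print(ask("What is the capital of the United States?"))
```
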
@@ -0,0 +1,7 @@
1
+ <svg width="544" height="91" viewBox="0 0 544 91" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <path d="M31.2246 18H39.5512V51.3061L31.2246 59.6327V18Z" fill="#FF4B2E"/>
3
+ <path d="M0 59.6328H31.2245L21.8571 69.0002H0V59.6328Z" fill="#FF4B2E"/>
4
+ <path d="M52.041 18H43.7145V51.3061L52.041 59.6327V18Z" fill="#FF4B2E"/>
5
+ <path d="M83.2656 59.6328H52.0411L62.4493 69.0002H83.2656V59.6328Z" fill="#FF4B2E"/>
6
+ <path d="M111.45 61.3V54.37H116.63V59.55L121.39 64.24H133.36L137.35 60.32V20H142.67V62L135.67 69H119.15L111.45 61.3ZM147.896 62.56V34.14H153.076V60.95L156.576 64.38H163.576L172.256 55.7V34.14H177.436V69H172.396V61.58L164.976 69H154.336L147.896 62.56ZM182.363 62.56V40.58L188.803 34.14H202.243L207.983 39.18V19.02H213.163V69H208.123V62.63L201.753 69H188.803L182.363 62.56ZM200.633 64.38L207.983 57.03V44.64L201.263 38.76H191.043L187.543 42.19V60.95L191.043 64.38H200.633ZM248.869 34.14V77.89L242.499 84.26H225.209L219.819 78.87V74.6H224.999V77.19L227.449 79.64H240.189L243.689 76.21V63.19L237.249 69H224.509L218.069 62.56V40.58L224.509 34.14H237.739L243.829 40.23V34.14H248.869ZM243.689 46.11L236.339 38.76H226.749L223.249 42.19V60.95L226.749 64.38H236.409L243.689 57.59V46.11ZM254.474 34.14H259.514V40.86L266.234 34.14H274.564L280.024 39.6L285.484 34.14H296.474L302.914 40.58V69H297.734V42.19L294.234 38.76H286.534L281.634 43.66V69H276.594V42.19L273.094 38.76H267.214L259.654 46.32V69H254.474V34.14ZM307.458 62.56V40.58L313.898 34.14H331.468L337.978 40.58V53.11H312.638V60.95L316.138 64.38H329.228L332.728 60.95V58.29H337.908V62.56L331.468 69H313.898L307.458 62.56ZM332.798 48.63V42.19L329.298 38.76H316.138L312.638 42.19V48.63H332.798ZM342.496 34.14H347.536V41.56L354.956 34.14H365.666L372.106 40.58V69H366.926V42.19L363.426 38.76H356.356L347.676 47.44V69H342.496V34.14ZM379.848 62.56V38.69H373.548V34.14H379.988V22.8H385.028V34.14H395.948V38.69H385.028V60.95L388.528 64.45H395.948V69H386.288L379.848 62.56ZM411.613 20H416.933V64.31H441.853V69H411.613V20ZM442.227 63.26V54.37L447.967 48.7H466.587V42.05L463.087 38.62H451.187L447.687 42.05V44.92H442.507V40.58L448.947 34.14H465.257L471.697 40.58V69H466.727V62.84L460.287 69H447.967L442.227 63.26ZM459.237 64.52L466.587 57.45V53.18H450.207L447.407 55.91V61.79L450.207 64.52H459.237ZM476.932 62.56V19.02H482.112V40.93L488.902 34.14H501.152L507.592 40.58V62.56L501.152 69H483.372L476.932 62.56ZM498.912 64.38L502.412 60.95V42.19L498.912 38.76H490.372L482.112 47.02V60.95L485.612 64.38H498.912ZM510.751 63.26V58.92H515.931V61.79L518.731 64.52H531.611L534.411 61.79V56.4L531.611 53.6H516.561L511.031 48.07V39.88L516.771 34.14H533.151L538.891 39.88V44.22H533.711V41.35L530.911 38.62H519.011L516.211 41.35V46.46L519.011 49.26H533.851L539.591 55V63.26L533.851 69H516.491L510.751 63.26Z" fill="#F4F4F5"/>
7
+ </svg>
@@ -0,0 +1,7 @@
1
+ <svg width="544" height="91" viewBox="0 0 544 91" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <path d="M31.2246 18H39.5512V51.3061L31.2246 59.6327V18Z" fill="#FF4B2E"/>
3
+ <path d="M0 59.6328H31.2245L21.8571 69.0002H0V59.6328Z" fill="#FF4B2E"/>
4
+ <path d="M52.041 18H43.7145V51.3061L52.041 59.6327V18Z" fill="#FF4B2E"/>
5
+ <path d="M83.2656 59.6328H52.0411L62.4493 69.0002H83.2656V59.6328Z" fill="#FF4B2E"/>
6
+ <path d="M111.45 61.3V54.37H116.63V59.55L121.39 64.24H133.36L137.35 60.32V20H142.67V62L135.67 69H119.15L111.45 61.3ZM147.896 62.56V34.14H153.076V60.95L156.576 64.38H163.576L172.256 55.7V34.14H177.436V69H172.396V61.58L164.976 69H154.336L147.896 62.56ZM182.363 62.56V40.58L188.803 34.14H202.243L207.983 39.18V19.02H213.163V69H208.123V62.63L201.753 69H188.803L182.363 62.56ZM200.633 64.38L207.983 57.03V44.64L201.263 38.76H191.043L187.543 42.19V60.95L191.043 64.38H200.633ZM248.869 34.14V77.89L242.499 84.26H225.209L219.819 78.87V74.6H224.999V77.19L227.449 79.64H240.189L243.689 76.21V63.19L237.249 69H224.509L218.069 62.56V40.58L224.509 34.14H237.739L243.829 40.23V34.14H248.869ZM243.689 46.11L236.339 38.76H226.749L223.249 42.19V60.95L226.749 64.38H236.409L243.689 57.59V46.11ZM254.474 34.14H259.514V40.86L266.234 34.14H274.564L280.024 39.6L285.484 34.14H296.474L302.914 40.58V69H297.734V42.19L294.234 38.76H286.534L281.634 43.66V69H276.594V42.19L273.094 38.76H267.214L259.654 46.32V69H254.474V34.14ZM307.458 62.56V40.58L313.898 34.14H331.468L337.978 40.58V53.11H312.638V60.95L316.138 64.38H329.228L332.728 60.95V58.29H337.908V62.56L331.468 69H313.898L307.458 62.56ZM332.798 48.63V42.19L329.298 38.76H316.138L312.638 42.19V48.63H332.798ZM342.496 34.14H347.536V41.56L354.956 34.14H365.666L372.106 40.58V69H366.926V42.19L363.426 38.76H356.356L347.676 47.44V69H342.496V34.14ZM379.848 62.56V38.69H373.548V34.14H379.988V22.8H385.028V34.14H395.948V38.69H385.028V60.95L388.528 64.45H395.948V69H386.288L379.848 62.56ZM411.613 20H416.933V64.31H441.853V69H411.613V20ZM442.227 63.26V54.37L447.967 48.7H466.587V42.05L463.087 38.62H451.187L447.687 42.05V44.92H442.507V40.58L448.947 34.14H465.257L471.697 40.58V69H466.727V62.84L460.287 69H447.967L442.227 63.26ZM459.237 64.52L466.587 57.45V53.18H450.207L447.407 55.91V61.79L450.207 64.52H459.237ZM476.932 62.56V19.02H482.112V40.93L488.902 34.14H501.152L507.592 40.58V62.56L501.152 69H483.372L476.932 62.56ZM498.912 64.38L502.412 60.95V42.19L498.912 38.76H490.372L482.112 47.02V60.95L485.612 64.38H498.912ZM510.751 63.26V58.92H515.931V61.79L518.731 64.52H531.611L534.411 61.79V56.4L531.611 53.6H516.561L511.031 48.07V39.88L516.771 34.14H533.151L538.891 39.88V44.22H533.711V41.35L530.911 38.62H519.011L516.211 41.35V46.46L519.011 49.26H533.851L539.591 55V63.26L533.851 69H516.491L510.751 63.26Z" fill="black"/>
7
+ </svg>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.15.0"
3
+ version = "0.16.1"
4
4
  authors = [
5
5
  { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
6
6
  { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -73,7 +73,7 @@ class JudgmentSyncClient:
73
73
 
74
74
  def evaluate_examples(
75
75
  self, payload: ExampleEvaluationRun, stream: Optional[str] = None
76
- ) -> Any:
76
+ ) -> EvaluateResponse:
77
77
  query_params = {}
78
78
  if stream is not None:
79
79
  query_params["stream"] = stream
@@ -86,7 +86,7 @@ class JudgmentSyncClient:
86
86
 
87
87
  def evaluate_traces(
88
88
  self, payload: TraceEvaluationRun, stream: Optional[str] = None
89
- ) -> Any:
89
+ ) -> EvaluateResponse:
90
90
  query_params = {}
91
91
  if stream is not None:
92
92
  query_params["stream"] = stream
@@ -212,13 +212,6 @@ class JudgmentSyncClient:
212
212
  payload,
213
213
  )
214
214
 
215
- def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
216
- return self._request(
217
- "POST",
218
- url_for("/e2e_fetch_trace_scorer_span_score/"),
219
- payload,
220
- )
221
-
222
215
 
223
216
  class JudgmentAsyncClient:
224
217
  __slots__ = ("api_key", "organization_id", "client")
@@ -270,7 +263,7 @@ class JudgmentAsyncClient:
270
263
 
271
264
  async def evaluate_examples(
272
265
  self, payload: ExampleEvaluationRun, stream: Optional[str] = None
273
- ) -> Any:
266
+ ) -> EvaluateResponse:
274
267
  query_params = {}
275
268
  if stream is not None:
276
269
  query_params["stream"] = stream
@@ -283,7 +276,7 @@ class JudgmentAsyncClient:
283
276
 
284
277
  async def evaluate_traces(
285
278
  self, payload: TraceEvaluationRun, stream: Optional[str] = None
286
- ) -> Any:
279
+ ) -> EvaluateResponse:
287
280
  query_params = {}
288
281
  if stream is not None:
289
282
  query_params["stream"] = stream
@@ -411,13 +404,6 @@ class JudgmentAsyncClient:
411
404
  payload,
412
405
  )
413
406
 
414
- async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
415
- return await self._request(
416
- "POST",
417
- url_for("/e2e_fetch_trace_scorer_span_score/"),
418
- payload,
419
- )
420
-
421
407
 
422
408
  __all__ = [
423
409
  "JudgmentSyncClient",
@@ -1,6 +1,6 @@
1
1
  # generated by datamodel-codegen:
2
2
  # filename: .openapi.json
3
- # timestamp: 2025-09-30T18:06:51+00:00
3
+ # timestamp: 2025-10-09T00:16:42+00:00
4
4
 
5
5
  from __future__ import annotations
6
6
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -94,6 +94,7 @@ class ResolveProjectNameRequest(TypedDict):
94
94
 
95
95
  class ResolveProjectNameResponse(TypedDict):
96
96
  project_id: str
97
+ project_created: bool
97
98
 
98
99
 
99
100
  class TraceIdRequest(TypedDict):
@@ -146,6 +147,14 @@ class ValidationError(TypedDict):
146
147
  type: str
147
148
 
148
149
 
150
+ class UsageInfo(TypedDict):
151
+ total_judgees: int
152
+ regular_use: int
153
+ pay_as_you_go_use: int
154
+ remaining_regular: int
155
+ remaining_after: int
156
+
157
+
149
158
  DatasetKind = Literal["trace", "example"]
150
159
 
151
160
 
@@ -273,7 +282,6 @@ class OtelTraceListItem(TypedDict):
273
282
  trace_id: str
274
283
  created_at: str
275
284
  duration: NotRequired[Optional[int]]
276
- has_notification: NotRequired[Optional[bool]]
277
285
  tags: NotRequired[Optional[List[str]]]
278
286
  experiment_run_id: NotRequired[Optional[str]]
279
287
  span_name: NotRequired[Optional[str]]
@@ -281,6 +289,8 @@ class OtelTraceListItem(TypedDict):
281
289
  error: NotRequired[str]
282
290
  scores: NotRequired[List[OtelSpanListItemScores]]
283
291
  customer_id: NotRequired[Optional[str]]
292
+ input: NotRequired[Optional[str]]
293
+ output: NotRequired[Optional[str]]
284
294
  input_preview: NotRequired[Optional[str]]
285
295
  output_preview: NotRequired[Optional[str]]
286
296
  annotation_count: NotRequired[int]
@@ -312,6 +322,12 @@ class OtelSpanDetail(TypedDict):
312
322
  scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
313
323
 
314
324
 
325
+ class EvaluateResponse(TypedDict):
326
+ status: str
327
+ results: List[ScoringResult]
328
+ resource_usage: NotRequired[Optional[UsageInfo]]
329
+
330
+
315
331
  class EvalResults(TypedDict):
316
332
  results: List[ScoringResult]
317
333
  run: Union[ExampleEvaluationRun, TraceEvaluationRun]
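
Since the new `UsageInfo` and `EvaluateResponse` types are plain TypedDicts, a response from the evaluate endpoints is expected to be an ordinary dict of roughly this shape. The values below are made up for illustration, and the import path assumes the types live in `judgeval.api.api_types` as shown in this diff:

```python
from judgeval.api.api_types import EvaluateResponse, UsageInfo

# Illustrative values only; field names follow the TypedDicts added above.
usage: UsageInfo = {
    "total_judgees": 120,
    "regular_use": 100,
    "pay_as_you_go_use": 20,
    "remaining_regular": 400,
    "remaining_after": 380,
}

response: EvaluateResponse = {
    "status": "completed",    # illustrative status string
    "results": [],            # List[ScoringResult]; empty here for brevity
    "resource_usage": usage,  # NotRequired; may be omitted or None
}
```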