judgeval 0.5.0__tar.gz → 0.7.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (128)
  1. {judgeval-0.5.0 → judgeval-0.7.0}/PKG-INFO +10 -47
  2. {judgeval-0.5.0 → judgeval-0.7.0}/README.md +6 -46
  3. {judgeval-0.5.0 → judgeval-0.7.0}/pyproject.toml +7 -1
  4. judgeval-0.7.0/src/judgeval/cli.py +65 -0
  5. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/api.py +44 -38
  6. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/constants.py +18 -5
  7. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/json_encoder.py +8 -9
  8. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/core.py +448 -256
  9. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/otel_span_processor.py +1 -1
  10. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/span_processor.py +1 -1
  11. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/span_transformer.py +2 -1
  12. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/trace_manager.py +6 -1
  13. judgeval-0.7.0/src/judgeval/common/trainer/__init__.py +5 -0
  14. judgeval-0.7.0/src/judgeval/common/trainer/config.py +125 -0
  15. judgeval-0.7.0/src/judgeval/common/trainer/console.py +151 -0
  16. judgeval-0.7.0/src/judgeval/common/trainer/trainable_model.py +238 -0
  17. judgeval-0.7.0/src/judgeval/common/trainer/trainer.py +301 -0
  18. judgeval-0.7.0/src/judgeval/data/evaluation_run.py +104 -0
  19. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/judgment_types.py +37 -8
  20. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/trace.py +1 -0
  21. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/trace_run.py +0 -2
  22. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/integrations/langgraph.py +2 -1
  23. judgeval-0.7.0/src/judgeval/judgment_client.py +267 -0
  24. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/local_eval_queue.py +3 -5
  25. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/run_evaluation.py +43 -299
  26. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/base_scorer.py +9 -10
  27. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
  28. {judgeval-0.5.0 → judgeval-0.7.0}/uv.lock +883 -25
  29. judgeval-0.5.0/src/judgeval/evaluation_run.py +0 -80
  30. judgeval-0.5.0/src/judgeval/judgment_client.py +0 -312
  31. {judgeval-0.5.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  32. {judgeval-0.5.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  33. {judgeval-0.5.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  34. {judgeval-0.5.0 → judgeval-0.7.0}/.github/pull_request_template.md +0 -0
  35. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/blocked-pr.yaml +0 -0
  36. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/ci.yaml +0 -0
  37. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/lint.yaml +0 -0
  38. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/merge-branch-check.yaml +0 -0
  39. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/mypy.yaml +0 -0
  40. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  41. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/release.yaml +0 -0
  42. {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/validate-branch.yaml +0 -0
  43. {judgeval-0.5.0 → judgeval-0.7.0}/.gitignore +0 -0
  44. {judgeval-0.5.0 → judgeval-0.7.0}/.pre-commit-config.yaml +0 -0
  45. {judgeval-0.5.0 → judgeval-0.7.0}/LICENSE.md +0 -0
  46. {judgeval-0.5.0 → judgeval-0.7.0}/"assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png" +0 -0
  47. {judgeval-0.5.0 → judgeval-0.7.0}/assets/agent.gif +0 -0
  48. {judgeval-0.5.0 → judgeval-0.7.0}/assets/agent_trace_example.png +0 -0
  49. {judgeval-0.5.0 → judgeval-0.7.0}/assets/data.gif +0 -0
  50. {judgeval-0.5.0 → judgeval-0.7.0}/assets/dataset_clustering_screenshot.png +0 -0
  51. {judgeval-0.5.0 → judgeval-0.7.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
  52. {judgeval-0.5.0 → judgeval-0.7.0}/assets/datasets_preview_screenshot.png +0 -0
  53. {judgeval-0.5.0 → judgeval-0.7.0}/assets/document.gif +0 -0
  54. {judgeval-0.5.0 → judgeval-0.7.0}/assets/error_analysis_dashboard.png +0 -0
  55. {judgeval-0.5.0 → judgeval-0.7.0}/assets/errors.png +0 -0
  56. {judgeval-0.5.0 → judgeval-0.7.0}/assets/experiments_dashboard_screenshot.png +0 -0
  57. {judgeval-0.5.0 → judgeval-0.7.0}/assets/experiments_page.png +0 -0
  58. {judgeval-0.5.0 → judgeval-0.7.0}/assets/experiments_pagev2.png +0 -0
  59. {judgeval-0.5.0 → judgeval-0.7.0}/assets/logo-dark.svg +0 -0
  60. {judgeval-0.5.0 → judgeval-0.7.0}/assets/logo-light.svg +0 -0
  61. {judgeval-0.5.0 → judgeval-0.7.0}/assets/monitoring_screenshot.png +0 -0
  62. {judgeval-0.5.0 → judgeval-0.7.0}/assets/new_darkmode.svg +0 -0
  63. {judgeval-0.5.0 → judgeval-0.7.0}/assets/new_lightmode.svg +0 -0
  64. {judgeval-0.5.0 → judgeval-0.7.0}/assets/online_eval.png +0 -0
  65. {judgeval-0.5.0 → judgeval-0.7.0}/assets/product_shot.png +0 -0
  66. {judgeval-0.5.0 → judgeval-0.7.0}/assets/test.png +0 -0
  67. {judgeval-0.5.0 → judgeval-0.7.0}/assets/tests.png +0 -0
  68. {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace.gif +0 -0
  69. {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace_demo.png +0 -0
  70. {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace_screenshot.png +0 -0
  71. {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace_screenshot_old.png +0 -0
  72. {judgeval-0.5.0 → judgeval-0.7.0}/pytest.ini +0 -0
  73. {judgeval-0.5.0 → judgeval-0.7.0}/src/.coveragerc +0 -0
  74. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/__init__.py +0 -0
  75. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/clients.py +0 -0
  76. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/__init__.py +0 -0
  77. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/__init__.py +0 -0
  78. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/exceptions.py +0 -0
  79. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/logger.py +0 -0
  80. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/storage/__init__.py +0 -0
  81. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/storage/s3_storage.py +0 -0
  82. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/__init__.py +0 -0
  83. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/constants.py +0 -0
  84. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
  85. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/providers.py +0 -0
  86. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/utils.py +0 -0
  87. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/constants.py +0 -0
  88. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/__init__.py +0 -0
  89. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/example.py +0 -0
  90. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/result.py +0 -0
  91. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/scorer_data.py +0 -0
  92. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  93. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  94. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/tool.py +0 -0
  95. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/dataset.py +0 -0
  96. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/__init__.py +0 -0
  97. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/base_judge.py +0 -0
  98. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/litellm_judge.py +0 -0
  99. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/mixture_of_judges.py +0 -0
  100. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/together_judge.py +0 -0
  101. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/utils.py +0 -0
  102. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/rules.py +0 -0
  103. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/__init__.py +0 -0
  104. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/agent_scorer.py +0 -0
  105. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/api_scorer.py +0 -0
  106. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/example_scorer.py +0 -0
  107. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/exceptions.py +0 -0
  108. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  109. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  110. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  111. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  112. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  113. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  114. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  115. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  116. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  117. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  118. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  119. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/score.py +0 -0
  120. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/utils.py +0 -0
  121. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/tracer/__init__.py +0 -0
  122. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/alerts.py +0 -0
  123. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/async_utils.py +0 -0
  124. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/file_utils.py +0 -0
  125. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/requests.py +0 -0
  126. {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/version_check.py +0 -0
  127. {judgeval-0.5.0 → judgeval-0.7.0}/src/update_types.sh +0 -0
  128. {judgeval-0.5.0 → judgeval-0.7.0}/update_version.py +0 -0
{judgeval-0.5.0 → judgeval-0.7.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.5.0
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +25,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
@@ -37,7 +40,7 @@ Description-Content-Type: text/markdown
 
 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with traces, evals, and environment data.
+Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -54,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for tracing and evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -67,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Real-time Tracing</strong>
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -109,54 +112,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI()) # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
{judgeval-0.5.0 → judgeval-0.7.0}/README.md
@@ -5,7 +5,7 @@
 
 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with traces, evals, and environment data.
+Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -22,11 +22,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for tracing and evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -35,8 +35,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Real-time Tracing</strong>
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -77,54 +77,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI()) # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
{judgeval-0.5.0 → judgeval-0.7.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.5.0"
+version = "0.7.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -29,12 +29,18 @@ dependencies = [
     "langchain-openai",
     "langchain-anthropic",
     "langchain-core",
+    "click<8.2.0",
+    "typer>=0.9.0",
+    "fireworks-ai>=0.19.18",
 ]
 
 [project.urls]
 Homepage = "https://github.com/JudgmentLabs/judgeval"
 Issues = "https://github.com/JudgmentLabs/judgeval/issues"
 
+[project.scripts]
+judgeval = "judgeval.cli:app"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
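
The new `[project.scripts]` table is what turns the package into a console command: the installer generates a `judgeval` executable that imports `judgeval.cli` and calls the `app` object named after the colon. A minimal sketch of the equivalent manual invocation, assuming only the entries shown above:

```python
# What the generated `judgeval` console script effectively does: import the
# object named in [project.scripts] ("judgeval.cli:app") and call it.
from judgeval.cli import app

if __name__ == "__main__":
    app()  # parses sys.argv just as the installed `judgeval` command would
```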
judgeval-0.7.0/src/judgeval/cli.py (new file)
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.common.logger import judgeval_logger
+from judgeval.judgment_client import JudgmentClient
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+
+
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+
+    try:
+        client = JudgmentClient()
+
+        result = client.upload_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+        )
+
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+
+        raise typer.Exit(0)
+    except Exception:
+        raise
+
+
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info("JudgEval CLI v0.0.0")
+
+
+if __name__ == "__main__":
+    app()
+
+# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
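
For context on how this command is driven: Typer turns `scorer_file_path` and `requirements_file_path` into positional arguments and `unique_name` into a `--unique-name` option. A hedged sketch of exercising it in-process with Typer's bundled test runner; the file paths here are hypothetical:

```python
from typer.testing import CliRunner  # ships with typer

from judgeval.cli import app

runner = CliRunner()
result = runner.invoke(
    app,
    ["upload_scorer", "my_scorer.py", "requirements.txt",  # hypothetical paths
     "--unique-name", "my_scorer"],
)
print(result.exit_code, result.output)  # non-zero exit code signals a failed upload
```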
{judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/api.py
@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
-    JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
     JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
-    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -45,12 +43,11 @@ from judgeval.common.api.constants import (
     DeleteEvalRunRequestBody,
     EvalLogPayload,
     EvalStatusPayload,
-    CheckExperimentTypePayload,
-    EvalRunNameExistsPayload,
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
-    CheckExampleKeysPayload,
+    CustomScorerUploadPayload,
+    CustomScorerTemplateResponse,
 )
 from judgeval.utils.requests import requests
 from judgeval.common.api.json_encoder import json_encoder
@@ -97,14 +94,20 @@ class JudgmentApiClient:
         method: Literal["POST", "PATCH", "GET", "DELETE"],
         url: str,
         payload: Any,
+        timeout: Optional[Union[float, tuple]] = None,
     ) -> Any:
+        # Prepare request kwargs with optional timeout
+        request_kwargs = self._request_kwargs()
+        if timeout is not None:
+            request_kwargs["timeout"] = timeout
+
         if method == "GET":
             r = requests.request(
                 method,
                 url,
                 params=payload,
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
         else:
             r = requests.request(
@@ -112,7 +115,7 @@ class JudgmentApiClient:
                 url,
                 json=json_encoder(payload),
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
 
         try:
@@ -186,10 +189,10 @@ class JudgmentApiClient:
         payload: EvalLogPayload = {"results": results, "run": run}
         return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)
 
-    def fetch_evaluation_results(self, project_name: str, eval_name: str):
+    def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
         payload: EvalRunRequestBody = {
             "project_name": project_name,
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
         }
         return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)
 
@@ -204,43 +207,21 @@ class JudgmentApiClient:
     def add_to_evaluation_queue(self, payload: Dict[str, Any]):
         return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)
 
-    def get_evaluation_status(self, eval_name: str, project_name: str):
+    def get_evaluation_status(self, experiment_run_id: str, project_name: str):
         payload: EvalStatusPayload = {
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
             "project_name": project_name,
             "judgment_api_key": self.api_key,
         }
         return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)
 
-    def check_experiment_type(self, eval_name: str, project_name: str, is_trace: bool):
-        payload: CheckExperimentTypePayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-            "is_trace": is_trace,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
-
-    def check_eval_run_name_exists(self, eval_name: str, project_name: str):
-        payload: EvalRunNameExistsPayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-        }
-        return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
-
-    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
-        payload: CheckExampleKeysPayload = {
-            "keys": keys,
-            "eval_name": eval_name,
-            "project_name": project_name,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
-
-    def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
+    def save_scorer(
+        self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
+    ):
         payload: ScorerSavePayload = {
             "name": name,
             "prompt": prompt,
+            "threshold": threshold,
             "options": options,
         }
         try:
@@ -292,6 +273,31 @@ class JudgmentApiClient:
                 request=e.request,
             )
 
+    def upload_custom_scorer(
+        self,
+        scorer_name: str,
+        scorer_code: str,
+        requirements_text: str,
+    ) -> CustomScorerTemplateResponse:
+        """Upload custom scorer to backend"""
+        payload: CustomScorerUploadPayload = {
+            "scorer_name": scorer_name,
+            "scorer_code": scorer_code,
+            "requirements_text": requirements_text,
+        }
+
+        try:
+            # Use longer timeout for custom scorer upload (5 minutes)
+            response = self._do_request(
+                "POST",
+                JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
+                payload,
+                timeout=(10, 300),
+            )
+            return response
+        except JudgmentAPIException as e:
+            raise e
+
     def push_dataset(
         self,
         dataset_alias: str,
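
A note on `timeout=(10, 300)`: `requests` treats a 2-tuple timeout as separate connect and read timeouts, so the upload fails fast if the host is unreachable while still giving the server up to five minutes to process the scorer. A standalone sketch; the URL is a placeholder, not the real endpoint:

```python
import requests

# (connect timeout, read timeout) in seconds: abort after 10 s if no TCP
# connection is established, but wait up to 300 s for the response.
resp = requests.post(
    "https://example.invalid/upload_scorer/",  # placeholder URL
    json={"scorer_name": "demo", "scorer_code": "...", "requirements_text": ""},
    timeout=(10, 300),
)
```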
{judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/constants.py
@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
-JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
-JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
-JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
+
+# Custom Scorers API
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"
 
 
 # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):
 
 
 class EvalStatusPayload(TypedDict):
-    eval_name: str
-    project_name: str
+    experiment_run_id: str
     judgment_api_key: str
+    project_name: str
 
 
 class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
 class ScorerSavePayload(TypedDict):
     name: str
     prompt: str
+    threshold: float
     options: Optional[dict]
 
 
@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):
 
 class ScorerExistsPayload(TypedDict):
     name: str
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
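
These payloads are plain `TypedDict`s: ordinary dicts that type checkers validate statically, with no runtime enforcement. A minimal sketch with hypothetical values; the string contents would presumably come from the scorer file and requirements file the CLI accepts:

```python
from typing import TypedDict

class CustomScorerUploadPayload(TypedDict):
    scorer_name: str
    scorer_code: str
    requirements_text: str

# Hypothetical payload; at runtime this is just a dict.
payload: CustomScorerUploadPayload = {
    "scorer_name": "profile_match_scorer",
    "scorer_code": "class ProfileMatchScorer: ...",
    "requirements_text": "numpy\n",
}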
{judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/json_encoder.py
@@ -84,7 +84,7 @@ def json_encoder(
         )
 
     # Sequences
-    if isinstance(obj, (list, set, frozenset, GeneratorType, tuple, deque)):
+    if isinstance(obj, (list, set, frozenset, tuple, deque)):
         return _dump_sequence(
             obj=obj,
         )
@@ -169,16 +169,15 @@ def _dump_other(
     obj: Any,
 ) -> Any:
     """
-    Dump an object to a hashable object, using the same parameters as jsonable_encoder
+    Dump an object to a representation without iterating it.
+
+    Avoids calling dict(obj) which can consume iterators/generators or
+    invoke user-defined iteration protocols.
     """
     try:
-        data = dict(obj)
-    except Exception:
         return repr(obj)
-
-    return json_encoder(
-        data,
-    )
+    except Exception:
+        return str(obj)
 
 
 def iso_format(o: Union[datetime.date, datetime.time]) -> str:
@@ -218,7 +217,7 @@ ENCODERS_BY_TYPE: Dict[Type[Any], Callable[[Any], Any]] = {
     Enum: lambda o: o.value,
     frozenset: list,
     deque: list,
-    GeneratorType: list,
+    GeneratorType: repr,
     Path: str,
     Pattern: lambda o: o.pattern,
     SecretBytes: str,
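
The `GeneratorType: list` → `GeneratorType: repr` change trades fidelity for safety: materializing a generator consumes it, so encoding a value for transport could silently drain the caller's iterator. A small demonstration of the difference:

```python
from types import GeneratorType

gen = (i * i for i in range(3))
assert isinstance(gen, GeneratorType)
print(list(gen))               # old mapping: [0, 1, 4], but the generator is now spent
print(next(gen, "exhausted"))  # "exhausted" — the caller lost its values

gen2 = (i * i for i in range(3))
print(repr(gen2))              # new mapping: "<generator object ...>", nothing consumed
print(next(gen2))              # 0 — still usable by the caller
```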