judgeval 0.5.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.5.0 → judgeval-0.7.0}/PKG-INFO +10 -47
- {judgeval-0.5.0 → judgeval-0.7.0}/README.md +6 -46
- {judgeval-0.5.0 → judgeval-0.7.0}/pyproject.toml +7 -1
- judgeval-0.7.0/src/judgeval/cli.py +65 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/api.py +44 -38
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/constants.py +18 -5
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/json_encoder.py +8 -9
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/core.py +448 -256
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/otel_span_processor.py +1 -1
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/span_processor.py +1 -1
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/span_transformer.py +2 -1
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/trace_manager.py +6 -1
- judgeval-0.7.0/src/judgeval/common/trainer/__init__.py +5 -0
- judgeval-0.7.0/src/judgeval/common/trainer/config.py +125 -0
- judgeval-0.7.0/src/judgeval/common/trainer/console.py +151 -0
- judgeval-0.7.0/src/judgeval/common/trainer/trainable_model.py +238 -0
- judgeval-0.7.0/src/judgeval/common/trainer/trainer.py +301 -0
- judgeval-0.7.0/src/judgeval/data/evaluation_run.py +104 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/judgment_types.py +37 -8
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/trace.py +1 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/trace_run.py +0 -2
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/integrations/langgraph.py +2 -1
- judgeval-0.7.0/src/judgeval/judgment_client.py +267 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/local_eval_queue.py +3 -5
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/run_evaluation.py +43 -299
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/base_scorer.py +9 -10
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0 → judgeval-0.7.0}/uv.lock +883 -25
- judgeval-0.5.0/src/judgeval/evaluation_run.py +0 -80
- judgeval-0.5.0/src/judgeval/judgment_client.py +0 -312
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.gitignore +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/.pre-commit-config.yaml +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/LICENSE.md +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/agent.gif +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/data.gif +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/document.gif +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/errors.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/experiments_page.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/logo-light.svg +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/online_eval.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/product_shot.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/test.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/tests.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace.gif +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace_demo.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/pytest.ini +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/.coveragerc +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/clients.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/storage/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/storage/s3_storage.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/constants.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/tracer/providers.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/constants.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/example.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/result.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/dataset.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/rules.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/utils/requests.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/version_check.py +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/src/update_types.sh +0 -0
- {judgeval-0.5.0 → judgeval-0.7.0}/update_version.py +0 -0
{judgeval-0.5.0 → judgeval-0.7.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.5.0
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +25,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
@@ -37,7 +40,7 @@ Description-Content-Type: text/markdown

 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with
+Enable self-learning agents with environment data and evals.
 </div>

 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -54,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing

 </div>

-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.

 ## 🎬 See Judgeval in Action

-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.

 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -67,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="
-<br><strong>📊
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -109,54 +112,14 @@ export JUDGMENT_ORG_ID=...

 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI()) # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-

 ## ✨ Features

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🏢 Self-Hosting

{judgeval-0.5.0 → judgeval-0.7.0}/README.md

@@ -5,7 +5,7 @@

 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with
+Enable self-learning agents with environment data and evals.
 </div>

 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -22,11 +22,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing

 </div>

-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.

 ## 🎬 See Judgeval in Action

-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.

 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -35,8 +35,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="
-<br><strong>📊
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -77,54 +77,14 @@ export JUDGMENT_ORG_ID=...

 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI()) # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-

 ## ✨ Features

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🏢 Self-Hosting

{judgeval-0.5.0 → judgeval-0.7.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.5.0"
+version = "0.7.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -29,12 +29,18 @@ dependencies = [
     "langchain-openai",
     "langchain-anthropic",
     "langchain-core",
+    "click<8.2.0",
+    "typer>=0.9.0",
+    "fireworks-ai>=0.19.18",
 ]

 [project.urls]
 Homepage = "https://github.com/JudgmentLabs/judgeval"
 Issues = "https://github.com/JudgmentLabs/judgeval/issues"

+[project.scripts]
+judgeval = "judgeval.cli:app"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
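Note: the new `[project.scripts]` table means that installing judgeval 0.7.0 also installs a `judgeval` console command, which dispatches to the Typer app defined in the new `src/judgeval/cli.py` shown next.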
judgeval-0.7.0/src/judgeval/cli.py

@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.common.logger import judgeval_logger
+from judgeval.judgment_client import JudgmentClient
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+
+
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+
+    try:
+        client = JudgmentClient()
+
+        result = client.upload_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+        )
+
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+
+        raise typer.Exit(0)
+    except Exception:
+        raise
+
+
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info("JudgEval CLI v0.0.0")
+
+
+if __name__ == "__main__":
+    app()
+
+# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
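Going by the commands registered above and the `[project.scripts]` entry point, the CLI can be driven from a shell (e.g. `judgeval upload_scorer path/to/scorer.py path/to/requirements.txt --unique-name my_scorer`, where the paths and name are illustrative) or exercised programmatically with Typer's test runner. A minimal sketch, assuming the package is installed:

```python
# Smoke-test the new CLI entry point via Typer's CliRunner (illustrative only).
from typer.testing import CliRunner

from judgeval.cli import app

runner = CliRunner()
result = runner.invoke(app, ["version"])  # invokes the `version` command defined above
print(result.exit_code)

# upload_scorer takes the scorer file and requirements file as positional arguments, e.g.:
# runner.invoke(app, ["upload_scorer", "my_scorer.py", "requirements.txt"])
```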
{judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/api.py

@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
-    JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
     JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
-    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -45,12 +43,11 @@ from judgeval.common.api.constants import (
     DeleteEvalRunRequestBody,
     EvalLogPayload,
     EvalStatusPayload,
-    CheckExperimentTypePayload,
-    EvalRunNameExistsPayload,
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
-
+    CustomScorerUploadPayload,
+    CustomScorerTemplateResponse,
 )
 from judgeval.utils.requests import requests
 from judgeval.common.api.json_encoder import json_encoder
@@ -97,14 +94,20 @@ class JudgmentApiClient:
         method: Literal["POST", "PATCH", "GET", "DELETE"],
         url: str,
         payload: Any,
+        timeout: Optional[Union[float, tuple]] = None,
     ) -> Any:
+        # Prepare request kwargs with optional timeout
+        request_kwargs = self._request_kwargs()
+        if timeout is not None:
+            request_kwargs["timeout"] = timeout
+
         if method == "GET":
             r = requests.request(
                 method,
                 url,
                 params=payload,
                 headers=self._headers(),
-                **
+                **request_kwargs,
             )
         else:
             r = requests.request(
@@ -112,7 +115,7 @@ class JudgmentApiClient:
                 url,
                 json=json_encoder(payload),
                 headers=self._headers(),
-                **
+                **request_kwargs,
             )

         try:
@@ -186,10 +189,10 @@ class JudgmentApiClient:
         payload: EvalLogPayload = {"results": results, "run": run}
         return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)

-    def fetch_evaluation_results(self,
+    def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
         payload: EvalRunRequestBody = {
             "project_name": project_name,
-            "
+            "experiment_run_id": experiment_run_id,
         }
         return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)

@@ -204,43 +207,21 @@ class JudgmentApiClient:
     def add_to_evaluation_queue(self, payload: Dict[str, Any]):
         return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)

-    def get_evaluation_status(self,
+    def get_evaluation_status(self, experiment_run_id: str, project_name: str):
         payload: EvalStatusPayload = {
-            "
+            "experiment_run_id": experiment_run_id,
             "project_name": project_name,
             "judgment_api_key": self.api_key,
         }
         return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)

-    def
-
-
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-            "is_trace": is_trace,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
-
-    def check_eval_run_name_exists(self, eval_name: str, project_name: str):
-        payload: EvalRunNameExistsPayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-        }
-        return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
-
-    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
-        payload: CheckExampleKeysPayload = {
-            "keys": keys,
-            "eval_name": eval_name,
-            "project_name": project_name,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
-
-    def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
+    def save_scorer(
+        self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
+    ):
         payload: ScorerSavePayload = {
             "name": name,
             "prompt": prompt,
+            "threshold": threshold,
             "options": options,
         }
         try:
@@ -292,6 +273,31 @@ class JudgmentApiClient:
                 request=e.request,
             )

+    def upload_custom_scorer(
+        self,
+        scorer_name: str,
+        scorer_code: str,
+        requirements_text: str,
+    ) -> CustomScorerTemplateResponse:
+        """Upload custom scorer to backend"""
+        payload: CustomScorerUploadPayload = {
+            "scorer_name": scorer_name,
+            "scorer_code": scorer_code,
+            "requirements_text": requirements_text,
+        }
+
+        try:
+            # Use longer timeout for custom scorer upload (5 minutes)
+            response = self._do_request(
+                "POST",
+                JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
+                payload,
+                timeout=(10, 300),
+            )
+            return response
+        except JudgmentAPIException as e:
+            raise e
+
     def push_dataset(
         self,
         dataset_alias: str,
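Here `timeout=(10, 300)` is the `requests` (connect, read) tuple form, so scorer uploads get 10 seconds to establish a connection and up to five minutes to finish, while other calls keep whatever `_request_kwargs()` already provides. This low-level method takes the scorer source and requirements as strings; the CLI instead goes through the higher-level `JudgmentClient.upload_custom_scorer`, which takes file paths. A rough sketch of that programmatic path (file names are illustrative, and valid `JUDGMENT_API_KEY`/`JUDGMENT_ORG_ID` credentials are assumed):

```python
# Programmatic equivalent of `judgeval upload_scorer ...` (sketch, not verbatim SDK docs).
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()
result = client.upload_custom_scorer(
    scorer_file_path="profile_match_scorer.py",  # path to the custom scorer source
    requirements_file_path="requirements.txt",   # extra dependencies the scorer needs
    unique_name=None,                            # optional; auto-detected when omitted
)
if not result:
    raise RuntimeError("Failed to upload custom scorer")
```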
{judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/constants.py

@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
-
-
-
+
+# Custom Scorers API
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"


 # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):


 class EvalStatusPayload(TypedDict):
-
-    project_name: str
+    experiment_run_id: str
     judgment_api_key: str
+    project_name: str


 class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
 class ScorerSavePayload(TypedDict):
     name: str
     prompt: str
+    threshold: float
     options: Optional[dict]


@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):

 class ScorerExistsPayload(TypedDict):
     name: str
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
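These TypedDicts only describe the request and response shapes; a short sketch of the payload the low-level client assembles for an upload (the scorer name and file names are illustrative):

```python
# Illustrative payload matching CustomScorerUploadPayload above.
from judgeval.common.api.constants import CustomScorerUploadPayload

payload: CustomScorerUploadPayload = {
    "scorer_name": "profile_match",                        # hypothetical scorer name
    "scorer_code": open("profile_match_scorer.py").read(),
    "requirements_text": open("requirements.txt").read(),
}
```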
{judgeval-0.5.0 → judgeval-0.7.0}/src/judgeval/common/api/json_encoder.py

@@ -84,7 +84,7 @@ def json_encoder(
     )

     # Sequences
-    if isinstance(obj, (list, set, frozenset,
+    if isinstance(obj, (list, set, frozenset, tuple, deque)):
         return _dump_sequence(
             obj=obj,
         )
@@ -169,16 +169,15 @@ def _dump_other(
     obj: Any,
 ) -> Any:
     """
-    Dump an object to a
+    Dump an object to a representation without iterating it.
+
+    Avoids calling dict(obj) which can consume iterators/generators or
+    invoke user-defined iteration protocols.
     """
     try:
-        data = dict(obj)
-    except Exception:
         return repr(obj)
-
-
-        data,
-    )
+    except Exception:
+        return str(obj)


 def iso_format(o: Union[datetime.date, datetime.time]) -> str:
@@ -218,7 +217,7 @@ ENCODERS_BY_TYPE: Dict[Type[Any], Callable[[Any], Any]] = {
     Enum: lambda o: o.value,
     frozenset: list,
     deque: list,
-    GeneratorType:
+    GeneratorType: repr,
     Path: str,
     Pattern: lambda o: o.pattern,
     SecretBytes: str,
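The net effect of the encoder changes: objects that cannot be safely converted now fall back to `repr()`/`str()` instead of being coerced with `dict(...)`, and generators are mapped straight to `repr`, so encoding no longer consumes them. A rough sketch, assuming `json_encoder` accepts the object to encode as its first argument:

```python
# Illustrative only -- the exact output format is an assumption.
from judgeval.common.api.json_encoder import json_encoder

gen = (i * i for i in range(3))
print(json_encoder(gen))  # something like "<generator object <genexpr> at 0x...>"
print(next(gen))          # 0 -- the generator is still intact after encoding
```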