judgeval 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.6.0 → judgeval-0.7.0}/PKG-INFO +8 -47
- {judgeval-0.6.0 → judgeval-0.7.0}/README.md +6 -46
- {judgeval-0.6.0 → judgeval-0.7.0}/pyproject.toml +2 -1
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/cli.py +1 -1
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/api/constants.py +1 -1
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/core.py +171 -1
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/trace_manager.py +6 -1
- judgeval-0.7.0/src/judgeval/common/trainer/__init__.py +5 -0
- judgeval-0.7.0/src/judgeval/common/trainer/config.py +125 -0
- judgeval-0.7.0/src/judgeval/common/trainer/console.py +151 -0
- judgeval-0.7.0/src/judgeval/common/trainer/trainable_model.py +238 -0
- judgeval-0.7.0/src/judgeval/common/trainer/trainer.py +301 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judgment_client.py +4 -104
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/run_evaluation.py +10 -107
- {judgeval-0.6.0 → judgeval-0.7.0}/uv.lock +739 -28
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.gitignore +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/.pre-commit-config.yaml +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/LICENSE.md +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/agent.gif +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/data.gif +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/document.gif +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/errors.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/experiments_page.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/logo-light.svg +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/online_eval.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/product_shot.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/test.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/tests.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/trace.gif +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/trace_demo.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/pytest.ini +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/.coveragerc +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/clients.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/api/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/api/api.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/api/json_encoder.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/storage/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/storage/s3_storage.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/constants.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/otel_span_processor.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/providers.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/span_processor.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/span_transformer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/constants.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/example.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/judgment_types.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/result.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/trace.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/data/trace_run.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/dataset.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/integrations/langgraph.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/local_eval_queue.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/rules.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/utils/requests.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/version_check.py +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/src/update_types.sh +0 -0
- {judgeval-0.6.0 → judgeval-0.7.0}/update_version.py +0 -0
**{judgeval-0.6.0 → judgeval-0.7.0}/PKG-INFO**

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.6.0
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
 Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -39,7 +40,7 @@ Description-Content-Type: text/markdown
 
 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with
+Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -56,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -69,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="
-<br><strong>📊
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -111,54 +112,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI())  # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
````
**{judgeval-0.6.0 → judgeval-0.7.0}/README.md**

````diff
@@ -5,7 +5,7 @@
 
 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with
+Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -22,11 +22,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -35,8 +35,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="
-<br><strong>📊
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -77,54 +77,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI())  # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
````
**{judgeval-0.6.0 → judgeval-0.7.0}/pyproject.toml**

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.6.0"
+version = "0.7.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -31,6 +31,7 @@ dependencies = [
     "langchain-core",
     "click<8.2.0",
     "typer>=0.9.0",
+    "fireworks-ai>=0.19.18",
 ]
 
 [project.urls]
```
**{judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/api/constants.py**

```diff
@@ -51,7 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 
 # Custom Scorers API
-JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"
 
 
 # Evaluation API Payloads
```
**{judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/core.py**

```diff
@@ -815,6 +815,8 @@ class Tracer:
         == "true",
         enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower()
         == "true",
+        show_trace_urls: bool = os.getenv("JUDGMENT_SHOW_TRACE_URLS", "true").lower()
+        == "true",
         # S3 configuration
         use_s3: bool = False,
         s3_bucket_name: Optional[str] = None,
@@ -859,6 +861,7 @@ class Tracer:
         self.traces: List[Trace] = []
         self.enable_monitoring: bool = enable_monitoring
         self.enable_evaluations: bool = enable_evaluations
+        self.show_trace_urls: bool = show_trace_urls
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
@@ -1731,6 +1734,93 @@ class Tracer:
                     f"Error during background service shutdown: {e}"
                 )
 
+    def trace_to_message_history(
+        self, trace: Union[Trace, TraceClient]
+    ) -> List[Dict[str, str]]:
+        """
+        Extract message history from a trace for training purposes.
+
+        This method processes trace spans to reconstruct the conversation flow,
+        extracting messages in chronological order from LLM, user, and tool spans.
+
+        Args:
+            trace: Trace or TraceClient instance to extract messages from
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+
+        Raises:
+            ValueError: If no trace is provided
+        """
+        if not trace:
+            raise ValueError("No trace provided")
+
+        # Handle both Trace and TraceClient objects
+        if isinstance(trace, TraceClient):
+            spans = trace.trace_spans
+        else:
+            spans = trace.trace_spans if hasattr(trace, "trace_spans") else []
+
+        messages = []
+        first_found = False
+
+        # Process spans in chronological order
+        for span in sorted(
+            spans, key=lambda s: s.created_at if hasattr(s, "created_at") else 0
+        ):
+            # Skip spans without output (except for first LLM span which may have input messages)
+            if span.output is None and span.span_type != "llm":
+                continue
+
+            if span.span_type == "llm":
+                # For the first LLM span, extract input messages (system + user prompts)
+                if not first_found and hasattr(span, "inputs") and span.inputs:
+                    input_messages = span.inputs.get("messages", [])
+                    if input_messages:
+                        first_found = True
+                        # Add input messages (typically system and user messages)
+                        for msg in input_messages:
+                            if (
+                                isinstance(msg, dict)
+                                and "role" in msg
+                                and "content" in msg
+                            ):
+                                messages.append(
+                                    {"role": msg["role"], "content": msg["content"]}
+                                )
+
+                # Add assistant response from span output
+                if span.output is not None:
+                    messages.append({"role": "assistant", "content": str(span.output)})
+
+            elif span.span_type == "user":
+                # Add user messages
+                if span.output is not None:
+                    messages.append({"role": "user", "content": str(span.output)})
+
+            elif span.span_type == "tool":
+                # Add tool responses as user messages (common pattern in training)
+                if span.output is not None:
+                    messages.append({"role": "user", "content": str(span.output)})
+
+        return messages
+
+    def get_current_message_history(self) -> List[Dict[str, str]]:
+        """
+        Get message history from the current trace.
+
+        Returns:
+            List of message dictionaries from the current trace context
+
+        Raises:
+            ValueError: If no current trace is found
+        """
+        current_trace = self.get_current_trace()
+        if not current_trace:
+            raise ValueError("No current trace found")
+
+        return self.trace_to_message_history(current_trace)
+
 
 def _get_current_trace(
     trace_across_async_contexts: bool = Tracer.trace_across_async_contexts,
```
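The two helpers above turn a recorded trace into chat-format training data. A minimal usage sketch (the agent body is illustrative, and `get_current_message_history` must be called while a trace is active):

```python
from judgeval.tracer import Tracer

judgment = Tracer(project_name="my_project")

@judgment.observe(span_type="function")
def run_agent(prompt: str) -> list[dict[str, str]]:
    # ... traced LLM and tool calls would happen here ...
    # While the trace is active, pull the conversation recorded so far:
    return judgment.get_current_message_history()

history = run_agent("What is the capital of the United States?")
# history ≈ [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
```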
**src/judgeval/common/tracer/core.py (continued)**

```diff
@@ -1746,7 +1836,7 @@ def wrap(
 ) -> Any:
     """
     Wraps an API client to add tracing capabilities.
-    Supports OpenAI, Together, Anthropic,
+    Supports OpenAI, Together, Anthropic, Google GenAI clients, and TrainableModel.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
     (
```
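Per the updated docstring, `wrap` now also accepts a `TrainableModel`. Combined with the `ModelConfig` persistence helpers introduced in the new trainer module below, usage might look like this (a sketch; the file path is illustrative, and `from_model_config` / `model="current"` follow the `ModelConfig` docstring later in this diff):

```python
from judgeval.tracer import Tracer, wrap
from judgeval.common.trainer import TrainableModel
from judgeval.common.trainer.config import ModelConfig

judgment = Tracer(project_name="my_project")

# Load a previously trained model and wrap it so every completion call is traced.
config = ModelConfig.load_from_file("my_trained_model.json")  # illustrative path
model = wrap(TrainableModel.from_model_config(config))

response = model.chat.completions.create(
    model="current",  # per the ModelConfig docstring, resolves to the loaded trained model
    messages=[{"role": "user", "content": "Hello!"}],
)
```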
**src/judgeval/common/tracer/core.py (continued)**

```diff
@@ -1871,6 +1961,39 @@ def wrap(
         setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (groq_AsyncGroq)):
         setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    # Check for TrainableModel from judgeval.common.trainer
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # Define a wrapper function that can be reapplied to new model instances
+            def wrap_model_instance(model_instance):
+                """Wrap a model instance with tracing functionality"""
+                if hasattr(model_instance, "chat") and hasattr(
+                    model_instance.chat, "completions"
+                ):
+                    if hasattr(model_instance.chat.completions, "create"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "create",
+                            wrapped(model_instance.chat.completions.create),
+                        )
+                    if hasattr(model_instance.chat.completions, "acreate"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "acreate",
+                            wrapped_async(model_instance.chat.completions.acreate),
+                        )
+
+            # Register the wrapper function with the TrainableModel
+            client._register_tracer_wrapper(wrap_model_instance)
+
+            # Apply wrapping to the current model
+            wrap_model_instance(client._current_model)
+    except ImportError:
+        pass  # TrainableModel not available
+
     return client
 
```
```diff
@@ -1977,6 +2100,22 @@ def _get_client_config(
         return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (groq_AsyncGroq)):
         return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            return (
+                "FIREWORKS_TRAINABLE_MODEL_CALL",
+                client._current_model.chat.completions.create,
+                None,
+                None,
+                None,
+            )
+    except ImportError:
+        pass  # TrainableModel not available
+
     raise ValueError(f"Unsupported client type: {type(client)}")
@@ -2155,6 +2294,37 @@ def _format_output_data(
             cache_creation_input_tokens,
         )
 
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # TrainableModel uses Fireworks LLM internally, so response format should be similar to OpenAI
+            if (
+                hasattr(response, "model")
+                and hasattr(response, "usage")
+                and hasattr(response, "choices")
+            ):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                message_content = response.choices[0].message.content
+
+                # Use LiteLLM cost calculation with fireworks_ai prefix
+                # LiteLLM supports Fireworks AI models for cost calculation when prefixed with "fireworks_ai/"
+                fireworks_model_name = f"fireworks_ai/{model_name}"
+                return message_content, _create_usage(
+                    fireworks_model_name,
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read_input_tokens,
+                    cache_creation_input_tokens,
+                )
+    except ImportError:
+        pass  # TrainableModel not available
+
     judgeval_logger.warning(f"Unsupported client type: {type(client)}")
     return None, None
```
**{judgeval-0.6.0 → judgeval-0.7.0}/src/judgeval/common/tracer/trace_manager.py**

```diff
@@ -71,7 +71,12 @@ class TraceManagerClient:
 
         server_response = self.api_client.upsert_trace(trace_data)
 
-        if
+        if (
+            not offline_mode
+            and show_link
+            and "ui_results_url" in server_response
+            and self.tracer.show_trace_urls
+        ):
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={server_response['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
```
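Together with the new `show_trace_urls` parameter on `Tracer` (earlier in this diff), the link printout can now be suppressed. A sketch of the two equivalent knobs, using the environment variable name from the diff:

```python
import os

from judgeval.tracer import Tracer

# Either via the environment (read when the Tracer is constructed)...
os.environ["JUDGMENT_SHOW_TRACE_URLS"] = "false"

# ...or explicitly through the new constructor parameter.
judgment = Tracer(project_name="my_project", show_trace_urls=False)
```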
**judgeval-0.7.0/src/judgeval/common/trainer/config.py** (new file)

```diff
@@ -0,0 +1,125 @@
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+import json
+
+
+@dataclass
+class TrainerConfig:
+    """Configuration class for JudgmentTrainer parameters."""
+
+    deployment_id: str
+    user_id: str
+    model_id: str
+    base_model_name: str = "qwen2p5-7b-instruct"
+    rft_provider: str = "fireworks"
+    num_steps: int = 5
+    num_generations_per_prompt: int = (
+        4  # Number of rollouts/generations per input prompt
+    )
+    num_prompts_per_step: int = 4  # Number of input prompts to sample per training step
+    concurrency: int = 100
+    epochs: int = 1
+    learning_rate: float = 1e-5
+    accelerator_count: int = 1
+    accelerator_type: str = "NVIDIA_A100_80GB"
+    temperature: float = 1.5
+    max_tokens: int = 50
+    enable_addons: bool = True
+
+
+@dataclass
+class ModelConfig:
+    """
+    Configuration class for storing and loading trained model state.
+
+    This class enables persistence of trained models so they can be loaded
+    and used later without retraining.
+
+    Example usage:
+        trainer = JudgmentTrainer(config)
+        model_config = trainer.train(agent_function, scorers, prompts)
+
+        # Save the trained model configuration
+        model_config.save_to_file("my_trained_model.json")
+
+        # Later, load and use the trained model
+        loaded_config = ModelConfig.load_from_file("my_trained_model.json")
+        trained_model = TrainableModel.from_model_config(loaded_config)
+
+        # Use the trained model for inference
+        response = trained_model.chat.completions.create(
+            model="current",  # Uses the loaded trained model
+            messages=[{"role": "user", "content": "Hello!"}]
+        )
+    """
+
+    # Base model configuration
+    base_model_name: str
+    deployment_id: str
+    user_id: str
+    model_id: str
+    enable_addons: bool
+
+    # Training state
+    current_step: int
+    total_steps: int
+
+    # Current model information
+    current_model_name: Optional[str] = None
+    is_trained: bool = False
+
+    # Training parameters used (for reference)
+    training_params: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert ModelConfig to dictionary for serialization."""
+        return {
+            "base_model_name": self.base_model_name,
+            "deployment_id": self.deployment_id,
+            "user_id": self.user_id,
+            "model_id": self.model_id,
+            "enable_addons": self.enable_addons,
+            "current_step": self.current_step,
+            "total_steps": self.total_steps,
+            "current_model_name": self.current_model_name,
+            "is_trained": self.is_trained,
+            "training_params": self.training_params,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig":
+        """Create ModelConfig from dictionary."""
+        return cls(
+            base_model_name=data.get("base_model_name", "qwen2p5-7b-instruct"),
+            deployment_id=data.get("deployment_id", "my-base-deployment"),
+            user_id=data.get("user_id", ""),
+            model_id=data.get("model_id", ""),
+            enable_addons=data.get("enable_addons", True),
+            current_step=data.get("current_step", 0),
+            total_steps=data.get("total_steps", 0),
+            current_model_name=data.get("current_model_name"),
+            is_trained=data.get("is_trained", False),
+            training_params=data.get("training_params"),
+        )
+
+    def to_json(self) -> str:
+        """Convert ModelConfig to JSON string."""
+        return json.dumps(self.to_dict(), indent=2)
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "ModelConfig":
+        """Create ModelConfig from JSON string."""
+        data = json.loads(json_str)
+        return cls.from_dict(data)
+
+    def save_to_file(self, filepath: str):
+        """Save ModelConfig to a JSON file."""
+        with open(filepath, "w") as f:
+            f.write(self.to_json())
+
+    @classmethod
+    def load_from_file(cls, filepath: str) -> "ModelConfig":
+        """Load ModelConfig from a JSON file."""
+        with open(filepath, "r") as f:
+            json_str = f.read()
+        return cls.from_json(json_str)
```