judgeval 0.6.0__tar.gz → 0.7.1__tar.gz

This diff compares the contents of two publicly released package versions as published to a supported public registry. It is provided for informational purposes only.
Files changed (126)
  1. {judgeval-0.6.0 → judgeval-0.7.1}/PKG-INFO +8 -47
  2. {judgeval-0.6.0 → judgeval-0.7.1}/README.md +6 -46
  3. {judgeval-0.6.0 → judgeval-0.7.1}/pyproject.toml +2 -1
  4. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/cli.py +1 -1
  5. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/api/constants.py +1 -1
  6. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/core.py +171 -1
  7. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/trace_manager.py +6 -1
  8. judgeval-0.7.1/src/judgeval/common/trainer/__init__.py +5 -0
  9. judgeval-0.7.1/src/judgeval/common/trainer/config.py +125 -0
  10. judgeval-0.7.1/src/judgeval/common/trainer/console.py +151 -0
  11. judgeval-0.7.1/src/judgeval/common/trainer/trainable_model.py +238 -0
  12. judgeval-0.7.1/src/judgeval/common/trainer/trainer.py +301 -0
  13. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judgment_client.py +4 -104
  14. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/run_evaluation.py +10 -107
  15. {judgeval-0.6.0 → judgeval-0.7.1}/uv.lock +739 -28
  16. {judgeval-0.6.0 → judgeval-0.7.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  17. {judgeval-0.6.0 → judgeval-0.7.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  18. {judgeval-0.6.0 → judgeval-0.7.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  19. {judgeval-0.6.0 → judgeval-0.7.1}/.github/pull_request_template.md +0 -0
  20. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/blocked-pr.yaml +0 -0
  21. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/ci.yaml +0 -0
  22. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/lint.yaml +0 -0
  23. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/merge-branch-check.yaml +0 -0
  24. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/mypy.yaml +0 -0
  25. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  26. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/release.yaml +0 -0
  27. {judgeval-0.6.0 → judgeval-0.7.1}/.github/workflows/validate-branch.yaml +0 -0
  28. {judgeval-0.6.0 → judgeval-0.7.1}/.gitignore +0 -0
  29. {judgeval-0.6.0 → judgeval-0.7.1}/.pre-commit-config.yaml +0 -0
  30. {judgeval-0.6.0 → judgeval-0.7.1}/LICENSE.md +0 -0
  31. {judgeval-0.6.0 → judgeval-0.7.1}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  32. {judgeval-0.6.0 → judgeval-0.7.1}/assets/agent.gif +0 -0
  33. {judgeval-0.6.0 → judgeval-0.7.1}/assets/agent_trace_example.png +0 -0
  34. {judgeval-0.6.0 → judgeval-0.7.1}/assets/data.gif +0 -0
  35. {judgeval-0.6.0 → judgeval-0.7.1}/assets/dataset_clustering_screenshot.png +0 -0
  36. {judgeval-0.6.0 → judgeval-0.7.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
  37. {judgeval-0.6.0 → judgeval-0.7.1}/assets/datasets_preview_screenshot.png +0 -0
  38. {judgeval-0.6.0 → judgeval-0.7.1}/assets/document.gif +0 -0
  39. {judgeval-0.6.0 → judgeval-0.7.1}/assets/error_analysis_dashboard.png +0 -0
  40. {judgeval-0.6.0 → judgeval-0.7.1}/assets/errors.png +0 -0
  41. {judgeval-0.6.0 → judgeval-0.7.1}/assets/experiments_dashboard_screenshot.png +0 -0
  42. {judgeval-0.6.0 → judgeval-0.7.1}/assets/experiments_page.png +0 -0
  43. {judgeval-0.6.0 → judgeval-0.7.1}/assets/experiments_pagev2.png +0 -0
  44. {judgeval-0.6.0 → judgeval-0.7.1}/assets/logo-dark.svg +0 -0
  45. {judgeval-0.6.0 → judgeval-0.7.1}/assets/logo-light.svg +0 -0
  46. {judgeval-0.6.0 → judgeval-0.7.1}/assets/monitoring_screenshot.png +0 -0
  47. {judgeval-0.6.0 → judgeval-0.7.1}/assets/new_darkmode.svg +0 -0
  48. {judgeval-0.6.0 → judgeval-0.7.1}/assets/new_lightmode.svg +0 -0
  49. {judgeval-0.6.0 → judgeval-0.7.1}/assets/online_eval.png +0 -0
  50. {judgeval-0.6.0 → judgeval-0.7.1}/assets/product_shot.png +0 -0
  51. {judgeval-0.6.0 → judgeval-0.7.1}/assets/test.png +0 -0
  52. {judgeval-0.6.0 → judgeval-0.7.1}/assets/tests.png +0 -0
  53. {judgeval-0.6.0 → judgeval-0.7.1}/assets/trace.gif +0 -0
  54. {judgeval-0.6.0 → judgeval-0.7.1}/assets/trace_demo.png +0 -0
  55. {judgeval-0.6.0 → judgeval-0.7.1}/assets/trace_screenshot.png +0 -0
  56. {judgeval-0.6.0 → judgeval-0.7.1}/assets/trace_screenshot_old.png +0 -0
  57. {judgeval-0.6.0 → judgeval-0.7.1}/pytest.ini +0 -0
  58. {judgeval-0.6.0 → judgeval-0.7.1}/src/.coveragerc +0 -0
  59. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/__init__.py +0 -0
  60. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/clients.py +0 -0
  61. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/__init__.py +0 -0
  62. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/api/__init__.py +0 -0
  63. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/api/api.py +0 -0
  64. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/api/json_encoder.py +0 -0
  65. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/exceptions.py +0 -0
  66. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/logger.py +0 -0
  67. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/storage/__init__.py +0 -0
  68. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/storage/s3_storage.py +0 -0
  69. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/__init__.py +0 -0
  70. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/constants.py +0 -0
  71. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/otel_exporter.py +0 -0
  72. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/otel_span_processor.py +0 -0
  73. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/providers.py +0 -0
  74. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/span_processor.py +0 -0
  75. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/span_transformer.py +0 -0
  76. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/utils.py +0 -0
  77. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/constants.py +0 -0
  78. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/__init__.py +0 -0
  79. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/evaluation_run.py +0 -0
  80. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/example.py +0 -0
  81. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/judgment_types.py +0 -0
  82. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/result.py +0 -0
  83. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/scorer_data.py +0 -0
  84. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  85. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  86. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/tool.py +0 -0
  87. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/trace.py +0 -0
  88. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/data/trace_run.py +0 -0
  89. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/dataset.py +0 -0
  90. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/integrations/langgraph.py +0 -0
  91. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judges/__init__.py +0 -0
  92. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judges/base_judge.py +0 -0
  93. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judges/litellm_judge.py +0 -0
  94. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judges/mixture_of_judges.py +0 -0
  95. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judges/together_judge.py +0 -0
  96. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/judges/utils.py +0 -0
  97. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/local_eval_queue.py +0 -0
  98. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/rules.py +0 -0
  99. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/__init__.py +0 -0
  100. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/agent_scorer.py +0 -0
  101. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/api_scorer.py +0 -0
  102. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/base_scorer.py +0 -0
  103. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/example_scorer.py +0 -0
  104. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/exceptions.py +0 -0
  105. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  106. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  107. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  108. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  109. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  110. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  111. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  112. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  113. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  114. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
  115. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  116. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  117. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/score.py +0 -0
  118. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/scorers/utils.py +0 -0
  119. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/tracer/__init__.py +0 -0
  120. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/utils/alerts.py +0 -0
  121. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/utils/async_utils.py +0 -0
  122. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/utils/file_utils.py +0 -0
  123. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/utils/requests.py +0 -0
  124. {judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/version_check.py +0 -0
  125. {judgeval-0.6.0 → judgeval-0.7.1}/src/update_types.sh +0 -0
  126. {judgeval-0.6.0 → judgeval-0.7.1}/update_version.py +0 -0
{judgeval-0.6.0 → judgeval-0.7.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.6.0
+Version: 0.7.1
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
 Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -39,7 +40,7 @@ Description-Content-Type: text/markdown
 
 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with traces, evals, and environment data.
+Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -56,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for tracing and evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -69,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Real-time Tracing</strong>
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -111,54 +112,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI()) # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
{judgeval-0.6.0 → judgeval-0.7.1}/README.md

@@ -5,7 +5,7 @@
 
 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with traces, evals, and environment data.
+Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -22,11 +22,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for tracing and evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -35,8 +35,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Real-time Tracing</strong>
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -77,54 +77,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI()) # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
{judgeval-0.6.0 → judgeval-0.7.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.6.0"
+version = "0.7.1"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -31,6 +31,7 @@ dependencies = [
     "langchain-core",
     "click<8.2.0",
     "typer>=0.9.0",
+    "fireworks-ai>=0.19.18",
 ]
 
 [project.urls]
{judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/cli.py

@@ -38,7 +38,7 @@ def upload_scorer(
     try:
         client = JudgmentClient()
 
-        result = client.save_custom_scorer(
+        result = client.upload_custom_scorer(
             scorer_file_path=scorer_file_path,
             requirements_file_path=requirements_file_path,
             unique_name=unique_name,
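The renamed method can also be called directly from Python rather than through the CLI. A minimal sketch, assuming the `judgeval.judgment_client` module path implied by the file list; the file and scorer names are hypothetical, and the keyword arguments mirror the hunk above:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()
result = client.upload_custom_scorer(
    scorer_file_path="my_scorer.py",            # hypothetical scorer file
    requirements_file_path="requirements.txt",  # hypothetical requirements file
    unique_name="my-custom-scorer",             # hypothetical name
)
```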
{judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/api/constants.py

@@ -51,7 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 
 # Custom Scorers API
-JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/build_sandbox_template/"
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"
 
 
 # Evaluation API Payloads
{judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/core.py

@@ -815,6 +815,8 @@ class Tracer:
         == "true",
         enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower()
         == "true",
+        show_trace_urls: bool = os.getenv("JUDGMENT_SHOW_TRACE_URLS", "true").lower()
+        == "true",
         # S3 configuration
         use_s3: bool = False,
         s3_bucket_name: Optional[str] = None,
@@ -859,6 +861,7 @@ class Tracer:
         self.traces: List[Trace] = []
         self.enable_monitoring: bool = enable_monitoring
         self.enable_evaluations: bool = enable_evaluations
+        self.show_trace_urls: bool = show_trace_urls
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
@@ -1731,6 +1734,93 @@ class Tracer:
                 f"Error during background service shutdown: {e}"
             )
 
+    def trace_to_message_history(
+        self, trace: Union[Trace, TraceClient]
+    ) -> List[Dict[str, str]]:
+        """
+        Extract message history from a trace for training purposes.
+
+        This method processes trace spans to reconstruct the conversation flow,
+        extracting messages in chronological order from LLM, user, and tool spans.
+
+        Args:
+            trace: Trace or TraceClient instance to extract messages from
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+
+        Raises:
+            ValueError: If no trace is provided
+        """
+        if not trace:
+            raise ValueError("No trace provided")
+
+        # Handle both Trace and TraceClient objects
+        if isinstance(trace, TraceClient):
+            spans = trace.trace_spans
+        else:
+            spans = trace.trace_spans if hasattr(trace, "trace_spans") else []
+
+        messages = []
+        first_found = False
+
+        # Process spans in chronological order
+        for span in sorted(
+            spans, key=lambda s: s.created_at if hasattr(s, "created_at") else 0
+        ):
+            # Skip spans without output (except for first LLM span which may have input messages)
+            if span.output is None and span.span_type != "llm":
+                continue
+
+            if span.span_type == "llm":
+                # For the first LLM span, extract input messages (system + user prompts)
+                if not first_found and hasattr(span, "inputs") and span.inputs:
+                    input_messages = span.inputs.get("messages", [])
+                    if input_messages:
+                        first_found = True
+                        # Add input messages (typically system and user messages)
+                        for msg in input_messages:
+                            if (
+                                isinstance(msg, dict)
+                                and "role" in msg
+                                and "content" in msg
+                            ):
+                                messages.append(
+                                    {"role": msg["role"], "content": msg["content"]}
+                                )
+
+                # Add assistant response from span output
+                if span.output is not None:
+                    messages.append({"role": "assistant", "content": str(span.output)})
+
+            elif span.span_type == "user":
+                # Add user messages
+                if span.output is not None:
+                    messages.append({"role": "user", "content": str(span.output)})
+
+            elif span.span_type == "tool":
+                # Add tool responses as user messages (common pattern in training)
+                if span.output is not None:
+                    messages.append({"role": "user", "content": str(span.output)})
+
+        return messages
+
+    def get_current_message_history(self) -> List[Dict[str, str]]:
+        """
+        Get message history from the current trace.
+
+        Returns:
+            List of message dictionaries from the current trace context
+
+        Raises:
+            ValueError: If no current trace is found
+        """
+        current_trace = self.get_current_trace()
+        if not current_trace:
+            raise ValueError("No current trace found")
+
+        return self.trace_to_message_history(current_trace)
+
 
 def _get_current_trace(
@@ -1746,7 +1836,7 @@ def wrap(
 ) -> Any:
     """
     Wraps an API client to add tracing capabilities.
-    Supports OpenAI, Together, Anthropic, and Google GenAI clients.
+    Supports OpenAI, Together, Anthropic, Google GenAI clients, and TrainableModel.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
     (
@@ -1871,6 +1961,39 @@ def wrap(
         setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (groq_AsyncGroq)):
         setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    # Check for TrainableModel from judgeval.common.trainer
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # Define a wrapper function that can be reapplied to new model instances
+            def wrap_model_instance(model_instance):
+                """Wrap a model instance with tracing functionality"""
+                if hasattr(model_instance, "chat") and hasattr(
+                    model_instance.chat, "completions"
+                ):
+                    if hasattr(model_instance.chat.completions, "create"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "create",
+                            wrapped(model_instance.chat.completions.create),
+                        )
+                    if hasattr(model_instance.chat.completions, "acreate"):
+                        setattr(
+                            model_instance.chat.completions,
+                            "acreate",
+                            wrapped_async(model_instance.chat.completions.acreate),
+                        )
+
+            # Register the wrapper function with the TrainableModel
+            client._register_tracer_wrapper(wrap_model_instance)
+
+            # Apply wrapping to the current model
+            wrap_model_instance(client._current_model)
+    except ImportError:
+        pass  # TrainableModel not available
+
     return client
 
 
@@ -1977,6 +2100,22 @@ def _get_client_config(
         return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (groq_AsyncGroq)):
         return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            return (
+                "FIREWORKS_TRAINABLE_MODEL_CALL",
+                client._current_model.chat.completions.create,
+                None,
+                None,
+                None,
+            )
+    except ImportError:
+        pass  # TrainableModel not available
+
     raise ValueError(f"Unsupported client type: {type(client)}")
 
 
@@ -2155,6 +2294,37 @@ def _format_output_data(
             cache_creation_input_tokens,
         )
 
+    # Check for TrainableModel
+    try:
+        from judgeval.common.trainer import TrainableModel
+
+        if isinstance(client, TrainableModel):
+            # TrainableModel uses Fireworks LLM internally, so response format should be similar to OpenAI
+            if (
+                hasattr(response, "model")
+                and hasattr(response, "usage")
+                and hasattr(response, "choices")
+            ):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                message_content = response.choices[0].message.content
+
+                # Use LiteLLM cost calculation with fireworks_ai prefix
+                # LiteLLM supports Fireworks AI models for cost calculation when prefixed with "fireworks_ai/"
+                fireworks_model_name = f"fireworks_ai/{model_name}"
+                return message_content, _create_usage(
+                    fireworks_model_name,
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read_input_tokens,
+                    cache_creation_input_tokens,
+                )
+    except ImportError:
+        pass  # TrainableModel not available
+
     judgeval_logger.warning(f"Unsupported client type: {type(client)}")
     return None, None
{judgeval-0.6.0 → judgeval-0.7.1}/src/judgeval/common/tracer/trace_manager.py

@@ -71,7 +71,12 @@ class TraceManagerClient:
 
         server_response = self.api_client.upsert_trace(trace_data)
 
-        if not offline_mode and show_link and "ui_results_url" in server_response:
+        if (
+            not offline_mode
+            and show_link
+            and "ui_results_url" in server_response
+            and self.tracer.show_trace_urls
+        ):
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={server_response['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
judgeval-0.7.1/src/judgeval/common/trainer/__init__.py (new file)

@@ -0,0 +1,5 @@
+from .trainer import JudgmentTrainer
+from .config import TrainerConfig, ModelConfig
+from .trainable_model import TrainableModel
+
+__all__ = ["JudgmentTrainer", "TrainerConfig", "ModelConfig", "TrainableModel"]
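These four names are the package's public surface. A hedged sketch of a training run; the `train()` call follows the `ModelConfig` docstring in `config.py` below, and the IDs, `agent_function`, `scorers`, and `prompts` are hypothetical placeholders:

```python
from judgeval.common.trainer import JudgmentTrainer, TrainerConfig

config = TrainerConfig(
    deployment_id="my-deployment",  # hypothetical
    user_id="my-user",              # hypothetical
    model_id="my-model",            # hypothetical
)
trainer = JudgmentTrainer(config)

# Per the ModelConfig docstring below (arguments are placeholders):
# model_config = trainer.train(agent_function, scorers, prompts)
# model_config.save_to_file("my_trained_model.json")
```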
judgeval-0.7.1/src/judgeval/common/trainer/config.py (new file)

@@ -0,0 +1,125 @@
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+import json
+
+
+@dataclass
+class TrainerConfig:
+    """Configuration class for JudgmentTrainer parameters."""
+
+    deployment_id: str
+    user_id: str
+    model_id: str
+    base_model_name: str = "qwen2p5-7b-instruct"
+    rft_provider: str = "fireworks"
+    num_steps: int = 5
+    num_generations_per_prompt: int = (
+        4  # Number of rollouts/generations per input prompt
+    )
+    num_prompts_per_step: int = 4  # Number of input prompts to sample per training step
+    concurrency: int = 100
+    epochs: int = 1
+    learning_rate: float = 1e-5
+    accelerator_count: int = 1
+    accelerator_type: str = "NVIDIA_A100_80GB"
+    temperature: float = 1.5
+    max_tokens: int = 50
+    enable_addons: bool = True
+
+
+@dataclass
+class ModelConfig:
+    """
+    Configuration class for storing and loading trained model state.
+
+    This class enables persistence of trained models so they can be loaded
+    and used later without retraining.
+
+    Example usage:
+        trainer = JudgmentTrainer(config)
+        model_config = trainer.train(agent_function, scorers, prompts)
+
+        # Save the trained model configuration
+        model_config.save_to_file("my_trained_model.json")
+
+        # Later, load and use the trained model
+        loaded_config = ModelConfig.load_from_file("my_trained_model.json")
+        trained_model = TrainableModel.from_model_config(loaded_config)
+
+        # Use the trained model for inference
+        response = trained_model.chat.completions.create(
+            model="current",  # Uses the loaded trained model
+            messages=[{"role": "user", "content": "Hello!"}]
+        )
+    """
+
+    # Base model configuration
+    base_model_name: str
+    deployment_id: str
+    user_id: str
+    model_id: str
+    enable_addons: bool
+
+    # Training state
+    current_step: int
+    total_steps: int
+
+    # Current model information
+    current_model_name: Optional[str] = None
+    is_trained: bool = False
+
+    # Training parameters used (for reference)
+    training_params: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert ModelConfig to dictionary for serialization."""
+        return {
+            "base_model_name": self.base_model_name,
+            "deployment_id": self.deployment_id,
+            "user_id": self.user_id,
+            "model_id": self.model_id,
+            "enable_addons": self.enable_addons,
+            "current_step": self.current_step,
+            "total_steps": self.total_steps,
+            "current_model_name": self.current_model_name,
+            "is_trained": self.is_trained,
+            "training_params": self.training_params,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig":
+        """Create ModelConfig from dictionary."""
+        return cls(
+            base_model_name=data.get("base_model_name", "qwen2p5-7b-instruct"),
+            deployment_id=data.get("deployment_id", "my-base-deployment"),
+            user_id=data.get("user_id", ""),
+            model_id=data.get("model_id", ""),
+            enable_addons=data.get("enable_addons", True),
+            current_step=data.get("current_step", 0),
+            total_steps=data.get("total_steps", 0),
+            current_model_name=data.get("current_model_name"),
+            is_trained=data.get("is_trained", False),
+            training_params=data.get("training_params"),
+        )
+
+    def to_json(self) -> str:
+        """Convert ModelConfig to JSON string."""
+        return json.dumps(self.to_dict(), indent=2)
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "ModelConfig":
+        """Create ModelConfig from JSON string."""
+        data = json.loads(json_str)
+        return cls.from_dict(data)
+
+    def save_to_file(self, filepath: str):
+        """Save ModelConfig to a JSON file."""
+        with open(filepath, "w") as f:
+            f.write(self.to_json())
+
+    @classmethod
+    def load_from_file(cls, filepath: str) -> "ModelConfig":
+        """Load ModelConfig from a JSON file."""
+        with open(filepath, "r") as f:
+            json_str = f.read()
+        return cls.from_json(json_str)
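As a quick sanity check on the serialization helpers above, `to_json`/`from_json` round-trip a fully populated config; the field values here are illustrative only:

```python
from judgeval.common.trainer import ModelConfig

cfg = ModelConfig(
    base_model_name="qwen2p5-7b-instruct",
    deployment_id="my-base-deployment",
    user_id="u-123",        # hypothetical
    model_id="m-456",       # hypothetical
    enable_addons=True,
    current_step=5,
    total_steps=5,
    current_model_name="m-456-step-5",  # hypothetical checkpoint name
    is_trained=True,
)

# Serialize and restore; every field survives the round trip.
assert ModelConfig.from_json(cfg.to_json()).to_dict() == cfg.to_dict()
```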