judgeval 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/api/__init__.py CHANGED
@@ -73,7 +73,7 @@ class JudgmentSyncClient:

     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -86,7 +86,7 @@ class JudgmentSyncClient:

     def evaluate_traces(
         self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -212,13 +212,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-

 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -270,7 +263,7 @@ class JudgmentAsyncClient:

     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -283,7 +276,7 @@ class JudgmentAsyncClient:

     async def evaluate_traces(
         self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -411,13 +404,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-

 __all__ = [
     "JudgmentSyncClient",
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-09-30T18:06:51+00:00
+#   timestamp: 2025-10-07T20:43:52+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -146,6 +146,14 @@ class ValidationError(TypedDict):
     type: str


+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
 DatasetKind = Literal["trace", "example"]


@@ -273,7 +281,6 @@ class OtelTraceListItem(TypedDict):
     trace_id: str
     created_at: str
     duration: NotRequired[Optional[int]]
-    has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
@@ -281,6 +288,8 @@ class OtelTraceListItem(TypedDict):
     error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
     output_preview: NotRequired[Optional[str]]
     annotation_count: NotRequired[int]
@@ -312,6 +321,12 @@ class OtelSpanDetail(TypedDict):
     scores: NotRequired[Optional[List[OtelSpanDetailScores]]]


+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
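
Because these are `TypedDict`s, the new shapes are easy to stub, for example in tests. A small sketch with invented sample values (the field names come from this diff):

```python
from judgeval.api.api_types import EvaluateResponse, UsageInfo

usage: UsageInfo = {
    "total_judgees": 100,
    "regular_use": 40,
    "pay_as_you_go_use": 0,
    "remaining_regular": 60,
    "remaining_after": 60,
}

stub: EvaluateResponse = {
    "status": "completed",    # sample value; the schema only types this as str
    "results": [],
    "resource_usage": usage,  # NotRequired: may also be omitted or None
}
```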
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-09-30T18:06:50+00:00
+#   timestamp: 2025-10-07T20:43:51+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -162,6 +162,14 @@ class ValidationError(BaseModel):
     type: Annotated[str, Field(title="Error Type")]


+class UsageInfo(BaseModel):
+    total_judgees: Annotated[int, Field(title="Total Judgees")]
+    regular_use: Annotated[int, Field(title="Regular Use")]
+    pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+    remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+    remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
 class DatasetKind(Enum):
     trace = "trace"
     example = "example"
@@ -309,7 +317,6 @@ class OtelTraceListItem(BaseModel):
     trace_id: Annotated[str, Field(title="Trace Id")]
     created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
@@ -319,6 +326,8 @@ class OtelTraceListItem(BaseModel):
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input: Annotated[Optional[str], Field(title="Input")] = None
+    output: Annotated[Optional[str], Field(title="Output")] = None
     input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
     output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
     annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
@@ -358,6 +367,12 @@ class OtelSpanDetail(BaseModel):
     )


+class EvaluateResponse(BaseModel):
+    status: Annotated[str, Field(title="Status")]
+    results: Annotated[List[ScoringResult], Field(title="Results")]
+    resource_usage: Optional[UsageInfo] = None
+
+
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
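
The same schema also ships as Pydantic models in this generated file, so raw API payloads can be validated. A sketch assuming Pydantic v2's `model_validate`, which matches the `Annotated`/`Field`/`AwareDatetime` style used here:

```python
from judgeval.data.judgment_types import EvaluateResponse

payload = {
    "status": "completed",
    "results": [],
    "resource_usage": {
        "total_judgees": 100,
        "regular_use": 40,
        "pay_as_you_go_use": 0,
        "remaining_regular": 60,
        "remaining_after": 60,
    },
}

resp = EvaluateResponse.model_validate(payload)  # raises ValidationError on shape mismatch
assert resp.resource_usage is not None
print(resp.resource_usage.remaining_after)  # 60
```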
judgeval/tracer/llm/__init__.py CHANGED
@@ -137,9 +137,23 @@ def _extract_openai_content(chunk) -> str:

 def _extract_anthropic_content(chunk) -> str:
     """Extract content from Anthropic streaming chunk."""
-    if hasattr(chunk, "type") and chunk.type == "content_block_delta":
-        if hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
-            return chunk.delta.text or ""
+    if hasattr(chunk, "type"):
+        if chunk.type == "content_block_delta":
+            if hasattr(chunk, "delta"):
+                if hasattr(chunk.delta, "text"):
+                    return chunk.delta.text or ""
+                elif hasattr(chunk.delta, "partial_json"):
+                    # Tool use input streaming - return raw JSON to accumulate properly
+                    return chunk.delta.partial_json or ""
+        elif chunk.type == "content_block_start":
+            if hasattr(chunk, "content_block") and hasattr(chunk.content_block, "type"):
+                if chunk.content_block.type == "tool_use":
+                    tool_info = {
+                        "type": "tool_use",
+                        "id": getattr(chunk.content_block, "id", None),
+                        "name": getattr(chunk.content_block, "name", None),
+                    }
+                    return f"[TOOL_USE_START: {tool_info}]"
     elif hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
         return chunk.delta.text or ""
     elif hasattr(chunk, "text"):
@@ -409,7 +423,25 @@ def _format_anthropic_output(
         and usage.cache_creation_input_tokens is not None
         else 0
     )
-    message_content = response.content[0].text if hasattr(response, "content") else None
+    # Extract content from Anthropic response, handling both text and tool use blocks
+    message_content = None
+    if hasattr(response, "content") and response.content:
+        content_parts = []
+        for content_block in response.content:
+            block_type = getattr(content_block, "type", None)
+            if block_type == "text":
+                # Text content block
+                content_parts.append(getattr(content_block, "text", ""))
+            elif block_type == "tool_use":
+                # Tool use block - serialize the tool call information
+                tool_info = {
+                    "type": "tool_use",
+                    "id": getattr(content_block, "id", None),
+                    "name": getattr(content_block, "name", None),
+                    "input": getattr(content_block, "input", None),
+                }
+                content_parts.append(f"[TOOL_USE: {tool_info}]")
+        message_content = "\n".join(content_parts) if content_parts else None

     if model_name:
         return message_content, _create_usage(
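
The new streaming branches can be exercised with stand-in objects shaped like Anthropic streaming events (the real event classes come from the `anthropic` SDK); a minimal sketch:

```python
from types import SimpleNamespace

from judgeval.tracer.llm import _extract_anthropic_content  # private helper changed above

# Tool-use arguments stream as partial JSON deltas and are passed through
# verbatim so the caller can accumulate them.
json_delta = SimpleNamespace(
    type="content_block_delta",
    delta=SimpleNamespace(partial_json='{"query": "capi'),
)
assert _extract_anthropic_content(json_delta) == '{"query": "capi'

# A content_block_start for a tool_use block is surfaced as a readable marker.
start = SimpleNamespace(
    type="content_block_start",
    content_block=SimpleNamespace(type="tool_use", id="toolu_123", name="search"),
)
print(_extract_anthropic_content(start))
# [TOOL_USE_START: {'type': 'tool_use', 'id': 'toolu_123', 'name': 'search'}]
```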
judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.15.0"
+__version__ = "0.16.0"


 def get_version() -> str:
judgeval-0.16.0.dist-info/METADATA ADDED
@@ -0,0 +1,266 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.16.0
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10
+Requires-Dist: boto3>=1.40.11
+Requires-Dist: click<8.2.0
+Requires-Dist: dotenv
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: litellm<1.75.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+Requires-Dist: opentelemetry-sdk>=1.36.0
+Requires-Dist: orjson>=3.9.0
+Requires-Dist: typer>=0.9.0
+Provides-Extra: s3
+Requires-Dist: boto3>=1.40.11; extra == 's3'
+Provides-Extra: trainer
+Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<a href="https://judgmentlabs.ai/">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
+    <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
+  </picture>
+</a>
+
+<br>
+
+## Agent Behavior Monitoring (ABM)
+
+Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
+
+[![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
+[![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
+[![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+
+
+[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
+[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
+
+</div>
+
+
+
+## [NEW] 🎆 Agent Reinforcement Learning
+
+Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, Qwen3, Kimi K2, DeepSeek, and more.
+
+Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick way to **try RL with minimal code changes** to their existing agents!
+
+```python
+await trainer.train(
+    agent_function=your_agent_function,  # entry point to your agent
+    scorers=[RewardScorer()],  # custom scorer you define based on task criteria; acts as the reward
+    prompts=training_prompts,  # tasks
+    rft_provider="fireworks"
+)
+```
+
+**That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
+
+👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
+
+
+You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
+
+
+## Judgeval Overview
+
+Judgeval is an open-source framework for agent behavior monitoring. It offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper into our [docs](https://docs.judgmentlabs.ai/documentation).
+
+Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
+
+## 📚 Cookbooks
+
+| Try Out | Notebook | Description |
+|:---------|:-----|:------------|
+| RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
+| Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
+| Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
+| Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
+
+You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
+
+You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
+
+## Why Judgeval?
+
+🤖 **Simple multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
+
+⚙️ **Custom Evaluators**: You aren't restricted to monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models as well as code-based scorers that integrate with our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
+
+🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized, secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
+
+📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
+<!-- Add link to Bucketing docs once we have it -->
+<!--
+TODO: Once we have trainer code docs, plug in here
+-->
+
+🧪 **Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
+
+<!--
+Use this once we have AI PM features:
+
+**Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
+
+-->
+
+ ## 🛠️ Quickstart
122
+
123
+ Get started with Judgeval by installing our SDK using pip:
124
+
125
+ ```bash
126
+ pip install judgeval
127
+ ```
128
+
129
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
130
+
131
+ ```bash
132
+ export JUDGMENT_API_KEY=...
133
+ export JUDGMENT_ORG_ID=...
134
+ ```
135
+
136
+ **If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
137
+
138
+ ### Start monitoring with Judgeval
139
+
140
+ ```python
141
+ from judgeval.tracer import Tracer, wrap
142
+ from judgeval.data import Example
143
+ from judgeval.scorers import AnswerRelevancyScorer
144
+ from openai import OpenAI
145
+
146
+
147
+ judgment = Tracer(project_name="default_project")
148
+ client = wrap(OpenAI()) # tracks all LLM calls
149
+
150
+ @judgment.observe(span_type="tool")
151
+ def format_question(question: str) -> str:
152
+ # dummy tool
153
+ return f"Question : {question}"
154
+
155
+ @judgment.observe(span_type="function")
156
+ def run_agent(prompt: str) -> str:
157
+ task = format_question(prompt)
158
+ response = client.chat.completions.create(
159
+ model="gpt-5-mini",
160
+ messages=[{"role": "user", "content": task}]
161
+ )
162
+
163
+ judgment.async_evaluate( # trigger online monitoring
164
+ scorer=AnswerRelevancyScorer(threshold=0.5), # swap with any scorer
165
+ example=Example(input=task, actual_output=response), # customize to your data
166
+ model="gpt-5",
167
+ )
168
+ return response.choices[0].message.content
169
+
170
+ run_agent("What is the capital of the United States?")
171
+ ```
172
+
173
+ Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
174
+
175
+ ![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
176
+
177
+
+### Customizable Scorers Over Agent Behavior
+
+Judgeval's strongest suit is full customization of the scorers you can run online monitoring with. You aren't restricted to single-prompt LLM judges or prefab scorers - if you can express your scorer in Python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
+
+
+First, create a behavior scorer in a file called `helpfulness_scorer.py`:
+
+```python
+from judgeval.data import Example
+from judgeval.scorers.example_scorer import ExampleScorer
+
+# Define a custom example class
+class QuestionAnswer(Example):
+    question: str
+    answer: str
+
+# Define a server-hosted custom scorer
+class HelpfulnessScorer(ExampleScorer):
+    name: str = "Helpfulness Scorer"
+    server_hosted: bool = True  # Enable server hosting
+    async def a_score_example(self, example: QuestionAnswer):
+        # Custom scoring logic for agent behavior
+        # Can be an arbitrary combination of code and LLM calls
+        if len(example.answer) > 10 and "?" not in example.answer:
+            self.reason = "Answer is detailed and provides helpful information"
+            return 1.0
+        else:
+            self.reason = "Answer is too brief or unclear"
+            return 0.0
+```
+
+Then deploy your scorer to Judgment's infrastructure:
+
+```bash
+echo "pydantic" > requirements.txt
+uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
+```
+
+Now you can instrument your agent with monitoring and online evaluation:
+
+```python
+from judgeval.tracer import Tracer, wrap
+from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
+from openai import OpenAI
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI())  # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_task(question: str) -> str:  # replace with your prompt engineering
+    return f"Please answer the following question: {question}"
+
+@judgment.observe(span_type="tool")
+def answer_question(prompt: str) -> str:  # replace with your LLM system calls
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return response.choices[0].message.content
+
+@judgment.observe(span_type="function")
+def run_agent(question: str) -> str:
+    task = format_task(question)
+    answer = answer_question(task)
+
+    # Add online evaluation with a server-hosted scorer
+    judgment.async_evaluate(
+        scorer=HelpfulnessScorer(),
+        example=QuestionAnswer(question=question, answer=answer),
+        sampling_rate=0.9  # evaluate 90% of agent runs
+    )
+
+    return answer
+
+if __name__ == "__main__":
+    result = run_agent("What is the capital of the United States?")
+    print(result)
+```
+
+Congratulations! Your online eval result should look like this:
+
+![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
+
+You can now run any online scorer in secure Firecracker microVMs with no latency impact on your applications.
+
+---
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
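
The RECORD diff below swaps in new digests for the modified source files. For reference, wheel RECORD entries follow the standard wheel format (PEP 427): `path,sha256=<urlsafe-base64 digest, padding stripped>,<size>`. A sketch of recomputing one:

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a wheel RECORD line for a file (PEP 427 digest format)."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

print(record_entry("judgeval/version.py"))
# -> judgeval/version.py,sha256=UCd6S0KuM6h0ZUz8pm-Ty1EDHaJNSUYM_7PrDz0ov-E,74
```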
judgeval-0.15.0.dist-info/RECORD → judgeval-0.16.0.dist-info/RECORD RENAMED
@@ -4,14 +4,14 @@ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
 judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
 judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
 judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
-judgeval/version.py,sha256=1a6hS0-ubylneLxq8Pt0EqBRx0hSP1cO9JKaTmHazfo,74
+judgeval/version.py,sha256=UCd6S0KuM6h0ZUz8pm-Ty1EDHaJNSUYM_7PrDz0ov-E,74
 judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
-judgeval/api/__init__.py,sha256=_oDuEDBDmyPQkdfvWebvBSvrnlzg4vreETpt16frXEA,12468
-judgeval/api/api_types.py,sha256=hpUpVRCLIGF-lHHg1gIgdTaRfwS94Vh1E23vU9Z34js,8555
+judgeval/api/__init__.py,sha256=ho8L4wC9y-STYEpk5zHwc2mZJhC4ezW8jiGgOIERBVY,12058
+judgeval/api/api_types.py,sha256=6wrjvO8XsYbfPxjQ_sHS9EOjqexbn3XDFclWqb4CgZ4,8874
 judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
 judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
 judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
-judgeval/data/judgment_types.py,sha256=u45rfHEtUNzXSQstJ4TcOo-yX9cZymma5W0hTtb5u34,15965
+judgeval/data/judgment_types.py,sha256=uI4wUiXeA6k8o2ONia506eaZcydHKQKrK1LzccTK-xc,16577
 judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
 judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
 judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
@@ -51,7 +51,7 @@ judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwo
 judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
 judgeval/tracer/exporters/store.py,sha256=KQV3cyqteesByQjR-9VdPXT9OlUZ-6F08ogqj837_c0,1012
 judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
-judgeval/tracer/llm/__init__.py,sha256=6JSF-RaK6tZNzd0rZOK6Don7vvf15EhSPSio_FmS7i8,42564
+judgeval/tracer/llm/__init__.py,sha256=b7toFMVyZU4Pv8jximfneP5gyohUB4DwJDvy8b2_IMw,44217
 judgeval/tracer/llm/providers.py,sha256=UU8xrh2n9p3xZwnlWMUcZoFpog2-F9-YfcV0c2aUNqQ,1432
 judgeval/tracer/llm/anthropic/__init__.py,sha256=DUTkYjMejWLI8inFJ_Ih7vf7_aJFAiCyi1Oxls-ACGo,439
 judgeval/tracer/llm/google/__init__.py,sha256=7j96SPUl61yVl3jCQ-JuPpgVU9GhmcsBzY2vj5wJAVo,506
@@ -73,8 +73,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
 judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
 judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
 judgeval/utils/version_check.py,sha256=ylZQSqV7kLzEOChxvav9SCHUU4OnaCp36tXHLjdzmw0,1072
-judgeval-0.15.0.dist-info/METADATA,sha256=MT857VBF8qoWXiCu_NyK_JCBcrddN1kCSWxDd58D3g0,8564
-judgeval-0.15.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.15.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
-judgeval-0.15.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.15.0.dist-info/RECORD,,
+judgeval-0.16.0.dist-info/METADATA,sha256=kojyijzNE_2gKKvMGrs7E0zHHv3GtOXRjfmIOUQujTY,11512
+judgeval-0.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.16.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.16.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.16.0.dist-info/RECORD,,
judgeval-0.15.0.dist-info/METADATA DELETED
@@ -1,158 +0,0 @@
-Metadata-Version: 2.4
-Name: judgeval
-Version: 0.15.0
-Summary: Judgeval Package
-Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
-Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
-Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
-License-Expression: Apache-2.0
-License-File: LICENSE.md
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.10
-Requires-Dist: boto3>=1.40.11
-Requires-Dist: click<8.2.0
-Requires-Dist: dotenv
-Requires-Dist: httpx>=0.28.1
-Requires-Dist: litellm<1.75.0
-Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
-Requires-Dist: opentelemetry-sdk>=1.36.0
-Requires-Dist: orjson>=3.9.0
-Requires-Dist: typer>=0.9.0
-Provides-Extra: s3
-Requires-Dist: boto3>=1.40.11; extra == 's3'
-Provides-Extra: trainer
-Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
-Description-Content-Type: text/markdown
-
-<div align="center">
-
-<img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
-<img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
-
-<br>
-<div style="font-size: 1.5em;">
-Enable self-learning agents with environment data and evals.
-</div>
-
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
-
-[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
-
-We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for monitoring and post-training.
-
-[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/tGVFf8UBUY)
-
-<img src="assets/product_shot.png" alt="Judgment Platform" width="800" />
-
-</div>
-
-Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
-
-## 🎬 See Judgeval in Action
-
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
-
-<table style="width: 100%; max-width: 800px; table-layout: fixed;">
-<tr>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>🤖 Agents Running</strong>
-</td>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Capturing Environment Data </strong>
-</td>
-</tr>
-<tr>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>✅ Agents Completed Running</strong>
-</td>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📤 Exporting Agent Environment Data</strong>
-</td>
-</tr>
-
-</table>
-
-## 📋 Table of Contents
-- [🛠️ Installation](#️-installation)
-- [🏁 Quickstarts](#-quickstarts)
-- [✨ Features](#-features)
-- [🏢 Self-Hosting](#-self-hosting)
-- [📚 Cookbooks](#-cookbooks)
-- [💻 Development with Cursor](#-development-with-cursor)
-
-## 🛠️ Installation
-
-Get started with Judgeval by installing our SDK using pip:
-
-```bash
-pip install judgeval
-```
-
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
-
-```bash
-export JUDGMENT_API_KEY=...
-export JUDGMENT_ORG_ID=...
-```
-
-**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
-
-
-## ✨ Features
-
-| | |
-|:---|:---:|
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
-
-## 🏢 Self-Hosting
-
-Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
-
-### Key Features
-* Deploy Judgment on your own AWS account
-* Store data in your own Supabase instance
-* Access Judgment through your own custom domain
-
-### Getting Started
-1. Check out our [self-hosting documentation](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) for detailed setup instructions, along with how your self-hosted instance can be accessed
-2. Use the [Judgment CLI](https://docs.judgmentlabs.ai/documentation/developer-tools/judgment-cli/installation) to deploy your self-hosted environment
-3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint
-
-## 📚 Cookbooks
-
-Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).
-
-You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).
-
-## 💻 Development with Cursor
-Building agents and LLM workflows in Cursor works best when your coding assistant has the proper context about Judgment integration. The Cursor rules file contains the key information needed for your assistant to implement Judgment features effectively.
-
-Refer to the official [documentation](https://docs.judgmentlabs.ai/documentation/developer-tools/cursor/cursor-rules) for access to the rules file and more information on integrating this rules file with your codebase.
-
-## ⭐ Star Us on GitHub
-
-If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.
-
-## ❤️ Contributors
-
-There are many ways to contribute to Judgeval:
-
-- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
-- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
-- Speaking or writing about Judgment and letting us know!
-
-<!-- Contributors collage -->
-[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
-
----
-
-Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).