judgeval 0.14.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/api/__init__.py CHANGED
@@ -73,7 +73,7 @@ class JudgmentSyncClient:
 
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -86,7 +86,7 @@ class JudgmentSyncClient:
 
     def evaluate_traces(
         self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -111,16 +111,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def get_evaluation_status(self, experiment_run_id: str, project_name: str) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -222,13 +212,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-
 
 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -280,7 +263,7 @@ class JudgmentAsyncClient:
 
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -293,7 +276,7 @@ class JudgmentAsyncClient:
 
     async def evaluate_traces(
        self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
        query_params = {}
        if stream is not None:
            query_params["stream"] = stream
@@ -318,18 +301,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def get_evaluation_status(
-        self, experiment_run_id: str, project_name: str
-    ) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return await self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     async def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -433,13 +404,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-
 
 __all__ = [
     "JudgmentSyncClient",
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-29T19:54:47+00:00
+#   timestamp: 2025-10-07T20:43:52+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -24,6 +24,15 @@ class DatasetsFetch(TypedDict):
     project_name: str
 
 
+class DatasetsTableRow(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: Literal["trace", "example"]
+    entries: int
+    creator: str
+
+
 class ProjectAdd(TypedDict):
     project_name: str
 
@@ -137,6 +146,14 @@ class ValidationError(TypedDict):
     type: str
 
 
+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
 DatasetKind = Literal["trace", "example"]
 
 
@@ -180,18 +197,10 @@ class OtelTraceSpan(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[str]]
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
     events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[List[Dict[str, Any]]]]
-    legacy_span_id: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: Any
-    error: NotRequired[Optional[Dict[str, Any]]]
-    agent_id: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
 
 
 class OtelSpanListItemScores(TypedDict):
@@ -206,7 +215,7 @@ class OtelSpanDetailScores(TypedDict):
     score: float
     reason: NotRequired[Optional[str]]
     name: str
-    data: NotRequired[Optional[Dict[str, Any]]]
+    example_id: NotRequired[Optional[str]]
 
 
 class ExampleEvaluationRun(TypedDict):
@@ -244,15 +253,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str
 
 
-class DatasetInfo(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    dataset_kind: DatasetKind
-    entries: int
-    creator: str
-
-
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
@@ -279,16 +279,17 @@ class OtelTraceListItem(TypedDict):
     organization_id: str
     project_id: str
     trace_id: str
-    timestamp: str
+    created_at: str
     duration: NotRequired[Optional[int]]
-    has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    error: NotRequired[Optional[Dict[str, Any]]]
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
     output_preview: NotRequired[Optional[str]]
     annotation_count: NotRequired[int]
@@ -310,9 +311,9 @@ class OtelSpanDetail(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[str]]
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
-    events: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
     llm_cost: NotRequired[Optional[float]]
     prompt_tokens: NotRequired[Optional[int]]
@@ -320,6 +321,12 @@ class OtelSpanDetail(TypedDict):
     scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
 
 
+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
@@ -335,5 +342,5 @@ class DatasetReturn(TypedDict):
     name: str
     project_name: str
     dataset_kind: DatasetKind
-    examples: NotRequired[Optional[List[Example]]]
+    examples: NotRequired[List[Example]]
     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-29T19:54:46+00:00
+#   timestamp: 2025-10-07T20:43:51+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -26,6 +26,20 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
+class Kind(Enum):
+    trace = "trace"
+    example = "example"
+
+
+class DatasetsTableRow(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: Annotated[Kind, Field(title="Kind")]
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -148,6 +162,14 @@ class ValidationError(BaseModel):
     type: Annotated[str, Field(title="Error Type")]
 
 
+class UsageInfo(BaseModel):
+    total_judgees: Annotated[int, Field(title="Total Judgees")]
+    regular_use: Annotated[int, Field(title="Regular Use")]
+    pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+    remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+    remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
 class DatasetKind(Enum):
     trace = "trace"
     example = "example"
@@ -199,22 +221,10 @@ class OtelTraceSpan(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
-    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")]
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )
 
 
 class OtelSpanListItemScores(BaseModel):
@@ -229,7 +239,7 @@ class OtelSpanDetailScores(BaseModel):
     score: Annotated[float, Field(title="Score")]
     reason: Annotated[Optional[str], Field(title="Reason")] = None
     name: Annotated[str, Field(title="Name")]
-    data: Annotated[Optional[Dict[str, Any]], Field(title="Data")] = None
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None
 
 
 class ExampleEvaluationRun(BaseModel):
@@ -277,15 +287,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
-class DatasetInfo(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    dataset_kind: DatasetKind
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
@@ -314,20 +315,19 @@ class OtelTraceListItem(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
-    timestamp: Annotated[str, Field(title="Timestamp")]
+    created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    error: Annotated[Optional[str], Field(title="Error")] = ""
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input: Annotated[Optional[str], Field(title="Input")] = None
+    output: Annotated[Optional[str], Field(title="Output")] = None
     input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
     output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
     annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
@@ -338,7 +338,7 @@ class OtelTraceListItem(BaseModel):
 class OtelSpanDetail(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
-    timestamp: Annotated[str, Field(title="Timestamp")]
+    timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
     trace_id: Annotated[str, Field(title="Trace Id")]
     span_id: Annotated[str, Field(title="Span Id")]
     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
@@ -353,11 +353,9 @@ class OtelSpanDetail(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
-    events: Annotated[
-        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
-    ] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[
         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
     ] = None
@@ -369,6 +367,12 @@ class OtelSpanDetail(BaseModel):
     )
 
 
+class EvaluateResponse(BaseModel):
+    status: Annotated[str, Field(title="Status")]
+    results: Annotated[List[ScoringResult], Field(title="Results")]
+    resource_usage: Optional[UsageInfo] = None
+
+
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
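The pydantic counterparts of the new response types validate server payloads. A hedged sketch of constructing them directly; the module path follows the `judgeval/data/judgment_types.py` entry in the RECORD below, and all field values are illustrative:

```python
# Assumed import path based on the wheel's RECORD; not shown in this diff.
from judgeval.data.judgment_types import EvaluateResponse, UsageInfo

resp = EvaluateResponse(
    status="completed",
    results=[],  # would normally hold ScoringResult models
    resource_usage=UsageInfo(
        total_judgees=100,
        regular_use=40,
        pay_as_you_go_use=0,
        remaining_regular=60,
        remaining_after=60,
    ),
)
print(resp.resource_usage.remaining_regular if resp.resource_usage else None)
```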
@@ -20,7 +20,7 @@ class DatasetInfo:
     dataset_id: str
     name: str
     created_at: str
-    dataset_kind: DatasetKind
+    kind: DatasetKind
     entries: int
     creator: str
 
@@ -84,7 +84,7 @@ def log_evaluation_results(
 
 def _poll_evaluation_until_complete(
     evaluation_run: ExampleEvaluationRun,
-    expected_scorer_data_count: int,
+    expected_examples_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
@@ -117,29 +117,22 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
-                experiment_run_id, project_name
-            )
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            example_scorer_pairings = status_response.get("results", [])
-            if len(example_scorer_pairings) != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
             results_response = api_client.fetch_experiment_run(
                 {
                     "experiment_run_id": experiment_run_id,
                     "project_name": project_name,
                 }
            )
+
+            example_scorer_pairings = results_response.get("results", [])
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
             url = results_response.get("ui_results_url")
 
             scoring_result_list = []
-            for res in results_response.get("results", []):
+            for res in example_scorer_pairings:
                 example = res.get("data", {}).copy()
                 example["example_id"] = res.get("example_id")
                 scoring_result = ScoringResult(
@@ -241,14 +234,9 @@ def run_eval(
             )
             raise JudgmentRuntimeError(error_message)
 
-        num_scorers = (
-            len(evaluation_run.judgment_scorers)
-            if evaluation_run.judgment_scorers
-            else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
-        )
        results, url = _poll_evaluation_until_complete(
            evaluation_run=evaluation_run,
-            expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
+            expected_examples_count=len(evaluation_run.examples),
        )
    finally:
        stop_event.set()
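With the status endpoint removed, completion is inferred entirely from `fetch_experiment_run`: the run is treated as done once one result per example is returned. A condensed sketch of that polling strategy, assuming an `api_client` like the one above (failure counting and the max-poll timeout are omitted here):

```python
import time


def wait_for_results(api_client, experiment_run_id: str, project_name: str,
                     expected_examples_count: int, poll_interval_seconds: float = 5):
    # Simplified illustration of the loop in _poll_evaluation_until_complete,
    # not the full implementation.
    while True:
        results_response = api_client.fetch_experiment_run(
            {"experiment_run_id": experiment_run_id, "project_name": project_name}
        )
        results = results_response.get("results", [])
        if len(results) == expected_examples_count:
            return results, results_response.get("ui_results_url")
        time.sleep(poll_interval_seconds)
```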
@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
@@ -20,6 +20,7 @@ def push_prompt_scorer(
     threshold: float,
     options: Optional[Dict[str, float]] = None,
     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     is_trace: bool = False,
@@ -33,6 +34,7 @@ def push_prompt_scorer(
             "threshold": threshold,
             "options": options,
             "model": model,
+            "description": description,
             "is_trace": is_trace,
         }
     )
@@ -102,6 +104,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
     score_type: APIScorerType
     prompt: str
     options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -130,6 +133,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,6 +146,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         threshold: float = 0.5,
         options: Optional[Dict[str, float]] = None,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -158,6 +163,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold,
             options,
             model,
+            description,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -170,6 +176,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=threshold,
             options=options,
             model=model,
+            description=description,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -215,6 +222,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated options for {self.name}")
 
+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
     def append_to_prompt(self, prompt_addition: str):
         """
         Appends a string to the prompt.
@@ -248,7 +263,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return copy(self.options) if self.options is not None else None
 
-    def get_name(self) -> str | None:
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -264,6 +285,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             "prompt": self.prompt,
             "threshold": self.threshold,
             "options": self.options,
+            "description": self.description,
         }
 
     def push_prompt_scorer(self):
@@ -276,13 +298,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.threshold,
             self.options,
             self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
             isinstance(self, TracePromptScorer),
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
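Descriptions now travel with the scorer config end to end (push, fetch, `to_dict`-style serialization, and `__str__`). A small sketch of setting and reading a description on an already-constructed prompt scorer instance; `scorer` stands in for any concrete `BasePromptScorer` subclass obtained through its usual create/get helpers, and the description text is illustrative:

```python
# `scorer` is assumed to be an existing PromptScorer-style instance.
scorer.set_description("Checks whether answers cite at least one source.")
print(scorer.get_description())

# The description is also included in push_prompt_scorer() payloads and in str(scorer).
print(scorer)
```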
@@ -255,6 +255,10 @@ class Tracer(metaclass=SingletonMeta):
     def get_current_agent_context(self):
         return self.agent_context
 
+    def get_span_processor(self) -> JudgmentSpanProcessor:
+        """Get the internal span processor of this tracer instance."""
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
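A short sketch of the new accessor, assuming a tracer has already been initialized. The `force_flush()` call is an assumption: it comes from the standard OpenTelemetry span-processor interface and is shown only to illustrate why direct access to the processor can be useful:

```python
from judgeval.tracer import Tracer

judgment = Tracer(project_name="default_project")
processor = judgment.get_span_processor()  # -> JudgmentSpanProcessor

# e.g. flush any pending spans before process exit (assumed OTel SpanProcessor method).
processor.force_flush()
```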
@@ -137,9 +137,23 @@ def _extract_openai_content(chunk) -> str:
 
 def _extract_anthropic_content(chunk) -> str:
     """Extract content from Anthropic streaming chunk."""
-    if hasattr(chunk, "type") and chunk.type == "content_block_delta":
-        if hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
-            return chunk.delta.text or ""
+    if hasattr(chunk, "type"):
+        if chunk.type == "content_block_delta":
+            if hasattr(chunk, "delta"):
+                if hasattr(chunk.delta, "text"):
+                    return chunk.delta.text or ""
+                elif hasattr(chunk.delta, "partial_json"):
+                    # Tool use input streaming - return raw JSON to accumulate properly
+                    return chunk.delta.partial_json or ""
+        elif chunk.type == "content_block_start":
+            if hasattr(chunk, "content_block") and hasattr(chunk.content_block, "type"):
+                if chunk.content_block.type == "tool_use":
+                    tool_info = {
+                        "type": "tool_use",
+                        "id": getattr(chunk.content_block, "id", None),
+                        "name": getattr(chunk.content_block, "name", None),
+                    }
+                    return f"[TOOL_USE_START: {tool_info}]"
     elif hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
         return chunk.delta.text or ""
     elif hasattr(chunk, "text"):
@@ -409,7 +423,25 @@ def _format_anthropic_output(
         and usage.cache_creation_input_tokens is not None
         else 0
     )
-    message_content = response.content[0].text if hasattr(response, "content") else None
+    # Extract content from Anthropic response, handling both text and tool use blocks
+    message_content = None
+    if hasattr(response, "content") and response.content:
+        content_parts = []
+        for content_block in response.content:
+            block_type = getattr(content_block, "type", None)
+            if block_type == "text":
+                # Text content block
+                content_parts.append(getattr(content_block, "text", ""))
+            elif block_type == "tool_use":
+                # Tool use block - serialize the tool call information
+                tool_info = {
+                    "type": "tool_use",
+                    "id": getattr(content_block, "id", None),
+                    "name": getattr(content_block, "name", None),
+                    "input": getattr(content_block, "input", None),
+                }
+                content_parts.append(f"[TOOL_USE: {tool_info}]")
+        message_content = "\n".join(content_parts) if content_parts else None
 
     if model_name:
         return message_content, _create_usage(
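The effect of this change is easiest to see on a response that mixes text and tool-use blocks. A rough sketch with stand-in objects (the `SimpleNamespace` content blocks are mocks, not Anthropic SDK types) that mirrors the formatting logic above:

```python
from types import SimpleNamespace

# Mock of an Anthropic message content list with a text block and a tool_use block.
content = [
    SimpleNamespace(type="text", text="Let me check the weather."),
    SimpleNamespace(type="tool_use", id="toolu_123", name="get_weather",
                    input={"city": "Paris"}),
]

content_parts = []
for block in content:
    block_type = getattr(block, "type", None)
    if block_type == "text":
        content_parts.append(getattr(block, "text", ""))
    elif block_type == "tool_use":
        tool_info = {
            "type": "tool_use",
            "id": getattr(block, "id", None),
            "name": getattr(block, "name", None),
            "input": getattr(block, "input", None),
        }
        content_parts.append(f"[TOOL_USE: {tool_info}]")

message_content = "\n".join(content_parts)
# -> "Let me check the weather.\n[TOOL_USE: {'type': 'tool_use', 'id': 'toolu_123', ...}]"
```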
judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.14.1"
+__version__ = "0.16.0"
 
 
 def get_version() -> str:
@@ -0,0 +1,266 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.16.0
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10
+Requires-Dist: boto3>=1.40.11
+Requires-Dist: click<8.2.0
+Requires-Dist: dotenv
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: litellm<1.75.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+Requires-Dist: opentelemetry-sdk>=1.36.0
+Requires-Dist: orjson>=3.9.0
+Requires-Dist: typer>=0.9.0
+Provides-Extra: s3
+Requires-Dist: boto3>=1.40.11; extra == 's3'
+Provides-Extra: trainer
+Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<a href="https://judgmentlabs.ai/">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
+    <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
+  </picture>
+</a>
+
+<br>
+
+## Agent Behavior Monitoring (ABM)
+
+Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
+
+[![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
+[![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
+[![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+
+
+[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
+[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
+
+</div>
+
+
+</table>
+
+## [NEW] 🎆 Agent Reinforcement Learning
+
+Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
+
+Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
+
+```python
+await trainer.train(
+    agent_function=your_agent_function,  # entry point to your agent
+    scorers=[RewardScorer()],  # Custom scorer you define based on task criteria, acts as reward
+    prompts=training_prompts,  # Tasks
+    rft_provider="fireworks"
+)
+```
+
+**That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
+
+👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
+
+
+You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
+
+
+## Judgeval Overview
+
+Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).
+
+Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
+
+## 📚 Cookbooks
+
+| Try Out | Notebook | Description |
+|:---------|:-----|:------------|
+| RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
+| Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
+| Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
+| Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
+
+You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
+
+You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
+
+## Why Judgeval?
+
+🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
+
+⚙️ **Custom Evaluators**: No restriction to only monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate to our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
+
+🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
+
+📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
+<!-- Add link to Bucketing docs once we have it -->
+<!--
+TODO: Once we have trainer code docs, plug in here
+-->
+
+🧪 **Run experiments on your agents**: Compare test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
+
+<!--
+Use this once we have AI PM features:
+
+**Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
+
+-->
+
+## 🛠️ Quickstart
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
+
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
+
+**If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
+
+### Start monitoring with Judgeval
+
+```python
+from judgeval.tracer import Tracer, wrap
+from judgeval.data import Example
+from judgeval.scorers import AnswerRelevancyScorer
+from openai import OpenAI
+
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI())  # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_question(question: str) -> str:
+    # dummy tool
+    return f"Question : {question}"
+
+@judgment.observe(span_type="function")
+def run_agent(prompt: str) -> str:
+    task = format_question(prompt)
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": task}]
+    )
+
+    judgment.async_evaluate(  # trigger online monitoring
+        scorer=AnswerRelevancyScorer(threshold=0.5),  # swap with any scorer
+        example=Example(input=task, actual_output=response),  # customize to your data
+        model="gpt-5",
+    )
+    return response.choices[0].message.content
+
+run_agent("What is the capital of the United States?")
+```
+
+Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
+
+![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
+
+
+### Customizable Scorers Over Agent Behavior
+
+Judgeval's strongest suit is the full customization over the types of scorers you can run online monitoring with. No restrictions to only single-prompt LLM judges or prefab scorers - if you can express your scorer
+in python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
+
+
+First, create a behavior scorer in a file called `helpfulness_scorer.py`:
+
+```python
+from judgeval.data import Example
+from judgeval.scorers.example_scorer import ExampleScorer
+
+# Define custom example class
+class QuestionAnswer(Example):
+    question: str
+    answer: str
+
+# Define a server-hosted custom scorer
+class HelpfulnessScorer(ExampleScorer):
+    name: str = "Helpfulness Scorer"
+    server_hosted: bool = True  # Enable server hosting
+    async def a_score_example(self, example: QuestionAnswer):
+        # Custom scoring logic for agent behavior
+        # Can be an arbitrary combination of code and LLM calls
+        if len(example.answer) > 10 and "?" not in example.answer:
+            self.reason = "Answer is detailed and provides helpful information"
+            return 1.0
+        else:
+            self.reason = "Answer is too brief or unclear"
+            return 0.0
+```
+
+Then deploy your scorer to Judgment's infrastructure:
+
+```bash
+echo "pydantic" > requirements.txt
+uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
+```
+
+Now you can instrument your agent with monitoring and online evaluation:
+
+```python
+from judgeval.tracer import Tracer, wrap
+from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
+from openai import OpenAI
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI())  # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_task(question: str) -> str:  # replace with your prompt engineering
+    return f"Please answer the following question: {question}"
+
+@judgment.observe(span_type="tool")
+def answer_question(prompt: str) -> str:  # replace with your LLM system calls
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return response.choices[0].message.content
+
+@judgment.observe(span_type="function")
+def run_agent(question: str) -> str:
+    task = format_task(question)
+    answer = answer_question(task)
+
+    # Add online evaluation with server-hosted scorer
+    judgment.async_evaluate(
+        scorer=HelpfulnessScorer(),
+        example=QuestionAnswer(question=question, answer=answer),
+        sampling_rate=0.9  # Evaluate 90% of agent runs
+    )
+
+    return answer
+
+if __name__ == "__main__":
+    result = run_agent("What is the capital of the United States?")
+    print(result)
+```
+
+Congratulations! Your online eval result should look like this:
+
+![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
+
+You can now run any online scorer in a secure Firecracker microVMs with no latency impact on your applications.
+
+---
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
@@ -4,22 +4,23 @@ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
 judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
 judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
 judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
-judgeval/version.py,sha256=jxLK8GY7YWWLhTk4egDdn5VKiEty1Qpb-C3dLL2m-To,74
+judgeval/version.py,sha256=UCd6S0KuM6h0ZUz8pm-Ty1EDHaJNSUYM_7PrDz0ov-E,74
 judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
-judgeval/api/__init__.py,sha256=3Pm0qQ4ZQj76jUsJVrnuazRnYcqF3pzM_Wv_Z6lOv0w,13216
-judgeval/api/api_types.py,sha256=mtk9xcgYGj1zXV1w_vZ_fbVu9OI4i2IIDLL37lgYnV4,8979
+judgeval/api/__init__.py,sha256=ho8L4wC9y-STYEpk5zHwc2mZJhC4ezW8jiGgOIERBVY,12058
+judgeval/api/api_types.py,sha256=6wrjvO8XsYbfPxjQ_sHS9EOjqexbn3XDFclWqb4CgZ4,8874
 judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
 judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
 judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
-judgeval/data/judgment_types.py,sha256=fNRqiGEG_nJhVkucagoxxgFqmpwK0-GlwWOwjmBtpXk,16603
+judgeval/data/judgment_types.py,sha256=uI4wUiXeA6k8o2ONia506eaZcydHKQKrK1LzccTK-xc,16577
 judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
 judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
 judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
 judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
 judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/dataset/__init__.py,sha256=4CiV7jQUiJ8_IXnD_E-vS5OfoEr0hghBe3-OSuVoBwE,8277
-judgeval/evaluation/__init__.py,sha256=6bSC1Sw-fpJN6OkZTv4UtAoYZqkjUy7OG17lxiRX5qE,13321
+judgeval/dataset/__init__.py,sha256=kL0_tIMP3qV6t4W17HQU91ybdXMZ5iDZzyUKzyfRdyY,8269
+judgeval/evaluation/__init__.py,sha256=WcqOgQdwgtc_BwEwDz6RDlF2RczyLrNjjIevQp-_NKE,12788
 judgeval/integrations/langgraph/__init__.py,sha256=HwXmtDxaO75Kn4KPErnMb6Ne6FcpRxV_SCYVuwFsve0,332
+judgeval/integrations/openlit/__init__.py,sha256=-8D4D6-fGsWPwoOojw82OaE9X5sUbmb16x1bF-WfOmg,1571
 judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
 judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
 judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
@@ -39,8 +40,8 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFy
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=FbrXNMedeepYp_bADsysapIIZcr09l9EV9QWfGxvanw,10075
-judgeval/tracer/__init__.py,sha256=iqFvWok4QBW-1bs2zCmkhw4Y_o2d2mVeiPUtQbG9Nvc,35995
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=zJ0n3HyZ1FFBnMnTYxBi37m_3Er7ENd4HpqLjNi5Eag,10902
+judgeval/tracer/__init__.py,sha256=uIOx-2P_FVwBKhwVkkIOyEQCv3gouCZ2I8-eApocnKU,36165
 judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
 judgeval/tracer/keys.py,sha256=ho4-_w4ngTVejdSKUH80sG6vtYt4c7FEKrYpFrDfPLs,2105
 judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320
@@ -50,7 +51,7 @@ judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwo
 judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
 judgeval/tracer/exporters/store.py,sha256=KQV3cyqteesByQjR-9VdPXT9OlUZ-6F08ogqj837_c0,1012
 judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
-judgeval/tracer/llm/__init__.py,sha256=6JSF-RaK6tZNzd0rZOK6Don7vvf15EhSPSio_FmS7i8,42564
+judgeval/tracer/llm/__init__.py,sha256=b7toFMVyZU4Pv8jximfneP5gyohUB4DwJDvy8b2_IMw,44217
 judgeval/tracer/llm/providers.py,sha256=UU8xrh2n9p3xZwnlWMUcZoFpog2-F9-YfcV0c2aUNqQ,1432
 judgeval/tracer/llm/anthropic/__init__.py,sha256=DUTkYjMejWLI8inFJ_Ih7vf7_aJFAiCyi1Oxls-ACGo,439
 judgeval/tracer/llm/google/__init__.py,sha256=7j96SPUl61yVl3jCQ-JuPpgVU9GhmcsBzY2vj5wJAVo,506
@@ -72,8 +73,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
 judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
 judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
 judgeval/utils/version_check.py,sha256=ylZQSqV7kLzEOChxvav9SCHUU4OnaCp36tXHLjdzmw0,1072
-judgeval-0.14.1.dist-info/METADATA,sha256=e8rJlBzFrfcadnR6-WiBQaRTKj2LlsnuxAS-Ag_WK1Q,8564
-judgeval-0.14.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.14.1.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
-judgeval-0.14.1.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.14.1.dist-info/RECORD,,
+judgeval-0.16.0.dist-info/METADATA,sha256=kojyijzNE_2gKKvMGrs7E0zHHv3GtOXRjfmIOUQujTY,11512
+judgeval-0.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.16.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.16.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.16.0.dist-info/RECORD,,
@@ -1,158 +0,0 @@
-Metadata-Version: 2.4
-Name: judgeval
-Version: 0.14.1
-Summary: Judgeval Package
-Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
-Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
-Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
-License-Expression: Apache-2.0
-License-File: LICENSE.md
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.10
-Requires-Dist: boto3>=1.40.11
-Requires-Dist: click<8.2.0
-Requires-Dist: dotenv
-Requires-Dist: httpx>=0.28.1
-Requires-Dist: litellm<1.75.0
-Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
-Requires-Dist: opentelemetry-sdk>=1.36.0
-Requires-Dist: orjson>=3.9.0
-Requires-Dist: typer>=0.9.0
-Provides-Extra: s3
-Requires-Dist: boto3>=1.40.11; extra == 's3'
-Provides-Extra: trainer
-Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
-Description-Content-Type: text/markdown
-
-<div align="center">
-
-<img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
-<img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
-
-<br>
-<div style="font-size: 1.5em;">
-    Enable self-learning agents with environment data and evals.
-</div>
-
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
-
-[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
-
-We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for monitoring and post-training.
-
-[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/tGVFf8UBUY)
-
-<img src="assets/product_shot.png" alt="Judgment Platform" width="800" />
-
-</div>
-
-Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
-
-## 🎬 See Judgeval in Action
-
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
-
-<table style="width: 100%; max-width: 800px; table-layout: fixed;">
-<tr>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>🤖 Agents Running</strong>
-</td>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Capturing Environment Data </strong>
-</td>
-</tr>
-<tr>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>✅ Agents Completed Running</strong>
-</td>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📤 Exporting Agent Environment Data</strong>
-</td>
-</tr>
-
-</table>
-
-## 📋 Table of Contents
-- [🛠️ Installation](#️-installation)
-- [🏁 Quickstarts](#-quickstarts)
-- [✨ Features](#-features)
-- [🏢 Self-Hosting](#-self-hosting)
-- [📚 Cookbooks](#-cookbooks)
-- [💻 Development with Cursor](#-development-with-cursor)
-
-## 🛠️ Installation
-
-Get started with Judgeval by installing our SDK using pip:
-
-```bash
-pip install judgeval
-```
-
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
-
-```bash
-export JUDGMENT_API_KEY=...
-export JUDGMENT_ORG_ID=...
-```
-
-**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
-
-
-## ✨ Features
-
-| | |
-|:---|:---:|
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
-
-## 🏢 Self-Hosting
-
-Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
-
-### Key Features
-* Deploy Judgment on your own AWS account
-* Store data in your own Supabase instance
-* Access Judgment through your own custom domain
-
-### Getting Started
-1. Check out our [self-hosting documentation](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) for detailed setup instructions, along with how your self-hosted instance can be accessed
-2. Use the [Judgment CLI](https://docs.judgmentlabs.ai/documentation/developer-tools/judgment-cli/installation) to deploy your self-hosted environment
-3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint
-
-## 📚 Cookbooks
-
-Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).
-
-You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).
-
-## 💻 Development with Cursor
-Building agents and LLM workflows in Cursor works best when your coding assistant has the proper context about Judgment integration. The Cursor rules file contains the key information needed for your assistant to implement Judgment features effectively.
-
-Refer to the official [documentation](https://docs.judgmentlabs.ai/documentation/developer-tools/cursor/cursor-rules) for access to the rules file and more information on integrating this rules file with your codebase.
-
-## ⭐ Star Us on GitHub
-
-If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.
-
-## ❤️ Contributors
-
-There are many ways to contribute to Judgeval:
-
-- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
-- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
-- Speaking or writing about Judgment and letting us know!
-
-<!-- Contributors collage -->
-[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
-
----
-
-Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).