judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,10 +6,10 @@ import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union, Optional, Callable, Tuple, Any, TYPE_CHECKING
+from typing import List, Dict, Union, Tuple, Any, TYPE_CHECKING
 from rich import print as rprint

-from judgeval.data import ScorerData, ScoringResult, Example, Trace
+from judgeval.data import ScorerData, ScoringResult, Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.common.api import JudgmentApiClient
@@ -22,10 +22,7 @@ from judgeval.common.logger import judgeval_logger


 if TYPE_CHECKING:
-    from judgeval.common.tracer import Tracer
-    from judgeval.data.trace_run import TraceRun
-    from judgeval.evaluation_run import EvaluationRun
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+    from judgeval.data.evaluation_run import EvaluationRun


 def safe_run_async(coro):
@@ -99,29 +96,6 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
         )


-def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
-    """
-    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not judgment_api_key or not trace_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
-        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "An unknown error occurred."
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "An unknown error occurred.")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     """
     Checks if any `ScoringResult` objects are missing `scorers_data`.
@@ -140,83 +114,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
-    run: Union[EvaluationRun, TraceRun],
+    run: EvaluationRun,
     judgment_api_key: str,
 ) -> str:
     """
@@ -282,137 +182,8 @@ def check_examples(
             rprint("[green]Continuing...[/green]")


-def run_trace_eval(
-    trace_run: TraceRun,
-    judgment_api_key: str,
-    override: bool = False,
-    function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
-    examples: Optional[List[Example]] = None,
-) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
-    if function and tracer and examples is not None:
-        new_traces: List[Trace] = []
-
-        # Handle case where tracer is actually a callback handler
-        actual_tracer = tracer
-        if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
-            # This is a callback handler, get the underlying tracer
-            actual_tracer = tracer.tracer
-
-        if trace_run.project_name != actual_tracer.project_name:
-            raise ValueError(
-                f"Project name mismatch between run_trace_eval and tracer. "
-                f"Trace run: {trace_run.project_name}, "
-                f"Tracer: {actual_tracer.project_name}"
-            )
-
-        actual_tracer.offline_mode = True
-        actual_tracer.traces = []
-        judgeval_logger.info("Running agent function: ")
-        for example in examples:
-            if example.input:
-                if isinstance(example.input, str):
-                    function(example.input)
-                elif isinstance(example.input, dict):
-                    function(**example.input)
-                else:
-                    raise ValueError(
-                        f"Input must be string or dict, got {type(example.input)}"
-                    )
-            else:
-                function()
-
-        for i, trace in enumerate(actual_tracer.traces):
-            # We set the root-level trace span with the expected tools of the Trace
-            trace = Trace(**trace)
-            trace.trace_spans[0].expected_tools = examples[i].expected_tools
-            new_traces.append(trace)
-        trace_run.traces = new_traces
-        actual_tracer.traces = []
-
-    # Execute evaluation using Judgment API
-    try:  # execute an EvaluationRun with just JudgmentScorers
-        judgeval_logger.info("Executing Trace Evaluation... ")
-        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
-        scoring_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-    except JudgmentAPIError as e:
-        raise JudgmentAPIError(
-            f"An error occurred while executing the Judgment API request: {str(e)}"
-        )
-    except ValueError as e:
-        raise ValueError(
-            f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
-        )
-
-    # Convert the response data to `ScoringResult` objects
-    # TODO: allow for custom scorer on traces
-
-    url = log_evaluation_results(
-        response_data["agent_results"], trace_run, judgment_api_key
-    )
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
-    return scoring_results
-
-
-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-    eval_name: str,
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -443,14 +214,16 @@
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(eval_name, project_name)
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

             if status_response.get("status") != "completed":
                 time.sleep(poll_interval_seconds)
                 continue

             results_response = api_client.fetch_evaluation_results(
-                project_name, eval_name
+                experiment_run_id, project_name
             )
             url = results_response.get("ui_results_url")

@@ -513,14 +286,15 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    override: bool = False,
+    show_url: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
+        judgment_api_key (str): API key for authentication
+        show_url (bool): Whether to display the evaluation results URL. Defaults to True.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
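In 0.7.0, `run_eval` drops the `override` flag and gains `show_url` (default `True`), which gates printing of the results link. A hedged sketch of the new call shape follows; the module path is an assumption, and building the `EvaluationRun` is elided because its constructor is not shown in this diff:

```python
# Sketch only (not from the diff): adapting a caller to the 0.7.0 signature.
from judgeval.run_evaluation import run_eval  # import path is an assumption


def run_quietly(evaluation_run, judgment_api_key: str):
    # `override` no longer exists; `show_url=False` suppresses the results link.
    return run_eval(evaluation_run, judgment_api_key, show_url=False)
```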
@@ -534,52 +308,31 @@ def run_eval(
                 f"All examples must have the same keys: {current_keys} != {keys}"
             )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-    # Ensure that current experiment (if one exists) has the same keys for example
-    check_example_keys(
-        keys=list(keys),
-        eval_name=evaluation_run.eval_name,
-        project_name=evaluation_run.project_name,
-        judgment_api_key=judgment_api_key,
-        organization_id=evaluation_run.organization_id,
-    )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
     results: List[ScoringResult] = []
     url = ""

-    if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)

-    if len(judgment_scorers) > 0:
-        check_examples(evaluation_run.examples, judgment_scorers)
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
         stop_event = threading.Event()
         t = threading.Thread(
             target=progress_logger, args=(stop_event, "Running evaluation...")
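The branching introduced above enforces three constraints before dispatching to the Judgment API: local custom scorers cannot be mixed with Judgment API scorers, hosted (`server_hosted`) custom scorers cannot be combined with Judgment API scorers, and at most one hosted custom scorer may run per evaluation. A self-contained restatement of those rules with toy types (not judgeval code):

```python
# Toy restatement of the constraints introduced in this hunk; not library code.
from dataclasses import dataclass
from typing import Sequence


@dataclass
class ToyCustomScorer:
    server_hosted: bool = False  # mirrors the BaseScorer.server_hosted flag added in 0.7.0


def validate_scorer_mix(custom: Sequence[ToyCustomScorer], judgment: Sequence[object]) -> None:
    hosted = [s for s in custom if s.server_hosted]
    if custom and judgment:
        raise ValueError("Local custom scorers and Judgment API scorers cannot be mixed.")
    if hosted and judgment:
        raise ValueError("Hosted custom scorers and Judgment API scorers cannot be mixed.")
    if len(hosted) > 1:
        raise ValueError("Only one hosted custom scorer is supported per run.")


validate_scorer_mix([ToyCustomScorer(server_hosted=True)], [])  # passes
```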
@@ -600,36 +353,26 @@
                )
                raise JudgmentAPIError(error_message)

-            old_scorer_data_count = 0
-            if evaluation_run.append:
-                try:
-                    results_response = api_client.fetch_evaluation_results(
-                        evaluation_run.project_name, evaluation_run.eval_name
-                    )
-                    old_scorer_data_count = retrieve_counts(results_response)
-                except Exception:
-                    # This usually means the user did append = True but the eval run name doesn't exist yet
-                    pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
             results, url = _poll_evaluation_until_complete(
-                eval_name=evaluation_run.eval_name,
+                experiment_run_id=evaluation_run.id,
                 project_name=evaluation_run.project_name,
                 judgment_api_key=judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
             )
         finally:
             stop_event.set()
             t.join()
-
-    if len(local_scorers) > 0:
+    else:
         results = safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-                local_scorers,
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=MAX_CONCURRENT_EVALUATIONS,
@@ -640,9 +383,10 @@
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
     url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
+    if show_url:
+        rprint(
+            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+        )
     return results


@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="before")
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data: dict):
-        if data.get("strict_mode"):
-            data["threshold"] = 1.0
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-            # Try to use the class name if it exists and is not empty
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
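The BaseScorer changes above switch `enforce_strict_threshold` from a `mode="before"` validator on the raw input dict to a `mode="after"` validator on the constructed model, and `default_name` now records `class_name` and uses it as the fallback name. A self-contained toy model showing the same pydantic v2 pattern (not the judgeval class itself):

```python
# Toy illustration of the mode="after" validator pattern used above; not judgeval code.
from typing import Optional

from pydantic import BaseModel, model_validator


class ToyScorer(BaseModel):
    name: Optional[str] = None
    class_name: Optional[str] = None
    strict_mode: bool = False
    threshold: float = 0.5

    @model_validator(mode="after")
    def enforce_strict_threshold(self) -> "ToyScorer":
        # Runs on the built model, so subclass field defaults are already applied.
        if self.strict_mode:
            self.threshold = 1.0
        return self

    @model_validator(mode="after")
    def default_name(self) -> "ToyScorer":
        # Record the concrete subclass name and fall back to it when no name is given.
        self.class_name = self.__class__.__name__
        if not self.name:
            self.name = self.class_name
        return self


class ToyFaithfulness(ToyScorer):
    pass


print(ToyFaithfulness(strict_mode=True).threshold)  # 1.0
print(ToyFaithfulness().name)  # "ToyFaithfulness"
```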
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
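`push_prompt_scorer` now takes `threshold` between `prompt` and `options` and forwards it to `save_scorer`. A hedged usage sketch; the import path is an assumption (the hunk does not name the module), and the scorer name, prompt, threshold, and option weights are illustrative:

```python
# Illustrative call against the 0.7.0 signature shown above.
from judgeval.scorers.prompt_scorer import push_prompt_scorer  # import path is an assumption

result = push_prompt_scorer(
    name="helpfulness",  # illustrative scorer name
    prompt="Rate how helpful the response is to the user's question.",
    threshold=0.7,  # new in 0.7.0: passed through to save_scorer
    options={"helpful": 1.0, "unhelpful": 0.0},
)
```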
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(name, prompt, options, judgment_api_key, organization_id)
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
         judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.5.0
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +25,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
@@ -37,7 +40,7 @@ Description-Content-Type: text/markdown

 <br>
 <div style="font-size: 1.5em;">
-Enable self-learning agents with traces, evals, and environment data.
+Enable self-learning agents with environment data and evals.
 </div>

 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -54,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing

 </div>

-Judgeval offers **open-source tooling** for tracing and evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.

 ## 🎬 See Judgeval in Action

-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.

 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -67,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
 <br><strong>🤖 Agents Running</strong>
 </td>
 <td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Real-time Tracing</strong>
+<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Capturing Environment Data </strong>
 </td>
 </tr>
 <tr>
@@ -109,54 +112,14 @@ export JUDGMENT_ORG_ID=...

 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI())  # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-

 ## ✨ Features

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🏢 Self-Hosting