judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-4o-mini", **kwargs):
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-4o',
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
judgeval/judges/utils.py CHANGED
@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None: # default option
-        return LiteLLMJudge(model="gpt-4o"), True
+        return LiteLLMJudge(model="gpt-4.1"), True
    if not isinstance(model, (str, list, JudgevalJudge)):
        raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
    # If model is already a valid judge type, return it and mark native
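The hunks above move every default judge model from the gpt-4o family to gpt-4.1. Below is a minimal sketch of how the new defaults surface to callers; the import path for `LiteLLMJudge` is an assumption (its file is not named in this diff), while `create_judge` comes from `judgeval/judges/utils.py` shown above.

```python
# Sketch only: the LiteLLMJudge import location is assumed, not confirmed by this diff.
from judgeval.judges.litellm_judge import LiteLLMJudge
from judgeval.judges.utils import create_judge

judge = LiteLLMJudge()                     # now defaults to model="gpt-4.1-mini"
default_judge, is_native = create_judge()  # now returns LiteLLMJudge(model="gpt-4.1"), True
```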
@@ -12,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -23,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_sequence_eval
+    run_trace_eval
 )
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -105,16 +105,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             rules=rules
         )
 
-    def run_sequence_evaluation(
+    def run_trace_evaluation(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-        sequences: Optional[List[Sequence]] = None,
+        traces: Optional[List[Trace]] = None,
         examples: Optional[List[Example]] = None,
         test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "default_eval_sequence",
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
@@ -134,16 +134,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             if examples and not function:
                 raise ValueError("Cannot pass in examples without a function")
 
-            if sequences and function:
-                raise ValueError("Cannot pass in sequences and function")
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            if examples and sequences:
-                raise ValueError("Cannot pass in both examples and sequences")
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
 
-            sequence_run = SequenceRun(
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-                sequences=sequences,
+                traces=traces,
                 scorers=scorers,
                 model=model,
                 aggregator=aggregator,
@@ -152,9 +152,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
-            return run_sequence_eval(sequence_run, override, ignore_errors, function, tracer, examples)
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
@@ -245,12 +245,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -523,7 +517,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
 
         if function:
-            results = self.run_sequence_evaluation(
+            results = self.run_trace_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
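For SDK users, the client method `run_sequence_evaluation` is now `run_trace_evaluation`, with `traces=` replacing `sequences=`. A minimal sketch of the renamed call, using only parameters visible in the signature above; the trace list and scorer choice are illustrative placeholders, not part of the diff:

```python
# Sketch only: `my_traces` is a placeholder for Trace objects you have already collected.
from judgeval import JudgmentClient
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()
my_traces = []  # fill with judgeval Trace objects

results = client.run_trace_evaluation(
    traces=my_traces,                    # replaces the old sequences=... argument
    scorers=[FaithfulnessScorer(threshold=0.5)],
    model="gpt-4.1",
    project_name="default_project",
    eval_run_name="default_eval_trace",  # note the new default run name
)
```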
@@ -13,7 +13,6 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
     Trace
 )
 from judgeval.scorers import (
@@ -25,11 +24,10 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-    JUDGMENT_SEQUENCE_EVAL_API_URL,
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -39,7 +37,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.sequence_run import SequenceRun
+from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler
 
@@ -98,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload = sequence_run.model_dump(warnings=False)
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
-                "X-Organization-Id": sequence_run.organization_id
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
@@ -282,7 +280,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -327,51 +325,6 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
-def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
-    """
-    Retrieves a sequence from a trace ID.
-    """
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
-            },
-            json={
-                "trace_id": trace_id,
-                "trace_span_id": parent_span,
-            },
-            verify=True
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        return Sequence(**res.json())
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -415,62 +368,59 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
             if missing_params:
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and sequence_run.log_results and not sequence_run.append:
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )
 
-    if sequence_run.append:
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-            sequence_run.eval_name,
-            sequence_run.project_name,
-            sequence_run.judgment_api_key,
-            sequence_run.organization_id,
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
 
     if function and tracer:
-        new_sequences: List[Sequence] = []
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
        for example in examples:
            if example.input:
                result = run_with_spinner("Running agent function: ", function, **example.input)
            else:
                result = run_with_spinner("Running agent function: ", function)
        for i, trace in enumerate(tracer.traces):
-            trace_id = trace['trace_id']
-            parent_span = trace['entries'][0]['span_id']
-            new_sequence = retrieve_sequence_from_trace(trace_id, parent_span, sequence_run.judgment_api_key, sequence_run.organization_id)
-            new_sequence.expected_tools = examples[i].expected_tools
-            new_sequences.append(new_sequence)
-        sequence_run.sequences = new_sequences
-
-        for sequence in sequence_run.sequences:
-            sequence.scorers = sequence_run.scorers
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on sequences
-    if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
 
     return scoring_results
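Functionally, the rewritten `run_trace_eval` no longer calls the removed `retrieve_sequence_from_trace` endpoint: when a `function` and `tracer` are supplied, it builds `Trace` objects directly from the tracer's in-memory traces and attaches each example's `expected_tools` to the root span. A condensed sketch of that new local flow, with `tracer`, `examples`, and `trace_run` standing in for objects already in scope:

```python
# Condensed restatement of the new branch above; names refer to objects already in scope.
tracer.offline_mode = True           # keep traces local instead of shipping them live
new_traces = []
for i, raw_trace in enumerate(tracer.traces):
    trace = Trace(**raw_trace)       # construct the Trace locally, no extra API round trip
    trace.entries[0].expected_tools = examples[i].expected_tools  # tag the root span
    new_traces.append(trace)
trace_run.traces = new_traces        # evaluated in a single call to the trace eval endpoint
```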
@@ -5,13 +5,15 @@
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from typing import Optional, Dict
 class ToolOrderScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float=1.0):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
         super().__init__(
             threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
+        self.kwargs = {"exact_match": exact_match}
 
     @property
     def __name__(self):
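`ToolOrderScorer` now takes an `exact_match` flag and forwards it to the backend through a `kwargs` dict. A minimal usage sketch based on the constructor above, assuming the scorer is re-exported from `judgeval.scorers` like the other API scorers:

```python
# Sketch only: assumes ToolOrderScorer is re-exported from judgeval.scorers.
from judgeval.scorers import ToolOrderScorer

strict = ToolOrderScorer(threshold=1.0, exact_match=True)   # require the exact tool order
lenient = ToolOrderScorer()                                 # exact_match defaults to False
print(strict.kwargs)                                        # {'exact_match': True}
```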
@@ -0,0 +1,247 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.0.38
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.11
+Requires-Dist: anthropic
+Requires-Dist: boto3
+Requires-Dist: google-genai
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
+Requires-Dist: litellm==1.38.12
+Requires-Dist: nest-asyncio
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: python-dotenv==1.0.1
+Requires-Dist: requests
+Requires-Dist: together
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
+<img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
+
+**Build monitoring & evaluation pipelines for complex agents**
+
+<img src="assets/experiments_page.png" alt="Judgment Platform Experiments Page" width="800" />
+
+<br>
+
+## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [💼 LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o) • [🎮 Discord](https://discord.gg/taAufyhf)
+</div>
+
+## Judgeval: open-source testing, monitoring, and optimization for AI agents
+
+Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
+
+Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.
+
+We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://judgment.mintlify.app/getting_started) to get started.
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
+
+## 📋 Table of Contents
+* [✨ Features](#-features)
+  * [🔍 Tracing](#-tracing)
+  * [🧪 Evals](#-evals)
+  * [📡 Monitoring](#-monitoring)
+  * [📊 Datasets](#-datasets)
+  * [💡 Insights](#-insights)
+* [🛠️ Installation](#️-installation)
+* [🏁 Get Started](#-get-started)
+* [🏢 Self-Hosting](#-self-hosting)
+* [📚 Cookbooks](#-cookbooks)
+* [⭐ Star Us on GitHub](#-star-us-on-github)
+* [❤️ Contributors](#️-contributors)
+
+<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
+
+
+## ✨ Features
+
+|  |  |
+|:---|:---:|
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+
+## 🛠️ Installation
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+
+## 🏁 Get Started
+
+Here's how you can quickly start using Judgeval:
+
+### 🛰️ Tracing
+
+Track your agent execution with full observability with just a few lines of code.
+Create a file named `traces.py` with the following code:
+
+```python
+from judgeval.common.tracer import Tracer, wrap
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "What's the capital of the U.S.?"
+
+@judgment.observe(span_type="function")
+def main():
+    task_input = my_tool()
+    res = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": f"{task_input}"}]
+    )
+    return res.choices[0].message.content
+
+main()
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
+
+### 📝 Offline Evaluations
+
+You can evaluate your agent's execution to measure quality metrics such as hallucination.
+Create a file named `evaluate.py` with the following code:
+
+```python evaluate.py
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import FaithfulnessScorer
+
+client = JudgmentClient()
+
+example = Example(
+    input="What if these shoes don't fit?",
+    actual_output="We offer a 30-day full refund at no extra cost.",
+    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+)
+
+scorer = FaithfulnessScorer(threshold=0.5)
+results = client.run_evaluation(
+    examples=[example],
+    scorers=[scorer],
+    model="gpt-4.1",
+)
+print(results)
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
+
+### 📡 Online Evaluations
+
+Attach performance monitoring on traces to measure the quality of your systems in production.
+
+Using the same `traces.py` file we created earlier, modify the `main` function:
+
+```python
+from judgeval.common.tracer import Tracer, wrap
+from judgeval.scorers import AnswerRelevancyScorer
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "Hello world!"
+
+@judgment.observe(span_type="function")
+def main():
+    task_input = my_tool()
+    res = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": f"{task_input}"}]
+    ).choices[0].message.content
+
+    judgment.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        input=task_input,
+        actual_output=res,
+        model="gpt-4.1"
+    )
+    print("Online evaluation submitted.")
+    return res
+
+main()
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
+
+## 🏢 Self-Hosting
+
+Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
+
+### Key Features
+* Deploy Judgment on your own AWS account
+* Store data in your own Supabase instance
+* Access Judgment through your own custom domain
+
+### Getting Started
+1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
+2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment
+3. After your self-hosted instance is set up, make sure the `JUDGMENT_API_URL` environment variable is set to your self-hosted backend endpoint
+
+## 📚 Cookbooks
+
+Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/taAufyhf).
+
+You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook). Here are some highlights:
+
+### Sample Agents
+
+#### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
+A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
+
+#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
+A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
+
+### Custom Evaluators
+
+#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
+Detecting and evaluating Personally Identifiable Information (PII) leakage.
+
+#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
+
+Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+
+## ⭐ Star Us on GitHub
+
+If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
+
+
+## ❤️ Contributors
+
+There are many ways to contribute to Judgeval:
+
+- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
+- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
+- Speak or write about Judgment and let us know!
+
+<!-- Contributors collage -->
+[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
+