judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +132 -281
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +1 -3
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +2 -9
- judgeval/data/datasets/eval_dataset_client.py +1 -62
- judgeval/data/example.py +0 -1
- judgeval/data/result.py +3 -3
- judgeval/data/trace.py +4 -1
- judgeval/data/{sequence_run.py → trace_run.py} +4 -4
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +187 -1768
- judgeval/judges/litellm_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +15 -21
- judgeval/run_evaluation.py +31 -81
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
- judgeval-0.0.38.dist-info/METADATA +247 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/RECORD +22 -23
- judgeval/data/sequence.py +0 -50
- judgeval-0.0.37.dist-info/METADATA +0 -214
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/WHEEL +0 -0
- {judgeval-0.0.37.dist-info → judgeval-0.0.38.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/litellm_judge.py
CHANGED
```diff
@@ -12,7 +12,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-
+    def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
```
judgeval/judges/mixture_of_judges.py
CHANGED
```diff
@@ -136,7 +136,7 @@ class MixtureOfJudges(JudgevalJudge):
     """
     def __init__(self,
                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-
+                 aggregator: str = 'gpt-4.1',
                  **kwargs):
         """
         `models` are the individual judge models to be used for generating responses.
```
judgeval/judges/utils.py
CHANGED
```diff
@@ -23,7 +23,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-
+        return LiteLLMJudge(model="gpt-4.1"), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
```
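Editor's note: taken together, these judge changes move the package's default judge models to the GPT-4.1 family. Below is a minimal sketch of the new defaults, assuming the classes and `create_judge` are importable from the module paths shown above; only the default values themselves come from the diff.

```python
from judgeval.judges.litellm_judge import LiteLLMJudge
from judgeval.judges.utils import create_judge

# Default values visible in the 0.0.38 hunks above:
judge = LiteLLMJudge()                                # model now defaults to "gpt-4.1-mini"
default_judge, is_native = create_judge(model=None)   # -> (LiteLLMJudge(model="gpt-4.1"), True)

# MixtureOfJudges keeps its multi-model judge list but now aggregates
# with "gpt-4.1" by default (see the hunk above).
```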
judgeval/judgment_client.py
CHANGED
```diff
@@ -12,7 +12,7 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-
+    Trace,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -23,9 +23,9 @@ from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-
+    run_trace_eval
 )
-from judgeval.data.
+from judgeval.data.trace_run import TraceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -105,16 +105,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             rules=rules
         )
 
-    def
+    def run_trace_evaluation(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-
+        traces: Optional[List[Trace]] = None,
         examples: Optional[List[Example]] = None,
         test_file: Optional[str] = None,
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
-        eval_run_name: str = "
+        eval_run_name: str = "default_eval_trace",
         log_results: bool = True,
         append: bool = False,
         override: bool = False,
@@ -134,16 +134,16 @@ class JudgmentClient(metaclass=SingletonMeta):
             if examples and not function:
                 raise ValueError("Cannot pass in examples without a function")
 
-            if
-                raise ValueError("Cannot pass in
+            if traces and function:
+                raise ValueError("Cannot pass in traces and function")
 
-            if examples and
-                raise ValueError("Cannot pass in both examples and
+            if examples and traces:
+                raise ValueError("Cannot pass in both examples and traces")
 
-
+            trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-
+                traces=traces,
                 scorers=scorers,
                 model=model,
                 aggregator=aggregator,
@@ -152,9 +152,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
-            return
+            return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(f"Please check your
+            raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
@@ -245,12 +245,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         return self.eval_dataset_client.append_examples(alias, examples, project_name)
 
-    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        """
-        Appends a `Sequence` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
-
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -523,7 +517,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
 
         if function:
-            results = self.
+            results = self.run_trace_evaluation(
                 examples=examples,
                 scorers=scorers,
                 model=model,
```
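Editor's note: the hunks above replace the old sequence-based entry point with `run_trace_evaluation`, which accepts either pre-collected `traces` or `examples` plus an agent `function` and a `tracer`. Below is a hedged usage sketch based only on the signature and checks shown in this diff; the agent function, the example fields, and the `ToolOrderScorer` import path are illustrative assumptions, not taken from the package.

```python
from judgeval import JudgmentClient
from judgeval.common.tracer import Tracer
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer  # assumed re-export path

judgment = Tracer(project_name="my_project")
client = JudgmentClient()

@judgment.observe(span_type="function")
def my_agent(question: str) -> str:
    # Hypothetical agent under test; its spans are collected by `judgment`.
    return f"Answer to: {question}"

# `example.input` is unpacked as keyword arguments into the agent function,
# and `expected_tools` is copied onto the root span of the resulting trace
# (see run_trace_eval in run_evaluation.py below).
example = Example(
    input={"question": "What's the capital of the U.S.?"},
    expected_tools=[],  # illustrative placeholder
)

results = client.run_trace_evaluation(
    scorers=[ToolOrderScorer()],
    examples=[example],
    function=my_agent,
    tracer=judgment,
    project_name="default_project",
    eval_run_name="default_eval_trace",
)
```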
judgeval/run_evaluation.py
CHANGED
```diff
@@ -13,7 +13,6 @@ from judgeval.data import (
     ScoringResult,
     Example,
     CustomExample,
-    Sequence,
     Trace
 )
 from judgeval.scorers import (
@@ -25,11 +24,10 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
-
+    JUDGMENT_TRACE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
-    JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.logger import (
@@ -39,7 +37,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.
+from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
 from langchain_core.callbacks import BaseCallbackHandler
 
@@ -98,20 +96,20 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def
+def execute_api_trace_eval(trace_run: TraceRun) -> List[Dict]:
     """
     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
         # submit API request to execute evals
-        payload =
+        payload = trace_run.model_dump(warnings=False)
         response = requests.post(
-
+            JUDGMENT_TRACE_EVAL_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {
-                "X-Organization-Id":
+                "Authorization": f"Bearer {trace_run.judgment_api_key}",
+                "X-Organization-Id": trace_run.organization_id
             },
             json=payload,
             verify=True
@@ -282,7 +280,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun,
+def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[EvaluationRun, TraceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -327,51 +325,6 @@ def log_evaluation_results(scoring_results: List[ScoringResult], run: Union[Eval
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
-def retrieve_sequence_from_trace(trace_id: str, parent_span: str, judgment_api_key: str, organization_id: str) -> Sequence:
-    """
-    Retrieves a sequence from a trace ID.
-    """
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {judgment_api_key}",
-                "X-Organization-Id": organization_id
-            },
-            json={
-                "trace_id": trace_id,
-                "trace_span_id": parent_span,
-            },
-            verify=True
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        return Sequence(**res.json())
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
 def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
     """Run a function with a spinner in the terminal."""
     spinner = itertools.cycle(['|', '/', '-', '\\'])
@@ -415,62 +368,59 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
         if missing_params:
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def
+def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and
+    if not override and trace_run.log_results and not trace_run.append:
         check_eval_run_name_exists(
-
-
-
-
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id
         )
 
-    if
+    if trace_run.append:
         # Check that the current experiment, if one exists, has the same type (examples of sequences)
         check_experiment_type(
-
-
-
-
+            trace_run.eval_name,
+            trace_run.project_name,
+            trace_run.judgment_api_key,
+            trace_run.organization_id,
             True
         )
 
     if function and tracer:
-
+        new_traces: List[Trace] = []
+        tracer.offline_mode = True
         for example in examples:
             if example.input:
                 result = run_with_spinner("Running agent function: ", function, **example.input)
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
-
-
-
-
-
-            sequence_run.sequences = new_sequences
-
-            for sequence in sequence_run.sequences:
-                sequence.scorers = sequence_run.scorers
+            # We set the root-level trace span with the expected tools of the Trace
+            trace = Trace(**trace)
+            trace.entries[0].expected_tools = examples[i].expected_tools
+            new_traces.append(trace)
+        trace_run.traces = new_traces
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data: List[Dict] = run_with_spinner("Running
+        response_data: List[Dict] = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
         error(f"An error occurred while executing the Judgment API request: {str(e)}")
         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
     except ValueError as e:
-        raise ValueError(f"Please check your
+        raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: {str(e)}")
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    # TODO: allow for custom scorer on
-    if
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"],
+    # TODO: allow for custom scorer on traces
+    if trace_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], trace_run)
         rprint(pretty_str)
 
     return scoring_results
```
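Editor's note: for traces that have already been collected (the non-`function` path), a `TraceRun` can be built directly and handed to `run_trace_eval`. Below is a minimal sketch using only fields visible in the hunks above; the placeholder trace list, the credential values, and the `ToolOrderScorer` import path are assumptions.

```python
from judgeval.data import Trace
from judgeval.data.trace_run import TraceRun
from judgeval.run_evaluation import run_trace_eval
from judgeval.scorers import ToolOrderScorer  # assumed re-export path

# Trace objects would normally come from a Tracer run in offline mode or be
# exported from the Judgment platform; their construction is not shown in this diff.
existing_traces: list[Trace] = []

trace_run = TraceRun(
    project_name="default_project",
    eval_name="default_eval_trace",
    traces=existing_traces,
    scorers=[ToolOrderScorer()],
    model="gpt-4.1",
    log_results=True,
    judgment_api_key="<api-key>",        # JudgmentClient normally fills these in
    organization_id="<organization-id>",
)

# POSTs the serialized TraceRun to JUDGMENT_TRACE_EVAL_API_URL via execute_api_trace_eval
results = run_trace_eval(trace_run)
```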
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
CHANGED
```diff
@@ -5,13 +5,15 @@
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from typing import Optional, Dict
 class ToolOrderScorer(APIJudgmentScorer):
-
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, exact_match: bool=False):
         super().__init__(
             threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
+        self.kwargs = {"exact_match": exact_match}
 
     @property
     def __name__(self):
```
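Editor's note: `ToolOrderScorer` now carries an `exact_match` flag to the backend through its new `kwargs` field. A short usage sketch follows; the constructor arguments come from the hunk above, while the `judgeval.scorers` import path is an assumed re-export.

```python
from judgeval.scorers import ToolOrderScorer  # assumed re-export path

lenient = ToolOrderScorer()                                # threshold=1.0, exact_match=False
strict = ToolOrderScorer(threshold=1.0, exact_match=True)

# The flag is stored on the scorer as kwargs={"exact_match": True} and
# serialized with the evaluation payload sent to the Judgment API.
```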
judgeval-0.0.38.dist-info/METADATA
ADDED
@@ -0,0 +1,247 @@
Metadata-Version: 2.4
Name: judgeval
Version: 0.0.38
Summary: Judgeval Package
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
License-Expression: Apache-2.0
License-File: LICENSE.md
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.11
Requires-Dist: anthropic
Requires-Dist: boto3
Requires-Dist: google-genai
Requires-Dist: langchain-anthropic
Requires-Dist: langchain-core
Requires-Dist: langchain-huggingface
Requires-Dist: langchain-openai
Requires-Dist: litellm==1.38.12
Requires-Dist: nest-asyncio
Requires-Dist: openai
Requires-Dist: pandas
Requires-Dist: python-dotenv==1.0.1
Requires-Dist: requests
Requires-Dist: together
Description-Content-Type: text/markdown

<div align="center">

<img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
<img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />

**Build monitoring & evaluation pipelines for complex agents**

<img src="assets/experiments_page.png" alt="Judgment Platform Experiments Page" width="800" />

<br>

## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [💼 LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o) • [🎮 Discord](https://discord.gg/taAufyhf)
</div>

## Judgeval: open-source testing, monitoring, and optimization for AI agents

Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).

Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.

We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://judgment.mintlify.app/getting_started) to get started.

Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

## 📋 Table of Contents
* [✨ Features](#-features)
* [🔍 Tracing](#-tracing)
* [🧪 Evals](#-evals)
* [📡 Monitoring](#-monitoring)
* [📊 Datasets](#-datasets)
* [💡 Insights](#-insights)
* [🛠️ Installation](#️-installation)
* [🏁 Get Started](#-get-started)
* [🏢 Self-Hosting](#-self-hosting)
* [📚 Cookbooks](#-cookbooks)
* [⭐ Star Us on GitHub](#-star-us-on-github)
* [❤️ Contributors](#️-contributors)

<!-- Created by https://github.com/ekalinin/github-markdown-toc -->


## ✨ Features

| | |
|:---|:---:|
| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
| <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |

## 🛠️ Installation

Get started with Judgeval by installing our SDK using pip:

```bash
pip install judgeval
```

Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).

**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

## 🏁 Get Started

Here's how you can quickly start using Judgeval:

### 🛰️ Tracing

Track your agent execution with full observability with just a few lines of code.
Create a file named `traces.py` with the following code:

```python
from judgeval.common.tracer import Tracer, wrap
from openai import OpenAI

client = wrap(OpenAI())
judgment = Tracer(project_name="my_project")

@judgment.observe(span_type="tool")
def my_tool():
    return "What's the capital of the U.S.?"

@judgment.observe(span_type="function")
def main():
    task_input = my_tool()
    res = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": f"{task_input}"}]
    )
    return res.choices[0].message.content

main()
```

[Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.

### 📝 Offline Evaluations

You can evaluate your agent's execution to measure quality metrics such as hallucination.
Create a file named `evaluate.py` with the following code:

```python evaluate.py
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()

example = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

scorer = FaithfulnessScorer(threshold=0.5)
results = client.run_evaluation(
    examples=[example],
    scorers=[scorer],
    model="gpt-4.1",
)
print(results)
```

[Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.

### 📡 Online Evaluations

Attach performance monitoring on traces to measure the quality of your systems in production.

Using the same `traces.py` file we created earlier, modify `main` function:

```python
from judgeval.common.tracer import Tracer, wrap
from judgeval.scorers import AnswerRelevancyScorer
from openai import OpenAI

client = wrap(OpenAI())
judgment = Tracer(project_name="my_project")

@judgment.observe(span_type="tool")
def my_tool():
    return "Hello world!"

@judgment.observe(span_type="function")
def main():
    task_input = my_tool()
    res = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": f"{task_input}"}]
    ).choices[0].message.content

    judgment.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        input=task_input,
        actual_output=res,
        model="gpt-4.1"
    )
    print("Online evaluation submitted.")
    return res

main()
```

[Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.

## 🏢 Self-Hosting

Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.

### Key Features
* Deploy Judgment on your own AWS account
* Store data in your own Supabase instance
* Access Judgment through your own custom domain

### Getting Started
1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment
3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint

## 📚 Cookbooks

Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/taAufyhf).

You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook). Here are some highlights:

### Sample Agents

#### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.

#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.

### Custom Evaluators

#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
Detecting and evaluating Personal Identifiable Information (PII) leakage.

#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)

Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.

## ⭐ Star Us on GitHub

If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.


## ❤️ Contributors

There are many ways to contribute to Judgeval:

- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
- Speaking or writing about Judgment and letting us know!

<!-- Contributors collage -->
[](https://github.com/JudgmentLabs/judgeval/graphs/contributors)