judgeval 0.0.44.tar.gz → 0.0.46.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval-0.0.46/.github/pull_request_template.md +13 -0
- judgeval-0.0.46/.github/workflows/lint.yaml +37 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/.github/workflows/release.yaml +3 -1
- {judgeval-0.0.44 → judgeval-0.0.46}/.gitignore +1 -0
- judgeval-0.0.46/.pre-commit-config.yaml +21 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/PKG-INFO +79 -135
- {judgeval-0.0.44 → judgeval-0.0.46}/README.md +78 -134
- judgeval-0.0.46/assets/agent.gif +0 -0
- judgeval-0.0.46/assets/data.gif +0 -0
- judgeval-0.0.46/assets/document.gif +0 -0
- judgeval-0.0.46/assets/error_analysis_dashboard.png +0 -0
- judgeval-0.0.46/assets/product_shot.png +0 -0
- judgeval-0.0.46/assets/trace.gif +0 -0
- judgeval-0.0.46/assets/trace_demo.png +0 -0
- judgeval-0.0.46/assets/trace_screenshot.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/pyproject.toml +5 -1
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/__init__.py +5 -4
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/clients.py +6 -6
- judgeval-0.0.46/src/judgeval/common/__init__.py +13 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/common/exceptions.py +2 -3
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/common/logger.py +74 -49
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/common/s3_storage.py +30 -23
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/common/tracer.py +1273 -939
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/common/utils.py +416 -244
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/constants.py +73 -61
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/__init__.py +1 -1
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/custom_example.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/datasets/dataset.py +80 -54
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/datasets/eval_dataset_client.py +131 -181
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/example.py +67 -43
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/result.py +11 -9
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/scorer_data.py +4 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/tool.py +25 -16
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/trace.py +57 -29
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/trace_run.py +5 -11
- judgeval-0.0.46/src/judgeval/evaluation_run.py +84 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/integrations/langgraph.py +546 -184
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/judges/base_judge.py +1 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/judges/litellm_judge.py +33 -11
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/judges/mixture_of_judges.py +128 -78
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/judges/together_judge.py +22 -9
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/judges/utils.py +14 -5
- judgeval-0.0.46/src/judgeval/judgment_client.py +565 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/rules.py +169 -142
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/run_evaluation.py +462 -305
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/api_scorer.py +20 -11
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/exceptions.py +1 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorer.py +77 -58
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/prompt_scorer.py +48 -37
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/score.py +86 -53
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/utils.py +11 -7
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/tracer/__init__.py +1 -1
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/utils/alerts.py +23 -12
- judgeval-0.0.44/src/judgeval/utils/data_utils.py → judgeval-0.0.46/src/judgeval/utils/file_utils.py +5 -9
- judgeval-0.0.46/src/judgeval/utils/requests.py +29 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/version_check.py +5 -2
- {judgeval-0.0.44 → judgeval-0.0.46}/update_version.py +1 -1
- {judgeval-0.0.44 → judgeval-0.0.46}/uv.lock +77 -0
- judgeval-0.0.44/.github/pull_request_template.md +0 -31
- judgeval-0.0.44/assets/trace_demo.png +0 -0
- judgeval-0.0.44/docs/README.md +0 -32
- judgeval-0.0.44/docs/alerts/notifications.mdx +0 -283
- judgeval-0.0.44/docs/alerts/platform_notifications.mdx +0 -74
- judgeval-0.0.44/docs/alerts/rules.mdx +0 -160
- judgeval-0.0.44/docs/api_reference/judgment_client.mdx +0 -147
- judgeval-0.0.44/docs/api_reference/trace.mdx +0 -140
- judgeval-0.0.44/docs/changelog/2025-04-21.mdx +0 -19
- judgeval-0.0.44/docs/clustering/clustering.mdx +0 -72
- judgeval-0.0.44/docs/compliance/certifications.mdx +0 -47
- judgeval-0.0.44/docs/development.mdx +0 -106
- judgeval-0.0.44/docs/essentials/code.mdx +0 -37
- judgeval-0.0.44/docs/essentials/images.mdx +0 -59
- judgeval-0.0.44/docs/essentials/markdown.mdx +0 -88
- judgeval-0.0.44/docs/essentials/navigation.mdx +0 -66
- judgeval-0.0.44/docs/essentials/reusable-snippets.mdx +0 -110
- judgeval-0.0.44/docs/essentials/settings.mdx +0 -318
- judgeval-0.0.44/docs/evaluation/data_datasets.mdx +0 -356
- judgeval-0.0.44/docs/evaluation/data_examples.mdx +0 -229
- judgeval-0.0.44/docs/evaluation/data_sequences.mdx +0 -80
- judgeval-0.0.44/docs/evaluation/experiment_comparisons.mdx +0 -143
- judgeval-0.0.44/docs/evaluation/introduction.mdx +0 -224
- judgeval-0.0.44/docs/evaluation/judges.mdx +0 -209
- judgeval-0.0.44/docs/evaluation/scorers/agent/derailment.mdx +0 -54
- judgeval-0.0.44/docs/evaluation/scorers/classifier_scorer.mdx +0 -103
- judgeval-0.0.44/docs/evaluation/scorers/custom_scorers.mdx +0 -365
- judgeval-0.0.44/docs/evaluation/scorers/default/answer_correctness.mdx +0 -86
- judgeval-0.0.44/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -85
- judgeval-0.0.44/docs/evaluation/scorers/default/comparison.mdx +0 -102
- judgeval-0.0.44/docs/evaluation/scorers/default/contextual_precision.mdx +0 -106
- judgeval-0.0.44/docs/evaluation/scorers/default/contextual_recall.mdx +0 -104
- judgeval-0.0.44/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -90
- judgeval-0.0.44/docs/evaluation/scorers/default/execution_order.mdx +0 -72
- judgeval-0.0.44/docs/evaluation/scorers/default/faithfulness.mdx +0 -97
- judgeval-0.0.44/docs/evaluation/scorers/default/groundedness.mdx +0 -65
- judgeval-0.0.44/docs/evaluation/scorers/default/json_correctness.mdx +0 -54
- judgeval-0.0.44/docs/evaluation/scorers/default/summarization.mdx +0 -62
- judgeval-0.0.44/docs/evaluation/scorers/introduction.mdx +0 -111
- judgeval-0.0.44/docs/evaluation/unit_testing.mdx +0 -93
- judgeval-0.0.44/docs/favicon.svg +0 -49
- judgeval-0.0.44/docs/getting_started.mdx +0 -374
- judgeval-0.0.44/docs/images/annotation_queue_ui.png +0 -0
- judgeval-0.0.44/docs/images/basic_trace_example.png +0 -0
- judgeval-0.0.44/docs/images/checks-passed.png +0 -0
- judgeval-0.0.44/docs/images/cluster.png +0 -0
- judgeval-0.0.44/docs/images/cluster_button.png +0 -0
- judgeval-0.0.44/docs/images/create_aggressive_scorer.png +0 -0
- judgeval-0.0.44/docs/images/create_scorer.png +0 -0
- judgeval-0.0.44/docs/images/dashboard_annotation_queue_button.png +0 -0
- judgeval-0.0.44/docs/images/evaluation_diagram.png +0 -0
- judgeval-0.0.44/docs/images/experiment-comparison-page-2.png +0 -0
- judgeval-0.0.44/docs/images/experiment-page-comparison.png +0 -0
- judgeval-0.0.44/docs/images/experiment-popout-comparison.png +0 -0
- judgeval-0.0.44/docs/images/experiments-page-comparison-2.png +0 -0
- judgeval-0.0.44/docs/images/experiments-page-comparison.png +0 -0
- judgeval-0.0.44/docs/images/export-dataset.png +0 -0
- judgeval-0.0.44/docs/images/hero-dark.svg +0 -161
- judgeval-0.0.44/docs/images/hero-light.svg +0 -155
- judgeval-0.0.44/docs/images/notifications_page.png +0 -0
- judgeval-0.0.44/docs/images/online_eval_fault.png +0 -0
- judgeval-0.0.44/docs/images/reports_modal.png +0 -0
- judgeval-0.0.44/docs/images/synth_data_button.png +0 -0
- judgeval-0.0.44/docs/images/synth_data_window.png +0 -0
- judgeval-0.0.44/docs/images/trace_ss.png +0 -0
- judgeval-0.0.44/docs/integration/langgraph.mdx +0 -207
- judgeval-0.0.44/docs/introduction.mdx +0 -19
- judgeval-0.0.44/docs/judgment_cli/installation.mdx +0 -91
- judgeval-0.0.44/docs/judgment_cli/self-hosting.mdx +0 -190
- judgeval-0.0.44/docs/judgment_cli/supabase-org-id.png +0 -0
- judgeval-0.0.44/docs/logo/dark.svg +0 -55
- judgeval-0.0.44/docs/logo/light.svg +0 -51
- judgeval-0.0.44/docs/mint.json +0 -168
- judgeval-0.0.44/docs/monitoring/annotations.mdx +0 -41
- judgeval-0.0.44/docs/monitoring/introduction.mdx +0 -36
- judgeval-0.0.44/docs/monitoring/production_insights.mdx +0 -0
- judgeval-0.0.44/docs/monitoring/tracing.mdx +0 -490
- judgeval-0.0.44/docs/monitoring/tracing_s3.mdx +0 -60
- judgeval-0.0.44/docs/notebooks/create_dataset.ipynb +0 -250
- judgeval-0.0.44/docs/notebooks/create_scorer.ipynb +0 -57
- judgeval-0.0.44/docs/notebooks/demo.ipynb +0 -389
- judgeval-0.0.44/docs/notebooks/prompt_scorer.ipynb +0 -165
- judgeval-0.0.44/docs/notebooks/quickstart.ipynb +0 -252
- judgeval-0.0.44/docs/optimization/osiris_agent.mdx +0 -598
- judgeval-0.0.44/docs/quickstart.mdx +0 -89
- judgeval-0.0.44/docs/self_hosting/get_started.mdx +0 -73
- judgeval-0.0.44/docs/snippets/snippet-intro.mdx +0 -4
- judgeval-0.0.44/docs/synthetic_data/synthetic_data.mdx +0 -66
- judgeval-0.0.44/src/judgeval/common/__init__.py +0 -8
- judgeval-0.0.44/src/judgeval/evaluation_run.py +0 -144
- judgeval-0.0.44/src/judgeval/judgment_client.py +0 -577
- {judgeval-0.0.44 → judgeval-0.0.46}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/LICENSE.md +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/"assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png" +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/experiments_page.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/logo-dark.svg +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/logo-light.svg +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/new_darkmode.svg +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/assets/new_lightmode.svg +0 -0
- judgeval-0.0.44/assets/trace_screenshot.png → judgeval-0.0.46/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/pytest.ini +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/.coveragerc +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.44 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
judgeval-0.0.46/.github/pull_request_template.md
@@ -0,0 +1,13 @@
+## 📝 Summary
+
+<!-- Provide a brief description of the changes introduced by this PR -->
+
+## 🎥 Demo of Changes
+
+<!-- Add a short 1-3 minute video describing/demoing the changes -->
+
+## ✅ Checklist
+
+- [ ] Tagged Linear ticket in PR title. Ie. PR Title (JUD-XXXX)
+- [ ] Video demo of changes
+- [ ] Reviewers assigned
judgeval-0.0.46/.github/workflows/lint.yaml
@@ -0,0 +1,37 @@
+name: Lint
+
+on:
+  pull_request:
+    branches: [ main, staging ]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install ruff
+        uses: astral-sh/ruff-action@v3
+        with:
+          args: "--version"
+
+      - name: Install mypy and dependencies
+        run: |
+          pip install mypy types-requests types-PyYAML
+
+      - name: Run ruff formatter
+        if: always()
+        run: ruff format --check .
+
+      - name: Run ruff linter
+        if: always()
+        run: ruff check .
+
+      - name: Run mypy
+        if: always()
+        run: mypy --explicit-package-bases --ignore-missing-imports .
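The three checks this new workflow runs can be reproduced locally before pushing. A minimal sketch, assuming `ruff` and `mypy` are installed from PyPI (in CI, ruff comes from `astral-sh/ruff-action` instead):

```bash
# Local equivalent of the lint.yaml setup steps
pip install ruff mypy types-requests types-PyYAML

# Mirror the three checks the workflow runs
ruff format --check .    # formatting, check-only
ruff check .             # lint rules
mypy --explicit-package-bases --ignore-missing-imports .   # static types
```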
{judgeval-0.0.44 → judgeval-0.0.46}/.github/workflows/release.yaml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - trigger_release

 jobs:
   release:
@@ -38,7 +39,8 @@ jobs:
           IFS='.' read -r major minor patch <<< "$latest_version"

           # Bump patch version
-          patch=$((patch + 1))
+          # patch=$((patch + 1))
+          patch=46
           new_version="$major.$minor.$patch"

           echo "New version: $new_version"
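Note that the bump step no longer increments the previous patch number; it pins it to 46, forcing the released version to 0.0.46 regardless of what was published before. A sketch of the changed shell logic (the `latest_version` value is illustrative, not taken from the diff):

```bash
latest_version="0.0.44"                  # illustrative input
IFS='.' read -r major minor patch <<< "$latest_version"

# patch=$((patch + 1))                   # old behavior: would yield 0.0.45
patch=46                                 # new behavior: hardcoded
new_version="$major.$minor.$patch"
echo "New version: $new_version"         # -> New version: 0.0.46
```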
judgeval-0.0.46/.pre-commit-config.yaml
@@ -0,0 +1,21 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.7.14
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.0
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.16.1
+    hooks:
+      - id: mypy
+        args: [--explicit-package-bases, --ignore-missing-imports]
+        additional_dependencies: [types-requests, types-PyYAML]
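These hooks run uv-lock, ruff, ruff-format, and mypy on each commit. For reference, a typical local setup, assuming the `pre-commit` tool is installed; the commands below are standard pre-commit usage, not taken from this diff:

```bash
pip install pre-commit
pre-commit install          # register the git hook for this clone
pre-commit run --all-files  # run every configured hook across the repo once
```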
{judgeval-0.0.44 → judgeval-0.0.46}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.44
+Version: 0.0.46
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -34,57 +34,60 @@ Description-Content-Type: text/markdown

 <br>
 <div style="font-size: 1.5em;">
-…
+Enable self-learning agents with traces, evals, and environment data.
 </div>

-## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started)

-[…
+[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)

-We're hiring! Join us in our mission to …
+We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for continuous improvement.

 [](https://x.com/JudgmentLabs)
 [](https://www.linkedin.com/company/judgmentlabs)
-[](https://discord.gg/…
+[](https://discord.gg/tGVFf8UBUY)

-<img src="assets/…
+<img src="assets/product_shot.png" alt="Judgment Platform" width="800" />

 </div>

-…
-…
-…
-…
-…
-…
-…
+Judgeval offers **open-source tooling** for tracing, evaluating, and monitoring LLM agents. **Provides comprehensive data from agent-environment interactions** for continuous learning and self-improvement—**enabling the future of autonomous agents**.
+
+## 🎬 See Judgeval in Action
+
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+
+<table style="width: 100%; max-width: 800px; table-layout: fixed;">
+  <tr>
+    <td align="center" style="padding: 8px; width: 50%;">
+      <img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
+      <br><strong>🤖 Agents Running</strong>
+    </td>
+    <td align="center" style="padding: 8px; width: 50%;">
+      <img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
+      <br><strong>📊 Real-time Tracing</strong>
+    </td>
+  </tr>
+  <tr>
+    <td align="center" style="padding: 8px; width: 50%;">
+      <img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
+      <br><strong>✅ Agents Completed Running</strong>
+    </td>
+    <td align="center" style="padding: 8px; width: 50%;">
+      <img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
+      <br><strong>📤 Exporting Agent Environment Data</strong>
+    </td>
+  </tr>
+
+</table>

 ## 📋 Table of Contents
-- [✨ Features](#-features)
 - [🛠️ Installation](#️-installation)
 - [🏁 Quickstarts](#-quickstarts)
-…
-- [📝 Offline Evaluations](#-offline-evaluations)
-- [📡 Online Evaluations](#-online-evaluations)
+- [✨ Features](#-features)
 - [🏢 Self-Hosting](#-self-hosting)
-- [Key Features](#key-features)
-- [Getting Started](#getting-started)
 - [📚 Cookbooks](#-cookbooks)
 - [💻 Development with Cursor](#-development-with-cursor)
-- [⭐ Star Us on GitHub](#-star-us-on-github)
-- [❤️ Contributors](#️-contributors)
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
-
-## ✨ Features
-
-| | |
-|:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🛠️ Installation

@@ -94,7 +97,7 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```

-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment …
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).

 ```bash
 export JUDGMENT_API_KEY=...
@@ -107,106 +110,50 @@ export JUDGMENT_ORG_ID=...

 ### 🛰️ Tracing

-Create a file named `…
+Create a file named `agent.py` with the following code:

 ```python
-from judgeval.…
+from judgeval.tracer import Tracer, wrap
 from openai import OpenAI

-client = wrap(OpenAI())
+client = wrap(OpenAI())  # tracks all LLM calls
 judgment = Tracer(project_name="my_project")

 @judgment.observe(span_type="tool")
-def …
-…
+def format_question(question: str) -> str:
+    # dummy tool
+    return f"Question : {question}"

 @judgment.observe(span_type="function")
-def …
-…
-…
+def run_agent(prompt: str) -> str:
+    task = format_question(prompt)
+    response = client.chat.completions.create(
         model="gpt-4.1",
-        messages=[{"role": "user", "content": …
+        messages=[{"role": "user", "content": task}]
     )
-    return …
-…
-…
+    return response.choices[0].message.content
+
+run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:

 <p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>


-[Click here](https://docs.judgmentlabs.ai/…
+[Click here](https://docs.judgmentlabs.ai/tracing/introduction) for a more detailed explanation.

-### 📝 Offline Evaluations
-
-Create a file named `evaluate.py` with the following code:
-
-```python evaluate.py
-from judgeval import JudgmentClient
-from judgeval.data import Example
-from judgeval.scorers import FaithfulnessScorer
-
-client = JudgmentClient()
-
-example = Example(
-    input="What if these shoes don't fit?",
-    actual_output="We offer a 30-day full refund at no extra cost.",
-    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-)
-
-scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
-results = client.run_evaluation(
-    examples=[example],
-    scorers=[scorer],
-    model="gpt-4.1",
-)
-print(results)
-```
-
-[Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-experiment) for a more detailed explanation.
-
-### 📡 Online Evaluations
-
-Attach performance monitoring on traces to measure the quality of your systems in production.

-
-
-```python
-from judgeval.common.tracer import Tracer, wrap
-from judgeval.scorers import AnswerRelevancyScorer
-from openai import OpenAI
-
-client = wrap(OpenAI())
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def my_tool():
-    return "Hello world!"
-
-@judgment.observe(span_type="function")
-def main():
-    task_input = my_tool()
-    res = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": f"{task_input}"}]
-    ).choices[0].message.content
-
-    judgment.async_evaluate(
-        scorers=[AnswerRelevancyScorer(threshold=0.5)],
-        input=task_input,
-        actual_output=res,
-        model="gpt-4.1"
-    )
-    print("Online evaluation submitted.")
-    return res
+<!-- Created by https://github.com/ekalinin/github-markdown-toc -->

-main()
-```

-
+## ✨ Features

-
+| | |
+|:---|:---:|
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time. Export data per individual trace for detailed analysis.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export comprehensive agent-environment interaction data or import external testcases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🏢 Self-Hosting

@@ -224,14 +171,9 @@ Run Judgment on your own infrastructure: we provide comprehensive self-hosting c…

 ## 📚 Cookbooks

-Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/…
+Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).

-You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).
-
-### Sample Agents
-
-#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
-A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
+You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).

 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
@@ -1243,10 +1185,10 @@ Judgeval is created and maintained by @Judgment Labs.

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br…
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets…
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 | <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |

 ## 🛠️ Installation
@@ -1271,26 +1213,27 @@ Track your agent execution with full observability with just a few lines of code
 Create a file named `traces.py` with the following code:

 ```python
-from judgeval.…
+from judgeval.tracer import Tracer, wrap
 from openai import OpenAI

-client = wrap(OpenAI())
+client = wrap(OpenAI())  # tracks all LLM calls
 judgment = Tracer(project_name="my_project")

 @judgment.observe(span_type="tool")
-def …
-…
+def format_question(question: str) -> str:
+    # dummy tool
+    return f"Question : {question}"

 @judgment.observe(span_type="function")
-def …
-…
-…
+def run_agent(prompt: str) -> str:
+    task = format_question(prompt)
+    response = client.chat.completions.create(
         model="gpt-4.1",
-        messages=[{"role": "user", "content": …
+        messages=[{"role": "user", "content": task}]
     )
-    return …
+    return response.choices[0].message.content

-…
+run_agent("What is the capital of the United States?")
 ```

 @Click here for a more detailed explanation.
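To run the `traces.py` quickstart above end to end, the process needs credentials in the environment. A sketch, assuming the wrapped `OpenAI()` client reads `OPENAI_API_KEY` (that variable is an assumption; the diff only shows the two Judgment variables):

```bash
export JUDGMENT_API_KEY=...   # Judgment Platform API key
export JUDGMENT_ORG_ID=...    # Judgment organization ID
export OPENAI_API_KEY=...     # assumed: required by the OpenAI client
python traces.py              # trace appears under project "my_project"
```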
@@ -1418,13 +1361,11 @@ There are many ways to contribute to Judgeval:
 @…

 ````
-
 </details>

 ## ⭐ Star Us on GitHub

-If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the …
-…
+If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.

 ## ❤️ Contributors

@@ -1437,3 +1378,6 @@ There are many ways to contribute to Judgeval:
 <!-- Contributors collage -->
 [](https://github.com/JudgmentLabs/judgeval/graphs/contributors)

+---
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
|