judgeval 0.0.40__tar.gz → 0.0.42__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval-0.0.42/.github/workflows/blocked-pr.yaml +19 -0
- judgeval-0.0.42/.github/workflows/ci.yaml +163 -0
- judgeval-0.0.40/.github/workflows/merge-to-main.yaml → judgeval-0.0.42/.github/workflows/merge-branch-check.yaml +2 -7
- judgeval-0.0.42/.github/workflows/validate-branch.yaml +9 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/PKG-INFO +48 -50
- {judgeval-0.0.40 → judgeval-0.0.42}/README.md +46 -49
- judgeval-0.0.42/assets/trace_demo.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/pyproject.toml +15 -2
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/s3_storage.py +3 -1
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/tracer.py +1079 -139
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/utils.py +6 -2
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/constants.py +5 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/datasets/dataset.py +12 -6
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/datasets/eval_dataset_client.py +3 -1
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/trace.py +7 -2
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/integrations/langgraph.py +218 -34
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judgment_client.py +9 -1
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/rules.py +60 -50
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/run_evaluation.py +53 -29
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorer.py +4 -1
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/prompt_scorer.py +3 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/utils/alerts.py +8 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/update_version.py +1 -1
- judgeval-0.0.42/uv.lock +4032 -0
- judgeval-0.0.40/.github/workflows/ci-staging.yaml +0 -103
- judgeval-0.0.40/.github/workflows/ci.yaml +0 -103
- judgeval-0.0.40/Pipfile +0 -33
- judgeval-0.0.40/Pipfile.lock +0 -4329
- {judgeval-0.0.40 → judgeval-0.0.42}/.github/pull_request_template.md +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/.github/workflows/release.yaml +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/.gitignore +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/LICENSE.md +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/experiments_page.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/logo-dark.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/logo-light.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/new_darkmode.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/new_lightmode.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/assets/trace_screenshot.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/README.md +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/alerts/notifications.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/alerts/platform_notifications.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/alerts/rules.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/api_reference/trace.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/changelog/2025-04-21.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/clustering/clustering.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/compliance/certifications.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/development.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/data_sequences.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/experiment_comparisons.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/comparison.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/summarization.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/favicon.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/annotation_queue_ui.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/cluster.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/cluster_button.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/dashboard_annotation_queue_button.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiment-comparison-page-2.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiment-page-comparison.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiment-popout-comparison.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiments-page-comparison-2.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiments-page-comparison.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/export-dataset.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/notifications_page.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/reports_modal.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/synth_data_button.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/synth_data_window.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/integration/langgraph.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/introduction.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/judgment_cli/installation.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/judgment_cli/self-hosting.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/judgment_cli/supabase-org-id.png +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/logo/light.svg +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/mint.json +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/annotations.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/tracing_s3.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/optimization/osiris_agent.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/self_hosting/get_started.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/docs/synthetic_data/synthetic_data.mdx +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/pytest.ini +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/.coveragerc +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/custom_example.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/trace_run.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/utils/data_utils.py +0 -0
- {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/version_check.py +0 -0

judgeval-0.0.42/.github/workflows/blocked-pr.yaml (new file)
@@ -0,0 +1,19 @@
+name: Check Blocked PR
+
+on:
+  pull_request:
+    types:
+      - opened
+      - labeled
+      - unlabeled
+      - synchronize
+
+jobs:
+  fail-for-blocked:
+    if: contains(github.event.pull_request.labels.*.name, 'Blocked')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fail if PR is blocked
+        run: |
+          echo "This PR is currently blocked. Please unblock it before merging."
+          exit 1
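
The gate here is the `if:` expression: the `fail-for-blocked` job only runs when the PR carries a `Blocked` label, and then fails unconditionally, so a required status check keeps the PR unmergeable. As a rough illustration, the same check could be run outside Actions against a saved `pull_request` webhook payload (the payload file name below is hypothetical):

```python
import json
import sys

# Load a saved pull_request webhook event (file name is hypothetical).
with open("event.json") as f:
    event = json.load(f)

# Mirrors contains(github.event.pull_request.labels.*.name, 'Blocked').
label_names = [label["name"] for label in event["pull_request"]["labels"]]

if "Blocked" in label_names:
    print("This PR is currently blocked. Please unblock it before merging.")
    sys.exit(1)  # same effect as the workflow's `exit 1`
```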

judgeval-0.0.42/.github/workflows/ci.yaml (new file)
@@ -0,0 +1,163 @@
+name: CI
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+permissions: read-all
+
+jobs:
+  validate-branch:
+    uses: ./.github/workflows/merge-branch-check.yaml
+
+  run-tests:
+    needs: [validate-branch]
+    if: needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped'
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        python-version:
+          - "3.11"
+    name: Unit Tests
+    runs-on: ${{ matrix.os }}
+    env:
+      PYTHONPATH: "."
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      JUDGMENT_DEV: true
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run tests
+        run: |
+          cd src
+          uv run pytest tests
+
+  run-e2e-tests-staging:
+    needs: [validate-branch]
+    if: "github.base_ref == 'staging' && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
+    name: Staging E2E Tests
+    runs-on: ubuntu-latest
+    env:
+      TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
+    steps:
+      - name: Wait for turn
+        uses: softprops/turnstyle@v2
+        with:
+          poll-interval-seconds: 10
+          same-branch-only: false
+          job-to-wait-for: "Staging E2E Tests"
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install judgeval dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Check if server is running
+        run: |
+          if ! curl -s https://staging.api.judgmentlabs.ai/health > /dev/null; then
+            echo "Staging Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
+            exit 1
+          else
+            echo "Staging server is running."
+          fi
+
+      - name: Run E2E tests
+        working-directory: src
+        run: |
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-stg-judgeval/api-keys/judgeval --query SecretString --output text)
+          export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+
+      - name: Upload coverage HTML report (staging)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-html-staging
+          path: src/htmlcov
+
+  run-e2e-tests-main:
+    needs: [validate-branch]
+    if: "github.base_ref == 'main' && !contains(github.actor, '[bot]') && needs.validate-branch.result == 'success'"
+    name: Production E2E Tests
+    runs-on: ubuntu-latest
+    env:
+      TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
+    steps:
+      - name: Wait for turn
+        uses: softprops/turnstyle@v2
+        with:
+          poll-interval-seconds: 10
+          same-branch-only: false
+          job-to-wait-for: "Production E2E Tests"
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install judgeval dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Check if server is running
+        run: |
+          if ! curl -s https://api.judgmentlabs.ai/health > /dev/null; then
+            echo "Production Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
+            exit 1
+          else
+            echo "Production server is running."
+          fi
+
+      - name: Run E2E tests
+        working-directory: src
+        run: |
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-judgeval/api-keys/judgeval --query SecretString --output text)
+          export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+
+      - name: Upload coverage HTML report (production)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-html-production
+          path: src/htmlcov
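
In both E2E jobs, the `Run E2E tests` step reads a JSON secret from AWS Secrets Manager and exports every key/value pair as an environment variable before invoking pytest. A sketch of the same step in Python using `boto3` (already a judgeval dependency, per the pyproject diff below); the secret ID shown is the staging one from the workflow:

```python
import json
import os

import boto3

# Equivalent of: aws secretsmanager get-secret-value --secret-id ...
#   --query SecretString --output text
secrets = boto3.client("secretsmanager", region_name="us-west-1")
secret_string = secrets.get_secret_value(
    SecretId="gh-actions-stg-judgeval/api-keys/judgeval"
)["SecretString"]

# Equivalent of the jq 'to_entries' pipeline plus export: each field of
# the JSON secret becomes an environment variable for the test process.
os.environ.update(json.loads(secret_string))
```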

judgeval-0.0.40/.github/workflows/merge-to-main.yaml → judgeval-0.0.42/.github/workflows/merge-branch-check.yaml
@@ -1,8 +1,6 @@
-name:
+name: Branch Protection Check
 
-on:
-  pull_request:
-    types: [opened, synchronize, reopened, edited]
+on: workflow_call
 
 jobs:
   validate-branch:
@@ -10,20 +8,17 @@ jobs:
     steps:
       - name: Check branch name
        run: |
-          # Get the base and source branch names
          BASE_BRANCH="${{ github.base_ref }}"
          SOURCE_BRANCH="${{ github.head_ref }}"
 
          echo "BASE_BRANCH: $BASE_BRANCH"
          echo "SOURCE_BRANCH: $SOURCE_BRANCH"
 
-          # Only run validation if the base branch is main
          if [[ "$BASE_BRANCH" != "main" ]]; then
            echo "Skipping branch validation - not targeting main branch"
            exit 0
          fi
 
-          # Check if the source branch is staging or starts with hotfix/
          if [[ "$SOURCE_BRANCH" != "staging" && ! "$SOURCE_BRANCH" =~ ^hotfix/ ]]; then
            echo "::error::Pull requests to main can only be created from 'staging' or 'hotfix/*' branches. Current branch: $SOURCE_BRANCH"
            exit 1
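
The renamed workflow is now a reusable `workflow_call` check with one rule: PRs targeting `main` may only come from `staging` or a `hotfix/*` branch, and any other base branch skips validation. A standalone sketch of that policy (the function name and test values are illustrative):

```python
import re

def branch_allowed(base_branch: str, source_branch: str) -> bool:
    """Mirror the shell check above: only 'staging' or 'hotfix/*'
    branches may open PRs into main; other bases are unrestricted."""
    if base_branch != "main":
        return True  # validation only applies to PRs targeting main
    return source_branch == "staging" or bool(re.match(r"^hotfix/", source_branch))

assert branch_allowed("staging", "feature/new-tracer")   # not targeting main
assert branch_allowed("main", "hotfix/fix-s3-storage")   # hotfix is allowed
assert not branch_allowed("main", "feature/new-tracer")  # rejected
```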

{judgeval-0.0.40 → judgeval-0.0.42}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.40
+Version: 0.0.42
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm==1.61.15
+Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: pandas
@@ -31,44 +32,47 @@ Description-Content-Type: text/markdown
 <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
 <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
 
-
+<br>
+<div style="font-size: 1.5em;">
+Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+</div>
 
-
+## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
 
-
+[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
-
+We're hiring! Join us in our mission to unleash optimized agents.
 
 [](https://x.com/JudgmentLabs)
 [](https://www.linkedin.com/company/judgmentlabs)
-[](https://discord.gg/
+[](https://discord.gg/ZCnSXYug)
 
-
+<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
 
-
+</div>
 
-Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
 
-Judgeval
+Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
 
-
+Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
 
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 ## 📋 Table of Contents
-
-
-
-
-
-
-
-
-
-
-
-
-
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+- [🛰️ Tracing](#️-tracing)
+- [📝 Offline Evaluations](#-offline-evaluations)
+- [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+- [Key Features](#key-features)
+- [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)
 
 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
 
@@ -77,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>
-| <h3>📡 Monitoring</h3>
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets
-| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🛠️ Installation
 
@@ -91,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```
 
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
 
-
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
 
-
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-
+## 🏁 Quickstarts
 
 ### 🛰️ Tracing
 
-Track your agent execution with full observability with just a few lines of code.
 Create a file named `traces.py` with the following code:
 
 ```python
@@ -126,12 +131,15 @@ def main():
 
 main()
 ```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+
 
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
 
 ### 📝 Offline Evaluations
 
-You can evaluate your agent's execution to measure quality metrics such as hallucination.
 Create a file named `evaluate.py` with the following code:
 
 ```python evaluate.py
@@ -147,7 +155,7 @@ example = Example(
     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
 )
 
-scorer = FaithfulnessScorer(threshold=0.5)
+scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
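
The hunk above touches only one line of the quickstart's `evaluate.py`, so most of the script appears as truncated context. For orientation, a plausible completed version is sketched below; the `input`, `actual_output`, and `model` values are assumptions, since only the `retrieval_context`, the scorer, and the `run_evaluation` fragments are visible in the diff:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()

example = Example(
    input="What if these shoes don't fit?",  # assumed
    actual_output="You get a full refund within 30 days.",  # assumed
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
results = client.run_evaluation(
    examples=[example],
    scorers=[scorer],
    model="gpt-4.1",  # assumed; the model argument is elided in the diff
)
print(results)
```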
@@ -196,6 +204,8 @@ def main():
 main()
 ```
 
+You should see an evaluation attached to your trace on the Judgment Platform.
+
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
 
 ## 🏢 Self-Hosting
@@ -220,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg
 
 ### Sample Agents
 
-####
-A
-
-#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
-A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
-### Custom Evaluators
-
-#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
-Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
-#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
-Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
 
 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
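
The tracing quickstart earlier in this README diff references a `traces.py` script whose body is elided from the hunks. Given the modules this package ships (`src/judgeval/common/tracer.py`) and the trailing `main()` visible in the context lines, a minimal script in that style might look as follows; the project name, span types, and function bodies are assumptions:

```python
from judgeval.common.tracer import Tracer, wrap
from openai import OpenAI

judgment = Tracer(project_name="my_project")  # assumed project name
client = wrap(OpenAI())  # wrap() records each LLM call as a trace span

@judgment.observe(span_type="tool")
def my_tool():
    return "What is the capital of France?"

@judgment.observe(span_type="function")
def main():
    res = client.chat.completions.create(
        model="gpt-4.1",  # assumed model
        messages=[{"role": "user", "content": my_tool()}],
    )
    return res.choices[0].message.content

main()  # matches the main() context line shown in the hunks above
```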

{judgeval-0.0.40 → judgeval-0.0.42}/README.md
@@ -3,44 +3,47 @@
 <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
 <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
 
-
+<br>
+<div style="font-size: 1.5em;">
+Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+</div>
 
-
+## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
 
-
+[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
-
+We're hiring! Join us in our mission to unleash optimized agents.
 
 [](https://x.com/JudgmentLabs)
 [](https://www.linkedin.com/company/judgmentlabs)
-[](https://discord.gg/
+[](https://discord.gg/ZCnSXYug)
 
-
+<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
 
-
+</div>
 
-Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
 
-Judgeval
+Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
 
-
+Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
 
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 ## 📋 Table of Contents
-
-
-
-
-
-
-
-
-
-
-
-
-
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+- [🛰️ Tracing](#️-tracing)
+- [📝 Offline Evaluations](#-offline-evaluations)
+- [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+- [Key Features](#key-features)
+- [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)
 
 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
 
@@ -49,11 +52,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>
-| <h3>📡 Monitoring</h3>
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets
-| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🛠️ Installation
 
@@ -63,17 +65,19 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```
 
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
 
-
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
 
-
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-
+## 🏁 Quickstarts
 
 ### 🛰️ Tracing
 
-Track your agent execution with full observability with just a few lines of code.
 Create a file named `traces.py` with the following code:
 
 ```python
@@ -98,12 +102,15 @@ def main():
 
 main()
 ```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+
 
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
 
 ### 📝 Offline Evaluations
 
-You can evaluate your agent's execution to measure quality metrics such as hallucination.
 Create a file named `evaluate.py` with the following code:
 
 ```python evaluate.py
@@ -119,7 +126,7 @@ example = Example(
     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
 )
 
-scorer = FaithfulnessScorer(threshold=0.5)
+scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
@@ -168,6 +175,8 @@ def main():
 main()
 ```
 
+You should see an evaluation attached to your trace on the Judgment Platform.
+
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
 
 ## 🏢 Self-Hosting
@@ -192,20 +201,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg
 
 ### Sample Agents
 
-####
-A
-
-#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
-A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
-### Custom Evaluators
-
-#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
-Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
-#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
-Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
 
 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.

judgeval-0.0.42/assets/trace_demo.png (new binary file)

{judgeval-0.0.40 → judgeval-0.0.42}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.40"
+version = "0.0.42"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -29,7 +29,8 @@ dependencies = [
     "langchain-anthropic",
     "langchain-core",
     "google-genai",
-    "boto3"
+    "boto3",
+    "matplotlib>=3.10.3",
 ]
 
 [project.urls]
@@ -47,6 +48,18 @@ include = [
     "/src/judgeval/**/*.py",
 ]
 
+[dependency-groups]
+dev = [
+    "chromadb>=1.0.12",
+    "langchain-community>=0.3.24",
+    "pytest>=8.4.0",
+    "pytest-asyncio>=1.0.0",
+    "pytest-cov>=6.1.1",
+    "pytest-mock>=3.14.1",
+    "tavily-python>=0.7.5",
+    "langgraph>=0.4.3",
+]
+
 [tool.hatch.build]
 directory = "dist"
 artifacts = [