judgeval 0.2.0__tar.gz → 0.3.1__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/lint.yaml +0 -13
- judgeval-0.3.1/.github/workflows/mypy.yaml +25 -0
- judgeval-0.3.1/.github/workflows/pre-commit-autoupdate.yaml +38 -0
- judgeval-0.3.1/.pre-commit-config.yaml +23 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/PKG-INFO +10 -6
- {judgeval-0.2.0 → judgeval-0.3.1}/README.md +5 -5
- judgeval-0.3.1/assets/agent_trace_example.png +0 -0
- judgeval-0.3.1/assets/errors.png +0 -0
- judgeval-0.3.1/assets/online_eval.png +0 -0
- judgeval-0.3.1/assets/product_shot.png +0 -0
- judgeval-0.3.1/assets/test.png +0 -0
- judgeval-0.3.1/assets/tests.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/pyproject.toml +29 -1
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/api.py +38 -7
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/constants.py +9 -1
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/storage/s3_storage.py +2 -3
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/core.py +66 -32
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/otel_span_processor.py +4 -50
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/span_transformer.py +16 -10
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/utils.py +46 -38
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/constants.py +2 -0
- judgeval-0.3.1/src/judgeval/data/example.py +33 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/judgment_types.py +23 -45
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/result.py +8 -14
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/scripts/openapi_transform.py +5 -5
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/trace.py +3 -4
- judgeval-0.3.1/src/judgeval/dataset.py +192 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/evaluation_run.py +1 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/litellm_judge.py +2 -2
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/mixture_of_judges.py +6 -6
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/together_judge.py +6 -3
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judgment_client.py +9 -71
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/run_evaluation.py +41 -9
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/score.py +11 -7
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/utils.py +3 -3
- judgeval-0.3.1/src/judgeval/utils/file_utils.py +66 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/update_types.sh +1 -1
- {judgeval-0.2.0 → judgeval-0.3.1}/uv.lock +449 -0
- judgeval-0.2.0/.pre-commit-config.yaml +0 -21
- judgeval-0.2.0/assets/product_shot.png +0 -0
- judgeval-0.2.0/src/judgeval/data/datasets/__init__.py +0 -4
- judgeval-0.2.0/src/judgeval/data/datasets/dataset.py +0 -341
- judgeval-0.2.0/src/judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval-0.2.0/src/judgeval/data/example.py +0 -61
- judgeval-0.2.0/src/judgeval/utils/file_utils.py +0 -51
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/pull_request_template.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/release.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/.gitignore +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/LICENSE.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/agent.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/data.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/document.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/experiments_page.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/logo-dark.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/logo-light.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/new_darkmode.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/new_lightmode.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace_demo.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/pytest.ini +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/.coveragerc +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/clients.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/storage/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/constants.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/otel_exporter.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/span_processor.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/trace_manager.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/trace_run.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/integrations/langgraph.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/rules.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/utils/requests.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/version_check.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.1}/update_version.py +0 -0
{judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/lint.yaml
@@ -10,20 +10,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
       - name: Install ruff
         uses: astral-sh/ruff-action@v3
         with:
          args: "--version"
 
-      - name: Install mypy and dependencies
-        run: |
-          pip install mypy types-requests types-PyYAML
-
       - name: Run ruff formatter
         if: always()
         run: ruff format --check .
@@ -31,7 +22,3 @@ jobs:
       - name: Run ruff linter
         if: always()
         run: ruff check .
-
-      - name: Run mypy
-        if: always()
-        run: mypy --explicit-package-bases --ignore-missing-imports .

judgeval-0.3.1/.github/workflows/mypy.yaml (new file)
@@ -0,0 +1,25 @@
+name: MyPy Check
+
+on:
+  pull_request:
+    branches: [ main, staging ]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run mypy
+        if: always()
+        run: uv run mypy ./src/judgeval/

judgeval-0.3.1/.github/workflows/pre-commit-autoupdate.yaml (new file)
@@ -0,0 +1,38 @@
+name: Pre-commit auto-update
+on:
+  schedule:
+    - cron: '0 0 * * 1' # Weekly on Monday at midnight UTC
+  workflow_dispatch:
+
+jobs:
+  auto-update:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: staging
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install and update pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit autoupdate
+
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          commit-message: 'chore: update pre-commit hooks'
+          title: 'chore: update pre-commit hooks'
+          body: |
+            Auto-generated PR to update pre-commit hook versions.
+
+            Please review the changes and merge if everything looks good.
+
+            Updated by GitHub Actions on {{ date }}.
+          branch: update-pre-commit-hooks
+          base: staging

judgeval-0.3.1/.pre-commit-config.yaml (new file)
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.8.0
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.4
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  # - repo: https://github.com/pre-commit/mirrors-mypy
+  #   rev: v1.17.0
+  #   hooks:
+  #     - id: mypy
+  #       language: system
+  #       # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
+  #       verbose: true
+  #       entry: bash -c 'mypy src/judgeval/ || true'

{judgeval-0.2.0 → judgeval-0.3.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.2.0
+Version: 0.3.1
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,6 +14,7 @@ Requires-Dist: anthropic
 Requires-Dist: boto3
 Requires-Dist: datamodel-code-generator>=0.31.1
 Requires-Dist: google-genai
+Requires-Dist: groq>=0.30.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -22,6 +23,9 @@ Requires-Dist: litellm>=1.61.15
 Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: opentelemetry-api>=1.34.1
+Requires-Dist: opentelemetry-sdk>=1.34.1
+Requires-Dist: orjson>=3.9.0
 Requires-Dist: pandas
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: python-slugify>=8.0.4
@@ -39,7 +43,7 @@ Description-Content-Type: text/markdown
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -139,7 +143,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -152,9 +156,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting

{judgeval-0.2.0 → judgeval-0.3.1}/README.md
@@ -8,7 +8,7 @@
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -108,7 +108,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -121,9 +121,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting

Binary files (6 new image assets): no textual diff shown.

{judgeval-0.2.0 → judgeval-0.3.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.2.0"
+version = "0.3.1"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -33,6 +33,10 @@ dependencies = [
     "matplotlib>=3.10.3",
     "python-slugify>=8.0.4",
     "datamodel-code-generator>=0.31.1",
+    "groq>=0.30.0",
+    "opentelemetry-api>=1.34.1",
+    "opentelemetry-sdk>=1.34.1",
+    "orjson>=3.9.0",
 ]
 
 [project.urls]
@@ -62,6 +66,30 @@ dev = [
     "langgraph>=0.4.3",
     "pre-commit>=4.2.0",
     "types-requests>=2.32.4.20250611",
+    "mypy>=1.17.0",
+    "types-pyyaml>=6.0.12.20250516",
+    "pandas-stubs>=2.3.0.250703",
+    "lxml-stubs>=0.5.1",
+    "types-pygments>=2.19.0.20250715",
+    "types-beautifulsoup4>=4.12.0.20250516",
+    "types-cachetools>=6.1.0.20250717",
+    "types-cffi>=1.17.0.20250523",
+    "types-defusedxml>=0.7.0.20250708",
+    "types-greenlet>=3.2.0.20250417",
+    "types-jsonschema>=4.24.0.20250708",
+    "types-objgraph>=3.6.0.20240907",
+    "types-pexpect>=4.9.0.20250516",
+    "types-protobuf>=6.30.2.20250703",
+    "types-psutil>=7.0.0.20250601",
+    "types-pyopenssl>=24.1.0.20240722",
+    "types-pyasn1>=0.6.0.20250516",
+    "types-regex>=2024.11.6.20250403",
+    "types-reportlab>=4.4.1.20250602",
+    "types-simplejson>=3.20.0.20250326",
+    "types-tensorflow>=2.18.0.20250516",
+    "types-tqdm>=4.67.0.20250516",
+    "types-tree-sitter-languages>=1.10.0.20250530",
+    "types-xmltodict>=0.14.0.20241009",
 ]
 
 [tool.hatch.build]

{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/api.py
@@ -1,4 +1,4 @@
-from typing import Literal, List, Dict, Any
+from typing import Literal, List, Dict, Any, Union
 from requests import exceptions
 from judgeval.common.api.constants import (
     JUDGMENT_TRACES_FETCH_API_URL,
@@ -25,6 +25,8 @@ from judgeval.common.api.constants import (
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
+    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -48,9 +50,12 @@ from judgeval.common.api.constants import (
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
+    CheckExampleKeysPayload,
 )
 from judgeval.utils.requests import requests
 
+import orjson
+
 
 class JudgmentAPIException(exceptions.HTTPError):
     """
@@ -65,7 +70,7 @@ class JudgmentAPIException(exceptions.HTTPError):
         self.request = request
 
     @property
-    def status_code(self) -> int
+    def status_code(self) -> Union[int, None]:
         """Get the HTTP status code from the response."""
         return self.response.status_code if self.response else None
 
@@ -114,8 +119,15 @@ class JudgmentApiClient:
         try:
             r.raise_for_status()
         except exceptions.HTTPError as e:
+            try:
+                detail = r.json().get("detail", "")
+            except Exception:
+                detail = r.text
+
             raise JudgmentAPIException(
-                f"HTTP {r.status_code}: {r.reason}
+                f"HTTP {r.status_code}: {r.reason}, {detail}",
+                response=r,
+                request=e.request,
             )
 
         return r.json()
@@ -218,6 +230,14 @@ class JudgmentApiClient:
         }
         return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
 
+    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
+        payload: CheckExampleKeysPayload = {
+            "keys": keys,
+            "eval_name": eval_name,
+            "project_name": project_name,
+        }
+        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
+
     def save_scorer(self, name: str, prompt: str, options: dict):
         payload: ScorerSavePayload = {
             "name": name,
@@ -279,7 +299,7 @@ class JudgmentApiClient:
         project_name: str,
         examples: List[Dict[str, Any]],
         traces: List[Dict[str, Any]],
-        overwrite: bool,
+        overwrite: bool = False,
     ):
         payload: DatasetPushPayload = {
             "dataset_alias": dataset_alias,
@@ -302,6 +322,18 @@ class JudgmentApiClient:
             "POST", JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL, payload
         )
 
+    def append_traces(
+        self, dataset_alias: str, project_name: str, traces: List[Dict[str, Any]]
+    ):
+        payload: DatasetAppendPayload = {
+            "dataset_alias": dataset_alias,
+            "project_name": project_name,
+            "traces": traces,
+        }
+        return self._do_request(
+            "POST", JUDGMENT_DATASETS_APPEND_TRACES_API_URL, payload
+        )
+
     def pull_dataset(self, dataset_alias: str, project_name: str):
         payload: DatasetPullPayload = {
             "dataset_alias": dataset_alias,
@@ -347,6 +379,5 @@ class JudgmentApiClient:
         except Exception as e:
             return f"<Unserializable object of type {type(obj).__name__}: {e}>"
 
-
-
-        return json.dumps(data, default=fallback_encoder)
+        # orjson returns bytes, so we need to decode to str
+        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")

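The `_do_request` change above attaches the server's `detail` message to raised errors instead of reporting only the HTTP status line. A minimal sketch of the same pattern with plain `requests` (the helper name and endpoint are illustrative, not part of judgeval):

```python
import requests


def post_with_detail(url: str, payload: dict) -> dict:
    """Illustrative helper: include the server's `detail` field in raised errors."""
    r = requests.post(url, json=payload)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Prefer the structured "detail" field when the error body is JSON;
        # fall back to the raw response text otherwise.
        try:
            detail = r.json().get("detail", "")
        except Exception:
            detail = r.text
        raise requests.exceptions.HTTPError(
            f"HTTP {r.status_code}: {r.reason}, {detail}",
            response=r,
            request=e.request,
        ) from e
    return r.json()
```
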
{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/constants.py
@@ -51,6 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
 JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
+JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
 
 
 # Evaluation API Payloads
@@ -90,9 +91,16 @@ class EvalRunNameExistsPayload(TypedDict):
     judgment_api_key: str
 
 
+class CheckExampleKeysPayload(TypedDict):
+    keys: List[str]
+    eval_name: str
+    project_name: str
+
+
 # Datasets API
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -134,7 +142,7 @@ class DatasetStatsPayload(TypedDict):
 
 
 # Projects API
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval/"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 
 

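The new `CheckExampleKeysPayload` TypedDict backs the `check_example_keys` client method added in api.py. A small, hypothetical illustration of a payload that satisfies it (the field names come from the diff; the values are invented):

```python
from typing import List, TypedDict


class CheckExampleKeysPayload(TypedDict):
    keys: List[str]
    eval_name: str
    project_name: str


# Example payload as it would be POSTed to JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL.
payload: CheckExampleKeysPayload = {
    "keys": ["input", "actual_output", "expected_output"],
    "eval_name": "qa-regression-run",
    "project_name": "my-project",
}
```
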
{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/storage/s3_storage.py
@@ -1,6 +1,6 @@
 import os
-import json
 import boto3
+import orjson
 from typing import Optional
 from datetime import datetime, UTC
 from botocore.exceptions import ClientError
@@ -85,8 +85,7 @@ class S3Storage:
         timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
 
-
-        trace_json = json.dumps(trace_data)
+        trace_json = orjson.dumps(trace_data).decode("utf-8")
 
         self.s3_client.put_object(
             Bucket=self.bucket_name,

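Both this file and api.py replace `json.dumps` with `orjson.dumps`. Unlike the standard library, `orjson.dumps` returns `bytes`, hence the `.decode("utf-8")` wherever a `str` is needed (S3 object bodies, request payloads). A short sketch of the pattern, with a stand-in fallback encoder:

```python
import orjson


def fallback_encoder(obj):
    # Last-resort representation for objects orjson cannot serialize natively.
    return f"<Unserializable object of type {type(obj).__name__}>"


trace_data = {"trace_id": "abc123", "duration": 1.25, "tags": {"env", "prod"}}

# orjson returns bytes; decode when a str is required (e.g. an S3 Body).
trace_json: str = orjson.dumps(trace_data, default=fallback_encoder).decode("utf-8")
print(trace_json)
```
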
{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/core.py
@@ -32,6 +32,7 @@ from typing import (
 )
 import types
 
+
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
@@ -45,6 +46,7 @@ from openai.types.chat import ParsedChatCompletion
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
+from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
@@ -67,6 +69,8 @@ ApiClient: TypeAlias = Union[
     AsyncTogether,
     genai.Client,
     genai.client.AsyncClient,
+    Groq,
+    AsyncGroq,
 ]
 SpanType: TypeAlias = str
 
@@ -79,7 +83,7 @@ class TraceClient:
         tracer: Tracer,
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str
+        project_name: Union[str, None] = None,
         enable_monitoring: bool = True,
         enable_evaluations: bool = True,
         parent_trace_id: Optional[str] = None,
@@ -414,8 +418,6 @@
                 self.start_time or time.time(), timezone.utc
             ).isoformat(),
             "duration": total_duration,
-            "trace_spans": [span.model_dump() for span in self.trace_spans],
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name,
@@ -850,9 +852,9 @@ class Tracer:
 
     def __init__(
         self,
-        api_key: str
-        organization_id: str
-        project_name: str
+        api_key: Union[str, None] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Union[str, None] = os.getenv("JUDGMENT_ORG_ID"),
+        project_name: Union[str, None] = None,
         deep_tracing: bool = False, # Deep tracing is disabled by default
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower()
         == "true",
@@ -905,8 +907,8 @@ class Tracer:
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
-        self.span_id_to_previous_span_id: Dict[str, str
-        self.trace_id_to_previous_trace: Dict[str, TraceClient
+        self.span_id_to_previous_span_id: Dict[str, Union[str, None]] = {}
+        self.trace_id_to_previous_trace: Dict[str, Union[TraceClient, None]] = {}
         self.current_span_id: Optional[str] = None
         self.current_trace: Optional[TraceClient] = None
         self.trace_across_async_contexts: bool = trace_across_async_contexts
@@ -958,7 +960,9 @@ class Tracer:
             self.enable_monitoring = False
             self.enable_evaluations = False
 
-    def set_current_span(
+    def set_current_span(
+        self, span_id: str
+    ) -> Optional[contextvars.Token[Union[str, None]]]:
         self.span_id_to_previous_span_id[span_id] = self.current_span_id
         self.current_span_id = span_id
         Tracer.current_span_id = span_id
@@ -981,7 +985,7 @@ class Tracer:
 
     def reset_current_span(
         self,
-        token: Optional[contextvars.Token[str
+        token: Optional[contextvars.Token[Union[str, None]]] = None,
         span_id: Optional[str] = None,
     ):
         try:
@@ -997,7 +1001,7 @@ class Tracer:
 
     def set_current_trace(
         self, trace: TraceClient
-    ) -> Optional[contextvars.Token[TraceClient
+    ) -> Optional[contextvars.Token[Union[TraceClient, None]]]:
         """
         Set the current trace context in contextvars
         """
@@ -1030,7 +1034,7 @@ class Tracer:
 
     def reset_current_trace(
         self,
-        token: Optional[contextvars.Token[TraceClient
+        token: Optional[contextvars.Token[Union[TraceClient, None]]] = None,
         trace_id: Optional[str] = None,
     ):
         try:
@@ -1046,7 +1050,7 @@ class Tracer:
 
     @contextmanager
     def trace(
-        self, name: str, project_name: str
+        self, name: str, project_name: Union[str, None] = None
    ) -> Generator[TraceClient, None, None]:
        """Start a new trace context using a context manager"""
        trace_id = str(uuid.uuid4())
@@ -1692,25 +1696,31 @@ def wrap(
         return wrapper
 
     if isinstance(client, (OpenAI)):
-        client.chat.completions
-        client.responses
-        client.beta.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
+        setattr(client.responses, "create", wrapped(original_responses_create))
+        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
     elif isinstance(client, (AsyncOpenAI)):
-        client.chat.completions
-        client.responses
-
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
+        setattr(client.responses, "create", wrapped_async(original_responses_create))
+        setattr(
+            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+        )
     elif isinstance(client, (Together)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (AsyncTogether)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
     elif isinstance(client, (Anthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped(original_create))
     elif isinstance(client, (AsyncAnthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped_async(original_create))
     elif isinstance(client, (genai.Client)):
-        client.models
+        setattr(client.models, "generate_content", wrapped(original_create))
     elif isinstance(client, (genai.client.AsyncClient)):
-        client.models
+        setattr(client.models, "generate_content", wrapped_async(original_create))
+    elif isinstance(client, (Groq)):
+        setattr(client.chat.completions, "create", wrapped(original_create))
+    elif isinstance(client, (AsyncGroq)):
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
@@ -1745,6 +1755,8 @@ def _get_client_config(
             None,
             client.beta.chat.completions.parse,
         )
+    elif isinstance(client, (Groq, AsyncGroq)):
+        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Together, AsyncTogether)):
         return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
@@ -1783,9 +1795,17 @@ def _format_output_data(
     if isinstance(client, (OpenAI, AsyncOpenAI)):
         if isinstance(response, ChatCompletion):
             model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens =
-
+            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+            completion_tokens = (
+                response.usage.completion_tokens if response.usage else 0
+            )
+            cache_read_input_tokens = (
+                response.usage.prompt_tokens_details.cached_tokens
+                if response.usage
+                and response.usage.prompt_tokens_details
+                and response.usage.prompt_tokens_details.cached_tokens
+                else 0
+            )
 
             if isinstance(response, ParsedChatCompletion):
                 message_content = response.choices[0].message.parsed
@@ -1793,10 +1813,19 @@ def _format_output_data(
                 message_content = response.choices[0].message.content
         elif isinstance(response, Response):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens =
-
+            prompt_tokens = response.usage.input_tokens if response.usage else 0
+            completion_tokens = response.usage.output_tokens if response.usage else 0
+            cache_read_input_tokens = (
+                response.usage.input_tokens_details.cached_tokens
+                if response.usage and response.usage.input_tokens_details
+                else 0
+            )
+            if hasattr(response.output[0], "content"):
+                message_content = "".join(
+                    seg.text
+                    for seg in response.output[0].content
+                    if hasattr(seg, "text")
+                )
 
     # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
     elif isinstance(client, (Together, AsyncTogether)):
@@ -1821,6 +1850,11 @@ def _format_output_data(
         cache_read_input_tokens = response.usage.cache_read_input_tokens
         cache_creation_input_tokens = response.usage.cache_creation_input_tokens
         message_content = response.content[0].text
+    elif isinstance(client, (Groq, AsyncGroq)):
+        model_name = "groq/" + response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     else:
         judgeval_logger.warning(f"Unsupported client type: {type(client)}")
         return None, None

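The tracer changes add Groq clients to the types `wrap()` can patch and let `Tracer` credentials default to the `JUDGMENT_API_KEY` / `JUDGMENT_ORG_ID` environment variables. A minimal usage sketch, assuming `Tracer` and `wrap` are importable from `judgeval.common.tracer` (where this file lives) and that the Judgment and Groq keys are set; the model name is illustrative:

```python
import os

from groq import Groq
from judgeval.common.tracer import Tracer, wrap

# Credentials fall back to JUDGMENT_API_KEY / JUDGMENT_ORG_ID when omitted.
tracer = Tracer(project_name="groq-demo")

# wrap() patches client.chat.completions.create so calls made inside an
# active trace are recorded as spans.
client = wrap(Groq(api_key=os.environ["GROQ_API_KEY"]))

with tracer.trace("capital-question"):
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",  # illustrative model name
        messages=[{"role": "user", "content": "What is the capital of the United States?"}],
    )

print(response.choices[0].message.content)
```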