judgeval 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/lint.yaml +0 -13
- judgeval-0.3.0/.github/workflows/mypy.yaml +25 -0
- judgeval-0.3.0/.github/workflows/pre-commit-autoupdate.yaml +38 -0
- judgeval-0.3.0/.pre-commit-config.yaml +23 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/PKG-INFO +10 -6
- {judgeval-0.2.0 → judgeval-0.3.0}/README.md +5 -5
- judgeval-0.3.0/assets/agent_trace_example.png +0 -0
- judgeval-0.3.0/assets/errors.png +0 -0
- judgeval-0.3.0/assets/online_eval.png +0 -0
- judgeval-0.3.0/assets/product_shot.png +0 -0
- judgeval-0.3.0/assets/test.png +0 -0
- judgeval-0.3.0/assets/tests.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/pyproject.toml +29 -1
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/api/api.py +38 -7
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/api/constants.py +9 -1
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/storage/s3_storage.py +2 -3
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/core.py +66 -30
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/otel_span_processor.py +4 -50
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/span_transformer.py +16 -10
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/utils.py +46 -38
- judgeval-0.3.0/src/judgeval/data/example.py +33 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/judgment_types.py +23 -44
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/result.py +8 -14
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/scripts/openapi_transform.py +5 -5
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/trace.py +3 -4
- judgeval-0.3.0/src/judgeval/dataset.py +192 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/evaluation_run.py +1 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judges/litellm_judge.py +2 -2
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judges/mixture_of_judges.py +6 -6
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judges/together_judge.py +4 -2
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judgment_client.py +9 -71
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/run_evaluation.py +40 -8
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/score.py +11 -7
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/utils.py +3 -3
- judgeval-0.3.0/src/judgeval/utils/file_utils.py +66 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/update_types.sh +1 -1
- {judgeval-0.2.0 → judgeval-0.3.0}/uv.lock +449 -0
- judgeval-0.2.0/.pre-commit-config.yaml +0 -21
- judgeval-0.2.0/assets/product_shot.png +0 -0
- judgeval-0.2.0/src/judgeval/data/datasets/__init__.py +0 -4
- judgeval-0.2.0/src/judgeval/data/datasets/dataset.py +0 -341
- judgeval-0.2.0/src/judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval-0.2.0/src/judgeval/data/example.py +0 -61
- judgeval-0.2.0/src/judgeval/utils/file_utils.py +0 -51
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/.gitignore +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/LICENSE.md +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/agent.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/data.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/document.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/experiments_page.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/logo-light.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/trace.gif +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/trace_demo.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/pytest.ini +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/.coveragerc +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/clients.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/api/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/storage/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/constants.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/span_processor.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/trace_manager.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/constants.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/data/trace_run.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/integrations/langgraph.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/rules.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/utils/requests.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/version_check.py +0 -0
- {judgeval-0.2.0 → judgeval-0.3.0}/update_version.py +0 -0
{judgeval-0.2.0 → judgeval-0.3.0}/.github/workflows/lint.yaml
@@ -10,20 +10,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
       - name: Install ruff
         uses: astral-sh/ruff-action@v3
         with:
           args: "--version"
 
-      - name: Install mypy and dependencies
-        run: |
-          pip install mypy types-requests types-PyYAML
-
       - name: Run ruff formatter
         if: always()
         run: ruff format --check .
@@ -31,7 +22,3 @@ jobs:
       - name: Run ruff linter
         if: always()
         run: ruff check .
-
-      - name: Run mypy
-        if: always()
-        run: mypy --explicit-package-bases --ignore-missing-imports .
judgeval-0.3.0/.github/workflows/mypy.yaml
@@ -0,0 +1,25 @@
+name: MyPy Check
+
+on:
+  pull_request:
+    branches: [ main, staging ]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run mypy
+        if: always()
+        run: uv run mypy ./src/judgeval/
judgeval-0.3.0/.github/workflows/pre-commit-autoupdate.yaml
@@ -0,0 +1,38 @@
+name: Pre-commit auto-update
+on:
+  schedule:
+    - cron: '0 0 * * 1' # Weekly on Monday at midnight UTC
+  workflow_dispatch:
+
+jobs:
+  auto-update:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: staging
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install and update pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit autoupdate
+
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          commit-message: 'chore: update pre-commit hooks'
+          title: 'chore: update pre-commit hooks'
+          body: |
+            Auto-generated PR to update pre-commit hook versions.
+
+            Please review the changes and merge if everything looks good.
+
+            Updated by GitHub Actions on {{ date }}.
+          branch: update-pre-commit-hooks
+          base: staging
judgeval-0.3.0/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.8.0
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.4
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  # - repo: https://github.com/pre-commit/mirrors-mypy
+  #   rev: v1.17.0
+  #   hooks:
+  #     - id: mypy
+  #       language: system
+  #       # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
+  #       verbose: true
+  #       entry: bash -c 'mypy src/judgeval/ || true'
{judgeval-0.2.0 → judgeval-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.2.0
+Version: 0.3.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,6 +14,7 @@ Requires-Dist: anthropic
 Requires-Dist: boto3
 Requires-Dist: datamodel-code-generator>=0.31.1
 Requires-Dist: google-genai
+Requires-Dist: groq>=0.30.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -22,6 +23,9 @@ Requires-Dist: litellm>=1.61.15
 Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: opentelemetry-api>=1.34.1
+Requires-Dist: opentelemetry-sdk>=1.34.1
+Requires-Dist: orjson>=3.9.0
 Requires-Dist: pandas
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: python-slugify>=8.0.4
@@ -39,7 +43,7 @@ Description-Content-Type: text/markdown
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -139,7 +143,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -152,9 +156,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
{judgeval-0.2.0 → judgeval-0.3.0}/README.md
@@ -8,7 +8,7 @@
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -108,7 +108,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -121,9 +121,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
Binary files (6 image assets, no textual diff)
{judgeval-0.2.0 → judgeval-0.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.2.0"
+version = "0.3.0"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -33,6 +33,10 @@ dependencies = [
     "matplotlib>=3.10.3",
     "python-slugify>=8.0.4",
     "datamodel-code-generator>=0.31.1",
+    "groq>=0.30.0",
+    "opentelemetry-api>=1.34.1",
+    "opentelemetry-sdk>=1.34.1",
+    "orjson>=3.9.0",
 ]
 
 [project.urls]
@@ -62,6 +66,30 @@ dev = [
     "langgraph>=0.4.3",
     "pre-commit>=4.2.0",
     "types-requests>=2.32.4.20250611",
+    "mypy>=1.17.0",
+    "types-pyyaml>=6.0.12.20250516",
+    "pandas-stubs>=2.3.0.250703",
+    "lxml-stubs>=0.5.1",
+    "types-pygments>=2.19.0.20250715",
+    "types-beautifulsoup4>=4.12.0.20250516",
+    "types-cachetools>=6.1.0.20250717",
+    "types-cffi>=1.17.0.20250523",
+    "types-defusedxml>=0.7.0.20250708",
+    "types-greenlet>=3.2.0.20250417",
+    "types-jsonschema>=4.24.0.20250708",
+    "types-objgraph>=3.6.0.20240907",
+    "types-pexpect>=4.9.0.20250516",
+    "types-protobuf>=6.30.2.20250703",
+    "types-psutil>=7.0.0.20250601",
+    "types-pyopenssl>=24.1.0.20240722",
+    "types-pyasn1>=0.6.0.20250516",
+    "types-regex>=2024.11.6.20250403",
+    "types-reportlab>=4.4.1.20250602",
+    "types-simplejson>=3.20.0.20250326",
+    "types-tensorflow>=2.18.0.20250516",
+    "types-tqdm>=4.67.0.20250516",
+    "types-tree-sitter-languages>=1.10.0.20250530",
+    "types-xmltodict>=0.14.0.20241009",
 ]
 
 [tool.hatch.build]
{judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/api/api.py
@@ -1,4 +1,4 @@
-from typing import Literal, List, Dict, Any
+from typing import Literal, List, Dict, Any, Union
 from requests import exceptions
 from judgeval.common.api.constants import (
     JUDGMENT_TRACES_FETCH_API_URL,
@@ -25,6 +25,8 @@ from judgeval.common.api.constants import (
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
+    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -48,9 +50,12 @@ from judgeval.common.api.constants import (
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
+    CheckExampleKeysPayload,
 )
 from judgeval.utils.requests import requests
 
+import orjson
+
 
 class JudgmentAPIException(exceptions.HTTPError):
     """
@@ -65,7 +70,7 @@ class JudgmentAPIException(exceptions.HTTPError):
         self.request = request
 
     @property
-    def status_code(self) -> int:
+    def status_code(self) -> Union[int, None]:
         """Get the HTTP status code from the response."""
         return self.response.status_code if self.response else None
 
@@ -114,8 +119,15 @@ class JudgmentApiClient:
         try:
             r.raise_for_status()
         except exceptions.HTTPError as e:
+            try:
+                detail = r.json().get("detail", "")
+            except Exception:
+                detail = r.text
+
             raise JudgmentAPIException(
-                f"HTTP {r.status_code}: {r.reason}
+                f"HTTP {r.status_code}: {r.reason}, {detail}",
+                response=r,
+                request=e.request,
             )
 
         return r.json()
@@ -218,6 +230,14 @@ class JudgmentApiClient:
         }
         return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
 
+    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
+        payload: CheckExampleKeysPayload = {
+            "keys": keys,
+            "eval_name": eval_name,
+            "project_name": project_name,
+        }
+        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
+
     def save_scorer(self, name: str, prompt: str, options: dict):
         payload: ScorerSavePayload = {
             "name": name,
@@ -279,7 +299,7 @@ class JudgmentApiClient:
         project_name: str,
         examples: List[Dict[str, Any]],
         traces: List[Dict[str, Any]],
-        overwrite: bool,
+        overwrite: bool = False,
     ):
         payload: DatasetPushPayload = {
             "dataset_alias": dataset_alias,
@@ -302,6 +322,18 @@ class JudgmentApiClient:
             "POST", JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL, payload
         )
 
+    def append_traces(
+        self, dataset_alias: str, project_name: str, traces: List[Dict[str, Any]]
+    ):
+        payload: DatasetAppendPayload = {
+            "dataset_alias": dataset_alias,
+            "project_name": project_name,
+            "traces": traces,
+        }
+        return self._do_request(
+            "POST", JUDGMENT_DATASETS_APPEND_TRACES_API_URL, payload
+        )
+
     def pull_dataset(self, dataset_alias: str, project_name: str):
         payload: DatasetPullPayload = {
             "dataset_alias": dataset_alias,
@@ -347,6 +379,5 @@ class JudgmentApiClient:
         except Exception as e:
             return f"<Unserializable object of type {type(obj).__name__}: {e}>"
 
-
-
-        return json.dumps(data, default=fallback_encoder)
+        # orjson returns bytes, so we need to decode to str
+        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
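The two client methods added above, `check_example_keys` and `append_traces`, are thin wrappers around `_do_request`. A minimal usage sketch follows; only the method signatures come from this diff, while the import path is inferred from the file layout and the constructor arguments are assumptions (the real `__init__` is not shown here):

```python
from judgeval.common.api.api import JudgmentApiClient  # path mirrors the file above

# Hypothetical construction; the real __init__ signature is not part of this diff.
client = JudgmentApiClient(api_key="sk-...", organization_id="org-...")

# New in 0.3.0: verify that examples carry the keys an eval run expects.
client.check_example_keys(
    keys=["input", "actual_output"],
    eval_name="qa_regression",
    project_name="my_project",
)

# New in 0.3.0: append already-serialized traces to an existing dataset.
client.append_traces(
    dataset_alias="prod_traces",
    project_name="my_project",
    traces=[{"trace_id": "abc123", "spans": []}],
)
```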
{judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/api/constants.py
@@ -51,6 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
 JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
+JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
 
 
 # Evaluation API Payloads
@@ -90,9 +91,16 @@ class EvalRunNameExistsPayload(TypedDict):
     judgment_api_key: str
 
 
+class CheckExampleKeysPayload(TypedDict):
+    keys: List[str]
+    eval_name: str
+    project_name: str
+
+
 # Datasets API
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -134,7 +142,7 @@ class DatasetStatsPayload(TypedDict):
 
 
 # Projects API
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 
 
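Each endpoint constant in this module is paired with a `TypedDict` payload, so request bodies can be checked statically by the new mypy workflow. A small self-contained sketch using the `CheckExampleKeysPayload` shape added above; the `ROOT_API` value here is a placeholder, not the real constant:

```python
from typing import List, TypedDict

ROOT_API = "https://api.example.invalid"  # placeholder; the real value lives in constants.py
JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"


class CheckExampleKeysPayload(TypedDict):
    keys: List[str]
    eval_name: str
    project_name: str


payload: CheckExampleKeysPayload = {
    "keys": ["input", "actual_output"],
    "eval_name": "qa_regression",
    "project_name": "my_project",
}
# mypy (enforced by the new workflow) rejects missing or misspelled keys in this literal.
```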
{judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/storage/s3_storage.py
@@ -1,6 +1,6 @@
 import os
-import json
 import boto3
+import orjson
 from typing import Optional
 from datetime import datetime, UTC
 from botocore.exceptions import ClientError
@@ -85,8 +85,7 @@ class S3Storage:
         timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
 
-
-        trace_json = json.dumps(trace_data)
+        trace_json = orjson.dumps(trace_data).decode("utf-8")
 
         self.s3_client.put_object(
             Bucket=self.bucket_name,
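The switch from `json` to `orjson` above changes the return type of `dumps`: `orjson.dumps` produces `bytes`, not `str`, hence the explicit `.decode("utf-8")` before the S3 upload. A quick sketch of the difference (the sample dict is made up for illustration):

```python
import json

import orjson

trace_data = {"trace_id": "abc123", "spans": [{"name": "root", "duration_ms": 12}]}

assert isinstance(json.dumps(trace_data), str)      # stdlib json returns str
assert isinstance(orjson.dumps(trace_data), bytes)  # orjson returns bytes

# Mirrors the new s3_storage.py line: decode so put_object receives a str body.
trace_json = orjson.dumps(trace_data).decode("utf-8")
```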
{judgeval-0.2.0 → judgeval-0.3.0}/src/judgeval/common/tracer/core.py
@@ -32,6 +32,7 @@ from typing import (
 )
 import types
 
+
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
@@ -45,6 +46,7 @@ from openai.types.chat import ParsedChatCompletion
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
+from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
@@ -67,6 +69,8 @@ ApiClient: TypeAlias = Union[
     AsyncTogether,
     genai.Client,
     genai.client.AsyncClient,
+    Groq,
+    AsyncGroq,
 ]
 SpanType: TypeAlias = str
 
@@ -79,7 +83,7 @@ class TraceClient:
         tracer: Tracer,
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str
+        project_name: Union[str, None] = None,
         enable_monitoring: bool = True,
         enable_evaluations: bool = True,
         parent_trace_id: Optional[str] = None,
@@ -850,9 +854,9 @@ class Tracer:
 
     def __init__(
        self,
-        api_key: str
-        organization_id: str
-        project_name: str
+        api_key: Union[str, None] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Union[str, None] = os.getenv("JUDGMENT_ORG_ID"),
+        project_name: Union[str, None] = None,
         deep_tracing: bool = False, # Deep tracing is disabled by default
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower()
         == "true",
@@ -905,8 +909,8 @@ class Tracer:
         self.class_identifiers: Dict[
             str, str
         ] = {} # Dictionary to store class identifiers
-        self.span_id_to_previous_span_id: Dict[str, str
-        self.trace_id_to_previous_trace: Dict[str, TraceClient
+        self.span_id_to_previous_span_id: Dict[str, Union[str, None]] = {}
+        self.trace_id_to_previous_trace: Dict[str, Union[TraceClient, None]] = {}
         self.current_span_id: Optional[str] = None
         self.current_trace: Optional[TraceClient] = None
         self.trace_across_async_contexts: bool = trace_across_async_contexts
@@ -958,7 +962,9 @@ class Tracer:
         self.enable_monitoring = False
         self.enable_evaluations = False
 
-    def set_current_span(
+    def set_current_span(
+        self, span_id: str
+    ) -> Optional[contextvars.Token[Union[str, None]]]:
         self.span_id_to_previous_span_id[span_id] = self.current_span_id
         self.current_span_id = span_id
         Tracer.current_span_id = span_id
@@ -981,7 +987,7 @@ class Tracer:
 
     def reset_current_span(
         self,
-        token: Optional[contextvars.Token[str
+        token: Optional[contextvars.Token[Union[str, None]]] = None,
         span_id: Optional[str] = None,
     ):
         try:
@@ -997,7 +1003,7 @@ class Tracer:
 
     def set_current_trace(
         self, trace: TraceClient
-    ) -> Optional[contextvars.Token[TraceClient
+    ) -> Optional[contextvars.Token[Union[TraceClient, None]]]:
         """
         Set the current trace context in contextvars
         """
@@ -1030,7 +1036,7 @@ class Tracer:
 
     def reset_current_trace(
         self,
-        token: Optional[contextvars.Token[TraceClient
+        token: Optional[contextvars.Token[Union[TraceClient, None]]] = None,
         trace_id: Optional[str] = None,
     ):
         try:
@@ -1046,7 +1052,7 @@ class Tracer:
 
     @contextmanager
     def trace(
-        self, name: str, project_name: str
+        self, name: str, project_name: Union[str, None] = None
    ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
@@ -1692,25 +1698,31 @@ def wrap(
         return wrapper
 
     if isinstance(client, (OpenAI)):
-        client.chat.completions
-        client.responses
-        client.beta.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
+        setattr(client.responses, "create", wrapped(original_responses_create))
+        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
     elif isinstance(client, (AsyncOpenAI)):
-        client.chat.completions
-        client.responses
-
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
+        setattr(client.responses, "create", wrapped_async(original_responses_create))
+        setattr(
+            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+        )
     elif isinstance(client, (Together)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (AsyncTogether)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
     elif isinstance(client, (Anthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped(original_create))
    elif isinstance(client, (AsyncAnthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped_async(original_create))
     elif isinstance(client, (genai.Client)):
-        client.models
+        setattr(client.models, "generate_content", wrapped(original_create))
     elif isinstance(client, (genai.client.AsyncClient)):
-        client.models
+        setattr(client.models, "generate_content", wrapped_async(original_create))
+    elif isinstance(client, (Groq)):
+        setattr(client.chat.completions, "create", wrapped(original_create))
+    elif isinstance(client, (AsyncGroq)):
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
@@ -1745,6 +1757,8 @@ def _get_client_config(
             None,
             client.beta.chat.completions.parse,
         )
+    elif isinstance(client, (Groq, AsyncGroq)):
+        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Together, AsyncTogether)):
         return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
@@ -1783,9 +1797,17 @@ def _format_output_data(
     if isinstance(client, (OpenAI, AsyncOpenAI)):
         if isinstance(response, ChatCompletion):
             model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens =
-
+            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+            completion_tokens = (
+                response.usage.completion_tokens if response.usage else 0
+            )
+            cache_read_input_tokens = (
+                response.usage.prompt_tokens_details.cached_tokens
+                if response.usage
+                and response.usage.prompt_tokens_details
+                and response.usage.prompt_tokens_details.cached_tokens
+                else 0
+            )
 
         if isinstance(response, ParsedChatCompletion):
             message_content = response.choices[0].message.parsed
@@ -1793,10 +1815,19 @@ def _format_output_data(
             message_content = response.choices[0].message.content
         elif isinstance(response, Response):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens =
-
+            prompt_tokens = response.usage.input_tokens if response.usage else 0
+            completion_tokens = response.usage.output_tokens if response.usage else 0
+            cache_read_input_tokens = (
+                response.usage.input_tokens_details.cached_tokens
+                if response.usage and response.usage.input_tokens_details
+                else 0
+            )
+            if hasattr(response.output[0], "content"):
+                message_content = "".join(
+                    seg.text
+                    for seg in response.output[0].content
+                    if hasattr(seg, "text")
+                )
 
         # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
     elif isinstance(client, (Together, AsyncTogether)):
@@ -1821,6 +1852,11 @@ def _format_output_data(
         cache_read_input_tokens = response.usage.cache_read_input_tokens
         cache_creation_input_tokens = response.usage.cache_creation_input_tokens
         message_content = response.content[0].text
+    elif isinstance(client, (Groq, AsyncGroq)):
+        model_name = "groq/" + response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     else:
         judgeval_logger.warning(f"Unsupported client type: {type(client)}")
         return None, None
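With the hunks above, `wrap()` now patches `chat.completions.create` on Groq clients and `_format_output_data` records usage under a `groq/<model>` name. A minimal sketch of tracing a Groq call; the `Tracer`/`wrap` import path and the `observe` decorator follow judgeval's README-style usage and are assumptions, while the Groq support itself comes from this diff:

```python
# Assumes JUDGMENT_API_KEY, JUDGMENT_ORG_ID, and GROQ_API_KEY are set in the environment.
from groq import Groq
from judgeval.tracer import Tracer, wrap  # import path assumed, not shown in this diff

judgment = Tracer(project_name="groq_demo")  # project_name keyword is new in the hunks above
client = wrap(Groq())  # per the wrap() hunk: chat.completions.create gets patched


@judgment.observe(span_type="function")
def ask(question: str) -> str:
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",  # any Groq-hosted chat model
        messages=[{"role": "user", "content": question}],
    )
    return response.choices[0].message.content


print(ask("What is the capital of the United States?"))
```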