judgeval 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/lint.yaml +0 -13
  2. judgeval-0.3.1/.github/workflows/mypy.yaml +25 -0
  3. judgeval-0.3.1/.github/workflows/pre-commit-autoupdate.yaml +38 -0
  4. judgeval-0.3.1/.pre-commit-config.yaml +23 -0
  5. {judgeval-0.2.0 → judgeval-0.3.1}/PKG-INFO +10 -6
  6. {judgeval-0.2.0 → judgeval-0.3.1}/README.md +5 -5
  7. judgeval-0.3.1/assets/agent_trace_example.png +0 -0
  8. judgeval-0.3.1/assets/errors.png +0 -0
  9. judgeval-0.3.1/assets/online_eval.png +0 -0
  10. judgeval-0.3.1/assets/product_shot.png +0 -0
  11. judgeval-0.3.1/assets/test.png +0 -0
  12. judgeval-0.3.1/assets/tests.png +0 -0
  13. {judgeval-0.2.0 → judgeval-0.3.1}/pyproject.toml +29 -1
  14. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/api.py +38 -7
  15. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/constants.py +9 -1
  16. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/storage/s3_storage.py +2 -3
  17. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/core.py +66 -32
  18. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/otel_span_processor.py +4 -50
  19. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/span_transformer.py +16 -10
  20. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/utils.py +46 -38
  21. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/constants.py +2 -0
  22. judgeval-0.3.1/src/judgeval/data/example.py +33 -0
  23. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/judgment_types.py +23 -45
  24. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/result.py +8 -14
  25. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/scripts/openapi_transform.py +5 -5
  26. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/trace.py +3 -4
  27. judgeval-0.3.1/src/judgeval/dataset.py +192 -0
  28. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/evaluation_run.py +1 -0
  29. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/litellm_judge.py +2 -2
  30. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/mixture_of_judges.py +6 -6
  31. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/together_judge.py +6 -3
  32. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judgment_client.py +9 -71
  33. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/run_evaluation.py +41 -9
  34. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/score.py +11 -7
  35. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/utils.py +3 -3
  36. judgeval-0.3.1/src/judgeval/utils/file_utils.py +66 -0
  37. {judgeval-0.2.0 → judgeval-0.3.1}/src/update_types.sh +1 -1
  38. {judgeval-0.2.0 → judgeval-0.3.1}/uv.lock +449 -0
  39. judgeval-0.2.0/.pre-commit-config.yaml +0 -21
  40. judgeval-0.2.0/assets/product_shot.png +0 -0
  41. judgeval-0.2.0/src/judgeval/data/datasets/__init__.py +0 -4
  42. judgeval-0.2.0/src/judgeval/data/datasets/dataset.py +0 -341
  43. judgeval-0.2.0/src/judgeval/data/datasets/eval_dataset_client.py +0 -214
  44. judgeval-0.2.0/src/judgeval/data/example.py +0 -61
  45. judgeval-0.2.0/src/judgeval/utils/file_utils.py +0 -51
  46. {judgeval-0.2.0 → judgeval-0.3.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  47. {judgeval-0.2.0 → judgeval-0.3.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  48. {judgeval-0.2.0 → judgeval-0.3.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  49. {judgeval-0.2.0 → judgeval-0.3.1}/.github/pull_request_template.md +0 -0
  50. {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/blocked-pr.yaml +0 -0
  51. {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/ci.yaml +0 -0
  52. {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/merge-branch-check.yaml +0 -0
  53. {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/release.yaml +0 -0
  54. {judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/validate-branch.yaml +0 -0
  55. {judgeval-0.2.0 → judgeval-0.3.1}/.gitignore +0 -0
  56. {judgeval-0.2.0 → judgeval-0.3.1}/LICENSE.md +0 -0
  57. {judgeval-0.2.0 → judgeval-0.3.1}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  58. {judgeval-0.2.0 → judgeval-0.3.1}/assets/agent.gif +0 -0
  59. {judgeval-0.2.0 → judgeval-0.3.1}/assets/data.gif +0 -0
  60. {judgeval-0.2.0 → judgeval-0.3.1}/assets/dataset_clustering_screenshot.png +0 -0
  61. {judgeval-0.2.0 → judgeval-0.3.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
  62. {judgeval-0.2.0 → judgeval-0.3.1}/assets/datasets_preview_screenshot.png +0 -0
  63. {judgeval-0.2.0 → judgeval-0.3.1}/assets/document.gif +0 -0
  64. {judgeval-0.2.0 → judgeval-0.3.1}/assets/error_analysis_dashboard.png +0 -0
  65. {judgeval-0.2.0 → judgeval-0.3.1}/assets/experiments_dashboard_screenshot.png +0 -0
  66. {judgeval-0.2.0 → judgeval-0.3.1}/assets/experiments_page.png +0 -0
  67. {judgeval-0.2.0 → judgeval-0.3.1}/assets/experiments_pagev2.png +0 -0
  68. {judgeval-0.2.0 → judgeval-0.3.1}/assets/logo-dark.svg +0 -0
  69. {judgeval-0.2.0 → judgeval-0.3.1}/assets/logo-light.svg +0 -0
  70. {judgeval-0.2.0 → judgeval-0.3.1}/assets/monitoring_screenshot.png +0 -0
  71. {judgeval-0.2.0 → judgeval-0.3.1}/assets/new_darkmode.svg +0 -0
  72. {judgeval-0.2.0 → judgeval-0.3.1}/assets/new_lightmode.svg +0 -0
  73. {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace.gif +0 -0
  74. {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace_demo.png +0 -0
  75. {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace_screenshot.png +0 -0
  76. {judgeval-0.2.0 → judgeval-0.3.1}/assets/trace_screenshot_old.png +0 -0
  77. {judgeval-0.2.0 → judgeval-0.3.1}/pytest.ini +0 -0
  78. {judgeval-0.2.0 → judgeval-0.3.1}/src/.coveragerc +0 -0
  79. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/__init__.py +0 -0
  80. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/clients.py +0 -0
  81. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/__init__.py +0 -0
  82. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/__init__.py +0 -0
  83. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/exceptions.py +0 -0
  84. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/logger.py +0 -0
  85. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/storage/__init__.py +0 -0
  86. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/__init__.py +0 -0
  87. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/constants.py +0 -0
  88. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/otel_exporter.py +0 -0
  89. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/span_processor.py +0 -0
  90. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/trace_manager.py +0 -0
  91. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/__init__.py +0 -0
  92. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/scorer_data.py +0 -0
  93. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  94. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/tool.py +0 -0
  95. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/data/trace_run.py +0 -0
  96. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/integrations/langgraph.py +0 -0
  97. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/__init__.py +0 -0
  98. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/base_judge.py +0 -0
  99. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/judges/utils.py +0 -0
  100. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/rules.py +0 -0
  101. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/__init__.py +0 -0
  102. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/agent_scorer.py +0 -0
  103. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/api_scorer.py +0 -0
  104. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/base_scorer.py +0 -0
  105. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/example_scorer.py +0 -0
  106. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/exceptions.py +0 -0
  107. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  108. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  109. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  110. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  111. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  112. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  113. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  114. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  115. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  116. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
  117. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  118. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  119. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/tracer/__init__.py +0 -0
  120. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/utils/alerts.py +0 -0
  121. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/utils/requests.py +0 -0
  122. {judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/version_check.py +0 -0
  123. {judgeval-0.2.0 → judgeval-0.3.1}/update_version.py +0 -0

{judgeval-0.2.0 → judgeval-0.3.1}/.github/workflows/lint.yaml
@@ -10,20 +10,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
       - name: Install ruff
         uses: astral-sh/ruff-action@v3
         with:
           args: "--version"
 
-      - name: Install mypy and dependencies
-        run: |
-          pip install mypy types-requests types-PyYAML
-
       - name: Run ruff formatter
         if: always()
         run: ruff format --check .
@@ -31,7 +22,3 @@ jobs:
       - name: Run ruff linter
         if: always()
         run: ruff check .
-
-      - name: Run mypy
-        if: always()
-        run: mypy --explicit-package-bases --ignore-missing-imports .

judgeval-0.3.1/.github/workflows/mypy.yaml
@@ -0,0 +1,25 @@
+name: MyPy Check
+
+on:
+  pull_request:
+    branches: [ main, staging ]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run mypy
+        if: always()
+        run: uv run mypy ./src/judgeval/

judgeval-0.3.1/.github/workflows/pre-commit-autoupdate.yaml
@@ -0,0 +1,38 @@
+name: Pre-commit auto-update
+on:
+  schedule:
+    - cron: '0 0 * * 1' # Weekly on Monday at midnight UTC
+  workflow_dispatch:
+
+jobs:
+  auto-update:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: staging
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install and update pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit autoupdate
+
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          commit-message: 'chore: update pre-commit hooks'
+          title: 'chore: update pre-commit hooks'
+          body: |
+            Auto-generated PR to update pre-commit hook versions.
+
+            Please review the changes and merge if everything looks good.
+
+            Updated by GitHub Actions on {{ date }}.
+          branch: update-pre-commit-hooks
+          base: staging

judgeval-0.3.1/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.8.0
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.4
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  # - repo: https://github.com/pre-commit/mirrors-mypy
+  #   rev: v1.17.0
+  #   hooks:
+  #     - id: mypy
+  #       language: system
+  #       # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
+  #       verbose: true
+  #       entry: bash -c 'mypy src/judgeval/ || true'

{judgeval-0.2.0 → judgeval-0.3.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.2.0
+Version: 0.3.1
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,6 +14,7 @@ Requires-Dist: anthropic
 Requires-Dist: boto3
 Requires-Dist: datamodel-code-generator>=0.31.1
 Requires-Dist: google-genai
+Requires-Dist: groq>=0.30.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -22,6 +23,9 @@ Requires-Dist: litellm>=1.61.15
 Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: opentelemetry-api>=1.34.1
+Requires-Dist: opentelemetry-sdk>=1.34.1
+Requires-Dist: orjson>=3.9.0
 Requires-Dist: pandas
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: python-slugify>=8.0.4
@@ -39,7 +43,7 @@ Description-Content-Type: text/markdown
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -139,7 +143,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -152,9 +156,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting

{judgeval-0.2.0 → judgeval-0.3.1}/README.md
@@ -8,7 +8,7 @@
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -108,7 +108,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -121,9 +121,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
Binary file
Binary file
Binary file
Binary file
Binary file

{judgeval-0.2.0 → judgeval-0.3.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.2.0"
+version = "0.3.1"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -33,6 +33,10 @@ dependencies = [
     "matplotlib>=3.10.3",
     "python-slugify>=8.0.4",
     "datamodel-code-generator>=0.31.1",
+    "groq>=0.30.0",
+    "opentelemetry-api>=1.34.1",
+    "opentelemetry-sdk>=1.34.1",
+    "orjson>=3.9.0",
 ]
 
 [project.urls]
@@ -62,6 +66,30 @@ dev = [
     "langgraph>=0.4.3",
     "pre-commit>=4.2.0",
     "types-requests>=2.32.4.20250611",
+    "mypy>=1.17.0",
+    "types-pyyaml>=6.0.12.20250516",
+    "pandas-stubs>=2.3.0.250703",
+    "lxml-stubs>=0.5.1",
+    "types-pygments>=2.19.0.20250715",
+    "types-beautifulsoup4>=4.12.0.20250516",
+    "types-cachetools>=6.1.0.20250717",
+    "types-cffi>=1.17.0.20250523",
+    "types-defusedxml>=0.7.0.20250708",
+    "types-greenlet>=3.2.0.20250417",
+    "types-jsonschema>=4.24.0.20250708",
+    "types-objgraph>=3.6.0.20240907",
+    "types-pexpect>=4.9.0.20250516",
+    "types-protobuf>=6.30.2.20250703",
+    "types-psutil>=7.0.0.20250601",
+    "types-pyopenssl>=24.1.0.20240722",
+    "types-pyasn1>=0.6.0.20250516",
+    "types-regex>=2024.11.6.20250403",
+    "types-reportlab>=4.4.1.20250602",
+    "types-simplejson>=3.20.0.20250326",
+    "types-tensorflow>=2.18.0.20250516",
+    "types-tqdm>=4.67.0.20250516",
+    "types-tree-sitter-languages>=1.10.0.20250530",
+    "types-xmltodict>=0.14.0.20241009",
 ]
 
 [tool.hatch.build]

{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/api.py
@@ -1,4 +1,4 @@
-from typing import Literal, List, Dict, Any
+from typing import Literal, List, Dict, Any, Union
 from requests import exceptions
 from judgeval.common.api.constants import (
     JUDGMENT_TRACES_FETCH_API_URL,
@@ -25,6 +25,8 @@ from judgeval.common.api.constants import (
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
+    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -48,9 +50,12 @@ from judgeval.common.api.constants import (
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
+    CheckExampleKeysPayload,
 )
 from judgeval.utils.requests import requests
 
+import orjson
+
 
 class JudgmentAPIException(exceptions.HTTPError):
     """
@@ -65,7 +70,7 @@ class JudgmentAPIException(exceptions.HTTPError):
         self.request = request
 
     @property
-    def status_code(self) -> int | None:
+    def status_code(self) -> Union[int, None]:
         """Get the HTTP status code from the response."""
         return self.response.status_code if self.response else None
 
@@ -114,8 +119,15 @@ class JudgmentApiClient:
         try:
             r.raise_for_status()
         except exceptions.HTTPError as e:
+            try:
+                detail = r.json().get("detail", "")
+            except Exception:
+                detail = r.text
+
             raise JudgmentAPIException(
-                f"HTTP {r.status_code}: {r.reason}", response=r, request=e.request
+                f"HTTP {r.status_code}: {r.reason}, {detail}",
+                response=r,
+                request=e.request,
             )
 
         return r.json()
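
The reworked error path above surfaces the backend's `detail` message instead of only the status line. A minimal standalone sketch of the same pattern, assuming a FastAPI-style `{"detail": ...}` error body; the `error_message` helper is hypothetical, for illustration only:

```python
# Hypothetical helper mirroring the new _do_request error handling: pull the
# "detail" field from a JSON error body, falling back to the raw text when the
# body is not JSON (e.g. an HTML error page from a proxy).
from requests import Response


def error_message(resp: Response) -> str:
    try:
        detail = resp.json().get("detail", "")
    except Exception:
        detail = resp.text
    return f"HTTP {resp.status_code}: {resp.reason}, {detail}"
```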
@@ -218,6 +230,14 @@ class JudgmentApiClient:
         }
         return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
 
+    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
+        payload: CheckExampleKeysPayload = {
+            "keys": keys,
+            "eval_name": eval_name,
+            "project_name": project_name,
+        }
+        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
+
     def save_scorer(self, name: str, prompt: str, options: dict):
         payload: ScorerSavePayload = {
             "name": name,
@@ -279,7 +299,7 @@ class JudgmentApiClient:
         project_name: str,
         examples: List[Dict[str, Any]],
         traces: List[Dict[str, Any]],
-        overwrite: bool,
+        overwrite: bool = False,
     ):
         payload: DatasetPushPayload = {
             "dataset_alias": dataset_alias,
@@ -302,6 +322,18 @@ class JudgmentApiClient:
             "POST", JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL, payload
         )
 
+    def append_traces(
+        self, dataset_alias: str, project_name: str, traces: List[Dict[str, Any]]
+    ):
+        payload: DatasetAppendPayload = {
+            "dataset_alias": dataset_alias,
+            "project_name": project_name,
+            "traces": traces,
+        }
+        return self._do_request(
+            "POST", JUDGMENT_DATASETS_APPEND_TRACES_API_URL, payload
+        )
+
     def pull_dataset(self, dataset_alias: str, project_name: str):
         payload: DatasetPullPayload = {
             "dataset_alias": dataset_alias,
@@ -347,6 +379,5 @@ class JudgmentApiClient:
         except Exception as e:
             return f"<Unserializable object of type {type(obj).__name__}: {e}>"
 
-        import json
-
-        return json.dumps(data, default=fallback_encoder)
+        # orjson returns bytes, so we need to decode to str
+        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
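
Payload serialization now goes through orjson rather than the standard library. `orjson.dumps()` returns bytes and rejects types that `json.dumps` would coerce, hence the `default=` fallback and the explicit decode. A self-contained sketch of the pattern; the `serialize` helper and its `repr` fallback are illustrative assumptions, not the exact client code:

```python
import orjson


def serialize(data) -> str:
    # Fallback for types orjson cannot encode natively (sets, custom objects, ...).
    def fallback_encoder(obj):
        try:
            return repr(obj)
        except Exception as e:
            return f"<Unserializable object of type {type(obj).__name__}: {e}>"

    # orjson returns bytes, so decode before sending the payload as a string.
    return orjson.dumps(data, default=fallback_encoder).decode("utf-8")


print(serialize({"trace_id": "abc", "tags": {"prod", "beta"}}))
```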

{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/api/constants.py
@@ -51,6 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
 JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
+JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
 
 
 # Evaluation API Payloads
@@ -90,9 +91,16 @@ class EvalRunNameExistsPayload(TypedDict):
     judgment_api_key: str
 
 
+class CheckExampleKeysPayload(TypedDict):
+    keys: List[str]
+    eval_name: str
+    project_name: str
+
+
 # Datasets API
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
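
The new `/check_example_keys/` endpoint takes the `CheckExampleKeysPayload` defined above. A small illustration of building such a payload; the import path follows this diff, and the field values are made-up examples:

```python
from judgeval.common.api.constants import CheckExampleKeysPayload

payload: CheckExampleKeysPayload = {
    "keys": ["input", "actual_output"],  # example keys to validate
    "eval_name": "qa-regression",        # made-up evaluation run name
    "project_name": "demo-project",      # made-up project name
}
```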
@@ -134,7 +142,7 @@ class DatasetStatsPayload(TypedDict):
 
 
 # Projects API
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval/"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 
 

{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/storage/s3_storage.py
@@ -1,6 +1,6 @@
 import os
-import json
 import boto3
+import orjson
 from typing import Optional
 from datetime import datetime, UTC
 from botocore.exceptions import ClientError
@@ -85,8 +85,7 @@ class S3Storage:
         timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
 
-        # Convert trace data to JSON string
-        trace_json = json.dumps(trace_data)
+        trace_json = orjson.dumps(trace_data).decode("utf-8")
 
         self.s3_client.put_object(
             Bucket=self.bucket_name,

{judgeval-0.2.0 → judgeval-0.3.1}/src/judgeval/common/tracer/core.py
@@ -32,6 +32,7 @@ from typing import (
 )
 import types
 
+
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
@@ -45,6 +46,7 @@ from openai.types.chat import ParsedChatCompletion
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
+from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
@@ -67,6 +69,8 @@ ApiClient: TypeAlias = Union[
     AsyncTogether,
     genai.Client,
     genai.client.AsyncClient,
+    Groq,
+    AsyncGroq,
 ]
 SpanType: TypeAlias = str
 
@@ -79,7 +83,7 @@ class TraceClient:
         tracer: Tracer,
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str | None = None,
+        project_name: Union[str, None] = None,
         enable_monitoring: bool = True,
         enable_evaluations: bool = True,
         parent_trace_id: Optional[str] = None,
@@ -414,8 +418,6 @@
                 self.start_time or time.time(), timezone.utc
             ).isoformat(),
             "duration": total_duration,
-            "trace_spans": [span.model_dump() for span in self.trace_spans],
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name,
@@ -850,9 +852,9 @@
 
     def __init__(
         self,
-        api_key: str | None = os.getenv("JUDGMENT_API_KEY"),
-        organization_id: str | None = os.getenv("JUDGMENT_ORG_ID"),
-        project_name: str | None = None,
+        api_key: Union[str, None] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Union[str, None] = os.getenv("JUDGMENT_ORG_ID"),
+        project_name: Union[str, None] = None,
         deep_tracing: bool = False,  # Deep tracing is disabled by default
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower()
         == "true",
@@ -905,8 +907,8 @@
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
-        self.span_id_to_previous_span_id: Dict[str, str | None] = {}
-        self.trace_id_to_previous_trace: Dict[str, TraceClient | None] = {}
+        self.span_id_to_previous_span_id: Dict[str, Union[str, None]] = {}
+        self.trace_id_to_previous_trace: Dict[str, Union[TraceClient, None]] = {}
         self.current_span_id: Optional[str] = None
         self.current_trace: Optional[TraceClient] = None
         self.trace_across_async_contexts: bool = trace_across_async_contexts
@@ -958,7 +960,9 @@
             self.enable_monitoring = False
             self.enable_evaluations = False
 
-    def set_current_span(self, span_id: str) -> Optional[contextvars.Token[str | None]]:
+    def set_current_span(
+        self, span_id: str
+    ) -> Optional[contextvars.Token[Union[str, None]]]:
         self.span_id_to_previous_span_id[span_id] = self.current_span_id
         self.current_span_id = span_id
         Tracer.current_span_id = span_id
@@ -981,7 +985,7 @@
 
     def reset_current_span(
        self,
-        token: Optional[contextvars.Token[str | None]] = None,
+        token: Optional[contextvars.Token[Union[str, None]]] = None,
        span_id: Optional[str] = None,
     ):
        try:
@@ -997,7 +1001,7 @@
 
     def set_current_trace(
         self, trace: TraceClient
-    ) -> Optional[contextvars.Token[TraceClient | None]]:
+    ) -> Optional[contextvars.Token[Union[TraceClient, None]]]:
         """
         Set the current trace context in contextvars
         """
@@ -1030,7 +1034,7 @@
 
     def reset_current_trace(
         self,
-        token: Optional[contextvars.Token[TraceClient | None]] = None,
+        token: Optional[contextvars.Token[Union[TraceClient, None]]] = None,
         trace_id: Optional[str] = None,
     ):
         try:
@@ -1046,7 +1050,7 @@
 
     @contextmanager
     def trace(
-        self, name: str, project_name: str | None = None
+        self, name: str, project_name: Union[str, None] = None
     ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
@@ -1692,25 +1696,31 @@ def wrap(
         return wrapper
 
     if isinstance(client, (OpenAI)):
-        client.chat.completions.create = wrapped(original_create)
-        client.responses.create = wrapped(original_responses_create)
-        client.beta.chat.completions.parse = wrapped(original_beta_parse)
+        setattr(client.chat.completions, "create", wrapped(original_create))
+        setattr(client.responses, "create", wrapped(original_responses_create))
+        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
     elif isinstance(client, (AsyncOpenAI)):
-        client.chat.completions.create = wrapped_async(original_create)
-        client.responses.create = wrapped_async(original_responses_create)
-        client.beta.chat.completions.parse = wrapped_async(original_beta_parse)
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
+        setattr(client.responses, "create", wrapped_async(original_responses_create))
+        setattr(
+            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+        )
     elif isinstance(client, (Together)):
-        client.chat.completions.create = wrapped(original_create)
+        setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (AsyncTogether)):
-        client.chat.completions.create = wrapped_async(original_create)
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
     elif isinstance(client, (Anthropic)):
-        client.messages.create = wrapped(original_create)
+        setattr(client.messages, "create", wrapped(original_create))
    elif isinstance(client, (AsyncAnthropic)):
-        client.messages.create = wrapped_async(original_create)
+        setattr(client.messages, "create", wrapped_async(original_create))
     elif isinstance(client, (genai.Client)):
-        client.models.generate_content = wrapped(original_create)
+        setattr(client.models, "generate_content", wrapped(original_create))
     elif isinstance(client, (genai.client.AsyncClient)):
-        client.models.generate_content = wrapped_async(original_create)
+        setattr(client.models, "generate_content", wrapped_async(original_create))
+    elif isinstance(client, (Groq)):
+        setattr(client.chat.completions, "create", wrapped(original_create))
+    elif isinstance(client, (AsyncGroq)):
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
@@ -1745,6 +1755,8 @@ def _get_client_config(
             None,
             client.beta.chat.completions.parse,
         )
+    elif isinstance(client, (Groq, AsyncGroq)):
+        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Together, AsyncTogether)):
         return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
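
With the two hunks above, `wrap()` now recognizes Groq clients: it patches `chat.completions.create` and labels the span `GROQ_API_CALL`. A sketch of what tracing a Groq call might look like, assuming `Tracer` and `wrap` are re-exported from `judgeval.common.tracer` as in earlier releases; the model name is just an example:

```python
from groq import Groq
from judgeval.common.tracer import Tracer, wrap  # assumed re-export path

judgment = Tracer(project_name="groq-demo")  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID
client = wrap(Groq())  # patches client.chat.completions.create, as shown above

response = client.chat.completions.create(
    model="llama-3.3-70b-versatile",  # any Groq-hosted model
    messages=[{"role": "user", "content": "What is the capital of the United States?"}],
)
print(response.choices[0].message.content)
```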
@@ -1783,9 +1795,17 @@ def _format_output_data(
     if isinstance(client, (OpenAI, AsyncOpenAI)):
         if isinstance(response, ChatCompletion):
             model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens = response.usage.completion_tokens
-            cache_read_input_tokens = response.usage.prompt_tokens_details.cached_tokens
+            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+            completion_tokens = (
+                response.usage.completion_tokens if response.usage else 0
+            )
+            cache_read_input_tokens = (
+                response.usage.prompt_tokens_details.cached_tokens
+                if response.usage
+                and response.usage.prompt_tokens_details
+                and response.usage.prompt_tokens_details.cached_tokens
+                else 0
+            )
 
             if isinstance(response, ParsedChatCompletion):
                 message_content = response.choices[0].message.parsed
@@ -1793,10 +1813,19 @@ def _format_output_data(
                 message_content = response.choices[0].message.content
         elif isinstance(response, Response):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens = response.usage.input_tokens_details.cached_tokens
-            message_content = "".join(seg.text for seg in response.output[0].content)
+            prompt_tokens = response.usage.input_tokens if response.usage else 0
+            completion_tokens = response.usage.output_tokens if response.usage else 0
+            cache_read_input_tokens = (
+                response.usage.input_tokens_details.cached_tokens
+                if response.usage and response.usage.input_tokens_details
+                else 0
+            )
+            if hasattr(response.output[0], "content"):
+                message_content = "".join(
+                    seg.text
+                    for seg in response.output[0].content
+                    if hasattr(seg, "text")
+                )
 
     # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
     elif isinstance(client, (Together, AsyncTogether)):
@@ -1821,6 +1850,11 @@ def _format_output_data(
         cache_read_input_tokens = response.usage.cache_read_input_tokens
         cache_creation_input_tokens = response.usage.cache_creation_input_tokens
         message_content = response.content[0].text
+    elif isinstance(client, (Groq, AsyncGroq)):
+        model_name = "groq/" + response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     else:
         judgeval_logger.warning(f"Unsupported client type: {type(client)}")
         return None, None
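
`_format_output_data` now guards every `response.usage` access, so spans record zero token counts instead of raising when a provider omits usage details. A standalone sketch of that defensive pattern; `extract_usage` is a hypothetical helper, not part of the package:

```python
from typing import Any, Tuple


def extract_usage(response: Any) -> Tuple[int, int, int]:
    # Mirrors the guarded access above: default to 0 whenever usage data is missing.
    usage = getattr(response, "usage", None)
    prompt_tokens = usage.prompt_tokens if usage else 0
    completion_tokens = usage.completion_tokens if usage else 0
    details = getattr(usage, "prompt_tokens_details", None) if usage else None
    cached_tokens = details.cached_tokens if details and details.cached_tokens else 0
    return prompt_tokens, completion_tokens, cached_tokens
```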