judgeval 0.1.0.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/lint.yaml +0 -13
  2. judgeval-0.3.0/.github/workflows/mypy.yaml +25 -0
  3. judgeval-0.3.0/.github/workflows/pre-commit-autoupdate.yaml +38 -0
  4. judgeval-0.3.0/.pre-commit-config.yaml +23 -0
  5. {judgeval-0.1.0 → judgeval-0.3.0}/PKG-INFO +10 -6
  6. {judgeval-0.1.0 → judgeval-0.3.0}/README.md +5 -5
  7. judgeval-0.3.0/assets/agent_trace_example.png +0 -0
  8. judgeval-0.3.0/assets/errors.png +0 -0
  9. judgeval-0.3.0/assets/online_eval.png +0 -0
  10. judgeval-0.3.0/assets/product_shot.png +0 -0
  11. judgeval-0.3.0/assets/test.png +0 -0
  12. judgeval-0.3.0/assets/tests.png +0 -0
  13. {judgeval-0.1.0 → judgeval-0.3.0}/pyproject.toml +29 -1
  14. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/api/api.py +38 -7
  15. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/api/constants.py +9 -1
  16. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/storage/s3_storage.py +2 -3
  17. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/core.py +66 -30
  18. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/otel_span_processor.py +4 -50
  19. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/span_transformer.py +16 -10
  20. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/utils.py +46 -38
  21. judgeval-0.3.0/src/judgeval/data/example.py +33 -0
  22. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/judgment_types.py +23 -44
  23. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/result.py +8 -14
  24. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/scripts/openapi_transform.py +5 -5
  25. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/trace.py +3 -4
  26. judgeval-0.3.0/src/judgeval/dataset.py +192 -0
  27. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/evaluation_run.py +1 -0
  28. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judges/litellm_judge.py +2 -2
  29. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judges/mixture_of_judges.py +6 -6
  30. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judges/together_judge.py +4 -2
  31. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judgment_client.py +9 -71
  32. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/run_evaluation.py +47 -8
  33. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/score.py +11 -7
  34. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/utils.py +3 -3
  35. judgeval-0.3.0/src/judgeval/utils/file_utils.py +66 -0
  36. {judgeval-0.1.0 → judgeval-0.3.0}/src/update_types.sh +1 -1
  37. {judgeval-0.1.0 → judgeval-0.3.0}/uv.lock +449 -0
  38. judgeval-0.1.0/.pre-commit-config.yaml +0 -21
  39. judgeval-0.1.0/assets/product_shot.png +0 -0
  40. judgeval-0.1.0/src/judgeval/data/datasets/__init__.py +0 -4
  41. judgeval-0.1.0/src/judgeval/data/datasets/dataset.py +0 -341
  42. judgeval-0.1.0/src/judgeval/data/datasets/eval_dataset_client.py +0 -214
  43. judgeval-0.1.0/src/judgeval/data/example.py +0 -61
  44. judgeval-0.1.0/src/judgeval/utils/file_utils.py +0 -51
  45. {judgeval-0.1.0 → judgeval-0.3.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  46. {judgeval-0.1.0 → judgeval-0.3.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  47. {judgeval-0.1.0 → judgeval-0.3.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  48. {judgeval-0.1.0 → judgeval-0.3.0}/.github/pull_request_template.md +0 -0
  49. {judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/blocked-pr.yaml +0 -0
  50. {judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/ci.yaml +0 -0
  51. {judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/merge-branch-check.yaml +0 -0
  52. {judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/release.yaml +0 -0
  53. {judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/validate-branch.yaml +0 -0
  54. {judgeval-0.1.0 → judgeval-0.3.0}/.gitignore +0 -0
  55. {judgeval-0.1.0 → judgeval-0.3.0}/LICENSE.md +0 -0
  56. {judgeval-0.1.0 → judgeval-0.3.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  57. {judgeval-0.1.0 → judgeval-0.3.0}/assets/agent.gif +0 -0
  58. {judgeval-0.1.0 → judgeval-0.3.0}/assets/data.gif +0 -0
  59. {judgeval-0.1.0 → judgeval-0.3.0}/assets/dataset_clustering_screenshot.png +0 -0
  60. {judgeval-0.1.0 → judgeval-0.3.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
  61. {judgeval-0.1.0 → judgeval-0.3.0}/assets/datasets_preview_screenshot.png +0 -0
  62. {judgeval-0.1.0 → judgeval-0.3.0}/assets/document.gif +0 -0
  63. {judgeval-0.1.0 → judgeval-0.3.0}/assets/error_analysis_dashboard.png +0 -0
  64. {judgeval-0.1.0 → judgeval-0.3.0}/assets/experiments_dashboard_screenshot.png +0 -0
  65. {judgeval-0.1.0 → judgeval-0.3.0}/assets/experiments_page.png +0 -0
  66. {judgeval-0.1.0 → judgeval-0.3.0}/assets/experiments_pagev2.png +0 -0
  67. {judgeval-0.1.0 → judgeval-0.3.0}/assets/logo-dark.svg +0 -0
  68. {judgeval-0.1.0 → judgeval-0.3.0}/assets/logo-light.svg +0 -0
  69. {judgeval-0.1.0 → judgeval-0.3.0}/assets/monitoring_screenshot.png +0 -0
  70. {judgeval-0.1.0 → judgeval-0.3.0}/assets/new_darkmode.svg +0 -0
  71. {judgeval-0.1.0 → judgeval-0.3.0}/assets/new_lightmode.svg +0 -0
  72. {judgeval-0.1.0 → judgeval-0.3.0}/assets/trace.gif +0 -0
  73. {judgeval-0.1.0 → judgeval-0.3.0}/assets/trace_demo.png +0 -0
  74. {judgeval-0.1.0 → judgeval-0.3.0}/assets/trace_screenshot.png +0 -0
  75. {judgeval-0.1.0 → judgeval-0.3.0}/assets/trace_screenshot_old.png +0 -0
  76. {judgeval-0.1.0 → judgeval-0.3.0}/pytest.ini +0 -0
  77. {judgeval-0.1.0 → judgeval-0.3.0}/src/.coveragerc +0 -0
  78. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/__init__.py +0 -0
  79. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/clients.py +0 -0
  80. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/__init__.py +0 -0
  81. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/api/__init__.py +0 -0
  82. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/exceptions.py +0 -0
  83. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/logger.py +0 -0
  84. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/storage/__init__.py +0 -0
  85. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/__init__.py +0 -0
  86. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/constants.py +0 -0
  87. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
  88. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/span_processor.py +0 -0
  89. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/trace_manager.py +0 -0
  90. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/constants.py +0 -0
  91. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/__init__.py +0 -0
  92. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/scorer_data.py +0 -0
  93. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  94. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/tool.py +0 -0
  95. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/data/trace_run.py +0 -0
  96. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/integrations/langgraph.py +0 -0
  97. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judges/__init__.py +0 -0
  98. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judges/base_judge.py +0 -0
  99. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/judges/utils.py +0 -0
  100. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/rules.py +0 -0
  101. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/__init__.py +0 -0
  102. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/agent_scorer.py +0 -0
  103. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/api_scorer.py +0 -0
  104. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/base_scorer.py +0 -0
  105. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/example_scorer.py +0 -0
  106. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/exceptions.py +0 -0
  107. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  108. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  109. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  110. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  111. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  112. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  113. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  114. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  115. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  116. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -0
  117. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  118. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  119. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/tracer/__init__.py +0 -0
  120. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/utils/alerts.py +0 -0
  121. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/utils/requests.py +0 -0
  122. {judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/version_check.py +0 -0
  123. {judgeval-0.1.0 → judgeval-0.3.0}/update_version.py +0 -0
{judgeval-0.1.0 → judgeval-0.3.0}/.github/workflows/lint.yaml
@@ -10,20 +10,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
       - name: Install ruff
         uses: astral-sh/ruff-action@v3
         with:
           args: "--version"
 
-      - name: Install mypy and dependencies
-        run: |
-          pip install mypy types-requests types-PyYAML
-
       - name: Run ruff formatter
         if: always()
         run: ruff format --check .
@@ -31,7 +22,3 @@ jobs:
       - name: Run ruff linter
         if: always()
         run: ruff check .
-
-      - name: Run mypy
-        if: always()
-        run: mypy --explicit-package-bases --ignore-missing-imports .
judgeval-0.3.0/.github/workflows/mypy.yaml
@@ -0,0 +1,25 @@
+name: MyPy Check
+
+on:
+  pull_request:
+    branches: [ main, staging ]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run mypy
+        if: always()
+        run: uv run mypy ./src/judgeval/
judgeval-0.3.0/.github/workflows/pre-commit-autoupdate.yaml
@@ -0,0 +1,38 @@
+name: Pre-commit auto-update
+on:
+  schedule:
+    - cron: '0 0 * * 1' # Weekly on Monday at midnight UTC
+  workflow_dispatch:
+
+jobs:
+  auto-update:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: staging
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install and update pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit autoupdate
+
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          commit-message: 'chore: update pre-commit hooks'
+          title: 'chore: update pre-commit hooks'
+          body: |
+            Auto-generated PR to update pre-commit hook versions.
+
+            Please review the changes and merge if everything looks good.
+
+            Updated by GitHub Actions on {{ date }}.
+          branch: update-pre-commit-hooks
+          base: staging
judgeval-0.3.0/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.8.0
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.4
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  # - repo: https://github.com/pre-commit/mirrors-mypy
+  #   rev: v1.17.0
+  #   hooks:
+  #     - id: mypy
+  #       language: system
+  #       # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
+  #       verbose: true
+  #       entry: bash -c 'mypy src/judgeval/ || true'
{judgeval-0.1.0 → judgeval-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.1.0
+Version: 0.3.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,6 +14,7 @@ Requires-Dist: anthropic
 Requires-Dist: boto3
 Requires-Dist: datamodel-code-generator>=0.31.1
 Requires-Dist: google-genai
+Requires-Dist: groq>=0.30.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -22,6 +23,9 @@ Requires-Dist: litellm>=1.61.15
 Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: opentelemetry-api>=1.34.1
+Requires-Dist: opentelemetry-sdk>=1.34.1
+Requires-Dist: orjson>=3.9.0
 Requires-Dist: pandas
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: python-slugify>=8.0.4
@@ -39,7 +43,7 @@ Description-Content-Type: text/markdown
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -139,7 +143,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -152,9 +156,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
{judgeval-0.1.0 → judgeval-0.3.0}/README.md
@@ -8,7 +8,7 @@
 Enable self-learning agents with traces, evals, and environment data.
 </div>
 
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
 
 [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
@@ -108,7 +108,7 @@ run_agent("What is the capital of the United States?")
 ```
 You'll see your trace exported to the Judgment Platform:
 
-<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
 
 
 [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -121,9 +121,9 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
 | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
Binary asset files (the added/removed assets/*.png entries above) differ; contents not shown.
{judgeval-0.1.0 → judgeval-0.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.1.0"
+version = "0.3.0"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -33,6 +33,10 @@ dependencies = [
     "matplotlib>=3.10.3",
     "python-slugify>=8.0.4",
     "datamodel-code-generator>=0.31.1",
+    "groq>=0.30.0",
+    "opentelemetry-api>=1.34.1",
+    "opentelemetry-sdk>=1.34.1",
+    "orjson>=3.9.0",
 ]
 
 [project.urls]
@@ -62,6 +66,30 @@ dev = [
     "langgraph>=0.4.3",
     "pre-commit>=4.2.0",
     "types-requests>=2.32.4.20250611",
+    "mypy>=1.17.0",
+    "types-pyyaml>=6.0.12.20250516",
+    "pandas-stubs>=2.3.0.250703",
+    "lxml-stubs>=0.5.1",
+    "types-pygments>=2.19.0.20250715",
+    "types-beautifulsoup4>=4.12.0.20250516",
+    "types-cachetools>=6.1.0.20250717",
+    "types-cffi>=1.17.0.20250523",
+    "types-defusedxml>=0.7.0.20250708",
+    "types-greenlet>=3.2.0.20250417",
+    "types-jsonschema>=4.24.0.20250708",
+    "types-objgraph>=3.6.0.20240907",
+    "types-pexpect>=4.9.0.20250516",
+    "types-protobuf>=6.30.2.20250703",
+    "types-psutil>=7.0.0.20250601",
+    "types-pyopenssl>=24.1.0.20240722",
+    "types-pyasn1>=0.6.0.20250516",
+    "types-regex>=2024.11.6.20250403",
+    "types-reportlab>=4.4.1.20250602",
+    "types-simplejson>=3.20.0.20250326",
+    "types-tensorflow>=2.18.0.20250516",
+    "types-tqdm>=4.67.0.20250516",
+    "types-tree-sitter-languages>=1.10.0.20250530",
+    "types-xmltodict>=0.14.0.20241009",
 ]
 
 [tool.hatch.build]
{judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/api/api.py
@@ -1,4 +1,4 @@
-from typing import Literal, List, Dict, Any
+from typing import Literal, List, Dict, Any, Union
 from requests import exceptions
 from judgeval.common.api.constants import (
     JUDGMENT_TRACES_FETCH_API_URL,
@@ -25,6 +25,8 @@ from judgeval.common.api.constants import (
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
+    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -48,9 +50,12 @@ from judgeval.common.api.constants import (
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
+    CheckExampleKeysPayload,
 )
 from judgeval.utils.requests import requests
 
+import orjson
+
 
 class JudgmentAPIException(exceptions.HTTPError):
     """
@@ -65,7 +70,7 @@ class JudgmentAPIException(exceptions.HTTPError):
         self.request = request
 
     @property
-    def status_code(self) -> int | None:
+    def status_code(self) -> Union[int, None]:
         """Get the HTTP status code from the response."""
         return self.response.status_code if self.response else None
 
@@ -114,8 +119,15 @@ class JudgmentApiClient:
         try:
             r.raise_for_status()
         except exceptions.HTTPError as e:
+            try:
+                detail = r.json().get("detail", "")
+            except Exception:
+                detail = r.text
+
             raise JudgmentAPIException(
-                f"HTTP {r.status_code}: {r.reason}", response=r, request=e.request
+                f"HTTP {r.status_code}: {r.reason}, {detail}",
+                response=r,
+                request=e.request,
             )
 
         return r.json()
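With the shared request helper now folding the server's `detail` field into the exception message, callers see the actual failure reason instead of a bare status line. A minimal sketch of catching it (hedged: the constructor arguments below are illustrative, not the package's documented setup; `pull_dataset` is used only as an example endpoint):

```python
from judgeval.common.api.api import JudgmentApiClient, JudgmentAPIException

# Illustrative construction; the real constructor signature is not shown in this diff.
client = JudgmentApiClient(api_key="...", organization_id="...")

try:
    client.pull_dataset("my-dataset", "my-project")
except JudgmentAPIException as e:
    # status_code is None when no response object was attached to the exception.
    print(f"Request failed ({e.status_code}): {e}")
```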
@@ -218,6 +230,14 @@
         }
         return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
 
+    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
+        payload: CheckExampleKeysPayload = {
+            "keys": keys,
+            "eval_name": eval_name,
+            "project_name": project_name,
+        }
+        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
+
     def save_scorer(self, name: str, prompt: str, options: dict):
         payload: ScorerSavePayload = {
             "name": name,
@@ -279,7 +299,7 @@
         project_name: str,
         examples: List[Dict[str, Any]],
         traces: List[Dict[str, Any]],
-        overwrite: bool,
+        overwrite: bool = False,
     ):
         payload: DatasetPushPayload = {
             "dataset_alias": dataset_alias,
@@ -302,6 +322,18 @@
             "POST", JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL, payload
         )
 
+    def append_traces(
+        self, dataset_alias: str, project_name: str, traces: List[Dict[str, Any]]
+    ):
+        payload: DatasetAppendPayload = {
+            "dataset_alias": dataset_alias,
+            "project_name": project_name,
+            "traces": traces,
+        }
+        return self._do_request(
+            "POST", JUDGMENT_DATASETS_APPEND_TRACES_API_URL, payload
+        )
+
     def pull_dataset(self, dataset_alias: str, project_name: str):
         payload: DatasetPullPayload = {
             "dataset_alias": dataset_alias,
@@ -347,6 +379,5 @@
         except Exception as e:
             return f"<Unserializable object of type {type(obj).__name__}: {e}>"
 
-        import json
-
-        return json.dumps(data, default=fallback_encoder)
+        # orjson returns bytes, so we need to decode to str
+        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
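`orjson.dumps` serializes to `bytes` and invokes its `default=` hook for any object it cannot encode natively, so the existing fallback encoder carries over unchanged. A self-contained sketch of the pattern:

```python
import orjson


class Unserializable:
    pass


def fallback_encoder(obj):
    # Mirrors the fallback in api.py: stringify anything orjson can't handle.
    try:
        return str(obj)
    except Exception as e:
        return f"<Unserializable object of type {type(obj).__name__}: {e}>"


data = {"ok": 1, "weird": Unserializable()}
# orjson returns bytes; decode for APIs that expect str.
print(orjson.dumps(data, default=fallback_encoder).decode("utf-8"))
```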
{judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/api/constants.py
@@ -51,6 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
 JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
+JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
 
 
 # Evaluation API Payloads
@@ -90,9 +91,16 @@ class EvalRunNameExistsPayload(TypedDict):
     judgment_api_key: str
 
 
+class CheckExampleKeysPayload(TypedDict):
+    keys: List[str]
+    eval_name: str
+    project_name: str
+
+
 # Datasets API
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -134,7 +142,7 @@ class DatasetStatsPayload(TypedDict):
 
 
 # Projects API
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 
 
{judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/storage/s3_storage.py
@@ -1,6 +1,6 @@
 import os
-import json
 import boto3
+import orjson
 from typing import Optional
 from datetime import datetime, UTC
 from botocore.exceptions import ClientError
@@ -85,8 +85,7 @@
         timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
 
-        # Convert trace data to JSON string
-        trace_json = json.dumps(trace_data)
+        trace_json = orjson.dumps(trace_data).decode("utf-8")
 
         self.s3_client.put_object(
             Bucket=self.bucket_name,
{judgeval-0.1.0 → judgeval-0.3.0}/src/judgeval/common/tracer/core.py
@@ -32,6 +32,7 @@ from typing import (
 )
 import types
 
+
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
@@ -45,6 +46,7 @@ from openai.types.chat import ParsedChatCompletion
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
+from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
@@ -67,6 +69,8 @@ ApiClient: TypeAlias = Union[
     AsyncTogether,
     genai.Client,
     genai.client.AsyncClient,
+    Groq,
+    AsyncGroq,
 ]
 SpanType: TypeAlias = str
 
@@ -79,7 +83,7 @@ class TraceClient:
         tracer: Tracer,
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str | None = None,
+        project_name: Union[str, None] = None,
         enable_monitoring: bool = True,
         enable_evaluations: bool = True,
         parent_trace_id: Optional[str] = None,
@@ -850,9 +854,9 @@
 
     def __init__(
        self,
-        api_key: str | None = os.getenv("JUDGMENT_API_KEY"),
-        organization_id: str | None = os.getenv("JUDGMENT_ORG_ID"),
-        project_name: str | None = None,
+        api_key: Union[str, None] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Union[str, None] = os.getenv("JUDGMENT_ORG_ID"),
+        project_name: Union[str, None] = None,
         deep_tracing: bool = False,  # Deep tracing is disabled by default
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower()
         == "true",
@@ -905,8 +909,8 @@
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
-        self.span_id_to_previous_span_id: Dict[str, str | None] = {}
-        self.trace_id_to_previous_trace: Dict[str, TraceClient | None] = {}
+        self.span_id_to_previous_span_id: Dict[str, Union[str, None]] = {}
+        self.trace_id_to_previous_trace: Dict[str, Union[TraceClient, None]] = {}
         self.current_span_id: Optional[str] = None
         self.current_trace: Optional[TraceClient] = None
         self.trace_across_async_contexts: bool = trace_across_async_contexts
@@ -958,7 +962,9 @@
         self.enable_monitoring = False
         self.enable_evaluations = False
 
-    def set_current_span(self, span_id: str) -> Optional[contextvars.Token[str | None]]:
+    def set_current_span(
+        self, span_id: str
+    ) -> Optional[contextvars.Token[Union[str, None]]]:
         self.span_id_to_previous_span_id[span_id] = self.current_span_id
         self.current_span_id = span_id
         Tracer.current_span_id = span_id
@@ -981,7 +987,7 @@
 
     def reset_current_span(
         self,
-        token: Optional[contextvars.Token[str | None]] = None,
+        token: Optional[contextvars.Token[Union[str, None]]] = None,
         span_id: Optional[str] = None,
     ):
         try:
@@ -997,7 +1003,7 @@
 
     def set_current_trace(
         self, trace: TraceClient
-    ) -> Optional[contextvars.Token[TraceClient | None]]:
+    ) -> Optional[contextvars.Token[Union[TraceClient, None]]]:
         """
         Set the current trace context in contextvars
         """
@@ -1030,7 +1036,7 @@
 
     def reset_current_trace(
         self,
-        token: Optional[contextvars.Token[TraceClient | None]] = None,
+        token: Optional[contextvars.Token[Union[TraceClient, None]]] = None,
         trace_id: Optional[str] = None,
     ):
         try:
@@ -1046,7 +1052,7 @@
 
     @contextmanager
     def trace(
-        self, name: str, project_name: str | None = None
+        self, name: str, project_name: Union[str, None] = None
     ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
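The `str | None` annotations are rewritten to `Union[str, None]` throughout this file, which keeps the annotations evaluable on Python versions before 3.10 (PEP 604 union syntax) while leaving behavior untouched. Usage of the `trace` context manager is unchanged; a minimal sketch, assuming `Tracer` is importable from the path below and credentials are supplied via `JUDGMENT_API_KEY` / `JUDGMENT_ORG_ID`:

```python
from judgeval.common.tracer import Tracer  # assumed import path for this module

tracer = Tracer(project_name="my-project")  # api_key/organization_id default to env vars

# `trace` yields a TraceClient bound to a fresh trace_id for the block's duration.
with tracer.trace("checkout-flow") as trace_client:
    ...  # run the agent code you want traced
```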
@@ -1692,25 +1698,31 @@ def wrap(
         return wrapper
 
     if isinstance(client, (OpenAI)):
-        client.chat.completions.create = wrapped(original_create)
-        client.responses.create = wrapped(original_responses_create)
-        client.beta.chat.completions.parse = wrapped(original_beta_parse)
+        setattr(client.chat.completions, "create", wrapped(original_create))
+        setattr(client.responses, "create", wrapped(original_responses_create))
+        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
     elif isinstance(client, (AsyncOpenAI)):
-        client.chat.completions.create = wrapped_async(original_create)
-        client.responses.create = wrapped_async(original_responses_create)
-        client.beta.chat.completions.parse = wrapped_async(original_beta_parse)
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
+        setattr(client.responses, "create", wrapped_async(original_responses_create))
+        setattr(
+            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+        )
     elif isinstance(client, (Together)):
-        client.chat.completions.create = wrapped(original_create)
+        setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (AsyncTogether)):
-        client.chat.completions.create = wrapped_async(original_create)
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
     elif isinstance(client, (Anthropic)):
-        client.messages.create = wrapped(original_create)
+        setattr(client.messages, "create", wrapped(original_create))
    elif isinstance(client, (AsyncAnthropic)):
-        client.messages.create = wrapped_async(original_create)
+        setattr(client.messages, "create", wrapped_async(original_create))
     elif isinstance(client, (genai.Client)):
-        client.models.generate_content = wrapped(original_create)
+        setattr(client.models, "generate_content", wrapped(original_create))
     elif isinstance(client, (genai.client.AsyncClient)):
-        client.models.generate_content = wrapped_async(original_create)
+        setattr(client.models, "generate_content", wrapped_async(original_create))
+    elif isinstance(client, (Groq)):
+        setattr(client.chat.completions, "create", wrapped(original_create))
+    elif isinstance(client, (AsyncGroq)):
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
@@ -1745,6 +1757,8 @@ def _get_client_config(
             None,
             client.beta.chat.completions.parse,
         )
+    elif isinstance(client, (Groq, AsyncGroq)):
+        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Together, AsyncTogether)):
         return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
@@ -1783,9 +1797,17 @@ def _format_output_data(
     if isinstance(client, (OpenAI, AsyncOpenAI)):
         if isinstance(response, ChatCompletion):
             model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens = response.usage.completion_tokens
-            cache_read_input_tokens = response.usage.prompt_tokens_details.cached_tokens
+            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+            completion_tokens = (
+                response.usage.completion_tokens if response.usage else 0
+            )
+            cache_read_input_tokens = (
+                response.usage.prompt_tokens_details.cached_tokens
+                if response.usage
+                and response.usage.prompt_tokens_details
+                and response.usage.prompt_tokens_details.cached_tokens
+                else 0
+            )
 
             if isinstance(response, ParsedChatCompletion):
                 message_content = response.choices[0].message.parsed
@@ -1793,10 +1815,19 @@
                 message_content = response.choices[0].message.content
         elif isinstance(response, Response):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens = response.usage.input_tokens_details.cached_tokens
-            message_content = "".join(seg.text for seg in response.output[0].content)
+            prompt_tokens = response.usage.input_tokens if response.usage else 0
+            completion_tokens = response.usage.output_tokens if response.usage else 0
+            cache_read_input_tokens = (
+                response.usage.input_tokens_details.cached_tokens
+                if response.usage and response.usage.input_tokens_details
+                else 0
+            )
+            if hasattr(response.output[0], "content"):
+                message_content = "".join(
+                    seg.text
+                    for seg in response.output[0].content
+                    if hasattr(seg, "text")
+                )
 
     # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
     elif isinstance(client, (Together, AsyncTogether)):
@@ -1821,6 +1852,11 @@
         cache_read_input_tokens = response.usage.cache_read_input_tokens
         cache_creation_input_tokens = response.usage.cache_creation_input_tokens
         message_content = response.content[0].text
+    elif isinstance(client, (Groq, AsyncGroq)):
+        model_name = "groq/" + response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     else:
         judgeval_logger.warning(f"Unsupported client type: {type(client)}")
         return None, None
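The new guards reflect that `usage` is `Optional` on OpenAI response types (it can be absent, for example on streamed responses unless usage reporting is requested), so the old unconditional attribute access could raise `AttributeError`. The pattern, reduced to a self-contained sketch with stand-in types:

```python
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class Usage:
    prompt_tokens: int
    completion_tokens: int


@dataclass
class FakeResponse:  # stand-in for an SDK response object
    usage: Optional[Usage]


def token_counts(response: FakeResponse) -> Tuple[int, int]:
    # Fall back to 0 instead of raising when usage is missing.
    prompt = response.usage.prompt_tokens if response.usage else 0
    completion = response.usage.completion_tokens if response.usage else 0
    return prompt, completion


assert token_counts(FakeResponse(usage=None)) == (0, 0)
assert token_counts(FakeResponse(usage=Usage(10, 5))) == (10, 5)
```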