judgeval 0.0.43.tar.gz → 0.0.46.tar.gz

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
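To inspect the change set beyond this rendered view, the two sdists can be fetched from the registry and diffed locally. A minimal sketch, assuming `pip`, `tar`, and GNU `diff` on a POSIX shell (the `/tmp` paths are illustrative):

```bash
# Download both source distributions without dependencies (sdists only)
pip download judgeval==0.0.43 --no-deps --no-binary :all: -d /tmp/judgeval-old
pip download judgeval==0.0.46 --no-deps --no-binary :all: -d /tmp/judgeval-new

# Unpack the tarballs and produce a recursive unified diff of the two trees
tar -xzf /tmp/judgeval-old/judgeval-0.0.43.tar.gz -C /tmp
tar -xzf /tmp/judgeval-new/judgeval-0.0.46.tar.gz -C /tmp
diff -ruN /tmp/judgeval-0.0.43 /tmp/judgeval-0.0.46 > judgeval-0.0.43-to-0.0.46.diff
```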
Files changed (194)
  1. judgeval-0.0.46/.github/pull_request_template.md +13 -0
  2. judgeval-0.0.46/.github/workflows/lint.yaml +37 -0
  3. {judgeval-0.0.43 → judgeval-0.0.46}/.github/workflows/release.yaml +3 -1
  4. {judgeval-0.0.43 → judgeval-0.0.46}/.gitignore +1 -0
  5. judgeval-0.0.46/.pre-commit-config.yaml +21 -0
  6. {judgeval-0.0.43 → judgeval-0.0.46}/PKG-INFO +79 -135
  7. {judgeval-0.0.43 → judgeval-0.0.46}/README.md +78 -134
  8. judgeval-0.0.46/assets/agent.gif +0 -0
  9. judgeval-0.0.46/assets/data.gif +0 -0
  10. judgeval-0.0.46/assets/document.gif +0 -0
  11. judgeval-0.0.46/assets/error_analysis_dashboard.png +0 -0
  12. judgeval-0.0.46/assets/product_shot.png +0 -0
  13. judgeval-0.0.46/assets/trace.gif +0 -0
  14. judgeval-0.0.46/assets/trace_demo.png +0 -0
  15. judgeval-0.0.46/assets/trace_screenshot.png +0 -0
  16. {judgeval-0.0.43 → judgeval-0.0.46}/pyproject.toml +5 -1
  17. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/__init__.py +5 -4
  18. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/clients.py +6 -6
  19. judgeval-0.0.46/src/judgeval/common/__init__.py +13 -0
  20. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/common/exceptions.py +2 -3
  21. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/common/logger.py +74 -49
  22. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/common/s3_storage.py +30 -23
  23. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/common/tracer.py +1302 -984
  24. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/common/utils.py +416 -244
  25. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/constants.py +73 -61
  26. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/__init__.py +1 -1
  27. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/custom_example.py +3 -2
  28. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/datasets/dataset.py +80 -54
  29. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/datasets/eval_dataset_client.py +131 -181
  30. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/example.py +67 -43
  31. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/result.py +11 -9
  32. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/scorer_data.py +4 -2
  33. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/tool.py +25 -16
  34. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/trace.py +57 -29
  35. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/trace_run.py +5 -11
  36. judgeval-0.0.46/src/judgeval/evaluation_run.py +84 -0
  37. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/integrations/langgraph.py +546 -184
  38. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/judges/base_judge.py +1 -2
  39. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/judges/litellm_judge.py +33 -11
  40. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/judges/mixture_of_judges.py +128 -78
  41. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/judges/together_judge.py +22 -9
  42. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/judges/utils.py +14 -5
  43. judgeval-0.0.46/src/judgeval/judgment_client.py +565 -0
  44. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/rules.py +169 -142
  45. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/run_evaluation.py +462 -305
  46. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/api_scorer.py +20 -11
  47. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/exceptions.py +1 -0
  48. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorer.py +77 -58
  49. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  50. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  51. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  52. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  53. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  54. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  55. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  56. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  57. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  58. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  59. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  60. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  61. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  62. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  63. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  64. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  65. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  66. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  67. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  68. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/prompt_scorer.py +48 -37
  69. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/score.py +86 -53
  70. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/utils.py +11 -7
  71. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/tracer/__init__.py +1 -1
  72. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/utils/alerts.py +23 -12
  73. judgeval-0.0.43/src/judgeval/utils/data_utils.py → judgeval-0.0.46/src/judgeval/utils/file_utils.py +5 -9
  74. judgeval-0.0.46/src/judgeval/utils/requests.py +29 -0
  75. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/version_check.py +5 -2
  76. {judgeval-0.0.43 → judgeval-0.0.46}/update_version.py +1 -1
  77. {judgeval-0.0.43 → judgeval-0.0.46}/uv.lock +77 -0
  78. judgeval-0.0.43/.github/pull_request_template.md +0 -31
  79. judgeval-0.0.43/assets/trace_demo.png +0 -0
  80. judgeval-0.0.43/docs/README.md +0 -32
  81. judgeval-0.0.43/docs/alerts/notifications.mdx +0 -283
  82. judgeval-0.0.43/docs/alerts/platform_notifications.mdx +0 -74
  83. judgeval-0.0.43/docs/alerts/rules.mdx +0 -160
  84. judgeval-0.0.43/docs/api_reference/judgment_client.mdx +0 -147
  85. judgeval-0.0.43/docs/api_reference/trace.mdx +0 -140
  86. judgeval-0.0.43/docs/changelog/2025-04-21.mdx +0 -19
  87. judgeval-0.0.43/docs/clustering/clustering.mdx +0 -72
  88. judgeval-0.0.43/docs/compliance/certifications.mdx +0 -47
  89. judgeval-0.0.43/docs/development.mdx +0 -106
  90. judgeval-0.0.43/docs/essentials/code.mdx +0 -37
  91. judgeval-0.0.43/docs/essentials/images.mdx +0 -59
  92. judgeval-0.0.43/docs/essentials/markdown.mdx +0 -88
  93. judgeval-0.0.43/docs/essentials/navigation.mdx +0 -66
  94. judgeval-0.0.43/docs/essentials/reusable-snippets.mdx +0 -110
  95. judgeval-0.0.43/docs/essentials/settings.mdx +0 -318
  96. judgeval-0.0.43/docs/evaluation/data_datasets.mdx +0 -356
  97. judgeval-0.0.43/docs/evaluation/data_examples.mdx +0 -229
  98. judgeval-0.0.43/docs/evaluation/data_sequences.mdx +0 -80
  99. judgeval-0.0.43/docs/evaluation/experiment_comparisons.mdx +0 -143
  100. judgeval-0.0.43/docs/evaluation/introduction.mdx +0 -224
  101. judgeval-0.0.43/docs/evaluation/judges.mdx +0 -209
  102. judgeval-0.0.43/docs/evaluation/scorers/agent/derailment.mdx +0 -54
  103. judgeval-0.0.43/docs/evaluation/scorers/classifier_scorer.mdx +0 -103
  104. judgeval-0.0.43/docs/evaluation/scorers/custom_scorers.mdx +0 -365
  105. judgeval-0.0.43/docs/evaluation/scorers/default/answer_correctness.mdx +0 -86
  106. judgeval-0.0.43/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -85
  107. judgeval-0.0.43/docs/evaluation/scorers/default/comparison.mdx +0 -102
  108. judgeval-0.0.43/docs/evaluation/scorers/default/contextual_precision.mdx +0 -106
  109. judgeval-0.0.43/docs/evaluation/scorers/default/contextual_recall.mdx +0 -104
  110. judgeval-0.0.43/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -90
  111. judgeval-0.0.43/docs/evaluation/scorers/default/execution_order.mdx +0 -72
  112. judgeval-0.0.43/docs/evaluation/scorers/default/faithfulness.mdx +0 -97
  113. judgeval-0.0.43/docs/evaluation/scorers/default/groundedness.mdx +0 -65
  114. judgeval-0.0.43/docs/evaluation/scorers/default/json_correctness.mdx +0 -54
  115. judgeval-0.0.43/docs/evaluation/scorers/default/summarization.mdx +0 -62
  116. judgeval-0.0.43/docs/evaluation/scorers/introduction.mdx +0 -111
  117. judgeval-0.0.43/docs/evaluation/unit_testing.mdx +0 -93
  118. judgeval-0.0.43/docs/favicon.svg +0 -49
  119. judgeval-0.0.43/docs/getting_started.mdx +0 -374
  120. judgeval-0.0.43/docs/images/annotation_queue_ui.png +0 -0
  121. judgeval-0.0.43/docs/images/basic_trace_example.png +0 -0
  122. judgeval-0.0.43/docs/images/checks-passed.png +0 -0
  123. judgeval-0.0.43/docs/images/cluster.png +0 -0
  124. judgeval-0.0.43/docs/images/cluster_button.png +0 -0
  125. judgeval-0.0.43/docs/images/create_aggressive_scorer.png +0 -0
  126. judgeval-0.0.43/docs/images/create_scorer.png +0 -0
  127. judgeval-0.0.43/docs/images/dashboard_annotation_queue_button.png +0 -0
  128. judgeval-0.0.43/docs/images/evaluation_diagram.png +0 -0
  129. judgeval-0.0.43/docs/images/experiment-comparison-page-2.png +0 -0
  130. judgeval-0.0.43/docs/images/experiment-page-comparison.png +0 -0
  131. judgeval-0.0.43/docs/images/experiment-popout-comparison.png +0 -0
  132. judgeval-0.0.43/docs/images/experiments-page-comparison-2.png +0 -0
  133. judgeval-0.0.43/docs/images/experiments-page-comparison.png +0 -0
  134. judgeval-0.0.43/docs/images/export-dataset.png +0 -0
  135. judgeval-0.0.43/docs/images/hero-dark.svg +0 -161
  136. judgeval-0.0.43/docs/images/hero-light.svg +0 -155
  137. judgeval-0.0.43/docs/images/notifications_page.png +0 -0
  138. judgeval-0.0.43/docs/images/online_eval_fault.png +0 -0
  139. judgeval-0.0.43/docs/images/reports_modal.png +0 -0
  140. judgeval-0.0.43/docs/images/synth_data_button.png +0 -0
  141. judgeval-0.0.43/docs/images/synth_data_window.png +0 -0
  142. judgeval-0.0.43/docs/images/trace_ss.png +0 -0
  143. judgeval-0.0.43/docs/integration/langgraph.mdx +0 -207
  144. judgeval-0.0.43/docs/introduction.mdx +0 -19
  145. judgeval-0.0.43/docs/judgment_cli/installation.mdx +0 -91
  146. judgeval-0.0.43/docs/judgment_cli/self-hosting.mdx +0 -190
  147. judgeval-0.0.43/docs/judgment_cli/supabase-org-id.png +0 -0
  148. judgeval-0.0.43/docs/logo/dark.svg +0 -55
  149. judgeval-0.0.43/docs/logo/light.svg +0 -51
  150. judgeval-0.0.43/docs/mint.json +0 -168
  151. judgeval-0.0.43/docs/monitoring/annotations.mdx +0 -41
  152. judgeval-0.0.43/docs/monitoring/introduction.mdx +0 -36
  153. judgeval-0.0.43/docs/monitoring/production_insights.mdx +0 -0
  154. judgeval-0.0.43/docs/monitoring/tracing.mdx +0 -490
  155. judgeval-0.0.43/docs/monitoring/tracing_s3.mdx +0 -60
  156. judgeval-0.0.43/docs/notebooks/create_dataset.ipynb +0 -250
  157. judgeval-0.0.43/docs/notebooks/create_scorer.ipynb +0 -57
  158. judgeval-0.0.43/docs/notebooks/demo.ipynb +0 -389
  159. judgeval-0.0.43/docs/notebooks/prompt_scorer.ipynb +0 -165
  160. judgeval-0.0.43/docs/notebooks/quickstart.ipynb +0 -252
  161. judgeval-0.0.43/docs/optimization/osiris_agent.mdx +0 -598
  162. judgeval-0.0.43/docs/quickstart.mdx +0 -89
  163. judgeval-0.0.43/docs/self_hosting/get_started.mdx +0 -73
  164. judgeval-0.0.43/docs/snippets/snippet-intro.mdx +0 -4
  165. judgeval-0.0.43/docs/synthetic_data/synthetic_data.mdx +0 -66
  166. judgeval-0.0.43/src/judgeval/common/__init__.py +0 -8
  167. judgeval-0.0.43/src/judgeval/evaluation_run.py +0 -144
  168. judgeval-0.0.43/src/judgeval/judgment_client.py +0 -577
  169. {judgeval-0.0.43 → judgeval-0.0.46}/.github/workflows/blocked-pr.yaml +0 -0
  170. {judgeval-0.0.43 → judgeval-0.0.46}/.github/workflows/ci.yaml +0 -0
  171. {judgeval-0.0.43 → judgeval-0.0.46}/.github/workflows/merge-branch-check.yaml +0 -0
  172. {judgeval-0.0.43 → judgeval-0.0.46}/.github/workflows/validate-branch.yaml +0 -0
  173. {judgeval-0.0.43 → judgeval-0.0.46}/LICENSE.md +0 -0
  174. {judgeval-0.0.43 → judgeval-0.0.46}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  175. {judgeval-0.0.43 → judgeval-0.0.46}/assets/dataset_clustering_screenshot.png +0 -0
  176. {judgeval-0.0.43 → judgeval-0.0.46}/assets/dataset_clustering_screenshot_dm.png +0 -0
  177. {judgeval-0.0.43 → judgeval-0.0.46}/assets/datasets_preview_screenshot.png +0 -0
  178. {judgeval-0.0.43 → judgeval-0.0.46}/assets/experiments_dashboard_screenshot.png +0 -0
  179. {judgeval-0.0.43 → judgeval-0.0.46}/assets/experiments_page.png +0 -0
  180. {judgeval-0.0.43 → judgeval-0.0.46}/assets/experiments_pagev2.png +0 -0
  181. {judgeval-0.0.43 → judgeval-0.0.46}/assets/logo-dark.svg +0 -0
  182. {judgeval-0.0.43 → judgeval-0.0.46}/assets/logo-light.svg +0 -0
  183. {judgeval-0.0.43 → judgeval-0.0.46}/assets/monitoring_screenshot.png +0 -0
  184. {judgeval-0.0.43 → judgeval-0.0.46}/assets/new_darkmode.svg +0 -0
  185. {judgeval-0.0.43 → judgeval-0.0.46}/assets/new_lightmode.svg +0 -0
  186. /judgeval-0.0.43/assets/trace_screenshot.png → /judgeval-0.0.46/assets/trace_screenshot_old.png +0 -0
  187. {judgeval-0.0.43 → judgeval-0.0.46}/pytest.ini +0 -0
  188. {judgeval-0.0.43 → judgeval-0.0.46}/src/.coveragerc +0 -0
  189. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/data/datasets/__init__.py +0 -0
  190. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/judges/__init__.py +0 -0
  191. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/__init__.py +0 -0
  192. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  193. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  194. {judgeval-0.0.43 → judgeval-0.0.46}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
@@ -0,0 +1,13 @@
+ ## 📝 Summary
+
+ <!-- Provide a brief description of the changes introduced by this PR -->
+
+ ## 🎥 Demo of Changes
+
+ <!-- Add a short 1-3 minute video describing/demoing the changes -->
+
+ ## ✅ Checklist
+
+ - [ ] Tagged Linear ticket in PR title. Ie. PR Title (JUD-XXXX)
+ - [ ] Video demo of changes
+ - [ ] Reviewers assigned
@@ -0,0 +1,37 @@
+ name: Lint
+
+ on:
+   pull_request:
+     branches: [ main, staging ]
+
+ jobs:
+   lint:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.11'
+
+       - name: Install ruff
+         uses: astral-sh/ruff-action@v3
+         with:
+           args: "--version"
+
+       - name: Install mypy and dependencies
+         run: |
+           pip install mypy types-requests types-PyYAML
+
+       - name: Run ruff formatter
+         if: always()
+         run: ruff format --check .
+
+       - name: Run ruff linter
+         if: always()
+         run: ruff check .
+
+       - name: Run mypy
+         if: always()
+         run: mypy --explicit-package-bases --ignore-missing-imports .
@@ -4,6 +4,7 @@ on:
  push:
    branches:
      - main
+     - trigger_release

jobs:
  release:
@@ -38,7 +39,8 @@ jobs:
  IFS='.' read -r major minor patch <<< "$latest_version"

  # Bump patch version
- patch=$((patch + 1))
+ # patch=$((patch + 1))
+ patch=46
  new_version="$major.$minor.$patch"

  echo "New version: $new_version"
@@ -110,3 +110,4 @@ test-results.xml

  # Logs
  ./logs
+ demo
@@ -0,0 +1,21 @@
+ repos:
+   - repo: https://github.com/astral-sh/uv-pre-commit
+     rev: 0.7.14
+     hooks:
+       - id: uv-lock
+
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.12.0
+     hooks:
+       - id: ruff
+         name: ruff (linter)
+         args: [--fix]
+       - id: ruff-format
+         name: ruff (formatter)
+
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.16.1
+     hooks:
+       - id: mypy
+         args: [--explicit-package-bases, --ignore-missing-imports]
+         additional_dependencies: [types-requests, types-PyYAML]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.43
+ Version: 0.0.46
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -34,57 +34,60 @@ Description-Content-Type: text/markdown

  <br>
  <div style="font-size: 1.5em;">
- Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+ Enable self-learning agents with traces, evals, and environment data.
  </div>

- ## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
+ ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started)

- [Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
+ [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)

- We're hiring! Join us in our mission to unleash optimized agents.
+ We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for continuous improvement.

  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
- [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)
+ [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/tGVFf8UBUY)

- <img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
+ <img src="assets/product_shot.png" alt="Judgment Platform" width="800" />

  </div>

-
- Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
-
- Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
- > 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
-
- Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
+ Judgeval offers **open-source tooling** for tracing, evaluating, and monitoring LLM agents. **Provides comprehensive data from agent-environment interactions** for continuous learning and self-improvement—**enabling the future of autonomous agents**.
+
+ ## 🎬 See Judgeval in Action
+
+ **[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+
+ <table style="width: 100%; max-width: 800px; table-layout: fixed;">
+ <tr>
+ <td align="center" style="padding: 8px; width: 50%;">
+ <img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
+ <br><strong>🤖 Agents Running</strong>
+ </td>
+ <td align="center" style="padding: 8px; width: 50%;">
+ <img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
+ <br><strong>📊 Real-time Tracing</strong>
+ </td>
+ </tr>
+ <tr>
+ <td align="center" style="padding: 8px; width: 50%;">
+ <img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
+ <br><strong>✅ Agents Completed Running</strong>
+ </td>
+ <td align="center" style="padding: 8px; width: 50%;">
+ <img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
+ <br><strong>📤 Exporting Agent Environment Data</strong>
+ </td>
+ </tr>
+
+ </table>

  ## 📋 Table of Contents
- - [✨ Features](#-features)
  - [🛠️ Installation](#️-installation)
  - [🏁 Quickstarts](#-quickstarts)
- - [🛰️ Tracing](#️-tracing)
- - [📝 Offline Evaluations](#-offline-evaluations)
- - [📡 Online Evaluations](#-online-evaluations)
+ - [ Features](#-features)
  - [🏢 Self-Hosting](#-self-hosting)
- - [Key Features](#key-features)
- - [Getting Started](#getting-started)
  - [📚 Cookbooks](#-cookbooks)
  - [💻 Development with Cursor](#-development-with-cursor)
- - [⭐ Star Us on GitHub](#-star-us-on-github)
- - [❤️ Contributors](#️-contributors)
-
- <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
-
- ## ✨ Features
-
- | | |
- |:---|:---:|
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
- | <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
- | <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
- | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

  ## 🛠️ Installation

@@ -94,7 +97,7 @@ Get started with Judgeval by installing our SDK using pip:
  pip install judgeval
  ```

- Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+ Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).

  ```bash
  export JUDGMENT_API_KEY=...
@@ -107,106 +110,50 @@ export JUDGMENT_ORG_ID=...

  ### 🛰️ Tracing

- Create a file named `traces.py` with the following code:
+ Create a file named `agent.py` with the following code:

  ```python
- from judgeval.common.tracer import Tracer, wrap
+ from judgeval.tracer import Tracer, wrap
  from openai import OpenAI

- client = wrap(OpenAI())
+ client = wrap(OpenAI()) # tracks all LLM calls
  judgment = Tracer(project_name="my_project")

  @judgment.observe(span_type="tool")
- def my_tool():
- return "What's the capital of the U.S.?"
+ def format_question(question: str) -> str:
+ # dummy tool
+ return f"Question : {question}"

  @judgment.observe(span_type="function")
- def main():
- task_input = my_tool()
- res = client.chat.completions.create(
+ def run_agent(prompt: str) -> str:
+ task = format_question(prompt)
+ response = client.chat.completions.create(
  model="gpt-4.1",
- messages=[{"role": "user", "content": f"{task_input}"}]
+ messages=[{"role": "user", "content": task}]
  )
- return res.choices[0].message.content
-
- main()
+ return response.choices[0].message.content
+
+ run_agent("What is the capital of the United States?")
  ```
  You'll see your trace exported to the Judgment Platform:

  <p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>


- [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
+ [Click here](https://docs.judgmentlabs.ai/tracing/introduction) for a more detailed explanation.

- ### 📝 Offline Evaluations
-
- Create a file named `evaluate.py` with the following code:
-
- ```python evaluate.py
- from judgeval import JudgmentClient
- from judgeval.data import Example
- from judgeval.scorers import FaithfulnessScorer
-
- client = JudgmentClient()
-
- example = Example(
- input="What if these shoes don't fit?",
- actual_output="We offer a 30-day full refund at no extra cost.",
- retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
- )
-
- scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
- results = client.run_evaluation(
- examples=[example],
- scorers=[scorer],
- model="gpt-4.1",
- )
- print(results)
- ```
-
- [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-experiment) for a more detailed explanation.
-
- ### 📡 Online Evaluations
-
- Attach performance monitoring on traces to measure the quality of your systems in production.

- Using the same `traces.py` file we created earlier, modify `main` function:
-
- ```python
- from judgeval.common.tracer import Tracer, wrap
- from judgeval.scorers import AnswerRelevancyScorer
- from openai import OpenAI
-
- client = wrap(OpenAI())
- judgment = Tracer(project_name="my_project")
-
- @judgment.observe(span_type="tool")
- def my_tool():
- return "Hello world!"
-
- @judgment.observe(span_type="function")
- def main():
- task_input = my_tool()
- res = client.chat.completions.create(
- model="gpt-4.1",
- messages=[{"role": "user", "content": f"{task_input}"}]
- ).choices[0].message.content
-
- judgment.async_evaluate(
- scorers=[AnswerRelevancyScorer(threshold=0.5)],
- input=task_input,
- actual_output=res,
- model="gpt-4.1"
- )
- print("Online evaluation submitted.")
- return res
+ <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

- main()
- ```

- You should see an evaluation attached to your trace on the Judgment Platform.
+ ## Features

- [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
+ | | |
+ |:---|:---:|
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time. Export data per individual trace for detailed analysis.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+ | <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+ | <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+ | <h3>📊 Datasets</h3>Export comprehensive agent-environment interaction data or import external testcases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

  ## 🏢 Self-Hosting

@@ -224,14 +171,9 @@ Run Judgment on your own infrastructure: we provide comprehensive self-hosting c

  ## 📚 Cookbooks

- Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/taAufyhf).
+ Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).

- You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook). Here are some highlights:
-
- ### Sample Agents
-
- #### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
- A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
+ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).

  ## 💻 Development with Cursor
  When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
@@ -1243,10 +1185,10 @@ Judgeval is created and maintained by @Judgment Labs.

  | | |
  |:---|:---:|
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
  | <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
  | <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
- | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+ | <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
  | <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |

  ## 🛠️ Installation
@@ -1271,26 +1213,27 @@ Track your agent execution with full observability with just a few lines of code
  Create a file named `traces.py` with the following code:

  ```python
- from judgeval.common.tracer import Tracer, wrap
+ from judgeval.tracer import Tracer, wrap
  from openai import OpenAI

- client = wrap(OpenAI())
+ client = wrap(OpenAI()) # tracks all LLM calls
  judgment = Tracer(project_name="my_project")

  @judgment.observe(span_type="tool")
- def my_tool():
- return "What's the capital of the U.S.?"
+ def format_question(question: str) -> str:
+ # dummy tool
+ return f"Question : {question}"

  @judgment.observe(span_type="function")
- def main():
- task_input = my_tool()
- res = client.chat.completions.create(
+ def run_agent(prompt: str) -> str:
+ task = format_question(prompt)
+ response = client.chat.completions.create(
  model="gpt-4.1",
- messages=[{"role": "user", "content": f"{task_input}"}]
+ messages=[{"role": "user", "content": task}]
  )
- return res.choices[0].message.content
+ return response.choices[0].message.content

- main()
+ run_agent("What is the capital of the United States?")
  ```

  @Click here for a more detailed explanation.
@@ -1418,13 +1361,11 @@ There are many ways to contribute to Judgeval:
  @![Contributors](https://github.com/JudgmentLabs/judgeval/graphs/contributors)

  ````
-
  </details>

  ## ⭐ Star Us on GitHub

- If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
-
+ If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.

  ## ❤️ Contributors

@@ -1437,3 +1378,6 @@ There are many ways to contribute to Judgeval:
  <!-- Contributors collage -->
  [![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)

+ ---
+
+ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).