judgeval 0.0.40.tar.gz → 0.0.42.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. judgeval-0.0.42/.github/workflows/blocked-pr.yaml +19 -0
  2. judgeval-0.0.42/.github/workflows/ci.yaml +163 -0
  3. judgeval-0.0.40/.github/workflows/merge-to-main.yaml → judgeval-0.0.42/.github/workflows/merge-branch-check.yaml +2 -7
  4. judgeval-0.0.42/.github/workflows/validate-branch.yaml +9 -0
  5. {judgeval-0.0.40 → judgeval-0.0.42}/PKG-INFO +48 -50
  6. {judgeval-0.0.40 → judgeval-0.0.42}/README.md +46 -49
  7. judgeval-0.0.42/assets/trace_demo.png +0 -0
  8. {judgeval-0.0.40 → judgeval-0.0.42}/pyproject.toml +15 -2
  9. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/s3_storage.py +3 -1
  10. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/tracer.py +1079 -139
  11. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/utils.py +6 -2
  12. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/constants.py +5 -0
  13. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/datasets/dataset.py +12 -6
  14. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/datasets/eval_dataset_client.py +3 -1
  15. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/trace.py +7 -2
  16. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/integrations/langgraph.py +218 -34
  17. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judgment_client.py +9 -1
  18. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/rules.py +60 -50
  19. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/run_evaluation.py +53 -29
  20. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorer.py +4 -1
  21. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/prompt_scorer.py +3 -0
  22. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/utils/alerts.py +8 -0
  23. {judgeval-0.0.40 → judgeval-0.0.42}/update_version.py +1 -1
  24. judgeval-0.0.42/uv.lock +4032 -0
  25. judgeval-0.0.40/.github/workflows/ci-staging.yaml +0 -103
  26. judgeval-0.0.40/.github/workflows/ci.yaml +0 -103
  27. judgeval-0.0.40/Pipfile +0 -33
  28. judgeval-0.0.40/Pipfile.lock +0 -4329
  29. {judgeval-0.0.40 → judgeval-0.0.42}/.github/pull_request_template.md +0 -0
  30. {judgeval-0.0.40 → judgeval-0.0.42}/.github/workflows/release.yaml +0 -0
  31. {judgeval-0.0.40 → judgeval-0.0.42}/.gitignore +0 -0
  32. {judgeval-0.0.40 → judgeval-0.0.42}/LICENSE.md +0 -0
  33. {judgeval-0.0.40 → judgeval-0.0.42}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  34. {judgeval-0.0.40 → judgeval-0.0.42}/assets/dataset_clustering_screenshot.png +0 -0
  35. {judgeval-0.0.40 → judgeval-0.0.42}/assets/dataset_clustering_screenshot_dm.png +0 -0
  36. {judgeval-0.0.40 → judgeval-0.0.42}/assets/datasets_preview_screenshot.png +0 -0
  37. {judgeval-0.0.40 → judgeval-0.0.42}/assets/experiments_dashboard_screenshot.png +0 -0
  38. {judgeval-0.0.40 → judgeval-0.0.42}/assets/experiments_page.png +0 -0
  39. {judgeval-0.0.40 → judgeval-0.0.42}/assets/experiments_pagev2.png +0 -0
  40. {judgeval-0.0.40 → judgeval-0.0.42}/assets/logo-dark.svg +0 -0
  41. {judgeval-0.0.40 → judgeval-0.0.42}/assets/logo-light.svg +0 -0
  42. {judgeval-0.0.40 → judgeval-0.0.42}/assets/monitoring_screenshot.png +0 -0
  43. {judgeval-0.0.40 → judgeval-0.0.42}/assets/new_darkmode.svg +0 -0
  44. {judgeval-0.0.40 → judgeval-0.0.42}/assets/new_lightmode.svg +0 -0
  45. {judgeval-0.0.40 → judgeval-0.0.42}/assets/trace_screenshot.png +0 -0
  46. {judgeval-0.0.40 → judgeval-0.0.42}/docs/README.md +0 -0
  47. {judgeval-0.0.40 → judgeval-0.0.42}/docs/alerts/notifications.mdx +0 -0
  48. {judgeval-0.0.40 → judgeval-0.0.42}/docs/alerts/platform_notifications.mdx +0 -0
  49. {judgeval-0.0.40 → judgeval-0.0.42}/docs/alerts/rules.mdx +0 -0
  50. {judgeval-0.0.40 → judgeval-0.0.42}/docs/api_reference/judgment_client.mdx +0 -0
  51. {judgeval-0.0.40 → judgeval-0.0.42}/docs/api_reference/trace.mdx +0 -0
  52. {judgeval-0.0.40 → judgeval-0.0.42}/docs/changelog/2025-04-21.mdx +0 -0
  53. {judgeval-0.0.40 → judgeval-0.0.42}/docs/clustering/clustering.mdx +0 -0
  54. {judgeval-0.0.40 → judgeval-0.0.42}/docs/compliance/certifications.mdx +0 -0
  55. {judgeval-0.0.40 → judgeval-0.0.42}/docs/development.mdx +0 -0
  56. {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/code.mdx +0 -0
  57. {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/images.mdx +0 -0
  58. {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/markdown.mdx +0 -0
  59. {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/navigation.mdx +0 -0
  60. {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/reusable-snippets.mdx +0 -0
  61. {judgeval-0.0.40 → judgeval-0.0.42}/docs/essentials/settings.mdx +0 -0
  62. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/data_datasets.mdx +0 -0
  63. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/data_examples.mdx +0 -0
  64. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/data_sequences.mdx +0 -0
  65. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/experiment_comparisons.mdx +0 -0
  66. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/introduction.mdx +0 -0
  67. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/judges.mdx +0 -0
  68. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
  69. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  70. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  71. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
  72. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
  73. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/comparison.mdx +0 -0
  74. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
  75. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
  76. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
  77. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
  78. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
  79. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
  80. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
  81. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/default/summarization.mdx +0 -0
  82. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/scorers/introduction.mdx +0 -0
  83. {judgeval-0.0.40 → judgeval-0.0.42}/docs/evaluation/unit_testing.mdx +0 -0
  84. {judgeval-0.0.40 → judgeval-0.0.42}/docs/favicon.svg +0 -0
  85. {judgeval-0.0.40 → judgeval-0.0.42}/docs/getting_started.mdx +0 -0
  86. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/annotation_queue_ui.png +0 -0
  87. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/basic_trace_example.png +0 -0
  88. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/checks-passed.png +0 -0
  89. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/cluster.png +0 -0
  90. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/cluster_button.png +0 -0
  91. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/create_aggressive_scorer.png +0 -0
  92. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/create_scorer.png +0 -0
  93. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/dashboard_annotation_queue_button.png +0 -0
  94. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/evaluation_diagram.png +0 -0
  95. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiment-comparison-page-2.png +0 -0
  96. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiment-page-comparison.png +0 -0
  97. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiment-popout-comparison.png +0 -0
  98. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiments-page-comparison-2.png +0 -0
  99. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/experiments-page-comparison.png +0 -0
  100. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/export-dataset.png +0 -0
  101. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/hero-dark.svg +0 -0
  102. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/hero-light.svg +0 -0
  103. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/notifications_page.png +0 -0
  104. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/online_eval_fault.png +0 -0
  105. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/reports_modal.png +0 -0
  106. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/synth_data_button.png +0 -0
  107. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/synth_data_window.png +0 -0
  108. {judgeval-0.0.40 → judgeval-0.0.42}/docs/images/trace_ss.png +0 -0
  109. {judgeval-0.0.40 → judgeval-0.0.42}/docs/integration/langgraph.mdx +0 -0
  110. {judgeval-0.0.40 → judgeval-0.0.42}/docs/introduction.mdx +0 -0
  111. {judgeval-0.0.40 → judgeval-0.0.42}/docs/judgment_cli/installation.mdx +0 -0
  112. {judgeval-0.0.40 → judgeval-0.0.42}/docs/judgment_cli/self-hosting.mdx +0 -0
  113. {judgeval-0.0.40 → judgeval-0.0.42}/docs/judgment_cli/supabase-org-id.png +0 -0
  114. {judgeval-0.0.40 → judgeval-0.0.42}/docs/logo/dark.svg +0 -0
  115. {judgeval-0.0.40 → judgeval-0.0.42}/docs/logo/light.svg +0 -0
  116. {judgeval-0.0.40 → judgeval-0.0.42}/docs/mint.json +0 -0
  117. {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/annotations.mdx +0 -0
  118. {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/introduction.mdx +0 -0
  119. {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/production_insights.mdx +0 -0
  120. {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/tracing.mdx +0 -0
  121. {judgeval-0.0.40 → judgeval-0.0.42}/docs/monitoring/tracing_s3.mdx +0 -0
  122. {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/create_dataset.ipynb +0 -0
  123. {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/create_scorer.ipynb +0 -0
  124. {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/demo.ipynb +0 -0
  125. {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/prompt_scorer.ipynb +0 -0
  126. {judgeval-0.0.40 → judgeval-0.0.42}/docs/notebooks/quickstart.ipynb +0 -0
  127. {judgeval-0.0.40 → judgeval-0.0.42}/docs/optimization/osiris_agent.mdx +0 -0
  128. {judgeval-0.0.40 → judgeval-0.0.42}/docs/quickstart.mdx +0 -0
  129. {judgeval-0.0.40 → judgeval-0.0.42}/docs/self_hosting/get_started.mdx +0 -0
  130. {judgeval-0.0.40 → judgeval-0.0.42}/docs/snippets/snippet-intro.mdx +0 -0
  131. {judgeval-0.0.40 → judgeval-0.0.42}/docs/synthetic_data/synthetic_data.mdx +0 -0
  132. {judgeval-0.0.40 → judgeval-0.0.42}/pytest.ini +0 -0
  133. {judgeval-0.0.40 → judgeval-0.0.42}/src/.coveragerc +0 -0
  134. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/__init__.py +0 -0
  135. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/clients.py +0 -0
  136. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/__init__.py +0 -0
  137. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/exceptions.py +0 -0
  138. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/common/logger.py +0 -0
  139. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/__init__.py +0 -0
  140. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/custom_example.py +0 -0
  141. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/datasets/__init__.py +0 -0
  142. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/example.py +0 -0
  143. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/result.py +0 -0
  144. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/scorer_data.py +0 -0
  145. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/tool.py +0 -0
  146. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/data/trace_run.py +0 -0
  147. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/evaluation_run.py +0 -0
  148. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/__init__.py +0 -0
  149. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/base_judge.py +0 -0
  150. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/litellm_judge.py +0 -0
  151. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/mixture_of_judges.py +0 -0
  152. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/together_judge.py +0 -0
  153. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/judges/utils.py +0 -0
  154. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/__init__.py +0 -0
  155. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/api_scorer.py +0 -0
  156. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/exceptions.py +0 -0
  157. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  158. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  159. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  160. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  161. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -0
  162. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  163. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  164. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  165. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  166. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  167. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  168. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  169. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  170. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  171. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  172. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  173. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  174. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  175. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  176. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  177. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  178. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  179. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/score.py +0 -0
  180. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/scorers/utils.py +0 -0
  181. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/tracer/__init__.py +0 -0
  182. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/utils/data_utils.py +0 -0
  183. {judgeval-0.0.40 → judgeval-0.0.42}/src/judgeval/version_check.py +0 -0
--- /dev/null
+++ judgeval-0.0.42/.github/workflows/blocked-pr.yaml
@@ -0,0 +1,19 @@
+name: Check Blocked PR
+
+on:
+  pull_request:
+    types:
+      - opened
+      - labeled
+      - unlabeled
+      - synchronize
+
+jobs:
+  fail-for-blocked:
+    if: contains(github.event.pull_request.labels.*.name, 'Blocked')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fail if PR is blocked
+        run: |
+          echo "This PR is currently blocked. Please unblock it before merging."
+          exit 1
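The workflow above turns the `Blocked` label into a failing status check, re-evaluated on every labeled/unlabeled/synchronize event. As a usage sketch (assuming the GitHub CLI; PR number 123 is hypothetical):

```bash
# Adding the label fires a 'labeled' event; fail-for-blocked then runs and fails the check
gh pr edit 123 --add-label "Blocked"

# Removing it fires 'unlabeled'; the job's if-condition no longer matches, so it is skipped
gh pr edit 123 --remove-label "Blocked"
```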
--- /dev/null
+++ judgeval-0.0.42/.github/workflows/ci.yaml
@@ -0,0 +1,163 @@
+name: CI
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+permissions: read-all
+
+jobs:
+  validate-branch:
+    uses: ./.github/workflows/merge-branch-check.yaml
+
+  run-tests:
+    needs: [validate-branch]
+    if: needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped'
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        python-version:
+          - "3.11"
+    name: Unit Tests
+    runs-on: ${{ matrix.os }}
+    env:
+      PYTHONPATH: "."
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      JUDGMENT_DEV: true
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run tests
+        run: |
+          cd src
+          uv run pytest tests
+
+  run-e2e-tests-staging:
+    needs: [validate-branch]
+    if: "github.base_ref == 'staging' && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
+    name: Staging E2E Tests
+    runs-on: ubuntu-latest
+    env:
+      TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
+    steps:
+      - name: Wait for turn
+        uses: softprops/turnstyle@v2
+        with:
+          poll-interval-seconds: 10
+          same-branch-only: false
+          job-to-wait-for: "Staging E2E Tests"
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install judgeval dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Check if server is running
+        run: |
+          if ! curl -s https://staging.api.judgmentlabs.ai/health > /dev/null; then
+            echo "Staging Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
+            exit 1
+          else
+            echo "Staging server is running."
+          fi
+
+      - name: Run E2E tests
+        working-directory: src
+        run: |
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-stg-judgeval/api-keys/judgeval --query SecretString --output text)
+          export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+
+      - name: Upload coverage HTML report (staging)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-html-staging
+          path: src/htmlcov
+
+  run-e2e-tests-main:
+    needs: [validate-branch]
+    if: "github.base_ref == 'main' && !contains(github.actor, '[bot]') && needs.validate-branch.result == 'success'"
+    name: Production E2E Tests
+    runs-on: ubuntu-latest
+    env:
+      TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
+    steps:
+      - name: Wait for turn
+        uses: softprops/turnstyle@v2
+        with:
+          poll-interval-seconds: 10
+          same-branch-only: false
+          job-to-wait-for: "Production E2E Tests"
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install judgeval dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Check if server is running
+        run: |
+          if ! curl -s https://api.judgmentlabs.ai/health > /dev/null; then
+            echo "Production Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
+            exit 1
+          else
+            echo "Production server is running."
+          fi
+
+      - name: Run E2E tests
+        working-directory: src
+        run: |
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-judgeval/api-keys/judgeval --query SecretString --output text)
+          export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+
+      - name: Upload coverage HTML report (production)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-html-production
+          path: src/htmlcov
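Both E2E jobs hydrate their API keys at runtime from AWS Secrets Manager using the same jq one-liner. A standalone sketch of that pattern, with a hypothetical secret id and contents (note that a plain `export $(...)` word-splits, so it assumes values without spaces):

```bash
# Secret stored as a flat JSON object, e.g. {"JUDGMENT_API_KEY":"...","OPENAI_API_KEY":"..."}
SECRET_VARS=$(aws secretsmanager get-secret-value \
  --secret-id my-team/api-keys \
  --query SecretString --output text)

# to_entries converts the object to [{key, value}, ...]; each entry is emitted as KEY=VALUE
export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
```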
--- judgeval-0.0.40/.github/workflows/merge-to-main.yaml
+++ judgeval-0.0.42/.github/workflows/merge-branch-check.yaml
@@ -1,8 +1,6 @@
-name: Enforce Main Branch Protection
+name: Branch Protection Check

-on:
-  pull_request:
-    types: [opened, synchronize, reopened, edited]
+on: workflow_call

 jobs:
   validate-branch:
@@ -10,20 +8,17 @@ jobs:
     steps:
       - name: Check branch name
         run: |
-          # Get the base and source branch names
           BASE_BRANCH="${{ github.base_ref }}"
           SOURCE_BRANCH="${{ github.head_ref }}"

           echo "BASE_BRANCH: $BASE_BRANCH"
           echo "SOURCE_BRANCH: $SOURCE_BRANCH"

-          # Only run validation if the base branch is main
           if [[ "$BASE_BRANCH" != "main" ]]; then
             echo "Skipping branch validation - not targeting main branch"
             exit 0
           fi

-          # Check if the source branch is staging or starts with hotfix/
           if [[ "$SOURCE_BRANCH" != "staging" && ! "$SOURCE_BRANCH" =~ ^hotfix/ ]]; then
             echo "::error::Pull requests to main can only be created from 'staging' or 'hotfix/*' branches. Current branch: $SOURCE_BRANCH"
             exit 1
--- /dev/null
+++ judgeval-0.0.42/.github/workflows/validate-branch.yaml
@@ -0,0 +1,9 @@
+name: Branch Protection
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+
+jobs:
+  branch-protection:
+    uses: ./.github/workflows/merge-branch-check.yaml
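With these three workflows in place, both the standalone `Branch Protection` workflow and the CI pipeline call the reusable `merge-branch-check.yaml` via `workflow_call`. A local pre-flight sketch mirroring the same rule before opening a PR against main (the script is illustrative, not part of the package):

```bash
#!/usr/bin/env bash
# Mirrors merge-branch-check.yaml: PRs into main must come from 'staging' or 'hotfix/*'
SOURCE_BRANCH=$(git rev-parse --abbrev-ref HEAD)

if [[ "$SOURCE_BRANCH" != "staging" && ! "$SOURCE_BRANCH" =~ ^hotfix/ ]]; then
  echo "A PR from '$SOURCE_BRANCH' into main would be rejected by CI." >&2
  exit 1
fi
echo "Branch '$SOURCE_BRANCH' is eligible for a PR into main."
```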
--- judgeval-0.0.40/PKG-INFO
+++ judgeval-0.0.42/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.40
+Version: 0.0.42
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm==1.61.15
+Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: pandas
@@ -31,44 +32,47 @@ Description-Content-Type: text/markdown
 <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
 <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />

-**Build monitoring & evaluation pipelines for complex agents**
+<br>
+<div style="font-size: 1.5em;">
+Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+</div>

-<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
+## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)

-<br>
+[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)

-## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+We're hiring! Join us in our mission to unleash optimized agents.

 [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
 [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

-</div>
+<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />

-## Judgeval: open-source testing, monitoring, and optimization for AI agents
+</div>

-Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).

-Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.
+Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.

-We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://docs.judgmentlabs.ai/getting-started) to get started.
+Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!

 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

 ## 📋 Table of Contents
-* [✨ Features](#-features)
-  * [🔍 Tracing](#-tracing)
-  * [🧪 Evals](#-evals)
-  * [📡 Monitoring](#-monitoring)
-  * [📊 Datasets](#-datasets)
-  * [💡 Insights](#-insights)
-* [🛠️ Installation](#️-installation)
-* [🏁 Get Started](#-get-started)
-* [🏢 Self-Hosting](#-self-hosting)
-* [📚 Cookbooks](#-cookbooks)
-* [💻 Development with Cursor](#-development-with-cursor)
-* [⭐ Star Us on GitHub](#-star-us-on-github)
-* [❤️ Contributors](#️-contributors)
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+  - [🛰️ Tracing](#️-tracing)
+  - [📝 Offline Evaluations](#-offline-evaluations)
+  - [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+  - [Key Features](#key-features)
+  - [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)

 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

@@ -77,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
-| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🛠️ Installation

@@ -91,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```

-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).

-**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```

-## 🏁 Get Started
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-Here's how you can quickly start using Judgeval:
+## 🏁 Quickstarts

 ### 🛰️ Tracing

-Track your agent execution with full observability with just a few lines of code.
 Create a file named `traces.py` with the following code:

 ```python
@@ -126,12 +131,15 @@ def main():

 main()
 ```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+

 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.

 ### 📝 Offline Evaluations

-You can evaluate your agent's execution to measure quality metrics such as hallucination.
 Create a file named `evaluate.py` with the following code:

 ```python evaluate.py
@@ -147,7 +155,7 @@
     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
 )

-scorer = FaithfulnessScorer(threshold=0.5)
+scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
@@ -196,6 +204,8 @@
 main()
 ```

+You should see an evaluation attached to your trace on the Judgment Platform.
+
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.

 ## 🏢 Self-Hosting
@@ -220,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook)

 ### Sample Agents

-#### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
-A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
-
-#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
-A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
-### Custom Evaluators
-
-#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
-Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
-#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
-Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).

 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
--- judgeval-0.0.40/README.md
+++ judgeval-0.0.42/README.md
@@ -3,44 +3,47 @@
 <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
 <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />

-**Build monitoring & evaluation pipelines for complex agents**
+<br>
+<div style="font-size: 1.5em;">
+Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+</div>

-<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
+## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)

-<br>
+[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)

-## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
+We're hiring! Join us in our mission to unleash optimized agents.

 [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
 [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
+[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)

-</div>
+<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />

-## Judgeval: open-source testing, monitoring, and optimization for AI agents
+</div>

-Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).

-Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free and you can export your data and self-host at any time.
+Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.

-We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://docs.judgmentlabs.ai/getting-started) to get started.
+Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!

 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

 ## 📋 Table of Contents
-* [✨ Features](#-features)
-  * [🔍 Tracing](#-tracing)
-  * [🧪 Evals](#-evals)
-  * [📡 Monitoring](#-monitoring)
-  * [📊 Datasets](#-datasets)
-  * [💡 Insights](#-insights)
-* [🛠️ Installation](#️-installation)
-* [🏁 Get Started](#-get-started)
-* [🏢 Self-Hosting](#-self-hosting)
-* [📚 Cookbooks](#-cookbooks)
-* [💻 Development with Cursor](#-development-with-cursor)
-* [⭐ Star Us on GitHub](#-star-us-on-github)
-* [❤️ Contributors](#️-contributors)
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+  - [🛰️ Tracing](#️-tracing)
+  - [📝 Offline Evaluations](#-offline-evaluations)
+  - [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+  - [Key Features](#key-features)
+  - [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)

 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->

@@ -49,11 +52,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>•📉 Identifying degradation early <br>•📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
-| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🛠️ Installation

@@ -63,17 +65,19 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```

-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).

-**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```

-## 🏁 Get Started
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-Here's how you can quickly start using Judgeval:
+## 🏁 Quickstarts

 ### 🛰️ Tracing

-Track your agent execution with full observability with just a few lines of code.
 Create a file named `traces.py` with the following code:

 ```python
@@ -98,12 +102,15 @@ def main():

 main()
 ```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+

 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.

 ### 📝 Offline Evaluations

-You can evaluate your agent's execution to measure quality metrics such as hallucination.
 Create a file named `evaluate.py` with the following code:

 ```python evaluate.py
@@ -119,7 +126,7 @@
     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
 )

-scorer = FaithfulnessScorer(threshold=0.5)
+scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
@@ -168,6 +175,8 @@
 main()
 ```

+You should see an evaluation attached to your trace on the Judgment Platform.
+
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.

 ## 🏢 Self-Hosting
@@ -192,20 +201,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook)

 ### Sample Agents

-#### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
-A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
-
-#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
-A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
-### Custom Evaluators
-
-#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
-Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
-#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
-Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).

 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.
--- /dev/null
+++ judgeval-0.0.42/assets/trace_demo.png
Binary file (not shown)
--- judgeval-0.0.40/pyproject.toml
+++ judgeval-0.0.42/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.40"
+version = "0.0.42"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -29,7 +29,8 @@ dependencies = [
     "langchain-anthropic",
     "langchain-core",
     "google-genai",
-    "boto3"
+    "boto3",
+    "matplotlib>=3.10.3",
 ]

 [project.urls]
@@ -47,6 +48,18 @@ include = [
     "/src/judgeval/**/*.py",
 ]

+[dependency-groups]
+dev = [
+    "chromadb>=1.0.12",
+    "langchain-community>=0.3.24",
+    "pytest>=8.4.0",
+    "pytest-asyncio>=1.0.0",
+    "pytest-cov>=6.1.1",
+    "pytest-mock>=3.14.1",
+    "tavily-python>=0.7.5",
+    "langgraph>=0.4.3",
+]
+
 [tool.hatch.build]
 directory = "dist"
 artifacts = [
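Taken together with the removal of `Pipfile`/`Pipfile.lock` and the new `uv.lock`, the project has moved from pipenv to uv; `[dependency-groups]` is the PEP 735 table for dev-only dependencies. A local setup sketch matching the CI steps above:

```bash
pip install uv       # install the uv package manager
uv sync --dev        # resolve from uv.lock, including the dev dependency group
cd src
uv run pytest tests  # run the unit test suite the same way CI does
```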