judgeval 0.0.37__tar.gz → 0.0.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. {judgeval-0.0.37 → judgeval-0.0.38}/.github/workflows/ci.yaml +12 -11
  2. judgeval-0.0.38/PKG-INFO +247 -0
  3. judgeval-0.0.38/README.md +219 -0
  4. judgeval-0.0.38/assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png +0 -0
  5. judgeval-0.0.38/assets/dataset_clustering_screenshot.png +0 -0
  6. judgeval-0.0.38/assets/dataset_clustering_screenshot_dm.png +0 -0
  7. judgeval-0.0.38/assets/datasets_preview_screenshot.png +0 -0
  8. judgeval-0.0.38/assets/experiments_dashboard_screenshot.png +0 -0
  9. judgeval-0.0.38/assets/experiments_page.png +0 -0
  10. judgeval-0.0.38/assets/monitoring_screenshot.png +0 -0
  11. judgeval-0.0.38/assets/trace_screenshot.png +0 -0
  12. {judgeval-0.0.37 → judgeval-0.0.38}/docs/api_reference/judgment_client.mdx +4 -4
  13. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/data_datasets.mdx +2 -2
  14. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/introduction.mdx +3 -3
  15. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/judges.mdx +4 -4
  16. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/agent/derailment.mdx +1 -1
  17. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/classifier_scorer.mdx +1 -1
  18. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/custom_scorers.mdx +2 -2
  19. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/answer_correctness.mdx +2 -2
  20. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/answer_relevancy.mdx +2 -2
  21. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/comparison.mdx +2 -2
  22. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/contextual_precision.mdx +2 -2
  23. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/contextual_recall.mdx +2 -2
  24. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/contextual_relevancy.mdx +2 -2
  25. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/execution_order.mdx +1 -1
  26. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/faithfulness.mdx +2 -2
  27. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/groundedness.mdx +1 -1
  28. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/json_correctness.mdx +1 -1
  29. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/default/summarization.mdx +1 -1
  30. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/scorers/introduction.mdx +1 -1
  31. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/unit_testing.mdx +4 -4
  32. {judgeval-0.0.37 → judgeval-0.0.38}/docs/getting_started.mdx +11 -11
  33. {judgeval-0.0.37 → judgeval-0.0.38}/docs/integration/langgraph.mdx +20 -26
  34. {judgeval-0.0.37 → judgeval-0.0.38}/docs/monitoring/tracing.mdx +9 -9
  35. judgeval-0.0.38/docs/notebooks/demo.ipynb +389 -0
  36. {judgeval-0.0.37 → judgeval-0.0.38}/docs/notebooks/quickstart.ipynb +2 -2
  37. {judgeval-0.0.37 → judgeval-0.0.38}/docs/optimization/osiris_agent.mdx +6 -6
  38. {judgeval-0.0.37 → judgeval-0.0.38}/pyproject.toml +1 -1
  39. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/common/tracer.py +132 -281
  40. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/common/utils.py +1 -1
  41. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/constants.py +1 -3
  42. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/__init__.py +0 -2
  43. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/datasets/dataset.py +2 -9
  44. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/datasets/eval_dataset_client.py +1 -62
  45. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/example.py +0 -1
  46. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/result.py +3 -3
  47. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/trace.py +4 -1
  48. judgeval-0.0.37/src/judgeval/data/sequence_run.py → judgeval-0.0.38/src/judgeval/data/trace_run.py +4 -4
  49. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/evaluation_run.py +1 -1
  50. judgeval-0.0.38/src/judgeval/integrations/langgraph.py +419 -0
  51. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judges/litellm_judge.py +1 -1
  52. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judges/mixture_of_judges.py +1 -1
  53. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judges/utils.py +1 -1
  54. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judgment_client.py +15 -21
  55. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/run_evaluation.py +31 -81
  56. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +4 -2
  57. judgeval-0.0.37/PKG-INFO +0 -214
  58. judgeval-0.0.37/README.md +0 -186
  59. judgeval-0.0.37/docs/notebooks/demo.ipynb +0 -389
  60. judgeval-0.0.37/src/judgeval/data/sequence.py +0 -50
  61. judgeval-0.0.37/src/judgeval/integrations/langgraph.py +0 -2000
  62. {judgeval-0.0.37 → judgeval-0.0.38}/.github/pull_request_template.md +0 -0
  63. {judgeval-0.0.37 → judgeval-0.0.38}/.gitignore +0 -0
  64. {judgeval-0.0.37 → judgeval-0.0.38}/LICENSE.md +0 -0
  65. {judgeval-0.0.37 → judgeval-0.0.38}/Pipfile +0 -0
  66. {judgeval-0.0.37 → judgeval-0.0.38}/Pipfile.lock +0 -0
  67. {judgeval-0.0.37 → judgeval-0.0.38}/assets/logo-dark.svg +0 -0
  68. {judgeval-0.0.37 → judgeval-0.0.38}/assets/logo-light.svg +0 -0
  69. {judgeval-0.0.37 → judgeval-0.0.38}/docs/README.md +0 -0
  70. {judgeval-0.0.37 → judgeval-0.0.38}/docs/alerts/notifications.mdx +0 -0
  71. {judgeval-0.0.37 → judgeval-0.0.38}/docs/alerts/platform_notifications.mdx +0 -0
  72. {judgeval-0.0.37 → judgeval-0.0.38}/docs/alerts/rules.mdx +0 -0
  73. {judgeval-0.0.37 → judgeval-0.0.38}/docs/api_reference/trace.mdx +0 -0
  74. {judgeval-0.0.37 → judgeval-0.0.38}/docs/changelog/2025-04-21.mdx +0 -0
  75. {judgeval-0.0.37 → judgeval-0.0.38}/docs/clustering/clustering.mdx +0 -0
  76. {judgeval-0.0.37 → judgeval-0.0.38}/docs/compliance/certifications.mdx +0 -0
  77. {judgeval-0.0.37 → judgeval-0.0.38}/docs/development.mdx +0 -0
  78. {judgeval-0.0.37 → judgeval-0.0.38}/docs/essentials/code.mdx +0 -0
  79. {judgeval-0.0.37 → judgeval-0.0.38}/docs/essentials/images.mdx +0 -0
  80. {judgeval-0.0.37 → judgeval-0.0.38}/docs/essentials/markdown.mdx +0 -0
  81. {judgeval-0.0.37 → judgeval-0.0.38}/docs/essentials/navigation.mdx +0 -0
  82. {judgeval-0.0.37 → judgeval-0.0.38}/docs/essentials/reusable-snippets.mdx +0 -0
  83. {judgeval-0.0.37 → judgeval-0.0.38}/docs/essentials/settings.mdx +0 -0
  84. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/data_examples.mdx +0 -0
  85. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/data_sequences.mdx +0 -0
  86. {judgeval-0.0.37 → judgeval-0.0.38}/docs/evaluation/experiment_comparisons.mdx +0 -0
  87. {judgeval-0.0.37 → judgeval-0.0.38}/docs/favicon.svg +0 -0
  88. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/annotation_queue_ui.png +0 -0
  89. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/basic_trace_example.png +0 -0
  90. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/checks-passed.png +0 -0
  91. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/cluster.png +0 -0
  92. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/cluster_button.png +0 -0
  93. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/create_aggressive_scorer.png +0 -0
  94. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/create_scorer.png +0 -0
  95. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/dashboard_annotation_queue_button.png +0 -0
  96. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/evaluation_diagram.png +0 -0
  97. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/experiment-comparison-page-2.png +0 -0
  98. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/experiment-page-comparison.png +0 -0
  99. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/experiment-popout-comparison.png +0 -0
  100. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/experiments-page-comparison-2.png +0 -0
  101. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/experiments-page-comparison.png +0 -0
  102. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/export-dataset.png +0 -0
  103. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/hero-dark.svg +0 -0
  104. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/hero-light.svg +0 -0
  105. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/notifications_page.png +0 -0
  106. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/online_eval_fault.png +0 -0
  107. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/reports_modal.png +0 -0
  108. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/synth_data_button.png +0 -0
  109. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/synth_data_window.png +0 -0
  110. {judgeval-0.0.37 → judgeval-0.0.38}/docs/images/trace_ss.png +0 -0
  111. {judgeval-0.0.37 → judgeval-0.0.38}/docs/introduction.mdx +0 -0
  112. {judgeval-0.0.37 → judgeval-0.0.38}/docs/judgment_cli/installation.mdx +0 -0
  113. {judgeval-0.0.37 → judgeval-0.0.38}/docs/judgment_cli/self-hosting.mdx +0 -0
  114. {judgeval-0.0.37 → judgeval-0.0.38}/docs/judgment_cli/supabase-org-id.png +0 -0
  115. {judgeval-0.0.37 → judgeval-0.0.38}/docs/logo/dark.svg +0 -0
  116. {judgeval-0.0.37 → judgeval-0.0.38}/docs/logo/light.svg +0 -0
  117. {judgeval-0.0.37 → judgeval-0.0.38}/docs/mint.json +0 -0
  118. {judgeval-0.0.37 → judgeval-0.0.38}/docs/monitoring/annotations.mdx +0 -0
  119. {judgeval-0.0.37 → judgeval-0.0.38}/docs/monitoring/introduction.mdx +0 -0
  120. {judgeval-0.0.37 → judgeval-0.0.38}/docs/monitoring/production_insights.mdx +0 -0
  121. {judgeval-0.0.37 → judgeval-0.0.38}/docs/monitoring/tracing_s3.mdx +0 -0
  122. {judgeval-0.0.37 → judgeval-0.0.38}/docs/notebooks/create_dataset.ipynb +0 -0
  123. {judgeval-0.0.37 → judgeval-0.0.38}/docs/notebooks/create_scorer.ipynb +0 -0
  124. {judgeval-0.0.37 → judgeval-0.0.38}/docs/notebooks/prompt_scorer.ipynb +0 -0
  125. {judgeval-0.0.37 → judgeval-0.0.38}/docs/quickstart.mdx +0 -0
  126. {judgeval-0.0.37 → judgeval-0.0.38}/docs/self_hosting/get_started.mdx +0 -0
  127. {judgeval-0.0.37 → judgeval-0.0.38}/docs/snippets/snippet-intro.mdx +0 -0
  128. {judgeval-0.0.37 → judgeval-0.0.38}/docs/synthetic_data/synthetic_data.mdx +0 -0
  129. {judgeval-0.0.37 → judgeval-0.0.38}/pytest.ini +0 -0
  130. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/__init__.py +0 -0
  131. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/clients.py +0 -0
  132. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/common/__init__.py +0 -0
  133. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/common/exceptions.py +0 -0
  134. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/common/logger.py +0 -0
  135. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/common/s3_storage.py +0 -0
  136. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/custom_example.py +0 -0
  137. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/datasets/__init__.py +0 -0
  138. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/data/scorer_data.py +0 -0
  139. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judges/__init__.py +0 -0
  140. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judges/base_judge.py +0 -0
  141. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/judges/together_judge.py +0 -0
  142. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/rules.py +0 -0
  143. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/__init__.py +0 -0
  144. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/api_scorer.py +0 -0
  145. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/exceptions.py +0 -0
  146. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  147. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  148. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  149. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  150. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  151. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  152. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  153. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  154. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  155. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  156. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  157. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  158. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  159. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  160. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  161. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  162. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  163. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  164. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  165. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  166. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/prompt_scorer.py +0 -0
  167. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/score.py +0 -0
  168. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/scorers/utils.py +0 -0
  169. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/tracer/__init__.py +0 -0
  170. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/utils/alerts.py +0 -0
  171. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/utils/data_utils.py +0 -0
  172. {judgeval-0.0.37 → judgeval-0.0.38}/src/judgeval/version_check.py +0 -0
@@ -6,6 +6,8 @@ on:
     branches:
       - main
 
+permissions: read-all
+
 jobs:
   run-tests:
     strategy:
@@ -44,17 +46,16 @@ jobs:
 
   run-e2e-tests:
     if: "!contains(github.actor, '[bot]')" # Exclude if the actor is a bot
-    concurrency:
-      group: e2e-tests
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest]
-        python-version:
-          - "3.11"
     name: E2E Tests
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-latest
     steps:
+      - name: Wait for turn
+        uses: softprops/turnstyle@v2
+        with:
+          poll-interval-seconds: 10
+          same-branch-only: false
+          job-to-wait-for: "E2E Tests"
+
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -68,7 +69,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
        with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.11"
 
       - name: Install judgeval dependencies
         run: |
@@ -89,4 +90,4 @@ jobs:
         run: |
           SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions/api-keys/judgeval --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
-          pipenv run pytest ./e2etests
+          pipenv run pytest --durations=0 ./e2etests
@@ -0,0 +1,247 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.0.38
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.11
+Requires-Dist: anthropic
+Requires-Dist: boto3
+Requires-Dist: google-genai
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
+Requires-Dist: litellm==1.38.12
+Requires-Dist: nest-asyncio
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: python-dotenv==1.0.1
+Requires-Dist: requests
+Requires-Dist: together
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
+<img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
+
+**Build monitoring & evaluation pipelines for complex agents**
+
+<img src="assets/experiments_page.png" alt="Judgment Platform Experiments Page" width="800" />
+
+<br>
+
+## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [💼 LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o) • [🎮 Discord](https://discord.gg/taAufyhf)
+</div>
+
+## Judgeval: open-source testing, monitoring, and optimization for AI agents
+
+Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
+
+Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free, and you can export your data and self-host at any time.
+
+We support tracing agents built with LangGraph, the OpenAI SDK, Anthropic, and more, and we allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://judgment.mintlify.app/getting_started) to get started.
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
+
+## 📋 Table of Contents
+* [✨ Features](#-features)
+* [🔍 Tracing](#-tracing)
+* [🧪 Evals](#-evals)
+* [📡 Monitoring](#-monitoring)
+* [📊 Datasets](#-datasets)
+* [💡 Insights](#-insights)
+* [🛠️ Installation](#️-installation)
+* [🏁 Get Started](#-get-started)
+* [🏢 Self-Hosting](#-self-hosting)
+* [📚 Cookbooks](#-cookbooks)
+* [⭐ Star Us on GitHub](#-star-us-on-github)
+* [❤️ Contributors](#️-contributors)
+
+<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
+
+
+## ✨ Features
+
+| | |
+|:---|:---:|
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external test cases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>• 🔮 Surfacing common inputs that lead to error<br>• 🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+
+## 🛠️ Installation
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+
+## 🏁 Get Started
+
+Here's how you can quickly start using Judgeval:
+
+### 🛰️ Tracing
+
+Track your agent execution with full observability in just a few lines of code.
+Create a file named `traces.py` with the following code:
+
+```python
+from judgeval.common.tracer import Tracer, wrap
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "What's the capital of the U.S.?"
+
+@judgment.observe(span_type="function")
+def main():
+    task_input = my_tool()
+    res = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": f"{task_input}"}]
+    )
+    return res.choices[0].message.content
+
+main()
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
+
+### 📝 Offline Evaluations
+
+You can evaluate your agent's execution to measure quality metrics such as hallucination.
+Create a file named `evaluate.py` with the following code:
+
+```python evaluate.py
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import FaithfulnessScorer
+
+client = JudgmentClient()
+
+example = Example(
+    input="What if these shoes don't fit?",
+    actual_output="We offer a 30-day full refund at no extra cost.",
+    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+)
+
+scorer = FaithfulnessScorer(threshold=0.5)
+results = client.run_evaluation(
+    examples=[example],
+    scorers=[scorer],
+    model="gpt-4.1",
+)
+print(results)
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
+
+### 📡 Online Evaluations
+
+Attach performance monitoring to traces to measure the quality of your systems in production.
+
+Using the same `traces.py` file we created earlier, modify the `main` function:
+
+```python
+from judgeval.common.tracer import Tracer, wrap
+from judgeval.scorers import AnswerRelevancyScorer
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "Hello world!"
+
+@judgment.observe(span_type="function")
+def main():
+    task_input = my_tool()
+    res = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": f"{task_input}"}]
+    ).choices[0].message.content
+
+    judgment.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        input=task_input,
+        actual_output=res,
+        model="gpt-4.1"
+    )
+    print("Online evaluation submitted.")
+    return res
+
+main()
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
+
+## 🏢 Self-Hosting
+
+Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
+
+### Key Features
+* Deploy Judgment on your own AWS account
+* Store data in your own Supabase instance
+* Access Judgment through your own custom domain
+
+### Getting Started
+1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, including how to access your self-hosted instance.
+2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment.
+3. After your self-hosted instance is set up, make sure the `JUDGMENT_API_URL` environment variable is set to your self-hosted backend endpoint.
+
+## 📚 Cookbooks
+
+You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook). Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/taAufyhf).
+
+Here are some highlights:
+
+### Sample Agents
+
+#### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
+A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
+
+#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
+A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
+
+### Custom Evaluators
+
+#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
+Detects and evaluates Personally Identifiable Information (PII) leakage.
+
+#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
+
+Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+
+## ⭐ Star Us on GitHub
+
+If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
+
+
+## ❤️ Contributors
+
+There are many ways to contribute to Judgeval:
+
+- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
+- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
+- Speak or write about Judgment and let us know!
+
+<!-- Contributors collage -->
+[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
+
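The Installation section of the README added above references the `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables without showing how to set them. A minimal shell sketch, assuming you then run the quickstart scripts from the README; the key values are placeholders, not real credentials:

```bash
# Placeholder values; copy the real keys from your Judgment account
# at https://app.judgmentlabs.ai/.
export JUDGMENT_API_KEY="your-api-key"
export JUDGMENT_ORG_ID="your-org-id"

# Run the quickstart scripts described in the README.
python traces.py
python evaluate.py
```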
@@ -0,0 +1,219 @@
+<div align="center">
+
+<img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
+<img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
+
+**Build monitoring & evaluation pipelines for complex agents**
+
+<img src="assets/experiments_page.png" alt="Judgment Platform Experiments Page" width="800" />
+
+<br>
+
+## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [💼 LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o) • [🎮 Discord](https://discord.gg/taAufyhf)
+</div>
+
+## Judgeval: open-source testing, monitoring, and optimization for AI agents
+
+Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
+
+Judgeval gets you started in five minutes, after which you'll be ready to use all of its features as your agent becomes more complex. Judgeval is natively connected to the [Judgment Platform](https://www.judgmentlabs.ai/) for free, and you can export your data and self-host at any time.
+
+We support tracing agents built with LangGraph, the OpenAI SDK, Anthropic, and more, and we allow custom eval integrations for any use case. Check out our quickstarts below or our [setup guide](https://judgment.mintlify.app/getting_started) to get started.
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
+
+## 📋 Table of Contents
+* [✨ Features](#-features)
+* [🔍 Tracing](#-tracing)
+* [🧪 Evals](#-evals)
+* [📡 Monitoring](#-monitoring)
+* [📊 Datasets](#-datasets)
+* [💡 Insights](#-insights)
+* [🛠️ Installation](#️-installation)
+* [🏁 Get Started](#-get-started)
+* [🏢 Self-Hosting](#-self-hosting)
+* [📚 Cookbooks](#-cookbooks)
+* [⭐ Star Us on GitHub](#-star-us-on-github)
+* [❤️ Contributors](#️-contributors)
+
+<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
+
+
+## ✨ Features
+
+| | |
+|:---|:---:|
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Build custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails <br><br> | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Real-time performance tracking of your agents in production environments. **Track all your metrics in one place.**<br><br>Set up **Slack/email alerts** for critical metrics and receive notifications when thresholds are exceeded.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external test cases to datasets hosted on Judgment's Platform. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🔄 Scaled analysis for A/B tests <br>• 🗃️ Filtered collections of agent runtime data| <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>• 🔮 Surfacing common inputs that lead to error<br>• 🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+
+## 🛠️ Installation
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+
+## 🏁 Get Started
+
+Here's how you can quickly start using Judgeval:
+
+### 🛰️ Tracing
+
+Track your agent execution with full observability in just a few lines of code.
+Create a file named `traces.py` with the following code:
+
+```python
+from judgeval.common.tracer import Tracer, wrap
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "What's the capital of the U.S.?"
+
+@judgment.observe(span_type="function")
+def main():
+    task_input = my_tool()
+    res = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": f"{task_input}"}]
+    )
+    return res.choices[0].message.content
+
+main()
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
+
+### 📝 Offline Evaluations
+
+You can evaluate your agent's execution to measure quality metrics such as hallucination.
+Create a file named `evaluate.py` with the following code:
+
+```python evaluate.py
+from judgeval import JudgmentClient
+from judgeval.data import Example
+from judgeval.scorers import FaithfulnessScorer
+
+client = JudgmentClient()
+
+example = Example(
+    input="What if these shoes don't fit?",
+    actual_output="We offer a 30-day full refund at no extra cost.",
+    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+)
+
+scorer = FaithfulnessScorer(threshold=0.5)
+results = client.run_evaluation(
+    examples=[example],
+    scorers=[scorer],
+    model="gpt-4.1",
+)
+print(results)
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
+
+### 📡 Online Evaluations
+
+Attach performance monitoring to traces to measure the quality of your systems in production.
+
+Using the same `traces.py` file we created earlier, modify the `main` function:
+
+```python
+from judgeval.common.tracer import Tracer, wrap
+from judgeval.scorers import AnswerRelevancyScorer
+from openai import OpenAI
+
+client = wrap(OpenAI())
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def my_tool():
+    return "Hello world!"
+
+@judgment.observe(span_type="function")
+def main():
+    task_input = my_tool()
+    res = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": f"{task_input}"}]
+    ).choices[0].message.content
+
+    judgment.async_evaluate(
+        scorers=[AnswerRelevancyScorer(threshold=0.5)],
+        input=task_input,
+        actual_output=res,
+        model="gpt-4.1"
+    )
+    print("Online evaluation submitted.")
+    return res
+
+main()
+```
+
+[Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
+
+## 🏢 Self-Hosting
+
+Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
+
+### Key Features
+* Deploy Judgment on your own AWS account
+* Store data in your own Supabase instance
+* Access Judgment through your own custom domain
+
+### Getting Started
+1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, including how to access your self-hosted instance.
+2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment.
+3. After your self-hosted instance is set up, make sure the `JUDGMENT_API_URL` environment variable is set to your self-hosted backend endpoint.
+
+## 📚 Cookbooks
+
+You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook). Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/taAufyhf).
+
+Here are some highlights:
+
+### Sample Agents
+
+#### 💰 [LangGraph Financial QA Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/financial_agent/demo.py)
+A LangGraph-based agent for financial queries, featuring RAG capabilities with a vector database for contextual data retrieval and evaluation of its reasoning and data accuracy.
+
+#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
+A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
+
+### Custom Evaluators
+
+#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
+Detects and evaluates Personally Identifiable Information (PII) leakage.
+
+#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
+
+Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+
+## ⭐ Star Us on GitHub
+
+If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
+
+
+## ❤️ Contributors
+
+There are many ways to contribute to Judgeval:
+
+- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
+- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
+- Speak or write about Judgment and let us know!
+
+<!-- Contributors collage -->
+[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
+
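Step 3 of the Self-Hosting section above says to point `JUDGMENT_API_URL` at your self-hosted backend. A minimal sketch of the variables a self-hosted setup would export; the domain is an assumed example, not a real endpoint:

```bash
# Assumed example domain; substitute the custom domain configured for
# your self-hosted Judgment deployment.
export JUDGMENT_API_URL="https://judgment.example.com"
export JUDGMENT_API_KEY="your-api-key"
export JUDGMENT_ORG_ID="your-org-id"
```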
@@ -52,7 +52,7 @@ example = Example(
 results = client.run_evaluation(
     examples=[example],
     scorers=[FaithfulnessScorer(threshold=0.5)],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 ```
 ```Typescript Typescript
@@ -70,7 +70,7 @@ async function runEval() {
 const results = await client.evaluate({
     examples: [example],
     scorers: [new FaithfulnessScorer(0.5)],
-    model: "gpt-4o",
+    model: "gpt-4.1",
     projectName: "client-api-ref-proj", // Optional: Provide a project name
     evalName: "client-api-ref-eval" // Optional: Provide an eval name
 });
@@ -83,7 +83,7 @@ runEval();
 
 The `run_evaluation` (Python) / `evaluate` (Typescript) method accepts the following arguments/options:
 - `examples`: A list/array of [Example](/evaluation/data_examples) objects to evaluate.
-- `model`: The model to use for the evaluation, such as `gpt-4o` or `Qwen/Qwen2.5-72B-Instruct-Turbo`.
+- `model`: The model to use for the evaluation, such as `gpt-4.1` or `Qwen/Qwen2.5-72B-Instruct-Turbo`.
 - `scorers`: A list/array of [Scorer](/evaluation/scorers) objects to use for the evaluation.
 - `log_results` (Python) / `logResults` (Typescript): Whether to log the results of the evaluation to the Judgment platform. Defaults to `true`.
 - `override`: Whether to override an existing evaluation with the same name. Defaults to `false`.
@@ -135,7 +135,7 @@ airline_sequence = Sequence(
 results = client.run_sequence_evaluation(
     sequences=[airline_sequence],
     scorers=[DerailmentScorer(threshold=0.5)],
-    model="gpt-4o",
+    model="gpt-4.1",
     log_results=True,
     override=True,
 )
@@ -253,7 +253,7 @@ from judgeval.scorers import FaithfulnessScorer # Added import
 res = client.run_evaluation(
     examples=dataset.examples,
     scorers=[FaithfulnessScorer(threshold=0.9)],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 ```
 ```Typescript Typescript
@@ -270,7 +270,7 @@ const dataset: Example[] = [
 const results = await client.evaluate({
     examples: dataset,
     scorers: [new FaithfulnessScorer(0.9)],
-    model: "gpt-4o",
+    model: "gpt-4.1",
     projectName: "dataset-eval-ts-proj",
     evalName: "dataset-eval-ts-run"
 });
@@ -70,14 +70,14 @@ faithfulness_scorer = FaithfulnessScorer(threshold=0.5)
 results = client.run_evaluation(
     examples=[example],
     scorers=[faithfulness_scorer],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 
 # You also run evaluations asynchronously like so:
 results = client.a_run_evaluation(
     examples=[example],
     scorers=[faithfulness_scorer],
-    model="gpt-4o",
+    model="gpt-4.1",
 )
 print(results)
 ```
@@ -102,7 +102,7 @@ const faithfulnessScorer = new FaithfulnessScorer(0.5);
 const results = await client.evaluate({
     examples: [example],
     scorers: [faithfulnessScorer],
-    model: "gpt-4o",
+    model: "gpt-4.1",
     projectName: "my-intro-project",
     evalName: "intro-evaluation-run"
 });
@@ -15,7 +15,7 @@ Both `judgeval` (Python) and `judgeval-js` (TypeScript) support OpenAI models (l
 
 In Python, this is handled via LiteLLM integration. In TypeScript, the built-in `DefaultJudge` is used.
 
-You simply pass the model name (e.g., "gpt-4o") to the `model` parameter in your evaluation call:
+You simply pass the model name (e.g., "gpt-4.1") to the `model` parameter in your evaluation call:
 
 <CodeGroup>
 ```Python Python
@@ -29,7 +29,7 @@ example1 = Example(input="Q1", actual_output="A1")
 results = client.run_evaluation(
     examples=[example1],
     scorers=[AnswerRelevancyScorer(threshold=0.5)],
-    model="gpt-4o" # Uses LiteLLM
+    model="gpt-4.1" # Uses LiteLLM
 )
 ```
 ```Typescript Typescript
@@ -42,7 +42,7 @@ async function runOpenAIJudge() {
 const results = await client.evaluate({
     examples: [example1],
     scorers: [new AnswerRelevancyScorer(0.5)],
-    model: "gpt-4o", // Uses DefaultJudge internally
+    model: "gpt-4.1", // Uses DefaultJudge internally
     projectName: "openai-judge-ts-proj",
     evalName: "openai-judge-ts-eval"
 });
@@ -205,5 +205,5 @@ useCustomJudge();
 </CodeGroup>
 
 <Note>
-When providing a custom judge instance (like `VertexAIJudge` in Python or `MyCustomJudge` in TypeScript), pass the instance directly to the `model` parameter (Python) or the `judge` option (TypeScript) in the evaluation call. The built-in judges (`DefaultJudge`, `TogetherJudge`) are used automatically when you pass a model *name* string (like "gpt-4o" or "meta-llama/...") to the `model` option in TypeScript.
+When providing a custom judge instance (like `VertexAIJudge` in Python or `MyCustomJudge` in TypeScript), pass the instance directly to the `model` parameter (Python) or the `judge` option (TypeScript) in the evaluation call. The built-in judges (`DefaultJudge`, `TogetherJudge`) are used automatically when you pass a model *name* string (like "gpt-4.1" or "meta-llama/...") to the `model` option in TypeScript.
 </Note>
@@ -43,7 +43,7 @@ airline_sequence = Sequence(
 results = client.run_sequence_evaluation(
     sequences=[airline_sequence],
     scorers=[DerailmentScorer(threshold=0.5)],
-    model="gpt-4o",
+    model="gpt-4.1",
     log_results=True,
     override=True,
 )