judgeval 0.0.40__tar.gz → 0.0.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. judgeval-0.0.41/.github/workflows/blocked-pr.yaml +19 -0
  2. {judgeval-0.0.40 → judgeval-0.0.41}/PKG-INFO +25 -16
  3. {judgeval-0.0.40 → judgeval-0.0.41}/README.md +24 -15
  4. {judgeval-0.0.40 → judgeval-0.0.41}/pyproject.toml +1 -1
  5. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/common/tracer.py +160 -38
  6. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/common/utils.py +5 -1
  7. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/datasets/dataset.py +12 -6
  8. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/datasets/eval_dataset_client.py +3 -1
  9. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/trace.py +6 -2
  10. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judgment_client.py +9 -1
  11. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/run_evaluation.py +17 -3
  12. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorer.py +4 -1
  13. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/prompt_scorer.py +3 -0
  14. {judgeval-0.0.40 → judgeval-0.0.41}/.github/pull_request_template.md +0 -0
  15. {judgeval-0.0.40 → judgeval-0.0.41}/.github/workflows/ci-staging.yaml +0 -0
  16. {judgeval-0.0.40 → judgeval-0.0.41}/.github/workflows/ci.yaml +0 -0
  17. {judgeval-0.0.40 → judgeval-0.0.41}/.github/workflows/merge-to-main.yaml +0 -0
  18. {judgeval-0.0.40 → judgeval-0.0.41}/.github/workflows/release.yaml +0 -0
  19. {judgeval-0.0.40 → judgeval-0.0.41}/.gitignore +0 -0
  20. {judgeval-0.0.40 → judgeval-0.0.41}/LICENSE.md +0 -0
  21. {judgeval-0.0.40 → judgeval-0.0.41}/Pipfile +0 -0
  22. {judgeval-0.0.40 → judgeval-0.0.41}/Pipfile.lock +0 -0
  23. {judgeval-0.0.40 → judgeval-0.0.41}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  24. {judgeval-0.0.40 → judgeval-0.0.41}/assets/dataset_clustering_screenshot.png +0 -0
  25. {judgeval-0.0.40 → judgeval-0.0.41}/assets/dataset_clustering_screenshot_dm.png +0 -0
  26. {judgeval-0.0.40 → judgeval-0.0.41}/assets/datasets_preview_screenshot.png +0 -0
  27. {judgeval-0.0.40 → judgeval-0.0.41}/assets/experiments_dashboard_screenshot.png +0 -0
  28. {judgeval-0.0.40 → judgeval-0.0.41}/assets/experiments_page.png +0 -0
  29. {judgeval-0.0.40 → judgeval-0.0.41}/assets/experiments_pagev2.png +0 -0
  30. {judgeval-0.0.40 → judgeval-0.0.41}/assets/logo-dark.svg +0 -0
  31. {judgeval-0.0.40 → judgeval-0.0.41}/assets/logo-light.svg +0 -0
  32. {judgeval-0.0.40 → judgeval-0.0.41}/assets/monitoring_screenshot.png +0 -0
  33. {judgeval-0.0.40 → judgeval-0.0.41}/assets/new_darkmode.svg +0 -0
  34. {judgeval-0.0.40 → judgeval-0.0.41}/assets/new_lightmode.svg +0 -0
  35. {judgeval-0.0.40 → judgeval-0.0.41}/assets/trace_screenshot.png +0 -0
  36. {judgeval-0.0.40 → judgeval-0.0.41}/docs/README.md +0 -0
  37. {judgeval-0.0.40 → judgeval-0.0.41}/docs/alerts/notifications.mdx +0 -0
  38. {judgeval-0.0.40 → judgeval-0.0.41}/docs/alerts/platform_notifications.mdx +0 -0
  39. {judgeval-0.0.40 → judgeval-0.0.41}/docs/alerts/rules.mdx +0 -0
  40. {judgeval-0.0.40 → judgeval-0.0.41}/docs/api_reference/judgment_client.mdx +0 -0
  41. {judgeval-0.0.40 → judgeval-0.0.41}/docs/api_reference/trace.mdx +0 -0
  42. {judgeval-0.0.40 → judgeval-0.0.41}/docs/changelog/2025-04-21.mdx +0 -0
  43. {judgeval-0.0.40 → judgeval-0.0.41}/docs/clustering/clustering.mdx +0 -0
  44. {judgeval-0.0.40 → judgeval-0.0.41}/docs/compliance/certifications.mdx +0 -0
  45. {judgeval-0.0.40 → judgeval-0.0.41}/docs/development.mdx +0 -0
  46. {judgeval-0.0.40 → judgeval-0.0.41}/docs/essentials/code.mdx +0 -0
  47. {judgeval-0.0.40 → judgeval-0.0.41}/docs/essentials/images.mdx +0 -0
  48. {judgeval-0.0.40 → judgeval-0.0.41}/docs/essentials/markdown.mdx +0 -0
  49. {judgeval-0.0.40 → judgeval-0.0.41}/docs/essentials/navigation.mdx +0 -0
  50. {judgeval-0.0.40 → judgeval-0.0.41}/docs/essentials/reusable-snippets.mdx +0 -0
  51. {judgeval-0.0.40 → judgeval-0.0.41}/docs/essentials/settings.mdx +0 -0
  52. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/data_datasets.mdx +0 -0
  53. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/data_examples.mdx +0 -0
  54. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/data_sequences.mdx +0 -0
  55. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/experiment_comparisons.mdx +0 -0
  56. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/introduction.mdx +0 -0
  57. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/judges.mdx +0 -0
  58. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
  59. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  60. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  61. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
  62. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
  63. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/comparison.mdx +0 -0
  64. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
  65. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
  66. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
  67. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
  68. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
  69. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
  70. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
  71. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/default/summarization.mdx +0 -0
  72. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/scorers/introduction.mdx +0 -0
  73. {judgeval-0.0.40 → judgeval-0.0.41}/docs/evaluation/unit_testing.mdx +0 -0
  74. {judgeval-0.0.40 → judgeval-0.0.41}/docs/favicon.svg +0 -0
  75. {judgeval-0.0.40 → judgeval-0.0.41}/docs/getting_started.mdx +0 -0
  76. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/annotation_queue_ui.png +0 -0
  77. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/basic_trace_example.png +0 -0
  78. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/checks-passed.png +0 -0
  79. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/cluster.png +0 -0
  80. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/cluster_button.png +0 -0
  81. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/create_aggressive_scorer.png +0 -0
  82. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/create_scorer.png +0 -0
  83. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/dashboard_annotation_queue_button.png +0 -0
  84. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/evaluation_diagram.png +0 -0
  85. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/experiment-comparison-page-2.png +0 -0
  86. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/experiment-page-comparison.png +0 -0
  87. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/experiment-popout-comparison.png +0 -0
  88. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/experiments-page-comparison-2.png +0 -0
  89. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/experiments-page-comparison.png +0 -0
  90. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/export-dataset.png +0 -0
  91. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/hero-dark.svg +0 -0
  92. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/hero-light.svg +0 -0
  93. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/notifications_page.png +0 -0
  94. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/online_eval_fault.png +0 -0
  95. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/reports_modal.png +0 -0
  96. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/synth_data_button.png +0 -0
  97. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/synth_data_window.png +0 -0
  98. {judgeval-0.0.40 → judgeval-0.0.41}/docs/images/trace_ss.png +0 -0
  99. {judgeval-0.0.40 → judgeval-0.0.41}/docs/integration/langgraph.mdx +0 -0
  100. {judgeval-0.0.40 → judgeval-0.0.41}/docs/introduction.mdx +0 -0
  101. {judgeval-0.0.40 → judgeval-0.0.41}/docs/judgment_cli/installation.mdx +0 -0
  102. {judgeval-0.0.40 → judgeval-0.0.41}/docs/judgment_cli/self-hosting.mdx +0 -0
  103. {judgeval-0.0.40 → judgeval-0.0.41}/docs/judgment_cli/supabase-org-id.png +0 -0
  104. {judgeval-0.0.40 → judgeval-0.0.41}/docs/logo/dark.svg +0 -0
  105. {judgeval-0.0.40 → judgeval-0.0.41}/docs/logo/light.svg +0 -0
  106. {judgeval-0.0.40 → judgeval-0.0.41}/docs/mint.json +0 -0
  107. {judgeval-0.0.40 → judgeval-0.0.41}/docs/monitoring/annotations.mdx +0 -0
  108. {judgeval-0.0.40 → judgeval-0.0.41}/docs/monitoring/introduction.mdx +0 -0
  109. {judgeval-0.0.40 → judgeval-0.0.41}/docs/monitoring/production_insights.mdx +0 -0
  110. {judgeval-0.0.40 → judgeval-0.0.41}/docs/monitoring/tracing.mdx +0 -0
  111. {judgeval-0.0.40 → judgeval-0.0.41}/docs/monitoring/tracing_s3.mdx +0 -0
  112. {judgeval-0.0.40 → judgeval-0.0.41}/docs/notebooks/create_dataset.ipynb +0 -0
  113. {judgeval-0.0.40 → judgeval-0.0.41}/docs/notebooks/create_scorer.ipynb +0 -0
  114. {judgeval-0.0.40 → judgeval-0.0.41}/docs/notebooks/demo.ipynb +0 -0
  115. {judgeval-0.0.40 → judgeval-0.0.41}/docs/notebooks/prompt_scorer.ipynb +0 -0
  116. {judgeval-0.0.40 → judgeval-0.0.41}/docs/notebooks/quickstart.ipynb +0 -0
  117. {judgeval-0.0.40 → judgeval-0.0.41}/docs/optimization/osiris_agent.mdx +0 -0
  118. {judgeval-0.0.40 → judgeval-0.0.41}/docs/quickstart.mdx +0 -0
  119. {judgeval-0.0.40 → judgeval-0.0.41}/docs/self_hosting/get_started.mdx +0 -0
  120. {judgeval-0.0.40 → judgeval-0.0.41}/docs/snippets/snippet-intro.mdx +0 -0
  121. {judgeval-0.0.40 → judgeval-0.0.41}/docs/synthetic_data/synthetic_data.mdx +0 -0
  122. {judgeval-0.0.40 → judgeval-0.0.41}/pytest.ini +0 -0
  123. {judgeval-0.0.40 → judgeval-0.0.41}/src/.coveragerc +0 -0
  124. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/__init__.py +0 -0
  125. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/clients.py +0 -0
  126. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/common/__init__.py +0 -0
  127. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/common/exceptions.py +0 -0
  128. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/common/logger.py +0 -0
  129. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/common/s3_storage.py +0 -0
  130. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/constants.py +0 -0
  131. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/__init__.py +0 -0
  132. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/custom_example.py +0 -0
  133. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/datasets/__init__.py +0 -0
  134. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/example.py +0 -0
  135. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/result.py +0 -0
  136. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/scorer_data.py +0 -0
  137. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/tool.py +0 -0
  138. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/data/trace_run.py +0 -0
  139. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/evaluation_run.py +0 -0
  140. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/integrations/langgraph.py +0 -0
  141. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judges/__init__.py +0 -0
  142. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judges/base_judge.py +0 -0
  143. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judges/litellm_judge.py +0 -0
  144. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judges/mixture_of_judges.py +0 -0
  145. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judges/together_judge.py +0 -0
  146. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/judges/utils.py +0 -0
  147. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/rules.py +0 -0
  148. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/__init__.py +0 -0
  149. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/api_scorer.py +0 -0
  150. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/exceptions.py +0 -0
  151. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  152. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  153. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  154. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  155. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -0
  156. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  157. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  158. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  159. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  160. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  161. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  162. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  163. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  164. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  165. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  166. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  167. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  168. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  169. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  170. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  171. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  172. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  173. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/score.py +0 -0
  174. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/scorers/utils.py +0 -0
  175. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/tracer/__init__.py +0 -0
  176. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/utils/alerts.py +0 -0
  177. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/utils/data_utils.py +0 -0
  178. {judgeval-0.0.40 → judgeval-0.0.41}/src/judgeval/version_check.py +0 -0
  179. {judgeval-0.0.40 → judgeval-0.0.41}/update_version.py +0 -0
@@ -0,0 +1,19 @@
1
+ name: Check Blocked PR
2
+
3
+ on:
4
+ pull_request:
5
+ types:
6
+ - opened
7
+ - labeled
8
+ - unlabeled
9
+ - synchronize
10
+
11
+ jobs:
12
+ fail-for-blocked:
13
+ if: contains(github.event.pull_request.labels.*.name, 'Blocked')
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - name: Fail if PR is blocked
17
+ run: |
18
+ echo "This PR is currently blocked. Please unblock it before merging."
19
+ exit 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.40
3
+ Version: 0.0.41
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -37,11 +37,11 @@ Description-Content-Type: text/markdown
37
37
 
38
38
  <br>
39
39
 
40
- ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
40
+ ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
41
41
 
42
42
  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
43
43
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
44
- [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
44
+ [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)
45
45
 
46
46
  </div>
47
47
 
@@ -56,19 +56,28 @@ We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and a
56
56
  Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
57
57
 
58
58
  ## 📋 Table of Contents
59
- * [ Features](#-features)
60
- * [🔍 Tracing](#-tracing)
61
- * [🧪 Evals](#-evals)
62
- * [📡 Monitoring](#-monitoring)
63
- * [📊 Datasets](#-datasets)
64
- * [💡 Insights](#-insights)
65
- * [🛠️ Installation](#️-installation)
66
- * [🏁 Get Started](#-get-started)
67
- * [🏢 Self-Hosting](#-self-hosting)
68
- * [📚 Cookbooks](#-cookbooks)
69
- * [💻 Development with Cursor](#-development-with-cursor)
70
- * [ Star Us on GitHub](#-star-us-on-github)
71
- * [❤️ Contributors](#️-contributors)
59
+ - [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
60
+ - [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
61
+ - [📋 Table of Contents](#-table-of-contents)
62
+ - [ Features](#-features)
63
+ - [🛠️ Installation](#️-installation)
64
+ - [🏁 Get Started](#-get-started)
65
+ - [🛰️ Tracing](#️-tracing)
66
+ - [📝 Offline Evaluations](#-offline-evaluations)
67
+ - [📡 Online Evaluations](#-online-evaluations)
68
+ - [🏢 Self-Hosting](#-self-hosting)
69
+ - [Key Features](#key-features)
70
+ - [Getting Started](#getting-started)
71
+ - [📚 Cookbooks](#-cookbooks)
72
+ - [Sample Agents](#sample-agents)
73
+ - [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
74
+ - [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
75
+ - [Custom Evaluators](#custom-evaluators)
76
+ - [🔍 PII Detection](#-pii-detection)
77
+ - [📧 Cold Email Generation](#-cold-email-generation)
78
+ - [💻 Development with Cursor](#-development-with-cursor)
79
+ - [⭐ Star Us on GitHub](#-star-us-on-github)
80
+ - [❤️ Contributors](#️-contributors)
72
81
 
73
82
  <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
74
83
 
@@ -9,11 +9,11 @@
9
9
 
10
10
  <br>
11
11
 
12
- ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://judgment.mintlify.app/getting_started) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
12
+ ## [🌐 Landing Page](https://www.judgmentlabs.ai/) • [📚 Docs](https://docs.judgmentlabs.ai/introduction) • [🚀 Demos](https://www.youtube.com/@AlexShan-j3o)
13
13
 
14
14
  [![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
15
15
  [![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
16
- [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/FMxHkYTtFE)
16
+ [![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/ZCnSXYug)
17
17
 
18
18
  </div>
19
19
 
@@ -28,19 +28,28 @@ We support tracing agents built with LangGraph, OpenAI SDK, Anthropic, ... and a
28
28
  Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
29
29
 
30
30
  ## 📋 Table of Contents
31
- * [ Features](#-features)
32
- * [🔍 Tracing](#-tracing)
33
- * [🧪 Evals](#-evals)
34
- * [📡 Monitoring](#-monitoring)
35
- * [📊 Datasets](#-datasets)
36
- * [💡 Insights](#-insights)
37
- * [🛠️ Installation](#️-installation)
38
- * [🏁 Get Started](#-get-started)
39
- * [🏢 Self-Hosting](#-self-hosting)
40
- * [📚 Cookbooks](#-cookbooks)
41
- * [💻 Development with Cursor](#-development-with-cursor)
42
- * [ Star Us on GitHub](#-star-us-on-github)
43
- * [❤️ Contributors](#️-contributors)
31
+ - [🌐 Landing Page • 📚 Docs • 🚀 Demos](#-landing-page----docs---demos)
32
+ - [Judgeval: open-source testing, monitoring, and optimization for AI agents](#judgeval-open-source-testing-monitoring-and-optimization-for-ai-agents)
33
+ - [📋 Table of Contents](#-table-of-contents)
34
+ - [ Features](#-features)
35
+ - [🛠️ Installation](#️-installation)
36
+ - [🏁 Get Started](#-get-started)
37
+ - [🛰️ Tracing](#️-tracing)
38
+ - [📝 Offline Evaluations](#-offline-evaluations)
39
+ - [📡 Online Evaluations](#-online-evaluations)
40
+ - [🏢 Self-Hosting](#-self-hosting)
41
+ - [Key Features](#key-features)
42
+ - [Getting Started](#getting-started)
43
+ - [📚 Cookbooks](#-cookbooks)
44
+ - [Sample Agents](#sample-agents)
45
+ - [💰 LangGraph Financial QA Agent](#-langgraph-financial-qa-agent)
46
+ - [✈️ OpenAI Travel Agent](#️-openai-travel-agent)
47
+ - [Custom Evaluators](#custom-evaluators)
48
+ - [🔍 PII Detection](#-pii-detection)
49
+ - [📧 Cold Email Generation](#-cold-email-generation)
50
+ - [💻 Development with Cursor](#-development-with-cursor)
51
+ - [⭐ Star Us on GitHub](#-star-us-on-github)
52
+ - [❤️ Contributors](#️-contributors)
44
53
 
45
54
  <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
46
55
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.0.40"
3
+ version = "0.0.41"
4
4
  authors = [
5
5
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
6
6
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -5,7 +5,6 @@ Tracing system for judgeval that allows for function tracing using decorators.
5
5
  import asyncio
6
6
  import functools
7
7
  import inspect
8
- import json
9
8
  import os
10
9
  import site
11
10
  import sysconfig
@@ -16,6 +15,7 @@ import uuid
16
15
  import warnings
17
16
  import contextvars
18
17
  import sys
18
+ import json
19
19
  from contextlib import contextmanager, asynccontextmanager, AbstractAsyncContextManager, AbstractContextManager # Import context manager bases
20
20
  from dataclasses import dataclass, field
21
21
  from datetime import datetime
@@ -29,20 +29,16 @@ from typing import (
29
29
  Literal,
30
30
  Optional,
31
31
  Tuple,
32
- Type,
33
- TypeVar,
34
32
  Union,
35
33
  AsyncGenerator,
36
34
  TypeAlias,
37
- Set
38
35
  )
39
36
  from rich import print as rprint
40
- import types # <--- Add this import
37
+ import types
41
38
 
42
39
  # Third-party imports
43
40
  import requests
44
41
  from litellm import cost_per_token as _original_cost_per_token
45
- from pydantic import BaseModel
46
42
  from rich import print as rprint
47
43
  from openai import OpenAI, AsyncOpenAI
48
44
  from together import Together, AsyncTogether
@@ -64,8 +60,7 @@ from judgeval.data import Example, Trace, TraceSpan, TraceUsage
64
60
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
65
61
  from judgeval.rules import Rule
66
62
  from judgeval.evaluation_run import EvaluationRun
67
- from judgeval.data.result import ScoringResult
68
- from judgeval.common.utils import validate_api_key
63
+ from judgeval.common.utils import ExcInfo, validate_api_key
69
64
  from judgeval.common.exceptions import JudgmentAPIError
70
65
 
71
66
  # Standard library imports needed for the new class
@@ -307,7 +302,7 @@ class TraceClient:
307
302
  tracer: Optional["Tracer"],
308
303
  trace_id: Optional[str] = None,
309
304
  name: str = "default",
310
- project_name: str = "default_project",
305
+ project_name: str = None,
311
306
  overwrite: bool = False,
312
307
  rules: Optional[List[Rule]] = None,
313
308
  enable_monitoring: bool = True,
@@ -317,7 +312,7 @@ class TraceClient:
317
312
  ):
318
313
  self.name = name
319
314
  self.trace_id = trace_id or str(uuid.uuid4())
320
- self.project_name = project_name
315
+ self.project_name = project_name or str(uuid.uuid4())
321
316
  self.overwrite = overwrite
322
317
  self.tracer = tracer
323
318
  self.rules = rules or []
@@ -507,6 +502,28 @@ class TraceClient:
507
502
  span = self.span_id_to_span[current_span_id]
508
503
  span.agent_name = agent_name
509
504
 
505
+ def record_state_before(self, state: dict):
506
+ """Records the agent's state before a tool execution on the current span.
507
+
508
+ Args:
509
+ state: A dictionary representing the agent's state.
510
+ """
511
+ current_span_id = current_span_var.get()
512
+ if current_span_id:
513
+ span = self.span_id_to_span[current_span_id]
514
+ span.state_before = state
515
+
516
+ def record_state_after(self, state: dict):
517
+ """Records the agent's state after a tool execution on the current span.
518
+
519
+ Args:
520
+ state: A dictionary representing the agent's state.
521
+ """
522
+ current_span_id = current_span_var.get()
523
+ if current_span_id:
524
+ span = self.span_id_to_span[current_span_id]
525
+ span.state_after = state
526
+
510
527
  async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
511
528
  """Helper method to update the output of a trace entry once the coroutine completes"""
512
529
  try:
@@ -540,7 +557,7 @@ class TraceClient:
540
557
  # Removed else block - original didn't have one
541
558
  return None # Return None if no span_id found
542
559
 
543
- def record_error(self, error: Any):
560
+ def record_error(self, error: Dict[str, Any]):
544
561
  current_span_id = current_span_var.get()
545
562
  if current_span_id:
546
563
  span = self.span_id_to_span[current_span_id]
@@ -579,7 +596,7 @@ class TraceClient:
579
596
  "project_name": self.project_name,
580
597
  "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
581
598
  "duration": total_duration,
582
- "entries": [span.model_dump() for span in self.trace_spans],
599
+ "trace_spans": [span.model_dump() for span in self.trace_spans],
583
600
  "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
584
601
  "overwrite": overwrite,
585
602
  "offline_mode": self.tracer.offline_mode,
@@ -599,7 +616,7 @@ class TraceClient:
599
616
  def delete(self):
600
617
  return self.trace_manager_client.delete_trace(self.trace_id)
601
618
 
602
- def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: Tuple[Optional[type], Optional[BaseException], Optional[types.TracebackType]]):
619
+ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_info: ExcInfo):
603
620
  if not current_trace:
604
621
  return
605
622
 
@@ -609,6 +626,27 @@ def _capture_exception_for_trace(current_trace: Optional['TraceClient'], exc_inf
609
626
  "message": str(exc_value) if exc_value else "No exception message",
610
627
  "traceback": traceback.format_tb(exc_traceback_obj) if exc_traceback_obj else []
611
628
  }
629
+
630
+ # This is where we specially handle exceptions that we might want to collect additional data for.
631
+ # When we do this, always try checking the module from sys.modules instead of importing. This will
632
+ # Let us support a wider range of exceptions without needing to import them for all clients.
633
+
634
+ # Most clients (requests, httpx, urllib) support the standard format of exposing error.request.url and error.response.status_code
635
+ # The alternative is to hand select libraries we want from sys.modules and check for them:
636
+ # As an example: requests_module = sys.modules.get("requests", None) // then do things with requests_module;
637
+
638
+ # General HTTP Like errors
639
+ try:
640
+ url = getattr(getattr(exc_value, "request", None), "url", None)
641
+ status_code = getattr(getattr(exc_value, "response", None), "status_code", None)
642
+ if status_code:
643
+ formatted_exception["http"] = {
644
+ "url": url if url else "Unknown URL",
645
+ "status_code": status_code if status_code else None,
646
+ }
647
+ except Exception as e:
648
+ pass
649
+
612
650
  current_trace.record_error(formatted_exception)
613
651
  class _DeepTracer:
614
652
  _instance: Optional["_DeepTracer"] = None
@@ -907,7 +945,7 @@ class Tracer:
907
945
  def __init__(
908
946
  self,
909
947
  api_key: str = os.getenv("JUDGMENT_API_KEY"),
910
- project_name: str = "default_project",
948
+ project_name: str = None,
911
949
  rules: Optional[List[Rule]] = None, # Added rules parameter
912
950
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
913
951
  enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
@@ -935,7 +973,7 @@ class Tracer:
935
973
  raise ValueError("S3 bucket name must be provided when use_s3 is True")
936
974
 
937
975
  self.api_key: str = api_key
938
- self.project_name: str = project_name
976
+ self.project_name: str = project_name or str(uuid.uuid4())
939
977
  self.organization_id: str = organization_id
940
978
  self._current_trace: Optional[str] = None
941
979
  self._active_trace_client: Optional[TraceClient] = None # Add active trace client attribute
@@ -1068,32 +1106,92 @@ class Tracer:
1068
1106
 
1069
1107
  rprint(f"[bold]{label}:[/bold] {msg}")
1070
1108
 
1071
- def identify(self, identifier: str):
1109
+ def identify(self, identifier: str, track_state: bool = False, track_attributes: Optional[List[str]] = None, field_mappings: Optional[Dict[str, str]] = None):
1072
1110
  """
1073
- Class decorator that associates a class with a custom identifier.
1111
+ Class decorator that associates a class with a custom identifier and enables state tracking.
1074
1112
 
1075
1113
  This decorator creates a mapping between the class name and the provided
1076
1114
  identifier, which can be useful for tagging, grouping, or referencing
1077
- classes in a standardized way.
1115
+ classes in a standardized way. It also enables automatic state capture
1116
+ for instances of the decorated class when used with tracing.
1078
1117
 
1079
1118
  Args:
1080
- identifier: The identifier to associate with the decorated class
1081
-
1082
- Returns:
1083
- A decorator function that registers the class with the given identifier
1119
+ identifier: The identifier to associate with the decorated class.
1120
+ This will be used as the instance name in traces.
1121
+ track_state: Whether to automatically capture the state (attributes)
1122
+ of instances before and after function execution. Defaults to False.
1123
+ track_attributes: Optional list of specific attribute names to track.
1124
+ If None, all non-private attributes (not starting with '_')
1125
+ will be tracked when track_state=True.
1126
+ field_mappings: Optional dictionary mapping internal attribute names to
1127
+ display names in the captured state. For example:
1128
+ {"system_prompt": "instructions"} will capture the
1129
+ 'instructions' attribute as 'system_prompt' in the state.
1084
1130
 
1085
1131
  Example:
1086
- @tracer.identify(identifier="user_model")
1132
+ @tracer.identify(identifier="user_model", track_state=True, track_attributes=["name", "age"], field_mappings={"system_prompt": "instructions"})
1087
1133
  class User:
1088
1134
  # Class implementation
1089
1135
  """
1090
1136
  def decorator(cls):
1091
1137
  class_name = cls.__name__
1092
- self.class_identifiers[class_name] = identifier
1138
+ self.class_identifiers[class_name] = {
1139
+ "identifier": identifier,
1140
+ "track_state": track_state,
1141
+ "track_attributes": track_attributes,
1142
+ "field_mappings": field_mappings or {}
1143
+ }
1093
1144
  return cls
1094
1145
 
1095
1146
  return decorator
1096
1147
 
1148
+ def _capture_instance_state(self, instance: Any, class_config: Dict[str, Any]) -> Dict[str, Any]:
1149
+ """
1150
+ Capture the state of an instance based on class configuration.
1151
+ Args:
1152
+ instance: The instance to capture the state of.
1153
+ class_config: Configuration dictionary for state capture,
1154
+ expected to contain 'track_attributes' and 'field_mappings'.
1155
+ """
1156
+ track_attributes = class_config.get('track_attributes')
1157
+ field_mappings = class_config.get('field_mappings')
1158
+
1159
+ if track_attributes:
1160
+
1161
+ state = {attr: getattr(instance, attr, None) for attr in track_attributes}
1162
+ else:
1163
+
1164
+ state = {k: v for k, v in instance.__dict__.items() if not k.startswith('_')}
1165
+
1166
+ if field_mappings:
1167
+ state['field_mappings'] = field_mappings
1168
+
1169
+ return state
1170
+
1171
+
1172
+ def _get_instance_state_if_tracked(self, args):
1173
+ """
1174
+ Extract instance state if the instance should be tracked.
1175
+
1176
+ Returns the captured state dict if tracking is enabled, None otherwise.
1177
+ """
1178
+ if args and hasattr(args[0], '__class__'):
1179
+ instance = args[0]
1180
+ class_name = instance.__class__.__name__
1181
+ if (class_name in self.class_identifiers and
1182
+ isinstance(self.class_identifiers[class_name], dict) and
1183
+ self.class_identifiers[class_name].get('track_state', False)):
1184
+ return self._capture_instance_state(instance, self.class_identifiers[class_name])
1185
+
1186
+ def _conditionally_capture_and_record_state(self, trace_client_instance: TraceClient, args: tuple, is_before: bool):
1187
+ """Captures instance state if tracked and records it via the trace_client."""
1188
+ state = self._get_instance_state_if_tracked(args)
1189
+ if state:
1190
+ if is_before:
1191
+ trace_client_instance.record_state_before(state)
1192
+ else:
1193
+ trace_client_instance.record_state_after(state)
1194
+
1097
1195
  def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
1098
1196
  """
1099
1197
  Decorator to trace function execution with detailed entry/exit information.
@@ -1171,6 +1269,9 @@ class Tracer:
1171
1269
  span.record_input(inputs)
1172
1270
  if agent_name:
1173
1271
  span.record_agent_name(agent_name)
1272
+
1273
+ # Capture state before execution
1274
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
1174
1275
 
1175
1276
  if use_deep_tracing:
1176
1277
  with _DeepTracer():
@@ -1181,7 +1282,10 @@ class Tracer:
1181
1282
  except Exception as e:
1182
1283
  _capture_exception_for_trace(current_trace, sys.exc_info())
1183
1284
  raise e
1184
-
1285
+
1286
+ # Capture state after execution
1287
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
1288
+
1185
1289
  # Record output
1186
1290
  span.record_output(result)
1187
1291
  return result
@@ -1199,6 +1303,9 @@ class Tracer:
1199
1303
  if agent_name:
1200
1304
  span.record_agent_name(agent_name)
1201
1305
 
1306
+ # Capture state before execution
1307
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
1308
+
1202
1309
  if use_deep_tracing:
1203
1310
  with _DeepTracer():
1204
1311
  result = await func(*args, **kwargs)
@@ -1208,6 +1315,9 @@ class Tracer:
1208
1315
  except Exception as e:
1209
1316
  _capture_exception_for_trace(current_trace, sys.exc_info())
1210
1317
  raise e
1318
+
1319
+ # Capture state after execution
1320
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
1211
1321
 
1212
1322
  span.record_output(result)
1213
1323
  return result
@@ -1258,6 +1368,9 @@ class Tracer:
1258
1368
  span.record_input(inputs)
1259
1369
  if agent_name:
1260
1370
  span.record_agent_name(agent_name)
1371
+ # Capture state before execution
1372
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
1373
+
1261
1374
  if use_deep_tracing:
1262
1375
  with _DeepTracer():
1263
1376
  result = func(*args, **kwargs)
@@ -1267,6 +1380,10 @@ class Tracer:
1267
1380
  except Exception as e:
1268
1381
  _capture_exception_for_trace(current_trace, sys.exc_info())
1269
1382
  raise e
1383
+
1384
+ # Capture state after execution
1385
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
1386
+
1270
1387
 
1271
1388
  # Record output
1272
1389
  span.record_output(result)
@@ -1286,6 +1403,9 @@ class Tracer:
1286
1403
  if agent_name:
1287
1404
  span.record_agent_name(agent_name)
1288
1405
 
1406
+ # Capture state before execution
1407
+ self._conditionally_capture_and_record_state(span, args, is_before=True)
1408
+
1289
1409
  if use_deep_tracing:
1290
1410
  with _DeepTracer():
1291
1411
  result = func(*args, **kwargs)
@@ -1296,6 +1416,9 @@ class Tracer:
1296
1416
  _capture_exception_for_trace(current_trace, sys.exc_info())
1297
1417
  raise e
1298
1418
 
1419
+ # Capture state after execution
1420
+ self._conditionally_capture_and_record_state(span, args, is_before=False)
1421
+
1299
1422
  span.record_output(result)
1300
1423
  return result
1301
1424
 
@@ -1369,13 +1492,6 @@ def wrap(client: Any) -> Any:
1369
1492
  span.record_usage(usage)
1370
1493
  return response
1371
1494
 
1372
- def _handle_error(span, e, is_async):
1373
- """Handle and record errors"""
1374
- call_type = "async" if is_async else "sync"
1375
- print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
1376
- span.record_output({"error": str(e)})
1377
- raise
1378
-
1379
1495
  # --- Traced Async Functions ---
1380
1496
  async def traced_create_async(*args, **kwargs):
1381
1497
  current_trace = current_trace_var.get()
@@ -1389,7 +1505,8 @@ def wrap(client: Any) -> Any:
1389
1505
  response_or_iterator = await original_create(*args, **kwargs)
1390
1506
  return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
1391
1507
  except Exception as e:
1392
- return _handle_error(span, e, True)
1508
+ _capture_exception_for_trace(span, sys.exc_info())
1509
+ raise e
1393
1510
 
1394
1511
  # Async responses for OpenAI clients
1395
1512
  async def traced_response_create_async(*args, **kwargs):
@@ -1404,7 +1521,8 @@ def wrap(client: Any) -> Any:
1404
1521
  response_or_iterator = await original_responses_create(*args, **kwargs)
1405
1522
  return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
1406
1523
  except Exception as e:
1407
- return _handle_error(span, e, True)
1524
+ _capture_exception_for_trace(span, sys.exc_info())
1525
+ raise e
1408
1526
 
1409
1527
  # Function replacing .stream() for async clients
1410
1528
  def traced_stream_async(*args, **kwargs):
@@ -1435,7 +1553,8 @@ def wrap(client: Any) -> Any:
1435
1553
  response_or_iterator = original_create(*args, **kwargs)
1436
1554
  return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
1437
1555
  except Exception as e:
1438
- return _handle_error(span, e, False)
1556
+ _capture_exception_for_trace(span, sys.exc_info())
1557
+ raise e
1439
1558
 
1440
1559
  def traced_response_create_sync(*args, **kwargs):
1441
1560
  current_trace = current_trace_var.get()
@@ -1449,7 +1568,8 @@ def wrap(client: Any) -> Any:
1449
1568
  response_or_iterator = original_responses_create(*args, **kwargs)
1450
1569
  return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
1451
1570
  except Exception as e:
1452
- return _handle_error(span, e, False)
1571
+ _capture_exception_for_trace(span, sys.exc_info())
1572
+ raise e
1453
1573
 
1454
1574
  # Function replacing sync .stream()
1455
1575
  def traced_stream_sync(*args, **kwargs):
@@ -1990,10 +2110,12 @@ def get_instance_prefixed_name(instance, class_name, class_identifiers):
1990
2110
  Otherwise, returns None.
1991
2111
  """
1992
2112
  if class_name in class_identifiers:
1993
- attr = class_identifiers[class_name]
2113
+ class_config = class_identifiers[class_name]
2114
+ attr = class_config['identifier']
2115
+
1994
2116
  if hasattr(instance, attr):
1995
2117
  instance_name = getattr(instance, attr)
1996
2118
  return instance_name
1997
2119
  else:
1998
- raise Exception(f"Attribute {class_identifiers[class_name]} does not exist for {class_name}. Check your identify() decorator.")
2120
+ raise Exception(f"Attribute {attr} does not exist for {class_name}. Check your identify() decorator.")
1999
2121
  return None
@@ -12,9 +12,10 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
12
12
  import asyncio
13
13
  import concurrent.futures
14
14
  import os
15
+ from types import TracebackType
15
16
  import requests
16
17
  import pprint
17
- from typing import Any, Dict, List, Literal, Mapping, Optional, Union
18
+ from typing import Any, Dict, List, Literal, Mapping, Optional, TypeAlias, Union
18
19
 
19
20
  # Third-party imports
20
21
  import litellm
@@ -782,3 +783,6 @@ if __name__ == "__main__":
782
783
  ]
783
784
  ]
784
785
  ))
786
+
787
+ ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
788
+ OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
@@ -5,14 +5,15 @@ import json
5
5
  import os
6
6
  import yaml
7
7
  from dataclasses import dataclass, field
8
- from typing import List, Union, Literal
8
+ from typing import List, Union, Literal, Optional
9
9
 
10
- from judgeval.data import Example
10
+ from judgeval.data import Example, Trace
11
11
  from judgeval.common.logger import debug, error, warning, info
12
12
 
13
13
  @dataclass
14
14
  class EvalDataset:
15
15
  examples: List[Example]
16
+ traces: List[Trace]
16
17
  _alias: Union[str, None] = field(default=None)
17
18
  _id: Union[str, None] = field(default=None)
18
19
  judgment_api_key: str = field(default="")
@@ -20,12 +21,13 @@ class EvalDataset:
20
21
  def __init__(self,
21
22
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
22
23
  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
23
- examples: List[Example] = [],
24
+ examples: Optional[List[Example]] = None,
25
+ traces: Optional[List[Trace]] = None
24
26
  ):
25
- debug(f"Initializing EvalDataset with {len(examples)} examples")
26
27
  if not judgment_api_key:
27
28
  warning("No judgment_api_key provided")
28
- self.examples = examples
29
+ self.examples = examples or []
30
+ self.traces = traces or []
29
31
  self._alias = None
30
32
  self._id = None
31
33
  self.judgment_api_key = judgment_api_key
@@ -218,8 +220,11 @@ class EvalDataset:
218
220
  self.add_example(e)
219
221
 
220
222
  def add_example(self, e: Example) -> None:
221
- self.examples = self.examples + [e]
223
+ self.examples.append(e)
222
224
  # TODO if we need to add rank, then we need to do it here
225
+
226
+ def add_trace(self, t: Trace) -> None:
227
+ self.traces.append(t)
223
228
 
224
229
  def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
225
230
  """
@@ -307,6 +312,7 @@ class EvalDataset:
307
312
  return (
308
313
  f"{self.__class__.__name__}("
309
314
  f"examples={self.examples}, "
315
+ f"traces={self.traces}, "
310
316
  f"_alias={self._alias}, "
311
317
  f"_id={self._id}"
312
318
  f")"
@@ -13,7 +13,7 @@ from judgeval.constants import (
13
13
  JUDGMENT_DATASETS_INSERT_API_URL,
14
14
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
15
15
  )
16
- from judgeval.data import Example
16
+ from judgeval.data import Example, Trace
17
17
  from judgeval.data.datasets import EvalDataset
18
18
 
19
19
 
@@ -58,6 +58,7 @@ class EvalDatasetClient:
58
58
  "dataset_alias": alias,
59
59
  "project_name": project_name,
60
60
  "examples": [e.to_dict() for e in dataset.examples],
61
+ "traces": [t.model_dump() for t in dataset.traces],
61
62
  "overwrite": overwrite,
62
63
  }
63
64
  try:
@@ -202,6 +203,7 @@ class EvalDatasetClient:
202
203
  info(f"Successfully pulled dataset with alias '{alias}'")
203
204
  payload = response.json()
204
205
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
206
+ dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
205
207
  dataset._alias = payload.get("alias")
206
208
  dataset._id = payload.get("id")
207
209
  progress.update(