judgeval 0.0.26__tar.gz → 0.0.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. {judgeval-0.0.26 → judgeval-0.0.27}/PKG-INFO +1 -1
  2. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/custom_scorers.mdx +29 -3
  3. {judgeval-0.0.26 → judgeval-0.0.27}/pyproject.toml +1 -1
  4. judgeval-0.0.27/src/demo/new_trace/example_complex_async.py +232 -0
  5. {judgeval-0.0.26 → judgeval-0.0.27}/src/demo/travel_agent.py +1 -1
  6. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/tracer.py +476 -161
  7. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/constants.py +4 -2
  8. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/__init__.py +0 -3
  9. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/datasets/eval_dataset_client.py +59 -20
  10. judgeval-0.0.27/src/judgeval/data/result.py +76 -0
  11. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judgment_client.py +47 -15
  12. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/run_evaluation.py +20 -36
  13. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/score.py +9 -11
  14. judgeval-0.0.26/src/judgeval/data/api_example.py +0 -98
  15. judgeval-0.0.26/src/judgeval/data/result.py +0 -98
  16. {judgeval-0.0.26 → judgeval-0.0.27}/.github/workflows/ci.yaml +0 -0
  17. {judgeval-0.0.26 → judgeval-0.0.27}/.gitignore +0 -0
  18. {judgeval-0.0.26 → judgeval-0.0.27}/LICENSE.md +0 -0
  19. {judgeval-0.0.26 → judgeval-0.0.27}/Pipfile +0 -0
  20. {judgeval-0.0.26 → judgeval-0.0.27}/Pipfile.lock +0 -0
  21. {judgeval-0.0.26 → judgeval-0.0.27}/README.md +0 -0
  22. {judgeval-0.0.26 → judgeval-0.0.27}/docs/README.md +0 -0
  23. {judgeval-0.0.26 → judgeval-0.0.27}/docs/api_reference/judgment_client.mdx +0 -0
  24. {judgeval-0.0.26 → judgeval-0.0.27}/docs/api_reference/trace.mdx +0 -0
  25. {judgeval-0.0.26 → judgeval-0.0.27}/docs/development.mdx +0 -0
  26. {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/code.mdx +0 -0
  27. {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/images.mdx +0 -0
  28. {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/markdown.mdx +0 -0
  29. {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/navigation.mdx +0 -0
  30. {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/reusable-snippets.mdx +0 -0
  31. {judgeval-0.0.26 → judgeval-0.0.27}/docs/essentials/settings.mdx +0 -0
  32. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/data_datasets.mdx +0 -0
  33. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/data_examples.mdx +0 -0
  34. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/introduction.mdx +0 -0
  35. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/judges.mdx +0 -0
  36. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  37. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  38. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  39. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/comparison.mdx +0 -0
  40. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  41. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  42. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  43. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/execution_order.mdx +0 -0
  44. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  45. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/groundedness.mdx +0 -0
  46. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/hallucination.mdx +0 -0
  47. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/introduction.mdx +0 -0
  48. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  49. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/scorers/summarization.mdx +0 -0
  50. {judgeval-0.0.26 → judgeval-0.0.27}/docs/evaluation/unit_testing.mdx +0 -0
  51. {judgeval-0.0.26 → judgeval-0.0.27}/docs/favicon.svg +0 -0
  52. {judgeval-0.0.26 → judgeval-0.0.27}/docs/getting_started.mdx +0 -0
  53. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/basic_trace_example.png +0 -0
  54. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/checks-passed.png +0 -0
  55. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/create_aggressive_scorer.png +0 -0
  56. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/create_scorer.png +0 -0
  57. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/evaluation_diagram.png +0 -0
  58. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/hero-dark.svg +0 -0
  59. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/hero-light.svg +0 -0
  60. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/online_eval_fault.png +0 -0
  61. {judgeval-0.0.26 → judgeval-0.0.27}/docs/images/trace_ss.png +0 -0
  62. {judgeval-0.0.26 → judgeval-0.0.27}/docs/integration/langgraph.mdx +0 -0
  63. {judgeval-0.0.26 → judgeval-0.0.27}/docs/introduction.mdx +0 -0
  64. {judgeval-0.0.26 → judgeval-0.0.27}/docs/judgment/introduction.mdx +0 -0
  65. {judgeval-0.0.26 → judgeval-0.0.27}/docs/logo/dark.svg +0 -0
  66. {judgeval-0.0.26 → judgeval-0.0.27}/docs/logo/light.svg +0 -0
  67. {judgeval-0.0.26 → judgeval-0.0.27}/docs/mint.json +0 -0
  68. {judgeval-0.0.26 → judgeval-0.0.27}/docs/monitoring/introduction.mdx +0 -0
  69. {judgeval-0.0.26 → judgeval-0.0.27}/docs/monitoring/production_insights.mdx +0 -0
  70. {judgeval-0.0.26 → judgeval-0.0.27}/docs/monitoring/tracing.mdx +0 -0
  71. {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/create_dataset.ipynb +0 -0
  72. {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/create_scorer.ipynb +0 -0
  73. {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/demo.ipynb +0 -0
  74. {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/prompt_scorer.ipynb +0 -0
  75. {judgeval-0.0.26 → judgeval-0.0.27}/docs/notebooks/quickstart.ipynb +0 -0
  76. {judgeval-0.0.26 → judgeval-0.0.27}/docs/quickstart.mdx +0 -0
  77. {judgeval-0.0.26 → judgeval-0.0.27}/docs/snippets/snippet-intro.mdx +0 -0
  78. {judgeval-0.0.26 → judgeval-0.0.27}/pytest.ini +0 -0
  79. {judgeval-0.0.26 → judgeval-0.0.27}/src/demo/demo.py +0 -0
  80. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/__init__.py +0 -0
  81. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/clients.py +0 -0
  82. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/__init__.py +0 -0
  83. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/exceptions.py +0 -0
  84. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/logger.py +0 -0
  85. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/common/utils.py +0 -0
  86. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/datasets/__init__.py +0 -0
  87. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/datasets/dataset.py +0 -0
  88. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/example.py +0 -0
  89. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/data/scorer_data.py +0 -0
  90. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/evaluation_run.py +0 -0
  91. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/integrations/langgraph.py +0 -0
  92. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/__init__.py +0 -0
  93. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/base_judge.py +0 -0
  94. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/litellm_judge.py +0 -0
  95. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/mixture_of_judges.py +0 -0
  96. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/together_judge.py +0 -0
  97. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/judges/utils.py +0 -0
  98. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/rules.py +0 -0
  99. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/__init__.py +0 -0
  100. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/api_scorer.py +0 -0
  101. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/base_scorer.py +0 -0
  102. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/exceptions.py +0 -0
  103. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  104. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  105. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  106. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  107. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  108. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  109. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  110. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  111. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  112. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  113. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  114. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  115. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  116. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  117. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  118. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  119. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  120. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  121. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  122. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  123. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  124. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  125. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  126. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  127. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  128. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  129. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  130. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
  131. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
  132. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  133. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  134. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  135. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  136. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  137. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  138. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  139. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  140. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  141. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
  142. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
  143. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  144. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  145. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  146. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  147. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  148. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  149. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
  150. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
  151. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
  152. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  153. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  154. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  155. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  156. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/prompt_scorer.py +0 -0
  157. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/scorers/utils.py +0 -0
  158. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/tracer/__init__.py +0 -0
  159. {judgeval-0.0.26 → judgeval-0.0.27}/src/judgeval/utils/alerts.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.26
3
+ Version: 0.0.27
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -4,6 +4,7 @@ description: ""
4
4
  ---
5
5
 
6
6
  If none of `judgeval`'s built-in scorers fit your evaluation criteria, you can easily build your own custom metric to be run through a `JudgevalScorer`.
7
+
7
8
  `JudgevalScorer`s are **automatically integrated** within `judgeval`'s infrastructure, so you can:
8
9
  - Run your own scorer with the same syntax as any other `judgeval` scorer.
9
10
  - Use `judgeval`'s batched evaluation infrastructure to execute **scalable evaluation runs**.
@@ -78,7 +79,6 @@ You can optionally set the self.reason attribute, depending on your preference.
78
79
  </Note>
79
80
 
80
81
  These methods are the core of your scorer, and you can implement them in any way you want. **Be creative!**
81
- Check out this list of examples our users have implemented if you need inspiration: TODO add link here
82
82
 
83
83
  #### Handling Errors
84
84
  If you want to handle errors gracefully, you can use a `try` block and in the `except` block, set the `self.error` attribute to the error message.
@@ -144,11 +144,37 @@ class SampleScorer(JudgevalScorer):
144
144
  def __name__(self):
145
145
  return "Sample Scorer"
146
146
  ```
147
-
148
147
  **Congratulations!** 🎉
149
148
 
150
149
  You've made your first custom judgeval scorer! Now that your scorer is implemented, you can run it on your own datasets
151
150
  just like any other `judgeval` scorer. Your scorer is fully integrated with `judgeval`'s infrastructure so you can view it on
152
151
  the [Judgment platform](/judgment/introduction) too.
153
152
 
154
- For more examples, check out some of the custom scorers our users have implemented: TODO add link here.
153
+ ## Using a Custom Scorer
154
+
155
+ Once you've implemented your custom scorer, you can use it in the same way as any other scorer in `judgeval`.
156
+ They can be run in conjunction with other scorers in a single evaluation run!
157
+
158
+ ```python run_custom_scorer.py
159
+ from judgeval import JudgmentClient
160
+ from your_custom_scorer import SampleScorer
161
+
162
+ client = JudgmentClient()
163
+ sample_scorer = SampleScorer()
164
+
165
+ results = client.run_evaluation(
166
+ examples=[example1],
167
+ scorers=[sample_scorer],
168
+ model="gpt-4o"
169
+ )
170
+ ```
171
+
172
+ ## Real World Examples
173
+
174
+ You can find some real world examples of how our community has used custom `JudgevalScorer`s to evaluate their LLM systems in our [cookbook repository](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/custom_scorers)!
175
+ Here are some of our favorites:
176
+
177
+ - [Code Style Scorer](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/code_style_scorer.py) - Evaluates code quality and style
178
+ - [Cold Email Scorer](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py) - Evaluates the effectiveness of cold emails
179
+
180
+ For more examples and detailed documentation on custom scorers, check out our [Custom Scorers Cookbook](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/README.md).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.0.26"
3
+ version = "0.0.27"
4
4
  authors = [
5
5
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
6
6
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -0,0 +1,232 @@
1
+ import asyncio
2
+ import time
3
+ import sys
4
+ import os
5
+ import functools
6
+ from unittest.mock import MagicMock, patch
7
+ from typing import Dict, Optional, List
8
+ import uuid
9
+ import json
10
+
11
+ # Standard library imports needed for the new class
12
+ import concurrent.futures
13
+ import contextvars
14
+ # Needed for partial in the executor
15
+
16
+ # Add src directory to Python path for imports
17
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
18
+
19
+ # Import and mock necessary components before initializing the tracer
20
+ from judgeval.common.tracer import Tracer, JudgmentClient, TraceClient, current_trace_var, TraceEntry, TraceManagerClient, TraceThreadPoolExecutor # Import the new class
21
+
22
+ # Initialize the tracer with test values
23
+ tracer = Tracer(
24
+ project_name="complex_async_test"
25
+ )
26
+
27
+ # In this example, we'll use a single trace with spans for all function calls
28
+ @tracer.observe(name="root_function")
29
+ async def root_function():
30
+ print("Root function starting")
31
+
32
+ # Direct await call to level 2
33
+ result1 = await level2_function("direct")
34
+
35
+ # Parallel calls (gather) to level 2 functions
36
+ # These should be level 2 - direct children of root
37
+ # Create two truly parallel functions that both have root_function as parent
38
+ level2_parallel1_task = level2_parallel1("gather1")
39
+ level2_parallel2_task = level2_parallel2("gather2")
40
+
41
+ # Use trace_gather instead of asyncio.gather to preserve context
42
+ # This ensures parent-child relationships are maintained in parallel tasks
43
+ # result2, result3 = await trace_gather(level2_parallel1_task, level2_parallel2_task) # OLD
44
+ result2, result3 = await asyncio.gather(level2_parallel1_task, level2_parallel2_task) # Use standard gather
45
+
46
+
47
+ print("Root function completed")
48
+ return f"Root results: {result1}, {result2}, {result3}"
49
+
50
+ # Level 2 - Direct child of root
51
+ # Using observe with same tracer - this will create spans in the parent trace
52
+ @tracer.observe()
53
+ async def level2_function(param):
54
+ # Capture this function in a span within the current trace
55
+ print(f"Level 2 function with {param}")
56
+
57
+ # Call to level 3
58
+ result = await level3_function(f"{param}_child")
59
+
60
+ return f"level2:{result}"
61
+
62
+ # Level 2 - First parallel function
63
+ @tracer.observe()
64
+ async def level2_parallel1(param):
65
+ # Capture this function in a span within the current trace
66
+ print(f"Level 2 parallel 1 with {param}")
67
+
68
+ # This parallel function makes another parallel call to level 3 functions
69
+ # These should be direct children of level2_parallel1
70
+ # r1, r2 = await trace_gather( # OLD
71
+ r1, r2 = await asyncio.gather( # Use standard gather
72
+ level3_parallel1(f"{param}_1"),
73
+ level3_parallel2(f"{param}_2")
74
+ )
75
+
76
+ return f"level2_parallel1:{r1},{r2}"
77
+
78
+ # Level 2 - Second parallel function
79
+ @tracer.observe()
80
+ async def level2_parallel2(param):
81
+ # Capture this function in a span within the current trace
82
+ print(f"Level 2 parallel 2 with {param}")
83
+
84
+ # Direct await to level 3
85
+ result = await level3_function(f"{param}_direct")
86
+
87
+ return f"level2_parallel2:{result}"
88
+
89
+ # Level 3 - Child of level 2 direct
90
+ @tracer.observe()
91
+ async def level3_function(param):
92
+ # Capture this function in a span within the current trace
93
+ print(f"Level 3 function with {param}")
94
+
95
+ # Call to level 4
96
+ result = await level4_function(f"{param}_deep")
97
+
98
+ return f"level3:{result}"
99
+
100
+ # Level 3 - First parallel function called by level2_parallel1
101
+ @tracer.observe()
102
+ async def level3_parallel1(param):
103
+ # Capture this function in a span within the current trace
104
+ print(f"Level 3 parallel 1 with {param}")
105
+
106
+ # This makes a nested gather call with level 4 functions
107
+ # results = await trace_gather( # OLD
108
+ results = await asyncio.gather( # Use standard gather
109
+ level4_function(f"{param}_a"),
110
+ level4_function(f"{param}_b"),
111
+ level4_function(f"{param}_c")
112
+ )
113
+
114
+ return f"level3_p1:{','.join(results)}"
115
+
116
+ # Level 3 - Second parallel function called by level2_parallel1
117
+ @tracer.observe()
118
+ async def level3_parallel2(param):
119
+ # Capture this function in a span within the current trace
120
+ print(f"Level 3 parallel 2 with {param}")
121
+ await asyncio.sleep(0.1)
122
+
123
+ # Direct call to level 4
124
+ result = await level4_deep_function(f"{param}_deep")
125
+
126
+ return f"level3_p2:{result}"
127
+
128
+ # Level 4 - Deepest regular function
129
+ @tracer.observe()
130
+ async def level4_function(param):
131
+ # Capture this function in a span within the current trace
132
+ print(f"Level 4 function with {param}")
133
+ await asyncio.sleep(0.05)
134
+
135
+ return f"level4:{param}"
136
+
137
+ # Level 4 - Deep function that calls level 5
138
+ @tracer.observe()
139
+ async def level4_deep_function(param):
140
+ # Capture this function in a span within the current trace
141
+ print(f"Level 4 deep function with {param}")
142
+
143
+ # Call to level 5 (maximum depth)
144
+ result = await level5_function(f"{param}_final")
145
+ test = await fib(5)
146
+ return f"level4_deep:{result}"
147
+
148
+ @tracer.observe()
149
+ async def fib(n):
150
+ if n <= 1:
151
+ return n
152
+ return await fib(n-1) + await fib(n-2)
153
+
154
+ # Level 5 - Deepest level
155
+ @tracer.observe()
156
+ async def level5_function(param):
157
+ # Capture this function in a span within the current trace
158
+ print(f"Level 5 function with {param}")
159
+ await asyncio.sleep(0.05)
160
+
161
+ return f"level5:{param}"
162
+
163
+ # --- Synchronous ThreadPoolExecutor Test ---
164
+
165
+ @tracer.observe(name="sync_child_task1")
166
+ def sync_child_task1(param):
167
+ """A simple synchronous function to be run in a thread."""
168
+ print(f"SYNC CHILD 1: Received {param}. Sleeping...")
169
+ time.sleep(0.15)
170
+ result = f"Result from sync_child_task1 with {param}"
171
+ print("SYNC CHILD 1: Done.")
172
+ return result
173
+
174
+ @tracer.observe(name="sync_child_task2")
175
+ def sync_child_task2(param1, param2):
176
+ """Another simple synchronous function."""
177
+ print(f"SYNC CHILD 2: Received {param1} and {param2}. Sleeping...")
178
+ time.sleep(0.05)
179
+ result = f"Result from sync_child_task2 with {param1}, {param2}"
180
+ print("SYNC CHILD 2: Done.")
181
+ return result
182
+
183
+ @tracer.observe(name="sync_parent_func")
184
+ def sync_parent_func():
185
+ """This function uses TraceThreadPoolExecutor to run sync tasks."""
186
+ print("SYNC PARENT: Starting...")
187
+ results = []
188
+ # Use the TraceThreadPoolExecutor instead of the standard one
189
+ with TraceThreadPoolExecutor(max_workers=2) as executor:
190
+ print("SYNC PARENT: Submitting tasks to TraceThreadPoolExecutor...")
191
+ future1 = executor.submit(sync_child_task1, "data_for_task1")
192
+ future2 = executor.submit(sync_child_task2, "data1_for_task2", "data2_for_task2")
193
+
194
+ print("SYNC PARENT: Waiting for futures...")
195
+ # Wait for futures and collect results (demonstrates typical usage)
196
+ for future in concurrent.futures.as_completed([future1, future2]):
197
+ try:
198
+ results.append(future.result())
199
+ except Exception as exc:
200
+ print(f"SYNC PARENT: Generated an exception: {exc}")
201
+ results.append(f"Error: {exc}")
202
+
203
+ print("SYNC PARENT: Finished.")
204
+ return results
205
+
206
+ # --- End Synchronous Test ---
207
+
208
+ async def main():
209
+ # Run the root function which has deep nesting and nested parallel calls
210
+ start_time = time.time()
211
+ result_async = await root_function()
212
+ end_time = time.time()
213
+ print(f"\nAsync Final result: {result_async}")
214
+ print(f"Async Total execution time: {end_time - start_time:.2f} seconds")
215
+
216
+ print("\n" + "="*20 + " Starting Sync ThreadPool Test " + "="*20 + "\n")
217
+
218
+ # --- Run the synchronous thread pool test ---
219
+ # Note: We run this *outside* the async root_function's trace
220
+ # If we wanted it nested, we'd need @tracer.observe on main or call it from root_function
221
+ # For simplicity, let's trace it separately by calling it directly.
222
+ # The @tracer.observe on sync_parent_func will create its own root trace.
223
+ start_time_sync = time.time()
224
+ result_sync = sync_parent_func() # This will be traced automatically
225
+ end_time_sync = time.time()
226
+ print(f"\nSync Final results: {result_sync}")
227
+ print(f"Sync Total execution time: {end_time_sync - start_time_sync:.2f} seconds")
228
+ # --- End synchronous test call ---
229
+
230
+ if __name__ == "__main__":
231
+ # Run the complex async example
232
+ asyncio.run(main())
@@ -84,7 +84,7 @@ Key Information:
84
84
  ]
85
85
 
86
86
  client = wrap(openai.Client(api_key=os.getenv("OPENAI_API_KEY")))
87
- judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")
87
+ judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo", enable_evaluations=False, enable_monitoring=False)
88
88
 
89
89
  def populate_vector_db(collection, destinations_data):
90
90
  """