judgeval 0.0.26__tar.gz → 0.0.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. {judgeval-0.0.26 → judgeval-0.0.28}/PKG-INFO +1 -1
  2. judgeval-0.0.28/docs/alerts/notifications.mdx +191 -0
  3. judgeval-0.0.28/docs/alerts/platform_notifications.mdx +74 -0
  4. judgeval-0.0.28/docs/alerts/rules.mdx +111 -0
  5. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/custom_scorers.mdx +29 -3
  6. judgeval-0.0.28/docs/images/notifications_page.png +0 -0
  7. judgeval-0.0.28/docs/images/reports_modal.png +0 -0
  8. {judgeval-0.0.26 → judgeval-0.0.28}/docs/mint.json +8 -0
  9. {judgeval-0.0.26 → judgeval-0.0.28}/pyproject.toml +1 -1
  10. judgeval-0.0.28/src/demo/cookbooks/JNPR_Mist/test.py +21 -0
  11. judgeval-0.0.28/src/demo/cookbooks/linkd/text2sql.py +14 -0
  12. judgeval-0.0.28/src/demo/custom_example_demo/osiris_test.py +22 -0
  13. judgeval-0.0.28/src/demo/custom_example_demo/qodo_scorer.py +78 -0
  14. judgeval-0.0.28/src/demo/new_trace/example_complex_async.py +232 -0
  15. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/common/tracer.py +515 -193
  16. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/constants.py +4 -2
  17. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/data/__init__.py +0 -3
  18. judgeval-0.0.26/src/judgeval/data/api_example.py → judgeval-0.0.28/src/judgeval/data/custom_api_example.py +12 -19
  19. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/data/datasets/eval_dataset_client.py +59 -20
  20. judgeval-0.0.28/src/judgeval/data/result.py +76 -0
  21. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/evaluation_run.py +1 -0
  22. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judgment_client.py +47 -15
  23. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/run_evaluation.py +20 -36
  24. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/score.py +9 -11
  25. judgeval-0.0.28/src/test.py +21 -0
  26. judgeval-0.0.26/src/judgeval/data/result.py +0 -98
  27. {judgeval-0.0.26 → judgeval-0.0.28}/.github/workflows/ci.yaml +0 -0
  28. {judgeval-0.0.26 → judgeval-0.0.28}/.gitignore +0 -0
  29. {judgeval-0.0.26 → judgeval-0.0.28}/LICENSE.md +0 -0
  30. {judgeval-0.0.26 → judgeval-0.0.28}/Pipfile +0 -0
  31. {judgeval-0.0.26 → judgeval-0.0.28}/Pipfile.lock +0 -0
  32. {judgeval-0.0.26 → judgeval-0.0.28}/README.md +0 -0
  33. {judgeval-0.0.26 → judgeval-0.0.28}/docs/README.md +0 -0
  34. {judgeval-0.0.26 → judgeval-0.0.28}/docs/api_reference/judgment_client.mdx +0 -0
  35. {judgeval-0.0.26 → judgeval-0.0.28}/docs/api_reference/trace.mdx +0 -0
  36. {judgeval-0.0.26 → judgeval-0.0.28}/docs/development.mdx +0 -0
  37. {judgeval-0.0.26 → judgeval-0.0.28}/docs/essentials/code.mdx +0 -0
  38. {judgeval-0.0.26 → judgeval-0.0.28}/docs/essentials/images.mdx +0 -0
  39. {judgeval-0.0.26 → judgeval-0.0.28}/docs/essentials/markdown.mdx +0 -0
  40. {judgeval-0.0.26 → judgeval-0.0.28}/docs/essentials/navigation.mdx +0 -0
  41. {judgeval-0.0.26 → judgeval-0.0.28}/docs/essentials/reusable-snippets.mdx +0 -0
  42. {judgeval-0.0.26 → judgeval-0.0.28}/docs/essentials/settings.mdx +0 -0
  43. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/data_datasets.mdx +0 -0
  44. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/data_examples.mdx +0 -0
  45. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/introduction.mdx +0 -0
  46. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/judges.mdx +0 -0
  47. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  48. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  49. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  50. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/comparison.mdx +0 -0
  51. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  52. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  53. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  54. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/execution_order.mdx +0 -0
  55. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  56. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/groundedness.mdx +0 -0
  57. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/hallucination.mdx +0 -0
  58. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/introduction.mdx +0 -0
  59. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  60. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/summarization.mdx +0 -0
  61. {judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/unit_testing.mdx +0 -0
  62. {judgeval-0.0.26 → judgeval-0.0.28}/docs/favicon.svg +0 -0
  63. {judgeval-0.0.26 → judgeval-0.0.28}/docs/getting_started.mdx +0 -0
  64. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/basic_trace_example.png +0 -0
  65. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/checks-passed.png +0 -0
  66. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/create_aggressive_scorer.png +0 -0
  67. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/create_scorer.png +0 -0
  68. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/evaluation_diagram.png +0 -0
  69. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/hero-dark.svg +0 -0
  70. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/hero-light.svg +0 -0
  71. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/online_eval_fault.png +0 -0
  72. {judgeval-0.0.26 → judgeval-0.0.28}/docs/images/trace_ss.png +0 -0
  73. {judgeval-0.0.26 → judgeval-0.0.28}/docs/integration/langgraph.mdx +0 -0
  74. {judgeval-0.0.26 → judgeval-0.0.28}/docs/introduction.mdx +0 -0
  75. {judgeval-0.0.26 → judgeval-0.0.28}/docs/judgment/introduction.mdx +0 -0
  76. {judgeval-0.0.26 → judgeval-0.0.28}/docs/logo/dark.svg +0 -0
  77. {judgeval-0.0.26 → judgeval-0.0.28}/docs/logo/light.svg +0 -0
  78. {judgeval-0.0.26 → judgeval-0.0.28}/docs/monitoring/introduction.mdx +0 -0
  79. {judgeval-0.0.26 → judgeval-0.0.28}/docs/monitoring/production_insights.mdx +0 -0
  80. {judgeval-0.0.26 → judgeval-0.0.28}/docs/monitoring/tracing.mdx +0 -0
  81. {judgeval-0.0.26 → judgeval-0.0.28}/docs/notebooks/create_dataset.ipynb +0 -0
  82. {judgeval-0.0.26 → judgeval-0.0.28}/docs/notebooks/create_scorer.ipynb +0 -0
  83. {judgeval-0.0.26 → judgeval-0.0.28}/docs/notebooks/demo.ipynb +0 -0
  84. {judgeval-0.0.26 → judgeval-0.0.28}/docs/notebooks/prompt_scorer.ipynb +0 -0
  85. {judgeval-0.0.26 → judgeval-0.0.28}/docs/notebooks/quickstart.ipynb +0 -0
  86. {judgeval-0.0.26 → judgeval-0.0.28}/docs/quickstart.mdx +0 -0
  87. {judgeval-0.0.26 → judgeval-0.0.28}/docs/snippets/snippet-intro.mdx +0 -0
  88. {judgeval-0.0.26 → judgeval-0.0.28}/pytest.ini +0 -0
  89. {judgeval-0.0.26 → judgeval-0.0.28}/src/demo/demo.py +0 -0
  90. {judgeval-0.0.26 → judgeval-0.0.28}/src/demo/travel_agent.py +0 -0
  91. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/__init__.py +0 -0
  92. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/clients.py +0 -0
  93. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/common/__init__.py +0 -0
  94. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/common/exceptions.py +0 -0
  95. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/common/logger.py +0 -0
  96. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/common/utils.py +0 -0
  97. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/data/datasets/__init__.py +0 -0
  98. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/data/datasets/dataset.py +0 -0
  99. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/data/example.py +0 -0
  100. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/data/scorer_data.py +0 -0
  101. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/integrations/langgraph.py +0 -0
  102. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judges/__init__.py +0 -0
  103. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judges/base_judge.py +0 -0
  104. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judges/litellm_judge.py +0 -0
  105. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judges/mixture_of_judges.py +0 -0
  106. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judges/together_judge.py +0 -0
  107. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/judges/utils.py +0 -0
  108. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/rules.py +0 -0
  109. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/__init__.py +0 -0
  110. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/api_scorer.py +0 -0
  111. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/base_scorer.py +0 -0
  112. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/exceptions.py +0 -0
  113. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  114. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  115. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  116. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  117. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  118. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  119. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  120. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  121. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  122. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  123. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  124. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  125. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  126. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  127. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  128. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  129. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  130. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  131. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  132. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  133. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  134. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  135. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  136. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  137. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  138. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  139. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  140. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
  141. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
  142. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  143. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  144. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  145. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  146. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  147. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  148. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  149. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  150. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  151. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
  152. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
  153. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  154. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  155. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  156. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  157. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  158. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  159. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
  160. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
  161. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
  162. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  163. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  164. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  165. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  166. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/prompt_scorer.py +0 -0
  167. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/scorers/utils.py +0 -0
  168. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/tracer/__init__.py +0 -0
  169. {judgeval-0.0.26 → judgeval-0.0.28}/src/judgeval/utils/alerts.py +0 -0
{judgeval-0.0.26 → judgeval-0.0.28}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.26
+ Version: 0.0.28
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.28/docs/alerts/notifications.mdx
@@ -0,0 +1,191 @@
+ ---
+ title: 'Notifications'
+ description: 'Get alerted when your rules trigger through multiple communication channels'
+ ---
+
+ # Notifications
+
+ Notifications allow you to receive alerts through various communication channels when your [rules](/alerts/rules) are triggered. This feature helps you stay informed about potential issues with your AI system's performance in real time.
+
+ ## Overview
+
+ The notification system works with [rules](/alerts/rules) to:
+
+ 1. Monitor your evaluation metrics
+ 2. Check if they meet your defined [conditions](/alerts/rules#conditions)
+ 3. Send alerts through your preferred channels when conditions are met
+
+ Notifications can be configured globally or per rule, allowing you to customize how you're alerted based on the specific rule that was triggered.
+
+ <Warning>
+ Rules and notifications only work with built-in APIScorers. Local scorers and custom scorers are not supported for triggering notifications.
+ </Warning>
+
+ ## Notification Configuration
+
+ Notifications are configured using the `NotificationConfig` class from the `judgeval.rules` module.
+
+ ### Configuration Options
+
+ | Parameter | Type | Description |
+ |-----------|------|-------------|
+ | `enabled` | boolean | Whether notifications are enabled (default: `True`) |
+ | `communication_methods` | list of strings | The methods to use for sending notifications (e.g., `["email", "slack"]`) |
+ | `email_addresses` | list of strings | Email addresses to send notifications to |
+ | `send_at` | integer (Unix timestamp) | Schedule notifications for a specific time ([learn more](#scheduled-notifications)) |
+
+ <Note>
+ For aggregated reports and periodic summaries of multiple alerts, use the [Scheduled Reports feature](/alerts/platform_notifications#scheduled-reports-recaps) in the Judgment Platform.
+ </Note>
+
+ ### Basic Configuration
+
+ ```python
+ from judgeval.rules import NotificationConfig
+
+ # Create a notification configuration
+ notification_config = NotificationConfig(
+     enabled=True,
+     communication_methods=["slack", "email"],
+     email_addresses=["user@example.com"],
+     send_at=None  # Send immediately
+ )
+ ```
+
+ ## Communication Methods
+
+ Judgeval supports multiple communication methods for notifications:
+
+ - `"email"`: Send emails to specified email addresses
+ - `"slack"`: Send messages to configured Slack channels
+
+ You can configure multiple methods to be used simultaneously.
+
+ ## Slack Integration
+
+ For detailed information on integrating Slack with Judgment notifications, see the [Platform Notification Center documentation](/alerts/platform_notifications#slack-integration).
+
+ ## Attaching Notifications to Rules
+
+ Notifications can be attached to [rules](/alerts/rules) during rule creation or configured later.
+
+ ### During Rule Creation
+
+ ```python
+ from judgeval.rules import Rule, Condition, NotificationConfig
+ from judgeval.scorers import FaithfulnessScorer
+
+ # Create notification config
+ notification_config = NotificationConfig(
+     enabled=True,
+     communication_methods=["slack", "email"],
+     email_addresses=["user@example.com"]
+ )
+
+ # Create rule with notification config
+ rule = Rule(
+     name="Faithfulness Check",
+     description="Check if faithfulness meets threshold",
+     conditions=[
+         # Note: only built-in APIScorers are supported
+         Condition(metric=FaithfulnessScorer(threshold=0.7))
+     ],
+     combine_type="all",  # Trigger when all conditions fail (see Combine Types in the Rules documentation)
+     notification=notification_config
+ )
+ ```
+
+ ## Scheduled Notifications
+
+ You can schedule one-time notifications to be sent at a specific time using the `send_at` parameter:
+
+ ```python
+ from judgeval.rules import NotificationConfig
+ import time
+
+ # Schedule notification for 1 hour from now
+ one_hour_from_now = int(time.time()) + 3600
+
+ notification_config = NotificationConfig(
+     enabled=True,
+     communication_methods=["email"],
+     email_addresses=["user@example.com"],
+     send_at=one_hour_from_now
+ )
+ ```
+
+ The `send_at` parameter accepts a Unix timestamp (integer) that specifies when the notification should be sent. This is useful for delaying notifications or grouping them to be sent at a specific time of day.
+
+ <Warning>
+ The `send_at` parameter only delays the delivery of a single notification. It doesn't create recurring notifications or group multiple alerts together. Each time a rule is triggered, a separate notification is generated.
+ </Warning>
+
+ ## Notification Types in the Platform
+
+ The Judgment Platform offers two main types of notifications:
+
+ 1. **Evaluation Alerts** - Real-time notifications sent when specific rules are triggered. When using the API, these can be scheduled for a specific time using the `send_at` parameter.
+
+ 2. **Custom Alert Recaps** - Periodic summaries (daily, weekly, monthly) of evaluation metrics and alerts. These are configured in the [Platform Notification Center](/alerts/platform_notifications).
+
+ ### Setting Up Custom Alert Recaps
+
+ To set up periodic notification summaries:
+
+ 1. Navigate to the Notifications page in your Judgment account settings
+ 2. Under "Custom Alert Recaps," click the "+" button to create a new report
+ 3. Configure your preferred frequency (Daily, Weekly, Monthly) and delivery time
+ 4. Add recipient email addresses
+
+ For more details, see the [Scheduled Reports](/alerts/platform_notifications#scheduled-reports-recaps) documentation.
+
+ ## Judgment Platform Features
+
+ For information about configuring notifications in the Judgment web platform, including email alerts, scheduled reports, and Slack integration, see the [Platform Notification Center](/alerts/platform_notifications) documentation.
+
+ ## Practical Example
+
+ Here's a complete example showing how to set up rules with notifications and integrate them with the Tracer:
+
+ ```python
+ import os
+ from judgeval.common.tracer import Tracer, wrap
+ from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer
+ from judgeval.rules import Rule, Condition, NotificationConfig
+ from openai import OpenAI
+
+ # Create notification config
+ notification_config = NotificationConfig(
+     enabled=True,
+     communication_methods=["slack", "email"],
+     email_addresses=["alerts@example.com"],
+     send_at=None  # Send immediately
+ )
+
+ # Create rules with notification config
+ rules = [
+     Rule(
+         name="Quality Check",
+         description="Check if all quality metrics meet thresholds",
+         conditions=[
+             # Only built-in APIScorers can be used as metrics
+             Condition(metric=FaithfulnessScorer(threshold=0.7)),
+             Condition(metric=AnswerRelevancyScorer(threshold=0.8))
+         ],
+         combine_type="all",  # Trigger when all conditions fail
+         notification=notification_config
+     )
+ ]
+
+ # Initialize tracer with rules for notifications
+ judgment = Tracer(
+     api_key=os.getenv("JUDGMENT_API_KEY"),
+     project_name="my_project",
+     rules=rules
+ )
+
+ # Wrap OpenAI client for tracing
+ client = wrap(OpenAI())
+
+ # Now any evaluations that trigger the rules will send notifications
+ ```
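The `send_at` snippets in this new page use a relative offset from `time.time()`. If you instead want alerts batched to land at a fixed local time, as the page suggests, a small timestamp helper works. The sketch below is illustrative: the `tomorrow_at` helper is not part of judgeval; only `NotificationConfig` and its fields come from the documentation above.

```python
from datetime import datetime, timedelta

from judgeval.rules import NotificationConfig

def tomorrow_at(hour: int = 9) -> int:
    # Unix timestamp for `hour`:00 tomorrow, local time (illustrative helper).
    target = (datetime.now() + timedelta(days=1)).replace(
        hour=hour, minute=0, second=0, microsecond=0
    )
    return int(target.timestamp())

# Deliver the alert at 9:00 AM tomorrow instead of immediately.
notification_config = NotificationConfig(
    enabled=True,
    communication_methods=["email"],
    email_addresses=["user@example.com"],
    send_at=tomorrow_at(9),
)
```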
judgeval-0.0.28/docs/alerts/platform_notifications.mdx
@@ -0,0 +1,74 @@
+ ---
+ title: 'Platform Notification Center'
+ description: 'Configure and manage notifications through the Judgment web interface'
+ ---
+
+ # Platform Notification Center
+
+ The Judgment Platform provides a comprehensive notification system through its web interface, allowing you to configure email notifications, scheduled reports, and app integrations like Slack.
+
+ <Frame>
+     <img src="/images/notifications_page.png" alt="Notifications Settings Page" />
+ </Frame>
+
+ ## Slack Integration
+
+ Judgment allows you to receive notifications directly in your Slack workspace.
+
+ ### Connecting Slack
+
+ 1. Navigate to the Notifications page in your Judgment account settings
+ 2. In the "App Integrations" section, find the Slack card
+ 3. Click the "Connect" button
+ 4. You'll be redirected to Slack's authorization page
+ 5. Select the workspace you want to connect and authorize the Judgment application
+ 6. Once connected, you'll be redirected back to Judgment
+
+ ### Slack Notification Features
+
+ After connecting Slack, you can:
+
+ - Receive real-time alerts when evaluation rules are triggered
+ - Get notifications about model performance issues
+ - Track Judgment activity in your Slack channels
+
+ ### Managing Slack Notifications
+
+ Once connected, you can:
+
+ - Disconnect your Slack workspace at any time
+ - Add specific channels for different types of notifications
+ - Configure which notifications are sent to Slack
+
+ ## Email Notifications
+
+ On the Notifications settings page, you can configure:
+
+ 1. **Evaluation Alerts** - Receive real-time email notifications whenever an evaluation alert is triggered
+ 2. **Custom Alert Recaps** - Receive periodic email summaries of evaluations, traces, and metric scores
+
+ ## Scheduled Reports (Recaps)
+
+ You can create custom scheduled reports to receive regular updates on your agent's performance.
+
+ ### Creating a Report
+
+ 1. Navigate to the Notifications page in your Judgment account settings
+ 2. Under "Custom Alert Recaps," click the "+" button to create a new report
+ 3. Configure your report with the following options:
+
+ <Frame>
+     <img src="/images/reports_modal.png" alt="Scheduled Reports Modal" />
+ </Frame>
+
+ | Setting | Description |
+ |---------|-------------|
+ | Report Name | A descriptive name for your report (e.g., "Daily Alert Summary") |
+ | Recipient Emails | Email addresses that will receive the report |
+ | Frequency | How often the report should be sent (Daily, Weekly, Monthly) |
+ | Select Days | For weekly reports, specify which days of the week |
+ | Time | When the report should be sent |
+ | Timezone | Your local timezone for accurate scheduling |
+ | Compare to Previous Period | Enable to see performance changes over time |
+
+ Your reports will be sent automatically based on your schedule settings, providing insights into your model's performance over time.
judgeval-0.0.28/docs/alerts/rules.mdx
@@ -0,0 +1,111 @@
+ ---
+ title: 'Rules'
+ description: 'Define custom triggers and conditions for your evaluation metrics'
+ ---
+
+ # Rules
+
+ Rules allow you to define specific conditions for your evaluation metrics that can trigger alerts and [notifications](/alerts/notifications) when met. They serve as the foundation for the alerting system and help you monitor your AI system's performance against predetermined thresholds.
+
+ ## Overview
+
+ A rule consists of one or more [conditions](#conditions), each tied to a specific metric supported by our scorers (like Faithfulness or AnswerRelevancy). When evaluations are performed, the rules engine checks whether the measured scores satisfy the conditions set in your rules. Based on the rule's configuration, alerts can be triggered and notifications sent through various channels.
+
+ <Note>
+ Rules and notifications only work with built-in APIScorers. Local scorers and custom scorers are not supported for triggering rules.
+ </Note>
+
+ ## Creating Rules
+
+ Rules can be created using the `Rule` class from the `judgeval.rules` module. Each rule requires:
+
+ - A name
+ - A list of [conditions](#conditions)
+ - A [combine type](#combine-types) (how conditions should be evaluated together)
+
+ Optional parameters include:
+
+ - A description
+ - [Notification configuration](/alerts/notifications#notification-configuration)
+
+ ### Basic Rule Structure
+
+ ```python
+ from judgeval.rules import Rule, Condition
+ from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer
+
+ # Create a rule
+ rule = Rule(
+     name="Quality Check",
+     description="Check if quality metrics meet thresholds",
+     conditions=[
+         Condition(metric=FaithfulnessScorer(threshold=0.7)),
+         Condition(metric=AnswerRelevancyScorer(threshold=0.8))
+     ],
+     combine_type="all"  # "all" = AND, "any" = OR
+ )
+ ```
+
+ ## Conditions
+
+ Conditions are the building blocks of rules. Each condition specifies a metric, which must be a built-in APIScorer like FaithfulnessScorer or AnswerRelevancyScorer. The condition is met when the score for that metric is greater than or equal to the threshold specified in the scorer.
+
+ ### Creating Conditions
+
+ ```python
+ from judgeval.rules import Condition
+ from judgeval.scorers import FaithfulnessScorer
+
+ # Create a condition that passes when the faithfulness score is at least 0.7
+ condition = Condition(
+     metric=FaithfulnessScorer(threshold=0.7)
+ )
+ ```
+
+ ### How Conditions are Evaluated
+
+ When a condition is evaluated, it uses the scorer's threshold and internal evaluation logic:
+
+ 1. By default, a condition passes when the actual score is greater than or equal to the threshold
+ 2. If the scorer has a custom `success_check()` method, that method is used instead
+ 3. The threshold is retrieved from the scorer's `threshold` attribute
+
+ ## Combine Types
+
+ Rules support two combine types that determine how multiple conditions are evaluated:
+
+ - `"all"`: The rule triggers when all conditions fail (logical AND)
+ - `"any"`: The rule triggers when any condition fails (logical OR)
+
+ This design is meant for setting up alerts that trigger when your metrics indicate a problem with your AI system's performance.
+
+ ## Using Rules with the Tracer
+
+ Rules are most commonly used with the `Tracer` to monitor your AI system's performance:
+
+ ```python
+ from judgeval.common.tracer import Tracer
+ from judgeval.rules import Rule, Condition
+ from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer
+
+ # Create rules
+ rules = [
+     Rule(
+         name="Quality Check",
+         description="Check if quality metrics meet thresholds",
+         conditions=[
+             Condition(metric=FaithfulnessScorer(threshold=0.7)),
+             Condition(metric=AnswerRelevancyScorer(threshold=0.8))
+         ],
+         combine_type="all"  # Trigger when all conditions fail
+     )
+ ]
+
+ # Initialize tracer with rules
+ judgment = Tracer(
+     api_key="your_api_key",
+     project_name="your_project",
+     rules=rules
+ )
+ ```
+
+ For more information on configuring notifications with rules, see the [Notifications documentation](/alerts/notifications#attaching-notifications-to-rules).
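Taken together, the condition and combine-type semantics this new rules.mdx page documents amount to a small decision procedure. The sketch below is one illustrative reading of the documented behavior, not judgeval's actual implementation; in particular, keying scores by the scorer's `score_type` is an assumption.

```python
def condition_passes(scorer, score: float) -> bool:
    # Documented behavior: prefer a custom success_check() if the scorer
    # defines one, otherwise pass when score >= threshold.
    check = getattr(scorer, "success_check", None)
    if callable(check):
        return check()
    return score >= scorer.threshold

def rule_triggers(conditions, scores, combine_type: str) -> bool:
    # A rule alerts on failures: "all" fires when every condition fails,
    # "any" fires when at least one condition fails.
    failures = [
        not condition_passes(cond.metric, scores[cond.metric.score_type])  # score_type keying assumed
        for cond in conditions
    ]
    return all(failures) if combine_type == "all" else any(failures)
```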
{judgeval-0.0.26 → judgeval-0.0.28}/docs/evaluation/scorers/custom_scorers.mdx
@@ -4,6 +4,7 @@ description: ""
  ---

  If none of `judgeval`'s built-in scorers fit your evaluation criteria, you can easily build your own custom metric to be run through a `JudgevalScorer`.
+
  `JudgevalScorer`s are **automatically integrated** within `judgeval`'s infrastructure, so you can:
  - Run your own scorer with the same syntax as any other `judgeval` scorer.
  - Use `judgeval`'s batched evaluation infrastructure to execute **scalable evaluation runs**.
@@ -78,7 +79,6 @@ You can optionally set the self.reason attribute, depending on your preference.
  </Note>

  These methods are the core of your scorer, and you can implement them in any way you want. **Be creative!**
- Check out this list of examples our users have implemented if you need inspiration: TODO add link here

  #### Handling Errors
  If you want to handle errors gracefully, you can use a `try` block and, in the `except` block, set the `self.error` attribute to the error message.
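A minimal sketch of that error-handling pattern, under the assumption that the scorer records its outcome on `self.score`/`self.success` as in the surrounding examples; `compute_metric` is a hypothetical helper standing in for your metric logic:

```python
def score_example(self, example):
    try:
        self.score = self.compute_metric(example)  # hypothetical metric logic
        self.success = self.score >= self.threshold
    except Exception as e:
        # Record the failure on the scorer instead of raising, so the
        # evaluation run can report it gracefully.
        self.error = str(e)
        self.success = False
```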
@@ -144,11 +144,37 @@ class SampleScorer(JudgevalScorer):
      def __name__(self):
          return "Sample Scorer"
  ```
-
  **Congratulations!** 🎉

  You've made your first custom judgeval scorer! Now that your scorer is implemented, you can run it on your own datasets
  just like any other `judgeval` scorer. Your scorer is fully integrated with `judgeval`'s infrastructure, so you can view it on
  the [Judgment platform](/judgment/introduction) too.

- For more examples, check out some of the custom scorers our users have implemented: TODO add link here.
+ ## Using a Custom Scorer
+
+ Once you've implemented your custom scorer, you can use it in the same way as any other scorer in `judgeval`.
+ Custom scorers can be run in conjunction with other scorers in a single evaluation run!
+
+ ```python run_custom_scorer.py
+ from judgeval import JudgmentClient
+ from your_custom_scorer import SampleScorer
+
+ client = JudgmentClient()
+ sample_scorer = SampleScorer()
+
+ results = client.run_evaluation(
+     examples=[example1],
+     scorers=[sample_scorer],
+     model="gpt-4o"
+ )
+ ```
+
+ ## Real World Examples
+
+ You can find real-world examples of how our community has used custom `JudgevalScorer`s to evaluate their LLM systems in our [cookbook repository](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/custom_scorers)!
+ Here are some of our favorites:
+
+ - [Code Style Scorer](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/code_style_scorer.py) - Evaluates code quality and style
+ - [Cold Email Scorer](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py) - Evaluates the effectiveness of cold emails
+
+ For more examples and detailed documentation on custom scorers, check out our [Custom Scorers Cookbook](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/README.md).
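The page above notes that custom scorers can run alongside other scorers in a single evaluation run, though its snippet shows the custom scorer on its own. A mixed run might look like the following sketch; `your_custom_scorer` is the same hypothetical module used above, and the `Example` fields mirror the demo script later in this diff:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer
from your_custom_scorer import SampleScorer  # hypothetical module, as above

client = JudgmentClient()

example1 = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

# Custom and built-in scorers can share one evaluation run.
results = client.run_evaluation(
    examples=[example1],
    scorers=[SampleScorer(), FaithfulnessScorer(threshold=0.5)],
    model="gpt-4o",
)
```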
{judgeval-0.0.26 → judgeval-0.0.28}/docs/mint.json
@@ -89,6 +89,14 @@
          "integration/langgraph"
        ]
      },
+     {
+       "group": "Alerts",
+       "pages": [
+         "alerts/rules",
+         "alerts/notifications",
+         "alerts/platform_notifications"
+       ]
+     },
      {
        "group": "Judgment Platform",
        "pages": [
{judgeval-0.0.26 → judgeval-0.0.28}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "judgeval"
- version = "0.0.26"
+ version = "0.0.28"
  authors = [
      { name="Andrew Li", email="andrew@judgmentlabs.ai" },
      { name="Alex Shan", email="alex@judgmentlabs.ai" },
judgeval-0.0.28/src/demo/cookbooks/JNPR_Mist/test.py
@@ -0,0 +1,21 @@
+ from judgeval import JudgmentClient
+ from judgeval.data import Example
+ from judgeval.scorers import FaithfulnessScorer
+
+ client = JudgmentClient()
+
+ example = Example(
+     input="What if these shoes don't fit?",
+     actual_output="We offer a 30-day full refund at no extra cost.",
+     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+ )
+
+ scorer = FaithfulnessScorer(threshold=0.5)
+ results = client.run_evaluation(
+     examples=[example],
+     scorers=[scorer],
+     model="gpt-4o",
+     eval_run_name="TestRun",
+     project_name="TestProject",
+ )
+ print(results)
judgeval-0.0.28/src/demo/cookbooks/linkd/text2sql.py
@@ -0,0 +1,14 @@
+ """
+ ClassifierScorer implementation for basic Text-to-SQL evaluation.
+
+ Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+ Determines if the LLM-generated SQL query is valid and works for the natural language query.
+ """
+ from judgeval.scorers import ClassifierScorer
+ from judgeval import JudgmentClient
+ from judgeval.scorers.judgeval_scorers.classifiers.text2sql.text2sql_scorer import Text2SQLScorer
+
+ judgment_client = JudgmentClient()
+
+ # Push the classifier scorer to the platform, then fetch it back by slug.
+ print(judgment_client.push_classifier_scorer(Text2SQLScorer, slug="text2sql-eric-linkd"))
+ print(judgment_client.fetch_classifier_scorer("text2sql-eric-linkd"))
judgeval-0.0.28/src/demo/custom_example_demo/osiris_test.py
@@ -0,0 +1,22 @@
+ from judgeval.data import CustomExample
+ from judgeval import JudgmentClient
+ from qodo_scorer import QodoScorer
+
+ judgment = JudgmentClient()
+
+ # CustomExample carries the fields this custom scorer expects.
+ custom_example = CustomExample(
+     code="print('Hello, world!')",
+     original_code="print('Hello, world!')",
+ )
+
+ qodo_scorer = QodoScorer()
+ results = judgment.run_evaluation(
+     examples=[custom_example],
+     scorers=[qodo_scorer],
+     model="gpt-4o",
+     project_name="QoDoDemo",
+     eval_run_name="QoDoDemoRun1",
+ )
+
+ print(f"{results=}")
judgeval-0.0.28/src/demo/custom_example_demo/qodo_scorer.py
@@ -0,0 +1,78 @@
+ from judgeval.data import Example
+ from judgeval.common.tracer import Tracer, wrap
+ from judgeval.scorers import JudgevalScorer, AnswerCorrectnessScorer
+ from judgeval import JudgmentClient
+ from openai import OpenAI, AsyncOpenAI
+ import os
+
+ client = OpenAI()
+ async_client = AsyncOpenAI()
+
+ # Shared prompts for the reviewer and judge calls.
+ REVIEWER_SYSTEM_PROMPT = "You are a Qodo reviewer. You will be given CODE, a PR_REQUEST, and Qodo's improved summary of the PR_REQUEST as well as its review of the PR_REQUEST, given as PR_QUALITY. Your job is to review the CODE and PR_REQUEST and determine how factually accurate and thorough Qodo is. Give reasoning for why or why not you think Qodo's review is accurate and thorough."
+ JUDGE_SYSTEM_PROMPT = "You are a judge. You will be given a review of the performance of Qodo (a code review tool) on the accuracy and thoroughness of its review of a PR_REQUEST, given as PR_QUALITY. Your job is to give a score from 0 to 1 on how well Qodo performed based on the REVIEW given to you. Do not output anything except the score."
+
+
+ class QodoScorer(JudgevalScorer):
+
+     def __init__(self,
+                  threshold=0.5,
+                  score_type="CodeReviewScorer",
+                  include_reason=True,
+                  async_mode=True,
+                  strict_mode=False,
+                  verbose_mode=True):
+         super().__init__(
+             threshold=threshold,
+             score_type=score_type,
+             include_reason=include_reason,
+             async_mode=async_mode,
+             strict_mode=strict_mode,
+             verbose_mode=verbose_mode)
+
+     def score_example(self, example: Example) -> float:
+         """
+         Score the example against the code review criteria.
+         """
+         # First call: generate a free-form review of Qodo's output.
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {"role": "system", "content": REVIEWER_SYSTEM_PROMPT},
+                 {"role": "user", "content": f"INPUT: {example.input}, CONTEXT: {example.context}, Qodo's REVIEW: {example.actual_output}"},
+             ],
+         )
+         self.reason = response.choices[0].message.content
+
+         # Second call: distill the review into a 0-1 score.
+         score_response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
+                 {"role": "user", "content": f"REVIEW: {self.reason}"},
+             ],
+         )
+         self.score = float(score_response.choices[0].message.content)
+         return self.score
+
+     async def a_score_example(self, example: Example) -> float:
+         """
+         Async version of score_example, using the async OpenAI client.
+         """
+         response = await async_client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {"role": "system", "content": REVIEWER_SYSTEM_PROMPT},
+                 {"role": "user", "content": f"INPUT: {example.input}, CONTEXT: {example.context}, Qodo's REVIEW: {example.actual_output}"},
+             ],
+         )
+         self.reason = response.choices[0].message.content
+
+         score_response = await async_client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
+                 {"role": "user", "content": f"REVIEW: {self.reason}"},
+             ],
+         )
+         self.score = float(score_response.choices[0].message.content)
+         return self.score
+
+     def _success_check(self):
+         if self.error is not None:
+             return False
+         return self.score >= self.threshold
+
+     @property
+     def __name__(self):
+         return "Qodo Scorer"