judgeval 0.0.20__tar.gz → 0.0.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. {judgeval-0.0.20 → judgeval-0.0.21}/PKG-INFO +7 -3
  2. {judgeval-0.0.20 → judgeval-0.0.21}/Pipfile +0 -2
  3. {judgeval-0.0.20 → judgeval-0.0.21}/docs/integration/langgraph.mdx +2 -1
  4. {judgeval-0.0.20 → judgeval-0.0.21}/pyproject.toml +7 -3
  5. judgeval-0.0.21/src/demo/cookbooks/test.py +152 -0
  6. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/tracer.py +41 -2
  7. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/constants.py +1 -0
  8. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judgment_client.py +20 -3
  9. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/run_evaluation.py +62 -8
  10. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/api_scorer.py +3 -1
  11. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
  12. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
  13. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
  14. judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +28 -0
  15. judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +28 -0
  16. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
  17. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +10 -2
  18. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
  19. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
  20. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
  21. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
  22. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
  23. judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +27 -0
  24. judgeval-0.0.20/src/demo/cookbooks/JNPR_Mist/test.py +0 -21
  25. judgeval-0.0.20/src/demo/cookbooks/anime_chatbot_agent/animeChatBot.py +0 -443
  26. judgeval-0.0.20/src/demo/cookbooks/ci_testing/ci_testing.py +0 -201
  27. judgeval-0.0.20/src/demo/cookbooks/ci_testing/travel_response.txt +0 -52
  28. judgeval-0.0.20/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -41
  29. judgeval-0.0.20/src/demo/cookbooks/custom_scorers/text2sql.py +0 -205
  30. judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.ipynb +0 -211
  31. judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.py +0 -262
  32. judgeval-0.0.20/src/demo/cookbooks/jpmorgan/vectordbdocs.py +0 -174
  33. judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -781
  34. judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  35. judgeval-0.0.20/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -1
  36. judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -1375
  37. judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -20
  38. judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.ipynb +0 -107
  39. judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.py +0 -109
  40. judgeval-0.0.20/src/demo/cookbooks/linkd/text2sql.py +0 -14
  41. judgeval-0.0.20/src/demo/cookbooks/new_bot/basic_bot.py +0 -106
  42. judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/agent.py +0 -167
  43. judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -73
  44. judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/tools.py +0 -16
  45. judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_bot.py +0 -132
  46. judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_demo.py +0 -351
  47. judgeval-0.0.20/src/demo/cookbooks/rules_alerts/utils_helper.py +0 -78
  48. judgeval-0.0.20/src/demo/customer_use/jnpr/mist/demo.py +0 -131
  49. judgeval-0.0.20/src/demo/customer_use/jnpr/mist/test.yaml +0 -11
  50. judgeval-0.0.20/src/demo/customer_use/jnpr/srikar_demo.py +0 -0
  51. judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  52. judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  53. judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  54. {judgeval-0.0.20 → judgeval-0.0.21}/.github/workflows/ci.yaml +0 -0
  55. {judgeval-0.0.20 → judgeval-0.0.21}/.gitignore +0 -0
  56. {judgeval-0.0.20 → judgeval-0.0.21}/LICENSE.md +0 -0
  57. {judgeval-0.0.20 → judgeval-0.0.21}/Pipfile.lock +0 -0
  58. {judgeval-0.0.20 → judgeval-0.0.21}/README.md +0 -0
  59. {judgeval-0.0.20 → judgeval-0.0.21}/docs/README.md +0 -0
  60. {judgeval-0.0.20 → judgeval-0.0.21}/docs/api_reference/judgment_client.mdx +0 -0
  61. {judgeval-0.0.20 → judgeval-0.0.21}/docs/api_reference/trace.mdx +0 -0
  62. {judgeval-0.0.20 → judgeval-0.0.21}/docs/development.mdx +0 -0
  63. {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/code.mdx +0 -0
  64. {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/images.mdx +0 -0
  65. {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/markdown.mdx +0 -0
  66. {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/navigation.mdx +0 -0
  67. {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/reusable-snippets.mdx +0 -0
  68. {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/settings.mdx +0 -0
  69. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/data_datasets.mdx +0 -0
  70. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/data_examples.mdx +0 -0
  71. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/introduction.mdx +0 -0
  72. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/judges.mdx +0 -0
  73. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  74. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  75. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  76. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/comparison.mdx +0 -0
  77. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  78. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  79. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  80. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  81. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/execution_order.mdx +0 -0
  82. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  83. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/hallucination.mdx +0 -0
  84. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/introduction.mdx +0 -0
  85. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  86. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/summarization.mdx +0 -0
  87. {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/unit_testing.mdx +0 -0
  88. {judgeval-0.0.20 → judgeval-0.0.21}/docs/favicon.svg +0 -0
  89. {judgeval-0.0.20 → judgeval-0.0.21}/docs/getting_started.mdx +0 -0
  90. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/basic_trace_example.png +0 -0
  91. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/checks-passed.png +0 -0
  92. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/create_aggressive_scorer.png +0 -0
  93. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/create_scorer.png +0 -0
  94. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/evaluation_diagram.png +0 -0
  95. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/hero-dark.svg +0 -0
  96. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/hero-light.svg +0 -0
  97. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/online_eval_fault.png +0 -0
  98. {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/trace_ss.png +0 -0
  99. {judgeval-0.0.20 → judgeval-0.0.21}/docs/introduction.mdx +0 -0
  100. {judgeval-0.0.20 → judgeval-0.0.21}/docs/judgment/introduction.mdx +0 -0
  101. {judgeval-0.0.20 → judgeval-0.0.21}/docs/logo/dark.svg +0 -0
  102. {judgeval-0.0.20 → judgeval-0.0.21}/docs/logo/light.svg +0 -0
  103. {judgeval-0.0.20 → judgeval-0.0.21}/docs/mint.json +0 -0
  104. {judgeval-0.0.20 → judgeval-0.0.21}/docs/monitoring/introduction.mdx +0 -0
  105. {judgeval-0.0.20 → judgeval-0.0.21}/docs/monitoring/production_insights.mdx +0 -0
  106. {judgeval-0.0.20 → judgeval-0.0.21}/docs/monitoring/tracing.mdx +0 -0
  107. {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/create_dataset.ipynb +0 -0
  108. {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/create_scorer.ipynb +0 -0
  109. {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/demo.ipynb +0 -0
  110. {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/prompt_scorer.ipynb +0 -0
  111. {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/quickstart.ipynb +0 -0
  112. {judgeval-0.0.20 → judgeval-0.0.21}/docs/quickstart.mdx +0 -0
  113. {judgeval-0.0.20 → judgeval-0.0.21}/docs/snippets/snippet-intro.mdx +0 -0
  114. {judgeval-0.0.20 → judgeval-0.0.21}/pytest.ini +0 -0
  115. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/__init__.py +0 -0
  116. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/clients.py +0 -0
  117. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/__init__.py +0 -0
  118. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/exceptions.py +0 -0
  119. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/logger.py +0 -0
  120. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/utils.py +0 -0
  121. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/__init__.py +0 -0
  122. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/api_example.py +0 -0
  123. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/__init__.py +0 -0
  124. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/dataset.py +0 -0
  125. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
  126. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/utils.py +0 -0
  127. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/example.py +0 -0
  128. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/ground_truth.py +0 -0
  129. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/result.py +0 -0
  130. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/scorer_data.py +0 -0
  131. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/evaluation_run.py +0 -0
  132. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/__init__.py +0 -0
  133. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/base_judge.py +0 -0
  134. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/litellm_judge.py +0 -0
  135. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/mixture_of_judges.py +0 -0
  136. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/together_judge.py +0 -0
  137. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/utils.py +0 -0
  138. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/rules.py +0 -0
  139. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/__init__.py +0 -0
  140. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/base_scorer.py +0 -0
  141. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/exceptions.py +0 -0
  142. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  143. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  144. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  145. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  146. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  147. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  148. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  149. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  150. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  151. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  152. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  153. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  154. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  155. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  156. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
  157. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
  158. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  159. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  160. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  161. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  162. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  163. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  164. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  165. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  166. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  167. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
  168. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
  169. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  170. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  171. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  172. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  173. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  174. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  175. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
  176. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
  177. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
  178. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  179. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  180. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  181. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  182. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/prompt_scorer.py +0 -0
  183. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/score.py +0 -0
  184. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/utils.py +0 -0
  185. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/tracer/__init__.py +0 -0
  186. {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/utils/alerts.py +0 -0
{judgeval-0.0.20 → judgeval-0.0.21}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.20
+ Version: 0.0.21
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
  Requires-Dist: anthropic
  Requires-Dist: fastapi
+ Requires-Dist: langchain
+ Requires-Dist: langchain-anthropic
+ Requires-Dist: langchain-core
+ Requires-Dist: langchain-huggingface
+ Requires-Dist: langchain-openai
  Requires-Dist: litellm
  Requires-Dist: nest-asyncio
  Requires-Dist: openai
+ Requires-Dist: openpyxl
  Requires-Dist: pandas
  Requires-Dist: pika
  Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
  Requires-Dist: together
  Requires-Dist: uvicorn
  Provides-Extra: dev
- Requires-Dist: langfuse==2.50.3; extra == 'dev'
- Requires-Dist: patronus; extra == 'dev'
  Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
  Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
  Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.20 → judgeval-0.0.21}/Pipfile
@@ -29,8 +29,6 @@ pytest = "*"
  pytest-asyncio = "*"
  pytest-mock = "*"
  tavily-python = "*"
- patronus = "*"
- langfuse = "==2.50.3"

  [requires]
  python_version = "3.11"
{judgeval-0.0.20 → judgeval-0.0.21}/docs/integration/langgraph.mdx
@@ -14,10 +14,11 @@ graph_builder = StateGraph(State)
  # YOUR LANGGRAPH WORKFLOW

  handler = JudgevalCallbackHandler(judgment.get_current_trace())
+ set_global_handler(handler)

  result = graph.invoke({
      "messages": [HumanMessage(content=prompt)]
- }, config=dict(callbacks=[handler]))
+ })

  ```

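The docs change above swaps the per-invoke `config=dict(callbacks=[handler])` for the new global handler. A minimal sketch of the updated pattern, assuming `judgment`, `graph`, and `prompt` are set up as elsewhere on that docs page and that `set_global_handler` is imported from `judgeval.common.tracer`, where this release defines it:

```python
from langchain_core.messages import HumanMessage
# set_global_handler is added in src/judgeval/common/tracer.py in this release;
# the exact public import path is an assumption.
from judgeval.common.tracer import JudgevalCallbackHandler, set_global_handler

handler = JudgevalCallbackHandler(judgment.get_current_trace())
set_global_handler(handler)  # registered once; no per-call callbacks config needed

result = graph.invoke({
    "messages": [HumanMessage(content=prompt)]
})
```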
{judgeval-0.0.20 → judgeval-0.0.21}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "judgeval"
- version = "0.0.20"
+ version = "0.0.21"
  authors = [
      { name="Andrew Li", email="andrew@judgmentlabs.ai" },
      { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,12 @@ dependencies = [
      "anthropic",
      "nest-asyncio",
      "pika",
+     "openpyxl",
+     "langchain",
+     "langchain-huggingface",
+     "langchain-openai",
+     "langchain-anthropic",
+     "langchain-core",
  ]

  [project.optional-dependencies]
@@ -35,8 +41,6 @@ dev = [
      "pytest>=8.3.4",
      "pytest-asyncio>=0.25.0",
      "pytest-mock>=3.14.0",
-     "langfuse==2.50.3",
-     "patronus",
      "tavily-python"
  ]

judgeval-0.0.21/src/demo/cookbooks/test.py (new file)
@@ -0,0 +1,152 @@
+ from judgeval.data import Example
+ from judgeval.data.datasets import EvalDataset
+ from judgeval.scorers import AnswerRelevancyScorer
+ from judgeval import JudgmentClient
+
+
+ def create_sample_dataset():
+     # Define sample inputs
+     inputs = [
+         # Highly relevant Q/A pairs
+         "Who founded Microsoft?",
+         "What is the capital of France?",
+         "How does photosynthesis work?",
+         "What are the benefits of exercise?",
+         "Explain quantum computing in simple terms.",
+
+         # Somewhat relevant Q/A pairs
+         "What is machine learning?",
+         "How do electric cars work?",
+         "What causes climate change?",
+         "How does the human digestive system function?",
+         "What is blockchain technology?",
+
+         # Minimally relevant Q/A pairs
+         "What are the main programming languages?",
+         "How do I bake a chocolate cake?",
+         "What is the history of the Roman Empire?",
+         "How do vaccines work?",
+         "What are black holes?",
+
+         # Not relevant Q/A pairs
+         "What is the best smartphone to buy?",
+         "How tall is Mount Everest?",
+         "Who wrote Romeo and Juliet?",
+         "What is the population of Tokyo?",
+         "How do I change a flat tire?"
+     ]
+
+     # Define corresponding outputs
+     actual_outputs = [
+         # Highly relevant answers
+         "Bill Gates and Paul Allen founded Microsoft in 1975.",
+         "The capital of France is Paris, known for the Eiffel Tower and Louvre Museum.",
+         "Photosynthesis is the process where plants convert sunlight, water, and carbon dioxide into glucose and oxygen.",
+         "Regular exercise improves cardiovascular health, builds muscle strength, reduces stress, and helps maintain a healthy weight.",
+         "Quantum computing uses quantum bits or qubits that can exist in multiple states simultaneously, allowing for potentially faster computation of certain problems compared to classical computers.",
+
+         # Somewhat relevant answers (partial or tangential information)
+         "Machine learning involves statistical techniques, but it's primarily about natural language processing and computer vision applications in modern businesses.",
+         "Electric cars use batteries, though the most important aspect is their impact on reducing traffic congestion in urban areas.",
+         "Climate change is related to weather patterns, but it's mainly caused by volcanic eruptions and natural planetary cycles.",
+         "The digestive system breaks down food, but the most interesting part is how it connects to brain function and mental health.",
+         "Blockchain is a distributed ledger technology, though its primary purpose is to replace traditional banking systems entirely.",
+
+         # Minimally relevant answers (mostly off-topic but with slight connection)
+         "Programming languages include Python and JavaScript, but the real question is whether AI will replace programmers in the next decade.",
+         "Chocolate cakes require flour and sugar, but I'd recommend focusing on gluten-free alternatives since they're healthier.",
+         "The Roman Empire lasted for centuries, but modern Italy's political system is more relevant to understand today's European politics.",
+         "Vaccines stimulate immune responses, but the pharmaceutical industry's profit motives are what you should really be concerned about.",
+         "Black holes are regions of spacetime, but the conspiracy theories about what NASA isn't telling us are far more interesting.",
+
+         # Not relevant answers (completely off-topic)
+         "The migration patterns of monarch butterflies are fascinating examples of evolutionary adaptation.",
+         "The Great Wall of China was built over multiple dynasties and stretches over 13,000 miles.",
+         "Photosynthesis is how plants convert sunlight into energy, producing oxygen as a byproduct.",
+         "The human genome contains approximately 3 billion base pairs of DNA.",
+         "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse equals the sum of squares of the other two sides."
+     ]
+
+     # Create Example objects from inputs and outputs
+     examples = []
+     for i in range(len(inputs)):
+         examples.append(Example(
+             input=inputs[i],
+             actual_output=actual_outputs[i]
+         ))
+
+     return EvalDataset(examples=examples)
+
+
+ def save_dataset(client, dataset, alias):
+     """Save the dataset to Judgment API with the given alias"""
+     client.push_dataset(alias=alias, dataset=dataset)
+     print(f"Dataset saved with alias: {alias}")
+
+
+ def run_evaluation(client, dataset_alias, model="gpt-4o", project_name="jnpr_mist_demo_project", eval_run_name="jnpr_demo_eval_run"):
+     """Pull a dataset and run an evaluation on it"""
+     # Pull the dataset from Judgment API
+     eval_dataset = client.pull_dataset(alias=dataset_alias)
+
+     # Run the evaluation
+     results = client.evaluate_dataset(
+         dataset=eval_dataset,
+         scorers=[AnswerRelevancyScorer(threshold=0.8)],
+         model=model,
+         eval_run_name=eval_run_name,
+         project_name=project_name,
+     )
+
+     return results
+
+
+ def run_assertion_test(client, dataset_alias, model="gpt-4o", project_name="jnpr_mist_demo_project", eval_run_name="jnpr_demo_assertion_run"):
+     """Pull a dataset and run assertion tests on its examples"""
+     # Pull the dataset from Judgment API
+     eval_dataset = client.pull_dataset(alias=dataset_alias)
+
+     # Extract examples from the dataset
+     examples = eval_dataset.examples
+
+     # Run assertion tests on each example
+     # Run assertion test on all examples at once
+     client.assert_test(
+         examples=examples,
+         scorers=[AnswerRelevancyScorer(threshold=0.8)],
+         model=model,
+         project_name=project_name,
+         eval_run_name=eval_run_name
+     )
+
+
+ def main():
+     client = JudgmentClient()
+
+     # Uncomment to create and save a new dataset
+     # dataset = create_sample_dataset()
+     # save_dataset(client, dataset, "jnpr_demo_dataset")
+
+     # # Run evaluation on the saved dataset
+     # results = run_evaluation(
+     #     client,
+     #     dataset_alias="jnpr_demo_dataset",
+     #     model="gpt-4o",
+     #     project_name="jnpr_mist_demo_project",
+     #     eval_run_name="jnpr_demo_eval"
+     # )
+
+     # Run assertion test on the saved dataset
+     results = run_assertion_test(
+         client,
+         dataset_alias="jnpr_demo_dataset",
+         model="gpt-4o",
+         project_name="jnpr_mist_demo_project",
+         eval_run_name="jnpr_demo_assertion"
+     )
+     return results
+
+
+ if __name__ == "__main__":
+     results = main()
+     print(results)
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/tracer.py
@@ -10,6 +10,7 @@ import os
  import time
  import uuid
  import warnings
+ from contextvars import ContextVar
  from contextlib import contextmanager
  from collections import defaultdict
  from dataclasses import dataclass, field
@@ -37,6 +38,7 @@ from judgeval.constants import (
      RABBITMQ_PORT,
      RABBITMQ_QUEUE,
      JUDGMENT_TRACES_DELETE_API_URL,
+     JUDGMENT_PROJECT_DELETE_API_URL,
      JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
  )
  from judgeval.judgment_client import JudgmentClient
@@ -54,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
  from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
  from langchain_core.agents import AgentAction, AgentFinish
  from langchain_core.outputs import LLMResult
-
+ from langchain_core.tracers.context import register_configure_hook
  from langchain_core.messages.ai import AIMessage
  from langchain_core.messages.tool import ToolMessage
  from langchain_core.messages.base import BaseMessage
@@ -251,7 +253,8 @@ class TraceManagerClient:
              raise ValueError(f"Failed to save trace data: {response.text}")

          if not empty_save and "ui_results_url" in response.json():
-             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+             rprint(pretty_str)

      def delete_trace(self, trace_id: str):
          """
@@ -294,6 +297,27 @@ class TraceManagerClient:
              raise ValueError(f"Failed to delete trace: {response.text}")

          return response.json()
+
+     def delete_project(self, project_name: str):
+         """
+         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+         """
+         response = requests.delete(
+             JUDGMENT_PROJECT_DELETE_API_URL,
+             json={
+                 "project_name": project_name,
+             },
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to delete traces: {response.text}")
+
+         return response.json()


  class TraceClient:
@@ -1152,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
              'args': str(messages),
              'kwargs': kwargs
          })
+
+ judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+     "judgeval_callback_handler", default=None
+ )
+
+ def set_global_handler(handler: JudgevalCallbackHandler):
+     judgeval_callback_handler_var.set(handler)
+
+ def clear_global_handler():
+     judgeval_callback_handler_var.set(None)
+
+ register_configure_hook(
+     context_var=judgeval_callback_handler_var,
+     inheritable=True,
+ )
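The new module-level helpers store the handler in a `ContextVar` that `register_configure_hook` wires into LangChain's callback configuration, which is what lets the docs drop the explicit `config=dict(callbacks=[handler])`. A rough sketch of the lifecycle, assuming these names are importable from `judgeval.common.tracer` and that `judgment` is a configured Tracer as in the docs snippet:

```python
from judgeval.common.tracer import (
    JudgevalCallbackHandler,
    judgeval_callback_handler_var,   # ContextVar holding the active handler (export path assumed)
    set_global_handler,
    clear_global_handler,
)

handler = JudgevalCallbackHandler(judgment.get_current_trace())  # `judgment` as in the docs example

set_global_handler(handler)                       # judgeval_callback_handler_var.set(handler)
assert judgeval_callback_handler_var.get() is handler
# ... LangChain / LangGraph calls made here pick up the handler automatically ...
clear_global_handler()                            # judgeval_callback_handler_var.set(None)
```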
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/constants.py
@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
  JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+ JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judgment_client.py
@@ -27,7 +27,8 @@ from judgeval.judges import JudgevalJudge
  from judgeval.constants import (
      JUDGMENT_EVAL_FETCH_API_URL,
      JUDGMENT_EVAL_DELETE_API_URL,
-     JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+     JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+     JUDGMENT_PROJECT_DELETE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -156,7 +157,7 @@ class JudgmentClient:
          metadata: Optional[Dict[str, Any]] = None,
          project_name: str = "",
          eval_run_name: str = "",
-         log_results: bool = False,
+         log_results: bool = True,
          use_judgment: bool = True,
          rules: Optional[List[Rule]] = None
      ) -> List[ScoringResult]:
@@ -362,7 +363,6 @@ class JudgmentClient:
          response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
              json={
                  "project_name": project_name,
-                 "judgment_api_key": self.judgment_api_key,
              },
              headers={
                  "Content-Type": "application/json",
@@ -372,6 +372,23 @@
          if response.status_code != requests.codes.ok:
              raise ValueError(f"Error deleting eval results: {response.json()}")
          return response.json()
+
+     def delete_project(self, project_name: str) -> bool:
+         """
+         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+         """
+         response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
+             json={
+                 "project_name": project_name,
+             },
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
+             })
+         if response.status_code != requests.codes.ok:
+             raise ValueError(f"Error deleting project: {response.json()}")
+         return response.json()

      def _validate_api_key(self):
          """
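`JudgmentClient` gains a matching `delete_project` helper that hits the new `/projects/delete/` endpoint and removes a project together with its evaluations and traces. A hedged usage sketch; the project name is a placeholder taken from the demo script:

```python
from judgeval import JudgmentClient

# The no-argument constructor is used the same way in the new demo cookbook;
# credentials are assumed to come from the environment.
client = JudgmentClient()
client.delete_project(project_name="jnpr_mist_demo_project")  # also deletes the project's evals and traces
```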
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/run_evaluation.py
@@ -1,12 +1,17 @@
  import asyncio
  import requests
- from typing import List, Dict
+ import time
+ import sys
+ import itertools
+ import threading
+ from typing import List, Dict, Any
  from datetime import datetime
  from rich import print as rprint

  from judgeval.data import (
      ScorerData,
-     ScoringResult
+     ScoringResult,
+     Example
  )
  from judgeval.scorers import (
      JudgevalScorer,
@@ -14,7 +19,6 @@ from judgeval.scorers import (
      ClassifierScorer
  )
  from judgeval.scorers.score import a_execute_scoring
-
  from judgeval.constants import (
      ROOT_API,
      JUDGMENT_EVAL_API_URL,
@@ -185,7 +189,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
          raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


- def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
      """
      Logs evaluation results to the Judgment API database.

@@ -220,7 +224,9 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
              raise JudgmentAPIError(error_message)

          if "ui_results_url" in res.json():
-             rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+             url = res.json()['ui_results_url']
+             pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+             return pretty_str

      except requests.exceptions.RequestException as e:
          error(f"Request failed while saving evaluation results to DB: {str(e)}")
@@ -229,6 +235,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
          error(f"Failed to save evaluation results to DB: {str(e)}")
          raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
+     """Run a function with a spinner in the terminal."""
+     spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+     def display_spinner():
+         while not stop_spinner_event.is_set():
+             sys.stdout.write(f'\r{message}{next(spinner)}')
+             sys.stdout.flush()
+             time.sleep(0.1)
+
+     stop_spinner_event = threading.Event()
+     spinner_thread = threading.Thread(target=display_spinner)
+     spinner_thread.start()
+
+     try:
+         result = func(*args, **kwargs)
+     except Exception as e:
+         error(f"An error occurred: {str(e)}")
+         stop_spinner_event.set()
+         spinner_thread.join()
+         raise e
+     finally:
+         stop_spinner_event.set()
+         spinner_thread.join()
+
+     sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+     sys.stdout.flush()
+
+     return result
+
+ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+     """
+     Checks if the example contains the necessary parameters for the scorer.
+     """
+     for scorer in scorers:
+         if isinstance(scorer, APIJudgmentScorer):
+             for example in examples:
+                 missing_params = []
+                 for param in scorer.required_params:
+                     if getattr(example, param.value) is None:
+                         missing_params.append(f"'{param.value}'")
+                 if missing_params:
+                     # We do this because we want to inform users that an example is missing parameters for a scorer
+                     # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
+                     print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")


  def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
@@ -253,7 +304,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
      Returns:
          List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
      """
-
+
      # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
      if not override and evaluation_run.log_results:
          check_eval_run_name_exists(
@@ -306,6 +357,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor

      # Execute evaluation using Judgment API
      if judgment_scorers:
+         check_examples(evaluation_run.examples, evaluation_run.scorers)
          info("Starting API evaluation")
          debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
          try: # execute an EvaluationRun with just JudgmentScorers
@@ -323,7 +375,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                  rules=evaluation_run.rules
              )
              debug("Sending request to Judgment API")
-             response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
+             response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
              info(f"Received {len(response_data['results'])} results from API")
          except JudgmentAPIError as e:
              error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -352,6 +404,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
              api_results.append(ScoringResult(**filtered_result))
      # Run local evals
      if local_scorers: # List[JudgevalScorer]
+         # We should be removing local scorers soon
          info("Starting local evaluation")
          for example in evaluation_run.examples:
              with example_logging_context(example.timestamp, example.example_id):
@@ -389,7 +442,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
      # )

      if evaluation_run.log_results:
-         log_evaluation_results(merged_results, evaluation_run)
+         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+         rprint(pretty_str)

      for i, result in enumerate(merged_results):
          if not result.scorers_data: # none of the scorers could be executed on this example
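`run_eval` now calls `check_examples` before dispatching to the API, printing a warning whenever an example lacks a field listed in a scorer's `required_params`. An illustrative failure case (not from the package), assuming `AnswerCorrectnessScorer` is exported from `judgeval.scorers` like the other API scorers:

```python
from judgeval.data import Example
from judgeval.scorers import AnswerCorrectnessScorer  # assumed export, mirroring AnswerRelevancyScorer

scorer = AnswerCorrectnessScorer(threshold=0.5)  # required_params: input, actual_output, expected_output
incomplete = Example(
    input="Who founded Microsoft?",
    actual_output="Bill Gates and Paul Allen founded Microsoft in 1975.",
    # expected_output is omitted on purpose
)

# When this example reaches run_eval() -> check_examples(), a warning along these
# lines is printed instead of silently failing the scorer:
# WARNING: Example <example_id> is missing the following parameters: ["'expected_output'"] for scorer <score_type>
```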
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/api_scorer.py
@@ -5,8 +5,9 @@ Scores `Example`s using ready-made Judgment evaluators.
  """

  from pydantic import BaseModel, field_validator
+ from typing import List
  from judgeval.common.logger import debug, info, warning, error
-
+ from judgeval.data import ExampleParams
  from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


@@ -20,6 +21,7 @@ class APIJudgmentScorer(BaseModel):
      """
      score_type: APIScorer
      threshold: float
+     required_params: List[ExampleParams] = []  # List of the required parameters on examples for the scorer

      @field_validator('threshold')
      def validate_threshold(cls, v, info):
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class AnswerCorrectnessScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.ANSWER_CORRECTNESS,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
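The per-scorer `required_params` lists declare which `Example` fields each evaluator needs. A sketch of an example that satisfies `AnswerCorrectnessScorer` (input, actual_output, expected_output), run through `assert_test` the same way the new demo script does; threshold, model, and run names are placeholders:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerCorrectnessScorer  # assumed export, mirroring AnswerRelevancyScorer

example = Example(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
    expected_output="Paris",
)

# assert_test with these keyword arguments matches the usage in the new demo cookbook.
JudgmentClient().assert_test(
    examples=[example],
    scorers=[AnswerCorrectnessScorer(threshold=0.8)],
    model="gpt-4o",
    project_name="demo_project",
    eval_run_name="answer_correctness_check",
)
```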
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
  # Internal imports
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
-
+ from judgeval.data import ExampleParams

  class AnswerRelevancyScorer(APIJudgmentScorer):
      def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.ANSWER_RELEVANCY,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+             ]
+         )

      @property
      def __name__(self):
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
@@ -9,12 +9,20 @@ TODO add link to docs page for this scorer
  from judgeval.scorers.api_scorer import APIJudgmentScorer
  from judgeval.constants import APIScorer
  from typing import Optional, Dict
-
+ from judgeval.data import ExampleParams
  class ComparisonScorer(APIJudgmentScorer):
      kwargs: Optional[Dict] = None

      def __init__(self, threshold: float, criteria: str, description: str):
-         super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.COMPARISON,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )
          self.kwargs = {"criteria": criteria, "description": description}

      @property
judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py (new file)
@@ -0,0 +1,28 @@
+ """
+ `judgeval` contextual precision scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from judgeval.data import ExampleParams
+
+ class ContextualPrecisionScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.CONTEXTUAL_PRECISION,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+                 ExampleParams.EXPECTED_OUTPUT,
+             ]
+         )
+
+     @property
+     def __name__(self):
+         return "Contextual Precision"
judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py (new file)
@@ -0,0 +1,28 @@
+ """
+ `judgeval` contextual recall scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from judgeval.data import ExampleParams
+
+
+ class ContextualRecallScorer(APIJudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.CONTEXTUAL_RECALL,
+             required_params=[
+                 ExampleParams.INPUT,
+                 ExampleParams.ACTUAL_OUTPUT,
+                 ExampleParams.EXPECTED_OUTPUT,
+                 ExampleParams.RETRIEVAL_CONTEXT,
+             ]
+         )
+     @property
+     def __name__(self):
+         return "Contextual Recall"