judgeval 0.0.20__tar.gz → 0.0.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. {judgeval-0.0.20 → judgeval-0.0.22}/PKG-INFO +7 -3
  2. {judgeval-0.0.20 → judgeval-0.0.22}/Pipfile +0 -2
  3. {judgeval-0.0.20 → judgeval-0.0.22}/docs/integration/langgraph.mdx +2 -1
  4. {judgeval-0.0.20 → judgeval-0.0.22}/pyproject.toml +7 -3
  5. judgeval-0.0.22/src/demo/custom_example_demo/qodo_example.py +39 -0
  6. judgeval-0.0.22/src/demo/custom_example_demo/test.py +16 -0
  7. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/tracer.py +41 -2
  8. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/constants.py +1 -0
  9. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/__init__.py +2 -3
  10. judgeval-0.0.22/src/judgeval/data/custom_example.py +98 -0
  11. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/dataset.py +17 -124
  12. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/eval_dataset_client.py +5 -11
  13. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judgment_client.py +23 -7
  14. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/run_evaluation.py +62 -8
  15. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/api_scorer.py +3 -1
  16. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
  17. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
  18. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
  19. judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +28 -0
  20. judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +28 -0
  21. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
  22. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +10 -2
  23. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
  24. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
  25. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
  26. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
  27. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
  28. judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +27 -0
  29. judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  30. judgeval-0.0.20/src/demo/cookbooks/anime_chatbot_agent/animeChatBot.py +0 -443
  31. judgeval-0.0.20/src/demo/cookbooks/ci_testing/ci_testing.py +0 -201
  32. judgeval-0.0.20/src/demo/cookbooks/ci_testing/travel_response.txt +0 -52
  33. judgeval-0.0.20/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -41
  34. judgeval-0.0.20/src/demo/cookbooks/custom_scorers/text2sql.py +0 -205
  35. judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.ipynb +0 -211
  36. judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.py +0 -262
  37. judgeval-0.0.20/src/demo/cookbooks/jpmorgan/vectordbdocs.py +0 -174
  38. judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -781
  39. judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  40. judgeval-0.0.20/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -1
  41. judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -1375
  42. judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -20
  43. judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.ipynb +0 -107
  44. judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.py +0 -109
  45. judgeval-0.0.20/src/demo/cookbooks/new_bot/basic_bot.py +0 -106
  46. judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/agent.py +0 -167
  47. judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -73
  48. judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/tools.py +0 -16
  49. judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_bot.py +0 -132
  50. judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_demo.py +0 -351
  51. judgeval-0.0.20/src/demo/cookbooks/rules_alerts/utils_helper.py +0 -78
  52. judgeval-0.0.20/src/demo/customer_use/jnpr/mist/demo.py +0 -131
  53. judgeval-0.0.20/src/demo/customer_use/jnpr/mist/test.yaml +0 -11
  54. judgeval-0.0.20/src/judgeval/data/datasets/utils.py +0 -73
  55. judgeval-0.0.20/src/judgeval/data/ground_truth.py +0 -54
  56. judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  57. judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  58. judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  59. {judgeval-0.0.20 → judgeval-0.0.22}/.github/workflows/ci.yaml +0 -0
  60. {judgeval-0.0.20 → judgeval-0.0.22}/.gitignore +0 -0
  61. {judgeval-0.0.20 → judgeval-0.0.22}/LICENSE.md +0 -0
  62. {judgeval-0.0.20 → judgeval-0.0.22}/Pipfile.lock +0 -0
  63. {judgeval-0.0.20 → judgeval-0.0.22}/README.md +0 -0
  64. {judgeval-0.0.20 → judgeval-0.0.22}/docs/README.md +0 -0
  65. {judgeval-0.0.20 → judgeval-0.0.22}/docs/api_reference/judgment_client.mdx +0 -0
  66. {judgeval-0.0.20 → judgeval-0.0.22}/docs/api_reference/trace.mdx +0 -0
  67. {judgeval-0.0.20 → judgeval-0.0.22}/docs/development.mdx +0 -0
  68. {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/code.mdx +0 -0
  69. {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/images.mdx +0 -0
  70. {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/markdown.mdx +0 -0
  71. {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/navigation.mdx +0 -0
  72. {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/reusable-snippets.mdx +0 -0
  73. {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/settings.mdx +0 -0
  74. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/data_datasets.mdx +0 -0
  75. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/data_examples.mdx +0 -0
  76. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/introduction.mdx +0 -0
  77. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/judges.mdx +0 -0
  78. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  79. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  80. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  81. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/comparison.mdx +0 -0
  82. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  83. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  84. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  85. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  86. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/execution_order.mdx +0 -0
  87. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  88. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/hallucination.mdx +0 -0
  89. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/introduction.mdx +0 -0
  90. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  91. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/summarization.mdx +0 -0
  92. {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/unit_testing.mdx +0 -0
  93. {judgeval-0.0.20 → judgeval-0.0.22}/docs/favicon.svg +0 -0
  94. {judgeval-0.0.20 → judgeval-0.0.22}/docs/getting_started.mdx +0 -0
  95. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/basic_trace_example.png +0 -0
  96. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/checks-passed.png +0 -0
  97. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/create_aggressive_scorer.png +0 -0
  98. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/create_scorer.png +0 -0
  99. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/evaluation_diagram.png +0 -0
  100. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/hero-dark.svg +0 -0
  101. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/hero-light.svg +0 -0
  102. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/online_eval_fault.png +0 -0
  103. {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/trace_ss.png +0 -0
  104. {judgeval-0.0.20 → judgeval-0.0.22}/docs/introduction.mdx +0 -0
  105. {judgeval-0.0.20 → judgeval-0.0.22}/docs/judgment/introduction.mdx +0 -0
  106. {judgeval-0.0.20 → judgeval-0.0.22}/docs/logo/dark.svg +0 -0
  107. {judgeval-0.0.20 → judgeval-0.0.22}/docs/logo/light.svg +0 -0
  108. {judgeval-0.0.20 → judgeval-0.0.22}/docs/mint.json +0 -0
  109. {judgeval-0.0.20 → judgeval-0.0.22}/docs/monitoring/introduction.mdx +0 -0
  110. {judgeval-0.0.20 → judgeval-0.0.22}/docs/monitoring/production_insights.mdx +0 -0
  111. {judgeval-0.0.20 → judgeval-0.0.22}/docs/monitoring/tracing.mdx +0 -0
  112. {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/create_dataset.ipynb +0 -0
  113. {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/create_scorer.ipynb +0 -0
  114. {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/demo.ipynb +0 -0
  115. {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/prompt_scorer.ipynb +0 -0
  116. {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/quickstart.ipynb +0 -0
  117. {judgeval-0.0.20 → judgeval-0.0.22}/docs/quickstart.mdx +0 -0
  118. {judgeval-0.0.20 → judgeval-0.0.22}/docs/snippets/snippet-intro.mdx +0 -0
  119. {judgeval-0.0.20 → judgeval-0.0.22}/pytest.ini +0 -0
  120. {judgeval-0.0.20 → judgeval-0.0.22}/src/demo/cookbooks/JNPR_Mist/test.py +0 -0
  121. {judgeval-0.0.20 → judgeval-0.0.22}/src/demo/cookbooks/linkd/text2sql.py +0 -0
  122. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/__init__.py +0 -0
  123. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/clients.py +0 -0
  124. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/__init__.py +0 -0
  125. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/exceptions.py +0 -0
  126. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/logger.py +0 -0
  127. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/utils.py +0 -0
  128. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/api_example.py +0 -0
  129. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/__init__.py +0 -0
  130. /judgeval-0.0.20/src/demo/customer_use/jnpr/srikar_demo.py → /judgeval-0.0.22/src/judgeval/data/datasets/utils.py +0 -0
  131. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/example.py +0 -0
  132. /judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py → /judgeval-0.0.22/src/judgeval/data/ground_truth.py +0 -0
  133. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/result.py +0 -0
  134. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/scorer_data.py +0 -0
  135. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/evaluation_run.py +0 -0
  136. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/__init__.py +0 -0
  137. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/base_judge.py +0 -0
  138. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/litellm_judge.py +0 -0
  139. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/mixture_of_judges.py +0 -0
  140. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/together_judge.py +0 -0
  141. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/utils.py +0 -0
  142. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/rules.py +0 -0
  143. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/__init__.py +0 -0
  144. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/base_scorer.py +0 -0
  145. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/exceptions.py +0 -0
  146. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  147. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  148. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  149. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  150. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  151. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  152. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  153. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  154. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  155. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  156. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  157. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  158. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  159. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
  160. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
  161. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  162. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  163. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  164. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  165. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  166. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  167. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  168. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  169. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  170. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
  171. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
  172. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  173. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  174. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  175. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  176. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  177. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  178. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
  179. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
  180. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
  181. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  182. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  183. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  184. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  185. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/prompt_scorer.py +0 -0
  186. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/score.py +0 -0
  187. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/utils.py +0 -0
  188. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/tracer/__init__.py +0 -0
  189. {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/utils/alerts.py +0 -0
{judgeval-0.0.20 → judgeval-0.0.22}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.20
+Version: 0.0.22
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: langchain
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
-Requires-Dist: langfuse==2.50.3; extra == 'dev'
-Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.20 → judgeval-0.0.22}/Pipfile
@@ -29,8 +29,6 @@ pytest = "*"
 pytest-asyncio = "*"
 pytest-mock = "*"
 tavily-python = "*"
-patronus = "*"
-langfuse = "==2.50.3"
 
 [requires]
 python_version = "3.11"
{judgeval-0.0.20 → judgeval-0.0.22}/docs/integration/langgraph.mdx
@@ -14,10 +14,11 @@ graph_builder = StateGraph(State)
 # YOUR LANGGRAPH WORKFLOW
 
 handler = JudgevalCallbackHandler(judgment.get_current_trace())
+set_global_handler(handler)
 
 result = graph.invoke({
     "messages": [HumanMessage(content=prompt)]
-}, config=dict(callbacks=[handler]))
+})
 
 ```
 
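The doc update above switches the LangGraph integration from threading the handler through `config=dict(callbacks=[handler])` to registering it once with the new `set_global_handler` (added to `tracer.py` further down in this diff). Below is a minimal end-to-end sketch of the new pattern; the `State` schema, the placeholder `chatbot` node, and the `Tracer(project_name=...)` construction are illustrative assumptions, only the handler wiring comes from the diff.

```python
from typing import Annotated, TypedDict

from langchain_core.messages import AIMessage, HumanMessage
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages

# set_global_handler / JudgevalCallbackHandler are defined in judgeval/common/tracer.py in 0.0.22.
from judgeval.common.tracer import Tracer, JudgevalCallbackHandler, set_global_handler

class State(TypedDict):
    messages: Annotated[list, add_messages]

def chatbot(state: State) -> dict:
    # Placeholder node; a real workflow would call an LLM here.
    return {"messages": [AIMessage(content="Hello from the placeholder node.")]}

graph_builder = StateGraph(State)
graph_builder.add_node("chatbot", chatbot)
graph_builder.add_edge(START, "chatbot")
graph_builder.add_edge("chatbot", END)
graph = graph_builder.compile()

# Assumption: Tracer reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment.
judgment = Tracer(project_name="langgraph_demo")

handler = JudgevalCallbackHandler(judgment.get_current_trace())
set_global_handler(handler)  # replaces config=dict(callbacks=[handler]) on every invoke

result = graph.invoke({"messages": [HumanMessage(content="What does judgeval trace?")]})
```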
{judgeval-0.0.20 → judgeval-0.0.22}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.20"
+version = "0.0.22"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,12 @@ dependencies = [
     "anthropic",
     "nest-asyncio",
     "pika",
+    "openpyxl",
+    "langchain",
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
 ]
 
 [project.optional-dependencies]
@@ -35,8 +41,6 @@ dev = [
     "pytest>=8.3.4",
     "pytest-asyncio>=0.25.0",
     "pytest-mock>=3.14.0",
-    "langfuse==2.50.3",
-    "patronus",
     "tavily-python"
 ]
 
judgeval-0.0.22/src/demo/custom_example_demo/qodo_example.py
@@ -0,0 +1,39 @@
+from judgeval.data import CustomExample
+from pydantic import field_validator
+
+class QodoExample(CustomExample):
+    code: str
+    original_code: str
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @field_validator('code', 'original_code', mode='before')
+    @classmethod
+    def validate_code(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Code must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    def to_dict(self):
+        return {
+            "code": self.code,
+            "original_code": self.original_code,
+            **super().to_dict()
+        }
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that handles special cases for fields that might fail standard serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Do any additional serialization here
+        data["code"] = self.code
+        data["original_code"] = self.original_code
+
+        return data
+
+
+
+
judgeval-0.0.22/src/demo/custom_example_demo/test.py
@@ -0,0 +1,16 @@
+from judgeval.data import CustomExample
+from judgeval import JudgmentClient
+from qodo_example import QodoExample
+
+custom_example = CustomExample(
+    code="print('Hello, world!')",
+    original_code="print('Hello, world!')",
+)
+
+qodo_example = QodoExample(
+    code="print('Hello, world!')",
+    original_code="print('Hello, world!')",
+)
+
+print(f"{custom_example=}")
+print(f"{qodo_example=}")
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/tracer.py
@@ -10,6 +10,7 @@ import os
 import time
 import uuid
 import warnings
+from contextvars import ContextVar
 from contextlib import contextmanager
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -37,6 +38,7 @@ from judgeval.constants import (
     RABBITMQ_PORT,
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
@@ -54,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain_core.outputs import LLMResult
-
+from langchain_core.tracers.context import register_configure_hook
 from langchain_core.messages.ai import AIMessage
 from langchain_core.messages.tool import ToolMessage
 from langchain_core.messages.base import BaseMessage
@@ -251,7 +253,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to save trace data: {response.text}")
 
         if not empty_save and "ui_results_url" in response.json():
-            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+            pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+            rprint(pretty_str)
 
     def delete_trace(self, trace_id: str):
         """
@@ -294,6 +297,27 @@ class TraceManagerClient:
             raise ValueError(f"Failed to delete trace: {response.text}")
 
         return response.json()
+
+    def delete_project(self, project_name: str):
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(
+            JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete traces: {response.text}")
+
+        return response.json()
 
 
 class TraceClient:
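The new `TraceManagerClient.delete_project` above is a thin wrapper around the `/projects/delete/` endpoint added to `constants.py` later in this diff. For reference, a sketch of the equivalent raw request; the base-URL default and the credentials read from the environment are placeholders, not values from this diff.

```python
import os
from http import HTTPStatus

import requests

# Mirrors JUDGMENT_PROJECT_DELETE_API_URL from constants.py; the ROOT_API default is a placeholder.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"

# Deleting a project also removes its evaluations and traces, per the docstring above.
response = requests.delete(
    JUDGMENT_PROJECT_DELETE_API_URL,
    json={"project_name": "my_project"},
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('JUDGMENT_API_KEY')}",
        "X-Organization-Id": os.getenv("JUDGMENT_ORG_ID"),
    },
)
if response.status_code != HTTPStatus.OK:
    raise ValueError(f"Failed to delete project: {response.text}")
```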
@@ -1152,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             'args': str(messages),
             'kwargs': kwargs
         })
+
+judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+    "judgeval_callback_handler", default=None
+)
+
+def set_global_handler(handler: JudgevalCallbackHandler):
+    judgeval_callback_handler_var.set(handler)
+
+def clear_global_handler():
+    judgeval_callback_handler_var.set(None)
+
+register_configure_hook(
+    context_var=judgeval_callback_handler_var,
+    inheritable=True,
+)
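Because the handler now lives in a `ContextVar` registered through LangChain's `register_configure_hook(..., inheritable=True)`, any chain or graph invoked in that context picks the handler up without explicit `callbacks=` plumbing. A small sketch, assuming you want to scope the handler to a single run rather than leave it set globally:

```python
from judgeval.common.tracer import set_global_handler, clear_global_handler

def traced_invoke(graph, payload, handler):
    """Attach the judgeval handler for one run, then detach it again."""
    set_global_handler(handler)
    try:
        # Every LangChain / LangGraph call made here inherits the handler
        # via the registered ContextVar; no callbacks= argument is needed.
        return graph.invoke(payload)
    finally:
        clear_global_handler()
```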
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/constants.py
@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/__init__.py
@@ -2,8 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.ground_truth import GroundTruthExample
-
+from judgeval.data.custom_example import CustomExample
 __all__ = [
     "Example",
     "ExampleParams",
@@ -13,5 +12,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "GroundTruthExample",
+    "CustomExample",
 ]
judgeval-0.0.22/src/judgeval/data/custom_example.py
@@ -0,0 +1,98 @@
+from pydantic import BaseModel, Field, field_validator
+from typing import Optional, Dict, Any
+from uuid import uuid4
+from datetime import datetime
+import json
+import warnings
+
+# Brainstorming what are the requirements for the fields?
+class CustomExample(BaseModel):
+    name: Optional[str] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
+    timestamp: Optional[str] = None
+    trace_id: Optional[str] = None
+
+    model_config = {
+        "extra": "allow",  # Allow extra fields with any types
+    }
+
+    def __init__(self, **data):
+        if 'example_id' not in data:
+            data['example_id'] = str(uuid4())
+        # Set timestamp if not provided
+        if 'timestamp' not in data:
+            data['timestamp'] = datetime.now().isoformat()
+        super().__init__(**data)
+
+    @field_validator('additional_metadata', mode='before')
+    @classmethod
+    def validate_additional_metadata(cls, v):
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('example_index', mode='before')
+    @classmethod
+    def validate_example_index(cls, v):
+        if v is not None and not isinstance(v, int):
+            raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('timestamp', mode='before')
+    @classmethod
+    def validate_timestamp(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('trace_id', mode='before')
+    @classmethod
+    def validate_trace_id(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    def to_dict(self):
+        return self.model_dump()
+
+    def __str__(self):
+        return str(self.model_dump())
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that handles special cases for fields that might fail standard serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Get all fields including custom ones
+        all_fields = self.__dict__
+
+        for field_name, value in all_fields.items():
+            try:
+                # Check if the field has its own serialization method
+                if hasattr(value, 'to_dict'):
+                    data[field_name] = value.to_dict()
+                elif hasattr(value, 'model_dump'):
+                    data[field_name] = value.model_dump()
+                # Field is already in data from super().model_dump()
+                elif field_name in data:
+                    continue
+                else:
+                    # Try standard JSON serialization
+                    json.dumps(value)
+                    data[field_name] = value
+            except (TypeError, OverflowError, ValueError):
+                # Handle non-serializable objects
+                try:
+                    # Try converting to string
+                    data[field_name] = str(value)
+                except Exception as _:
+                    # If all else fails, store as None and optionally warn
+                    warnings.warn(f"Could not serialize field {field_name}, setting to None")
+                    data[field_name] = None
+
+        return data
+
+
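Two things are worth noting about the new `CustomExample`: `model_config = {"extra": "allow"}` lets callers attach undeclared fields (which Pydantic v2 includes in `model_dump()`), and `example_id`/`timestamp` are auto-filled in `__init__`. A quick illustration, assuming Pydantic v2 semantics; the extra field names below are arbitrary, not part of the API:

```python
from judgeval.data import CustomExample

# Extra keyword fields are accepted because model_config sets extra="allow".
ex = CustomExample(
    name="support ticket #42",
    additional_metadata={"channel": "email"},
    query="How do I reset my password?",      # undeclared field
    expected_answer="Use the settings page",  # undeclared field
)

dumped = ex.model_dump()
assert "example_id" in dumped and "timestamp" in dumped  # auto-populated in __init__
assert dumped["query"] == "How do I reset my password?"  # extras ride along in the dump
print(ex)  # __str__ also routes through model_dump()
```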
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/dataset.py
@@ -7,12 +7,11 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, GroundTruthExample
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
-    ground_truths: List[GroundTruthExample]
     examples: List[Example]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
@@ -21,13 +20,11 @@ class EvalDataset:
     def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
                  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
-                 ground_truths: List[GroundTruthExample] = [],
                  examples: List[Example] = [],
                  ):
-        debug(f"Initializing EvalDataset with {len(ground_truths)} ground truths and {len(examples)} examples")
+        debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
-        self.ground_truths = ground_truths
         self.examples = examples
         self._alias = None
         self._id = None
@@ -37,38 +34,13 @@ class EvalDataset:
     def add_from_json(self, file_path: str) -> None:
         debug(f"Loading dataset from JSON file: {file_path}")
         """
-        Adds examples and ground truths from a JSON file.
+        Adds examples from a JSON file.
 
-        The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths".
-        The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+        The format of the JSON file is expected to be a dictionary with one key: "examples".
+        The value of the key is a list of dictionaries, where each dictionary represents an example.
 
         The JSON file is expected to have the following format:
         {
-            "ground_truths": [
-                {
-                    "input": "test input",
-                    "actual_output": null,
-                    "expected_output": "expected output",
-                    "context": [
-                        "context1"
-                    ],
-                    "retrieval_context": [
-                        "retrieval1"
-                    ],
-                    "additional_metadata": {
-                        "key": "value"
-                    },
-                    "comments": "test comment",
-                    "tools_called": [
-                        "tool1"
-                    ],
-                    "expected_tools": [
-                        "tool1"
-                    ],
-                    "source_file": "test.py",
-                    "trace_id": "094121"
-                }
-            ],
             "examples": [
                 {
                     "input": "test input",
@@ -103,7 +75,6 @@ class EvalDataset:
             with open(file_path, "r") as file:
                 payload = json.load(file)
                 examples = payload.get("examples", [])
-                ground_truths = payload.get("ground_truths", [])
         except FileNotFoundError:
             error(f"JSON file not found: {file_path}")
             raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -111,21 +82,17 @@ class EvalDataset:
             error(f"Invalid JSON file: {file_path}")
             raise ValueError(f"The file {file_path} is not a valid JSON file.")
 
-        info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from JSON")
+        info(f"Added {len(examples)} examples from JSON")
         new_examples = [Example(**e) for e in examples]
         for e in new_examples:
             self.add_example(e)
-
-        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
-        for g in new_ground_truths:
-            self.add_ground_truth(g)
 
     def add_from_csv(
         self,
         file_path: str,
     ) -> None:
         """
-        Add Examples and GroundTruthExamples from a CSV file.
+        Add Examples from a CSV file.
         """
         try:
             import pandas as pd
@@ -144,14 +111,14 @@ class EvalDataset:
         "expected_tools", "name", "comments", "source_file", "example", \
         "trace_id"
 
-        We want to collect the examples and ground truths separately which can
+        We want to collect the examples separately which can
         be determined by the "example" column. If the value is True, then it is an
-        example, otherwise it is a ground truth.
+        example
 
         We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
         This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
         """
-        examples, ground_truths = [], []
+        examples = []
 
         for _, row in df.iterrows():
             data = {
@@ -174,49 +141,20 @@ class EvalDataset:
                     examples.append(e)
                 else:
                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-            else:
-                # GroundTruthExample has `comments` and `source_file` fields
-                data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
-                data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
-                # every GroundTruthExample has `input` field
-                if data["input"] is not None:
-                    g = GroundTruthExample(**data)
-                    ground_truths.append(g)
-                else:
-                    raise ValueError("Every ground truth must have an 'input' field.")
+
 
         for e in examples:
             self.add_example(e)
 
-        for g in ground_truths:
-            self.add_ground_truth(g)
-
     def add_from_yaml(self, file_path: str) -> None:
         debug(f"Loading dataset from YAML file: {file_path}")
         """
-        Adds examples and ground truths from a YAML file.
+        Adds examples from a YAML file.
 
-        The format of the YAML file is expected to be a dictionary with two keys: "examples" and "ground_truths".
-        The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
+        The format of the YAML file is expected to be a dictionary with one key: "examples".
+        The value of the key is a list of dictionaries, where each dictionary represents an example.
 
         The YAML file is expected to have the following format:
-        ground_truths:
-        - input: "test input"
-          actual_output: null
-          expected_output: "expected output"
-          context:
-          - "context1"
-          retrieval_context:
-          - "retrieval1"
-          additional_metadata:
-            key: "value"
-          comments: "test comment"
-          tools_called:
-          - "tool1"
-          expected_tools:
-          - "tool1"
-          source_file: "test.py"
-          trace_id: "094121"
         examples:
         - input: "test input"
           actual_output: "test output"
@@ -244,7 +182,6 @@ class EvalDataset:
             if payload is None:
                 raise ValueError("The YAML file is empty.")
             examples = payload.get("examples", [])
-            ground_truths = payload.get("ground_truths", [])
         except FileNotFoundError:
             error(f"YAML file not found: {file_path}")
             raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -252,25 +189,18 @@ class EvalDataset:
             error(f"Invalid YAML file: {file_path}")
             raise ValueError(f"The file {file_path} is not a valid YAML file.")
 
-        info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from YAML")
+        info(f"Added {len(examples)} examples from YAML")
         new_examples = [Example(**e) for e in examples]
         for e in new_examples:
             self.add_example(e)
 
-        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
-        for g in new_ground_truths:
-            self.add_ground_truth(g)
-
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_ground_truth(self, g: GroundTruthExample) -> None:
-        self.ground_truths = self.ground_truths + [g]
 
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
-        Saves the dataset as a file. Save both the ground truths and examples.
+        Saves the dataset as a file. Save only the examples.
 
         Args:
             file_type (Literal["json", "csv"]): The file type to save the dataset as.
@@ -285,7 +215,6 @@ class EvalDataset:
         with open(complete_path, "w") as file:
             json.dump(
                 {
-                    "ground_truths": [g.to_dict() for g in self.ground_truths],
                     "examples": [e.to_dict() for e in self.examples],
                 },
                 file,
@@ -319,24 +248,7 @@ class EvalDataset:
                     ]
                 )
 
-            for g in self.ground_truths:
-                writer.writerow(
-                    [
-                        g.input,
-                        g.actual_output,
-                        g.expected_output,
-                        ";".join(g.context),
-                        ";".join(g.retrieval_context),
-                        g.additional_metadata,
-                        ";".join(g.tools_called),
-                        ";".join(g.expected_tools),
-                        None,  # GroundTruthExample does not have name
-                        g.comments,
-                        g.source_file,
-                        False,  # Adding a GroundTruthExample, not an Example
-                        g.trace_id
-                    ]
-                )
+
 
         elif file_type == "yaml":
             with open(complete_path, "w") as file:
                 yaml_data = {
@@ -358,24 +270,6 @@ class EvalDataset:
                         }
                         for e in self.examples
                     ],
-                    "ground_truths": [
-                        {
-                            "input": g.input,
-                            "actual_output": g.actual_output,
-                            "expected_output": g.expected_output,
-                            "context": g.context,
-                            "retrieval_context": g.retrieval_context,
-                            "additional_metadata": g.additional_metadata,
-                            "tools_called": g.tools_called,
-                            "expected_tools": g.expected_tools,
-                            "name": None,  # GroundTruthExample does not have name
-                            "comments": g.comments,
-                            "source_file": g.source_file,
-                            "example": False,  # Adding a GroundTruthExample, not an Example
-                            "trace_id": g.trace_id
-                        }
-                        for g in self.ground_truths
-                    ]
                 }
                 yaml.dump(yaml_data, file, default_flow_style=False)
         else:
@@ -391,7 +285,6 @@ class EvalDataset:
     def __str__(self):
         return (
             f"{self.__class__.__name__}("
-            f"ground_truths={self.ground_truths}, "
             f"examples={self.examples}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
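With `GroundTruthExample` gone, dataset files now carry only an `examples` list; a `ground_truths` key is simply ignored by the loaders. A minimal sketch of the new format and `add_from_json`; the import path is assumed from the package layout shown in the file list, the example fields follow the docstrings above, and `EvalDataset()` is assumed to pick up `JUDGMENT_API_KEY`/`JUDGMENT_ORG_ID` from the environment as its defaults suggest.

```python
import json

from judgeval.data.datasets import EvalDataset

# Examples-only format expected by add_from_json in 0.0.22.
payload = {
    "examples": [
        {
            "input": "What is the capital of France?",
            "actual_output": "Paris",
            "expected_output": "Paris",
            "retrieval_context": ["France's capital is Paris."],
        }
    ]
}
with open("dataset.json", "w") as f:
    json.dump(payload, f)

dataset = EvalDataset()
dataset.add_from_json("dataset.json")
print(dataset)  # repr no longer includes ground_truths
```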