judgeval 0.0.6__tar.gz → 0.0.8__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {judgeval-0.0.6 → judgeval-0.0.8}/PKG-INFO +3 -1
  2. judgeval-0.0.8/docs/evaluation/unit_testing.mdx +37 -0
  3. {judgeval-0.0.6 → judgeval-0.0.8}/docs/introduction.mdx +2 -2
  4. {judgeval-0.0.6 → judgeval-0.0.8}/docs/mint.json +2 -1
  5. {judgeval-0.0.6 → judgeval-0.0.8}/pyproject.toml +3 -1
  6. judgeval-0.0.8/src/demo/cookbooks/ci_testing/ci_testing.py +201 -0
  7. judgeval-0.0.8/src/demo/cookbooks/ci_testing/travel_response.txt +52 -0
  8. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/openai_travel_agent/agent.py +0 -2
  9. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judgment_client.py +6 -5
  10. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorer.py +2 -0
  11. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +10 -5
  12. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +11 -5
  13. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +10 -5
  14. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +10 -5
  15. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +10 -5
  16. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +11 -6
  17. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +10 -5
  18. judgeval-0.0.8/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +5 -0
  19. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +13 -6
  20. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +10 -1
  21. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +10 -4
  22. judgeval-0.0.6/src/test.txt +0 -51
  23. judgeval-0.0.6/test.txt +0 -0
  24. {judgeval-0.0.6 → judgeval-0.0.8}/.github/workflows/ci.yaml +0 -0
  25. {judgeval-0.0.6 → judgeval-0.0.8}/.gitignore +0 -0
  26. {judgeval-0.0.6 → judgeval-0.0.8}/LICENSE.md +0 -0
  27. {judgeval-0.0.6 → judgeval-0.0.8}/Pipfile +0 -0
  28. {judgeval-0.0.6 → judgeval-0.0.8}/Pipfile.lock +0 -0
  29. {judgeval-0.0.6 → judgeval-0.0.8}/README.md +0 -0
  30. {judgeval-0.0.6 → judgeval-0.0.8}/docs/README.md +0 -0
  31. {judgeval-0.0.6 → judgeval-0.0.8}/docs/development.mdx +0 -0
  32. {judgeval-0.0.6 → judgeval-0.0.8}/docs/essentials/code.mdx +0 -0
  33. {judgeval-0.0.6 → judgeval-0.0.8}/docs/essentials/images.mdx +0 -0
  34. {judgeval-0.0.6 → judgeval-0.0.8}/docs/essentials/markdown.mdx +0 -0
  35. {judgeval-0.0.6 → judgeval-0.0.8}/docs/essentials/navigation.mdx +0 -0
  36. {judgeval-0.0.6 → judgeval-0.0.8}/docs/essentials/reusable-snippets.mdx +0 -0
  37. {judgeval-0.0.6 → judgeval-0.0.8}/docs/essentials/settings.mdx +0 -0
  38. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/data_datasets.mdx +0 -0
  39. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/data_examples.mdx +0 -0
  40. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/introduction.mdx +0 -0
  41. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/judges.mdx +0 -0
  42. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  43. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  44. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  45. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  46. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  47. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  48. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  49. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  50. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/hallucination.mdx +0 -0
  51. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/introduction.mdx +0 -0
  52. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  53. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/summarization.mdx +0 -0
  54. {judgeval-0.0.6 → judgeval-0.0.8}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
  55. {judgeval-0.0.6 → judgeval-0.0.8}/docs/favicon.svg +0 -0
  56. {judgeval-0.0.6 → judgeval-0.0.8}/docs/getting_started.mdx +0 -0
  57. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/checks-passed.png +0 -0
  58. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/create_aggressive_scorer.png +0 -0
  59. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/create_scorer.png +0 -0
  60. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/evaluation_diagram.png +0 -0
  61. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/hero-dark.svg +0 -0
  62. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/hero-light.svg +0 -0
  63. {judgeval-0.0.6 → judgeval-0.0.8}/docs/images/trace_screenshot.png +0 -0
  64. {judgeval-0.0.6 → judgeval-0.0.8}/docs/judgment/introduction.mdx +0 -0
  65. {judgeval-0.0.6 → judgeval-0.0.8}/docs/logo/dark.svg +0 -0
  66. {judgeval-0.0.6 → judgeval-0.0.8}/docs/logo/light.svg +0 -0
  67. {judgeval-0.0.6 → judgeval-0.0.8}/docs/monitoring/introduction.mdx +0 -0
  68. {judgeval-0.0.6 → judgeval-0.0.8}/docs/monitoring/production_insights.mdx +0 -0
  69. {judgeval-0.0.6 → judgeval-0.0.8}/docs/monitoring/tracing.mdx +0 -0
  70. {judgeval-0.0.6 → judgeval-0.0.8}/docs/notebooks/create_dataset.ipynb +0 -0
  71. {judgeval-0.0.6 → judgeval-0.0.8}/docs/notebooks/create_scorer.ipynb +0 -0
  72. {judgeval-0.0.6 → judgeval-0.0.8}/docs/notebooks/demo.ipynb +0 -0
  73. {judgeval-0.0.6 → judgeval-0.0.8}/docs/notebooks/prompt_scorer.ipynb +0 -0
  74. {judgeval-0.0.6 → judgeval-0.0.8}/docs/notebooks/quickstart.ipynb +0 -0
  75. {judgeval-0.0.6 → judgeval-0.0.8}/docs/quickstart.mdx +0 -0
  76. {judgeval-0.0.6 → judgeval-0.0.8}/docs/snippets/snippet-intro.mdx +0 -0
  77. {judgeval-0.0.6 → judgeval-0.0.8}/pytest.ini +0 -0
  78. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
  79. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  80. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
  81. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
  82. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
  83. {judgeval-0.0.6 → judgeval-0.0.8}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
  84. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/__init__.py +0 -0
  85. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/clients.py +0 -0
  86. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/common/__init__.py +0 -0
  87. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/common/exceptions.py +0 -0
  88. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/common/logger.py +0 -0
  89. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/common/tracer.py +0 -0
  90. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/common/utils.py +0 -0
  91. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/constants.py +0 -0
  92. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/__init__.py +0 -0
  93. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/api_example.py +0 -0
  94. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/datasets/__init__.py +0 -0
  95. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/datasets/dataset.py +0 -0
  96. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/datasets/ground_truth.py +0 -0
  97. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/datasets/utils.py +0 -0
  98. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/example.py +0 -0
  99. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/result.py +0 -0
  100. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/data/scorer_data.py +0 -0
  101. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/evaluation_run.py +0 -0
  102. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judges/__init__.py +0 -0
  103. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judges/base_judge.py +0 -0
  104. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judges/litellm_judge.py +0 -0
  105. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judges/mixture_of_judges.py +0 -0
  106. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judges/together_judge.py +0 -0
  107. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/judges/utils.py +0 -0
  108. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/run_evaluation.py +0 -0
  109. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/__init__.py +0 -0
  110. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/api_scorer.py +0 -0
  111. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/base_scorer.py +0 -0
  112. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/exceptions.py +0 -0
  113. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  114. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  115. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  116. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  117. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  118. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  119. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  120. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  121. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  122. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  123. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  124. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
  125. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  126. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  127. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  128. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  129. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  130. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  131. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  132. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  133. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  134. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  135. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  136. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  137. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  138. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  139. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  140. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  141. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  142. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
  143. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/prompt_scorer.py +0 -0
  144. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/score.py +0 -0
  145. {judgeval-0.0.6 → judgeval-0.0.8}/src/judgeval/scorers/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,9 +14,11 @@ Requires-Dist: anthropic
14
14
  Requires-Dist: fastapi
15
15
  Requires-Dist: langfuse==2.50.3
16
16
  Requires-Dist: litellm
17
+ Requires-Dist: nest-asyncio
17
18
  Requires-Dist: openai
18
19
  Requires-Dist: pandas
19
20
  Requires-Dist: patronus
21
+ Requires-Dist: pika
20
22
  Requires-Dist: python-dotenv==1.0.1
21
23
  Requires-Dist: requests
22
24
  Requires-Dist: supabase
@@ -0,0 +1,37 @@
1
+ ---
2
+ title: Unit Testing
3
+ ---
4
+
5
+ CI pipelines are the core of all mature software engineering practices.
6
+
7
+ **With LLMs, developers should expect nothing less.**
8
+ Using `judgeval`, you can easily unit test your LLM applications for consistency and quality in any metric of your choice.
9
+
10
+ Unit testing is natively supported in `judgeval` through the `client.assert_test` method.
11
+ **This also integrates with `pytest`, meaning you won't have to learn any new testing frameworks!**
12
+
13
+ ```python
14
+ from judgeval import JudgmentClient
15
+ from judgeval.data import Example
16
+ from judgeval.scorers import FaithfulnessScorer
17
+
18
+ def test_faithfulness():
19
+ client = JudgmentClient()
20
+
21
+ example = Example(
22
+ input="What is the capital of France?",
23
+ actual_output="The capital of France is Lyon.",
24
+ retrieval_context=["Come tour Paris' museums in the capital of France!"],
25
+ )
26
+ with pytest.raises(AssertionError):
27
+ client.assert_test(
28
+ eval_run_name="test_eval",
29
+ examples=[example],
30
+ scorers=[FaithfulnessScorer(threshold=1.0)],
31
+ )
32
+ ```
33
+
34
+ `judgeval` naturally integrates into your CI pipelines, allowing you to execute robust unit tests across your entire codebase.
35
+ **This allows you to catch regressions in your LLM applications before they make it to production!**
36
+
37
+
@@ -82,8 +82,8 @@ Judgeval is designed for AI teams to easily benchmark and iterate on their LLM a
82
82
  - Construct powerful custom evaluation pipelines for your LLM systems.
83
83
  - Monitor LLM systems in production using state-of-the-art **real-time evaluation foundation models**.
84
84
 
85
- Judgeval integrates natively with the **Judgment Labs Platform**, allowing you to evaluate, regression test,
86
- and monitor LLM applications in the cloud.
85
+ Judgeval integrates natively with the **Judgment Labs Platform**, allowing you to [evaluate](/evaluation/introduction), regression test,
86
+ and [monitor](/monitoring/introduction) LLM applications in the cloud.
87
87
 
88
88
  Judgeval was built by a passionate team of LLM researchers from **Stanford, Datadog, and Together AI**.
89
89
 
@@ -70,7 +70,8 @@
70
70
  "evaluation/scorers/classifier_scorer"
71
71
  ]
72
72
  },
73
- "evaluation/judges"
73
+ "evaluation/judges",
74
+ "evaluation/unit_testing"
74
75
  ]
75
76
  },
76
77
  {
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.0.6"
3
+ version = "0.0.8"
4
4
  authors = [
5
5
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
6
6
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,8 @@ dependencies = [
28
28
  "together",
29
29
  "anthropic",
30
30
  "patronus",
31
+ "nest-asyncio",
32
+ "pika",
31
33
  ]
32
34
 
33
35
  [project.optional-dependencies]
@@ -0,0 +1,201 @@
1
+ """
2
+ Cookbook for CI testing LLM applications using `judgeval`
3
+
4
+ Includes unit tests and end-to-end tests for an OpenAI API-based travel agent
5
+ """
6
+
7
+ import asyncio
8
+ import os
9
+ import pytest
10
+ from demo.cookbooks.openai_travel_agent.agent import *
11
+ from judgeval import JudgmentClient
12
+ from judgeval.data import Example
13
+ from judgeval.scorers import (
14
+ AnswerCorrectnessScorer,
15
+ AnswerRelevancyScorer,
16
+ FaithfulnessScorer
17
+ )
18
+
19
+
20
+ @pytest.fixture
21
+ def judgment_client():
22
+ return JudgmentClient()
23
+
24
+
25
+ @pytest.fixture
26
+ def research_data():
27
+ return {
28
+ "attractions": [
29
+ "The iconic Eiffel Tower stands at 324 meters tall and welcomes over 7 million visitors annually. Visitors can access three levels, with the top floor offering panoramic views of Paris. The tower features two restaurants: 58 Tour Eiffel and the Michelin-starred Le Jules Verne.",
30
+ "The Louvre Museum houses over 380,000 objects and displays 35,000 works of art across eight departments. Home to the Mona Lisa and Venus de Milo, it's the world's largest art museum with 72,735 square meters of exhibition space. Visitors typically need 3-4 hours to see the highlights.",
31
+ "The historic district of Montmartre sits on Paris's highest hill at 130 meters. Famous for the white Sacré-Cœur Basilica and Place du Tertre filled with artists, it was once home to renowned painters like Picasso and Van Gogh. The area retains its village-like charm with winding cobblestone streets and authentic Parisian cafes."
32
+ ],
33
+ "hotels": [
34
+ "Hotel de la Paix is a luxurious 5-star establishment in the 16th arrondissement, featuring 85 rooms and suites decorated in classic Parisian style. The hotel offers a Michelin-starred restaurant, spa facilities, and is located just 10 minutes from the Arc de Triomphe.",
35
+ "Hotel de Paris, situated in the Opera district, combines Belle Époque architecture with modern amenities. Recently renovated in 2022, it offers 107 rooms, a rooftop bar with Eiffel Tower views, and has received the Palace distinction for exceptional service.",
36
+ "Hotel de Ville, a boutique hotel in Le Marais, occupies a restored 17th-century mansion. With 40 individually designed rooms, a courtyard garden, and acclaimed restaurant, it provides an authentic Parisian experience steps from Notre-Dame Cathedral."
37
+ ],
38
+ "flights": [
39
+ "Multiple daily direct flights to Paris Charles de Gaulle (CDG) from major US cities. Air France and United Airlines operate regular routes from JFK, LAX, and Chicago O'Hare. Flight times range from 7-11 hours depending on departure city.",
40
+ "From San Francisco International Airport (SFO), Air France operates a daily direct flight AF085 departing at 3:30 PM, arriving at CDG at 11:15 AM next day. United Airlines also offers UA990 with a similar schedule. Average flight time is 10 hours 45 minutes."
41
+ ],
42
+ "weather": "Paris in mid-February typically experiences cool winter conditions with average daytime temperatures ranging from 8-12°C (46-54°F). Current forecast shows mostly sunny conditions with occasional cloud cover. Morning temperatures around 6°C (43°F) rising to 12°C (54°F) by afternoon. Light breeze of 8-12 km/h expected with 20% chance of precipitation. Evening temperatures dropping to 4°C (39°F). UV index moderate at 3.",
43
+ "vector_db_results": []
44
+ }
45
+
46
+
47
+ @pytest.fixture
48
+ def sample_itinerary() -> str:
49
+ """
50
+ Loads the sample itinerary from the saved file
51
+ """
52
+ PATH_TO_ITINERARY = os.path.join(os.path.dirname(__file__), "travel_response.txt")
53
+ with open(PATH_TO_ITINERARY, 'r') as file:
54
+ return file.read()
55
+
56
+
57
+ @pytest.fixture
58
+ def expected_itinerary():
59
+ return """5-Day Paris Itinerary (February 11-15, 2025)
60
+
61
+ Accommodation: Hotel de Paris in the Opera district
62
+ - Selected for its central location, rooftop bar with Eiffel Tower views, and recent 2022 renovation
63
+
64
+ Transportation:
65
+ - Arrival via Air France flight AF085/United Airlines UA990 from SFO, landing at CDG at 11:15 AM
66
+
67
+ Weather Considerations:
68
+ - Pack warm clothing for temperatures between 4-12°C (39-54°F)
69
+ - Morning activities planned indoors due to cooler temperatures
70
+ - Outdoor activities scheduled during peak afternoon warmth
71
+
72
+ Day 1 (Feb 11):
73
+ - 11:15 AM: Arrival at CDG, transfer to Hotel de Paris
74
+ - 2:00 PM: Hotel check-in and refresh
75
+ - 3:30 PM: Visit the Eiffel Tower (taking advantage of afternoon warmth)
76
+ - 7:00 PM: Dinner at Le Jules Verne in the Eiffel Tower
77
+
78
+ Day 2 (Feb 12):
79
+ - 9:00 AM: Breakfast at hotel
80
+ - 10:00 AM: Louvre Museum visit (3-4 hours, indoor activity during cool morning)
81
+ - 2:30 PM: Late lunch in Opera district
82
+ - 4:00 PM: Rooftop bar at hotel for sunset views
83
+ - Evening: Dinner at hotel's restaurant
84
+
85
+ Day 3 (Feb 13):
86
+ - 10:00 AM: Visit Montmartre (during warming temperatures)
87
+ - 11:00 AM: Explore Sacré-Cœur Basilica
88
+ - 12:30 PM: Lunch at local cafe in Montmartre
89
+ - 2:00 PM: Artist square at Place du Tertre
90
+ - Evening: Dinner at authentic Parisian bistro
91
+
92
+ Day 4 (Feb 14):
93
+ - Morning: Arc de Triomphe visit (10-minute walk from hotel)
94
+ - Afternoon: Shopping and exploring Opera district
95
+ - Evening: Valentine's Day dinner at hotel's Michelin-starred restaurant
96
+
97
+ Day 5 (Feb 15):
98
+ - Morning: Leisurely breakfast
99
+ - Late morning: Check-out and departure
100
+
101
+ Note: Indoor alternatives planned in case of precipitation (20% chance). Schedule optimized around temperature peaks of 12°C in afternoons."""
102
+
103
+
104
+ def test_websearch_tool_answer_relevancy(judgment_client):
105
+ query = "What is the weather like in San Francisco on February 11th, 2025?"
106
+ results = search_tavily(query)
107
+
108
+ example = Example(
109
+ input=query,
110
+ actual_output=str(results)
111
+ )
112
+
113
+ scorer = AnswerRelevancyScorer(threshold=0.8)
114
+
115
+ judgment_client.assert_test(
116
+ examples=[example],
117
+ scorers=[scorer],
118
+ model="gpt-4o-mini",
119
+ project_name="travel_agent_tests",
120
+ eval_run_name="websearch_relevancy_test",
121
+ override=True
122
+ )
123
+
124
+
125
+ def test_travel_planning_faithfulness(judgment_client, sample_itinerary, research_data):
126
+
127
+ destination = "Paris, France"
128
+ start_date = "February 11th, 2025"
129
+ end_date = "February 15th, 2025"
130
+
131
+ hotels_example = Example(
132
+ input=f"Create a structured travel itinerary for a trip to {destination} from {start_date} to {end_date}.",
133
+ actual_output=sample_itinerary,
134
+ retrieval_context=research_data["hotels"]
135
+ )
136
+
137
+ flights_example = Example(
138
+ input=f"Create a structured travel itinerary for a trip to {destination} from {start_date} to {end_date}.",
139
+ actual_output=sample_itinerary,
140
+ retrieval_context=research_data["flights"]
141
+ )
142
+
143
+ judgment_client.assert_test(
144
+ examples=[hotels_example, flights_example],
145
+ scorers=[FaithfulnessScorer(threshold=1.0)],
146
+ model="gpt-4o",
147
+ project_name="travel_agent_tests",
148
+ eval_run_name="travel_planning_faithfulness_test",
149
+ override=True
150
+ )
151
+
152
+
153
+ def test_travel_planning_answer_correctness(judgment_client, sample_itinerary, expected_itinerary):
154
+
155
+ destination = "Paris, France"
156
+ start_date = "February 11th, 2025"
157
+ end_date = "February 15th, 2025"
158
+
159
+ example = Example(
160
+ input=f"Create a structured travel itinerary for a trip to {destination} from {start_date} to {end_date}.",
161
+ actual_output=sample_itinerary,
162
+ expected_output=expected_itinerary
163
+ )
164
+ with pytest.raises(AssertionError):
165
+ judgment_client.assert_test(
166
+ examples=[example],
167
+ scorers=[AnswerCorrectnessScorer(threshold=0.75)],
168
+ model="gpt-4o",
169
+ project_name="travel_agent_tests",
170
+ eval_run_name="travel_planning_correctness_test",
171
+ override=True
172
+ )
173
+
174
+
175
+ def save_travel_response(destination, start_date, end_date, research_data, file_path):
176
+ response = asyncio.run(create_travel_plan(destination, start_date, end_date, research_data))
177
+ with open(file_path, 'w') as f:
178
+ f.write(response)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ sample_research_data = {
183
+ "attractions": [
184
+ "The iconic Eiffel Tower stands at 324 meters tall and welcomes over 7 million visitors annually. Visitors can access three levels, with the top floor offering panoramic views of Paris. The tower features two restaurants: 58 Tour Eiffel and the Michelin-starred Le Jules Verne.",
185
+ "The Louvre Museum houses over 380,000 objects and displays 35,000 works of art across eight departments. Home to the Mona Lisa and Venus de Milo, it's the world's largest art museum with 72,735 square meters of exhibition space. Visitors typically need 3-4 hours to see the highlights.",
186
+ "The historic district of Montmartre sits on Paris's highest hill at 130 meters. Famous for the white Sacré-Cœur Basilica and Place du Tertre filled with artists, it was once home to renowned painters like Picasso and Van Gogh. The area retains its village-like charm with winding cobblestone streets and authentic Parisian cafes."
187
+ ],
188
+ "hotels": [
189
+ "Hotel de la Paix is a luxurious 5-star establishment in the 16th arrondissement, featuring 85 rooms and suites decorated in classic Parisian style. The hotel offers a Michelin-starred restaurant, spa facilities, and is located just 10 minutes from the Arc de Triomphe.",
190
+ "Hotel de Paris, situated in the Opera district, combines Belle Époque architecture with modern amenities. Recently renovated in 2022, it offers 107 rooms, a rooftop bar with Eiffel Tower views, and has received the Palace distinction for exceptional service.",
191
+ "Hotel de Ville, a boutique hotel in Le Marais, occupies a restored 17th-century mansion. With 40 individually designed rooms, a courtyard garden, and acclaimed restaurant, it provides an authentic Parisian experience steps from Notre-Dame Cathedral."
192
+ ],
193
+ "flights": [
194
+ "Multiple daily direct flights to Paris Charles de Gaulle (CDG) from major US cities. Air France and United Airlines operate regular routes from JFK, LAX, and Chicago O'Hare. Flight times range from 7-11 hours depending on departure city.",
195
+ "From San Francisco International Airport (SFO), Air France operates a daily direct flight AF085 departing at 3:30 PM, arriving at CDG at 11:15 AM next day. United Airlines also offers UA990 with a similar schedule. Average flight time is 10 hours 45 minutes."
196
+ ],
197
+ "weather": "Paris in mid-February typically experiences cool winter conditions with average daytime temperatures ranging from 8-12°C (46-54°F). Current forecast shows mostly sunny conditions with occasional cloud cover. Morning temperatures around 6°C (43°F) rising to 12°C (54°F) by afternoon. Light breeze of 8-12 km/h expected with 20% chance of precipitation. Evening temperatures dropping to 4°C (39°F). UV index moderate at 3.",
198
+ "vector_db_results": []
199
+ }
200
+
201
+ save_travel_response("Paris, France", "February 11th, 2025", "February 15th, 2025", sample_research_data, "./travel_response.txt")
@@ -0,0 +1,52 @@
1
+ Travel Itinerary: Paris, France (February 11th, 2025 - February 15th, 2025)
2
+
3
+ Day 1 - February 11th, 2025:
4
+
5
+ Morning:
6
+ - As per your departure location, fly out on a direct flight on Air France or United Airlines to Paris Charles de Gaulle (CDG).
7
+
8
+ Afternoon:
9
+ - Arrival at CDG at approximately 11:15 AM. Take a taxi or a private car service to Hotel de Ville in Le Marais where you'll be staying. Check-in and freshen up.
10
+
11
+ Evening:
12
+ - Explore the Le Marais district, discover its historic buildings, avant-garde fashion boutiques, and vibrant food scene.
13
+
14
+ Day 2 - February 12th, 2025:
15
+
16
+ Morning & Afternoon:
17
+ - Visit the iconic Louvre Museum. We recommend arriving early to beat the crowds and spend 3-4 hours viewing the classic works of art, especially the Mona Lisa and Venus de Milo.
18
+
19
+ Evening:
20
+ - Follow the Seine river and take a leisurely evening walk from the Louvre to Notre-Dame Cathedral, a masterpiece of French Gothic architecture.
21
+
22
+ Day 3 - February 13th, 2025:
23
+
24
+ Morning:
25
+ - Dedicate this day to explore the historical district of Montmartre. Start by visiting the Sacré-Cœur Basilica, and enjoy the magnificent views of Paris.
26
+
27
+ Afternoon:
28
+ - Visit the famous square Place du Tertre, known for its artists, and have lunch at one of the authentic Parisian cafes surrounding the square.
29
+
30
+ Evening:
31
+ - Explore the Montmartre district, with its cobblestone streets and boutiques. End the day with dinner at a local bistro.
32
+
33
+ Day 4 - February 14th, 2025:
34
+
35
+ Morning:
36
+ - Take a tour to one of the world's most famous landmarks, the Eiffel Tower. Get to the top floor to enjoy a breathtaking view of Paris.
37
+
38
+ Afternoon:
39
+ - Have lunch at the Michelin-starred Le Jules Verne, situated on the Eiffel Tower itself.
40
+
41
+ Evening:
42
+ - Head back to your hotel and freshen up. Go out to Hotel de Paris' rooftop for drinks and enjoy the Eiffel Tower views.
43
+
44
+ Day 5 - February 15th, 2025:
45
+
46
+ Morning:
47
+ - Breakfast and check-out. Visit any local attractions or shopping streets that you might be interested in if time allows.
48
+
49
+ Afternoon:
50
+ - Departure: Take a taxi or a private car to Paris Charles de Gaulle (CDG) Airport in time for your flight home.
51
+
52
+ Note: Pack for cool winter conditions in mid-February with average daytime temperatures ranging from 8-12°C. It's always recommended to check the forecast closer to your departure date.
@@ -1,12 +1,10 @@
1
1
  import openai
2
- import requests
3
2
  import os
4
3
  import asyncio
5
4
  from tavily import TavilyClient
6
5
  from dotenv import load_dotenv
7
6
  import chromadb
8
7
  from chromadb.utils import embedding_functions
9
- import json
10
8
 
11
9
  from judgeval.common.tracer import Tracer, wrap
12
10
  from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer
@@ -267,7 +267,6 @@ class JudgmentClient:
267
267
 
268
268
  return response.json()["slug"]
269
269
 
270
-
271
270
  def assert_test(
272
271
  self,
273
272
  examples: List[Example],
@@ -275,12 +274,14 @@ class JudgmentClient:
275
274
  model: Union[str, List[str], JudgevalJudge],
276
275
  aggregator: Optional[str] = None,
277
276
  metadata: Optional[Dict[str, Any]] = None,
278
- log_results: bool = False,
279
- project_name: str = "",
280
- eval_run_name: str = "",
277
+ log_results: bool = True,
278
+ project_name: str = "default_project",
279
+ eval_run_name: str = "default_eval_run",
281
280
  override: bool = False,
282
281
  ) -> None:
283
-
282
+ """
283
+ Asserts a test by running the evaluation and checking the results for success
284
+ """
284
285
  results = self.run_evaluation(
285
286
  examples=examples,
286
287
  scorers=scorers,
@@ -58,6 +58,8 @@ class JudgevalScorer:
58
58
  additional_metadata: Optional[Dict] = None
59
59
  ):
60
60
  debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
61
+ if not 0 <= threshold <= 1:
62
+ raise ValueError("Threshold must be between 0 and 1")
61
63
  if strict_mode:
62
64
  warning("Strict mode enabled - scoring will be more rigorous")
63
65
  info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
@@ -1,6 +1,7 @@
1
1
  from typing import Optional, List, Union, Tuple
2
2
  from pydantic import BaseModel
3
3
 
4
+ from judgeval.constants import APIScorer
4
5
  from judgeval.judges import JudgevalJudge
5
6
  from judgeval.judges.utils import create_judge
6
7
  from judgeval.data import Example, ExampleParams
@@ -38,13 +39,17 @@ class AnswerCorrectnessScorer(JudgevalScorer):
38
39
  strict_mode: bool = False,
39
40
  verbose_mode: bool = False
40
41
  ):
41
- self.threshold = 1 if strict_mode else threshold
42
- self.include_reason = include_reason
42
+ super().__init__(
43
+ score_type=APIScorer.ANSWER_CORRECTNESS,
44
+ threshold=1 if strict_mode else threshold,
45
+ evaluation_model=None,
46
+ include_reason=include_reason,
47
+ async_mode=async_mode,
48
+ strict_mode=strict_mode,
49
+ verbose_mode=verbose_mode
50
+ )
43
51
  self.model, self.using_native_model = create_judge(model)
44
52
  self.evaluation_model = self.model.get_model_name()
45
- self.async_mode = async_mode
46
- self.strict_mode = strict_mode
47
- self.verbose_mode = verbose_mode
48
53
 
49
54
  async def _a_get_statements(self, expected_output: str) -> List[str]:
50
55
  prompt = AnswerCorrectnessTemplate.deduce_statements(
@@ -1,5 +1,6 @@
1
1
  from typing import Optional, List, Union, Tuple
2
2
 
3
+ from judgeval.constants import APIScorer
3
4
  from judgeval.scorers.utils import (get_or_create_event_loop,
4
5
  scorer_progress_meter,
5
6
  create_verbose_logs,
@@ -34,13 +35,18 @@ class AnswerRelevancyScorer(JudgevalScorer):
34
35
  strict_mode: bool = False,
35
36
  verbose_mode: bool = False,
36
37
  ):
37
- self.threshold = 1 if strict_mode else threshold
38
+ super().__init__(
39
+ score_type=APIScorer.ANSWER_RELEVANCY,
40
+ threshold=1 if strict_mode else threshold,
41
+ evaluation_model=None,
42
+ include_reason=include_reason,
43
+ async_mode=async_mode,
44
+ strict_mode=strict_mode,
45
+ verbose_mode=verbose_mode
46
+ )
38
47
  self.model, self.using_native_model = create_judge(model)
39
48
  self.evaluation_model = self.model.get_model_name()
40
- self.include_reason = include_reason
41
- self.async_mode = async_mode
42
- self.strict_mode = strict_mode
43
- self.verbose_mode = verbose_mode
49
+ print(self.model)
44
50
 
45
51
  def score_example(
46
52
  self,
@@ -4,6 +4,7 @@ from judgeval.judges import JudgevalJudge
4
4
  from judgeval.judges.utils import create_judge
5
5
  from judgeval.data import Example, ExampleParams
6
6
  from judgeval.scorers import JudgevalScorer
7
+ from judgeval.constants import APIScorer
7
8
  from judgeval.scorers.utils import (
8
9
  get_or_create_event_loop,
9
10
  parse_response_json,
@@ -30,13 +31,17 @@ class ContextualPrecisionScorer(JudgevalScorer):
30
31
  strict_mode: bool = False,
31
32
  verbose_mode: bool = False,
32
33
  ):
33
- self.threshold = 1 if strict_mode else threshold
34
- self.include_reason = include_reason
34
+ super().__init__(
35
+ score_type=APIScorer.CONTEXTUAL_PRECISION,
36
+ threshold=1 if strict_mode else threshold,
37
+ evaluation_model=None,
38
+ include_reason=include_reason,
39
+ async_mode=async_mode,
40
+ strict_mode=strict_mode,
41
+ verbose_mode=verbose_mode
42
+ )
35
43
  self.model, self.using_native_model = create_judge(model)
36
44
  self.evaluation_model = self.model.get_model_name()
37
- self.async_mode = async_mode
38
- self.strict_mode = strict_mode
39
- self.verbose_mode = verbose_mode
40
45
 
41
46
  def score_example(
42
47
  self,
@@ -1,5 +1,6 @@
1
1
  from typing import Optional, List, Union
2
2
 
3
+ from judgeval.constants import APIScorer
3
4
  from judgeval.scorers.utils import (
4
5
  get_or_create_event_loop,
5
6
  parse_response_json,
@@ -32,14 +33,18 @@ class ContextualRecallScorer(JudgevalScorer):
32
33
  verbose_mode: bool = False,
33
34
  user: Optional[str] = None
34
35
  ):
36
+ super().__init__(
37
+ score_type=APIScorer.CONTEXTUAL_RECALL,
38
+ threshold=1 if strict_mode else threshold,
39
+ evaluation_model=None,
40
+ include_reason=include_reason,
41
+ async_mode=async_mode,
42
+ strict_mode=strict_mode,
43
+ verbose_mode=verbose_mode
44
+ )
35
45
  self.user = user
36
- self.threshold = 1 if strict_mode else threshold
37
46
  self.model, self.using_native_model = create_judge(model)
38
47
  self.evaluation_model = self.model.get_model_name()
39
- self.include_reason = include_reason
40
- self.async_mode = async_mode
41
- self.strict_mode = strict_mode
42
- self.verbose_mode = verbose_mode
43
48
 
44
49
  def score_example(
45
50
  self,
@@ -1,6 +1,7 @@
1
1
  from typing import Optional, List, Union
2
2
  import asyncio
3
3
 
4
+ from judgeval.constants import APIScorer
4
5
  from judgeval.scorers.utils import (get_or_create_event_loop,
5
6
  scorer_progress_meter,
6
7
  create_verbose_logs,
@@ -32,14 +33,18 @@ class ContextualRelevancyScorer(JudgevalScorer):
32
33
  verbose_mode: bool = False,
33
34
  user: Optional[str] = None
34
35
  ):
36
+ super().__init__(
37
+ score_type=APIScorer.CONTEXTUAL_RELEVANCY,
38
+ threshold=1 if strict_mode else threshold,
39
+ evaluation_model=None,
40
+ include_reason=include_reason,
41
+ async_mode=async_mode,
42
+ strict_mode=strict_mode,
43
+ verbose_mode=verbose_mode
44
+ )
35
45
  self.user = user
36
- self.threshold = 1 if strict_mode else threshold
37
46
  self.model, self.using_native_model = create_judge(model)
38
47
  self.evaluation_model = self.model.get_model_name()
39
- self.include_reason = include_reason
40
- self.async_mode = async_mode
41
- self.strict_mode = strict_mode
42
- self.verbose_mode = verbose_mode
43
48
 
44
49
  def score_example(
45
50
  self,
@@ -3,7 +3,7 @@ Code for the local implementation of the Faithfulness metric.
3
3
  """
4
4
  from typing import List, Optional, Union
5
5
 
6
-
6
+ from judgeval.constants import APIScorer
7
7
  from judgeval.data import (
8
8
  Example,
9
9
  ExampleParams
@@ -47,14 +47,19 @@ class FaithfulnessScorer(JudgevalScorer):
47
47
  verbose_mode: bool = False,
48
48
  user: Optional[str] = None
49
49
  ):
50
- self.threshold = 1 if strict_mode else threshold
50
+ super().__init__(
51
+ score_type=APIScorer.FAITHFULNESS,
52
+ threshold=1 if strict_mode else threshold,
53
+ evaluation_model=None,
54
+ include_reason=include_reason,
55
+ async_mode=async_mode,
56
+ strict_mode=strict_mode,
57
+ verbose_mode=verbose_mode
58
+ )
59
+ self.user = user
51
60
  self.model, self.using_native_model = create_judge(model)
52
61
  self.using_native_model = True # NOTE: SETTING THIS FOR LITELLM and TOGETHER usage
53
62
  self.evaluation_model = self.model.get_model_name()
54
- self.include_reason = include_reason
55
- self.async_mode = async_mode
56
- self.strict_mode = strict_mode
57
- self.verbose_mode = verbose_mode
58
63
 
59
64
  def score_example(
60
65
  self,
@@ -20,6 +20,7 @@ Hallucination is measuring the fraction of contexts that agree with output (do n
20
20
 
21
21
  from typing import Optional, Union, List
22
22
 
23
+ from judgeval.constants import APIScorer
23
24
  from judgeval.scorers.utils import (get_or_create_event_loop,
24
25
  scorer_progress_meter,
25
26
  create_verbose_logs,
@@ -50,13 +51,17 @@ class HallucinationScorer(JudgevalScorer):
50
51
  strict_mode: bool = False,
51
52
  verbose_mode: bool = False,
52
53
  ):
53
- self.threshold = 1 if strict_mode else threshold
54
+ super().__init__(
55
+ score_type=APIScorer.HALLUCINATION,
56
+ threshold=1 if strict_mode else threshold,
57
+ evaluation_model=None,
58
+ include_reason=include_reason,
59
+ async_mode=async_mode,
60
+ strict_mode=strict_mode,
61
+ verbose_mode=verbose_mode
62
+ )
54
63
  self.model, self.using_native_model = create_judge(model)
55
64
  self.evaluation_model = self.model.get_model_name()
56
- self.include_reason = include_reason
57
- self.async_mode = async_mode
58
- self.strict_mode = strict_mode
59
- self.verbose_mode = verbose_mode
60
65
 
61
66
  def score_example(
62
67
  self,
@@ -0,0 +1,5 @@
1
+ from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
2
+
3
+ __all__ = [
4
+ "JsonCorrectnessScorer",
5
+ ]