judgeval 0.0.2__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {judgeval-0.0.2 → judgeval-0.0.3}/PKG-INFO +11 -12
  2. {judgeval-0.0.2 → judgeval-0.0.3}/pyproject.toml +16 -13
  3. judgeval-0.0.2/e2etests/judgment_client_test.py +0 -354
  4. judgeval-0.0.2/e2etests/playground.py +0 -629
  5. judgeval-0.0.2/e2etests/test_prompt_scoring.py +0 -114
  6. judgeval-0.0.2/e2etests/test_tracer.py +0 -143
  7. judgeval-0.0.2/tests/common/test_exceptions.py +0 -33
  8. judgeval-0.0.2/tests/common/test_logger.py +0 -154
  9. judgeval-0.0.2/tests/common/test_tracer.py +0 -284
  10. judgeval-0.0.2/tests/common/test_utils.py +0 -485
  11. judgeval-0.0.2/tests/data/datasets/sample_data/dataset.csv +0 -3
  12. judgeval-0.0.2/tests/data/datasets/sample_data/dataset.json +0 -55
  13. judgeval-0.0.2/tests/data/datasets/test_dataset.py +0 -260
  14. judgeval-0.0.2/tests/data/datasets/test_dataset_utils.py +0 -110
  15. judgeval-0.0.2/tests/data/datasets/test_ground_truth.py +0 -130
  16. judgeval-0.0.2/tests/data/test_api_example.py +0 -153
  17. judgeval-0.0.2/tests/data/test_example.py +0 -133
  18. judgeval-0.0.2/tests/data/test_result.py +0 -121
  19. judgeval-0.0.2/tests/data/test_scorer_data.py +0 -294
  20. judgeval-0.0.2/tests/judges/test_judge_utils.py +0 -62
  21. judgeval-0.0.2/tests/judges/test_litellm_judge.py +0 -218
  22. judgeval-0.0.2/tests/judges/test_mixture_of_judges.py +0 -417
  23. judgeval-0.0.2/tests/judges/test_together_judge.py +0 -187
  24. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_answer_relevancy.py +0 -26
  25. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_contextual_precision.py +0 -26
  26. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_contextual_recall.py +0 -26
  27. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_contextual_relevancy.py +0 -26
  28. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_faithfulness.py +0 -27
  29. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_hallucination.py +0 -26
  30. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_json_correctness.py +0 -37
  31. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_summarization.py +0 -27
  32. judgeval-0.0.2/tests/scorers/judgeval_scorers/test_tool_correctness.py +0 -26
  33. judgeval-0.0.2/tests/scorers/test_base_scorer.py +0 -65
  34. judgeval-0.0.2/tests/scorers/test_custom_scorer.py +0 -152
  35. judgeval-0.0.2/tests/scorers/test_prompt_scorer.py +0 -167
  36. judgeval-0.0.2/tests/scorers/test_score.py +0 -974
  37. judgeval-0.0.2/tests/scorers/test_scorer_utils.py +0 -175
  38. {judgeval-0.0.2 → judgeval-0.0.3}/.github/workflows/ci.yaml +0 -0
  39. {judgeval-0.0.2 → judgeval-0.0.3}/.gitignore +0 -0
  40. {judgeval-0.0.2 → judgeval-0.0.3}/LICENSE.md +0 -0
  41. {judgeval-0.0.2 → judgeval-0.0.3}/Pipfile +0 -0
  42. {judgeval-0.0.2 → judgeval-0.0.3}/README.md +0 -0
  43. {judgeval-0.0.2 → judgeval-0.0.3}/docs/README.md +0 -0
  44. {judgeval-0.0.2 → judgeval-0.0.3}/docs/development.mdx +0 -0
  45. {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/code.mdx +0 -0
  46. {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/images.mdx +0 -0
  47. {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/markdown.mdx +0 -0
  48. {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/navigation.mdx +0 -0
  49. {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/reusable-snippets.mdx +0 -0
  50. {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/settings.mdx +0 -0
  51. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/data_datasets.mdx +0 -0
  52. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/data_examples.mdx +0 -0
  53. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/introduction.mdx +0 -0
  54. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/judges.mdx +0 -0
  55. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  56. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  57. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  58. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  59. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  60. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  61. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  62. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/hallucination.mdx +0 -0
  63. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/introduction.mdx +0 -0
  64. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  65. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/summarization.mdx +0 -0
  66. {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
  67. {judgeval-0.0.2 → judgeval-0.0.3}/docs/favicon.svg +0 -0
  68. {judgeval-0.0.2 → judgeval-0.0.3}/docs/getting_started.mdx +0 -0
  69. {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/checks-passed.png +0 -0
  70. {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/create_aggressive_scorer.png +0 -0
  71. {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/create_scorer.png +0 -0
  72. {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/evaluation_diagram.png +0 -0
  73. {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/hero-dark.svg +0 -0
  74. {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/hero-light.svg +0 -0
  75. {judgeval-0.0.2 → judgeval-0.0.3}/docs/introduction.mdx +0 -0
  76. {judgeval-0.0.2 → judgeval-0.0.3}/docs/judgment/introduction.mdx +0 -0
  77. {judgeval-0.0.2 → judgeval-0.0.3}/docs/logo/dark.svg +0 -0
  78. {judgeval-0.0.2 → judgeval-0.0.3}/docs/logo/light.svg +0 -0
  79. {judgeval-0.0.2 → judgeval-0.0.3}/docs/mint.json +0 -0
  80. {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/create_dataset.ipynb +0 -0
  81. {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/create_scorer.ipynb +0 -0
  82. {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/demo.ipynb +0 -0
  83. {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/prompt_scorer.ipynb +0 -0
  84. {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/quickstart.ipynb +0 -0
  85. {judgeval-0.0.2 → judgeval-0.0.3}/docs/quickstart.mdx +0 -0
  86. {judgeval-0.0.2 → judgeval-0.0.3}/docs/snippets/snippet-intro.mdx +0 -0
  87. {judgeval-0.0.2 → judgeval-0.0.3}/pytest.ini +0 -0
  88. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/__init__.py +0 -0
  89. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/clients.py +0 -0
  90. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/__init__.py +0 -0
  91. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/exceptions.py +0 -0
  92. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/logger.py +0 -0
  93. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/tracer.py +0 -0
  94. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/utils.py +0 -0
  95. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/constants.py +0 -0
  96. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/__init__.py +0 -0
  97. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/api_example.py +0 -0
  98. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/__init__.py +0 -0
  99. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/dataset.py +0 -0
  100. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/ground_truth.py +0 -0
  101. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/utils.py +0 -0
  102. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/example.py +0 -0
  103. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/result.py +0 -0
  104. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/scorer_data.py +0 -0
  105. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/evaluation_run.py +0 -0
  106. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/__init__.py +0 -0
  107. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/base_judge.py +0 -0
  108. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/litellm_judge.py +0 -0
  109. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/mixture_of_judges.py +0 -0
  110. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/together_judge.py +0 -0
  111. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/utils.py +0 -0
  112. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judgment_client.py +0 -0
  113. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/run_evaluation.py +0 -0
  114. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/__init__.py +0 -0
  115. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/base_scorer.py +0 -0
  116. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/custom_scorer.py +0 -0
  117. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  118. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/answer_relevancy.py +0 -0
  119. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/contextual_precision.py +0 -0
  120. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/contextual_recall.py +0 -0
  121. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/contextual_relevancy.py +0 -0
  122. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/faithfulness.py +0 -0
  123. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/hallucination.py +0 -0
  124. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/json_correctness.py +0 -0
  125. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/summarization.py +0 -0
  126. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/tool_correctness.py +0 -0
  127. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/prompt_scorer.py +0 -0
  128. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/score.py +0 -0
  129. {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/utils.py +0 -0
{judgeval-0.0.2 → judgeval-0.0.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.2
+Version: 0.0.3
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,19 +10,18 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: anthropic>=0.43.1
-Requires-Dist: deepeval>=2.1.8
-Requires-Dist: fastapi>=0.115.6
+Requires-Dist: anthropic
+Requires-Dist: fastapi
 Requires-Dist: langfuse==2.50.3
-Requires-Dist: litellm>=1.48.10
-Requires-Dist: openai>=1.47.1
-Requires-Dist: pandas>=2.2.3
-Requires-Dist: patronus>=0.0.17
+Requires-Dist: litellm
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: patronus
 Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: requests>=2.32.3
-Requires-Dist: supabase>=2.11.0
-Requires-Dist: together>=1.3.11
-Requires-Dist: uvicorn>=0.34.0
+Requires-Dist: requests
+Requires-Dist: supabase
+Requires-Dist: together
+Requires-Dist: uvicorn
 Provides-Extra: dev
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
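
The relaxed pins above land verbatim in the Requires-Dist metadata of the installed distribution. A minimal sketch for inspecting them locally, assuming judgeval 0.0.3 is installed in the current environment (the script name is illustrative):

# check_requires.py -- print the Requires-Dist metadata of an installed judgeval
from importlib.metadata import PackageNotFoundError, requires, version

try:
    print("judgeval", version("judgeval"))
    for req in requires("judgeval") or []:
        # After 0.0.3, most entries should be bare names such as "anthropic"
        # rather than pinned specifiers such as "anthropic>=0.43.1".
        print(" ", req)
except PackageNotFoundError:
    print("judgeval is not installed in this environment")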
{judgeval-0.0.2 → judgeval-0.0.3}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.2"
+version = "0.0.3"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -17,18 +17,17 @@ license = "Apache-2.0"
 license-files = ["LICENSE.md"]
 dependencies = [
     "langfuse==2.50.3",
-    "litellm>=1.48.10",
+    "litellm",
     "python-dotenv==1.0.1",
-    "fastapi>=0.115.6",
-    "uvicorn>=0.34.0",
-    "deepeval>=2.1.8",
-    "supabase>=2.11.0",
-    "requests>=2.32.3",
-    "pandas>=2.2.3",
-    "openai>=1.47.1",
-    "together>=1.3.11",
-    "anthropic>=0.43.1",
-    "patronus>=0.0.17"
+    "fastapi",
+    "uvicorn",
+    "supabase",
+    "requests",
+    "pandas",
+    "openai",
+    "together",
+    "anthropic",
+    "patronus",
 ]

 [project.optional-dependencies]
@@ -57,4 +56,8 @@ include = [
 directory = "dist"
 artifacts = [
     "src/judgeval/**/*.py",
-]
+]
+exclude = [
+    "src/e2etests/*",
+    "src/tests/*",
+]
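
The new exclude list is there to keep the test suites out of the published artifacts. A quick way to check a locally built sdist is sketched below; the archive path assumes a standard local build output and is only illustrative:

# verify_sdist.py -- confirm no test files ship in the built source distribution
import tarfile

SDIST = "dist/judgeval-0.0.3.tar.gz"  # assumed path of a locally built sdist

with tarfile.open(SDIST, "r:gz") as archive:
    leaked = [name for name in archive.getnames()
              if "/e2etests/" in name or "/tests/" in name]

if leaked:
    print("unexpected test files in sdist:")
    for name in leaked:
        print(" ", name)
else:
    print("sdist contains no e2etests/ or tests/ files")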
judgeval-0.0.2/e2etests/judgment_client_test.py (removed in 0.0.3)
@@ -1,354 +0,0 @@
-"""
-Sanity checks for judgment client functionality
-"""
-
-import os
-from pydantic import BaseModel
-
-from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example
-from judgeval.scorers import (
-    FaithfulnessScorer,
-    HallucinationScorer,
-    JSONCorrectnessScorer
-)
-from judgeval.judges import TogetherJudge, judgevalJudge
-from judgeval.e2etests.playground import CustomFaithfulnessMetric
-from judgeval.data.datasets.dataset import EvalDataset
-from dotenv import load_dotenv
-import random
-import string
-
-from judgeval.scorers.prompt_scorer import ClassifierScorer
-
-load_dotenv()
-
-def get_client():
-    return JudgmentClient(judgment_api_key=os.getenv("JUDGMENT_API_KEY"))
-
-
-def get_ui_client():
-    return JudgmentClient(judgment_api_key=os.getenv("UI_JUDGMENT_API_KEY"))
-
-
-def test_dataset(client: JudgmentClient):
-    dataset: EvalDataset = client.create_dataset()
-    dataset.add_example(Example(input="input 1", actual_output="output 1"))
-
-    client.push_dataset(alias="test_dataset_5", dataset=dataset, overwrite=False)
-
-    # PULL
-    dataset = client.pull_dataset(alias="test_dataset_5")
-    print(dataset)
-
-
-def test_run_eval(client: JudgmentClient):
-    # Single step in our workflow, an outreach Sales Agent
-
-    example1 = Example(
-        input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
-        actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
-        retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"],
-    )
-
-    example2 = Example(
-        input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.",
-        actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex",
-        expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans",
-        context=["Business Development"],
-        retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"],
-    )
-
-    scorer = FaithfulnessScorer(threshold=0.5)
-    scorer2 = HallucinationScorer(threshold=0.5)
-    c_scorer = CustomFaithfulnessMetric(threshold=0.6)
-
-    PROJECT_NAME = "OutreachWorkflow"
-    EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt"
-
-    client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[scorer, scorer2],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=True,
-        override=True,
-    )
-
-    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
-    print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
-
-
-def test_json_scorer(client: JudgmentClient):
-
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output='{"tool": "authentication"}',
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
-    )
-
-    example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
-    )
-
-    class SampleSchema(BaseModel):
-        tool: str
-
-    scorer = JSONCorrectnessScorer(threshold=0.5, json_schema=SampleSchema)
-    PROJECT_NAME = "test_project_JOSEPH"
-    EVAL_RUN_NAME = "yomadude"
-
-    res = client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[scorer],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=True,
-        override=True,
-    )
-
-    print(res)
-
-
-def test_override_eval(client: JudgmentClient):
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
-    )
-
-    scorer = FaithfulnessScorer(threshold=0.5)
-
-    PROJECT_NAME = "test_eval_run_naming_collisions"
-    EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
-
-    # First run should succeed
-    client.run_evaluation(
-        examples=[example1],
-        scorers=[scorer],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=True,
-        override=False,
-    )
-
-    # Second run with log_results=False should succeed
-    client.run_evaluation(
-        examples=[example1],
-        scorers=[scorer],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=False,
-        override=False,
-    )
-
-    # Third run with override=True should succeed
-    try:
-        client.run_evaluation(
-            examples=[example1],
-            scorers=[scorer],
-            model="QWEN",
-            metadata={"batch": "test"},
-            project_name=PROJECT_NAME,
-            eval_run_name=EVAL_RUN_NAME,
-            log_results=True,
-            override=True,
-        )
-    except ValueError as e:
-        print(f"Unexpected error in override run: {e}")
-        raise
-
-    # Final non-override run should fail
-    try:
-        client.run_evaluation(
-            examples=[example1],
-            scorers=[scorer],
-            model="QWEN",
-            metadata={"batch": "test"},
-            project_name=PROJECT_NAME,
-            eval_run_name=EVAL_RUN_NAME,
-            log_results=True,
-            override=False,
-        )
-        raise AssertionError("Expected ValueError was not raised")
-    except ValueError as e:
-        if "already exists" not in str(e):
-            raise
-        print(f"Successfully caught expected error: {e}")
-
-
-def test_evaluate_dataset(client: JudgmentClient):
-
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
-    )
-
-    example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
-    )
-
-    dataset = EvalDataset(examples=[example1, example2])
-    res = client.evaluate_dataset(
-        dataset=dataset,
-        scorers=[FaithfulnessScorer(threshold=0.5)],
-        model="QWEN",
-        metadata={"batch": "test"},
-    )
-
-    print(res)
-
-
-def test_classifier_scorer(client: JudgmentClient):
-    # Modifying a classifier scorer
-    # Make some methods private
-    classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
-    faithfulness_scorer = FaithfulnessScorer(threshold=0.5)
-
-    # Creating a classifier scorer from SDK
-    classifier_scorer_custom = ClassifierScorer(
-        name="Test Classifier Scorer",
-        threshold=0.5,
-        conversation=[],
-        options={}
-    )
-
-    classifier_scorer_custom.update_conversation(conversation=[{"role": "user", "content": "What is the capital of France?"}])
-    classifier_scorer_custom.update_options(options={"yes": 1, "no": 0})
-
-    slug = client.push_classifier_scorer(scorer=classifier_scorer_custom)
-
-    classifier_scorer_custom = client.fetch_classifier_scorer(slug=slug)
-    print(f"{classifier_scorer_custom=}")
-
-    res = client.run_evaluation(
-        examples=[example1],
-        scorers=[classifier_scorer, faithfulness_scorer],
-        model="QWEN",
-        log_results=True,
-        eval_run_name="ToneScorerTest",
-        project_name="ToneScorerTest",
-    )
-
-
-def test_custom_judge_vertexai(client: JudgmentClient):
-
-    import vertexai
-    from vertexai.generative_models import GenerativeModel
-
-    PROJECT_ID = "judgment-labs"
-    vertexai.init(project=PROJECT_ID, location="us-west1")
-
-    class VertexAIJudge(judgevalJudge):
-
-        def __init__(self, model_name: str = "gemini-1.5-flash-002"):
-            self.model_name = model_name
-            self.model = GenerativeModel(self.model_name)
-
-        def load_model(self):
-            return self.model
-
-        def generate(self, prompt) -> str:
-            # prompt is a List[dict] (conversation history)
-            # For models that don't support conversation history, we need to convert to string
-            # If you're using a model that supports chat history, you can just pass the prompt directly
-            response = self.model.generate_content(str(prompt))
-            return response.text
-
-        async def a_generate(self, prompt) -> str:
-            # prompt is a List[dict] (conversation history)
-            # For models that don't support conversation history, we need to convert to string
-            # If you're using a model that supports chat history, you can just pass the prompt directly
-            response = await self.model.generate_content_async(str(prompt))
-            return response.text
-
-        def get_model_name(self) -> str:
-            return self.model_name
-
-    example = Example(
-        input="What is the largest animal in the world?",
-        actual_output="The blue whale is the largest known animal.",
-        retrieval_context=["The blue whale is the largest known animal."],
-    )
-
-    judge = VertexAIJudge()
-
-    res = client.run_evaluation(
-        examples=[example],
-        scorers=[CustomFaithfulnessMetric()],
-        model=judge,
-    )
-    print(res)
-
-
-if __name__ == "__main__":
-    # Test client functionality
-    client = get_client()
-    ui_client = get_ui_client()
-    print("Client initialized successfully")
-    print("*" * 40)
-
-    print("Testing dataset creation, pushing, and pulling")
-    test_dataset(ui_client)
-    print("Dataset creation, pushing, and pulling successful")
-    print("*" * 40)
-
-    print("Testing evaluation run")
-    test_run_eval(ui_client)
-    print("Evaluation run successful")
-    print("*" * 40)
-
-    print("Testing JSON scorer")
-    test_json_scorer(ui_client)
-    print("JSON scorer test successful")
-    print("*" * 40)
-
-    print("Testing evaluation run override")
-    test_override_eval(client)
-    print("Evaluation run override successful")
-    print("*" * 40)
-
-    print("Testing dataset evaluation")
-    test_evaluate_dataset(ui_client)
-    print("Dataset evaluation successful")
-    print("*" * 40)
-
-    print("Testing classifier scorer")
-    test_classifier_scorer(ui_client)
-    print("Classifier scorer test successful")
-    print("*" * 40)
-
-    print("Testing custom judge")
-    test_custom_judge_vertexai(ui_client)
-    print("Custom judge test successful")
-    print("*" * 40)
-
-    print("All tests passed successfully")