judgeval 0.0.2__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.2 → judgeval-0.0.3}/PKG-INFO +11 -12
- {judgeval-0.0.2 → judgeval-0.0.3}/pyproject.toml +16 -13
- judgeval-0.0.2/e2etests/judgment_client_test.py +0 -354
- judgeval-0.0.2/e2etests/playground.py +0 -629
- judgeval-0.0.2/e2etests/test_prompt_scoring.py +0 -114
- judgeval-0.0.2/e2etests/test_tracer.py +0 -143
- judgeval-0.0.2/tests/common/test_exceptions.py +0 -33
- judgeval-0.0.2/tests/common/test_logger.py +0 -154
- judgeval-0.0.2/tests/common/test_tracer.py +0 -284
- judgeval-0.0.2/tests/common/test_utils.py +0 -485
- judgeval-0.0.2/tests/data/datasets/sample_data/dataset.csv +0 -3
- judgeval-0.0.2/tests/data/datasets/sample_data/dataset.json +0 -55
- judgeval-0.0.2/tests/data/datasets/test_dataset.py +0 -260
- judgeval-0.0.2/tests/data/datasets/test_dataset_utils.py +0 -110
- judgeval-0.0.2/tests/data/datasets/test_ground_truth.py +0 -130
- judgeval-0.0.2/tests/data/test_api_example.py +0 -153
- judgeval-0.0.2/tests/data/test_example.py +0 -133
- judgeval-0.0.2/tests/data/test_result.py +0 -121
- judgeval-0.0.2/tests/data/test_scorer_data.py +0 -294
- judgeval-0.0.2/tests/judges/test_judge_utils.py +0 -62
- judgeval-0.0.2/tests/judges/test_litellm_judge.py +0 -218
- judgeval-0.0.2/tests/judges/test_mixture_of_judges.py +0 -417
- judgeval-0.0.2/tests/judges/test_together_judge.py +0 -187
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_answer_relevancy.py +0 -26
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_contextual_precision.py +0 -26
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_contextual_recall.py +0 -26
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_contextual_relevancy.py +0 -26
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_faithfulness.py +0 -27
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_hallucination.py +0 -26
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_json_correctness.py +0 -37
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_summarization.py +0 -27
- judgeval-0.0.2/tests/scorers/judgeval_scorers/test_tool_correctness.py +0 -26
- judgeval-0.0.2/tests/scorers/test_base_scorer.py +0 -65
- judgeval-0.0.2/tests/scorers/test_custom_scorer.py +0 -152
- judgeval-0.0.2/tests/scorers/test_prompt_scorer.py +0 -167
- judgeval-0.0.2/tests/scorers/test_score.py +0 -974
- judgeval-0.0.2/tests/scorers/test_scorer_utils.py +0 -175
- {judgeval-0.0.2 → judgeval-0.0.3}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/.gitignore +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/LICENSE.md +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/Pipfile +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/README.md +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/README.md +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/development.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/favicon.svg +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/introduction.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/logo/light.svg +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/mint.json +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/pytest.ini +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/tracer.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/constants.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/ground_truth.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/judgment_client.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/run_evaluation.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/custom_scorer.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/faithfulness.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/hallucination.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/json_correctness.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/summarization.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/judgeval_scorers/tool_correctness.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.2 → judgeval-0.0.3}/src/judgeval/scorers/utils.py +0 -0
{judgeval-0.0.2 → judgeval-0.0.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.2
+Version: 0.0.3
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,19 +10,18 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: anthropic
-Requires-Dist:
-Requires-Dist: fastapi>=0.115.6
+Requires-Dist: anthropic
+Requires-Dist: fastapi
 Requires-Dist: langfuse==2.50.3
-Requires-Dist: litellm
-Requires-Dist: openai
-Requires-Dist: pandas
-Requires-Dist: patronus
+Requires-Dist: litellm
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: patronus
 Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: requests
-Requires-Dist: supabase
-Requires-Dist: together
-Requires-Dist: uvicorn
+Requires-Dist: requests
+Requires-Dist: supabase
+Requires-Dist: together
+Requires-Dist: uvicorn
 Provides-Extra: dev
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
{judgeval-0.0.2 → judgeval-0.0.3}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.2"
+version = "0.0.3"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -17,18 +17,17 @@ license = "Apache-2.0"
 license-files = ["LICENSE.md"]
 dependencies = [
     "langfuse==2.50.3",
-    "litellm
+    "litellm",
     "python-dotenv==1.0.1",
-    "fastapi
-    "uvicorn
-    "
-    "
-    "
-    "
-    "
-    "
-    "patronus>=0.0.17"
+    "fastapi",
+    "uvicorn",
+    "supabase",
+    "requests",
+    "pandas",
+    "openai",
+    "together",
+    "anthropic",
+    "patronus",
 ]
 
 [project.optional-dependencies]
@@ -57,4 +56,8 @@ include = [
 directory = "dist"
 artifacts = [
     "src/judgeval/**/*.py",
-]
+]
+exclude = [
+    "src/e2etests/*",
+    "src/tests/*",
+]
judgeval-0.0.2/e2etests/judgment_client_test.py

@@ -1,354 +0,0 @@
-"""
-Sanity checks for judgment client functionality
-"""
-
-import os
-from pydantic import BaseModel
-
-from judgeval.judgment_client import JudgmentClient
-from judgeval.data import Example
-from judgeval.scorers import (
-    FaithfulnessScorer,
-    HallucinationScorer,
-    JSONCorrectnessScorer
-)
-from judgeval.judges import TogetherJudge, judgevalJudge
-from judgeval.e2etests.playground import CustomFaithfulnessMetric
-from judgeval.data.datasets.dataset import EvalDataset
-from dotenv import load_dotenv
-import random
-import string
-
-from judgeval.scorers.prompt_scorer import ClassifierScorer
-
-load_dotenv()
-
-def get_client():
-    return JudgmentClient(judgment_api_key=os.getenv("JUDGMENT_API_KEY"))
-
-
-def get_ui_client():
-    return JudgmentClient(judgment_api_key=os.getenv("UI_JUDGMENT_API_KEY"))
-
-
-def test_dataset(client: JudgmentClient):
-    dataset: EvalDataset = client.create_dataset()
-    dataset.add_example(Example(input="input 1", actual_output="output 1"))
-
-    client.push_dataset(alias="test_dataset_5", dataset=dataset, overwrite=False)
-
-    # PULL
-    dataset = client.pull_dataset(alias="test_dataset_5")
-    print(dataset)
-
-
-def test_run_eval(client: JudgmentClient):
-    # Single step in our workflow, an outreach Sales Agent
-
-    example1 = Example(
-        input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
-        actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
-        retrieval_context=["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"],
-    )
-
-    example2 = Example(
-        input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.",
-        actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex",
-        expected_output="A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans",
-        context=["Business Development"],
-        retrieval_context=["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"],
-    )
-
-    scorer = FaithfulnessScorer(threshold=0.5)
-    scorer2 = HallucinationScorer(threshold=0.5)
-    c_scorer = CustomFaithfulnessMetric(threshold=0.6)
-
-    PROJECT_NAME = "OutreachWorkflow"
-    EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt"
-
-    client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[scorer, scorer2],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=True,
-        override=True,
-    )
-
-    results = client.pull_eval(project_name=PROJECT_NAME, eval_run_name=EVAL_RUN_NAME)
-    print(f"Evaluation results for {EVAL_RUN_NAME} from database:", results)
-
-
-def test_json_scorer(client: JudgmentClient):
-
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output='{"tool": "authentication"}',
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
-    )
-
-    example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
-    )
-
-    class SampleSchema(BaseModel):
-        tool: str
-
-    scorer = JSONCorrectnessScorer(threshold=0.5, json_schema=SampleSchema)
-    PROJECT_NAME = "test_project_JOSEPH"
-    EVAL_RUN_NAME = "yomadude"
-
-    res = client.run_evaluation(
-        examples=[example1, example2],
-        scorers=[scorer],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=True,
-        override=True,
-    )
-
-    print(res)
-
-
-def test_override_eval(client: JudgmentClient):
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
-    )
-
-    scorer = FaithfulnessScorer(threshold=0.5)
-
-    PROJECT_NAME = "test_eval_run_naming_collisions"
-    EVAL_RUN_NAME = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
-
-    # First run should succeed
-    client.run_evaluation(
-        examples=[example1],
-        scorers=[scorer],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=True,
-        override=False,
-    )
-
-    # Second run with log_results=False should succeed
-    client.run_evaluation(
-        examples=[example1],
-        scorers=[scorer],
-        model="QWEN",
-        metadata={"batch": "test"},
-        project_name=PROJECT_NAME,
-        eval_run_name=EVAL_RUN_NAME,
-        log_results=False,
-        override=False,
-    )
-
-    # Third run with override=True should succeed
-    try:
-        client.run_evaluation(
-            examples=[example1],
-            scorers=[scorer],
-            model="QWEN",
-            metadata={"batch": "test"},
-            project_name=PROJECT_NAME,
-            eval_run_name=EVAL_RUN_NAME,
-            log_results=True,
-            override=True,
-        )
-    except ValueError as e:
-        print(f"Unexpected error in override run: {e}")
-        raise
-
-    # Final non-override run should fail
-    try:
-        client.run_evaluation(
-            examples=[example1],
-            scorers=[scorer],
-            model="QWEN",
-            metadata={"batch": "test"},
-            project_name=PROJECT_NAME,
-            eval_run_name=EVAL_RUN_NAME,
-            log_results=True,
-            override=False,
-        )
-        raise AssertionError("Expected ValueError was not raised")
-    except ValueError as e:
-        if "already exists" not in str(e):
-            raise
-        print(f"Successfully caught expected error: {e}")
-
-
-def test_evaluate_dataset(client: JudgmentClient):
-
-    example1 = Example(
-        input="What if these shoes don't fit?",
-        actual_output="We offer a 30-day full refund at no extra cost.",
-        retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
-        trace_id="2231abe3-e7e0-4909-8ab7-b4ab60b645c6"
-    )
-
-    example2 = Example(
-        input="How do I reset my password?",
-        actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.",
-        name="Password Reset",
-        context=["User Account"],
-        retrieval_context=["Password reset instructions"],
-        tools_called=["authentication"],
-        expected_tools=["authentication"],
-        additional_metadata={"difficulty": "medium"}
-    )
-
-    dataset = EvalDataset(examples=[example1, example2])
-    res = client.evaluate_dataset(
-        dataset=dataset,
-        scorers=[FaithfulnessScorer(threshold=0.5)],
-        model="QWEN",
-        metadata={"batch": "test"},
-    )
-
-    print(res)
-
-
-def test_classifier_scorer(client: JudgmentClient):
-    # Modifying a classifier scorer
-    # Make some methods private
-    classifier_scorer = client.fetch_classifier_scorer("tonescorer-72gl")
-    faithfulness_scorer = FaithfulnessScorer(threshold=0.5)
-
-    # Creating a classifier scorer from SDK
-    classifier_scorer_custom = ClassifierScorer(
-        name="Test Classifier Scorer",
-        threshold=0.5,
-        conversation=[],
-        options={}
-    )
-
-    classifier_scorer_custom.update_conversation(conversation=[{"role": "user", "content": "What is the capital of France?"}])
-    classifier_scorer_custom.update_options(options={"yes": 1, "no": 0})
-
-    slug = client.push_classifier_scorer(scorer=classifier_scorer_custom)
-
-    classifier_scorer_custom = client.fetch_classifier_scorer(slug=slug)
-    print(f"{classifier_scorer_custom=}")
-
-    res = client.run_evaluation(
-        examples=[example1],
-        scorers=[classifier_scorer, faithfulness_scorer],
-        model="QWEN",
-        log_results=True,
-        eval_run_name="ToneScorerTest",
-        project_name="ToneScorerTest",
-    )
-
-
-def test_custom_judge_vertexai(client: JudgmentClient):
-
-    import vertexai
-    from vertexai.generative_models import GenerativeModel
-
-    PROJECT_ID = "judgment-labs"
-    vertexai.init(project=PROJECT_ID, location="us-west1")
-
-    class VertexAIJudge(judgevalJudge):
-
-        def __init__(self, model_name: str = "gemini-1.5-flash-002"):
-            self.model_name = model_name
-            self.model = GenerativeModel(self.model_name)
-
-        def load_model(self):
-            return self.model
-
-        def generate(self, prompt) -> str:
-            # prompt is a List[dict] (conversation history)
-            # For models that don't support conversation history, we need to convert to string
-            # If you're using a model that supports chat history, you can just pass the prompt directly
-            response = self.model.generate_content(str(prompt))
-            return response.text
-
-        async def a_generate(self, prompt) -> str:
-            # prompt is a List[dict] (conversation history)
-            # For models that don't support conversation history, we need to convert to string
-            # If you're using a model that supports chat history, you can just pass the prompt directly
-            response = await self.model.generate_content_async(str(prompt))
-            return response.text
-
-        def get_model_name(self) -> str:
-            return self.model_name
-
-    example = Example(
-        input="What is the largest animal in the world?",
-        actual_output="The blue whale is the largest known animal.",
-        retrieval_context=["The blue whale is the largest known animal."],
-    )
-
-    judge = VertexAIJudge()
-
-    res = client.run_evaluation(
-        examples=[example],
-        scorers=[CustomFaithfulnessMetric()],
-        model=judge,
-    )
-    print(res)
-
-
-if __name__ == "__main__":
-    # Test client functionality
-    client = get_client()
-    ui_client = get_ui_client()
-    print("Client initialized successfully")
-    print("*" * 40)
-
-    print("Testing dataset creation, pushing, and pulling")
-    test_dataset(ui_client)
-    print("Dataset creation, pushing, and pulling successful")
-    print("*" * 40)
-
-    print("Testing evaluation run")
-    test_run_eval(ui_client)
-    print("Evaluation run successful")
-    print("*" * 40)
-
-    print("Testing JSON scorer")
-    test_json_scorer(ui_client)
-    print("JSON scorer test successful")
-    print("*" * 40)
-
-    print("Testing evaluation run override")
-    test_override_eval(client)
-    print("Evaluation run override successful")
-    print("*" * 40)
-
-    print("Testing dataset evaluation")
-    test_evaluate_dataset(ui_client)
-    print("Dataset evaluation successful")
-    print("*" * 40)
-
-    print("Testing classifier scorer")
-    test_classifier_scorer(ui_client)
-    print("Classifier scorer test successful")
-    print("*" * 40)
-
-    print("Testing custom judge")
-    test_custom_judge_vertexai(ui_client)
-    print("Custom judge test successful")
-    print("*" * 40)
-
-    print("All tests passed successfully")