judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/prompt/__init__.py
ADDED
@@ -0,0 +1,330 @@
+from typing import List, Optional, Dict
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
+from judgeval.api.api_types import (
+    PromptCommitInfo,
+    PromptTagResponse,
+    PromptUntagResponse,
+    PromptVersionsResponse,
+)
+from dataclasses import dataclass, field
+import re
+from string import Template
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+from judgeval.utils.project import _resolve_project_id
+
+
+def push_prompt(
+    project_name: str,
+    name: str,
+    prompt: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> tuple[str, Optional[str], str]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        r = client.prompts_insert(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "prompt": prompt,
+                "tags": tags,
+            }
+        )
+        return r["commit_id"], r.get("parent_commit_id"), r["created_at"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt: {e.detail}",
+            response=e.response,
+        )
+
+
+def fetch_prompt(
+    project_name: str,
+    name: str,
+    commit_id: Optional[str] = None,
+    tag: Optional[str] = None,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> Optional[PromptCommitInfo]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_fetch(
+            name=name,
+            project_id=project_id,
+            commit_id=commit_id,
+            tag=tag,
+        )
+        return prompt_config["commit"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def tag_prompt(
+    project_name: str,
+    name: str,
+    commit_id: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptTagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_tag(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "commit_id": commit_id,
+                "tags": tags,
+            }
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to tag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def untag_prompt(
+    project_name: str,
+    name: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptUntagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_untag(
+            payload={"project_id": project_id, "name": name, "tags": tags}
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to untag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def list_prompt(
+    project_name: str,
+    name: str,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptVersionsResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_get_prompt_versions(
+            project_id=project_id, name=name
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to list prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+@dataclass
+class Prompt:
+    name: str
+    prompt: str
+    created_at: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: Optional[str] = None
+    metadata: Dict[str, str] = field(default_factory=dict)
+    _template: Template = field(init=False, repr=False)
+
+    def __post_init__(self):
+        template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+        self._template = Template(template_str)
+
+    @classmethod
+    def create(
+        cls,
+        project_name: str,
+        name: str,
+        prompt: str,
+        tags: Optional[List[str]] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if tags is None:
+            tags = []
+        commit_id, parent_commit_id, created_at = push_prompt(
+            project_name, name, prompt, tags, judgment_api_key, organization_id
+        )
+        return cls(
+            name=name,
+            prompt=prompt,
+            created_at=created_at,
+            tags=tags,
+            commit_id=commit_id,
+            parent_commit_id=parent_commit_id,
+        )
+
+    @classmethod
+    def get(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if commit_id is not None and tag is not None:
+            raise ValueError(
+                "You cannot fetch a prompt by both commit_id and tag at the same time"
+            )
+        prompt_config = fetch_prompt(
+            project_name, name, commit_id, tag, judgment_api_key, organization_id
+        )
+        if prompt_config is None:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt '{name}' not found in project '{project_name}'",
+                response=None,  # type: ignore
+            )
+        return cls(
+            name=prompt_config["name"],
+            prompt=prompt_config["prompt"],
+            created_at=prompt_config["created_at"],
+            tags=prompt_config["tags"],
+            commit_id=prompt_config["commit_id"],
+            parent_commit_id=prompt_config.get("parent_commit_id"),
+            metadata={
+                "creator_first_name": prompt_config["first_name"],
+                "creator_last_name": prompt_config["last_name"],
+                "creator_email": prompt_config["user_email"],
+            },
+        )
+
+    @classmethod
+    def tag(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = tag_prompt(
+            project_name, name, commit_id, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_id"]
+
+    @classmethod
+    def untag(
+        cls,
+        project_name: str,
+        name: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = untag_prompt(
+            project_name, name, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_ids"]
+
+    @classmethod
+    def list(
+        cls,
+        project_name: str,
+        name: str,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_configs = list_prompt(
+            project_name, name, judgment_api_key, organization_id
+        )["versions"]
+        return [
+            cls(
+                name=prompt_config["name"],
+                prompt=prompt_config["prompt"],
+                tags=prompt_config["tags"],
+                created_at=prompt_config["created_at"],
+                commit_id=prompt_config["commit_id"],
+                parent_commit_id=prompt_config.get("parent_commit_id"),
+                metadata={
+                    "creator_first_name": prompt_config["first_name"],
+                    "creator_last_name": prompt_config["last_name"],
+                    "creator_email": prompt_config["user_email"],
+                },
+            )
+            for prompt_config in prompt_configs
+        ]
+
+    def compile(self, **kwargs) -> str:
+        try:
+            return self._template.substitute(**kwargs)
+        except KeyError as e:
+            missing_var = str(e).strip("'")
+            raise ValueError(f"Missing required variable: {missing_var}")
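Below is a minimal usage sketch of the new prompt module (not part of the diff). It assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set; the project name, prompt name, and template text are made up for illustration.

from judgeval.prompt import Prompt

# Create a new prompt version; {{variables}} are rewritten to string.Template
# placeholders ($user, $product) by __post_init__.
prompt = Prompt.create(
    project_name="my-project",   # hypothetical project
    name="greeting",
    prompt="Hello {{user}}, welcome to {{product}}!",
    tags=["v1"],
)

# compile() substitutes the template variables and raises ValueError if one is missing.
text = prompt.compile(user="Ada", product="Judgment")

# Fetch a stored version by tag or by commit_id (passing both raises ValueError).
same = Prompt.get(project_name="my-project", name="greeting", tag="v1")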
judgeval/scorers/__init__.py
CHANGED
@@ -1,36 +1,29 @@
-from judgeval.scorers.api_scorer import
-
-
-
-
-
-
-
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
+from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
-    ContextualRelevancyScorer,
-    ContextualPrecisionScorer,
-    ContextualRecallScorer,
     AnswerRelevancyScorer,
-    ScorerWrapper,
     AnswerCorrectnessScorer,
-
+    InstructionAdherenceScorer,
+    TracePromptScorer,
+    PromptScorer,
 )
 
 __all__ = [
-    "
-    "
+    "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
+    "BaseScorer",
+    "ExampleScorer",
+    "TracePromptScorer",
     "PromptScorer",
-    "ClassifierScorer",
-    "ToolCorrectnessScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
     "AnswerRelevancyScorer",
-    "ScorerWrapper",
     "AnswerCorrectnessScorer",
-    "
+    "InstructionAdherenceScorer",
 ]
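For reference, the re-exported surface after this change can be imported as shown below (illustrative, mirroring the new __all__); removed names such as ScorerWrapper and the Contextual* scorers no longer import.

from judgeval.scorers import (
    APIScorerConfig,
    BaseScorer,
    ExampleScorer,
    FaithfulnessScorer,
    AnswerRelevancyScorer,
    AnswerCorrectnessScorer,
    InstructionAdherenceScorer,
    PromptScorer,
    TracePromptScorer,
)

# Names dropped from __all__ now fail at import time, e.g.:
# from judgeval.scorers import ScorerWrapper  # ImportError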
judgeval/scorers/agent_scorer.py
ADDED
@@ -0,0 +1,17 @@
+# from judgeval.scorers.base_scorer import BaseScorer
+# from judgeval.data.judgment_types import Trace as JudgmentTrace
+# from typing import List, Optional
+# from abc import abstractmethod
+
+
+# class TraceScorer(BaseScorer):
+#     @abstractmethod
+#     async def a_score_trace(
+#         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+#     ) -> float:
+#         """
+#         Asynchronously measures the score on a trace
+#         """
+#         raise NotImplementedError(
+#             "You must implement the `a_score_trace` method in your custom scorer"
+#         )
judgeval/scorers/api_scorer.py
CHANGED
@@ -4,61 +4,65 @@ Judgment Scorer class.
 Scores `Example`s using ready-made Judgment evaluators.
 """
 
-from
-from judgeval.common.logger import debug, info, warning, error
+from __future__ import annotations
 
-from
+from pydantic import BaseModel, field_validator
+from typing import List
+from judgeval.constants import APIScorerType
+from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
-class
+class APIScorerConfig(BaseModel):
     """
-
+    Scorer config that is used to send to our Judgment server.
 
     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        name (str): The name of the scorer, usually this is the same as the score_type
         threshold (float): A value between 0 and 1 that determines the scoring threshold
+        strict_mode (bool): Whether to use strict mode for the scorer
+        required_params (List[ExampleParams]): List of the required parameters on examples for the scorer
+        kwargs (dict): Additional keyword arguments to pass to the scorer
     """
-    threshold: float
-    score_type: APIScorer
 
-
-
+    score_type: APIScorerType
+    name: str = ""
+    threshold: float = 0.5
+    strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
+
+    required_params: List[ExampleParams] = []
+
+    kwargs: dict = {}
+
+    @field_validator("threshold")
+    @classmethod
+    def validate_threshold(cls, v, info):
        """
         Validates that the threshold is between 0 and 1 inclusive.
         """
+        score_type = info.data.get("score_type")
         if not 0 <= v <= 1:
-
-
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
+        return v
+
+    @field_validator("name", mode="after")
+    @classmethod
+    def set_name_to_score_type_if_none(cls, v, info):
+        if v is None:
+            return info.data.get("score_type")
         return v
 
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
-
-
-
-
-
-
-
-        return {
-            "score_type": self.score_type,
-            "threshold": self.threshold
-        }
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass
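A short sketch (not from the diff) of how the new config validates its threshold. The APIScorerType.FAITHFULNESS member name is an assumption; only the enum itself appears in the diff above.

from judgeval.constants import APIScorerType
from judgeval.scorers import ExampleAPIScorerConfig

# Valid: threshold inside [0, 1]
cfg = ExampleAPIScorerConfig(
    score_type=APIScorerType.FAITHFULNESS,  # assumed enum member
    threshold=0.8,
)

# Invalid: validate_threshold rejects out-of-range values; pydantic surfaces
# the ValueError raised inside the validator.
try:
    ExampleAPIScorerConfig(score_type=APIScorerType.FAITHFULNESS, threshold=1.5)
except ValueError as err:
    print(err)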
judgeval/scorers/base_scorer.py
CHANGED
@@ -1,52 +1,97 @@
 """
-
-
-Scores `Example`s using ready-made Judgment evaluators.
+Base class for all scorers.
 """
 
-from
-from
+from __future__ import annotations
+from typing import Dict, Optional
 
-from
+from pydantic import BaseModel
 
 
-
-
-
+from judgeval.judges.utils import create_judge
+from typing import Any
+from pydantic import model_validator, Field
+
 
-
-    score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-    threshold (float): A value between 0 and 1 that determines the scoring threshold
+class BaseScorer(BaseModel):
     """
-
-
+    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+    you can create a custom scorer by extending this class. This is best used for special use cases
+    where none of Judgment's scorers are suitable.
+    """
+
+    # type of your scorer (Faithfulness, PromptScorer)
+    score_type: str
+
+    # The threshold to pass a test while using this scorer as a scorer
+    threshold: float = 0.5
+
+    # name of your scorer (Faithfulness, PromptScorer-randomslug)
+    name: str = ""
+
+    # The name of the class of the scorer
+    class_name: Optional[str] = None
+
+    # The float score of the scorer run on the test case
+    score: Optional[float] = None
+
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = ""
+
+    # Whether the model is a native model
+    using_native_model: Optional[bool] = None
 
-
-
+    # Whether the test case passed or failed
+    success: bool = False
+
+    # The name of the model used to evaluate the test case
+    model: Optional[str] = None
+
+    # The model used to evaluate the test case
+    model_client: Optional[Any] = Field(default=None, exclude=True)
+
+    # Whether to run the scorer in strict mode
+    strict_mode: bool = False
+
+    # The error message if the scorer failed
+    error: Optional[str] = None
+
+    # Additional metadata for the scorer
+    additional_metadata: Optional[Dict] = None
+
+    # The user ID of the scorer
+    user: Optional[str] = None
+
+    # Whether the scorer is hosted on the server
+    server_hosted: bool = False
+
+    @model_validator(mode="after")
+    def enforce_strict_threshold(self):
+        if self.strict_mode:
+            self.threshold = 1.0
+        return self
+
+    @model_validator(mode="after")
+    def default_name(self):
+        self.class_name = self.__class__.__name__
+        if not self.name:
+            self.name = self.class_name
+        return self
+
+    def _add_model(self, model: str):
         """
-
+        Adds the evaluation model to the BaseScorer instance
+
+        This method is used at eval time
         """
-
-            error(f"Threshold must be between 0 and 1, got: {v}")
-            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
-        return v
+        self.model_client, self.using_native_model = create_judge(model)
 
-
-    def convert_to_enum_value(cls, v):
+    def success_check(self) -> bool:
         """
-
-        Converts string values to `JudgmentMetric` enum values.
+        For unit testing, determines whether the test case passes or fails
         """
-
-
-
-            return
-
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
-    def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        if self.error:
+            return False
+        if self.score is None:
+            return False
+        return self.score >= self.threshold
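A minimal sketch (not from the diff) exercising the validators defined above, assuming the fields behave exactly as declared.

from judgeval.scorers import BaseScorer

scorer = BaseScorer(score_type="Faithfulness", threshold=0.7, strict_mode=True)
assert scorer.threshold == 1.0      # enforce_strict_threshold overrides the value
assert scorer.name == "BaseScorer"  # default_name falls back to the class name

scorer.score = 1.0
assert scorer.success_check()       # score >= threshold passes

scorer.error = "judge model timed out"  # hypothetical error message
assert not scorer.success_check()       # any recorded error fails the check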
judgeval/scorers/example_scorer.py
ADDED
@@ -0,0 +1,17 @@
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.data import Example
+from typing import List
+from pydantic import Field
+
+
+class ExampleScorer(BaseScorer):
+    score_type: str = "Custom"
+    required_params: List[str] = Field(default_factory=list)
+
+    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
+        """
+        Asynchronously measures the score on a single example
+        """
+        raise NotImplementedError(
+            "You must implement the `a_score_example` method in your custom scorer"
+        )
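A minimal custom scorer sketch built on ExampleScorer (not from the diff); the Example fields actual_output and expected_output are assumed here for illustration.

from typing import Any

from judgeval.data import Example
from judgeval.scorers import ExampleScorer


class ExactMatchScorer(ExampleScorer):
    score_type: str = "Exact Match"
    threshold: float = 1.0

    async def a_score_example(self, example: Example, *args: Any, **kwargs: Any) -> float:
        # 1.0 when the output matches the reference exactly, else 0.0
        # (actual_output / expected_output are assumed Example fields)
        self.score = float(example.actual_output == example.expected_output)
        self.reason = "exact match" if self.score else "outputs differ"
        return self.score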