judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0

judgeval/prompt/__init__.py
@@ -0,0 +1,330 @@
+from typing import List, Optional, Dict
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
+from judgeval.api.api_types import (
+    PromptCommitInfo,
+    PromptTagResponse,
+    PromptUntagResponse,
+    PromptVersionsResponse,
+)
+from dataclasses import dataclass, field
+import re
+from string import Template
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+from judgeval.utils.project import _resolve_project_id
+
+
+def push_prompt(
+    project_name: str,
+    name: str,
+    prompt: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> tuple[str, Optional[str], str]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        r = client.prompts_insert(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "prompt": prompt,
+                "tags": tags,
+            }
+        )
+        return r["commit_id"], r.get("parent_commit_id"), r["created_at"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt: {e.detail}",
+            response=e.response,
+        )
+
+
+def fetch_prompt(
+    project_name: str,
+    name: str,
+    commit_id: Optional[str] = None,
+    tag: Optional[str] = None,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> Optional[PromptCommitInfo]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_fetch(
+            name=name,
+            project_id=project_id,
+            commit_id=commit_id,
+            tag=tag,
+        )
+        return prompt_config["commit"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def tag_prompt(
+    project_name: str,
+    name: str,
+    commit_id: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptTagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_tag(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "commit_id": commit_id,
+                "tags": tags,
+            }
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to tag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def untag_prompt(
+    project_name: str,
+    name: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptUntagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_untag(
+            payload={"project_id": project_id, "name": name, "tags": tags}
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to untag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def list_prompt(
+    project_name: str,
+    name: str,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptVersionsResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_get_prompt_versions(
+            project_id=project_id, name=name
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to list prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+@dataclass
+class Prompt:
+    name: str
+    prompt: str
+    created_at: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: Optional[str] = None
+    metadata: Dict[str, str] = field(default_factory=dict)
+    _template: Template = field(init=False, repr=False)
+
+    def __post_init__(self):
+        template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+        self._template = Template(template_str)
+
+    @classmethod
+    def create(
+        cls,
+        project_name: str,
+        name: str,
+        prompt: str,
+        tags: Optional[List[str]] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if tags is None:
+            tags = []
+        commit_id, parent_commit_id, created_at = push_prompt(
+            project_name, name, prompt, tags, judgment_api_key, organization_id
+        )
+        return cls(
+            name=name,
+            prompt=prompt,
+            created_at=created_at,
+            tags=tags,
+            commit_id=commit_id,
+            parent_commit_id=parent_commit_id,
+        )
+
+    @classmethod
+    def get(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if commit_id is not None and tag is not None:
+            raise ValueError(
+                "You cannot fetch a prompt by both commit_id and tag at the same time"
+            )
+        prompt_config = fetch_prompt(
+            project_name, name, commit_id, tag, judgment_api_key, organization_id
+        )
+        if prompt_config is None:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt '{name}' not found in project '{project_name}'",
+                response=None,  # type: ignore
+            )
+        return cls(
+            name=prompt_config["name"],
+            prompt=prompt_config["prompt"],
+            created_at=prompt_config["created_at"],
+            tags=prompt_config["tags"],
+            commit_id=prompt_config["commit_id"],
+            parent_commit_id=prompt_config.get("parent_commit_id"),
+            metadata={
+                "creator_first_name": prompt_config["first_name"],
+                "creator_last_name": prompt_config["last_name"],
+                "creator_email": prompt_config["user_email"],
+            },
+        )
+
+    @classmethod
+    def tag(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = tag_prompt(
+            project_name, name, commit_id, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_id"]
+
+    @classmethod
+    def untag(
+        cls,
+        project_name: str,
+        name: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = untag_prompt(
+            project_name, name, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_ids"]
+
+    @classmethod
+    def list(
+        cls,
+        project_name: str,
+        name: str,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_configs = list_prompt(
+            project_name, name, judgment_api_key, organization_id
+        )["versions"]
+        return [
+            cls(
+                name=prompt_config["name"],
+                prompt=prompt_config["prompt"],
+                tags=prompt_config["tags"],
+                created_at=prompt_config["created_at"],
+                commit_id=prompt_config["commit_id"],
+                parent_commit_id=prompt_config.get("parent_commit_id"),
+                metadata={
+                    "creator_first_name": prompt_config["first_name"],
+                    "creator_last_name": prompt_config["last_name"],
+                    "creator_email": prompt_config["user_email"],
+                },
+            )
+            for prompt_config in prompt_configs
+        ]
+
+    def compile(self, **kwargs) -> str:
+        try:
+            return self._template.substitute(**kwargs)
+        except KeyError as e:
+            missing_var = str(e).strip("'")
+            raise ValueError(f"Missing required variable: {missing_var}")

judgeval/scorers/__init__.py
@@ -1,36 +1,29 @@
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
-from judgeval.scorers.judgeval_scorers import (
-    ToolCorrectnessScorer,
-    JSONCorrectnessScorer,
-    SummarizationScorer,
-    HallucinationScorer,
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
+from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
-    ContextualRelevancyScorer,
-    ContextualPrecisionScorer,
-    ContextualRecallScorer,
     AnswerRelevancyScorer,
-    ScorerWrapper,
     AnswerCorrectnessScorer,
-    Text2SQLScorer,
+    InstructionAdherenceScorer,
+    TracePromptScorer,
+    PromptScorer,
 )
 
 __all__ = [
-    "APIJudgmentScorer",
-    "JudgevalScorer",
+    "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
+    "BaseScorer",
+    "ExampleScorer",
+    "TracePromptScorer",
     "PromptScorer",
-    "ClassifierScorer",
-    "ToolCorrectnessScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
     "AnswerRelevancyScorer",
-    "ScorerWrapper",
     "AnswerCorrectnessScorer",
-    "Text2SQLScorer",
+    "InstructionAdherenceScorer",
 ]
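
As a quick sanity check of the new public surface, the following imports use only names listed in the __all__ above; previously exported names such as JudgevalScorer, ScorerWrapper, and Text2SQLScorer are gone, so call sites have to migrate:

    from judgeval.scorers import (
        AnswerCorrectnessScorer,
        AnswerRelevancyScorer,
        BaseScorer,
        ExampleScorer,
        FaithfulnessScorer,
        InstructionAdherenceScorer,
        PromptScorer,
    )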

judgeval/scorers/agent_scorer.py
@@ -0,0 +1,17 @@
+# from judgeval.scorers.base_scorer import BaseScorer
+# from judgeval.data.judgment_types import Trace as JudgmentTrace
+# from typing import List, Optional
+# from abc import abstractmethod
+
+
+# class TraceScorer(BaseScorer):
+#     @abstractmethod
+#     async def a_score_trace(
+#         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+#     ) -> float:
+#         """
+#         Asynchronously measures the score on a trace
+#         """
+#         raise NotImplementedError(
+#             "You must implement the `a_score_trace` method in your custom scorer"
+#         )

judgeval/scorers/api_scorer.py
@@ -4,61 +4,65 @@ Judgment Scorer class.
 Scores `Example`s using ready-made Judgment evaluators.
 """
 
-from pydantic import BaseModel, field_validator
-from judgeval.common.logger import debug, info, warning, error
+from __future__ import annotations
 
-from judgeval.constants import APIScorer
+from pydantic import BaseModel, field_validator
+from typing import List
+from judgeval.constants import APIScorerType
+from judgeval.data.example import ExampleParams
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
-class APIJudgmentScorer(BaseModel):
+class APIScorerConfig(BaseModel):
     """
-    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+    Scorer config that is used to send to our Judgment server.
 
     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        name (str): The name of the scorer, usually this is the same as the score_type
         threshold (float): A value between 0 and 1 that determines the scoring threshold
+        strict_mode (bool): Whether to use strict mode for the scorer
+        required_params (List[ExampleParams]): List of the required parameters on examples for the scorer
+        kwargs (dict): Additional keyword arguments to pass to the scorer
     """
-    threshold: float
-    score_type: APIScorer
 
-    @field_validator('threshold')
-    def validate_threshold(cls, v):
+    score_type: APIScorerType
+    name: str = ""
+    threshold: float = 0.5
+    strict_mode: bool = False
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL
+
+    required_params: List[ExampleParams] = []
+
+    kwargs: dict = {}
+
+    @field_validator("threshold")
+    @classmethod
+    def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
         """
+        score_type = info.data.get("score_type")
         if not 0 <= v <= 1:
-            error(f"Threshold must be between 0 and 1, got: {v}")
-            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+            raise ValueError(
+                f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+            )
+        return v
+
+    @field_validator("name", mode="after")
+    @classmethod
+    def set_name_to_score_type_if_none(cls, v, info):
+        if v is None:
+            return info.data.get("score_type")
         return v
 
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
-    def to_dict(self) -> dict:
-        """
-        Converts the scorer configuration to a dictionary format.
-
-        Returns:
-            dict: A dictionary containing the scorer's configuration
-        """
-        return {
-            "score_type": self.score_type,
-            "threshold": self.threshold
-        }
-
+        return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass
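
To illustrate the threshold bound enforced by validate_threshold, a hedged sketch; it assumes FaithfulnessScorer (exported from judgeval.scorers) subclasses this config with its score_type pre-set and no extra required constructor arguments, which this diff does not show:

    from pydantic import ValidationError
    from judgeval.scorers import FaithfulnessScorer

    scorer = FaithfulnessScorer(threshold=0.7)  # accepted: threshold must lie in [0, 1]

    try:
        FaithfulnessScorer(threshold=1.5)       # rejected by validate_threshold
    except ValidationError as err:
        print(err)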

judgeval/scorers/base_scorer.py
@@ -1,52 +1,97 @@
 """
-Judgment Scorer class.
-
-Scores `Example`s using ready-made Judgment evaluators.
+Base class for all scorers.
 """
 
-from pydantic import BaseModel, field_validator
-from judgeval.common.logger import debug, info, warning, error
+from __future__ import annotations
+from typing import Dict, Optional
 
-from judgeval.constants import APIScorer
+from pydantic import BaseModel
 
 
-class APIJudgmentScorer(BaseModel):
-    """
-    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+from judgeval.judges.utils import create_judge
+from typing import Any
+from pydantic import model_validator, Field
+
 
-    Args:
-        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
-        threshold (float): A value between 0 and 1 that determines the scoring threshold
+class BaseScorer(BaseModel):
     """
-    threshold: float
-    score_type: APIScorer
+    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+    you can create a custom scorer by extending this class. This is best used for special use cases
+    where none of Judgment's scorers are suitable.
+    """
+
+    # type of your scorer (Faithfulness, PromptScorer)
+    score_type: str
+
+    # The threshold to pass a test while using this scorer as a scorer
+    threshold: float = 0.5
+
+    # name of your scorer (Faithfulness, PromptScorer-randomslug)
+    name: str = ""
+
+    # The name of the class of the scorer
+    class_name: Optional[str] = None
+
+    # The float score of the scorer run on the test case
+    score: Optional[float] = None
+
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = ""
+
+    # Whether the model is a native model
+    using_native_model: Optional[bool] = None
 
-    @field_validator('threshold')
-    def validate_threshold(cls, v):
+    # Whether the test case passed or failed
+    success: bool = False
+
+    # The name of the model used to evaluate the test case
+    model: Optional[str] = None
+
+    # The model used to evaluate the test case
+    model_client: Optional[Any] = Field(default=None, exclude=True)
+
+    # Whether to run the scorer in strict mode
+    strict_mode: bool = False
+
+    # The error message if the scorer failed
+    error: Optional[str] = None
+
+    # Additional metadata for the scorer
+    additional_metadata: Optional[Dict] = None
+
+    # The user ID of the scorer
+    user: Optional[str] = None
+
+    # Whether the scorer is hosted on the server
+    server_hosted: bool = False
+
+    @model_validator(mode="after")
+    def enforce_strict_threshold(self):
+        if self.strict_mode:
+            self.threshold = 1.0
+        return self
+
+    @model_validator(mode="after")
+    def default_name(self):
+        self.class_name = self.__class__.__name__
+        if not self.name:
+            self.name = self.class_name
+        return self
+
+    def _add_model(self, model: str):
         """
-        Validates that the threshold is between 0 and 1 inclusive.
+        Adds the evaluation model to the BaseScorer instance
+
+        This method is used at eval time
         """
-        if not 0 <= v <= 1:
-            error(f"Threshold must be between 0 and 1, got: {v}")
-            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
-        return v
+        self.model_client, self.using_native_model = create_judge(model)
 
-    @field_validator('score_type')
-    def convert_to_enum_value(cls, v):
+    def success_check(self) -> bool:
         """
-        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
-        Converts string values to `JudgmentMetric` enum values.
+        For unit testing, determines whether the test case passes or fails
         """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing JudgmentMetric: {v.value}")
-            return v.value
-        elif isinstance(v, str):
-            debug(f"Converting string value to JudgmentMetric enum: {v}")
-            return APIScorer[v.upper()].value
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
-
-    def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
-
+        if self.error:
+            return False
+        if self.score is None:
+            return False
+        return self.score >= self.threshold
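
Based on the validators shown above, a small illustrative snippet (field values are arbitrary):

    from judgeval.scorers import BaseScorer

    s = BaseScorer(score_type="Custom", strict_mode=True, threshold=0.3)
    assert s.threshold == 1.0      # enforce_strict_threshold overrides the threshold in strict mode
    assert s.name == "BaseScorer"  # default_name falls back to the class name

    s.score = 1.0
    assert s.success_check()       # passes: no error recorded and score >= threshold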

judgeval/scorers/example_scorer.py
@@ -0,0 +1,17 @@
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.data import Example
+from typing import List
+from pydantic import Field
+
+
+class ExampleScorer(BaseScorer):
+    score_type: str = "Custom"
+    required_params: List[str] = Field(default_factory=list)
+
+    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
+        """
+        Asynchronously measures the score on a single example
+        """
+        raise NotImplementedError(
+            "You must implement the `a_score_example` method in your custom scorer"
+        )
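
As a sketch of how a custom scorer plugs into this base class (the keyword check and the actual_output attribute access are illustrative assumptions, not part of the package):

    from judgeval.data import Example
    from judgeval.scorers import ExampleScorer


    class KeywordScorer(ExampleScorer):
        """Toy scorer: passes when a keyword appears in the example's output."""

        keyword: str = "refund"

        async def a_score_example(self, example: Example, *args, **kwargs) -> float:
            # Assumes the Example carries the model output in `actual_output`;
            # adjust the attribute name to match your Example schema.
            text = str(getattr(example, "actual_output", "") or "")
            self.score = 1.0 if self.keyword.lower() in text.lower() else 0.0
            self.reason = f"keyword '{self.keyword}' {'found' if self.score else 'not found'}"
            return self.score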

judgeval/scorers/exceptions.py
@@ -8,4 +8,5 @@ class MissingExampleParamsError(Exception):
     """
     Error raised when a scorer is missing required example parameters.
     """
+
     pass