judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/__init__.py CHANGED
@@ -2,6 +2,7 @@
2
2
  from judgeval.clients import client, together_client
3
3
  from judgeval.judgment_client import JudgmentClient
4
4
  from judgeval.version_check import check_latest_version
5
+ from judgeval.local_eval_queue import LocalEvaluationQueue
5
6
 
6
7
  check_latest_version()
7
8
 
@@ -10,4 +11,5 @@ __all__ = [
10
11
  "client",
11
12
  "together_client",
12
13
  "JudgmentClient",
14
+ "LocalEvaluationQueue",
13
15
  ]
judgeval/cli.py ADDED
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import typer
4
+ from pathlib import Path
5
+ from dotenv import load_dotenv
6
+ from judgeval.common.logger import judgeval_logger
7
+ from judgeval.judgment_client import JudgmentClient
8
+
9
# Load variables from a .env file into the environment at import time, i.e.
# before any command body constructs a JudgmentClient.
load_dotenv()

# Typer application that the CLI commands below register themselves on.
# Invoking it with no arguments prints help; rich markup/help panels and
# Typer's pretty exception rendering are all switched off.
app = typer.Typer(
    no_args_is_help=True,
    rich_markup_mode=None,
    rich_help_panel=None,
    pretty_exceptions_enable=False,
    pretty_exceptions_show_locals=False,
    pretty_exceptions_short=False,
)
19
+
20
+
21
@app.command("upload_scorer")
def upload_scorer(
    scorer_file_path: str,
    requirements_file_path: str,
    unique_name: str = typer.Option(
        None, help="Custom name for the scorer (auto-detected if not provided)"
    ),
):
    """Upload a custom scorer file together with its requirements file.

    Exits with status 1 when either path is not an existing file or when the
    upload reports failure, and with status 0 on success.
    """
    # Validate file paths. is_file() (rather than exists()) also rejects
    # directories, which would otherwise pass validation and fail later.
    if not Path(scorer_file_path).is_file():
        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
        raise typer.Exit(1)

    if not Path(requirements_file_path).is_file():
        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
        raise typer.Exit(1)

    # Any exception raised by the client propagates unchanged to the caller.
    client = JudgmentClient()
    result = client.save_custom_scorer(
        scorer_file_path=scorer_file_path,
        requirements_file_path=requirements_file_path,
        unique_name=unique_name,
    )

    if not result:
        judgeval_logger.error("Failed to upload custom scorer")
        raise typer.Exit(1)

    raise typer.Exit(0)
54
+
55
+
56
@app.command()
def version():
    """Show version info"""
    # Imported locally (and aliased) so the name does not clash with this
    # command function, which is itself called ``version``.
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as _installed_version

    try:
        # Report the version of the installed distribution instead of a
        # hard-coded placeholder.
        current = _installed_version("judgeval")
    except PackageNotFoundError:
        # No installed distribution metadata (e.g. running from a source
        # checkout): fall back to the previous placeholder value.
        current = "0.0.0"

    judgeval_logger.info(f"JudgEval CLI v{current}")
60
+
61
+
62
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
64
+
65
+ # Example: judgeval upload_scorer path/to/scorer.py path/to/requirements.txt
judgeval/clients.py CHANGED
@@ -2,7 +2,6 @@ import os
2
2
  from dotenv import load_dotenv
3
3
  from openai import OpenAI
4
4
  from typing import Optional
5
- from together import Together, AsyncTogether
6
5
 
7
6
  PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
8
7
  load_dotenv(dotenv_path=PATH_TO_DOTENV)
@@ -28,6 +27,8 @@ async_together_client: Optional["AsyncTogether"] = None
28
27
  together_api_key = os.getenv("TOGETHERAI_API_KEY") or os.getenv("TOGETHER_API_KEY")
29
28
  if together_api_key:
30
29
  try:
30
+ from together import Together, AsyncTogether
31
+
31
32
  together_client = Together(api_key=together_api_key)
32
33
  async_together_client = AsyncTogether(api_key=together_api_key)
33
34
  except Exception:
@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
20
20
  JUDGMENT_EVAL_DELETE_API_URL,
21
21
  JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
22
22
  JUDGMENT_GET_EVAL_STATUS_API_URL,
23
- JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
24
- JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
25
23
  JUDGMENT_SCORER_SAVE_API_URL,
26
24
  JUDGMENT_SCORER_FETCH_API_URL,
27
25
  JUDGMENT_SCORER_EXISTS_API_URL,
26
+ JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
28
27
  JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
29
- JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
30
28
  )
31
29
  from judgeval.common.api.constants import (
32
30
  TraceFetchPayload,
@@ -45,16 +43,14 @@ from judgeval.common.api.constants import (
45
43
  DeleteEvalRunRequestBody,
46
44
  EvalLogPayload,
47
45
  EvalStatusPayload,
48
- CheckExperimentTypePayload,
49
- EvalRunNameExistsPayload,
50
46
  ScorerSavePayload,
51
47
  ScorerFetchPayload,
52
48
  ScorerExistsPayload,
53
- CheckExampleKeysPayload,
49
+ CustomScorerUploadPayload,
50
+ CustomScorerTemplateResponse,
54
51
  )
55
52
  from judgeval.utils.requests import requests
56
-
57
- import orjson
53
+ from judgeval.common.api.json_encoder import json_encoder
58
54
 
59
55
 
60
56
  class JudgmentAPIException(exceptions.HTTPError):
@@ -98,22 +94,28 @@ class JudgmentApiClient:
98
94
  method: Literal["POST", "PATCH", "GET", "DELETE"],
99
95
  url: str,
100
96
  payload: Any,
97
+ timeout: Optional[Union[float, tuple]] = None,
101
98
  ) -> Any:
99
+ # Prepare request kwargs with optional timeout
100
+ request_kwargs = self._request_kwargs()
101
+ if timeout is not None:
102
+ request_kwargs["timeout"] = timeout
103
+
102
104
  if method == "GET":
103
105
  r = requests.request(
104
106
  method,
105
107
  url,
106
108
  params=payload,
107
109
  headers=self._headers(),
108
- **self._request_kwargs(),
110
+ **request_kwargs,
109
111
  )
110
112
  else:
111
113
  r = requests.request(
112
114
  method,
113
115
  url,
114
- data=self._serialize(payload),
116
+ json=json_encoder(payload),
115
117
  headers=self._headers(),
116
- **self._request_kwargs(),
118
+ **request_kwargs,
117
119
  )
118
120
 
119
121
  try:
@@ -187,10 +189,10 @@ class JudgmentApiClient:
187
189
  payload: EvalLogPayload = {"results": results, "run": run}
188
190
  return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)
189
191
 
190
- def fetch_evaluation_results(self, project_name: str, eval_name: str):
192
+ def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
191
193
  payload: EvalRunRequestBody = {
192
194
  "project_name": project_name,
193
- "eval_name": eval_name,
195
+ "experiment_run_id": experiment_run_id,
194
196
  }
195
197
  return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)
196
198
 
@@ -205,43 +207,21 @@ class JudgmentApiClient:
205
207
  def add_to_evaluation_queue(self, payload: Dict[str, Any]):
206
208
  return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)
207
209
 
208
- def get_evaluation_status(self, eval_name: str, project_name: str):
210
+ def get_evaluation_status(self, experiment_run_id: str, project_name: str):
209
211
  payload: EvalStatusPayload = {
210
- "eval_name": eval_name,
212
+ "experiment_run_id": experiment_run_id,
211
213
  "project_name": project_name,
212
214
  "judgment_api_key": self.api_key,
213
215
  }
214
216
  return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)
215
217
 
216
- def check_experiment_type(self, eval_name: str, project_name: str, is_trace: bool):
217
- payload: CheckExperimentTypePayload = {
218
- "eval_name": eval_name,
219
- "project_name": project_name,
220
- "judgment_api_key": self.api_key,
221
- "is_trace": is_trace,
222
- }
223
- return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
224
-
225
- def check_eval_run_name_exists(self, eval_name: str, project_name: str):
226
- payload: EvalRunNameExistsPayload = {
227
- "eval_name": eval_name,
228
- "project_name": project_name,
229
- "judgment_api_key": self.api_key,
230
- }
231
- return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
232
-
233
- def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
234
- payload: CheckExampleKeysPayload = {
235
- "keys": keys,
236
- "eval_name": eval_name,
237
- "project_name": project_name,
238
- }
239
- return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
240
-
241
- def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
218
+ def save_scorer(
219
+ self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
220
+ ):
242
221
  payload: ScorerSavePayload = {
243
222
  "name": name,
244
223
  "prompt": prompt,
224
+ "threshold": threshold,
245
225
  "options": options,
246
226
  }
247
227
  try:
@@ -293,6 +273,31 @@ class JudgmentApiClient:
293
273
  request=e.request,
294
274
  )
295
275
 
276
def upload_custom_scorer(
    self,
    scorer_name: str,
    scorer_code: str,
    requirements_text: str,
) -> CustomScorerTemplateResponse:
    """Upload custom scorer to backend.

    Sends ``scorer_name``, ``scorer_code`` and ``requirements_text`` in a
    POST request to ``JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL`` and returns
    whatever ``_do_request`` returns for it.

    Any ``JudgmentAPIException`` raised by ``_do_request`` propagates to the
    caller unchanged.
    """
    payload: CustomScorerUploadPayload = {
        "scorer_name": scorer_name,
        "scorer_code": scorer_code,
        "requirements_text": requirements_text,
    }

    # Use a longer timeout for custom scorer upload: the tuple is
    # (connect, read) in seconds, i.e. 10 s to connect and 5 minutes to read.
    return self._do_request(
        "POST",
        JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
        payload,
        timeout=(10, 300),
    )
300
+
296
301
  def push_dataset(
297
302
  self,
298
303
  dataset_alias: str,
@@ -368,16 +373,3 @@ class JudgmentApiClient:
368
373
  "verify": True,
369
374
  "timeout": 30,
370
375
  }
371
-
372
- def _serialize(self, data: Any) -> str:
373
- def fallback_encoder(obj):
374
- try:
375
- return repr(obj)
376
- except Exception:
377
- try:
378
- return str(obj)
379
- except Exception as e:
380
- return f"<Unserializable object of type {type(obj).__name__}: {e}>"
381
-
382
- # orjson returns bytes, so we need to decode to str
383
- return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
49
49
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
50
50
  JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
51
51
  JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
52
- JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
53
- JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
54
- JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
52
+
53
+ # Custom Scorers API
54
+ JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/build_sandbox_template/"
55
55
 
56
56
 
57
57
  # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):
73
73
 
74
74
 
75
75
  class EvalStatusPayload(TypedDict):
76
- eval_name: str
77
- project_name: str
76
+ experiment_run_id: str
78
77
  judgment_api_key: str
78
+ project_name: str
79
79
 
80
80
 
81
81
  class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
162
162
  class ScorerSavePayload(TypedDict):
163
163
  name: str
164
164
  prompt: str
165
+ threshold: float
165
166
  options: Optional[dict]
166
167
 
167
168
 
@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):
171
172
 
172
173
  class ScorerExistsPayload(TypedDict):
173
174
  name: str
175
+
176
+
177
class CustomScorerUploadPayload(TypedDict):
    """Request body for the custom scorer upload endpoint."""

    # Name under which the scorer is uploaded.
    scorer_name: str
    # Scorer source, sent as text (presumably the scorer file's contents; verify against caller).
    scorer_code: str
    # Requirements, sent as text (presumably the requirements file's contents; verify against caller).
    requirements_text: str
181
+
182
+
183
class CustomScorerTemplateResponse(TypedDict):
    """Shape of the response to a custom scorer upload."""

    # Scorer name (presumably echoing the uploaded name; confirm against the backend).
    scorer_name: str
    # Status string; the set of possible values is not visible here.
    status: str
    # Accompanying message text.
    message: str
@@ -0,0 +1,241 @@
1
+ """
2
+
3
+ This is a modified version of https://docs.powertools.aws.dev/lambda/python/2.35.1/api/event_handler/openapi/encoders.html
4
+
5
+ """
6
+
7
+ import dataclasses
8
+ import datetime
9
+ from collections import defaultdict, deque
10
+ from decimal import Decimal
11
+ from enum import Enum
12
+ from pathlib import Path, PurePath
13
+ from re import Pattern
14
+ from types import GeneratorType
15
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, Union
16
+ from uuid import UUID
17
+
18
+ from pydantic import BaseModel
19
+ from pydantic.types import SecretBytes, SecretStr
20
+
21
+
22
+ """
23
+ This module contains the encoders used by jsonable_encoder to convert Python objects to JSON serializable data types.
24
+ """
25
+
26
+
27
def _model_dump(
    model: BaseModel, mode: Literal["json", "python"] = "json", **kwargs: Any
) -> Any:
    """Call ``model.model_dump`` with ``mode`` plus any extra keyword arguments."""
    # ``mode`` is a named parameter, so it can never also appear in ``kwargs``.
    dump_options = dict(kwargs, mode=mode)
    return model.model_dump(**dump_options)
31
+
32
+
33
def json_encoder(
    obj: Any,
    custom_serializer: Optional[Callable[[Any], str]] = None,
) -> Any:
    """
    JSON encodes an arbitrary Python object into JSON serializable data types.

    This is a modified version of fastapi.encoders.jsonable_encoder that supports
    encoding of pydantic.BaseModel objects.

    Checks run in a fixed order: pydantic models, dataclass instances, enums,
    paths, scalars, dicts, sequences, the ENCODERS_BY_TYPE table (exact type
    first, then isinstance), ``custom_serializer`` and finally ``_dump_other``.

    Parameters
    ----------
    obj : Any
        The object to encode
    custom_serializer : Callable, optional
        A custom serializer to use for encoding the object, when everything else fails.
        It is only consulted for ``obj`` itself: the recursive calls made from
        this function do not forward it.

    Returns
    -------
    Any
        The JSON serializable data types
    """
    # Pydantic models
    if isinstance(obj, BaseModel):
        return _dump_base_model(obj=obj)

    # Dataclass instances. is_dataclass() is also true for dataclass *classes*,
    # which asdict() rejects with TypeError, so classes fall through to the
    # generic handling further down instead of crashing here.
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        return json_encoder(dataclasses.asdict(obj))

    # Enums
    if isinstance(obj, Enum):
        return obj.value

    # Paths
    if isinstance(obj, PurePath):
        return str(obj)

    # Scalars (bool is a subclass of int, so it is returned as-is too)
    if isinstance(obj, (str, int, float, type(None))):
        return obj

    # Dictionaries
    if isinstance(obj, dict):
        return _dump_dict(obj=obj)

    # Sequences
    if isinstance(obj, (list, set, frozenset, tuple, deque)):
        return _dump_sequence(obj=obj)

    # Other types: exact type match first, then subclass match
    exact_encoder = ENCODERS_BY_TYPE.get(type(obj))
    if exact_encoder is not None:
        return exact_encoder(obj)

    for encoder, classes_tuple in encoders_by_class_tuples.items():
        if isinstance(obj, classes_tuple):
            return encoder(obj)

    # Use custom serializer if present
    if custom_serializer:
        return custom_serializer(obj)

    # Default
    return _dump_other(obj=obj)
108
+
109
+
110
def _dump_base_model(
    *,
    obj: Any,
):
    """
    Dump a BaseModel object to JSON serializable data.

    The model is dumped through ``_model_dump`` in "json" mode. If the dump is
    a dict containing a ``"__root__"`` key, that entry replaces the dict. The
    result is then passed through ``json_encoder``.
    """
    dumped = _model_dump(obj, mode="json")

    # Only a dict can carry a "__root__" key. The dump is not guaranteed to be
    # a dict (presumably the case for root models; see the pydantic docs): on
    # an int the bare ``in`` test raises TypeError, and on a str it silently
    # becomes a substring match.
    if isinstance(dumped, dict) and "__root__" in dumped:
        dumped = dumped["__root__"]

    return json_encoder(dumped)
127
+
128
+
129
def _dump_dict(
    *,
    obj: Any,
) -> Dict[str, Any]:
    """
    Dump a dict to a dict, encoding every key and every value with json_encoder.

    Entries are processed in iteration order, key before value. If two keys
    encode to the same value, the later entry overwrites the earlier one.
    """
    # The former ``allowed_keys`` set was built from obj's own keys, so its
    # membership test was always true; it is dropped as dead code.
    return {json_encoder(key): json_encoder(value) for key, value in obj.items()}
148
+
149
+
150
def _dump_sequence(
    *,
    obj: Any,
) -> List[Any]:
    """
    Encode each item of an iterable with json_encoder and collect the results
    in a new list, in iteration order.
    """
    return [json_encoder(element) for element in obj]
165
+
166
+
167
+ def _dump_other(
168
+ *,
169
+ obj: Any,
170
+ ) -> Any:
171
+ """
172
+ Dump an object to a representation without iterating it.
173
+
174
+ Avoids calling dict(obj) which can consume iterators/generators or
175
+ invoke user-defined iteration protocols.
176
+ """
177
+ try:
178
+ return repr(obj)
179
+ except Exception:
180
+ return str(obj)
181
+
182
+
183
def iso_format(o: Union[datetime.date, datetime.time]) -> str:
    """
    Return the ``isoformat()`` string of a date, datetime or time value.
    """
    formatted = o.isoformat()
    return formatted
188
+
189
+
190
def decimal_encoder(dec_value: Decimal) -> Union[int, float]:
    """
    Encodes a Decimal as int if there's no exponent, otherwise float

    This is useful when we use ConstrainedDecimal to represent Numeric(x,0)
    where an integer (but not int typed) is used. Encoding this as a float
    results in failed round-tripping between encode and parse.

    Non-finite values (NaN, Infinity) have a non-integer exponent marker and
    are encoded as the corresponding float. A signaling NaN still raises
    ValueError, because ``float()`` refuses to convert it.

    >>> decimal_encoder(Decimal("1.0"))
    1.0

    >>> decimal_encoder(Decimal("1"))
    1
    """
    exponent = dec_value.as_tuple().exponent
    # For NaN/Infinity the exponent is a string marker rather than an int, so
    # comparing it with 0 directly would raise TypeError.
    if isinstance(exponent, int) and exponent >= 0:
        return int(dec_value)
    return float(dec_value)
208
+
209
+
210
# Encoders for types that need an explicit conversion to a JSON serializable
# value. Insertion order is significant: it is the order in which the derived
# encoder -> classes mapping is built.
ENCODERS_BY_TYPE: Dict[Type[Any], Callable[[Any], Any]] = {
    # "backslashreplace" keeps non-UTF-8 payloads encodable (invalid bytes
    # become \xNN escapes) instead of raising UnicodeDecodeError; valid UTF-8
    # decodes exactly as before.
    bytes: lambda o: o.decode(errors="backslashreplace"),
    datetime.date: iso_format,
    datetime.datetime: iso_format,
    datetime.time: iso_format,
    datetime.timedelta: lambda td: td.total_seconds(),
    Decimal: decimal_encoder,
    Enum: lambda o: o.value,
    frozenset: list,
    deque: list,
    # Generators are represented by their repr rather than being consumed.
    GeneratorType: repr,
    Path: str,
    Pattern: lambda o: o.pattern,
    SecretBytes: str,
    SecretStr: str,
    set: list,
    UUID: str,
}
228
+
229
+
230
# Generates a mapping of encoders to a tuple of classes that they can encode
def generate_encoders_by_class_tuples(
    type_encoder_map: Dict[Any, Callable[[Any], Any]],
) -> Dict[Callable[[Any], Any], Tuple[Any, ...]]:
    """
    Invert a type -> encoder mapping into encoder -> tuple of types.

    Types that share one encoder are collected into a single tuple, in the
    order they appear in ``type_encoder_map``. The result is a
    ``defaultdict(tuple)``, so looking up an unknown encoder yields ``()``.
    """
    grouped: Dict[Callable[[Any], Any], Tuple[Any, ...]] = defaultdict(tuple)
    for handled_type in type_encoder_map:
        encode = type_encoder_map[handled_type]
        grouped[encode] = grouped[encode] + (handled_type,)
    return grouped
238
+
239
+
240
# Mapping of encoders to a tuple of classes that they can encode.
# Built once at import time from ENCODERS_BY_TYPE; json_encoder iterates it to
# match objects by isinstance() when their exact type is not in the table.
encoders_by_class_tuples = generate_encoders_by_class_tuples(ENCODERS_BY_TYPE)