judgeval 0.0.54__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/__init__.py +3 -0
- judgeval/common/api/api.py +352 -0
- judgeval/common/api/constants.py +165 -0
- judgeval/common/storage/__init__.py +6 -0
- judgeval/common/tracer/__init__.py +31 -0
- judgeval/common/tracer/constants.py +22 -0
- judgeval/common/tracer/core.py +1916 -0
- judgeval/common/tracer/otel_exporter.py +108 -0
- judgeval/common/tracer/otel_span_processor.py +234 -0
- judgeval/common/tracer/span_processor.py +37 -0
- judgeval/common/tracer/span_transformer.py +211 -0
- judgeval/common/tracer/trace_manager.py +92 -0
- judgeval/common/utils.py +2 -2
- judgeval/constants.py +3 -30
- judgeval/data/datasets/eval_dataset_client.py +29 -156
- judgeval/data/judgment_types.py +4 -12
- judgeval/data/result.py +1 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/data/scripts/openapi_transform.py +1 -1
- judgeval/data/trace.py +66 -1
- judgeval/data/trace_run.py +0 -3
- judgeval/evaluation_run.py +0 -2
- judgeval/integrations/langgraph.py +43 -164
- judgeval/judgment_client.py +17 -211
- judgeval/run_evaluation.py +209 -611
- judgeval/scorers/__init__.py +2 -6
- judgeval/scorers/base_scorer.py +4 -23
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
- judgeval/scorers/score.py +2 -1
- judgeval/scorers/utils.py +1 -13
- judgeval/utils/requests.py +21 -0
- judgeval-0.1.0.dist-info/METADATA +202 -0
- {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
- judgeval/common/tracer.py +0 -3215
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
- judgeval-0.0.54.dist-info/METADATA +0 -1384
- /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
- {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
- {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
```diff
@@ -5,11 +5,7 @@ Implements the JudgmentClient to interact with the Judgment API.
 import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
-from requests import codes
-from judgeval.utils.requests import requests
-import asyncio

-from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
@@ -19,40 +15,31 @@ from judgeval.data import (
 from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
-    ClassifierScorer,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
     run_trace_eval,
-    safe_run_async,
 )
 from judgeval.data.trace_run import TraceRun
-from judgeval.constants import (
-    JUDGMENT_EVAL_FETCH_API_URL,
-    JUDGMENT_PROJECT_DELETE_API_URL,
-    JUDGMENT_PROJECT_CREATE_API_URL,
-)
+from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
 from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
-from judgeval.run_evaluation import SpinnerWrappedTask
 from judgeval.common.logger import judgeval_logger


 class EvalRunRequestBody(BaseModel):
     eval_name: str
     project_name: str
-    judgment_api_key: str


 class DeleteEvalRunRequestBody(BaseModel):
     eval_names: List[str]
     project_name: str
-    judgment_api_key: str


 class SingletonMeta(type):
@@ -83,6 +70,7 @@ class JudgmentClient(metaclass=SingletonMeta):

         self.judgment_api_key = api_key
         self.organization_id = organization_id
+        self.api_client = JudgmentApiClient(api_key, organization_id)
         self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)

         # Verify API key is valid
```
```diff
@@ -93,29 +81,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")

-    def a_run_evaluation(
-        self,
-        examples: List[Example],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
-    ) -> List[ScoringResult]:
-        result = self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
-            async_execution=True,
-        )
-        assert not isinstance(result, (asyncio.Task, SpinnerWrappedTask))
-        return result
-
     def run_trace_evaluation(
         self,
         scorers: List[Union[APIScorerConfig, BaseScorer]],
```
```diff
@@ -147,11 +112,12 @@ class JudgmentClient(metaclass=SingletonMeta):
                 scorers=scorers,
                 model=model,
                 append=append,
-                judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
                 tools=tools,
             )
-            return run_trace_eval(
+            return run_trace_eval(
+                trace_run, self.judgment_api_key, override, function, tracer, examples
+            )
         except ValueError as e:
             raise ValueError(
                 f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
@@ -168,8 +134,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        async_execution: bool = False,
-    ) -> Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]:
+    ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -181,7 +146,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
             append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to execute the evaluation asynchronously

         Returns:
             List[ScoringResult]: The results of the evaluation
@@ -194,18 +158,18 @@ class JudgmentClient(metaclass=SingletonMeta):
         try:
             eval = EvaluationRun(
                 append=append,
+                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
                 scorers=scorers,
                 model=model,
-                judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
             return run_eval(
                 eval,
+                self.judgment_api_key,
                 override,
-                async_execution=async_execution,
             )
         except ValueError as e:
             raise ValueError(
```
```diff
@@ -292,158 +256,21 @@ class JudgmentClient(metaclass=SingletonMeta):
             - id (str): The evaluation run ID
             - results (List[ScoringResult]): List of scoring results
         """
-        eval_run_request_body = EvalRunRequestBody(
-            project_name=project_name,
-            eval_name=eval_run_name,
-            judgment_api_key=self.judgment_api_key,
-        )
-        eval_run = requests.post(
-            JUDGMENT_EVAL_FETCH_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            json=eval_run_request_body.model_dump(),
-            verify=True,
-        )
-        if eval_run.status_code != codes.ok:
-            raise ValueError(f"Error fetching eval results: {eval_run.json()}")
-
-        return eval_run.json()
+        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)

     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
         """
-        response = requests.post(
-            JUDGMENT_PROJECT_CREATE_API_URL,
-            json={
-                "project_name": project_name,
-            },
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-        )
-        if response.status_code != codes.ok:
-            raise ValueError(f"Error creating project: {response.json()}")
-        return response.json()
+        self.api_client.create_project(project_name)
+        return True

     def delete_project(self, project_name: str) -> bool:
         """
         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
         """
-        response = requests.post(
-            JUDGMENT_PROJECT_DELETE_API_URL,
-            json={
-                "project_name": project_name,
-            },
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-        )
-        if response.status_code != codes.ok:
-            raise ValueError(f"Error deleting project: {response.json()}")
-        return response.json()
-
-    def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
-        """
-        Fetches a classifier scorer configuration from the Judgment API.
-
-        Args:
-            slug (str): Slug identifier of the custom scorer to fetch
-
-        Returns:
-            ClassifierScorer: The configured classifier scorer object
-
-        Raises:
-            JudgmentAPIError: If the scorer cannot be fetched or doesn't exist
-        """
-        request_body = {
-            "slug": slug,
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/fetch_scorer/",
-            json=request_body,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(
-                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}"
-            )
-        elif response.status_code != 200:
-            raise JudgmentAPIError(
-                f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}"
-            )
-
-        scorer_config = response.json()
-        scorer_config.pop("created_at")
-        scorer_config.pop("updated_at")
-
-        try:
-            return ClassifierScorer(**scorer_config)
-        except Exception as e:
-            raise JudgmentAPIError(
-                f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}"
-            )
-
-    def push_classifier_scorer(
-        self, scorer: ClassifierScorer, slug: str | None = None
-    ) -> str:
-        """
-        Pushes a classifier scorer configuration to the Judgment API.
-
-        Args:
-            slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
-            scorer (ClassifierScorer): The classifier scorer to save
-
-        Returns:
-            str: The slug identifier of the saved scorer
-
-        Raises:
-            JudgmentAPIError: If there's an error saving the scorer
-        """
-        request_body = {
-            "name": scorer.name,
-            "conversation": scorer.conversation,
-            "options": scorer.options,
-            "slug": slug,
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/save_scorer/",
-            json=request_body,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(
-                f"The server is temporarily unavailable. \
-                Please try your request again in a few moments. \
-                Error details: {response.json().get('detail', '')}"
-            )
-        elif response.status_code != 200:
-            raise JudgmentAPIError(
-                f"Failed to save classifier scorer: {response.json().get('detail', '')}"
-            )
-
-        return response.json()["slug"]
+        self.api_client.delete_project(project_name)
+        return True

     def assert_test(
         self,
```
```diff
@@ -454,7 +281,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = str(uuid4()),
         override: bool = False,
         append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -470,7 +296,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             async_execution (bool): Whether to run the evaluation asynchronously
         """

-        results:
+        results: List[ScoringResult]

         results = self.run_evaluation(
             examples=examples,
@@ -480,19 +306,8 @@ class JudgmentClient(metaclass=SingletonMeta):
             eval_run_name=eval_run_name,
             override=override,
             append=append,
-            async_execution=async_execution,
         )
-
-        if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
-
-            async def run_async():  # Using wrapper here to resolve mypy error with passing Task into asyncio.run
-                return await results
-
-            actual_results = safe_run_async(run_async())
-            assert_test(actual_results)  # Call the synchronous imported function
-        else:
-            # 'results' is already List[ScoringResult] here (synchronous path)
-            assert_test(results)  # Call the synchronous imported function
+        assert_test(results)

     def assert_trace_test(
         self,
```
```diff
@@ -535,7 +350,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                     f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
                 )

-        results:
+        results: List[ScoringResult]

         results = self.run_trace_evaluation(
             examples=examples,
@@ -551,13 +366,4 @@ class JudgmentClient(metaclass=SingletonMeta):
             tools=tools,
         )

-
-
-            async def run_async():  # Using wrapper here to resolve mypy error with passing Task into asyncio.run
-                return await results
-
-            actual_results = safe_run_async(run_async())
-            assert_test(actual_results)  # Call the synchronous imported function
-        else:
-            # 'results' is already List[ScoringResult] here (synchronous path)
-            assert_test(results)  # Call the synchronous imported function
+        assert_test(results)
```