judgeval 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +278 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +102 -47
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +33 -192
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +3 -1
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +23 -21
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -24,7 +24,7 @@ from judgeval.common.logger import judgeval_logger
 if TYPE_CHECKING:
     from judgeval.common.tracer import Tracer
     from judgeval.data.trace_run import TraceRun
-    from judgeval.evaluation_run import EvaluationRun
+    from judgeval.data.evaluation_run import EvaluationRun
     from judgeval.integrations.langgraph import JudgevalCallbackHandler


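Grounded in the hunk above: EvaluationRun now lives under judgeval.data instead of the package root. A minimal migration sketch for downstream code (the try/except fallback is illustrative, not part of the SDK):

# Sketch only: prefer the new 0.6.0 location, fall back on older installs.
try:
    from judgeval.data.evaluation_run import EvaluationRun  # judgeval >= 0.6.0
except ImportError:
    from judgeval.evaluation_run import EvaluationRun  # judgeval <= 0.5.0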
@@ -140,80 +140,6 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -285,29 +211,10 @@ def check_examples(
 def run_trace_eval(
     trace_run: TraceRun,
     judgment_api_key: str,
-    override: bool = False,
     function: Optional[Callable] = None,
     tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
     if function and tracer and examples is not None:
         new_traces: List[Trace] = []

@@ -376,43 +283,8 @@ def run_trace_eval(
     return scoring_results


-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -443,14 +315,16 @@ def _poll_evaluation_until_complete(
        poll_count += 1
        try:
            # Check status
-            status_response = api_client.get_evaluation_status(
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

            if status_response.get("status") != "completed":
                time.sleep(poll_interval_seconds)
                continue

            results_response = api_client.fetch_evaluation_results(
-
+                experiment_run_id, project_name
            )
            url = results_response.get("ui_results_url")

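The polling helper is now keyed by an experiment run ID rather than an eval name. A simplified, standalone restatement of that flow; the api_client argument stands in for judgeval's JudgmentApiClient, and only the two method calls and the "ui_results_url" key are taken from the hunks above:

import time

def wait_for_results(api_client, experiment_run_id: str, project_name: str,
                     poll_interval_seconds: float = 5.0, max_polls: int = 120):
    # Illustrative sketch, not the SDK implementation.
    for _ in range(max_polls):
        status = api_client.get_evaluation_status(experiment_run_id, project_name)
        if status.get("status") == "completed":
            results = api_client.fetch_evaluation_results(experiment_run_id, project_name)
            return results, results.get("ui_results_url")
        time.sleep(poll_interval_seconds)
    raise TimeoutError("evaluation did not complete within the polling budget")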
@@ -513,14 +387,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    override: bool = False,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -534,52 +406,31 @@ def run_eval(
                f"All examples must have the same keys: {current_keys} != {keys}"
            )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-        # Ensure that current experiment (if one exists) has the same keys for example
-        check_example_keys(
-            keys=list(keys),
-            eval_name=evaluation_run.eval_name,
-            project_name=evaluation_run.project_name,
-            judgment_api_key=judgment_api_key,
-            organization_id=evaluation_run.organization_id,
-        )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
    results: List[ScoringResult] = []
    url = ""

-    if
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
        error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
        judgeval_logger.error(error_msg)
        raise ValueError(error_msg)

-
-
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
        stop_event = threading.Event()
        t = threading.Thread(
            target=progress_logger, args=(stop_event, "Running evaluation...")
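run_eval now reads judgment_scorers and custom_scorers directly off the EvaluationRun instead of partitioning a single scorers list, and it rejects unsupported combinations up front. A standalone restatement of those rules (illustrative only; the real checks operate on judgeval's EvaluationRun as shown above):

from typing import Sequence

def validate_scorer_mix(judgment_scorers: Sequence, custom_scorers: Sequence) -> None:
    # Mirrors the 0.6.0 pre-flight checks in run_eval; "hosted" scorers are the
    # custom scorers whose server_hosted flag is set (e2b-backed).
    hosted = [cs for cs in custom_scorers if getattr(cs, "server_hosted", False)]
    if judgment_scorers and custom_scorers:
        raise ValueError("Run either local/custom scorers or Judgment API scorers, not both")
    if judgment_scorers and hosted:
        raise ValueError("Hosted custom scorers cannot be combined with Judgment API scorers")
    if len(hosted) > 1:
        raise ValueError("Only one hosted custom scorer is supported per run")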
@@ -600,36 +451,26 @@ def run_eval(
            )
            raise JudgmentAPIError(error_message)

-
-
-
-
-
-            )
-            old_scorer_data_count = retrieve_counts(results_response)
-        except Exception:
-            # This usually means the user did append = True but the eval run name doesn't exist yet
-            pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
            results, url = _poll_evaluation_until_complete(
-
+                experiment_run_id=evaluation_run.id,
                project_name=evaluation_run.project_name,
                judgment_api_key=judgment_api_key,
                organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
            )
        finally:
            stop_event.set()
            t.join()
-
-    if len(local_scorers) > 0:
+    else:
        results = safe_run_async(
            a_execute_scoring(
                evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                model=evaluation_run.model,
                throttle_value=0,
                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
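With the append-mode bookkeeping gone, the polling target is simply scorer count times example count. A worked restatement of the num_scorers arithmetic above (standalone and illustrative):

def expected_scorer_data_count(judgment_scorers, custom_scorers, examples) -> int:
    # Mirrors how run_eval sizes the expected result set in 0.6.0.
    if judgment_scorers:
        num_scorers = len(judgment_scorers)
    else:
        num_scorers = sum(1 for cs in custom_scorers if getattr(cs, "server_hosted", False))
    return num_scorers * len(examples)

# e.g. 3 Judgment API scorers over 20 examples -> wait for 60 scorer_data entries
assert expected_scorer_data_count(["s1", "s2", "s3"], [], list(range(20))) == 60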
judgeval/scorers/base_scorer.py
CHANGED
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data:
-        if data.
-            data
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
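BaseScorer now tracks a class_name alongside name, carries a server_hosted flag, and runs both validators in pydantic's "after" mode. A standalone pydantic sketch of that validator behavior (it mirrors the hunk above but is not the SDK class; field defaults such as threshold=0.5 are assumptions):

from typing import Optional
from pydantic import BaseModel, model_validator

class ScorerSketch(BaseModel):
    name: Optional[str] = None
    class_name: Optional[str] = None
    threshold: float = 0.5          # assumed default for this sketch
    strict_mode: bool = False
    server_hosted: bool = False     # new in 0.6.0: marks a hosted (e2b) custom scorer

    @model_validator(mode="after")
    def enforce_strict_threshold(self):
        if self.strict_mode:
            self.threshold = 1.0    # strict mode pins the passing threshold
        return self

    @model_validator(mode="after")
    def default_name(self):
        self.class_name = self.__class__.__name__
        if not self.name:
            self.name = self.class_name
        return self

class MyScorer(ScorerSketch):
    pass

s = MyScorer(strict_mode=True)
assert (s.name, s.class_name, s.threshold) == ("MyScorer", "MyScorer", 1.0)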
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
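Prompt scorers now carry an explicit threshold through the save, load, and update paths shown above. A hedged usage sketch of the module-level push_prompt_scorer helper (requires JUDGMENT_API_KEY and JUDGMENT_ORG_ID in the environment; the scorer name, prompt, and option values are made up for illustration):

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

# threshold is now a required argument when pushing a prompt scorer.
result = push_prompt_scorer(
    name="helpfulness-check",  # hypothetical scorer name
    prompt="Rate how helpful the response is to the user's question.",
    threshold=0.7,             # pass/fail cutoff stored with the scorer
    options={"helpful": 1.0, "unhelpful": 0.0},
)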
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.5.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +24,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
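The new click and typer requirements line up with the added judgeval/cli.py and entry_points.txt, i.e. the wheel now installs a console script whose name is not visible in this diff. A small standard-library sketch for discovering whatever entry point an installed judgeval 0.6.0 registers:

from importlib.metadata import entry_points

# List console scripts whose implementation lives in the judgeval package.
for ep in entry_points(group="console_scripts"):
    if ep.module.split(".")[0] == "judgeval":
        print(f"{ep.name} -> {ep.value}")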
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/RECORD
CHANGED
@@ -1,43 +1,44 @@
 judgeval/__init__.py,sha256=5Lm1JMYFREJGN_8X-Wpruu_ovwGLJ08gCzNAt-u-pQE,419
+judgeval/cli.py,sha256=IcL4_bGr9CtEeea1-AFqM_TEV_VomDlArlxh4IomiSQ,1754
 judgeval/clients.py,sha256=HHul68PV1om0dxsVZZu90TtCiy5zaqAwph16jXTQzQo,989
 judgeval/constants.py,sha256=UNoTLHgbpZHRInPM2ZaI3m0XokPkee5ILlg20reqhzo,4180
 judgeval/dataset.py,sha256=vOrDKam2I-K1WcVF5IBkQruCDvXTc8PRaFm4-dV0lXs,6220
-judgeval/
-judgeval/
-judgeval/local_eval_queue.py,sha256=fAI0_OlvCr-WOCQWw18C4JIRJHKYzlyGzsGUm8LcsYE,7076
+judgeval/judgment_client.py,sha256=-7xcBFowzXKedMINwfZCOL4FKucECWPNEY9QVMo_cys,13644
+judgeval/local_eval_queue.py,sha256=GmlXeZt7bfAJe1hPUjDg_irth4RkNqL2Zdi7VzboBzI,6984
 judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=gs-_v_i95LKlJj95G2RmQXvIyBfoldnd1pWCNO4UqsM,21985
 judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
 judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
 judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
 judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
 judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
 judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
-judgeval/common/api/api.py,sha256=
-judgeval/common/api/constants.py,sha256=
-judgeval/common/api/json_encoder.py,sha256=
+judgeval/common/api/api.py,sha256=fWtMNln0o1wOhJ9wangWpyY_j3WF7P3at_LYPJEicP0,13670
+judgeval/common/api/constants.py,sha256=y0BDcQqHBZ7MwLd4gT5hLUF8UMs_GVwsJGC-ibfxCAw,4698
+judgeval/common/api/json_encoder.py,sha256=QQgCe2FBmW1uWKx8yvuhr4U7_b4D0sG97GZtXHKnBdk,5881
 judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
 judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
 judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
 judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
-judgeval/common/tracer/core.py,sha256=
+judgeval/common/tracer/core.py,sha256=TQ80NODaJx7gzmntevDLA3evVJ3m2Zy2s0Pwd7APG9Y,84867
 judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
-judgeval/common/tracer/otel_span_processor.py,sha256=
+judgeval/common/tracer/otel_span_processor.py,sha256=BD-FKXaZft5_3zqy1Qe_tpkudVOLop9AGhBjZUgp-Z8,6502
 judgeval/common/tracer/providers.py,sha256=3c3YOtKuoBjlTL0rc2HAGnUpppqvsyzrN5H6EKCqEi0,2733
-judgeval/common/tracer/span_processor.py,sha256=
-judgeval/common/tracer/span_transformer.py,sha256=
+judgeval/common/tracer/span_processor.py,sha256=1NQxNSVWcb8qCFLmslSVMnaWdkOZmiFJnxeeN0i6vnU,1150
+judgeval/common/tracer/span_transformer.py,sha256=cfzz6RpTCOG9Io9knNlwtAW34p3wyK-u8jSNMu24p1w,7382
 judgeval/common/tracer/trace_manager.py,sha256=ltiXcWC-68DRc8uSa28qHiWRSIBf6NpYOPkZYooR8tg,3086
 judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
+judgeval/data/evaluation_run.py,sha256=IirmYZ1_9N99eep7DDuoyshwjmpNK9bQCxCWXnnhhuI,4053
 judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
-judgeval/data/judgment_types.py,sha256=
+judgeval/data/judgment_types.py,sha256=3nGCUZ1YJhXajhFlAQvax0SOJ8eLuORtquwwjMreJFw,9826
 judgeval/data/result.py,sha256=OtSnBUrdQpjyAqxXRLTW3wC9v9lOm_GqzL14ccRQxrg,2124
 judgeval/data/scorer_data.py,sha256=5QBHtvOIWOq0Rn9_uPJzAMRYMlWxMB-rXnG_6kV4Z4Y,2955
 judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
-judgeval/data/trace.py,sha256=
-judgeval/data/trace_run.py,sha256=
+judgeval/data/trace.py,sha256=S781vVU1BvQ_kTS3s7UGYdmYVVxVGjDzWJHZpHedyf0,2834
+judgeval/data/trace_run.py,sha256=Oo1vDrJYX_itt4tt7PJf7fNKd0HE3fnBJxuIkRY8Wrg,1585
 judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
 judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/integrations/langgraph.py,sha256=
+judgeval/integrations/langgraph.py,sha256=XsTNpKvXZmSf4TJBtRKSd5AB7S-Td9GTG5wZW9Npj6k,30062
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
 judgeval/judges/litellm_judge.py,sha256=K9yCGOmozt7sYO0u8CHWyZNi8mXnSR3pPkP8yVsvuRc,2561
@@ -47,7 +48,7 @@ judgeval/judges/utils.py,sha256=_t6oYN9q63wyP7D4jI8X0bNmvVw7OfaE7uMTYDVS14E,2782
 judgeval/scorers/__init__.py,sha256=4H_cinTQ4EogZv59YEV-3U9EOTLppNwgAPTi1-jI9Fw,746
 judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
 judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIHw,2523
-judgeval/scorers/base_scorer.py,sha256=
+judgeval/scorers/base_scorer.py,sha256=hKrLLh2DaxTgAfze8p_IapvsrogRCevYgfaNCDeOJzc,2869
 judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
 judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
 judgeval/scorers/score.py,sha256=SWyoqOOvyLpLy39tLyb_Q94sdh9r_IuDv6YNREw52lg,7546
@@ -61,7 +62,7 @@ judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=NABO_iBd
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
 judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=SnFLvU4FGsMeUVUp0SGHSy_6wgfwr_vHPGnZx5YJl_Q,691
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=1FsUGjQu3oa2rF-oqt32j-yA2YM33_trGTJ0HgagFJ0,7793
 judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMNyOax9UkvoRdSyUYdO2Os1-Nko43y89m2Luo,594
 judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
 judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
@@ -69,7 +70,8 @@ judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
 judgeval/utils/async_utils.py,sha256=uNx1SopEc0quSjc8GBQqyba0SmCMAzv2NKIq6xYwttc,989
 judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
 judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
+judgeval-0.6.0.dist-info/METADATA,sha256=CulXMs0v5YrHjR3ntVX8xWKcZyxwEpo_nOYs_hkaeN8,10403
+judgeval-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.6.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.6.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.6.0.dist-info/RECORD,,
judgeval/evaluation_run.py
DELETED
@@ -1,80 +0,0 @@
-from typing import List, Optional, Union
-from pydantic import BaseModel, field_validator, Field
-
-from judgeval.data import Example
-from judgeval.scorers import BaseScorer, APIScorerConfig
-from judgeval.constants import ACCEPTABLE_MODELS, DEFAULT_GPT_MODEL
-
-
-class EvaluationRun(BaseModel):
-    """
-    Stores example and evaluation scorers together for running an eval task
-
-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-    """
-
-    organization_id: Optional[str] = None
-    project_name: Optional[str] = Field(default=None, validate_default=True)
-    eval_name: Optional[str] = Field(default=None, validate_default=True)
-    examples: List[Example]
-    scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] = DEFAULT_GPT_MODEL
-    trace_span_id: Optional[str] = None
-    trace_id: Optional[str] = None
-    # API Key will be "" until user calls client.run_eval(), then API Key will be set
-    override: Optional[bool] = False
-    append: Optional[bool] = False
-
-    def model_dump(self, **kwargs):
-        data = super().model_dump(**kwargs)
-
-        data["scorers"] = [
-            scorer.model_dump() for scorer in self.scorers
-        ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
-        data["examples"] = [example.model_dump() for example in self.examples]
-
-        return data
-
-    @field_validator("examples")
-    def validate_examples(cls, v):
-        if not v:
-            raise ValueError("Examples cannot be empty.")
-        for item in v:
-            if not isinstance(item, Example):
-                raise ValueError(f"Item of type {type(item)} is not a Example")
-        return v
-
-    @field_validator("scorers", mode="before")
-    def validate_scorers(cls, v):
-        if not v:
-            raise ValueError("Scorers cannot be empty.")
-        if not all(
-            isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
-            for scorer in v
-        ):
-            raise ValueError(
-                "All scorers must be of type BaseScorer or APIScorerConfig."
-            )
-        return v
-
-    @field_validator("model")
-    def validate_model(cls, v, values):
-        if not v:
-            raise ValueError("Model cannot be empty.")
-
-        # Check if model is string or list of strings
-        if isinstance(v, str):
-            if v not in ACCEPTABLE_MODELS:
-                raise ValueError(
-                    f"Model name {v} not recognized. Please select a valid model name.)"
-                )
-        return v
-
-    class Config:
-        arbitrary_types_allowed = True
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL
File without changes
{judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md
File without changes