judgeval 0.0.34__py3-none-any.whl → 0.0.36__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- judgeval/common/tracer.py +352 -118
- judgeval/constants.py +3 -2
- judgeval/data/datasets/dataset.py +3 -0
- judgeval/data/datasets/eval_dataset_client.py +63 -3
- judgeval/integrations/langgraph.py +1961 -299
- judgeval/judgment_client.py +8 -2
- judgeval/run_evaluation.py +67 -18
- judgeval/scorers/score.py +1 -0
- {judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/METADATA +1 -1
- {judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/RECORD +12 -12
- {judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/WHEEL +0 -0
- {judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -232,11 +232,17 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
-    def
+    def append_example_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Appends an `EvalDataset` to the Judgment platform for storage.
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.append_examples(alias, examples, project_name)
+
+    def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+        """
+        Appends a `Sequence` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
 
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
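For orientation, here is a minimal usage sketch of the two new client methods. Only the method names and signatures come from the diff above; the `Example` fields, alias, and project name are illustrative assumptions.

# Hedged sketch: appending to an existing dataset with the new 0.0.36 methods.
from judgeval import JudgmentClient
from judgeval.data import Example

client = JudgmentClient()

# Append further examples to a dataset pushed earlier;
# append_sequence_dataset works the same way for Sequence objects.
client.append_example_dataset(
    alias="qa-regression",        # assumed dataset alias
    examples=[Example(input="What is 2+2?", actual_output="4")],
    project_name="demo-project",  # assumed project name
)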
judgeval/run_evaluation.py
CHANGED
@@ -198,6 +198,40 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
         )
     return results
 
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
+    """
+    Checks if the current experiment, if one exists, has the same type (examples or sequences)
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/check_experiment_type/",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}",
+                "X-Organization-Id": organization_id
+            },
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+                "is_sequence": is_sequence
+            },
+            verify=True
+        )
+
+        if response.status_code == 422:
+            error(f"{response.json()}")
+            raise ValueError(f"{response.json()}")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if experiment type exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
 
 def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
     """
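Read as a plain function call, the new guard behaves as below. The endpoint path, payload keys, and error handling come from the diff; the argument values are placeholders.

# Hedged sketch: calling the new guard directly.
check_experiment_type(
    eval_name="nightly-eval",     # assumed run name
    project_name="demo-project",  # assumed project
    judgment_api_key="sk-...",    # placeholder credential
    organization_id="org-123",    # placeholder org
    is_sequence=False,            # this run logs plain examples
)
# Raises ValueError on a 422 (the existing experiment holds the other type)
# and JudgmentAPIError on any other non-OK response or network failure.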
@@ -264,7 +298,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
                 "X-Organization-Id": run.organization_id
             },
             json={
-                "results":
+                "results": merged_results,
                 "run": run.model_dump(warnings=False)
             },
             verify=True
@@ -318,21 +352,18 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
 
     return result
 
-def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]) -> None:
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
     for scorer in scorers:
-
-
-
-
-
-
-
-        # We do this because we want to inform users that an example is missing parameters for a scorer
-        # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
-        print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
+        for example in examples:
+            missing_params = []
+            for param in scorer.required_params:
+                if getattr(example, param.value) is None:
+                    missing_params.append(f"'{param.value}'")
+            if missing_params:
+                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
 def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
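In practice the rewritten check warns instead of failing. A hedged sketch, assuming a scorer whose required_params include actual_output (the scorer class and field names are illustrative, not confirmed by this diff):

# Hedged sketch: an Example missing a required field only prints a warning.
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed import path

example = Example(input="What is the capital of France?")  # no actual_output
check_examples([example], [FaithfulnessScorer(threshold=0.5)])
# -> WARNING: Example <uuid> is missing the following parameters:
#    ["'actual_output'"] for scorer faithfulness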
@@ -344,6 +375,17 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
         sequence_run.organization_id
     )
 
+    if sequence_run.append:
+        # Check that the current experiment, if one exists, has the same type (examples or sequences)
+        check_experiment_type(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id,
+            True
+        )
+
+
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
@@ -359,13 +401,9 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
 
     # Convert the response data to `ScoringResult` objects
     debug("Processing API results")
-    api_results = []
-    for result in response_data["results"]:
-        api_results.append(ScoringResult(**result))
-
     # TODO: allow for custom scorer on sequences
     if sequence_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results,
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
         rprint(pretty_str)
 
 
@@ -404,6 +442,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         evaluation_run.organization_id
     )
 
+    if evaluation_run.append:
+        # Check that the current experiment, if one exists, has the same type (examples or sequences)
+        check_experiment_type(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key,
+            evaluation_run.organization_id,
+            False
+        )
+
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
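A hedged sketch of how this guard is reached from user code: the append flag on the run object is what the diff reads, while the client entry point and its keyword names here are assumptions.

# Hedged sketch: appending a second batch of results to an existing run.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed import path

client = JudgmentClient()
client.run_evaluation(
    examples=[Example(input="Hi", actual_output="Hello")],
    scorers=[FaithfulnessScorer(threshold=0.5)],
    model="gpt-4o",               # judge model; assumed parameter
    eval_run_name="nightly-eval", # an existing experiment to append to
    project_name="demo-project",
    append=True,  # run_eval then calls check_experiment_type(..., False)
)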
@@ -539,7 +587,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # )
     # print(merged_results)
     if evaluation_run.log_results:
-
+        send_results = [result.model_dump(warnings=False) for result in merged_results]
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
         rprint(pretty_str)
 
     for i, result in enumerate(merged_results):
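The new send_results line exists because requests' json= parameter only accepts JSON-encodable values, while merged_results holds pydantic models. A self-contained sketch of the pattern with a made-up model:

# Hedged sketch: dumping pydantic models to plain dicts before POSTing.
from pydantic import BaseModel

class MiniResult(BaseModel):  # stand-in, not the real ScoringResult
    name: str
    success: bool

results = [MiniResult(name="faithfulness", success=True)]
payload = [r.model_dump(warnings=False) for r in results]
# payload == [{'name': 'faithfulness', 'success': True}], safe for json=...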
judgeval/scorers/score.py
CHANGED
@@ -271,6 +271,7 @@ async def a_execute_scoring(
     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
     """
+
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
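The only change in this hunk is a blank line, but the surrounding context shows the concurrency pattern a_execute_scoring relies on: a semaphore caps in-flight scoring coroutines at max_concurrent. A self-contained sketch of that pattern (all names here are illustrative):

# Hedged sketch: bounding concurrent async work with a semaphore.
import asyncio

async def score_one(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for a single scorer call
    return i

async def score_all(n: int, max_concurrent: int = 10) -> list:
    semaphore = asyncio.Semaphore(max_concurrent)

    async def with_semaphore(i: int) -> int:
        async with semaphore:  # at most max_concurrent tasks run at once
            return await score_one(i)

    return await asyncio.gather(*(with_semaphore(i) for i in range(n)))

# asyncio.run(score_all(100)) gathers all results, 10 at a time.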
{judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
 judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=Gc1xpft2BkFRUIjj-puCzILsG1EUOEs8V-bUWP9b1WM,5508
 judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
-judgeval/judgment_client.py,sha256=
+judgeval/judgment_client.py,sha256=slYLE80FqEIsqgShMtML4I64p-RrEfELbMgZnlXhxP0,22515
 judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=1G-KYNHowfMKTD5j3cDd4EuEme00AqZkn6wpP3zMKUo,30241
 judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
 judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=bkN0Jol0mNosJeEJMtjM54jJDhEYL3OSBtkS4FB1m8E,105461
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
 judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
@@ -20,9 +20,9 @@ judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,
 judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
 judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
-judgeval/integrations/langgraph.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=RmZ28oyDPfRsCx4k5ftMscoq0M0LN78MW6ofTiM81BI,13134
+judgeval/data/datasets/eval_dataset_client.py,sha256=uirHpkpLOfygXIz0xKAGTPx1qjbBTzdLFQK6yyoZduU,17544
+judgeval/integrations/langgraph.py,sha256=7LpWDpb8wgOkeRJvlr2COvF_O1f01zm-cwsI5trKoiw,123150
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -34,7 +34,7 @@ judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=fZuaZPumqkLCWcZdpTn3bJeHPNHXaDqgyb0WBp2EYgE,18742
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
@@ -57,7 +57,7 @@ judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMv
 judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
 judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval-0.0.36.dist-info/METADATA,sha256=oexg66X9idECkevPAF2VkuQJBt-hYHvKmsZz5p5Y-LI,6097
+judgeval-0.0.36.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.36.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.36.dist-info/RECORD,,
{judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/WHEEL
File without changes
{judgeval-0.0.34.dist-info → judgeval-0.0.36.dist-info}/licenses/LICENSE.md
File without changes