judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -31,7 +31,7 @@ from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     LITELLM_SUPPORTED_MODELS,
 )
-from judgeval.common.logger import
+from judgeval.common.logger import judgeval_logger
 
 
 class CustomModelParameters(pydantic.BaseModel):
@@ -40,18 +40,21 @@ class CustomModelParameters(pydantic.BaseModel):
     litellm_base_url: str
 
     @pydantic.field_validator("model_name")
+    @classmethod
     def validate_model_name(cls, v):
         if not v:
             raise ValueError("Model name cannot be empty")
         return v
 
     @pydantic.field_validator("secret_key")
+    @classmethod
     def validate_secret_key(cls, v):
         if not v:
             raise ValueError("Secret key cannot be empty")
         return v
 
     @pydantic.field_validator("litellm_base_url")
+    @classmethod
     def validate_litellm_base_url(cls, v):
         if not v:
             raise ValueError("Litellm base URL cannot be empty")
@@ -64,6 +67,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
     response_format: Optional[Union[pydantic.BaseModel, Dict[str, Any]]] = None
 
     @pydantic.field_validator("messages")
+    @classmethod
     def validate_messages(cls, messages):
         if not messages:
             raise ValueError("Messages cannot be empty")
@@ -83,6 +87,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
         return messages
 
     @pydantic.field_validator("model")
+    @classmethod
     def validate_model(cls, model):
         if not model:
             raise ValueError("Model cannot be empty")
@@ -91,6 +96,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
         return model
 
     @pydantic.field_validator("response_format", mode="before")
+    @classmethod
     def validate_response_format(cls, response_format):
         if response_format is not None:
             if not isinstance(response_format, (dict, pydantic.BaseModel)):
@@ -145,11 +151,7 @@ def fetch_together_api_response(
         model=model, messages=messages, response_format=response_format
     )
 
-    debug(f"Calling Together API with model: {request.model}")
-    debug(f"Messages: {request.messages}")
-
     if request.response_format is not None:
-        debug(f"Using response format: {request.response_format}")
         response = together_client.chat.completions.create(
             model=request.model,
             messages=request.messages,
@@ -161,7 +163,6 @@
             messages=request.messages,
         )
 
-    debug(f"Received response: {response.choices[0].message.content[:100]}...")
     return response.choices[0].message.content
 
 
@@ -175,11 +176,7 @@ async def afetch_together_api_response(
         model=model, messages=messages, response_format=response_format
     )
 
-    debug(f"Calling Together API with model: {request.model}")
-    debug(f"Messages: {request.messages}")
-
     if request.response_format is not None:
-        debug(f"Using response format: {request.response_format}")
         response = await async_together_client.chat.completions.create(
             model=request.model,
             messages=request.messages,
@@ -251,7 +248,7 @@ def query_together_api_multiple_calls(
         try:
             out[idx] = future.result()
         except Exception as e:
-            error(f"Error in parallel call {idx}: {str(e)}")
+            judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
            out[idx] = None
     return out
 
@@ -294,17 +291,15 @@ async def aquery_together_api_multiple_calls(
     # Validate message format
     validate_batched_chat_messages(messages)
 
-    debug(f"Starting parallel Together API calls for {len(messages)} messages")
     out: List[Union[str, None]] = [None] * len(messages)
 
     async def fetch_and_store(idx, model, message, response_format):
         try:
-            debug(f"Processing call {idx} with model {model}")
             out[idx] = await afetch_together_api_response(
                 model, message, response_format
             )
         except Exception as e:
-            error(f"Error in parallel call {idx}: {str(e)}")
+            judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
             out[idx] = None
 
     tasks = [
@@ -315,7 +310,6 @@ async def aquery_together_api_multiple_calls(
     ]
 
     await asyncio.gather(*tasks)
-    debug(f"Completed {len(messages)} parallel calls")
     return out
 
 
@@ -329,11 +323,7 @@ def fetch_litellm_api_response(
         model=model, messages=messages, response_format=response_format
     )
 
-    debug(f"Calling LiteLLM API with model: {request.model}")
-    debug(f"Messages: {request.messages}")
-
     if request.response_format is not None:
-        debug(f"Using response format: {request.response_format}")
         response = litellm.completion(
             model=request.model,
             messages=request.messages,
@@ -483,7 +473,7 @@ def query_litellm_api_multiple_calls(
         try:
             out[idx] = future.result()
         except Exception as e:
-            error(f"Error in parallel call {idx}: {str(e)}")
+            judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
             out[idx] = None
     return out
 
@@ -513,7 +503,7 @@ async def aquery_litellm_api_multiple_calls(
                 model, message, response_format
             )
         except Exception as e:
-            error(f"Error in parallel call {idx}: {str(e)}")
+            judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
             out[idx] = None
 
     tasks = [
@@ -681,7 +671,6 @@ async def aget_chat_completion(
     Raises:
         - ValueError: If requested model is not supported by Litellm or TogetherAI.
     """
-    debug(f"Starting chat completion for model {model_type}, batched={batched}")
 
     if batched:
         validate_batched_chat_messages(messages)
@@ -693,7 +682,6 @@ async def aget_chat_completion(
         and is_batched_messages(messages)
         and model_type in TOGETHER_SUPPORTED_MODELS
     ):
-        debug("Using batched Together API call")
         return await aquery_together_api_multiple_calls(
             models=[model_type] * len(messages),
             messages=messages,
@@ -704,7 +692,6 @@ async def aget_chat_completion(
         and is_batched_messages(messages)
         and model_type in LITELLM_SUPPORTED_MODELS
     ):
-        debug("Using batched LiteLLM API call")
         return await aquery_litellm_api_multiple_calls(
             models=[model_type] * len(messages),
             messages=messages,
@@ -715,7 +702,6 @@ async def aget_chat_completion(
         and is_simple_messages(messages)
         and model_type in TOGETHER_SUPPORTED_MODELS
     ):
-        debug("Using single Together API call")
         return await afetch_together_api_response(
             model=model_type, messages=messages, response_format=response_format
         )
@@ -724,12 +710,11 @@ async def aget_chat_completion(
         and is_simple_messages(messages)
         and model_type in LITELLM_SUPPORTED_MODELS
     ):
-        debug("Using single LiteLLM API call")
         return await afetch_litellm_api_response(
             model=model_type, messages=messages, response_format=response_format
         )
 
-    error(f"Model {model_type} not supported by either API")
+    judgeval_logger.error(f"Model {model_type} not supported by either API")
     raise ValueError(
         f"Model {model_type} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
     )
@@ -753,7 +738,6 @@ def get_completion_multiple_models(
     Raises:
         ValueError: If a model is not supported by Litellm or Together
     """
-    debug(f"Starting multiple model completion for {len(models)} models")
 
     if models is None or models == []:
         raise ValueError("Models list cannot be empty")
@@ -761,7 +745,9 @@ def get_completion_multiple_models(
     validate_batched_chat_messages(messages)
 
     if len(models) != len(messages):
-        error(
+        judgeval_logger.error(
+            f"Model/message count mismatch: {len(models)} vs {len(messages)}"
+        )
         raise ValueError(
             f"Number of models and messages must be the same: {len(models)} != {len(messages)}"
         )
@@ -774,13 +760,11 @@ def get_completion_multiple_models(
         zip(models, messages, response_formats)
     ):
         if model in TOGETHER_SUPPORTED_MODELS:
-            debug(f"Model {model} routed to Together API")
             together_calls[idx] = (model, message, r_format)
         elif model in LITELLM_SUPPORTED_MODELS:
-            debug(f"Model {model} routed to LiteLLM API")
             litellm_calls[idx] = (model, message, r_format)
         else:
-            error(f"Model {model} not supported by either API")
+            judgeval_logger.error(f"Model {model} not supported by either API")
             raise ValueError(
                 f"Model {model} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
             )
@@ -792,7 +776,6 @@ def get_completion_multiple_models(
     # Get the responses from the TogetherAI models
     # List of responses from the TogetherAI models in order of the together_calls dict
     if together_calls:
-        debug(f"Executing {len(together_calls)} Together API calls")
         together_responses = query_together_api_multiple_calls(
             models=[model for model, _, _ in together_calls.values()],
             messages=[message for _, message, _ in together_calls.values()],
@@ -801,7 +784,6 @@ def get_completion_multiple_models(
 
     # Get the responses from the Litellm models
     if litellm_calls:
-        debug(f"Executing {len(litellm_calls)} LiteLLM API calls")
         litellm_responses = query_litellm_api_multiple_calls(
             models=[model for model, _, _ in litellm_calls.values()],
             messages=[message for _, message, _ in litellm_calls.values()],
@@ -809,13 +791,11 @@ def get_completion_multiple_models(
     )
 
     # Merge the responses in the order of the original models
-    debug("Merging responses")
     out: List[Union[str, None]] = [None] * len(models)
     for idx, (model, message, r_format) in together_calls.items():
         out[idx] = together_responses.pop(0)
     for idx, (model, message, r_format) in litellm_calls.items():
         out[idx] = litellm_responses.pop(0)
-    debug("Multiple model completion finished")
     return out
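Two mechanical changes repeat throughout this file: each `@pydantic.field_validator` gains an explicit `@classmethod`, and the loose `debug(...)`/`error(...)` helpers are dropped or routed through a single `judgeval_logger` object. Below is a minimal sketch of the resulting validator shape, using a trimmed-down hypothetical model (not the package's full `CustomModelParameters`) and a plain `logging` logger standing in for `judgeval.common.logger.judgeval_logger`:

import logging

import pydantic

judgeval_logger = logging.getLogger("judgeval")  # stand-in for the shared logger object


class ModelParametersSketch(pydantic.BaseModel):
    model_name: str

    @pydantic.field_validator("model_name")
    @classmethod  # explicit classmethod, as added in 0.0.54
    def validate_model_name(cls, v: str) -> str:
        if not v:
            judgeval_logger.error("Model name cannot be empty")
            raise ValueError("Model name cannot be empty")
        return v


ModelParametersSketch(model_name="my-custom-model")  # passes validation

Stacking `@classmethod` directly beneath `field_validator` is the ordering Pydantic v2's documentation shows; it makes the validator's binding explicit for readers and type checkers without changing runtime behavior.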
judgeval/constants.py
CHANGED
@@ -7,7 +7,7 @@ import litellm
 import os
 
 
-class APIScorer(str, Enum):
+class APIScorerType(str, Enum):
     """
     Collection of proprietary scorers implemented by Judgment.
 
@@ -15,23 +15,17 @@ class APIScorer(str, Enum):
     Examples via the Judgment API.
     """
 
-
-
-
-
-
-
-
-
-
-
-
-    COMPARISON = "comparison"
-    GROUNDEDNESS = "groundedness"
-    DERAILMENT = "derailment"
-    TOOL_ORDER = "tool_order"
-    CLASSIFIER = "classifier"
-    TOOL_DEPENDENCY = "tool_dependency"
+    PROMPT_SCORER = "Prompt Scorer"
+    FAITHFULNESS = "Faithfulness"
+    ANSWER_RELEVANCY = "Answer Relevancy"
+    ANSWER_CORRECTNESS = "Answer Correctness"
+    INSTRUCTION_ADHERENCE = "Instruction Adherence"
+    EXECUTION_ORDER = "Execution Order"
+    DERAILMENT = "Derailment"
+    TOOL_ORDER = "Tool Order"
+    CLASSIFIER = "Classifier"
+    TOOL_DEPENDENCY = "Tool Dependency"
+    CUSTOM = "Custom"
 
     @classmethod
     def _missing_(cls, value):
@@ -41,8 +35,8 @@ class APIScorer(str, Enum):
         return member
 
 
-UNBOUNDED_SCORERS =
-
+UNBOUNDED_SCORERS: set[APIScorerType] = (
+    set()
 ) # scorers whose scores are not bounded between 0-1
 
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
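The enum keeps its `str` mixin but is renamed to `APIScorerType` and switches to human-readable values; its `_missing_` hook (only partially visible in this diff) lets lookups recover when the incoming string is not an exact value match. A sketch of how such a hook typically behaves, assuming case-insensitive matching is the intended fallback:

from enum import Enum


class APIScorerTypeSketch(str, Enum):
    # Illustrative subset of the 0.0.54 members
    FAITHFULNESS = "Faithfulness"
    ANSWER_RELEVANCY = "Answer Relevancy"
    CUSTOM = "Custom"

    @classmethod
    def _missing_(cls, value):
        # Assumed behavior: fall back to a case-insensitive comparison
        # against member values before giving up.
        if isinstance(value, str):
            for member in cls:
                if member.value.lower() == value.lower():
                    return member
        return None


assert APIScorerTypeSketch("faithfulness") is APIScorerTypeSketch.FAITHFULNESS
assert APIScorerTypeSketch.CUSTOM.value == "Custom"

Because the values change from lowercase snake_case (e.g. "tool_order") to title-cased labels (e.g. "Tool Order"), any code that compares against the old raw strings will need updating.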
judgeval/data/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from judgeval.data.example import Example, ExampleParams
-from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 from judgeval.data.trace import Trace, TraceSpan, TraceUsage
@@ -8,7 +7,6 @@ from judgeval.data.trace import Trace, TraceSpan, TraceUsage
 __all__ = [
     "Example",
     "ExampleParams",
-    "CustomExample",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
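`CustomExample` is gone from the public surface: judgeval/data/custom_example.py is deleted (see the file list above) and its re-export is removed here. Code pinned to 0.0.54 can only import the remaining names, for example:

# Imports that remain valid against the updated judgeval.data, per the diff above.
from judgeval.data import (
    Example,
    ExampleParams,
    ScorerData,
    create_scorer_data,
    ScoringResult,
    Trace,
)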
judgeval/data/datasets/dataset.py
CHANGED
@@ -8,7 +8,7 @@ from dataclasses import dataclass, field
 from typing import List, Union, Literal, Optional
 
 from judgeval.data import Example, Trace
-from judgeval.common.logger import
+from judgeval.common.logger import judgeval_logger
 from judgeval.utils.file_utils import get_examples_from_yaml
 
 
@@ -29,7 +29,7 @@ class EvalDataset:
         traces: Optional[List[Trace]] = None,
     ):
         if not judgment_api_key:
-
+            judgeval_logger.error("No judgment_api_key provided")
         self.examples = examples or []
         self.traces = traces or []
         self._alias = None
@@ -38,11 +38,10 @@ class EvalDataset:
         self.organization_id = organization_id
 
     def add_from_json(self, file_path: str) -> None:
-        debug(f"Loading dataset from JSON file: {file_path}")
         """
         Adds examples from a JSON file.
 
-        The format of the JSON file is expected to be a dictionary with one key: "examples". 
+        The format of the JSON file is expected to be a dictionary with one key: "examples".
         The value of the key is a list of dictionaries, where each dictionary represents an example.
 
         The JSON file is expected to have the following format:
@@ -82,13 +81,12 @@ class EvalDataset:
                 payload = json.load(file)
                 examples = payload.get("examples", [])
         except FileNotFoundError:
-            error(f"JSON file not found: {file_path}")
+            judgeval_logger.error(f"JSON file not found: {file_path}")
             raise FileNotFoundError(f"The file {file_path} was not found.")
         except json.JSONDecodeError:
-            error(f"Invalid JSON file: {file_path}")
+            judgeval_logger.error(f"Invalid JSON file: {file_path}")
             raise ValueError(f"The file {file_path} is not a valid JSON file.")
 
-        info(f"Added {len(examples)} examples from JSON")
         new_examples = [Example(**e) for e in examples]
         for e in new_examples:
             self.add_example(e)
@@ -189,11 +187,10 @@ class EvalDataset:
             self.add_example(e)
 
     def add_from_yaml(self, file_path: str) -> None:
-        debug(f"Loading dataset from YAML file: {file_path}")
         """
         Adds examples from a YAML file.
 
-        The format of the YAML file is expected to be a dictionary with one key: "examples". 
+        The format of the YAML file is expected to be a dictionary with one key: "examples".
         The value of the key is a list of dictionaries, where each dictionary represents an example.
 
         The YAML file is expected to have the following format:
@@ -220,7 +217,6 @@ class EvalDataset:
         """
         examples = get_examples_from_yaml(file_path)
 
-        info(f"Added {len(examples)} examples from YAML")
         for e in examples:
             self.add_example(e)
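The loader still expects the documented shape: a top-level "examples" key whose value is a list of dictionaries, each one splatted into `Example(**e)`. A small sketch of preparing such a file follows; the `input` and `actual_output` field names inside each example are illustrative assumptions, since the diff elides the full docstring example:

import json
import tempfile

# Hypothetical payload following the documented shape: one "examples" key
# holding a list of dicts, each of which becomes an Example(**e).
payload = {
    "examples": [
        {"input": "What is the capital of France?", "actual_output": "Paris"},
    ]
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(payload, f)
    json_path = f.name

# Assuming an already-constructed dataset object:
# dataset = EvalDataset(judgment_api_key="...", organization_id="...")
# dataset.add_from_json(json_path)
# add_from_yaml(...) accepts the same structure in YAML form.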
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -2,8 +2,7 @@ from typing import Optional, List
 from requests import Response, exceptions
 from judgeval.utils.requests import requests
 from rich.progress import Progress, SpinnerColumn, TextColumn
-
-from judgeval.common.logger import debug, error, warning, info
+from judgeval.common.logger import judgeval_logger
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
@@ -31,9 +30,8 @@ class EvalDatasetClient:
         project_name: str,
         overwrite: Optional[bool] = False,
     ) -> bool:
-        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
-            warning(f"Overwrite enabled for alias '{alias}'")
+            judgeval_logger.warning(f"Overwrite enabled for alias '{alias}'")
         """
         Pushes the dataset to Judgment platform
 
@@ -76,16 +74,19 @@ class EvalDatasetClient:
                     verify=True,
                 )
                 if response.status_code != 200:
-                    error(
+                    judgeval_logger.error(
+                        f"Server error during push: {response.json()}"
+                    )
                     raise Exception(f"Server error during push: {response.json()}")
                 response.raise_for_status()
             except exceptions.HTTPError as err:
                 if response.status_code == 422:
-                    error(
+                    judgeval_logger.error(
+                        f"Validation error during push: {err.response.json()}"
+                    )
                 else:
-                    error(f"HTTP error during push: {err}")
+                    judgeval_logger.error(f"HTTP error during push: {err}")
 
-            info(f"Successfully pushed dataset with alias '{alias}'")
             payload = response.json()
             dataset._alias = payload.get("_alias")
             dataset._id = payload.get("_id")
@@ -98,7 +99,6 @@ class EvalDatasetClient:
     def append_examples(
         self, alias: str, examples: List[Example], project_name: str
     ) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
         """
         Appends the dataset to Judgment platform
 
@@ -139,14 +139,18 @@ class EvalDatasetClient:
                     verify=True,
                 )
                 if response.status_code != 200:
-                    error(
+                    judgeval_logger.error(
+                        f"Server error during append: {response.json()}"
+                    )
                     raise Exception(f"Server error during append: {response.json()}")
                 response.raise_for_status()
             except exceptions.HTTPError as err:
                 if response.status_code == 422:
-                    error(
+                    judgeval_logger.error(
+                        f"Validation error during append: {err.response.json()}"
+                    )
                 else:
-                    error(f"HTTP error during append: {err}")
+                    judgeval_logger.error(f"HTTP error during append: {err}")
 
             progress.update(
                 task_id,
@@ -155,7 +159,6 @@ class EvalDatasetClient:
         return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
-        debug(f"Pulling dataset with alias '{alias}'")
         """
         Pulls the dataset from Judgment platform
 
@@ -163,7 +166,7 @@ class EvalDatasetClient:
         {
             "alias": alias,
             "project_name": project_name
-        }
+        }
         ==>
         {
             "examples": [...],
@@ -198,10 +201,9 @@ class EvalDatasetClient:
                 )
                 response.raise_for_status()
             except exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
+                judgeval_logger.error(f"Error pulling dataset: {str(e)}")
                 raise
 
-            info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
             dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
@@ -239,20 +241,19 @@ class EvalDatasetClient:
                 )
                 response.raise_for_status()
             except exceptions.RequestException as e:
-                error(f"Error deleting dataset: {str(e)}")
+                judgeval_logger.error(f"Error deleting dataset: {str(e)}")
                 raise
 
         return True
 
     def pull_project_dataset_stats(self, project_name: str) -> dict:
-        debug(f"Pulling project datasets stats for project_name: {project_name}'")
         """
-        Pulls the project datasets stats from Judgment platform 
+        Pulls the project datasets stats from Judgment platform
 
         Mock request:
         {
             "project_name": project_name
-        }
+        }
         ==>
         {
             "test_dataset_1": {"examples_count": len(dataset1.examples)},
@@ -286,10 +287,9 @@ class EvalDatasetClient:
                 )
                 response.raise_for_status()
             except exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
+                judgeval_logger.error(f"Error pulling dataset: {str(e)}")
                 raise
 
-            info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
             payload = response.json()
 
             progress.update(
@@ -301,7 +301,6 @@ class EvalDatasetClient:
 
     def export_jsonl(self, alias: str, project_name: str) -> Response:
         """Export dataset in JSONL format from Judgment platform"""
-        debug(f"Exporting dataset with alias '{alias}' as JSONL")
         with Progress(
             SpinnerColumn(style="rgb(106,0,255)"),
             TextColumn("[progress.description]{task.description}"),
@@ -326,15 +325,14 @@ class EvalDatasetClient:
                 response.raise_for_status()
             except exceptions.HTTPError as err:
                 if err.response.status_code == 404:
-                    error(f"Dataset not found: {alias}")
+                    judgeval_logger.error(f"Dataset not found: {alias}")
                 else:
-                    error(f"HTTP error during export: {err}")
+                    judgeval_logger.error(f"HTTP error during export: {err}")
                 raise
             except Exception as e:
-                error(f"Error during export: {str(e)}")
+                judgeval_logger.error(f"Error during export: {str(e)}")
                 raise
 
-            info(f"Successfully exported dataset with alias '{alias}'")
             progress.update(
                 task_id,
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
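The pattern applied throughout this client is uniform: success-path `info(...)` calls are dropped, and failure paths log through `judgeval_logger.error(...)` before the exception propagates. A condensed sketch of that style against a generic endpoint, using a plain `logging` logger and the public `requests` API rather than the client's real constants and progress bars:

import logging

import requests

judgeval_logger = logging.getLogger("judgeval")  # stand-in for judgeval.common.logger


def push_payload(url: str, payload: dict) -> dict:
    """Post a payload, logging failures in the 0.0.54 style before raising."""
    response = requests.post(url, json=payload, timeout=30)
    try:
        response.raise_for_status()  # raises HTTPError on 4xx/5xx
    except requests.exceptions.HTTPError as err:
        if response.status_code == 422:
            judgeval_logger.error(f"Validation error during push: {err.response.json()}")
        else:
            judgeval_logger.error(f"HTTP error during push: {err}")
        raise
    return response.json()

The sketch re-raises in the `HTTPError` branch for safety; the diffed client instead pre-checks `status_code != 200` and raises its own `Exception`, and its `HTTPError` branch falls through to the code after the `except` block.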