judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
  47. judgeval-0.0.53.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.51.dist-info/RECORD +0 -69
  59. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py CHANGED
@@ -31,7 +31,7 @@ from judgeval.constants import (
  TOGETHER_SUPPORTED_MODELS,
  LITELLM_SUPPORTED_MODELS,
  )
- from judgeval.common.logger import debug, error
+ from judgeval.common.logger import judgeval_logger
 
 
  class CustomModelParameters(pydantic.BaseModel):
@@ -40,18 +40,21 @@ class CustomModelParameters(pydantic.BaseModel):
  litellm_base_url: str
 
  @pydantic.field_validator("model_name")
+ @classmethod
  def validate_model_name(cls, v):
  if not v:
  raise ValueError("Model name cannot be empty")
  return v
 
  @pydantic.field_validator("secret_key")
+ @classmethod
  def validate_secret_key(cls, v):
  if not v:
  raise ValueError("Secret key cannot be empty")
  return v
 
  @pydantic.field_validator("litellm_base_url")
+ @classmethod
  def validate_litellm_base_url(cls, v):
  if not v:
  raise ValueError("Litellm base URL cannot be empty")
@@ -64,6 +67,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
  response_format: Optional[Union[pydantic.BaseModel, Dict[str, Any]]] = None
 
  @pydantic.field_validator("messages")
+ @classmethod
  def validate_messages(cls, messages):
  if not messages:
  raise ValueError("Messages cannot be empty")
@@ -83,6 +87,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
  return messages
 
  @pydantic.field_validator("model")
+ @classmethod
  def validate_model(cls, model):
  if not model:
  raise ValueError("Model cannot be empty")
@@ -91,6 +96,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
  return model
 
  @pydantic.field_validator("response_format", mode="before")
+ @classmethod
  def validate_response_format(cls, response_format):
  if response_format is not None:
  if not isinstance(response_format, (dict, pydantic.BaseModel)):
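
Aside: the `+ @classmethod` lines above stack `@classmethod` under `@pydantic.field_validator(...)`, which is the documented Pydantic v2 convention for field validators. A minimal standalone sketch of the pattern (not judgeval's actual model):

```python
# Standalone sketch of the field_validator + @classmethod stacking used above.
import pydantic


class CustomModelParametersSketch(pydantic.BaseModel):
    secret_key: str

    @pydantic.field_validator("secret_key")
    @classmethod
    def validate_secret_key(cls, v: str) -> str:
        # Same check as the validators in this diff: reject empty values.
        if not v:
            raise ValueError("Secret key cannot be empty")
        return v


print(CustomModelParametersSketch(secret_key="abc").secret_key)  # abc
```

Pydantic applies the validator as a class method either way; the explicit decorator mainly keeps type checkers and linters happy.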
@@ -145,11 +151,7 @@ def fetch_together_api_response(
  model=model, messages=messages, response_format=response_format
  )
 
- debug(f"Calling Together API with model: {request.model}")
- debug(f"Messages: {request.messages}")
-
  if request.response_format is not None:
- debug(f"Using response format: {request.response_format}")
  response = together_client.chat.completions.create(
  model=request.model,
  messages=request.messages,
@@ -161,7 +163,6 @@
  messages=request.messages,
  )
 
- debug(f"Received response: {response.choices[0].message.content[:100]}...")
  return response.choices[0].message.content
 
 
@@ -175,11 +176,7 @@ async def afetch_together_api_response(
  model=model, messages=messages, response_format=response_format
  )
 
- debug(f"Calling Together API with model: {request.model}")
- debug(f"Messages: {request.messages}")
-
  if request.response_format is not None:
- debug(f"Using response format: {request.response_format}")
  response = await async_together_client.chat.completions.create(
  model=request.model,
  messages=request.messages,
@@ -251,7 +248,7 @@ def query_together_api_multiple_calls(
  try:
  out[idx] = future.result()
  except Exception as e:
- error(f"Error in parallel call {idx}: {str(e)}")
+ judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
  out[idx] = None
  return out
 
@@ -294,17 +291,15 @@ async def aquery_together_api_multiple_calls(
  # Validate message format
  validate_batched_chat_messages(messages)
 
- debug(f"Starting parallel Together API calls for {len(messages)} messages")
  out: List[Union[str, None]] = [None] * len(messages)
 
  async def fetch_and_store(idx, model, message, response_format):
  try:
- debug(f"Processing call {idx} with model {model}")
  out[idx] = await afetch_together_api_response(
  model, message, response_format
  )
  except Exception as e:
- error(f"Error in parallel call {idx}: {str(e)}")
+ judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
  out[idx] = None
 
  tasks = [
@@ -315,7 +310,6 @@
  ]
 
  await asyncio.gather(*tasks)
- debug(f"Completed {len(messages)} parallel calls")
  return out
 
 
@@ -329,11 +323,7 @@ def fetch_litellm_api_response(
  model=model, messages=messages, response_format=response_format
  )
 
- debug(f"Calling LiteLLM API with model: {request.model}")
- debug(f"Messages: {request.messages}")
-
  if request.response_format is not None:
- debug(f"Using response format: {request.response_format}")
  response = litellm.completion(
  model=request.model,
  messages=request.messages,
@@ -483,7 +473,7 @@ def query_litellm_api_multiple_calls(
  try:
  out[idx] = future.result()
  except Exception as e:
- error(f"Error in parallel call {idx}: {str(e)}")
+ judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
  out[idx] = None
  return out
 
@@ -513,7 +503,7 @@ async def aquery_litellm_api_multiple_calls(
  model, message, response_format
  )
  except Exception as e:
- error(f"Error in parallel call {idx}: {str(e)}")
+ judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
  out[idx] = None
 
  tasks = [
@@ -681,7 +671,6 @@ async def aget_chat_completion(
  Raises:
  - ValueError: If requested model is not supported by Litellm or TogetherAI.
  """
- debug(f"Starting chat completion for model {model_type}, batched={batched}")
 
  if batched:
  validate_batched_chat_messages(messages)
@@ -693,7 +682,6 @@
  and is_batched_messages(messages)
  and model_type in TOGETHER_SUPPORTED_MODELS
  ):
- debug("Using batched Together API call")
  return await aquery_together_api_multiple_calls(
  models=[model_type] * len(messages),
  messages=messages,
@@ -704,7 +692,6 @@
  and is_batched_messages(messages)
  and model_type in LITELLM_SUPPORTED_MODELS
  ):
- debug("Using batched LiteLLM API call")
  return await aquery_litellm_api_multiple_calls(
  models=[model_type] * len(messages),
  messages=messages,
@@ -715,7 +702,6 @@
  and is_simple_messages(messages)
  and model_type in TOGETHER_SUPPORTED_MODELS
  ):
- debug("Using single Together API call")
  return await afetch_together_api_response(
  model=model_type, messages=messages, response_format=response_format
  )
@@ -724,12 +710,11 @@
  and is_simple_messages(messages)
  and model_type in LITELLM_SUPPORTED_MODELS
  ):
- debug("Using single LiteLLM API call")
  return await afetch_litellm_api_response(
  model=model_type, messages=messages, response_format=response_format
  )
 
- error(f"Model {model_type} not supported by either API")
+ judgeval_logger.error(f"Model {model_type} not supported by either API")
  raise ValueError(
  f"Model {model_type} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
  )
@@ -753,7 +738,6 @@ def get_completion_multiple_models(
  Raises:
  ValueError: If a model is not supported by Litellm or Together
  """
- debug(f"Starting multiple model completion for {len(models)} models")
 
  if models is None or models == []:
  raise ValueError("Models list cannot be empty")
@@ -761,7 +745,9 @@
  validate_batched_chat_messages(messages)
 
  if len(models) != len(messages):
- error(f"Model/message count mismatch: {len(models)} vs {len(messages)}")
+ judgeval_logger.error(
+ f"Model/message count mismatch: {len(models)} vs {len(messages)}"
+ )
  raise ValueError(
  f"Number of models and messages must be the same: {len(models)} != {len(messages)}"
  )
@@ -774,13 +760,11 @@
  zip(models, messages, response_formats)
  ):
  if model in TOGETHER_SUPPORTED_MODELS:
- debug(f"Model {model} routed to Together API")
  together_calls[idx] = (model, message, r_format)
  elif model in LITELLM_SUPPORTED_MODELS:
- debug(f"Model {model} routed to LiteLLM API")
  litellm_calls[idx] = (model, message, r_format)
  else:
- error(f"Model {model} not supported by either API")
+ judgeval_logger.error(f"Model {model} not supported by either API")
  raise ValueError(
  f"Model {model} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
  )
@@ -792,7 +776,6 @@
  # Get the responses from the TogetherAI models
  # List of responses from the TogetherAI models in order of the together_calls dict
  if together_calls:
- debug(f"Executing {len(together_calls)} Together API calls")
  together_responses = query_together_api_multiple_calls(
  models=[model for model, _, _ in together_calls.values()],
  messages=[message for _, message, _ in together_calls.values()],
@@ -801,7 +784,6 @@
 
  # Get the responses from the Litellm models
  if litellm_calls:
- debug(f"Executing {len(litellm_calls)} LiteLLM API calls")
  litellm_responses = query_litellm_api_multiple_calls(
  models=[model for model, _, _ in litellm_calls.values()],
  messages=[message for _, message, _ in litellm_calls.values()],
@@ -809,13 +791,11 @@
  )
 
  # Merge the responses in the order of the original models
- debug("Merging responses")
  out: List[Union[str, None]] = [None] * len(models)
  for idx, (model, message, r_format) in together_calls.items():
  out[idx] = together_responses.pop(0)
  for idx, (model, message, r_format) in litellm_calls.items():
  out[idx] = litellm_responses.pop(0)
- debug("Multiple model completion finished")
  return out
 
 
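Aside: this file (and the files below) drops the module-level `debug`/`error`/`warning`/`info` helpers in favor of a single `judgeval_logger` imported from `judgeval.common.logger`. The rewritten `logger.py` is not shown in this diff, so the following is only a sketch of the kind of shared logger these call sites assume, not the actual implementation:

```python
# Hedged sketch: a shared module-level logger consistent with the new
# judgeval_logger call sites in this diff. Names and handler setup are assumed.
import logging
import sys

judgeval_logger = logging.getLogger("judgeval")
if not judgeval_logger.handlers:
    _handler = logging.StreamHandler(sys.stderr)
    _handler.setFormatter(
        logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    )
    judgeval_logger.addHandler(_handler)
    judgeval_logger.setLevel(logging.INFO)

# Call-site change illustrated by this diff:
#   before: error(f"Error in parallel call {idx}: {str(e)}")
#   after:  judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
judgeval_logger.error("Error in parallel call %d: %s", 0, "example error")
```

At the call sites the change is mechanical: `error(msg)` becomes `judgeval_logger.error(msg)`, and most of the verbose `debug(...)` breadcrumbs are simply removed.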
judgeval/constants.py CHANGED
@@ -7,7 +7,7 @@ import litellm
  import os
 
 
- class APIScorer(str, Enum):
+ class APIScorerType(str, Enum):
  """
  Collection of proprietary scorers implemented by Judgment.
 
@@ -15,23 +15,17 @@ class APIScorer(str, Enum):
  Examples via the Judgment API.
  """
 
- FAITHFULNESS = "faithfulness"
- ANSWER_RELEVANCY = "answer_relevancy"
- ANSWER_CORRECTNESS = "answer_correctness"
- HALLUCINATION = "hallucination"
- SUMMARIZATION = "summarization"
- CONTEXTUAL_RECALL = "contextual_recall"
- CONTEXTUAL_RELEVANCY = "contextual_relevancy"
- CONTEXTUAL_PRECISION = "contextual_precision"
- INSTRUCTION_ADHERENCE = "instruction_adherence"
- EXECUTION_ORDER = "execution_order"
- JSON_CORRECTNESS = "json_correctness"
- COMPARISON = "comparison"
- GROUNDEDNESS = "groundedness"
- DERAILMENT = "derailment"
- TOOL_ORDER = "tool_order"
- CLASSIFIER = "classifier"
- TOOL_DEPENDENCY = "tool_dependency"
+ PROMPT_SCORER = "Prompt Scorer"
+ FAITHFULNESS = "Faithfulness"
+ ANSWER_RELEVANCY = "Answer Relevancy"
+ ANSWER_CORRECTNESS = "Answer Correctness"
+ INSTRUCTION_ADHERENCE = "Instruction Adherence"
+ EXECUTION_ORDER = "Execution Order"
+ DERAILMENT = "Derailment"
+ TOOL_ORDER = "Tool Order"
+ CLASSIFIER = "Classifier"
+ TOOL_DEPENDENCY = "Tool Dependency"
+ CUSTOM = "Custom"
 
  @classmethod
  def _missing_(cls, value):
@@ -41,8 +35,8 @@ class APIScorer(str, Enum):
  return member
 
 
- UNBOUNDED_SCORERS = set(
- [APIScorer.COMPARISON]
+ UNBOUNDED_SCORERS: set[APIScorerType] = (
+ set()
  ) # scorers whose scores are not bounded between 0-1
 
  ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
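Aside: `APIScorer` is renamed to `APIScorerType`, its values switch from snake_case identifiers to display names (for example `"faithfulness"` becomes `"Faithfulness"`), several scorers are dropped, and `UNBOUNDED_SCORERS` becomes an empty, typed set now that `COMPARISON` is gone. The `_missing_` hook is retained but its body is not shown above, so the case-insensitive lookup in this standalone sketch is an assumption:

```python
# Standalone sketch of the renamed enum pattern; the real APIScorerType has
# more members, and its _missing_ body is not shown in this diff, so the
# case-insensitive matching below is an assumption.
from enum import Enum


class APIScorerType(str, Enum):
    FAITHFULNESS = "Faithfulness"
    ANSWER_RELEVANCY = "Answer Relevancy"
    CUSTOM = "Custom"

    @classmethod
    def _missing_(cls, value):
        # Tolerate lookups that don't match the stored casing exactly.
        for member in cls:
            if member.value.lower() == str(value).lower():
                return member
        return None


print(APIScorerType("faithfulness"))  # APIScorerType.FAITHFULNESS
```

Callers that relied on the old lowercase `.value` strings will see the new display-name values after this change.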
judgeval/data/__init__.py CHANGED
@@ -1,5 +1,4 @@
  from judgeval.data.example import Example, ExampleParams
- from judgeval.data.custom_example import CustomExample
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
  from judgeval.data.trace import Trace, TraceSpan, TraceUsage
@@ -8,7 +7,6 @@ from judgeval.data.trace import Trace, TraceSpan, TraceUsage
  __all__ = [
  "Example",
  "ExampleParams",
- "CustomExample",
  "ScorerData",
  "create_scorer_data",
  "ScoringResult",
judgeval/data/datasets/dataset.py CHANGED
@@ -8,7 +8,7 @@ from dataclasses import dataclass, field
  from typing import List, Union, Literal, Optional
 
  from judgeval.data import Example, Trace
- from judgeval.common.logger import debug, error, warning, info
+ from judgeval.common.logger import judgeval_logger
  from judgeval.utils.file_utils import get_examples_from_yaml
 
 
@@ -29,7 +29,7 @@ class EvalDataset:
  traces: Optional[List[Trace]] = None,
  ):
  if not judgment_api_key:
- warning("No judgment_api_key provided")
+ judgeval_logger.error("No judgment_api_key provided")
  self.examples = examples or []
  self.traces = traces or []
  self._alias = None
@@ -38,11 +38,10 @@ class EvalDataset:
  self.organization_id = organization_id
 
  def add_from_json(self, file_path: str) -> None:
- debug(f"Loading dataset from JSON file: {file_path}")
  """
  Adds examples from a JSON file.
 
- The format of the JSON file is expected to be a dictionary with one key: "examples".
+ The format of the JSON file is expected to be a dictionary with one key: "examples".
  The value of the key is a list of dictionaries, where each dictionary represents an example.
 
  The JSON file is expected to have the following format:
@@ -82,13 +81,12 @@ class EvalDataset:
  payload = json.load(file)
  examples = payload.get("examples", [])
  except FileNotFoundError:
- error(f"JSON file not found: {file_path}")
+ judgeval_logger.error(f"JSON file not found: {file_path}")
  raise FileNotFoundError(f"The file {file_path} was not found.")
  except json.JSONDecodeError:
- error(f"Invalid JSON file: {file_path}")
+ judgeval_logger.error(f"Invalid JSON file: {file_path}")
  raise ValueError(f"The file {file_path} is not a valid JSON file.")
 
- info(f"Added {len(examples)} examples from JSON")
  new_examples = [Example(**e) for e in examples]
  for e in new_examples:
  self.add_example(e)
@@ -189,11 +187,10 @@ class EvalDataset:
  self.add_example(e)
 
  def add_from_yaml(self, file_path: str) -> None:
- debug(f"Loading dataset from YAML file: {file_path}")
  """
  Adds examples from a YAML file.
 
- The format of the YAML file is expected to be a dictionary with one key: "examples".
+ The format of the YAML file is expected to be a dictionary with one key: "examples".
  The value of the key is a list of dictionaries, where each dictionary represents an example.
 
  The YAML file is expected to have the following format:
@@ -220,7 +217,6 @@
  """
  examples = get_examples_from_yaml(file_path)
 
- info(f"Added {len(examples)} examples from YAML")
  for e in examples:
  self.add_example(e)
 
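Aside: per the docstring retained above, `add_from_json` expects the file to be a JSON object with a single `"examples"` key holding a list of example dictionaries. A hedged sketch of that shape; the fields inside each example are illustrative, since the full `Example` schema is not shown in this diff:

```python
# Sketch of a file add_from_json could consume. The "examples" key comes from
# the docstring above; input/actual_output are assumed example fields.
import json

payload = {
    "examples": [
        {"input": "What is the capital of France?", "actual_output": "Paris"},
    ]
}
with open("dataset.json", "w") as f:
    json.dump(payload, f)

# Usage (requires a Judgment API key; constructor args per the diff above):
# from judgeval.data.datasets.dataset import EvalDataset
# dataset = EvalDataset(judgment_api_key="...")
# dataset.add_from_json("dataset.json")
```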
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -2,8 +2,7 @@ from typing import Optional, List
  from requests import Response, exceptions
  from judgeval.utils.requests import requests
  from rich.progress import Progress, SpinnerColumn, TextColumn
-
- from judgeval.common.logger import debug, error, warning, info
+ from judgeval.common.logger import judgeval_logger
  from judgeval.constants import (
  JUDGMENT_DATASETS_PUSH_API_URL,
  JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
@@ -31,9 +30,8 @@ class EvalDatasetClient:
  project_name: str,
  overwrite: Optional[bool] = False,
  ) -> bool:
- debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
  if overwrite:
- warning(f"Overwrite enabled for alias '{alias}'")
+ judgeval_logger.warning(f"Overwrite enabled for alias '{alias}'")
  """
  Pushes the dataset to Judgment platform
 
@@ -76,16 +74,19 @@ class EvalDatasetClient:
  verify=True,
  )
  if response.status_code != 200:
- error(f"Server error during push: {response.json()}")
+ judgeval_logger.error(
+ f"Server error during push: {response.json()}"
+ )
  raise Exception(f"Server error during push: {response.json()}")
  response.raise_for_status()
  except exceptions.HTTPError as err:
  if response.status_code == 422:
- error(f"Validation error during push: {err.response.json()}")
+ judgeval_logger.error(
+ f"Validation error during push: {err.response.json()}"
+ )
  else:
- error(f"HTTP error during push: {err}")
+ judgeval_logger.error(f"HTTP error during push: {err}")
 
- info(f"Successfully pushed dataset with alias '{alias}'")
  payload = response.json()
  dataset._alias = payload.get("_alias")
  dataset._id = payload.get("_id")
@@ -98,7 +99,6 @@
  def append_examples(
  self, alias: str, examples: List[Example], project_name: str
  ) -> bool:
- debug(f"Appending dataset with alias '{alias}'")
  """
  Appends the dataset to Judgment platform
 
@@ -139,14 +139,18 @@
  verify=True,
  )
  if response.status_code != 200:
- error(f"Server error during append: {response.json()}")
+ judgeval_logger.error(
+ f"Server error during append: {response.json()}"
+ )
  raise Exception(f"Server error during append: {response.json()}")
  response.raise_for_status()
  except exceptions.HTTPError as err:
  if response.status_code == 422:
- error(f"Validation error during append: {err.response.json()}")
+ judgeval_logger.error(
+ f"Validation error during append: {err.response.json()}"
+ )
  else:
- error(f"HTTP error during append: {err}")
+ judgeval_logger.error(f"HTTP error during append: {err}")
 
  progress.update(
  task_id,
@@ -155,7 +159,6 @@
  return True
 
  def pull(self, alias: str, project_name: str) -> EvalDataset:
- debug(f"Pulling dataset with alias '{alias}'")
  """
  Pulls the dataset from Judgment platform
 
@@ -163,7 +166,7 @@
  {
  "alias": alias,
  "project_name": project_name
- }
+ }
  ==>
  {
  "examples": [...],
@@ -198,10 +201,9 @@
  )
  response.raise_for_status()
  except exceptions.RequestException as e:
- error(f"Error pulling dataset: {str(e)}")
+ judgeval_logger.error(f"Error pulling dataset: {str(e)}")
  raise
 
- info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
  dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
@@ -239,20 +241,19 @@
  )
  response.raise_for_status()
  except exceptions.RequestException as e:
- error(f"Error deleting dataset: {str(e)}")
+ judgeval_logger.error(f"Error deleting dataset: {str(e)}")
  raise
 
  return True
 
  def pull_project_dataset_stats(self, project_name: str) -> dict:
- debug(f"Pulling project datasets stats for project_name: {project_name}'")
  """
- Pulls the project datasets stats from Judgment platform
+ Pulls the project datasets stats from Judgment platform
 
  Mock request:
  {
  "project_name": project_name
- }
+ }
  ==>
  {
  "test_dataset_1": {"examples_count": len(dataset1.examples)},
@@ -286,10 +287,9 @@
  )
  response.raise_for_status()
  except exceptions.RequestException as e:
- error(f"Error pulling dataset: {str(e)}")
+ judgeval_logger.error(f"Error pulling dataset: {str(e)}")
  raise
 
- info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
  payload = response.json()
 
  progress.update(
@@ -301,7 +301,6 @@
 
  def export_jsonl(self, alias: str, project_name: str) -> Response:
  """Export dataset in JSONL format from Judgment platform"""
- debug(f"Exporting dataset with alias '{alias}' as JSONL")
  with Progress(
  SpinnerColumn(style="rgb(106,0,255)"),
  TextColumn("[progress.description]{task.description}"),
@@ -326,15 +325,14 @@
  response.raise_for_status()
  except exceptions.HTTPError as err:
  if err.response.status_code == 404:
- error(f"Dataset not found: {alias}")
+ judgeval_logger.error(f"Dataset not found: {alias}")
  else:
- error(f"HTTP error during export: {err}")
+ judgeval_logger.error(f"HTTP error during export: {err}")
  raise
  except Exception as e:
- error(f"Error during export: {str(e)}")
+ judgeval_logger.error(f"Error during export: {str(e)}")
  raise
 
- info(f"Successfully exported dataset with alias '{alias}'")
  progress.update(
  task_id,
  description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
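
Aside: throughout `EvalDatasetClient`, the pattern after this change is to log failures through `judgeval_logger` and then raise, instead of emitting separate `debug`/`info` breadcrumbs. A standalone sketch of that shape using plain `requests`; the URL, payload, and header names are illustrative, not the real Judgment API contract:

```python
# Hedged sketch of the check-status / log / raise pattern shown in this diff.
import logging

import requests

judgeval_logger = logging.getLogger("judgeval")


def push_dataset(url: str, payload: dict, api_key: str) -> dict:
    response = requests.post(
        url,
        json=payload,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=30,
    )
    if response.status_code != 200:
        # Mirror the client's behavior: report via the shared logger, then raise.
        judgeval_logger.error(f"Server error during push: {response.json()}")
        raise Exception(f"Server error during push: {response.json()}")
    return response.json()
```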