judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -2,11 +2,12 @@
2
2
  from judgeval.clients import client, together_client
3
3
  from judgeval.judgment_client import JudgmentClient
4
4
  from judgeval.version_check import check_latest_version
5
+
5
6
  check_latest_version()
6
7
 
7
8
  __all__ = [
8
9
  # Clients
9
- 'client',
10
- 'together_client',
11
- 'JudgmentClient',
12
- ]
10
+ "client",
11
+ "together_client",
12
+ "JudgmentClient",
13
+ ]
judgeval/clients.py CHANGED
@@ -9,18 +9,19 @@ load_dotenv(dotenv_path=PATH_TO_DOTENV)
9
9
 
10
10
 
11
11
  # Initialize optional OpenAI client
12
- client: Optional['OpenAI'] = None
12
+ client: Optional["OpenAI"] = None
13
13
  if os.getenv("OPENAI_API_KEY"):
14
14
  try:
15
15
  from openai import OpenAI
16
+
16
17
  client = OpenAI()
17
18
  except ImportError:
18
19
  # openai package not installed
19
20
  pass
20
-
21
+
21
22
  # Initialize optional Together clients
22
- together_client: Optional['Together'] = None
23
- async_together_client: Optional['AsyncTogether'] = None
23
+ together_client: Optional["Together"] = None
24
+ async_together_client: Optional["AsyncTogether"] = None
24
25
 
25
26
  # Only initialize Together clients if API key is available
26
27
 
@@ -29,6 +30,5 @@ if together_api_key:
29
30
  try:
30
31
  together_client = Together(api_key=together_api_key)
31
32
  async_together_client = AsyncTogether(api_key=together_api_key)
32
- except Exception as e:
33
+ except Exception:
33
34
  pass
34
-
judgeval/common/__init__.py CHANGED
@@ -2,7 +2,12 @@ from judgeval.common.utils import (
2
2
  get_chat_completion,
3
3
  aget_chat_completion,
4
4
  get_completion_multiple_models,
5
- aget_completion_multiple_models
5
+ aget_completion_multiple_models,
6
6
  )
7
7
 
8
- __all__ = ["get_chat_completion", "aget_chat_completion", "get_completion_multiple_models", "aget_completion_multiple_models"]
8
+ __all__ = [
9
+ "get_chat_completion",
10
+ "aget_chat_completion",
11
+ "get_completion_multiple_models",
12
+ "aget_completion_multiple_models",
13
+ ]
judgeval/common/exceptions.py CHANGED
@@ -11,7 +11,7 @@ class JudgmentAPIError(Exception):
11
11
  """
12
12
  Exception raised when an error occurs while executing a Judgment API request
13
13
  """
14
-
14
+
15
15
  def __init__(self, message: str):
16
16
  super().__init__(message)
17
17
  self.message = message
@@ -21,8 +21,7 @@ class InvalidJudgeModelError(Exception):
21
21
  """
22
22
  Exception raised when an invalid judge model is provided
23
23
  """
24
-
24
+
25
25
  def __init__(self, message: str):
26
26
  super().__init__(message)
27
27
  self.message = message
28
-
judgeval/common/logger.py CHANGED
@@ -6,9 +6,12 @@ from contextlib import contextmanager
6
6
 
7
7
  # Global variables
8
8
  logger = None
9
+
10
+
9
11
  class LoggingState:
10
- enabled = False
11
- path = None
12
+ enabled: bool = False
13
+ path: str | None = None
14
+
12
15
 
13
16
  LOGGING_STATE = LoggingState()
14
17
 
@@ -18,7 +21,12 @@ current_timestamp = None
18
21
 
19
22
 
20
23
  @contextmanager
21
- def enable_logging(name: str = "judgeval", path: str = "./logs", max_bytes: int = 1024 * 1024, backup_count: int = 5):
24
+ def enable_logging(
25
+ name: str = "judgeval",
26
+ path: str = "./logs",
27
+ max_bytes: int = 1024 * 1024,
28
+ backup_count: int = 5,
29
+ ):
22
30
  """
23
31
  Context manager to temporarily enable logging for a specific block of code.
24
32
  """
@@ -27,7 +35,9 @@ def enable_logging(name: str = "judgeval", path: str = "./logs", max_bytes: int
27
35
  LOGGING_STATE.path = path
28
36
  # Initialize logger if not already initialized
29
37
  if logger is None:
30
- logger = _initialize_logger(name=name, path=path, max_bytes=max_bytes, backup_count=backup_count)
38
+ logger = _initialize_logger(
39
+ name=name, path=path, max_bytes=max_bytes, backup_count=backup_count
40
+ )
31
41
  try:
32
42
  logger.info("Logging enabled")
33
43
  yield
@@ -36,36 +46,31 @@ def enable_logging(name: str = "judgeval", path: str = "./logs", max_bytes: int
36
46
  LOGGING_STATE.enabled = False
37
47
  LOGGING_STATE.path = None
38
48
 
49
+
39
50
  def _initialize_logger(
40
51
  name: str = "judgeval",
41
52
  max_bytes: int = 1024 * 1024, # 1MB
42
53
  backup_count: int = 5,
43
- path: str = "./logs" # Added path parameter with default
54
+ path: str = "./logs", # Added path parameter with default
44
55
  ) -> logging.Logger:
45
56
  """
46
57
  Initialize the global logger instance if it doesn't exist.
47
58
  Returns the global logger instance.
48
59
  """
49
60
  global logger
50
-
61
+
51
62
  log_dir = Path(path)
52
63
  log_dir.mkdir(exist_ok=True, parents=True)
53
64
  log_file = log_dir / f"{name}.log"
54
65
  if log_file.exists():
55
66
  log_file.unlink() # Delete existing log file
56
-
67
+
57
68
  if logger is not None:
58
69
  return logger
59
-
70
+
60
71
  # Create logs directory if it doesn't exist
61
72
  log_dir = Path(path)
62
73
  log_dir.mkdir(exist_ok=True)
63
-
64
- # Create formatter
65
- formatter = logging.Formatter(
66
- fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
67
- datefmt='%Y-%m-%d %H:%M:%S'
68
- )
69
74
 
70
75
  # Create a custom formatter that includes example info when available
71
76
  class ExampleFormatter(logging.Formatter):
@@ -73,22 +78,23 @@ def _initialize_logger(
73
78
  if current_example_id is not None and current_timestamp is not None:
74
79
  record.example_id = current_example_id
75
80
  record.timestamp = current_timestamp
76
- return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
77
- datefmt='%Y-%m-%d %H:%M:%S').format(record)
78
- return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s',
79
- datefmt='%Y-%m-%d %H:%M:%S').format(record)
80
-
81
+ return logging.Formatter(
82
+ "%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s",
83
+ datefmt="%Y-%m-%d %H:%M:%S",
84
+ ).format(record)
85
+ return logging.Formatter(
86
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
87
+ datefmt="%Y-%m-%d %H:%M:%S",
88
+ ).format(record)
89
+
81
90
  # Use the custom formatter
82
91
  console_handler = logging.StreamHandler(sys.stdout)
83
92
  console_handler.setFormatter(ExampleFormatter())
84
93
  console_handler.setLevel(logging.DEBUG)
85
-
94
+
86
95
  log_filename = f"{name}.log"
87
96
  file_handler = RotatingFileHandler(
88
- log_dir / log_filename,
89
- maxBytes=max_bytes,
90
- backupCount=backup_count,
91
- mode='a'
97
+ log_dir / log_filename, maxBytes=max_bytes, backupCount=backup_count, mode="a"
92
98
  )
93
99
  file_handler.setFormatter(ExampleFormatter())
94
100
  file_handler.setLevel(logging.DEBUG)
@@ -96,93 +102,112 @@ def _initialize_logger(
96
102
  # Get logger
97
103
  logger = logging.getLogger(name)
98
104
  logger.setLevel(logging.DEBUG)
99
-
105
+
100
106
  # Prevent adding handlers multiple times
101
107
  if not logger.handlers:
102
108
  logger.addHandler(console_handler)
103
109
  logger.addHandler(file_handler)
104
-
110
+
105
111
  return logger
106
112
 
113
+
107
114
  # Initialize the global logger when module is imported
108
115
  # logger = _initialize_logger()
109
116
 
117
+
110
118
  def log_if_enabled(func):
111
119
  """Decorator to check if logging is enabled before executing logging statements"""
120
+
112
121
  def wrapper(*args, **kwargs):
113
122
  if LOGGING_STATE.enabled:
114
123
  return func(*args, **kwargs)
124
+
115
125
  return wrapper
116
126
 
127
+
117
128
  @log_if_enabled
118
- def debug(msg: str, example_idx: int = None):
129
+ def debug(msg: str, example_idx: int | None = None):
119
130
  """Log debug message if logging is enabled"""
120
- logger.debug(msg)
131
+ if logger:
132
+ logger.debug(msg)
133
+
121
134
 
122
135
  @log_if_enabled
123
- def info(msg: str, example_idx: int = None):
136
+ def info(msg: str, example_idx: int | None = None):
124
137
  """Log info message if logging is enabled"""
125
- logger.info(msg)
138
+ if logger:
139
+ logger.info(msg)
140
+
126
141
 
127
142
  @log_if_enabled
128
- def warning(msg: str, example_idx: int = None):
143
+ def warning(msg: str, example_idx: int | None = None):
129
144
  """Log warning message if logging is enabled"""
130
- logger.warning(msg)
145
+ if logger:
146
+ logger.warning(msg)
147
+
131
148
 
132
149
  @log_if_enabled
133
- def error(msg: str, example_idx: int = None):
150
+ def error(msg: str, example_idx: int | None = None):
134
151
  """Log error message if logging is enabled"""
135
- logger.error(msg)
152
+ if logger:
153
+ logger.error(msg)
154
+
136
155
 
137
156
  def create_example_handler(
138
- timestamp: str,
157
+ timestamp: str,
139
158
  example_idx: int,
140
- path: str = "./logs" # Added path parameter with default
159
+ path: str = "./logs", # Added path parameter with default
141
160
  ) -> RotatingFileHandler:
142
161
  """Creates a file handler for a specific example"""
143
- debug(f"Creating example handler for timestamp={timestamp}, example_idx={example_idx}")
162
+ debug(
163
+ f"Creating example handler for timestamp={timestamp}, example_idx={example_idx}"
164
+ )
144
165
  log_dir = Path(path) / "examples"
145
166
  log_dir.mkdir(exist_ok=True, parents=True)
146
-
167
+
147
168
  formatter = logging.Formatter(
148
- fmt='%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
149
- datefmt='%Y-%m-%d %H:%M:%S'
169
+ fmt="%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s",
170
+ datefmt="%Y-%m-%d %H:%M:%S",
150
171
  )
151
-
172
+
152
173
  # Create a unique file for each example
153
174
  file_handler = RotatingFileHandler(
154
175
  log_dir / f"{timestamp}_example_{example_idx}.log",
155
176
  maxBytes=1024 * 1024, # 1MB
156
177
  backupCount=5,
157
- mode='a'
178
+ mode="a",
158
179
  )
159
180
  file_handler.setFormatter(formatter)
160
181
  file_handler.setLevel(logging.DEBUG)
161
182
  info(f"Created example handler for example {example_idx}")
162
183
  return file_handler
163
184
 
185
+
164
186
  @contextmanager
165
187
  def example_logging_context(timestamp: str, example_idx: int):
166
188
  """Context manager for example-specific logging"""
167
189
  if not LOGGING_STATE.enabled:
168
190
  yield
169
191
  return
170
-
192
+
171
193
  global current_example_id, current_timestamp
172
-
194
+
173
195
  debug(f"Entering example logging context for example {example_idx}")
174
196
  current_example_id = example_idx
175
197
  current_timestamp = timestamp
176
-
177
- handler = create_example_handler(timestamp, example_idx, path=LOGGING_STATE.path)
178
- if handler:
198
+
199
+ if LOGGING_STATE.path:
200
+ handler = create_example_handler(
201
+ timestamp, example_idx, path=LOGGING_STATE.path
202
+ )
203
+ if handler and logger:
179
204
  logger.addHandler(handler)
180
205
  try:
181
206
  yield
182
207
  finally:
183
208
  current_example_id = None
184
209
  current_timestamp = None
185
- if handler:
210
+ if handler and logger:
186
211
  logger.removeHandler(handler)
187
212
  handler.close()
188
- debug(f"Closed example handler for example {example_idx}")
213
+ debug(f"Closed example handler for example {example_idx}")
judgeval/common/s3_storage.py CHANGED
@@ -6,18 +6,19 @@ from datetime import datetime, UTC
6
6
  from botocore.exceptions import ClientError
7
7
  from judgeval.common.logger import warning, info
8
8
 
9
+
9
10
  class S3Storage:
10
11
  """Utility class for storing and retrieving trace data from S3."""
11
-
12
+
12
13
  def __init__(
13
14
  self,
14
15
  bucket_name: str,
15
16
  aws_access_key_id: Optional[str] = None,
16
17
  aws_secret_access_key: Optional[str] = None,
17
- region_name: Optional[str] = None
18
+ region_name: Optional[str] = None,
18
19
  ):
19
20
  """Initialize S3 storage with credentials and bucket name.
20
-
21
+
21
22
  Args:
22
23
  bucket_name: Name of the S3 bucket to store traces in
23
24
  aws_access_key_id: AWS access key ID (optional, will use environment variables if not provided)
@@ -26,70 +27,76 @@ class S3Storage:
26
27
  """
27
28
  self.bucket_name = bucket_name
28
29
  self.s3_client = boto3.client(
29
- 's3',
30
- aws_access_key_id=aws_access_key_id or os.getenv('AWS_ACCESS_KEY_ID'),
31
- aws_secret_access_key=aws_secret_access_key or os.getenv('AWS_SECRET_ACCESS_KEY'),
32
- region_name=region_name or os.getenv('AWS_REGION', 'us-west-1')
30
+ "s3",
31
+ aws_access_key_id=aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
32
+ aws_secret_access_key=aws_secret_access_key
33
+ or os.getenv("AWS_SECRET_ACCESS_KEY"),
34
+ region_name=region_name or os.getenv("AWS_REGION", "us-west-1"),
33
35
  )
34
-
36
+
35
37
  def _ensure_bucket_exists(self):
36
38
  """Ensure the S3 bucket exists, creating it if necessary."""
37
39
  try:
38
40
  self.s3_client.head_bucket(Bucket=self.bucket_name)
39
41
  except ClientError as e:
40
- error_code = e.response['Error']['Code']
41
- if error_code == '404':
42
+ error_code = e.response["Error"]["Code"]
43
+ if error_code == "404":
42
44
  # Bucket doesn't exist, create it
43
45
  info(f"Bucket {self.bucket_name} doesn't exist, creating it ...")
44
46
  try:
45
47
  self.s3_client.create_bucket(
46
48
  Bucket=self.bucket_name,
47
49
  CreateBucketConfiguration={
48
- 'LocationConstraint': self.s3_client.meta.region_name
49
- }
50
+ "LocationConstraint": self.s3_client.meta.region_name
51
+ },
50
52
  ) if self.s3_client.meta.region_name != "us-east-1" else self.s3_client.create_bucket(
51
53
  Bucket=self.bucket_name
52
54
  )
53
55
  info(f"Created S3 bucket: {self.bucket_name}")
54
56
  except ClientError as create_error:
55
- if create_error.response['Error']['Code'] == 'BucketAlreadyOwnedByYou':
57
+ if (
58
+ create_error.response["Error"]["Code"]
59
+ == "BucketAlreadyOwnedByYou"
60
+ ):
56
61
  # Bucket was just created by another process
57
- warning(f"Bucket {self.bucket_name} was just created by another process")
62
+ warning(
63
+ f"Bucket {self.bucket_name} was just created by another process"
64
+ )
58
65
  pass
59
66
  else:
60
67
  raise create_error
61
68
  else:
62
69
  # Some other error occurred
63
70
  raise e
64
-
71
+
65
72
  def save_trace(self, trace_data: dict, trace_id: str, project_name: str) -> str:
66
73
  """Save trace data to S3.
67
-
74
+
68
75
  Args:
69
76
  trace_data: The trace data to save
70
77
  trace_id: Unique identifier for the trace
71
78
  project_name: Name of the project the trace belongs to
72
-
79
+
73
80
  Returns:
74
81
  str: S3 key where the trace was saved
75
82
  """
76
83
  # Ensure bucket exists before saving
77
84
  self._ensure_bucket_exists()
78
-
85
+
79
86
  # Create a timestamped key for the trace
80
- timestamp = datetime.now(UTC).strftime('%Y%m%d_%H%M%S')
87
+ timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
81
88
  s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
82
-
89
+
83
90
  # Convert trace data to JSON string
84
91
  trace_json = json.dumps(trace_data)
85
-
92
+
86
93
  # Upload to S3
87
94
  info(f"Uploading trace to S3 at key {s3_key}, in bucket {self.bucket_name} ...")
88
95
  self.s3_client.put_object(
89
96
  Bucket=self.bucket_name,
90
97
  Key=s3_key,
91
98
  Body=trace_json,
92
- ContentType='application/json'
99
+ ContentType="application/json",
93
100
  )
94
-
101
+
95
102
  return s3_key