judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
- judgeval-0.0.53.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.51.dist-info/RECORD +0 -69
- {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
- {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/logger.py
CHANGED
@@ -1,213 +1,60 @@
|
|
1
|
+
# logger.py
|
2
|
+
|
1
3
|
import logging
|
2
|
-
from logging.handlers import RotatingFileHandler
|
3
4
|
import sys
|
4
|
-
|
5
|
-
from contextlib import contextmanager
|
6
|
-
|
7
|
-
# Global variables
|
8
|
-
logger = None
|
9
|
-
|
10
|
-
|
11
|
-
class LoggingState:
|
12
|
-
enabled: bool = False
|
13
|
-
path: str | None = None
|
14
|
-
|
15
|
-
|
16
|
-
LOGGING_STATE = LoggingState()
|
5
|
+
import os
|
17
6
|
|
18
|
-
#
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
def enable_logging(
|
25
|
-
name: str = "judgeval",
|
26
|
-
path: str = "./logs",
|
27
|
-
max_bytes: int = 1024 * 1024,
|
28
|
-
backup_count: int = 5,
|
29
|
-
):
|
30
|
-
"""
|
31
|
-
Context manager to temporarily enable logging for a specific block of code.
|
32
|
-
"""
|
33
|
-
global logger
|
34
|
-
LOGGING_STATE.enabled = True
|
35
|
-
LOGGING_STATE.path = path
|
36
|
-
# Initialize logger if not already initialized
|
37
|
-
if logger is None:
|
38
|
-
logger = _initialize_logger(
|
39
|
-
name=name, path=path, max_bytes=max_bytes, backup_count=backup_count
|
40
|
-
)
|
41
|
-
try:
|
42
|
-
logger.info("Logging enabled")
|
43
|
-
yield
|
44
|
-
finally:
|
45
|
-
logger.info("Logging disabled")
|
46
|
-
LOGGING_STATE.enabled = False
|
47
|
-
LOGGING_STATE.path = None
|
7
|
+
# ANSI escape sequences
|
8
|
+
RESET = "\033[0m"
|
9
|
+
RED = "\033[31m"
|
10
|
+
YELLOW = "\033[33m"
|
11
|
+
BLUE = "\033[34m"
|
12
|
+
GRAY = "\033[90m"
|
48
13
|
|
49
14
|
|
50
|
-
|
51
|
-
name: str = "judgeval",
|
52
|
-
max_bytes: int = 1024 * 1024, # 1MB
|
53
|
-
backup_count: int = 5,
|
54
|
-
path: str = "./logs", # Added path parameter with default
|
55
|
-
) -> logging.Logger:
|
15
|
+
class ColorFormatter(logging.Formatter):
|
56
16
|
"""
|
57
|
-
|
58
|
-
Returns the global logger instance.
|
17
|
+
Wrap the final formatted log record in ANSI color codes based on level.
|
59
18
|
"""
|
60
|
-
global logger
|
61
19
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
if
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
console_handler.setLevel(logging.DEBUG)
|
94
|
-
|
95
|
-
log_filename = f"{name}.log"
|
96
|
-
file_handler = RotatingFileHandler(
|
97
|
-
log_dir / log_filename, maxBytes=max_bytes, backupCount=backup_count, mode="a"
|
20
|
+
COLORS = {
|
21
|
+
logging.DEBUG: GRAY,
|
22
|
+
logging.INFO: GRAY,
|
23
|
+
logging.WARNING: YELLOW,
|
24
|
+
logging.ERROR: RED,
|
25
|
+
logging.CRITICAL: RED,
|
26
|
+
}
|
27
|
+
|
28
|
+
def __init__(self, fmt=None, datefmt=None, use_color=True):
|
29
|
+
super().__init__(fmt=fmt, datefmt=datefmt)
|
30
|
+
self.use_color = use_color and sys.stdout.isatty()
|
31
|
+
|
32
|
+
def format(self, record):
|
33
|
+
message = super().format(record)
|
34
|
+
if self.use_color:
|
35
|
+
color = self.COLORS.get(record.levelno, "")
|
36
|
+
if color:
|
37
|
+
message = f"{color}{message}{RESET}"
|
38
|
+
return message
|
39
|
+
|
40
|
+
|
41
|
+
def _setup_judgeval_logger():
|
42
|
+
use_color = sys.stdout.isatty() and os.getenv("NO_COLOR") is None
|
43
|
+
handler = logging.StreamHandler(sys.stdout)
|
44
|
+
handler.setLevel(logging.DEBUG)
|
45
|
+
handler.setFormatter(
|
46
|
+
ColorFormatter(
|
47
|
+
fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
48
|
+
datefmt="%Y-%m-%d %H:%M:%S",
|
49
|
+
use_color=use_color,
|
50
|
+
)
|
98
51
|
)
|
99
|
-
file_handler.setFormatter(ExampleFormatter())
|
100
|
-
file_handler.setLevel(logging.DEBUG)
|
101
52
|
|
102
|
-
|
103
|
-
logger = logging.getLogger(name)
|
53
|
+
logger = logging.getLogger("judgeval")
|
104
54
|
logger.setLevel(logging.DEBUG)
|
105
|
-
|
106
|
-
# Prevent adding handlers multiple times
|
107
|
-
if not logger.handlers:
|
108
|
-
logger.addHandler(console_handler)
|
109
|
-
logger.addHandler(file_handler)
|
110
|
-
|
55
|
+
logger.addHandler(handler)
|
111
56
|
return logger
|
112
57
|
|
113
58
|
|
114
|
-
#
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
def log_if_enabled(func):
|
119
|
-
"""Decorator to check if logging is enabled before executing logging statements"""
|
120
|
-
|
121
|
-
def wrapper(*args, **kwargs):
|
122
|
-
if LOGGING_STATE.enabled:
|
123
|
-
return func(*args, **kwargs)
|
124
|
-
|
125
|
-
return wrapper
|
126
|
-
|
127
|
-
|
128
|
-
@log_if_enabled
|
129
|
-
def debug(msg: str, example_idx: int | None = None):
|
130
|
-
"""Log debug message if logging is enabled"""
|
131
|
-
if logger:
|
132
|
-
logger.debug(msg)
|
133
|
-
|
134
|
-
|
135
|
-
@log_if_enabled
|
136
|
-
def info(msg: str, example_idx: int | None = None):
|
137
|
-
"""Log info message if logging is enabled"""
|
138
|
-
if logger:
|
139
|
-
logger.info(msg)
|
140
|
-
|
141
|
-
|
142
|
-
@log_if_enabled
|
143
|
-
def warning(msg: str, example_idx: int | None = None):
|
144
|
-
"""Log warning message if logging is enabled"""
|
145
|
-
if logger:
|
146
|
-
logger.warning(msg)
|
147
|
-
|
148
|
-
|
149
|
-
@log_if_enabled
|
150
|
-
def error(msg: str, example_idx: int | None = None):
|
151
|
-
"""Log error message if logging is enabled"""
|
152
|
-
if logger:
|
153
|
-
logger.error(msg)
|
154
|
-
|
155
|
-
|
156
|
-
def create_example_handler(
|
157
|
-
timestamp: str,
|
158
|
-
example_idx: int,
|
159
|
-
path: str = "./logs", # Added path parameter with default
|
160
|
-
) -> RotatingFileHandler:
|
161
|
-
"""Creates a file handler for a specific example"""
|
162
|
-
debug(
|
163
|
-
f"Creating example handler for timestamp={timestamp}, example_idx={example_idx}"
|
164
|
-
)
|
165
|
-
log_dir = Path(path) / "examples"
|
166
|
-
log_dir.mkdir(exist_ok=True, parents=True)
|
167
|
-
|
168
|
-
formatter = logging.Formatter(
|
169
|
-
fmt="%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s",
|
170
|
-
datefmt="%Y-%m-%d %H:%M:%S",
|
171
|
-
)
|
172
|
-
|
173
|
-
# Create a unique file for each example
|
174
|
-
file_handler = RotatingFileHandler(
|
175
|
-
log_dir / f"{timestamp}_example_{example_idx}.log",
|
176
|
-
maxBytes=1024 * 1024, # 1MB
|
177
|
-
backupCount=5,
|
178
|
-
mode="a",
|
179
|
-
)
|
180
|
-
file_handler.setFormatter(formatter)
|
181
|
-
file_handler.setLevel(logging.DEBUG)
|
182
|
-
info(f"Created example handler for example {example_idx}")
|
183
|
-
return file_handler
|
184
|
-
|
185
|
-
|
186
|
-
@contextmanager
|
187
|
-
def example_logging_context(timestamp: str, example_idx: int):
|
188
|
-
"""Context manager for example-specific logging"""
|
189
|
-
if not LOGGING_STATE.enabled:
|
190
|
-
yield
|
191
|
-
return
|
192
|
-
|
193
|
-
global current_example_id, current_timestamp
|
194
|
-
|
195
|
-
debug(f"Entering example logging context for example {example_idx}")
|
196
|
-
current_example_id = example_idx
|
197
|
-
current_timestamp = timestamp
|
198
|
-
|
199
|
-
if LOGGING_STATE.path:
|
200
|
-
handler = create_example_handler(
|
201
|
-
timestamp, example_idx, path=LOGGING_STATE.path
|
202
|
-
)
|
203
|
-
if handler and logger:
|
204
|
-
logger.addHandler(handler)
|
205
|
-
try:
|
206
|
-
yield
|
207
|
-
finally:
|
208
|
-
current_example_id = None
|
209
|
-
current_timestamp = None
|
210
|
-
if handler and logger:
|
211
|
-
logger.removeHandler(handler)
|
212
|
-
handler.close()
|
213
|
-
debug(f"Closed example handler for example {example_idx}")
|
59
|
+
# Global logger you can import elsewhere
|
60
|
+
judgeval_logger = _setup_judgeval_logger()
|
judgeval/common/s3_storage.py
CHANGED
@@ -4,7 +4,7 @@ import boto3
|
|
4
4
|
from typing import Optional
|
5
5
|
from datetime import datetime, UTC
|
6
6
|
from botocore.exceptions import ClientError
|
7
|
-
from judgeval.common.logger import
|
7
|
+
from judgeval.common.logger import judgeval_logger
|
8
8
|
|
9
9
|
|
10
10
|
class S3Storage:
|
@@ -42,7 +42,6 @@ class S3Storage:
|
|
42
42
|
error_code = e.response["Error"]["Code"]
|
43
43
|
if error_code == "404":
|
44
44
|
# Bucket doesn't exist, create it
|
45
|
-
info(f"Bucket {self.bucket_name} doesn't exist, creating it ...")
|
46
45
|
try:
|
47
46
|
self.s3_client.create_bucket(
|
48
47
|
Bucket=self.bucket_name,
|
@@ -52,14 +51,13 @@ class S3Storage:
|
|
52
51
|
) if self.s3_client.meta.region_name != "us-east-1" else self.s3_client.create_bucket(
|
53
52
|
Bucket=self.bucket_name
|
54
53
|
)
|
55
|
-
info(f"Created S3 bucket: {self.bucket_name}")
|
56
54
|
except ClientError as create_error:
|
57
55
|
if (
|
58
56
|
create_error.response["Error"]["Code"]
|
59
57
|
== "BucketAlreadyOwnedByYou"
|
60
58
|
):
|
61
59
|
# Bucket was just created by another process
|
62
|
-
warning(
|
60
|
+
judgeval_logger.warning(
|
63
61
|
f"Bucket {self.bucket_name} was just created by another process"
|
64
62
|
)
|
65
63
|
pass
|
@@ -90,8 +88,6 @@ class S3Storage:
|
|
90
88
|
# Convert trace data to JSON string
|
91
89
|
trace_json = json.dumps(trace_data)
|
92
90
|
|
93
|
-
# Upload to S3
|
94
|
-
info(f"Uploading trace to S3 at key {s3_key}, in bucket {self.bucket_name} ...")
|
95
91
|
self.s3_client.put_object(
|
96
92
|
Bucket=self.bucket_name,
|
97
93
|
Key=s3_key,
|