judgeval 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +83 -0
- judgeval/clients.py +19 -0
- judgeval/common/__init__.py +8 -0
- judgeval/common/exceptions.py +28 -0
- judgeval/common/logger.py +189 -0
- judgeval/common/tracer.py +587 -0
- judgeval/common/utils.py +763 -0
- judgeval/constants.py +55 -0
- judgeval/data/__init__.py +14 -0
- judgeval/data/api_example.py +111 -0
- judgeval/data/datasets/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +407 -0
- judgeval/data/datasets/ground_truth.py +54 -0
- judgeval/data/datasets/utils.py +74 -0
- judgeval/data/example.py +76 -0
- judgeval/data/result.py +83 -0
- judgeval/data/scorer_data.py +86 -0
- judgeval/evaluation_run.py +130 -0
- judgeval/judges/__init__.py +7 -0
- judgeval/judges/base_judge.py +44 -0
- judgeval/judges/litellm_judge.py +49 -0
- judgeval/judges/mixture_of_judges.py +248 -0
- judgeval/judges/together_judge.py +55 -0
- judgeval/judges/utils.py +45 -0
- judgeval/judgment_client.py +244 -0
- judgeval/run_evaluation.py +355 -0
- judgeval/scorers/__init__.py +30 -0
- judgeval/scorers/base_scorer.py +51 -0
- judgeval/scorers/custom_scorer.py +134 -0
- judgeval/scorers/judgeval_scorers/__init__.py +21 -0
- judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
- judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
- judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
- judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
- judgeval/scorers/judgeval_scorers/summarization.py +20 -0
- judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
- judgeval/scorers/prompt_scorer.py +439 -0
- judgeval/scorers/score.py +427 -0
- judgeval/scorers/utils.py +175 -0
- judgeval-0.0.1.dist-info/METADATA +40 -0
- judgeval-0.0.1.dist-info/RECORD +46 -0
- judgeval-0.0.1.dist-info/WHEEL +4 -0
- judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
judgeval/__init__.py
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# Import key components that should be publicly accessible
|
2
|
+
from judgeval.common.utils import (
|
3
|
+
get_chat_completion,
|
4
|
+
aget_chat_completion,
|
5
|
+
get_completion_multiple_models,
|
6
|
+
aget_completion_multiple_models
|
7
|
+
)
|
8
|
+
from judgeval.data import (
|
9
|
+
Example,
|
10
|
+
ProcessExample,
|
11
|
+
ScorerData,
|
12
|
+
ScoringResult,
|
13
|
+
)
|
14
|
+
from judgeval.data.datasets import (
|
15
|
+
EvalDataset,
|
16
|
+
GroundTruthExample
|
17
|
+
)
|
18
|
+
|
19
|
+
from judgeval.judges import (
|
20
|
+
judgevalJudge,
|
21
|
+
LiteLLMJudge,
|
22
|
+
TogetherJudge,
|
23
|
+
MixtureOfJudges
|
24
|
+
)
|
25
|
+
from judgeval.scorers import (
|
26
|
+
JudgmentScorer,
|
27
|
+
CustomScorer,
|
28
|
+
PromptScorer,
|
29
|
+
ClassifierScorer,
|
30
|
+
ToolCorrectnessScorer,
|
31
|
+
JSONCorrectnessScorer,
|
32
|
+
SummarizationScorer,
|
33
|
+
HallucinationScorer,
|
34
|
+
FaithfulnessScorer,
|
35
|
+
ContextualRelevancyScorer,
|
36
|
+
ContextualPrecisionScorer,
|
37
|
+
ContextualRecallScorer,
|
38
|
+
AnswerRelevancyScorer
|
39
|
+
)
|
40
|
+
from judgeval.clients import client, langfuse, together_client
|
41
|
+
from judgeval.judgment_client import JudgmentClient
|
42
|
+
|
43
|
+
__all__ = [
    # Clients
    "client",
    "langfuse",
    "together_client",
    "JudgmentClient",

    # The names below are imported by this module but are currently left out
    # of __all__ (their entries were commented out):
    #   Common utilities: get_chat_completion, aget_chat_completion,
    #       get_completion_multiple_models, aget_completion_multiple_models
    #   Data classes: Example, ProcessExample, ScorerData, ScoringResult
    #   Judges: judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
    #   Scorers: JudgmentScorer, CustomScorer, PromptScorer, ClassifierScorer,
    #       ToolCorrectnessScorer, JSONCorrectnessScorer, SummarizationScorer,
    #       HallucinationScorer, FaithfulnessScorer, ContextualRelevancyScorer,
    #       ContextualPrecisionScorer, ContextualRecallScorer,
    #       AnswerRelevancyScorer
]
|
judgeval/clients.py
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
import os
|
2
|
+
from dotenv import load_dotenv
|
3
|
+
from openai import OpenAI
|
4
|
+
from langfuse import Langfuse
|
5
|
+
from together import Together, AsyncTogether
|
6
|
+
|
7
|
+
# Read environment variables from the ".env" file located next to this module.
# This must run before the clients below are constructed, because they are
# configured from the environment.
PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
load_dotenv(dotenv_path=PATH_TO_DOTENV)

# Shared client instances, created once when this module is imported.
client = OpenAI()

langfuse = Langfuse(
    secret_key=os.environ.get("LANGFUSE_SECRET_KEY"),
    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY"),
    host=os.environ.get("LANGFUSE_HOST"),
)

# Synchronous and asynchronous Together clients use the same API key variable.
together_client = Together(api_key=os.environ.get("TOGETHERAI_API_KEY"))
async_together_client = AsyncTogether(api_key=os.environ.get("TOGETHERAI_API_KEY"))
|
19
|
+
|
judgeval/common/__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
from judgeval.common.utils import (
|
2
|
+
get_chat_completion,
|
3
|
+
aget_chat_completion,
|
4
|
+
get_completion_multiple_models,
|
5
|
+
aget_completion_multiple_models
|
6
|
+
)
|
7
|
+
|
8
|
+
# Public names re-exported from judgeval.common.utils
__all__ = [
    "get_chat_completion",
    "aget_chat_completion",
    "get_completion_multiple_models",
    "aget_completion_multiple_models",
]
|
judgeval/common/exceptions.py
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
"""
|
2
|
+
Common Exceptions in Judgeval
|
3
|
+
"""
|
4
|
+
|
5
|
+
|
6
|
+
class MissingTestCaseParamsError(Exception):
    """
    Exception type that adds no behavior of its own to `Exception`.

    Judging by its name it signals missing test case parameters; the places
    that raise it are not part of this module.
    """
|
8
|
+
|
9
|
+
|
10
|
+
class JudgmentAPIError(Exception):
    """
    Raised when executing a Judgment API request fails.

    The message is forwarded to `Exception` and is also kept on the
    `message` attribute of the instance.
    """

    def __init__(self, message: str):
        self.message = message
        super().__init__(message)
|
18
|
+
|
19
|
+
|
20
|
+
class InvalidJudgeModelError(Exception):
    """
    Raised when the judge model that was provided is not valid.

    The message is forwarded to `Exception` and is also kept on the
    `message` attribute of the instance.
    """

    def __init__(self, message: str):
        self.message = message
        super().__init__(message)
|
28
|
+
|
judgeval/common/logger.py
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
import logging
|
2
|
+
from logging.handlers import RotatingFileHandler
|
3
|
+
import sys
|
4
|
+
from pathlib import Path
|
5
|
+
from datetime import datetime
|
6
|
+
from contextlib import contextmanager
|
7
|
+
|
8
|
+
# Module-wide logger instance; stays None until enable_logging() / _initialize_logger() creates it
|
9
|
+
logger = None
|
10
|
+
class LoggingState:
    """Holds the on/off flag and the log directory used by this module."""

    # True only while an enable_logging() block is active
    enabled = False

    # Directory given to enable_logging(); None while logging is off
    path = None
|
13
|
+
|
14
|
+
# Single shared state object: enable_logging() writes it, the logging helpers in this module read it
LOGGING_STATE = LoggingState()
|
15
|
+
|
16
|
+
# Example index and timestamp of the example currently being logged.
# example_logging_context() sets both and resets them to None on exit;
# the formatter built in _initialize_logger() reads them for every record.
|
17
|
+
current_example_id = None
|
18
|
+
current_timestamp = None
|
19
|
+
|
20
|
+
|
21
|
+
@contextmanager
def enable_logging(name: str = "judgeval", path: str = "./logs", max_bytes: int = 1024 * 1024, backup_count: int = 5):
    """
    Context manager to temporarily enable logging for a specific block of code.

    Args:
        name: Logger name, also used for the log file name (only used when
            the module logger has not been created yet).
        path: Directory for log files; stored on LOGGING_STATE for the
            duration of the block.
        max_bytes: Size limit handed to the rotating file handler.
        backup_count: Number of rotated files to keep.

    On exit the previous LOGGING_STATE values are restored, so an inner
    `enable_logging` block does not switch logging off for an enclosing one.
    Outside of any enclosing block this means `enabled=False`, `path=None`.
    """
    global logger

    # Remember what was active before so nested use restores it correctly.
    previous_enabled = LOGGING_STATE.enabled
    previous_path = LOGGING_STATE.path

    LOGGING_STATE.enabled = True
    LOGGING_STATE.path = path
    try:
        # Logger creation happens inside the try block: if it fails, the
        # finally clause still rolls the state back instead of leaving
        # logging marked as enabled while `logger` is None.
        if logger is None:
            logger = _initialize_logger(name=name, path=path, max_bytes=max_bytes, backup_count=backup_count)
        logger.info("Logging enabled")
        yield
    finally:
        # `logger` is still None when initialization itself raised.
        if logger is not None:
            logger.info("Logging disabled")
        LOGGING_STATE.enabled = previous_enabled
        LOGGING_STATE.path = previous_path
|
39
|
+
|
40
|
+
def _initialize_logger(
    name: str = "judgeval",
    max_bytes: int = 1024 * 1024,  # 1MB
    backup_count: int = 5,
    path: str = "./logs"
) -> logging.Logger:
    """
    Initialize the global logger instance if it doesn't exist.

    Args:
        name: Logger name; the log file is `<path>/<name>.log`.
        max_bytes: Size at which the log file is rotated.
        backup_count: Number of rotated log files to keep.
        path: Directory that holds the log file (created if missing).

    Returns:
        The global logger instance.
    """
    global logger

    # Return an existing logger before touching the filesystem, so the log
    # file is never deleted while that logger's file handler has it open.
    if logger is not None:
        return logger

    # Start every fresh logger with an empty log file.
    log_dir = Path(path)
    log_dir.mkdir(exist_ok=True, parents=True)
    log_file = log_dir / f"{name}.log"
    if log_file.exists():
        log_file.unlink()

    # Build both layouts once instead of once per log record.
    plain_format = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    example_format = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    class ExampleFormatter(logging.Formatter):
        """Adds example id and timestamp to a record while an example is active."""

        def format(self, record):
            if current_example_id is not None and current_timestamp is not None:
                # Attach the values to the record so the format string can use them.
                record.example_id = current_example_id
                record.timestamp = current_timestamp
                return example_format.format(record)
            return plain_format.format(record)

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Prevent adding handlers multiple times. Handlers are only created when
    # they are attached, so no file handle is opened and then left unused.
    if not logger.handlers:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(ExampleFormatter())
        console_handler.setLevel(logging.DEBUG)

        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=max_bytes,
            backupCount=backup_count,
            mode='a'
        )
        file_handler.setFormatter(ExampleFormatter())
        file_handler.setLevel(logging.DEBUG)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

    return logger
|
107
|
+
|
108
|
+
# Initialize the global logger when module is imported
|
109
|
+
# logger = _initialize_logger()
|
110
|
+
|
111
|
+
def log_if_enabled(func):
    """
    Decorator to check if logging is enabled before executing logging statements.

    The wrapped function is called only while `LOGGING_STATE.enabled` is
    true; otherwise the call is skipped and None is returned. The wrapper
    keeps the wrapped function's name and docstring.
    """
    # Imported locally so this block does not depend on the file's import list.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if LOGGING_STATE.enabled:
            return func(*args, **kwargs)
        return None

    return wrapper
|
117
|
+
|
118
|
+
@log_if_enabled
def debug(msg: str, example_idx: int = None):
    """
    Write `msg` to the module logger at DEBUG level.

    The `log_if_enabled` decorator skips the call while logging is disabled.
    `example_idx` is accepted but not used by this function.
    """
    logger.debug(msg)
|
122
|
+
|
123
|
+
@log_if_enabled
def info(msg: str, example_idx: int = None):
    """
    Write `msg` to the module logger at INFO level.

    The `log_if_enabled` decorator skips the call while logging is disabled.
    `example_idx` is accepted but not used by this function.
    """
    logger.info(msg)
|
127
|
+
|
128
|
+
@log_if_enabled
def warning(msg: str, example_idx: int = None):
    """
    Write `msg` to the module logger at WARNING level.

    The `log_if_enabled` decorator skips the call while logging is disabled.
    `example_idx` is accepted but not used by this function.
    """
    logger.warning(msg)
|
132
|
+
|
133
|
+
@log_if_enabled
def error(msg: str, example_idx: int = None):
    """
    Write `msg` to the module logger at ERROR level.

    The `log_if_enabled` decorator skips the call while logging is disabled.
    `example_idx` is accepted but not used by this function.
    """
    logger.error(msg)
|
137
|
+
|
138
|
+
def create_example_handler(
    timestamp: str,
    example_idx: int,
    path: str = "./logs"
) -> RotatingFileHandler:
    """
    Creates a file handler for a specific example.

    Args:
        timestamp: Timestamp string; used in the file name and in each line.
        example_idx: Index of the example; used in the file name and in each line.
        path: Base log directory; the file is written to `<path>/examples/`.

    Returns:
        A DEBUG-level RotatingFileHandler writing to
        `<path>/examples/<timestamp>_example_<example_idx>.log`.
    """
    debug(f"Creating example handler for timestamp={timestamp}, example_idx={example_idx}")
    log_dir = Path(path) / "examples"
    log_dir.mkdir(exist_ok=True, parents=True)

    formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - [Example_%(example_id)s][%(timestamp)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    def _add_example_fields(record):
        # The format string needs `example_id` and `timestamp` on every
        # record. Supply them here when nothing else has set them, so this
        # handler does not rely on another handler's formatter having run
        # first. Values that are already present are left untouched.
        if not hasattr(record, "example_id"):
            record.example_id = example_idx
        if not hasattr(record, "timestamp"):
            record.timestamp = timestamp
        return True  # never drop a record

    # Create a unique file for each example
    file_handler = RotatingFileHandler(
        log_dir / f"{timestamp}_example_{example_idx}.log",
        maxBytes=1024 * 1024,  # 1MB
        backupCount=5,
        mode='a'
    )
    file_handler.addFilter(_add_example_fields)
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.DEBUG)
    info(f"Created example handler for example {example_idx}")
    return file_handler
|
164
|
+
|
165
|
+
@contextmanager
def example_logging_context(timestamp: str, example_idx: int):
    """
    Context manager for example-specific logging.

    While logging is enabled, records the current example index and
    timestamp in the module globals and attaches a per-example file handler
    to the module logger for the duration of the block. While logging is
    disabled it does nothing.

    Args:
        timestamp: Timestamp string identifying the run.
        example_idx: Index of the example being processed.
    """
    global current_example_id, current_timestamp

    if not LOGGING_STATE.enabled:
        yield
        return

    debug(f"Entering example logging context for example {example_idx}")
    current_example_id = example_idx
    current_timestamp = timestamp

    handler = None
    try:
        # Created inside the try block so that a failure here (for example
        # an unwritable log directory) still resets the globals below.
        handler = create_example_handler(timestamp, example_idx, path=LOGGING_STATE.path)
        logger.addHandler(handler)
        yield
    finally:
        current_example_id = None
        current_timestamp = None
        if handler is not None:
            logger.removeHandler(handler)
            handler.close()
            debug(f"Closed example handler for example {example_idx}")
|