judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.7.1.dist-info/RECORD +0 -82
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/api/constants.py
DELETED
@@ -1,186 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
from typing import Optional, TypedDict, List, Dict, Any
|
3
|
-
|
4
|
-
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
|
5
|
-
|
6
|
-
# Traces API
|
7
|
-
JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
|
8
|
-
JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
|
9
|
-
JUDGMENT_TRACES_UPSERT_API_URL = f"{ROOT_API}/traces/upsert/"
|
10
|
-
JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
|
11
|
-
JUDGMENT_TRACES_SPANS_BATCH_API_URL = f"{ROOT_API}/traces/spans/batch/"
|
12
|
-
JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = (
|
13
|
-
f"{ROOT_API}/traces/evaluation_runs/batch/"
|
14
|
-
)
|
15
|
-
|
16
|
-
|
17
|
-
class TraceFetchPayload(TypedDict):
|
18
|
-
trace_id: str
|
19
|
-
|
20
|
-
|
21
|
-
class TraceDeletePayload(TypedDict):
|
22
|
-
trace_ids: List[str]
|
23
|
-
|
24
|
-
|
25
|
-
class SpansBatchPayload(TypedDict):
|
26
|
-
spans: List[Dict[str, Any]]
|
27
|
-
organization_id: str
|
28
|
-
|
29
|
-
|
30
|
-
class EvaluationEntryResponse(TypedDict):
|
31
|
-
evaluation_run: Dict[str, Any]
|
32
|
-
associated_span: Dict[str, Any]
|
33
|
-
queued_at: Optional[float]
|
34
|
-
|
35
|
-
|
36
|
-
class EvaluationRunsBatchPayload(TypedDict):
|
37
|
-
organization_id: str
|
38
|
-
evaluation_entries: List[EvaluationEntryResponse]
|
39
|
-
|
40
|
-
|
41
|
-
# Evaluation API
|
42
|
-
JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
|
43
|
-
JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
|
44
|
-
JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
|
45
|
-
JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
|
46
|
-
JUDGMENT_EVAL_DELETE_API_URL = (
|
47
|
-
f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
|
48
|
-
)
|
49
|
-
JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
|
50
|
-
JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
|
51
|
-
JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
|
52
|
-
|
53
|
-
# Custom Scorers API
|
54
|
-
JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"
|
55
|
-
|
56
|
-
|
57
|
-
# Evaluation API Payloads
|
58
|
-
class EvalRunRequestBody(TypedDict):
|
59
|
-
eval_name: str
|
60
|
-
project_name: str
|
61
|
-
judgment_api_key: str
|
62
|
-
|
63
|
-
|
64
|
-
class DeleteEvalRunRequestBody(TypedDict):
|
65
|
-
eval_names: List[str]
|
66
|
-
project_name: str
|
67
|
-
judgment_api_key: str
|
68
|
-
|
69
|
-
|
70
|
-
class EvalLogPayload(TypedDict):
|
71
|
-
results: List[Dict[str, Any]]
|
72
|
-
run: Dict[str, Any]
|
73
|
-
|
74
|
-
|
75
|
-
class EvalStatusPayload(TypedDict):
|
76
|
-
experiment_run_id: str
|
77
|
-
judgment_api_key: str
|
78
|
-
project_name: str
|
79
|
-
|
80
|
-
|
81
|
-
class CheckExperimentTypePayload(TypedDict):
|
82
|
-
eval_name: str
|
83
|
-
project_name: str
|
84
|
-
judgment_api_key: str
|
85
|
-
is_trace: bool
|
86
|
-
|
87
|
-
|
88
|
-
class EvalRunNameExistsPayload(TypedDict):
|
89
|
-
eval_name: str
|
90
|
-
project_name: str
|
91
|
-
judgment_api_key: str
|
92
|
-
|
93
|
-
|
94
|
-
class CheckExampleKeysPayload(TypedDict):
|
95
|
-
keys: List[str]
|
96
|
-
eval_name: str
|
97
|
-
project_name: str
|
98
|
-
|
99
|
-
|
100
|
-
# Datasets API
|
101
|
-
JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
|
102
|
-
JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
|
103
|
-
JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
|
104
|
-
JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
|
105
|
-
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
|
106
|
-
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
|
107
|
-
JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
|
108
|
-
JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
|
109
|
-
|
110
|
-
|
111
|
-
class DatasetPushPayload(TypedDict):
|
112
|
-
dataset_alias: str
|
113
|
-
project_name: str
|
114
|
-
examples: List[Dict[str, Any]]
|
115
|
-
traces: List[Dict[str, Any]]
|
116
|
-
overwrite: bool
|
117
|
-
|
118
|
-
|
119
|
-
class DatasetAppendPayload(TypedDict):
|
120
|
-
dataset_alias: str
|
121
|
-
project_name: str
|
122
|
-
examples: List[Dict[str, Any]]
|
123
|
-
|
124
|
-
|
125
|
-
class DatasetPullPayload(TypedDict):
|
126
|
-
dataset_alias: str
|
127
|
-
project_name: str
|
128
|
-
|
129
|
-
|
130
|
-
class DatasetDeletePayload(TypedDict):
|
131
|
-
dataset_alias: str
|
132
|
-
project_name: str
|
133
|
-
|
134
|
-
|
135
|
-
class DatasetExportPayload(TypedDict):
|
136
|
-
dataset_alias: str
|
137
|
-
project_name: str
|
138
|
-
|
139
|
-
|
140
|
-
class DatasetStatsPayload(TypedDict):
|
141
|
-
project_name: str
|
142
|
-
|
143
|
-
|
144
|
-
# Projects API
|
145
|
-
JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval/"
|
146
|
-
JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
|
147
|
-
|
148
|
-
|
149
|
-
class ProjectDeletePayload(TypedDict):
|
150
|
-
project_list: List[str]
|
151
|
-
|
152
|
-
|
153
|
-
class ProjectCreatePayload(TypedDict):
|
154
|
-
project_name: str
|
155
|
-
|
156
|
-
|
157
|
-
JUDGMENT_SCORER_SAVE_API_URL = f"{ROOT_API}/save_scorer/"
|
158
|
-
JUDGMENT_SCORER_FETCH_API_URL = f"{ROOT_API}/fetch_scorer/"
|
159
|
-
JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
|
160
|
-
|
161
|
-
|
162
|
-
class ScorerSavePayload(TypedDict):
|
163
|
-
name: str
|
164
|
-
prompt: str
|
165
|
-
threshold: float
|
166
|
-
options: Optional[dict]
|
167
|
-
|
168
|
-
|
169
|
-
class ScorerFetchPayload(TypedDict):
|
170
|
-
name: str
|
171
|
-
|
172
|
-
|
173
|
-
class ScorerExistsPayload(TypedDict):
|
174
|
-
name: str
|
175
|
-
|
176
|
-
|
177
|
-
class CustomScorerUploadPayload(TypedDict):
|
178
|
-
scorer_name: str
|
179
|
-
scorer_code: str
|
180
|
-
requirements_text: str
|
181
|
-
|
182
|
-
|
183
|
-
class CustomScorerTemplateResponse(TypedDict):
|
184
|
-
scorer_name: str
|
185
|
-
status: str
|
186
|
-
message: str
|
judgeval/common/exceptions.py
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Common Exceptions in Judgeval
|
3
|
-
"""
|
4
|
-
|
5
|
-
|
6
|
-
class MissingTestCaseParamsError(Exception):
|
7
|
-
pass
|
8
|
-
|
9
|
-
|
10
|
-
class JudgmentAPIError(Exception):
|
11
|
-
"""
|
12
|
-
Exception raised when an error occurs while executing a Judgment API request
|
13
|
-
"""
|
14
|
-
|
15
|
-
def __init__(self, message: str):
|
16
|
-
super().__init__(message)
|
17
|
-
self.message = message
|
18
|
-
|
19
|
-
|
20
|
-
class InvalidJudgeModelError(Exception):
|
21
|
-
"""
|
22
|
-
Exception raised when an invalid judge model is provided
|
23
|
-
"""
|
24
|
-
|
25
|
-
def __init__(self, message: str):
|
26
|
-
super().__init__(message)
|
27
|
-
self.message = message
|
judgeval/common/storage/s3_storage.py
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import boto3
|
3
|
-
import orjson
|
4
|
-
from typing import Optional
|
5
|
-
from datetime import datetime, UTC
|
6
|
-
from botocore.exceptions import ClientError
|
7
|
-
from judgeval.common.logger import judgeval_logger
|
8
|
-
|
9
|
-
|
10
|
-
class S3Storage:
|
11
|
-
"""Utility class for storing and retrieving trace data from S3."""
|
12
|
-
|
13
|
-
def __init__(
|
14
|
-
self,
|
15
|
-
bucket_name: str,
|
16
|
-
aws_access_key_id: Optional[str] = None,
|
17
|
-
aws_secret_access_key: Optional[str] = None,
|
18
|
-
region_name: Optional[str] = None,
|
19
|
-
):
|
20
|
-
"""Initialize S3 storage with credentials and bucket name.
|
21
|
-
|
22
|
-
Args:
|
23
|
-
bucket_name: Name of the S3 bucket to store traces in
|
24
|
-
aws_access_key_id: AWS access key ID (optional, will use environment variables if not provided)
|
25
|
-
aws_secret_access_key: AWS secret access key (optional, will use environment variables if not provided)
|
26
|
-
region_name: AWS region name (optional, will use environment variables if not provided)
|
27
|
-
"""
|
28
|
-
self.bucket_name = bucket_name
|
29
|
-
self.s3_client = boto3.client(
|
30
|
-
"s3",
|
31
|
-
aws_access_key_id=aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
|
32
|
-
aws_secret_access_key=aws_secret_access_key
|
33
|
-
or os.getenv("AWS_SECRET_ACCESS_KEY"),
|
34
|
-
region_name=region_name or os.getenv("AWS_REGION", "us-west-1"),
|
35
|
-
)
|
36
|
-
|
37
|
-
def _ensure_bucket_exists(self):
|
38
|
-
"""Ensure the S3 bucket exists, creating it if necessary."""
|
39
|
-
try:
|
40
|
-
self.s3_client.head_bucket(Bucket=self.bucket_name)
|
41
|
-
except ClientError as e:
|
42
|
-
error_code = e.response["Error"]["Code"]
|
43
|
-
if error_code == "404":
|
44
|
-
# Bucket doesn't exist, create it
|
45
|
-
try:
|
46
|
-
self.s3_client.create_bucket(
|
47
|
-
Bucket=self.bucket_name,
|
48
|
-
CreateBucketConfiguration={
|
49
|
-
"LocationConstraint": self.s3_client.meta.region_name
|
50
|
-
},
|
51
|
-
) if self.s3_client.meta.region_name != "us-east-1" else self.s3_client.create_bucket(
|
52
|
-
Bucket=self.bucket_name
|
53
|
-
)
|
54
|
-
except ClientError as create_error:
|
55
|
-
if (
|
56
|
-
create_error.response["Error"]["Code"]
|
57
|
-
== "BucketAlreadyOwnedByYou"
|
58
|
-
):
|
59
|
-
# Bucket was just created by another process
|
60
|
-
judgeval_logger.warning(
|
61
|
-
f"Bucket {self.bucket_name} was just created by another process"
|
62
|
-
)
|
63
|
-
pass
|
64
|
-
else:
|
65
|
-
raise create_error
|
66
|
-
else:
|
67
|
-
# Some other error occurred
|
68
|
-
raise e
|
69
|
-
|
70
|
-
def save_trace(self, trace_data: dict, trace_id: str, project_name: str) -> str:
|
71
|
-
"""Save trace data to S3.
|
72
|
-
|
73
|
-
Args:
|
74
|
-
trace_data: The trace data to save
|
75
|
-
trace_id: Unique identifier for the trace
|
76
|
-
project_name: Name of the project the trace belongs to
|
77
|
-
|
78
|
-
Returns:
|
79
|
-
str: S3 key where the trace was saved
|
80
|
-
"""
|
81
|
-
# Ensure bucket exists before saving
|
82
|
-
self._ensure_bucket_exists()
|
83
|
-
|
84
|
-
# Create a timestamped key for the trace
|
85
|
-
timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
|
86
|
-
s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
|
87
|
-
|
88
|
-
trace_json = orjson.dumps(trace_data).decode("utf-8")
|
89
|
-
|
90
|
-
self.s3_client.put_object(
|
91
|
-
Bucket=self.bucket_name,
|
92
|
-
Key=s3_key,
|
93
|
-
Body=trace_json,
|
94
|
-
ContentType="application/json",
|
95
|
-
)
|
96
|
-
|
97
|
-
return s3_key
|
judgeval/common/tracer/__init__.py
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
from judgeval.common.tracer.core import (
|
2
|
-
TraceClient,
|
3
|
-
_DeepTracer,
|
4
|
-
Tracer,
|
5
|
-
wrap,
|
6
|
-
current_span_var,
|
7
|
-
current_trace_var,
|
8
|
-
SpanType,
|
9
|
-
cost_per_token,
|
10
|
-
)
|
11
|
-
from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
|
12
|
-
from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
|
13
|
-
from judgeval.common.tracer.span_processor import SpanProcessorBase
|
14
|
-
from judgeval.common.tracer.trace_manager import TraceManagerClient
|
15
|
-
from judgeval.data import TraceSpan
|
16
|
-
|
17
|
-
__all__ = [
|
18
|
-
"_DeepTracer",
|
19
|
-
"TraceClient",
|
20
|
-
"Tracer",
|
21
|
-
"wrap",
|
22
|
-
"current_span_var",
|
23
|
-
"current_trace_var",
|
24
|
-
"TraceManagerClient",
|
25
|
-
"JudgmentAPISpanExporter",
|
26
|
-
"JudgmentSpanProcessor",
|
27
|
-
"SpanProcessorBase",
|
28
|
-
"SpanType",
|
29
|
-
"cost_per_token",
|
30
|
-
"TraceSpan",
|
31
|
-
]
|
judgeval/common/tracer/constants.py
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import site
|
3
|
-
import sysconfig
|
4
|
-
|
5
|
-
|
6
|
-
# NOTE: This builds once, can be tweaked if we are missing / capturing other unncessary modules
|
7
|
-
# @link https://docs.python.org/3.13/library/sysconfig.html
|
8
|
-
_TRACE_FILEPATH_BLOCKLIST = tuple(
|
9
|
-
os.path.realpath(p) + os.sep
|
10
|
-
for p in {
|
11
|
-
sysconfig.get_paths()["stdlib"],
|
12
|
-
sysconfig.get_paths().get("platstdlib", ""),
|
13
|
-
*site.getsitepackages(),
|
14
|
-
site.getusersitepackages(),
|
15
|
-
*(
|
16
|
-
[os.path.join(os.path.dirname(__file__), "../../judgeval/")]
|
17
|
-
if os.environ.get("JUDGMENT_DEV")
|
18
|
-
else []
|
19
|
-
),
|
20
|
-
}
|
21
|
-
if p
|
22
|
-
)
|