judgeval 0.0.55__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/__init__.py +3 -0
- judgeval/common/api/api.py +352 -0
- judgeval/common/api/constants.py +165 -0
- judgeval/common/storage/__init__.py +6 -0
- judgeval/common/tracer/__init__.py +31 -0
- judgeval/common/tracer/constants.py +22 -0
- judgeval/common/tracer/core.py +1916 -0
- judgeval/common/tracer/otel_exporter.py +108 -0
- judgeval/common/tracer/otel_span_processor.py +234 -0
- judgeval/common/tracer/span_processor.py +37 -0
- judgeval/common/tracer/span_transformer.py +211 -0
- judgeval/common/tracer/trace_manager.py +92 -0
- judgeval/common/utils.py +2 -2
- judgeval/constants.py +3 -30
- judgeval/data/datasets/eval_dataset_client.py +29 -156
- judgeval/data/judgment_types.py +4 -12
- judgeval/data/result.py +1 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/data/scripts/openapi_transform.py +1 -1
- judgeval/data/trace.py +66 -1
- judgeval/data/trace_run.py +0 -3
- judgeval/evaluation_run.py +0 -2
- judgeval/integrations/langgraph.py +43 -164
- judgeval/judgment_client.py +17 -211
- judgeval/run_evaluation.py +209 -611
- judgeval/scorers/__init__.py +2 -6
- judgeval/scorers/base_scorer.py +4 -23
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
- judgeval/scorers/score.py +2 -1
- judgeval/scorers/utils.py +1 -13
- judgeval/utils/requests.py +21 -0
- judgeval-0.1.0.dist-info/METADATA +202 -0
- {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
- judgeval/common/tracer.py +0 -3215
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
- judgeval-0.0.55.dist-info/METADATA +0 -1384
- /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -13,6 +13,7 @@ import asyncio
 import concurrent.futures
 import os
 from types import TracebackType
+from judgeval.common.api.constants import ROOT_API
 from judgeval.utils.requests import requests
 import pprint
 from typing import Any, Dict, List, Mapping, Optional, TypeAlias, Union, TypeGuard
@@ -27,7 +28,6 @@ from judgeval.clients import async_together_client, together_client
 from judgeval.constants import (
     ACCEPTABLE_MODELS,
     MAX_WORKER_THREADS,
-    ROOT_API,
     TOGETHER_SUPPORTED_MODELS,
     LITELLM_SUPPORTED_MODELS,
 )
@@ -128,7 +128,7 @@ def validate_api_key(judgment_api_key: str):
             "Content-Type": "application/json",
             "Authorization": f"Bearer {judgment_api_key}",
         },
-        json={},
+        json={},
         verify=True,
     )
     if response.status_code == 200:
judgeval/constants.py
CHANGED
@@ -39,36 +39,6 @@ UNBOUNDED_SCORERS: set[APIScorerType] = (
     set()
 )  # scorers whose scores are not bounded between 0-1
 
-ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
-# API URLs
-JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
-JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
-JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
-JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
-JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
-JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
-JUDGMENT_EVAL_DELETE_API_URL = (
-    f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
-)
-JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
-JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
-JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
-JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
-JUDGMENT_TRACES_UPSERT_API_URL = f"{ROOT_API}/traces/upsert/"
-JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
-JUDGMENT_TRACES_SPANS_BATCH_API_URL = f"{ROOT_API}/traces/spans/batch/"
-JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = (
-    f"{ROOT_API}/traces/evaluation_runs/batch/"
-)
-JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
-JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv(
     "RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com"
@@ -145,3 +115,6 @@ MAX_WORKER_THREADS = 10
 
 # Maximum number of concurrent operations for evaluation runs
 MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
+
+# Span lifecycle management
+SPAN_LIFECYCLE_END_UPDATE_ID = 20  # Default ending number for completed spans
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -1,27 +1,17 @@
 from typing import Optional, List
-from requests import Response, exceptions
-from judgeval.utils.requests import requests
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import judgeval_logger
-from judgeval.constants import (
-    JUDGMENT_DATASETS_PUSH_API_URL,
-    JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_PULL_API_URL,
-    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
-    JUDGMENT_DATASETS_DELETE_API_URL,
-    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-)
+from judgeval.common.api import JudgmentApiClient
 from judgeval.data import Example, Trace
 from judgeval.data.datasets import EvalDataset
 
 
 class EvalDatasetClient:
     def __init__(self, judgment_api_key: str, organization_id: str):
-        self.judgment_api_key = judgment_api_key
-        self.organization_id = organization_id
+        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)
 
     def create_dataset(self) -> EvalDataset:
-        return EvalDataset(judgment_api_key=self.judgment_api_key)
+        return EvalDataset(judgment_api_key=self.api_client.api_key)
 
     def push(
         self,
@@ -55,39 +45,17 @@ class EvalDatasetClient:
                 f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
                 total=100,
             )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "examples": [e.to_dict() for e in dataset.examples],
-                "traces": [t.model_dump() for t in dataset.traces],
-                "overwrite": overwrite,
-            }
             try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id,
-                    },
-                    verify=True,
+                payload = self.api_client.push_dataset(
+                    dataset_alias=alias,
+                    project_name=project_name,
+                    examples=[e.to_dict() for e in dataset.examples],
+                    traces=[t.model_dump() for t in dataset.traces],
+                    overwrite=overwrite or False,
                 )
-                if response.status_code == 500:
-                    judgeval_logger.error(
-                        f"Server error during push: {response.json()}"
-                    )
-                    raise Exception(f"Server error during push: {response.json()}")
-                response.raise_for_status()
-            except exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    judgeval_logger.error(
-                        f"Validation error during push: {err.response.json()}"
-                    )
-                else:
-                    judgeval_logger.error(f"HTTP error during push: {err}")
-
-                payload = response.json()
+            except Exception as e:
+                judgeval_logger.error(f"Error during push: {e}")
+                raise
             dataset._alias = payload.get("_alias")
             dataset._id = payload.get("_id")
             progress.update(
@@ -122,35 +90,15 @@ class EvalDatasetClient:
                 f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
                 total=100,
             )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "examples": [e.to_dict() for e in examples],
-            }
             try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id,
-                    },
-                    verify=True,
+                self.api_client.append_examples(
+                    dataset_alias=alias,
+                    project_name=project_name,
+                    examples=[e.to_dict() for e in examples],
                 )
-                if response.status_code == 500:
-                    judgeval_logger.error(
-                        f"Server error during append: {response.json()}"
-                    )
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    judgeval_logger.error(
-                        f"Validation error during append: {err.response.json()}"
-                    )
-                else:
-                    judgeval_logger.error(f"HTTP error during append: {err}")
+            except Exception as e:
+                judgeval_logger.error(f"Error during append: {e}")
+                raise
 
             progress.update(
                 task_id,
@@ -186,25 +134,14 @@ class EvalDatasetClient:
                 f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
                 total=100,
             )
-            request_body = {"dataset_alias": alias, "project_name": project_name}
-
             try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id,
-                    },
-                    verify=True,
+                payload = self.api_client.pull_dataset(
+                    dataset_alias=alias,
+                    project_name=project_name,
                 )
-                response.raise_for_status()
-            except exceptions.RequestException as e:
+            except Exception as e:
                 judgeval_logger.error(f"Error pulling dataset: {str(e)}")
                 raise
-
-            payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
             dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
             dataset._alias = payload.get("alias")
@@ -226,21 +163,12 @@ class EvalDatasetClient:
                 f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
                 total=100,
             )
-            request_body = {"dataset_alias": alias, "project_name": project_name}
-
             try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_DELETE_API_URL,
-                    json=request_body,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id,
-                    },
-                    verify=True,
+                self.api_client.delete_dataset(
+                    dataset_alias=alias,
+                    project_name=project_name,
                 )
-                response.raise_for_status()
-            except exceptions.RequestException as e:
+            except Exception as e:
                 judgeval_logger.error(f"Error deleting dataset: {str(e)}")
                 raise
 
@@ -272,70 +200,15 @@ class EvalDatasetClient:
                 "Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
                 total=100,
             )
-            request_body = {"project_name": project_name}
-
             try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
-                    json=request_body,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id,
-                    },
-                    verify=True,
-                )
-                response.raise_for_status()
-            except exceptions.RequestException as e:
+                payload = self.api_client.get_project_dataset_stats(project_name)
+            except Exception as e:
                 judgeval_logger.error(f"Error pulling dataset: {str(e)}")
                 raise
 
-            payload = response.json()
-
             progress.update(
                 task_id,
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
 
             return payload
-
-    def export_jsonl(self, alias: str, project_name: str) -> Response:
-        """Export dataset in JSONL format from Judgment platform"""
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
-                total=100,
-            )
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-                    json={"dataset_alias": alias, "project_name": project_name},
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id,
-                    },
-                    stream=True,
-                    verify=True,
-                )
-                response.raise_for_status()
-            except exceptions.HTTPError as err:
-                if err.response.status_code == 404:
-                    judgeval_logger.error(f"Dataset not found: {alias}")
-                else:
-                    judgeval_logger.error(f"HTTP error during export: {err}")
-                raise
-            except Exception as e:
-                judgeval_logger.error(f"Error during export: {str(e)}")
-                raise
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-
-            return response
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  openapi_new.json
-#   timestamp: 2025-07-
+#   timestamp: 2025-07-17T03:14:16+00:00
 
 from __future__ import annotations
 
@@ -94,9 +94,6 @@ class TraceSpanJudgmentType(BaseModel):
     output: Annotated[Any, Field(title="Output")] = None
     usage: Optional[TraceUsageJudgmentType] = None
     duration: Annotated[Optional[float], Field(title="Duration")] = None
-    annotation: Annotated[Optional[List[Dict[str, Any]]], Field(title="Annotation")] = (
-        None
-    )
     expected_tools: Annotated[
         Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
     ] = None
@@ -176,6 +173,7 @@ class ScoringResultJudgmentType(BaseModel):
     ] = None
     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
     run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
+    evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None
 
 
 class TraceRunJudgmentType(BaseModel):
@@ -184,11 +182,8 @@ class TraceRunJudgmentType(BaseModel):
     traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
     scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
     model: Annotated[str, Field(title="Model")]
-    judgment_api_key: Annotated[Optional[str], Field(title="Judgment Api Key")] = None
     append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[
-        Optional[bool], Field(title="Override Existing Eval Run Name")
-    ] = False
+    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
 
@@ -199,11 +194,8 @@ class JudgmentEvalJudgmentType(BaseModel):
     examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
     scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
     model: Annotated[str, Field(title="Model")]
-    judgment_api_key: Annotated[Optional[str], Field(title="Judgment Api Key")] = None
     append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[
-        Optional[bool], Field(title="Override Existing Eval Run Name")
-    ] = False
+    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
 
 
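The regenerated models mirror the client refactor: judgment_api_key no longer travels inside run payloads, override loses its verbose "Override Existing Eval Run Name" title, and scoring results gain an optional evaluation_cost. A hedged illustration of the 0.1.0 surface (assuming trace_run and result are already-constructed instances of the types above):

```python
# Hypothetical round-trip on an existing TraceRunJudgmentType instance:
payload = trace_run.model_dump(exclude_none=True)
assert "judgment_api_key" not in payload  # field removed; auth moves to the client

# evaluation_cost is Optional[float] on ScoringResultJudgmentType, so treat
# a missing cost as zero when aggregating:
total_cost = result.evaluation_cost or 0.0
```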
judgeval/data/result.py
CHANGED
@@ -30,7 +30,7 @@ class ScoringResult(ScoringResultJudgmentType):
     def __str__(self) -> str:
         return f"ScoringResult(\
             success={self.success}, \
-
+            scorers_data={self.scorers_data}, \
             data_object={self.data_object}, \
             run_duration={self.run_duration})"
 
judgeval/data/scorer_data.py
CHANGED
@@ -54,7 +54,7 @@ def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
             reason=scorer.reason,
             success=scorer.success,
             strict_mode=scorer.strict_mode,
-            evaluation_model=scorer.evaluation_model,
+            evaluation_model=scorer.model,
             error=scorer.error,
             additional_metadata=scorer.additional_metadata,
         )
@@ -68,7 +68,7 @@ def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
             reason=scorer.internal_scorer.reason,
             success=scorer.internal_scorer.success,
             strict_mode=scorer.internal_scorer.strict_mode,
-            evaluation_model=scorer.internal_scorer.evaluation_model,
+            evaluation_model=scorer.internal_scorer.model,
             error=scorer.internal_scorer.error,
             additional_metadata=scorer.internal_scorer.additional_metadata,
         )
judgeval/data/trace.py
CHANGED
@@ -8,6 +8,7 @@ from judgeval.data.judgment_types import (
     TraceSpanJudgmentType,
     TraceJudgmentType,
 )
+from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
 from pydantic import BaseModel
 
 
@@ -55,6 +56,22 @@ class TraceSpan(TraceSpanJudgmentType):
         self.update_id += 1
         return self.update_id
 
+    def set_update_id_to_ending_number(
+        self, ending_number: int = SPAN_LIFECYCLE_END_UPDATE_ID
+    ) -> int:
+        """
+        Thread-safe method to set the update_id to a predetermined ending number.
+
+        Args:
+            ending_number (int): The number to set update_id to. Defaults to SPAN_LIFECYCLE_END_UPDATE_ID.
+
+        Returns:
+            int: The new update_id value after setting
+        """
+        with self._update_id_lock:
+            self.update_id = ending_number
+            return self.update_id
+
     def print_span(self):
         """Print the span with proper formatting and parent relationship information."""
         indent = "  " * self.depth
@@ -73,8 +90,56 @@
 
     def safe_stringify(self, output, function_name):
         """
-        Safely converts an object to a
+        Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
         """
+        # Handle Pydantic models
+        if hasattr(output, "model_dump"):
+            try:
+                return output.model_dump()
+            except Exception:
+                pass
+
+        # Handle LangChain messages and similar objects with content/type
+        if hasattr(output, "content") and hasattr(output, "type"):
+            try:
+                result = {"type": output.type, "content": output.content}
+                # Add additional fields if they exist
+                if hasattr(output, "additional_kwargs"):
+                    result["additional_kwargs"] = output.additional_kwargs
+                if hasattr(output, "response_metadata"):
+                    result["response_metadata"] = output.response_metadata
+                if hasattr(output, "name"):
+                    result["name"] = output.name
+                return result
+            except Exception:
+                pass
+
+        if hasattr(output, "dict"):
+            try:
+                return output.dict()
+            except Exception:
+                pass
+
+        if hasattr(output, "to_dict"):
+            try:
+                return output.to_dict()
+            except Exception:
+                pass
+
+        if hasattr(output, "__dataclass_fields__"):
+            try:
+                import dataclasses
+
+                return dataclasses.asdict(output)
+            except Exception:
+                pass
+
+        if hasattr(output, "__dict__"):
+            try:
+                return output.__dict__
+            except Exception:
+                pass
+
         try:
             return str(output)
         except (TypeError, OverflowError, ValueError):
judgeval/data/trace_run.py
CHANGED
@@ -16,7 +16,6 @@ class TraceRun(BaseModel):
         scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         append (Optional[bool]): Whether to append to existing evaluation results
         tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
@@ -30,8 +29,6 @@ class TraceRun(BaseModel):
     model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     append: Optional[bool] = False
-    # API Key will be "" until user calls client.run_eval(), then API Key will be set
-    judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
     tools: Optional[List[Dict[str, Any]]] = None
judgeval/evaluation_run.py
CHANGED
@@ -17,7 +17,6 @@ class EvaluationRun(BaseModel):
         scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
     """
 
     organization_id: Optional[str] = None
@@ -28,7 +27,6 @@ class EvaluationRun(BaseModel):
     model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
-    judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     append: Optional[bool] = False
 
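Both run models drop judgment_api_key, completing the release's theme: credentials belong to the API client (request headers), not to run objects or their serialized payloads. A hedged construction sketch; examples and scorers are taken from the corresponding JudgmentEvalJudgmentType hunk, and any other required fields sit outside this diff:

```python
from judgeval.evaluation_run import EvaluationRun

# Sketch only: field set inferred from the hunks above.
run = EvaluationRun(
    examples=[],
    scorers=[],
    model="gpt-4.1",
    override=False,
    append=False,
)

# The key is no longer a model field at all:
assert "judgment_api_key" not in EvaluationRun.model_fields
```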