judgeval 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/api/__init__.py +0 -22
- judgeval/api/api_types.py +22 -26
- judgeval/data/judgment_types.py +27 -34
- judgeval/dataset/__init__.py +1 -1
- judgeval/evaluation/__init__.py +9 -21
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +50 -3
- judgeval/tracer/__init__.py +4 -0
- judgeval/version.py +1 -1
- {judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/METADATA +1 -1
- {judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/RECORD +14 -13
- {judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/WHEEL +0 -0
- {judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/__init__.py
CHANGED
@@ -111,16 +111,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def get_evaluation_status(self, experiment_run_id: str, project_name: str) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -318,18 +308,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def get_evaluation_status(
-        self, experiment_run_id: str, project_name: str
-    ) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return await self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     async def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
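Both the sync and async get_evaluation_status methods are removed in 0.15.0. A hedged migration sketch (not part of the diff): callers that polled get_evaluation_status can read the same results from fetch_experiment_run, which the updated polling loop in judgeval/evaluation/__init__.py (further below) now uses. The client construction is omitted here because its arguments are not shown in this diff.

from typing import Any, Dict, List
from judgeval.api import JudgmentSyncClient

def fetch_run_results(
    client: JudgmentSyncClient, experiment_run_id: str, project_name: str
) -> List[Dict[str, Any]]:
    # fetch_experiment_run takes a payload dict, matching the call in the
    # updated _poll_evaluation_until_complete shown later in this diff
    run = client.fetch_experiment_run(
        {
            "experiment_run_id": experiment_run_id,
            "project_name": project_name,
        }
    )
    return run.get("results", [])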
judgeval/api/api_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-30T18:06:51+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -24,6 +24,15 @@ class DatasetsFetch(TypedDict):
     project_name: str


+class DatasetsTableRow(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: Literal["trace", "example"]
+    entries: int
+    creator: str
+
+
 class ProjectAdd(TypedDict):
     project_name: str

@@ -54,6 +63,8 @@ class SavePromptScorerRequest(TypedDict):
     threshold: float
     model: NotRequired[str]
     is_trace: NotRequired[bool]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]


 class SavePromptScorerResponse(TypedDict):
@@ -143,6 +154,8 @@ class PromptScorer(TypedDict):
     prompt: str
     threshold: float
     model: NotRequired[str]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
     created_at: NotRequired[Optional[str]]
     updated_at: NotRequired[Optional[str]]
     is_trace: NotRequired[Optional[bool]]
@@ -176,18 +189,10 @@ class OtelTraceSpan(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
     events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[List[Dict[str, Any]]]]
-    legacy_span_id: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: Any
-    error: NotRequired[Optional[Dict[str, Any]]]
-    agent_id: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]


 class OtelSpanListItemScores(TypedDict):
@@ -202,7 +207,7 @@ class OtelSpanDetailScores(TypedDict):
     score: float
     reason: NotRequired[Optional[str]]
     name: str
-
+    example_id: NotRequired[Optional[str]]


 class ExampleEvaluationRun(TypedDict):
@@ -240,15 +245,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str


-class DatasetInfo(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    dataset_kind: DatasetKind
-    entries: int
-    creator: str
-
-
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
@@ -275,14 +271,14 @@ class OtelTraceListItem(TypedDict):
     organization_id: str
     project_id: str
     trace_id: str
-
+    created_at: str
     duration: NotRequired[Optional[int]]
     has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
-
-    error: NotRequired[
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
@@ -306,9 +302,9 @@ class OtelSpanDetail(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
-    events: NotRequired[Optional[
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
     llm_cost: NotRequired[Optional[float]]
     prompt_tokens: NotRequired[Optional[int]]
@@ -331,5 +327,5 @@ class DatasetReturn(TypedDict):
     name: str
     project_name: str
     dataset_kind: DatasetKind
-    examples: NotRequired[
+    examples: NotRequired[List[Example]]
     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
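A short, hedged illustration (not part of the diff) of the new optional keys on SavePromptScorerRequest. The values are invented, and the name and prompt keys are assumed to be the required fields declared just above the hunk shown here in the generated module.

from judgeval.api.api_types import SavePromptScorerRequest

payload: SavePromptScorerRequest = {
    "name": "helpfulness",                                # assumed required key
    "prompt": "Rate how helpful the response is.",        # assumed required key
    "threshold": 0.7,
    "options": {"helpful": 1.0, "unhelpful": 0.0},        # new in 0.15.0
    "description": "Scores responses for helpfulness.",   # new in 0.15.0
}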
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-30T18:06:50+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -26,6 +26,20 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


+class Kind(Enum):
+    trace = "trace"
+    example = "example"
+
+
+class DatasetsTableRow(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: Annotated[Kind, Field(title="Kind")]
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]

@@ -56,6 +70,8 @@ class SavePromptScorerRequest(BaseModel):
     threshold: Annotated[float, Field(title="Threshold")]
     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+    options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+    description: Annotated[Optional[str], Field(title="Description")] = None


 class SavePromptScorerResponse(BaseModel):
@@ -156,6 +172,8 @@ class PromptScorer(BaseModel):
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+    options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+    description: Annotated[Optional[str], Field(title="Description")] = None
     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
@@ -195,22 +213,10 @@ class OtelTraceSpan(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
-    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")]
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )


 class OtelSpanListItemScores(BaseModel):
@@ -225,7 +231,7 @@ class OtelSpanDetailScores(BaseModel):
     score: Annotated[float, Field(title="Score")]
     reason: Annotated[Optional[str], Field(title="Reason")] = None
     name: Annotated[str, Field(title="Name")]
-
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None


 class ExampleEvaluationRun(BaseModel):
@@ -273,15 +279,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


-class DatasetInfo(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    dataset_kind: DatasetKind
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
@@ -310,16 +307,14 @@ class OtelTraceListItem(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
-
+    created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
     has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
-
-
-    ] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    error: Annotated[Optional[str], Field(title="Error")] = ""
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
@@ -334,7 +329,7 @@ class OtelTraceListItem(BaseModel):
 class OtelSpanDetail(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
-    timestamp: Annotated[
+    timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
     trace_id: Annotated[str, Field(title="Trace Id")]
     span_id: Annotated[str, Field(title="Span Id")]
     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
@@ -349,11 +344,9 @@ class OtelSpanDetail(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
-    events: Annotated[
-        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
-    ] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[
         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
     ] = None
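A hedged illustration (not from the diff) of validating the new generated DatasetsTableRow model. The field names come from the hunk above, the sample values are invented, and the string-to-Kind coercion relies on standard pydantic enum-by-value handling.

from judgeval.data.judgment_types import DatasetsTableRow

row = DatasetsTableRow(
    dataset_id="ds_123",                      # hypothetical id
    name="my-dataset",
    created_at="2025-09-30T18:06:50+00:00",
    kind="example",                           # coerced into the Kind enum by value
    entries=42,
    creator="alice@example.com",
)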
judgeval/dataset/__init__.py
CHANGED
judgeval/evaluation/__init__.py
CHANGED
@@ -84,7 +84,7 @@ def log_evaluation_results(

 def _poll_evaluation_until_complete(
     evaluation_run: ExampleEvaluationRun,
-
+    expected_examples_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
@@ -117,29 +117,22 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
-                experiment_run_id, project_name
-            )
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            example_scorer_pairings = status_response.get("results", [])
-            if len(example_scorer_pairings) != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
             results_response = api_client.fetch_experiment_run(
                 {
                     "experiment_run_id": experiment_run_id,
                     "project_name": project_name,
                 }
             )
+
+            example_scorer_pairings = results_response.get("results", [])
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
             url = results_response.get("ui_results_url")

             scoring_result_list = []
-            for res in
+            for res in example_scorer_pairings:
                 example = res.get("data", {}).copy()
                 example["example_id"] = res.get("example_id")
                 scoring_result = ScoringResult(
@@ -241,14 +234,9 @@ def run_eval(
             )
             raise JudgmentRuntimeError(error_message)

-        num_scorers = (
-            len(evaluation_run.judgment_scorers)
-            if evaluation_run.judgment_scorers
-            else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
-        )
         results, url = _poll_evaluation_until_complete(
             evaluation_run=evaluation_run,
-
+            expected_examples_count=len(evaluation_run.examples),
         )
     finally:
         stop_event.set()
judgeval/integrations/openlit/__init__.py
ADDED
@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
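A minimal usage sketch for the new integration (not part of the diff). The Tracer constructor arguments are an assumption, since they are not shown here, and credentials are assumed to come from the usual JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment variables; extra keyword arguments passed to Openlit.initialize() are forwarded to openlit.init().

from judgeval.tracer import Tracer
from judgeval.integrations.openlit import Openlit

# Assumed constructor signature; the Tracer singleton must be initialized
# first, otherwise Openlit.initialize() raises ValueError (see the code above).
tracer = Tracer(project_name="my-project")
Openlit.initialize()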
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -4,20 +4,23 @@ from judgeval.scorers.api_scorer import (
     TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any
+from typing import Dict, Any, Optional
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
 from judgeval.logger import judgeval_logger
 from abc import ABC
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from copy import copy


 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
+    options: Optional[Dict[str, float]] = None,
     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     is_trace: bool = False,
@@ -29,7 +32,9 @@ def push_prompt_scorer(
             "name": name,
             "prompt": prompt,
             "threshold": threshold,
+            "options": options,
             "model": model,
+            "description": description,
             "is_trace": is_trace,
         }
     )
@@ -98,6 +103,8 @@ def scorer_exists(
 class BasePromptScorer(ABC, APIScorerConfig):
     score_type: APIScorerType
     prompt: str
+    options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

@@ -124,7 +131,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
+            options=scorer_config.get("options"),
             model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -135,7 +144,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
         name: str,
         prompt: str,
         threshold: float = 0.5,
+        options: Optional[Dict[str, float]] = None,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -150,7 +161,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name,
             prompt,
             threshold,
+            options,
             model,
+            description,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -161,7 +174,9 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=prompt,
             threshold=threshold,
+            options=options,
             model=model,
+            description=description,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -199,6 +214,22 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated model for {self.name}")

+    def set_options(self, options: Optional[Dict[str, float]]):
+        """
+        Updates the options of the scorer.
+        """
+        self.options = options
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated options for {self.name}")
+
+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
     def append_to_prompt(self, prompt_addition: str):
         """
         Appends a string to the prompt.
@@ -226,7 +257,19 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return self.model

-    def get_name(self) -> str:
+    def get_options(self) -> Dict[str, float] | None:
+        """
+        Returns the options of the scorer.
+        """
+        return copy(self.options) if self.options is not None else None
+
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -241,6 +284,8 @@ class BasePromptScorer(ABC, APIScorerConfig):
             "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
+            "options": self.options,
+            "description": self.description,
         }

     def push_prompt_scorer(self):
@@ -251,14 +296,16 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
+            self.options,
             self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
             isinstance(self, TracePromptScorer),
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
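A hedged usage sketch of the new options and description support (values invented). push_prompt_scorer() is the module-level helper whose updated signature appears at the top of this file's diff; credentials default to the JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment variables per that signature, and the set_options()/get_options() methods shown above change the same fields on an existing scorer instance.

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
    push_prompt_scorer,
)

push_prompt_scorer(
    name="helpfulness",
    prompt="Rate how helpful the response is.",
    threshold=0.7,
    options={"helpful": 1.0, "unhelpful": 0.0},         # new in 0.15.0
    description="Scores responses for helpfulness.",    # new in 0.15.0
)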
judgeval/tracer/__init__.py
CHANGED
@@ -255,6 +255,10 @@ class Tracer(metaclass=SingletonMeta):
     def get_current_agent_context(self):
         return self.agent_context

+    def get_span_processor(self) -> JudgmentSpanProcessor:
+        """Get the internal span processor of this tracer instance."""
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
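A small, hedged sketch of the new accessor (not part of the diff). It assumes JudgmentSpanProcessor follows the standard OpenTelemetry SpanProcessor interface, so force_flush() is available; Tracer.get_instance() is the same lookup used by the Openlit integration above.

from judgeval.tracer import Tracer

tracer = Tracer.get_instance()
if tracer is not None:
    processor = tracer.get_span_processor()
    # Assumed OTel SpanProcessor API: flush spans still buffered in the
    # processor, e.g. before the process exits.
    processor.force_flush()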
judgeval/version.py
CHANGED
{judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/RECORD
CHANGED
@@ -4,22 +4,23 @@ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
 judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
 judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
 judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
-judgeval/version.py,sha256=
+judgeval/version.py,sha256=1a6hS0-ubylneLxq8Pt0EqBRx0hSP1cO9JKaTmHazfo,74
 judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
-judgeval/api/__init__.py,sha256=
-judgeval/api/api_types.py,sha256=
+judgeval/api/__init__.py,sha256=_oDuEDBDmyPQkdfvWebvBSvrnlzg4vreETpt16frXEA,12468
+judgeval/api/api_types.py,sha256=hpUpVRCLIGF-lHHg1gIgdTaRfwS94Vh1E23vU9Z34js,8555
 judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
 judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
 judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
-judgeval/data/judgment_types.py,sha256=
+judgeval/data/judgment_types.py,sha256=u45rfHEtUNzXSQstJ4TcOo-yX9cZymma5W0hTtb5u34,15965
 judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
 judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
 judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
 judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
 judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/dataset/__init__.py,sha256=
-judgeval/evaluation/__init__.py,sha256=
+judgeval/dataset/__init__.py,sha256=kL0_tIMP3qV6t4W17HQU91ybdXMZ5iDZzyUKzyfRdyY,8269
+judgeval/evaluation/__init__.py,sha256=WcqOgQdwgtc_BwEwDz6RDlF2RczyLrNjjIevQp-_NKE,12788
 judgeval/integrations/langgraph/__init__.py,sha256=HwXmtDxaO75Kn4KPErnMb6Ne6FcpRxV_SCYVuwFsve0,332
+judgeval/integrations/openlit/__init__.py,sha256=-8D4D6-fGsWPwoOojw82OaE9X5sUbmb16x1bF-WfOmg,1571
 judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
 judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
 judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
@@ -39,8 +40,8 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFy
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=
-judgeval/tracer/__init__.py,sha256=
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=zJ0n3HyZ1FFBnMnTYxBi37m_3Er7ENd4HpqLjNi5Eag,10902
+judgeval/tracer/__init__.py,sha256=uIOx-2P_FVwBKhwVkkIOyEQCv3gouCZ2I8-eApocnKU,36165
 judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
 judgeval/tracer/keys.py,sha256=ho4-_w4ngTVejdSKUH80sG6vtYt4c7FEKrYpFrDfPLs,2105
 judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320
@@ -72,8 +73,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
 judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
 judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
 judgeval/utils/version_check.py,sha256=ylZQSqV7kLzEOChxvav9SCHUU4OnaCp36tXHLjdzmw0,1072
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
+judgeval-0.15.0.dist-info/METADATA,sha256=MT857VBF8qoWXiCu_NyK_JCBcrddN1kCSWxDd58D3g0,8564
+judgeval-0.15.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.15.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.15.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.15.0.dist-info/RECORD,,
{judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/WHEEL
File without changes
{judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/entry_points.txt
File without changes
{judgeval-0.14.0.dist-info → judgeval-0.15.0.dist-info}/licenses/LICENSE.md
File without changes