judgeval 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/api/__init__.py +4 -18
- judgeval/api/api_types.py +18 -2
- judgeval/data/judgment_types.py +18 -2
- judgeval/logger.py +1 -1
- judgeval/tracer/__init__.py +10 -7
- judgeval/tracer/keys.py +7 -3
- judgeval/tracer/llm/__init__.py +2 -1227
- judgeval/tracer/llm/config.py +110 -0
- judgeval/tracer/llm/constants.py +10 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +611 -0
- judgeval/tracer/llm/llm_google/__init__.py +0 -0
- judgeval/tracer/llm/llm_google/config.py +24 -0
- judgeval/tracer/llm/llm_google/wrapper.py +426 -0
- judgeval/tracer/llm/llm_groq/__init__.py +0 -0
- judgeval/tracer/llm/llm_groq/config.py +23 -0
- judgeval/tracer/llm/llm_groq/wrapper.py +477 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +637 -0
- judgeval/tracer/llm/llm_together/__init__.py +0 -0
- judgeval/tracer/llm/llm_together/config.py +23 -0
- judgeval/tracer/llm/llm_together/wrapper.py +478 -0
- judgeval/tracer/llm/providers.py +5 -5
- judgeval/tracer/processors/__init__.py +1 -1
- judgeval/trainer/console.py +1 -1
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +21 -0
- judgeval/utils/{decorators.py → decorators/use_once.py} +0 -11
- judgeval/utils/meta.py +1 -1
- judgeval/utils/version_check.py +1 -1
- judgeval/version.py +1 -1
- judgeval-0.16.1.dist-info/METADATA +266 -0
- {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/RECORD +38 -24
- judgeval/tracer/llm/google/__init__.py +0 -21
- judgeval/tracer/llm/groq/__init__.py +0 -20
- judgeval/tracer/llm/together/__init__.py +0 -20
- judgeval-0.15.0.dist-info/METADATA +0 -158
- /judgeval/tracer/llm/{anthropic/__init__.py → llm_anthropic/config.py} +0 -0
- /judgeval/tracer/llm/{openai/__init__.py → llm_openai/config.py} +0 -0
- {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/WHEEL +0 -0
- {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/entry_points.txt +0 -0
- {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/licenses/LICENSE.md +0 -0
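Two entries in the list above are worth calling out: judgeval/utils/decorators.py is split into a package (use_once.py keeps the old contents), and a new dont_throw decorator lands in judgeval/utils/decorators/dont_throw.py (+21 lines; its body is not included in this diff, though the tracer imports it below). A minimal sketch of what a decorator with that contract conventionally looks like; the logging call and return behavior here are assumptions, not the package's actual code:

import functools
from typing import Any, Callable, Optional, TypeVar

F = TypeVar("F", bound=Callable[..., Any])


def dont_throw(func: F) -> F:
    # Hypothetical: catch everything so instrumentation never crashes the host app.
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            # Assumption: the real module logs via judgeval's logger instead of print.
            print(f"suppressed error in {func.__name__}: {exc}")
            return None

    return wrapper  # type: ignore[return-value]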
judgeval/api/__init__.py
CHANGED
@@ -73,7 +73,7 @@ class JudgmentSyncClient:
 
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -86,7 +86,7 @@ class JudgmentSyncClient:
 
     def evaluate_traces(
         self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -212,13 +212,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-
 
 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -270,7 +263,7 @@ class JudgmentAsyncClient:
 
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -283,7 +276,7 @@ class JudgmentAsyncClient:
 
     async def evaluate_traces(
         self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) ->
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -411,13 +404,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-
 
 __all__ = [
     "JudgmentSyncClient",
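Net effect for callers of this module: evaluate_examples and evaluate_traces on both clients now advertise an EvaluateResponse return type (defined in api_types.py below), while the e2e_fetch_trace_scorer_span_score wrapper disappears from both. A hedged call-site sketch, assuming ExampleEvaluationRun lives in the generated api_types module alongside the types shown here; the payload construction is not shown in this diff and is left as a placeholder:

from judgeval.api import JudgmentSyncClient
from judgeval.api.api_types import ExampleEvaluationRun

client = JudgmentSyncClient(api_key="...", organization_id="...")
payload: ExampleEvaluationRun = ...  # placeholder; fields not shown in this diff

# The annotation lets type checkers verify these key accesses:
response = client.evaluate_examples(payload, stream=None)
print(response["status"], len(response["results"]))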
judgeval/api/api_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-
+# timestamp: 2025-10-09T00:16:42+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -94,6 +94,7 @@ class ResolveProjectNameRequest(TypedDict):
 
 class ResolveProjectNameResponse(TypedDict):
     project_id: str
+    project_created: bool
 
 
 class TraceIdRequest(TypedDict):
@@ -146,6 +147,14 @@ class ValidationError(TypedDict):
     type: str
 
 
+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
 DatasetKind = Literal["trace", "example"]
 
 
@@ -273,7 +282,6 @@ class OtelTraceListItem(TypedDict):
     trace_id: str
     created_at: str
     duration: NotRequired[Optional[int]]
-    has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
@@ -281,6 +289,8 @@ class OtelTraceListItem(TypedDict):
     error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
     output_preview: NotRequired[Optional[str]]
     annotation_count: NotRequired[int]
@@ -312,6 +322,12 @@ class OtelSpanDetail(TypedDict):
     scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
 
 
+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
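Because EvaluateResponse and UsageInfo are plain TypedDicts, the new fields read like ordinary dict keys. A small consumer sketch that touches only fields introduced in this diff:

from typing import Optional

from judgeval.api.api_types import EvaluateResponse, UsageInfo


def summarize(response: EvaluateResponse) -> str:
    # resource_usage is NotRequired, so .get() is the safe accessor.
    usage: Optional[UsageInfo] = response.get("resource_usage")
    line = f"{response['status']}: {len(response['results'])} result(s)"
    if usage is not None:
        line += f" (remaining_regular={usage['remaining_regular']})"
    return line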
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-
+# timestamp: 2025-10-09T00:16:41+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -101,6 +101,7 @@ class ResolveProjectNameRequest(BaseModel):
 
 class ResolveProjectNameResponse(BaseModel):
     project_id: Annotated[str, Field(title="Project Id")]
+    project_created: Annotated[bool, Field(title="Project Created")]
 
 
 class TraceIdRequest(BaseModel):
@@ -162,6 +163,14 @@ class ValidationError(BaseModel):
     type: Annotated[str, Field(title="Error Type")]
 
 
+class UsageInfo(BaseModel):
+    total_judgees: Annotated[int, Field(title="Total Judgees")]
+    regular_use: Annotated[int, Field(title="Regular Use")]
+    pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+    remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+    remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
 class DatasetKind(Enum):
     trace = "trace"
     example = "example"
@@ -309,7 +318,6 @@ class OtelTraceListItem(BaseModel):
     trace_id: Annotated[str, Field(title="Trace Id")]
     created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
@@ -319,6 +327,8 @@ class OtelTraceListItem(BaseModel):
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input: Annotated[Optional[str], Field(title="Input")] = None
+    output: Annotated[Optional[str], Field(title="Output")] = None
     input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
     output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
     annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
@@ -358,6 +368,12 @@ class OtelSpanDetail(BaseModel):
     )
 
 
+class EvaluateResponse(BaseModel):
+    status: Annotated[str, Field(title="Status")]
+    results: Annotated[List[ScoringResult], Field(title="Results")]
+    resource_usage: Optional[UsageInfo] = None
+
+
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
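The pydantic mirror behaves the same way, with resource_usage defaulting to None. A quick round-trip sketch (assuming pydantic v2, which the Annotated/AwareDatetime style of this file suggests):

from judgeval.data.judgment_types import EvaluateResponse

resp = EvaluateResponse(status="complete", results=[])
assert resp.resource_usage is None  # default per the diff
print(resp.model_dump())  # {'status': 'complete', 'results': [], 'resource_usage': None}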
judgeval/logger.py
CHANGED
judgeval/tracer/__init__.py
CHANGED
@@ -55,7 +55,7 @@ from judgeval.tracer.managers import (
     sync_agent_context,
     async_agent_context,
 )
-from judgeval.utils.decorators import dont_throw
+from judgeval.utils.decorators.dont_throw import dont_throw
 from judgeval.utils.guards import expect_api_key, expect_organization_id
 from judgeval.utils.serialize import safe_serialize
 from judgeval.utils.meta import SingletonMeta
@@ -159,11 +159,14 @@ class Tracer(metaclass=SingletonMeta):
 
         self.judgment_processor = NoOpJudgmentSpanProcessor()
         if self.enable_monitoring:
-            project_id = Tracer._resolve_project_id(
+            project_id, project_created = Tracer._resolve_project_id(
                 self.project_name, self.api_key, self.organization_id
-            )
-
+            ) or (None, False)
             if project_id:
+                if project_created:
+                    judgeval_logger.info(
+                        f"Project {self.project_name} was autocreated successfully."
+                    )
                 self.judgment_processor = self.get_processor(
                     tracer=self,
                     project_name=self.project_name,
@@ -179,7 +182,7 @@ class Tracer(metaclass=SingletonMeta):
             set_tracer_provider(provider)
         else:
             judgeval_logger.error(
-                f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
+                f"Failed to resolve or autocreate project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
             )
 
         self.tracer = get_tracer_provider().get_tracer(
@@ -237,14 +240,14 @@ class Tracer(metaclass=SingletonMeta):
     @staticmethod
     def _resolve_project_id(
         project_name: str, api_key: str, organization_id: str
-    ) -> str
+    ) -> Tuple[str, bool]:
         """Resolve project_id from project_name using the API."""
         client = JudgmentSyncClient(
             api_key=api_key,
             organization_id=organization_id,
         )
         response = client.projects_resolve({"project_name": project_name})
-        return response["project_id"]
+        return response["project_id"], response["project_created"]
 
     def get_current_span(self):
         return get_current_span()
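The `or (None, False)` guard is there because the resolver can evidently come back falsy instead of raising (consistent with the dont_throw import above); unpacking None directly would be a TypeError. A standalone illustration of the pattern with a stand-in resolver:

from typing import Optional, Tuple


def resolve(ok: bool) -> Optional[Tuple[str, bool]]:
    # Stand-in for Tracer._resolve_project_id: a dont_throw-style wrapper
    # would return None on failure rather than raising.
    return ("proj_123", True) if ok else None


project_id, project_created = resolve(ok=False) or (None, False)
assert (project_id, project_created) == (None, False)

project_id, project_created = resolve(ok=True) or (None, False)
assert (project_id, project_created) == ("proj_123", True)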
judgeval/tracer/keys.py
CHANGED
@@ -12,6 +12,8 @@ class AttributeKeys(str, Enum):
     JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
     JUDGMENT_UPDATE_ID = "judgment.update_id"
 
+    JUDGMENT_USAGE_METADATA = "judgment.usage.metadata"
+
     JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
 
     JUDGMENT_AGENT_ID = "judgment.agent_id"
@@ -31,13 +33,15 @@ class AttributeKeys(str, Enum):
     GEN_AI_SYSTEM = "gen_ai.system"
     GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
     GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
-
+    GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+        "gen_ai.usage.cache_creation_input_tokens"
+    )
+    GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
+
     GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
     GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
     GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
 
-    GEN_AI_USAGE_TOTAL_COST = "gen_ai.usage.total_cost_usd"
-
 
 class InternalAttributeKeys(str, Enum):
     """
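AttributeKeys subclasses str, so members (or their .value) can be passed wherever OpenTelemetry expects an attribute name. A hedged sketch recording the two new cache-token keys; the span name and token counts are illustrative only:

from opentelemetry import trace

from judgeval.tracer.keys import AttributeKeys

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("llm.call") as span:
    # .value yields the plain "gen_ai.usage.*" string for the OTel API.
    span.set_attribute(AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS.value, 128)
    span.set_attribute(AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS.value, 2048)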