judgeval 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. judgeval/api/__init__.py +4 -18
  2. judgeval/api/api_types.py +18 -2
  3. judgeval/data/judgment_types.py +18 -2
  4. judgeval/logger.py +1 -1
  5. judgeval/tracer/__init__.py +10 -7
  6. judgeval/tracer/keys.py +7 -3
  7. judgeval/tracer/llm/__init__.py +2 -1227
  8. judgeval/tracer/llm/config.py +110 -0
  9. judgeval/tracer/llm/constants.py +10 -0
  10. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  11. judgeval/tracer/llm/llm_anthropic/wrapper.py +611 -0
  12. judgeval/tracer/llm/llm_google/__init__.py +0 -0
  13. judgeval/tracer/llm/llm_google/config.py +24 -0
  14. judgeval/tracer/llm/llm_google/wrapper.py +426 -0
  15. judgeval/tracer/llm/llm_groq/__init__.py +0 -0
  16. judgeval/tracer/llm/llm_groq/config.py +23 -0
  17. judgeval/tracer/llm/llm_groq/wrapper.py +477 -0
  18. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  19. judgeval/tracer/llm/llm_openai/wrapper.py +637 -0
  20. judgeval/tracer/llm/llm_together/__init__.py +0 -0
  21. judgeval/tracer/llm/llm_together/config.py +23 -0
  22. judgeval/tracer/llm/llm_together/wrapper.py +478 -0
  23. judgeval/tracer/llm/providers.py +5 -5
  24. judgeval/tracer/processors/__init__.py +1 -1
  25. judgeval/trainer/console.py +1 -1
  26. judgeval/utils/decorators/__init__.py +0 -0
  27. judgeval/utils/decorators/dont_throw.py +21 -0
  28. judgeval/utils/{decorators.py → decorators/use_once.py} +0 -11
  29. judgeval/utils/meta.py +1 -1
  30. judgeval/utils/version_check.py +1 -1
  31. judgeval/version.py +1 -1
  32. judgeval-0.16.1.dist-info/METADATA +266 -0
  33. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/RECORD +38 -24
  34. judgeval/tracer/llm/google/__init__.py +0 -21
  35. judgeval/tracer/llm/groq/__init__.py +0 -20
  36. judgeval/tracer/llm/together/__init__.py +0 -20
  37. judgeval-0.15.0.dist-info/METADATA +0 -158
  38. /judgeval/tracer/llm/{anthropic/__init__.py → llm_anthropic/config.py} +0 -0
  39. /judgeval/tracer/llm/{openai/__init__.py → llm_openai/config.py} +0 -0
  40. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/WHEEL +0 -0
  41. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/entry_points.txt +0 -0
  42. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/__init__.py CHANGED
@@ -73,7 +73,7 @@ class JudgmentSyncClient:
73
73
 
74
74
  def evaluate_examples(
75
75
  self, payload: ExampleEvaluationRun, stream: Optional[str] = None
76
- ) -> Any:
76
+ ) -> EvaluateResponse:
77
77
  query_params = {}
78
78
  if stream is not None:
79
79
  query_params["stream"] = stream
@@ -86,7 +86,7 @@ class JudgmentSyncClient:
86
86
 
87
87
  def evaluate_traces(
88
88
  self, payload: TraceEvaluationRun, stream: Optional[str] = None
89
- ) -> Any:
89
+ ) -> EvaluateResponse:
90
90
  query_params = {}
91
91
  if stream is not None:
92
92
  query_params["stream"] = stream
@@ -212,13 +212,6 @@ class JudgmentSyncClient:
212
212
  payload,
213
213
  )
214
214
 
215
- def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
216
- return self._request(
217
- "POST",
218
- url_for("/e2e_fetch_trace_scorer_span_score/"),
219
- payload,
220
- )
221
-
222
215
 
223
216
  class JudgmentAsyncClient:
224
217
  __slots__ = ("api_key", "organization_id", "client")
@@ -270,7 +263,7 @@ class JudgmentAsyncClient:
270
263
 
271
264
  async def evaluate_examples(
272
265
  self, payload: ExampleEvaluationRun, stream: Optional[str] = None
273
- ) -> Any:
266
+ ) -> EvaluateResponse:
274
267
  query_params = {}
275
268
  if stream is not None:
276
269
  query_params["stream"] = stream
@@ -283,7 +276,7 @@ class JudgmentAsyncClient:
283
276
 
284
277
  async def evaluate_traces(
285
278
  self, payload: TraceEvaluationRun, stream: Optional[str] = None
286
- ) -> Any:
279
+ ) -> EvaluateResponse:
287
280
  query_params = {}
288
281
  if stream is not None:
289
282
  query_params["stream"] = stream
@@ -411,13 +404,6 @@ class JudgmentAsyncClient:
411
404
  payload,
412
405
  )
413
406
 
414
- async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
415
- return await self._request(
416
- "POST",
417
- url_for("/e2e_fetch_trace_scorer_span_score/"),
418
- payload,
419
- )
420
-
421
407
 
422
408
  __all__ = [
423
409
  "JudgmentSyncClient",
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # generated by datamodel-codegen:
2
2
  # filename: .openapi.json
3
- # timestamp: 2025-09-30T18:06:51+00:00
3
+ # timestamp: 2025-10-09T00:16:42+00:00
4
4
 
5
5
  from __future__ import annotations
6
6
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -94,6 +94,7 @@ class ResolveProjectNameRequest(TypedDict):
94
94
 
95
95
  class ResolveProjectNameResponse(TypedDict):
96
96
  project_id: str
97
+ project_created: bool
97
98
 
98
99
 
99
100
  class TraceIdRequest(TypedDict):
@@ -146,6 +147,14 @@ class ValidationError(TypedDict):
146
147
  type: str
147
148
 
148
149
 
150
+ class UsageInfo(TypedDict):
151
+ total_judgees: int
152
+ regular_use: int
153
+ pay_as_you_go_use: int
154
+ remaining_regular: int
155
+ remaining_after: int
156
+
157
+
149
158
  DatasetKind = Literal["trace", "example"]
150
159
 
151
160
 
@@ -273,7 +282,6 @@ class OtelTraceListItem(TypedDict):
273
282
  trace_id: str
274
283
  created_at: str
275
284
  duration: NotRequired[Optional[int]]
276
- has_notification: NotRequired[Optional[bool]]
277
285
  tags: NotRequired[Optional[List[str]]]
278
286
  experiment_run_id: NotRequired[Optional[str]]
279
287
  span_name: NotRequired[Optional[str]]
@@ -281,6 +289,8 @@ class OtelTraceListItem(TypedDict):
281
289
  error: NotRequired[str]
282
290
  scores: NotRequired[List[OtelSpanListItemScores]]
283
291
  customer_id: NotRequired[Optional[str]]
292
+ input: NotRequired[Optional[str]]
293
+ output: NotRequired[Optional[str]]
284
294
  input_preview: NotRequired[Optional[str]]
285
295
  output_preview: NotRequired[Optional[str]]
286
296
  annotation_count: NotRequired[int]
@@ -312,6 +322,12 @@ class OtelSpanDetail(TypedDict):
312
322
  scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
313
323
 
314
324
 
325
+ class EvaluateResponse(TypedDict):
326
+ status: str
327
+ results: List[ScoringResult]
328
+ resource_usage: NotRequired[Optional[UsageInfo]]
329
+
330
+
315
331
  class EvalResults(TypedDict):
316
332
  results: List[ScoringResult]
317
333
  run: Union[ExampleEvaluationRun, TraceEvaluationRun]
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # generated by datamodel-codegen:
2
2
  # filename: .openapi.json
3
- # timestamp: 2025-09-30T18:06:50+00:00
3
+ # timestamp: 2025-10-09T00:16:41+00:00
4
4
 
5
5
  from __future__ import annotations
6
6
  from typing import Annotated, Any, Dict, List, Optional, Union
@@ -101,6 +101,7 @@ class ResolveProjectNameRequest(BaseModel):
101
101
 
102
102
  class ResolveProjectNameResponse(BaseModel):
103
103
  project_id: Annotated[str, Field(title="Project Id")]
104
+ project_created: Annotated[bool, Field(title="Project Created")]
104
105
 
105
106
 
106
107
  class TraceIdRequest(BaseModel):
@@ -162,6 +163,14 @@ class ValidationError(BaseModel):
162
163
  type: Annotated[str, Field(title="Error Type")]
163
164
 
164
165
 
166
+ class UsageInfo(BaseModel):
167
+ total_judgees: Annotated[int, Field(title="Total Judgees")]
168
+ regular_use: Annotated[int, Field(title="Regular Use")]
169
+ pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
170
+ remaining_regular: Annotated[int, Field(title="Remaining Regular")]
171
+ remaining_after: Annotated[int, Field(title="Remaining After")]
172
+
173
+
165
174
  class DatasetKind(Enum):
166
175
  trace = "trace"
167
176
  example = "example"
@@ -309,7 +318,6 @@ class OtelTraceListItem(BaseModel):
309
318
  trace_id: Annotated[str, Field(title="Trace Id")]
310
319
  created_at: Annotated[AwareDatetime, Field(title="Created At")]
311
320
  duration: Annotated[Optional[int], Field(title="Duration")] = None
312
- has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
313
321
  tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
314
322
  experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
315
323
  span_name: Annotated[Optional[str], Field(title="Span Name")] = None
@@ -319,6 +327,8 @@ class OtelTraceListItem(BaseModel):
319
327
  Optional[List[OtelSpanListItemScores]], Field(title="Scores")
320
328
  ] = []
321
329
  customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
330
+ input: Annotated[Optional[str], Field(title="Input")] = None
331
+ output: Annotated[Optional[str], Field(title="Output")] = None
322
332
  input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
323
333
  output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
324
334
  annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
@@ -358,6 +368,12 @@ class OtelSpanDetail(BaseModel):
358
368
  )
359
369
 
360
370
 
371
+ class EvaluateResponse(BaseModel):
372
+ status: Annotated[str, Field(title="Status")]
373
+ results: Annotated[List[ScoringResult], Field(title="Results")]
374
+ resource_usage: Optional[UsageInfo] = None
375
+
376
+
361
377
  class EvalResults(BaseModel):
362
378
  results: Annotated[List[ScoringResult], Field(title="Results")]
363
379
  run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
judgeval/logger.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
2
  import sys
3
3
 
4
4
  from judgeval.env import JUDGMENT_NO_COLOR
5
- from judgeval.utils.decorators import use_once
5
+ from judgeval.utils.decorators.use_once import use_once
6
6
 
7
7
  RESET = "\033[0m"
8
8
  RED = "\033[31m"
judgeval/tracer/__init__.py CHANGED
@@ -55,7 +55,7 @@ from judgeval.tracer.managers import (
55
55
  sync_agent_context,
56
56
  async_agent_context,
57
57
  )
58
- from judgeval.utils.decorators import dont_throw
58
+ from judgeval.utils.decorators.dont_throw import dont_throw
59
59
  from judgeval.utils.guards import expect_api_key, expect_organization_id
60
60
  from judgeval.utils.serialize import safe_serialize
61
61
  from judgeval.utils.meta import SingletonMeta
@@ -159,11 +159,14 @@ class Tracer(metaclass=SingletonMeta):
159
159
 
160
160
  self.judgment_processor = NoOpJudgmentSpanProcessor()
161
161
  if self.enable_monitoring:
162
- project_id = Tracer._resolve_project_id(
162
+ project_id, project_created = Tracer._resolve_project_id(
163
163
  self.project_name, self.api_key, self.organization_id
164
- )
165
-
164
+ ) or (None, False)
166
165
  if project_id:
166
+ if project_created:
167
+ judgeval_logger.info(
168
+ f"Project {self.project_name} was autocreated successfully."
169
+ )
167
170
  self.judgment_processor = self.get_processor(
168
171
  tracer=self,
169
172
  project_name=self.project_name,
@@ -179,7 +182,7 @@ class Tracer(metaclass=SingletonMeta):
179
182
  set_tracer_provider(provider)
180
183
  else:
181
184
  judgeval_logger.error(
182
- f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
185
+ f"Failed to resolve or autocreate project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
183
186
  )
184
187
 
185
188
  self.tracer = get_tracer_provider().get_tracer(
@@ -237,14 +240,14 @@ class Tracer(metaclass=SingletonMeta):
237
240
  @staticmethod
238
241
  def _resolve_project_id(
239
242
  project_name: str, api_key: str, organization_id: str
240
- ) -> str | None:
243
+ ) -> Tuple[str, bool]:
241
244
  """Resolve project_id from project_name using the API."""
242
245
  client = JudgmentSyncClient(
243
246
  api_key=api_key,
244
247
  organization_id=organization_id,
245
248
  )
246
249
  response = client.projects_resolve({"project_name": project_name})
247
- return response["project_id"]
250
+ return response["project_id"], response["project_created"]
248
251
 
249
252
  def get_current_span(self):
250
253
  return get_current_span()
judgeval/tracer/keys.py CHANGED
@@ -12,6 +12,8 @@ class AttributeKeys(str, Enum):
12
12
  JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
13
13
  JUDGMENT_UPDATE_ID = "judgment.update_id"
14
14
 
15
+ JUDGMENT_USAGE_METADATA = "judgment.usage.metadata"
16
+
15
17
  JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
16
18
 
17
19
  JUDGMENT_AGENT_ID = "judgment.agent_id"
@@ -31,13 +33,15 @@ class AttributeKeys(str, Enum):
31
33
  GEN_AI_SYSTEM = "gen_ai.system"
32
34
  GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
33
35
  GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
34
- GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
36
+ GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
37
+ "gen_ai.usage.cache_creation_input_tokens"
38
+ )
39
+ GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
40
+
35
41
  GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
36
42
  GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
37
43
  GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
38
44
 
39
- GEN_AI_USAGE_TOTAL_COST = "gen_ai.usage.total_cost_usd"
40
-
41
45
 
42
46
  class InternalAttributeKeys(str, Enum):
43
47
  """