judgeval 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. judgeval/__init__.py +2 -2
  2. judgeval/api/__init__.py +28 -96
  3. judgeval/api/api_types.py +49 -140
  4. judgeval/constants.py +1 -5
  5. judgeval/data/__init__.py +1 -3
  6. judgeval/data/example.py +4 -2
  7. judgeval/data/judgment_types.py +57 -165
  8. judgeval/data/result.py +1 -2
  9. judgeval/data/trace.py +14 -40
  10. judgeval/dataset/__init__.py +15 -42
  11. judgeval/evaluation/__init__.py +23 -34
  12. judgeval/scorers/__init__.py +9 -7
  13. judgeval/scorers/api_scorer.py +8 -0
  14. judgeval/scorers/base_scorer.py +0 -1
  15. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  18. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  19. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  20. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
  21. judgeval/tracer/__init__.py +13 -50
  22. judgeval/tracer/local_eval_queue.py +2 -2
  23. judgeval/tracer/processors/__init__.py +1 -1
  24. judgeval/tracer/utils.py +1 -1
  25. judgeval/trainer/trainer.py +4 -4
  26. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/METADATA +1 -1
  27. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/RECORD +30 -35
  28. judgeval/data/trace_run.py +0 -39
  29. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  32. judgeval/scorers/trace_api_scorer.py +0 -5
  33. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/WHEEL +0 -0
  34. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/entry_points.txt +0 -0
  35. {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -6,7 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
 from typing import List, Optional, Union
-from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +38,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
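The user-facing change in this file is the rename of APIScorerConfig to ExampleAPIScorerConfig in run_evaluation's signature. A minimal migration sketch; the no-argument JudgmentClient() construction and the empty scorer list are illustrative assumptions, only the signature above is confirmed by this diff:

from typing import List, Union
from judgeval import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig  # was APIScorerConfig in 0.9.4

client = JudgmentClient()  # assumed: credentials come from JUDGMENT_API_KEY / JUDGMENT_ORG_ID
scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]] = []  # populate with configured scorers

results = client.run_evaluation(
    examples=[Example(name="smoke-test")],
    scorers=scorers,
    project_name="default_project",
    eval_run_name="default_eval_run",
)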
judgeval/api/__init__.py CHANGED
@@ -71,13 +71,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def evaluate_trace(self, payload: TraceRun) -> Any:
-        return self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -128,59 +121,26 @@ class JudgmentSyncClient:
             query_params,
         )
 
-    def datasets_insert_examples(self, payload: DatasetInsertExamples) -> Any:
+    def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/insert_examples/"),
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )
 
-    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> Any:
+    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )
 
-    def datasets_push(self, payload: DatasetPush) -> Any:
+    def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    def traces_upsert(self, payload: TraceSave) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    def traces_fetch(self, payload: TraceFetch) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    def traces_evaluation_runs_batch(self, payload: EvaluationRunsBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )
 
@@ -255,6 +215,13 @@ class JudgmentSyncClient:
             payload,
         )
 
+    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+
 
 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -304,13 +271,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def evaluate_trace(self, payload: TraceRun) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -363,61 +323,26 @@ class JudgmentAsyncClient:
            query_params,
        )
 
-    async def datasets_insert_examples(self, payload: DatasetInsertExamples) -> Any:
+    async def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return await self._request(
             "POST",
-            url_for("/datasets/insert_examples/"),
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )
 
-    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> Any:
+    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return await self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )
 
-    async def datasets_push(self, payload: DatasetPush) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    async def traces_upsert(self, payload: TraceSave) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    async def traces_fetch(self, payload: TraceFetch) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    async def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    async def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    async def traces_evaluation_runs_batch(
-        self, payload: EvaluationRunsBatchRequest
-    ) -> Any:
+    async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )
 
@@ -494,6 +419,13 @@ class JudgmentAsyncClient:
             payload,
         )
 
+    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+
 
 __all__ = [
     "JudgmentSyncClient",
judgeval/api/api_types.py CHANGED
@@ -1,9 +1,9 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-08-29T04:49:39+00:00
+#   timestamp: 2025-09-10T17:42:12+00:00
 
 from __future__ import annotations
-from typing import Any, Dict, List, Optional, TypedDict, Union
+from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
 from typing_extensions import NotRequired
 
 
@@ -16,40 +16,10 @@ class EvalResultsFetch(TypedDict):
 
 
 class DatasetFetch(TypedDict):
-    dataset_alias: str
+    dataset_name: str
     project_name: str
 
 
-class TraceSave(TypedDict):
-    project_name: str
-    trace_id: str
-    name: str
-    created_at: str
-    duration: float
-    offline_mode: NotRequired[bool]
-    has_notification: NotRequired[bool]
-    customer_id: NotRequired[Optional[str]]
-    tags: NotRequired[List[str]]
-    metadata: NotRequired[Dict[str, Any]]
-    update_id: NotRequired[int]
-
-
-class TraceFetch(TypedDict):
-    trace_id: str
-
-
-class TraceAddToDataset(TypedDict):
-    trace_id: str
-    trace_span_id: str
-    dataset_alias: str
-    project_name: str
-
-
-class EvaluationRunsBatchRequest(TypedDict):
-    organization_id: str
-    evaluation_entries: List[Dict[str, Any]]
-
-
 class ProjectAdd(TypedDict):
     project_name: str
 
@@ -149,8 +119,8 @@ class ScorerConfig(TypedDict):
 
 
 class Example(TypedDict):
-    example_id: str
-    created_at: str
+    example_id: NotRequired[str]
+    created_at: NotRequired[str]
     name: NotRequired[Optional[str]]
 
 
@@ -160,28 +130,7 @@ class ValidationError(TypedDict):
     type: str
 
 
-class SpanBatchItem(TypedDict):
-    span_id: str
-    trace_id: str
-    function: str
-    created_at: NotRequired[Any]
-    parent_span_id: NotRequired[Optional[str]]
-    span_type: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: NotRequired[Any]
-    error: NotRequired[Optional[Dict[str, Any]]]
-    usage: NotRequired[Optional[Dict[str, Any]]]
-    duration: NotRequired[Optional[float]]
-    expected_tools: NotRequired[Optional[List[Dict[str, Any]]]]
-    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
-    has_evaluation: NotRequired[Optional[bool]]
-    agent_name: NotRequired[Optional[str]]
-    class_name: NotRequired[Optional[str]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    span_state: str
-    update_id: NotRequired[int]
-    queued_at: float
+DatasetKind = Literal["trace", "example"]
 
 
 class PromptScorer(TypedDict):
@@ -195,36 +144,45 @@ class PromptScorer(TypedDict):
 
 
 class ScorerData(TypedDict):
+    id: NotRequired[str]
     name: str
     threshold: float
     success: bool
     score: NotRequired[Optional[float]]
     reason: NotRequired[Optional[str]]
     strict_mode: NotRequired[Optional[bool]]
-    evaluation_model: NotRequired[Union[List[str], str]]
+    evaluation_model: NotRequired[str]
     error: NotRequired[Optional[str]]
     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
 
 
-class TraceUsage(TypedDict):
-    prompt_tokens: NotRequired[Optional[int]]
-    completion_tokens: NotRequired[Optional[int]]
-    cache_creation_input_tokens: NotRequired[Optional[int]]
-    cache_read_input_tokens: NotRequired[Optional[int]]
-    total_tokens: NotRequired[Optional[int]]
-    prompt_tokens_cost_usd: NotRequired[Optional[float]]
-    completion_tokens_cost_usd: NotRequired[Optional[float]]
-    total_cost_usd: NotRequired[Optional[float]]
-    model_name: NotRequired[Optional[str]]
-
-
-class Tool(TypedDict):
-    tool_name: str
-    parameters: NotRequired[Optional[Dict[str, Any]]]
-    agent_name: NotRequired[Optional[str]]
-    result_dependencies: NotRequired[Optional[List[Dict[str, Any]]]]
-    action_dependencies: NotRequired[Optional[List[Dict[str, Any]]]]
-    require_all: NotRequired[Optional[bool]]
+class OtelTraceSpan(TypedDict):
+    organization_id: str
+    project_id: NotRequired[Optional[str]]
+    user_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[str]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
+    links: NotRequired[Optional[List[Dict[str, Any]]]]
+    legacy_span_id: NotRequired[Optional[str]]
+    inputs: NotRequired[Optional[Dict[str, Any]]]
+    output: Any
+    error: NotRequired[Optional[Dict[str, Any]]]
+    agent_id: NotRequired[Optional[str]]
+    cumulative_llm_cost: NotRequired[Optional[float]]
+    state_after: NotRequired[Optional[Dict[str, Any]]]
+    state_before: NotRequired[Optional[Dict[str, Any]]]
 
 
 class ExampleEvaluationRun(TypedDict):
@@ -257,88 +215,39 @@ class TraceEvaluationRun(TypedDict):
 
 
 class DatasetInsertExamples(TypedDict):
-    dataset_alias: str
+    dataset_name: str
     examples: List[Example]
     project_name: str
 
 
-class SpansBatchRequest(TypedDict):
-    spans: List[SpanBatchItem]
-    organization_id: str
-
-
-class FetchPromptScorerResponse(TypedDict):
-    scorer: PromptScorer
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    examples: NotRequired[Optional[List[Example]]]
 
 
-class TraceSpan(TypedDict):
-    span_id: str
-    trace_id: str
-    function: str
-    created_at: NotRequired[Any]
-    parent_span_id: NotRequired[Optional[str]]
-    span_type: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    error: NotRequired[Optional[Dict[str, Any]]]
-    output: NotRequired[Any]
-    usage: NotRequired[Optional[TraceUsage]]
-    duration: NotRequired[Optional[float]]
-    expected_tools: NotRequired[Optional[List[Tool]]]
-    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
-    has_evaluation: NotRequired[Optional[bool]]
-    agent_name: NotRequired[Optional[str]]
-    class_name: NotRequired[Optional[str]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    update_id: NotRequired[int]
+class DatasetCreate(TypedDict):
+    name: str
+    dataset_kind: DatasetKind
+    project_name: str
+    examples: NotRequired[Optional[List[Example]]]
+    overwrite: NotRequired[Optional[bool]]
 
 
-class Trace(TypedDict):
-    trace_id: str
-    name: str
-    created_at: str
-    duration: float
-    trace_spans: List[TraceSpan]
-    offline_mode: NotRequired[bool]
-    rules: NotRequired[Dict[str, Any]]
-    has_notification: NotRequired[bool]
-    customer_id: NotRequired[Optional[str]]
-    tags: NotRequired[List[str]]
-    metadata: NotRequired[Dict[str, Any]]
-    update_id: NotRequired[int]
+class FetchPromptScorerResponse(TypedDict):
+    scorer: PromptScorer
 
 
 class ScoringResult(TypedDict):
     success: bool
     scorers_data: Optional[List[ScorerData]]
     name: NotRequired[Optional[str]]
-    data_object: NotRequired[Optional[Union[TraceSpan, Example]]]
+    data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
     trace_id: NotRequired[Optional[str]]
     run_duration: NotRequired[Optional[float]]
     evaluation_cost: NotRequired[Optional[float]]
 
 
-class TraceRun(TypedDict):
-    project_name: NotRequired[Optional[str]]
-    eval_name: NotRequired[Optional[str]]
-    traces: List[Trace]
-    scorers: List[ScorerConfig]
-    model: str
-    trace_span_id: NotRequired[Optional[str]]
-    tools: NotRequired[Optional[List[Dict[str, Any]]]]
-
-
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
-
-
-class DatasetPush(TypedDict):
-    dataset_alias: str
-    comments: NotRequired[Optional[str]]
-    source_file: NotRequired[Optional[str]]
-    examples: NotRequired[Optional[List[Example]]]
-    traces: NotRequired[Optional[List[Trace]]]
-    is_trace: NotRequired[bool]
-    project_name: str
-    overwrite: NotRequired[Optional[bool]]
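Because these are TypedDicts, most of the changes here are type-checking contracts rather than runtime behavior. A short sketch of payloads that now check under 0.10.0 (the field values are illustrative):

from judgeval.api.api_types import DatasetCreate, Example

# example_id and created_at became NotRequired, so a bare Example payload type-checks;
# in 0.9.4 both keys were mandatory.
ex: Example = {"name": "greeting-check"}

# DatasetCreate replaces DatasetPush: dataset_alias becomes name, and the old
# is_trace flag becomes an explicit dataset_kind literal.
ds: DatasetCreate = {
    "name": "smoke-suite",
    "dataset_kind": "example",  # DatasetKind = Literal["trace", "example"]
    "project_name": "demo",
    "examples": [ex],
}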
judgeval/constants.py CHANGED
@@ -14,16 +14,12 @@ class APIScorerType(str, Enum):
     """
 
     PROMPT_SCORER = "Prompt Scorer"
+    TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
     FAITHFULNESS = "Faithfulness"
     ANSWER_RELEVANCY = "Answer Relevancy"
     ANSWER_CORRECTNESS = "Answer Correctness"
     INSTRUCTION_ADHERENCE = "Instruction Adherence"
     EXECUTION_ORDER = "Execution Order"
-    DERAILMENT = "Derailment"
-    TOOL_ORDER = "Tool Order"
-    MOCK_TRACE_SCORER = "Mock Trace Scorer"
-    CLASSIFIER = "Classifier"
-    TOOL_DEPENDENCY = "Tool Dependency"
     CUSTOM = "Custom"
 
     @classmethod
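APIScorerType is a str-valued Enum, so the removed members fail loudly at value lookup. A quick illustration:

from judgeval.constants import APIScorerType

APIScorerType("Trace Prompt Scorer")  # new in 0.10.0 -> APIScorerType.TRACE_PROMPT_SCORER
APIScorerType("Faithfulness")         # unchanged

try:
    APIScorerType("Tool Order")  # removed in 0.10.0, as are Derailment,
except ValueError:               # Mock Trace Scorer, Classifier, and Tool Dependency
    pass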
judgeval/data/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from judgeval.data.example import Example, ExampleParams
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan, TraceUsage
+from judgeval.data.trace import TraceUsage
 
 
 __all__ = [
@@ -11,7 +11,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Trace",
-    "TraceSpan",
     "TraceUsage",
 ]
judgeval/data/example.py CHANGED
@@ -6,6 +6,8 @@ from enum import Enum
 from datetime import datetime
 from typing import Dict, Any, Optional
 from judgeval.data.judgment_types import Example as JudgmentExample
+from uuid import uuid4
+from pydantic import Field
 
 
 class ExampleParams(str, Enum):
@@ -20,8 +22,8 @@ class ExampleParams(str, Enum):
 
 
 class Example(JudgmentExample):
-    example_id: str = ""
-    created_at: str = datetime.now().isoformat()
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
     name: Optional[str] = None
 
     def to_dict(self) -> Dict[str, Any]:
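This fixes a class-level default pitfall: in 0.9.4, datetime.now().isoformat() was evaluated once at class-definition time, so every Example shared that import-time timestamp and an empty example_id. With default_factory the callable runs per instance. A quick sketch of the difference, assuming the JudgmentExample base adds no other required fields:

from judgeval.data.example import Example

a, b = Example(), Example()

# 0.9.4: a.example_id == b.example_id == "" and both carry the import-time timestamp.
# 0.10.0: each instance gets a fresh UUID and its own creation time.
assert a.example_id != b.example_id
assert a.created_at <= b.created_at  # ISO-8601 strings order chronologically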