judgeval 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects only the changes between those published versions.
Files changed (35)
  1. judgeval/__init__.py +2 -2
  2. judgeval/api/__init__.py +30 -92
  3. judgeval/api/api_types.py +57 -137
  4. judgeval/constants.py +1 -5
  5. judgeval/data/__init__.py +1 -3
  6. judgeval/data/example.py +4 -2
  7. judgeval/data/judgment_types.py +57 -165
  8. judgeval/data/result.py +1 -2
  9. judgeval/data/trace.py +14 -40
  10. judgeval/dataset/__init__.py +40 -44
  11. judgeval/evaluation/__init__.py +23 -34
  12. judgeval/scorers/__init__.py +9 -7
  13. judgeval/scorers/api_scorer.py +8 -0
  14. judgeval/scorers/base_scorer.py +0 -1
  15. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  18. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  19. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  20. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
  21. judgeval/tracer/__init__.py +13 -50
  22. judgeval/tracer/local_eval_queue.py +2 -2
  23. judgeval/tracer/processors/__init__.py +1 -1
  24. judgeval/tracer/utils.py +1 -1
  25. judgeval/trainer/trainer.py +4 -4
  26. {judgeval-0.9.4.dist-info → judgeval-0.10.1.dist-info}/METADATA +1 -1
  27. {judgeval-0.9.4.dist-info → judgeval-0.10.1.dist-info}/RECORD +30 -35
  28. judgeval/data/trace_run.py +0 -39
  29. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  32. judgeval/scorers/trace_api_scorer.py +0 -5
  33. {judgeval-0.9.4.dist-info → judgeval-0.10.1.dist-info}/WHEEL +0 -0
  34. {judgeval-0.9.4.dist-info → judgeval-0.10.1.dist-info}/entry_points.txt +0 -0
  35. {judgeval-0.9.4.dist-info → judgeval-0.10.1.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,10 +1,11 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-08-29T04:49:38+00:00
+ # timestamp: 2025-09-10T17:42:11+00:00

  from __future__ import annotations
  from typing import Annotated, Any, Dict, List, Optional, Union
  from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel
+ from enum import Enum


  class TraceAndSpanId(RootModel[List]):
@@ -17,42 +18,10 @@ class EvalResultsFetch(BaseModel):


  class DatasetFetch(BaseModel):
-     dataset_alias: Annotated[str, Field(title="Dataset Alias")]
+     dataset_name: Annotated[str, Field(title="Dataset Name")]
      project_name: Annotated[str, Field(title="Project Name")]


- class TraceSave(BaseModel):
-     project_name: Annotated[str, Field(title="Project Name")]
-     trace_id: Annotated[str, Field(title="Trace Id")]
-     name: Annotated[str, Field(title="Name")]
-     created_at: Annotated[str, Field(title="Created At")]
-     duration: Annotated[float, Field(title="Duration")]
-     offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
-     has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
-     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
-     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
-     metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = None
-     update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
-
-
- class TraceFetch(BaseModel):
-     trace_id: Annotated[str, Field(title="Trace Id")]
-
-
- class TraceAddToDataset(BaseModel):
-     trace_id: Annotated[str, Field(title="Trace Id")]
-     trace_span_id: Annotated[str, Field(title="Trace Span Id")]
-     dataset_alias: Annotated[str, Field(title="Dataset Alias")]
-     project_name: Annotated[str, Field(title="Project Name")]
-
-
- class EvaluationRunsBatchRequest(BaseModel):
-     organization_id: Annotated[str, Field(title="Organization Id")]
-     evaluation_entries: Annotated[
-         List[Dict[str, Any]], Field(title="Evaluation Entries")
-     ]
-
-
  class ProjectAdd(BaseModel):
      project_name: Annotated[str, Field(title="Project Name")]

@@ -82,7 +51,7 @@ class SavePromptScorerRequest(BaseModel):
      prompt: Annotated[str, Field(title="Prompt")]
      threshold: Annotated[float, Field(title="Threshold")]
      options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
-     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None


  class SavePromptScorerResponse(BaseModel):
@@ -161,8 +130,8 @@ class Example(BaseModel):
      model_config = ConfigDict(
          extra="allow",
      )
-     example_id: Annotated[str, Field(title="Example Id")]
-     created_at: Annotated[str, Field(title="Created At")]
+     example_id: Annotated[Optional[str], Field(title="Example Id")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
      name: Annotated[Optional[str], Field(title="Name")] = None


@@ -172,34 +141,9 @@ class ValidationError(BaseModel):
      type: Annotated[str, Field(title="Error Type")]


- class SpanBatchItem(BaseModel):
-     span_id: Annotated[str, Field(title="Span Id")]
-     trace_id: Annotated[str, Field(title="Trace Id")]
-     function: Annotated[str, Field(title="Function")]
-     created_at: Annotated[Any, Field(title="Created At")] = None
-     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
-     span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
-     inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-     output: Annotated[Any, Field(title="Output")] = None
-     error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-     usage: Annotated[Optional[Dict[str, Any]], Field(title="Usage")] = None
-     duration: Annotated[Optional[float], Field(title="Duration")] = None
-     expected_tools: Annotated[
-         Optional[List[Dict[str, Any]]], Field(title="Expected Tools")
-     ] = None
-     additional_metadata: Annotated[
-         Optional[Dict[str, Any]], Field(title="Additional Metadata")
-     ] = None
-     has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
-     agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
-     class_name: Annotated[Optional[str], Field(title="Class Name")] = None
-     state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-         None
-     )
-     state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-     span_state: Annotated[str, Field(title="Span State")]
-     update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
-     queued_at: Annotated[float, Field(title="Queued At")]
+ class DatasetKind(Enum):
+     trace = "trace"
+     example = "example"


  class PromptScorer(BaseModel):
@@ -213,52 +157,55 @@ class PromptScorer(BaseModel):


  class ScorerData(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
      name: Annotated[str, Field(title="Name")]
      threshold: Annotated[float, Field(title="Threshold")]
      success: Annotated[bool, Field(title="Success")]
      score: Annotated[Optional[float], Field(title="Score")] = None
      reason: Annotated[Optional[str], Field(title="Reason")] = None
      strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
-     evaluation_model: Annotated[
-         Optional[Union[List[str], str]], Field(title="Evaluation Model")
-     ] = None
+     evaluation_model: Annotated[Optional[str], Field(title="Evaluation Model")] = None
      error: Annotated[Optional[str], Field(title="Error")] = None
      additional_metadata: Annotated[
          Optional[Dict[str, Any]], Field(title="Additional Metadata")
      ] = None


- class TraceUsage(BaseModel):
-     prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
-     completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
-     cache_creation_input_tokens: Annotated[
-         Optional[int], Field(title="Cache Creation Input Tokens")
-     ] = None
-     cache_read_input_tokens: Annotated[
-         Optional[int], Field(title="Cache Read Input Tokens")
-     ] = None
-     total_tokens: Annotated[Optional[int], Field(title="Total Tokens")] = None
-     prompt_tokens_cost_usd: Annotated[
-         Optional[float], Field(title="Prompt Tokens Cost Usd")
+ class OtelTraceSpan(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[Optional[str], Field(title="Project Id")] = None
+     user_id: Annotated[str, Field(title="User Id")]
+     timestamp: Annotated[str, Field(title="Timestamp")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     span_id: Annotated[str, Field(title="Span Id")]
+     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+     trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+     service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+     resource_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Resource Attributes")
      ] = None
-     completion_tokens_cost_usd: Annotated[
-         Optional[float], Field(title="Completion Tokens Cost Usd")
+     span_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Span Attributes")
      ] = None
-     total_cost_usd: Annotated[Optional[float], Field(title="Total Cost Usd")] = None
-     model_name: Annotated[Optional[str], Field(title="Model Name")] = None
-
-
- class Tool(BaseModel):
-     tool_name: Annotated[str, Field(title="Tool Name")]
-     parameters: Annotated[Optional[Dict[str, Any]], Field(title="Parameters")] = None
-     agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
-     result_dependencies: Annotated[
-         Optional[List[Dict[str, Any]]], Field(title="Result Dependencies")
-     ] = None
-     action_dependencies: Annotated[
-         Optional[List[Dict[str, Any]]], Field(title="Action Dependencies")
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
+     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
+     legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
+     inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
+     output: Annotated[Any, Field(title="Output")]
+     error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+     agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
+     cumulative_llm_cost: Annotated[
+         Optional[float], Field(title="Cumulative Llm Cost")
      ] = None
-     require_all: Annotated[Optional[bool], Field(title="Require All")] = None
+     state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
+     state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
+         None
+     )


  class ExampleEvaluationRun(BaseModel):
@@ -301,61 +248,27 @@ class TraceEvaluationRun(BaseModel):


  class DatasetInsertExamples(BaseModel):
-     dataset_alias: Annotated[str, Field(title="Dataset Alias")]
+     dataset_name: Annotated[str, Field(title="Dataset Name")]
      examples: Annotated[List[Example], Field(title="Examples")]
      project_name: Annotated[str, Field(title="Project Name")]


- class SpansBatchRequest(BaseModel):
-     spans: Annotated[List[SpanBatchItem], Field(title="Spans")]
-     organization_id: Annotated[str, Field(title="Organization Id")]
-
-
- class FetchPromptScorerResponse(BaseModel):
-     scorer: PromptScorer
+ class DatasetReturn(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     project_name: Annotated[str, Field(title="Project Name")]
+     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None


- class TraceSpan(BaseModel):
-     span_id: Annotated[str, Field(title="Span Id")]
-     trace_id: Annotated[str, Field(title="Trace Id")]
-     function: Annotated[str, Field(title="Function")]
-     created_at: Annotated[Any, Field(title="Created At")] = None
-     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
-     span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
-     inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-     error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-     output: Annotated[Any, Field(title="Output")] = None
-     usage: Optional[TraceUsage] = None
-     duration: Annotated[Optional[float], Field(title="Duration")] = None
-     expected_tools: Annotated[Optional[List[Tool]], Field(title="Expected Tools")] = (
-         None
-     )
-     additional_metadata: Annotated[
-         Optional[Dict[str, Any]], Field(title="Additional Metadata")
-     ] = None
-     has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
-     agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
-     class_name: Annotated[Optional[str], Field(title="Class Name")] = None
-     state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-         None
-     )
-     state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-     update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+ class DatasetCreate(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     dataset_kind: DatasetKind
+     project_name: Annotated[str, Field(title="Project Name")]
+     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+     overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False


- class Trace(BaseModel):
-     trace_id: Annotated[str, Field(title="Trace Id")]
-     name: Annotated[str, Field(title="Name")]
-     created_at: Annotated[str, Field(title="Created At")]
-     duration: Annotated[float, Field(title="Duration")]
-     trace_spans: Annotated[List[TraceSpan], Field(title="Trace Spans")]
-     offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
-     rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = {}
-     has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
-     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
-     tags: Annotated[Optional[List[str]], Field(title="Tags")] = []
-     metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = {}
-     update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+ class FetchPromptScorerResponse(BaseModel):
+     scorer: PromptScorer


  class ScoringResult(BaseModel):
@@ -365,34 +278,13 @@ class ScoringResult(BaseModel):
      )
      name: Annotated[Optional[str], Field(title="Name")] = None
      data_object: Annotated[
-         Optional[Union[TraceSpan, Example]], Field(title="Data Object")
+         Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
      ] = None
      trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
      run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
      evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None


- class TraceRun(BaseModel):
-     project_name: Annotated[Optional[str], Field(title="Project Name")] = None
-     eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
-     traces: Annotated[List[Trace], Field(title="Traces")]
-     scorers: Annotated[List[ScorerConfig], Field(title="Scorers")]
-     model: Annotated[str, Field(title="Model")]
-     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
-     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
-
-
  class EvalResults(BaseModel):
      results: Annotated[List[ScoringResult], Field(title="Results")]
      run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
-
-
- class DatasetPush(BaseModel):
-     dataset_alias: Annotated[str, Field(title="Dataset Alias")]
-     comments: Annotated[Optional[str], Field(title="Comments")] = None
-     source_file: Annotated[Optional[str], Field(title="Source File")] = None
-     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-     traces: Annotated[Optional[List[Trace]], Field(title="Traces")] = None
-     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
-     project_name: Annotated[str, Field(title="Project Name")]
-     overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
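The removed models (TraceSave, SpanBatchItem, TraceSpan, Trace, TraceRun, DatasetPush) give way to a much smaller surface: DatasetKind, DatasetCreate, DatasetReturn, and OtelTraceSpan. Because these are plain generated Pydantic classes, the reshaped dataset payload can be checked locally. A minimal sketch, assuming the models in this hunk are importable from judgeval.api.api_types; the dataset and project names are placeholders:

from judgeval.api.api_types import DatasetCreate, DatasetKind, Example

# Build the request body that replaces the old DatasetPush shape.
payload = DatasetCreate(
    name="qa-regression",              # placeholder dataset name
    dataset_kind=DatasetKind.example,  # enum replaces the old is_trace flag
    project_name="demo-project",       # placeholder project
    examples=[Example(name="greeting")],  # example_id / created_at are now optional
    overwrite=False,
)
print(payload.model_dump())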
judgeval/data/result.py CHANGED
@@ -1,6 +1,5 @@
  from typing import List, Union
  from judgeval.data import ScorerData, Example
- from judgeval.data.trace import TraceSpan
  from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult


@@ -34,7 +33,7 @@ class ScoringResult(JudgmentScoringResult):


  def generate_scoring_result(
-     data_object: Union[Example, TraceSpan],
+     data_object: Union[Example],
      scorers_data: List[ScorerData],
      run_duration: float,
      success: bool,
judgeval/data/trace.py CHANGED
@@ -1,40 +1,14 @@
- from datetime import datetime, timezone
- from judgeval.data.judgment_types import (
-     TraceUsage as JudgmentTraceUsage,
-     TraceSpan as JudgmentTraceSpan,
-     Trace as JudgmentTrace,
- )
- from judgeval.utils.serialize import json_encoder
-
-
- class TraceUsage(JudgmentTraceUsage):
-     pass
-
-
- class TraceSpan(JudgmentTraceSpan):
-     def model_dump(self, **kwargs):
-         return {
-             "span_id": self.span_id,
-             "trace_id": self.trace_id,
-             "created_at": datetime.fromtimestamp(
-                 self.created_at, tz=timezone.utc
-             ).isoformat(),
-             "inputs": json_encoder(self.inputs),
-             "output": json_encoder(self.output),
-             "error": json_encoder(self.error),
-             "parent_span_id": self.parent_span_id,
-             "function": self.function,
-             "duration": self.duration,
-             "span_type": self.span_type,
-             "usage": self.usage.model_dump() if self.usage else None,
-             "has_evaluation": self.has_evaluation,
-             "agent_name": self.agent_name,
-             "state_before": self.state_before,
-             "state_after": self.state_after,
-             "additional_metadata": json_encoder(self.additional_metadata),
-             "update_id": self.update_id,
-         }
-
-
- class Trace(JudgmentTrace):
-     pass
+ from typing import Optional
+ from pydantic import BaseModel
+
+
+ class TraceUsage(BaseModel):
+     prompt_tokens: Optional[int] = None
+     completion_tokens: Optional[int] = None
+     cache_creation_input_tokens: Optional[int] = None
+     cache_read_input_tokens: Optional[int] = None
+     total_tokens: Optional[int] = None
+     prompt_tokens_cost_usd: Optional[float] = None
+     completion_tokens_cost_usd: Optional[float] = None
+     total_cost_usd: Optional[float] = None
+     model_name: Optional[str] = None
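TraceUsage is now a small standalone Pydantic model with every field optional, rather than a subclass of the generated judgment type, so partial usage data can be attached without the old model_dump override. A brief sketch; the token counts and model name are made up:

from judgeval.data.trace import TraceUsage

# Every field defaults to None, so a provider response can be mapped incrementally.
usage = TraceUsage(
    prompt_tokens=120,
    completion_tokens=48,
    total_tokens=168,
    model_name="gpt-4o-mini",  # illustrative value
)
total_cost = (usage.prompt_tokens_cost_usd or 0.0) + (usage.completion_tokens_cost_usd or 0.0)
print(usage.model_dump(), total_cost)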
judgeval/dataset/__init__.py CHANGED
@@ -5,17 +5,26 @@ import yaml
  from dataclasses import dataclass
  from typing import List, Literal, Optional

- from judgeval.data import Example, Trace
+ from judgeval.data import Example
  from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
  from judgeval.api import JudgmentSyncClient
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

+ from judgeval.api.api_types import DatasetKind
+
+ @dataclass
+ class DatasetInfo:
+     dataset_id: str
+     name: str
+     created_at: str
+     dataset_kind: DatasetKind
+     entries: int
+     creator: str

  @dataclass
  class Dataset:
      examples: List[Example]
-     traces: List[Trace]
      name: str
      project_name: str
      judgment_api_key: str = JUDGMENT_API_KEY or ""
@@ -30,7 +39,7 @@ class Dataset:
          client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
          dataset = client.datasets_pull_for_judgeval(
              {
-                 "dataset_alias": name,
+                 "dataset_name": name,
                  "project_name": project_name,
              },
          )
@@ -40,12 +49,14 @@ class Dataset:
          for e in examples:
              if isinstance(e, dict) and isinstance(e.get("data"), dict):
                  e.update(e.pop("data"))
-         judgeval_logger.info(f"Succesfully retrieved dataset {name}!")
+                 e.pop(
+                     "example_id"
+                 )  # TODO: remove once scorer data migraiton is complete
+         judgeval_logger.info(f"Successfully retrieved dataset {name}!")
          return cls(
              name=name,
              project_name=project_name,
              examples=[Example(**e) for e in examples],
-             traces=[Trace(**t) for t in dataset.get("traces", [])],
          )

      @classmethod
@@ -54,36 +65,41 @@ class Dataset:
          name: str,
          project_name: str,
          examples: Optional[List[Example]] = None,
-         traces: Optional[List[Trace]] = None,
          overwrite: bool = False,
      ):
-         if examples and traces:
-             raise ValueError("Only one of examples or traces must be provided")
-
          if not examples:
              examples = []

-         if not traces:
-             traces = []
-
          client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-         client.datasets_push(
+         client.datasets_create_for_judgeval(
              {
-                 "dataset_alias": name,
+                 "name": name,
                  "project_name": project_name,
-                 "examples": [e.model_dump() for e in examples],  # type: ignore
-                 "traces": [t.model_dump() for t in traces],  # type: ignore
+                 "examples": [e.model_dump() for e in examples],
+                 "dataset_kind": "example",
                  "overwrite": overwrite,
              }
          )

-         judgeval_logger.info(f"Succesfull created dataset {name}!")
+         judgeval_logger.info(f"Successfully created dataset {name}!")
          return cls(
              name=name,
              project_name=project_name,
              examples=examples,
-             traces=traces,
          )
+     @classmethod
+     def list(
+         cls,
+         project_name: str
+     ):
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         datasets = client.datasets_pull_all_for_judgeval(
+             {"project_name": project_name}
+         )
+
+         judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
+
+         return [DatasetInfo(**dataset_info) for dataset_info in datasets]

      def add_from_json(self, file_path: str) -> None:
          """
@@ -123,29 +139,15 @@ class Dataset:
          self.add_examples(examples)

      def add_examples(self, examples: List[Example]) -> None:
-         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
-         client.datasets_insert_examples(
-             {
-                 "dataset_alias": self.name,
-                 "project_name": self.project_name,
-                 "examples": [
-                     {
-                         "name": e.name,
-                         "created_at": e.created_at,
-                         "example_id": e.example_id,
-                     }
-                     for e in examples
-                 ],
-             }
-         )
+         if not isinstance(examples, list):
+             raise TypeError("examples must be a list")

-     def add_traces(self, traces: List[Trace]) -> None:
          client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
-         client.traces_add_to_dataset(
+         client.datasets_insert_examples_for_judgeval(
              {
-                 "dataset_alias": self.name,
+                 "dataset_name": self.name,
                  "project_name": self.project_name,
-                 "traces": [t.model_dump() for t in traces],  # type: ignore
+                 "examples": [e.model_dump() for e in examples],
              }
          )

@@ -200,10 +202,4 @@ class Dataset:
          return len(self.examples)

      def __str__(self):
-         return (
-             f"{self.__class__.__name__}("
-             f"examples={self.examples}, "
-             f"traces={self.traces}, "
-             f"name={self.name}"
-             f")"
-         )
+         return f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
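Taken together, these changes drop trace handling from Dataset and add a list() classmethod that returns DatasetInfo records. A usage sketch limited to the names visible in this hunk (the pull/create classmethod names are cut off by the hunk headers, so only list and add_examples appear); the import path is taken from the file list above, the project and dataset names are placeholders, and the calls assume JUDGMENT_API_KEY / JUDGMENT_ORG_ID are configured:

from judgeval.data import Example
from judgeval.dataset import Dataset, DatasetInfo

# Enumerate a project's datasets via the new classmethod.
infos = Dataset.list(project_name="demo-project")
for info in infos:  # each entry is a DatasetInfo dataclass
    print(info.name, info.dataset_kind, info.entries)

# Datasets now carry examples only; the traces field and add_traces() are gone.
ds = Dataset(examples=[], name="qa-regression", project_name="demo-project")
ds.add_examples([Example(name="greeting")])  # now sends full model_dump() payloads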
judgeval/evaluation/__init__.py CHANGED
@@ -10,7 +10,7 @@ from typing import List, Dict, Union, Tuple, TYPE_CHECKING
  from rich import print as rprint

  from judgeval.data import ScorerData, ScoringResult, Example
- from judgeval.scorers import BaseScorer, APIScorerConfig
+ from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
  from judgeval.env import (
@@ -86,7 +86,7 @@ def log_evaluation_results(


  def check_examples(
-     examples: List[Example], scorers: List[Union[APIScorerConfig, BaseScorer]]
+     examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
  ) -> None:
      """
      Checks if the example contains the necessary parameters for the scorer.
@@ -118,10 +118,8 @@ def check_examples(


  def _poll_evaluation_until_complete(
-     experiment_run_id: str,
-     project_name: str,
+     evaluation_run: EvaluationRun,
      judgment_api_key: str,
-     organization_id: str,
      expected_scorer_data_count: int,
      poll_interval_seconds: float = 5,
      max_failures: int = 5,
@@ -142,6 +140,10 @@ def _poll_evaluation_until_complete(
      Returns:
          List[ScoringResult]: The evaluation results
      """
+     organization_id = evaluation_run.organization_id
+     project_name = evaluation_run.project_name
+     experiment_run_id = evaluation_run.id
+
      poll_count = 0
      exception_count = 0
      api_client = JudgmentSyncClient(judgment_api_key, organization_id)
@@ -157,6 +159,11 @@ def _poll_evaluation_until_complete(
                  time.sleep(poll_interval_seconds)
                  continue

+             example_scorer_pairings = status_response.get("results", [])
+             if len(example_scorer_pairings) != expected_scorer_data_count:
+                 time.sleep(poll_interval_seconds)
+                 continue
+
              results_response = api_client.fetch_experiment_run(
                  {
                      "experiment_run_id": experiment_run_id,
@@ -165,36 +172,20 @@ def _poll_evaluation_until_complete(
              )
              url = results_response.get("ui_results_url")

-             if results_response.get("examples") is None:
-                 time.sleep(poll_interval_seconds)
-                 continue
-
-             examples_data = results_response.get("examples", [])
-             scoring_results = []
-             scorer_data_count = 0
-
-             for example_data in examples_data:
-                 scorer_data_list = []
-                 for raw_scorer_data in example_data.get("scorer_data", []):
-                     scorer_data = ScorerData(**raw_scorer_data)
-                     scorer_data_list.append(scorer_data)
-                     scorer_data_count += 1
-
-                 example = Example(**example_data)
-
-                 success = all(scorer_data.success for scorer_data in scorer_data_list)
+             scoring_result_list = []
+             for res in results_response.get("results", []):
+                 example = res.get("data", {}).copy()
+                 example["example_id"] = res.get("example_id")
                  scoring_result = ScoringResult(
-                     success=success,
-                     scorers_data=scorer_data_list,
+                     scorers_data=res.get("scorers", []),
+                     success=all(
+                         t.get("success", False) for t in res.get("scorers", [])
+                     ),
                      data_object=example,
                  )
-                 scoring_results.append(scoring_result)
-
-             if scorer_data_count != expected_scorer_data_count:
-                 time.sleep(poll_interval_seconds)
-                 continue
+                 scoring_result_list.append(scoring_result)

-             return scoring_results, url
+             return scoring_result_list, url
          except Exception as e:
              exception_count += 1
              if isinstance(e, JudgmentAPIError):
@@ -294,10 +285,8 @@ def run_eval(
              else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
          )
          results, url = _poll_evaluation_until_complete(
-             experiment_run_id=evaluation_run.id,
-             project_name=evaluation_run.project_name,
+             evaluation_run=evaluation_run,
              judgment_api_key=judgment_api_key,
-             organization_id=evaluation_run.organization_id,
              expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
          )
      finally:
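The polling rewrite waits until the status endpoint reports one entry per (example, scorer) pair before fetching and reshaping the full results. A toy illustration of that bookkeeping with fabricated data, not the real API payload shape:

# One fetched result entry, shaped the way the new loop above consumes it (made-up values).
res = {
    "example_id": "e1",
    "data": {"input": "hi", "actual_output": "hello"},
    "scorers": [{"name": "faithfulness", "success": True, "score": 1.0}],
}

example = res.get("data", {}).copy()
example["example_id"] = res.get("example_id")  # same reshaping as the new loop
success = all(s.get("success", False) for s in res.get("scorers", []))

# run_eval sizes the completeness check as scorers x examples:
num_scorers, num_examples = 1, 1
expected_scorer_data_count = num_scorers * num_examples
print(success, expected_scorer_data_count)  # True 1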