judgeval 0.0.37__py3-none-any.whl → 0.0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ class EvalDatasetClient:
                 "dataset_alias": alias,
                 "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
-                "sequences": [s.model_dump() for s in dataset.sequences],
-                "is_sequence": len(dataset.sequences) > 0,
                 "overwrite": overwrite,
             }
             try:
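The push payload in 0.0.38 therefore drops the "sequences" and "is_sequence" keys. Below is a minimal, runnable sketch of the new request-body shape; the variable values are illustrative stand-ins for the client method's arguments, and the example dict keys are assumptions rather than something this diff confirms.

# Illustrative stand-ins for the push() arguments; not judgeval's actual code.
alias = "my-dataset"
project_name = "my-project"
overwrite = False
example_dicts = [{"input": "What is 2 + 2?", "actual_output": "4"}]  # assumed e.to_dict() shape

# Request body shape after this change: no "sequences" / "is_sequence" keys.
content = {
    "dataset_alias": alias,
    "project_name": project_name,
    "examples": example_dicts,
    "overwrite": overwrite,
}
print(content)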
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
             return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
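Correspondingly, pull() now reads only examples out of the response payload. A small runnable sketch of handling a payload with the fields the updated code consults; the payload values are invented and the Example field names are assumptions.

from judgeval.data import Example  # import path confirmed by this file's imports

# Invented payload, shaped like the fields pull() reads above.
payload = {
    "examples": [{"input": "What is 2 + 2?", "actual_output": "4"}],  # assumed Example fields
    "alias": "my-dataset",
    "id": "dataset-123",
}

examples = [Example(**e) for e in payload.get("examples", [])]
# payload.get("sequences", []) is no longer consulted in 0.0.38.
print(len(examples), payload.get("alias"), payload.get("id"))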
judgeval/data/example.py CHANGED
@@ -37,7 +37,6 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
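Since sequence_order is gone from the Example model, examples are built from the remaining fields only. A minimal sketch; the input/actual_output field names are assumptions, as only example_index, timestamp, and trace_id are visible in this hunk.

from judgeval.data import Example

# Construct an Example without the removed sequence_order field.
example = Example(
    input="What is the capital of France?",  # assumed field name
    actual_output="Paris",                   # assumed field name
)
print(example.example_index, example.trace_id)  # fields retained per the hunk above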
judgeval/data/result.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.sequence import Sequence
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    data_object: Union[Example, Sequence],
+    data_object: Union[Example, TraceSpan],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
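Code that previously typed its data objects against judgeval.data.sequence.Sequence should now use TraceSpan from judgeval.data.trace, the new import path shown above. A self-contained sketch of the updated union follows; the Example field names used in the demo call are assumptions.

from typing import Optional, Union
from judgeval.data import Example, CustomExample   # confirmed by the imports above
from judgeval.data.trace import TraceSpan          # new import path in 0.0.38

DataObject = Union[TraceSpan, CustomExample, Example]  # mirrors the updated data_object annotation

def describe(data_object: Optional[DataObject]) -> str:
    # Report which of the accepted data-object types was supplied.
    return type(data_object).__name__ if data_object is not None else "none"

print(describe(Example(input="hi", actual_output="hello")))  # assumed Example fields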
judgeval/data/trace.py CHANGED
@@ -9,7 +9,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[float] = None
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +17,8 @@ class TraceSpan(BaseModel):
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Dict[str, Any]]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +126,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
 
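The new TraceSpan fields are optional and free-form; as plain data they look like the sketch below. The key names inside expected_tools are illustrative assumptions (the model only constrains it to a list of dicts), and the created_at value shows that the loosened Optional[Any] annotation now admits non-float timestamps.

# Plain-data sketch of the fields added or loosened in this hunk; values are illustrative.
span_updates = {
    "expected_tools": [                                   # Optional[List[Dict[str, Any]]]
        {"tool_name": "web_search", "parameters": {"query": "weather in SF"}},  # assumed keys
    ],
    "additional_metadata": {"model": "gpt-4.1", "temperature": 0.0},  # Optional[Dict[str, Any]]
    "created_at": "2025-01-01T00:00:00Z",                 # Optional[Any]: no longer restricted to float
}

trace_updates = {
    "offline_mode": False,  # new Trace flag, defaults to False
}
print(span_updates, trace_updates)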
@@ -1,20 +1,20 @@
 
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import Sequence
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        sequences (List[Sequence]): The sequences to evaluate
+        traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
    eval_name: Optional[str] = None
-    sequences: Optional[List[Sequence]] = None
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples', mode='before')
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")