judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -7,14 +7,13 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
-    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
 
 
@@ -59,8 +58,6 @@ class EvalDatasetClient:
                 "dataset_alias": alias,
                 "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
-                "sequences": [s.model_dump() for s in dataset.sequences],
-                "is_sequence": len(dataset.sequences) > 0,
                 "overwrite": overwrite,
             }
             try:
@@ -151,63 +148,6 @@ class EvalDatasetClient:
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
         return True
-
-    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
-        debug(f"Appending dataset with alias '{alias}'")
-        """
-        Appends the dataset to Judgment platform
-
-        Mock request:
-        dataset = {
-            "alias": alias,
-            "examples": [...],
-            "project_name": project_name
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "dataset_alias": alias,
-                "project_name": project_name,
-                "sequences": [s.model_dump() for s in sequences],
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
-                    json=content,
-                    headers={
-                        "Content-Type": "application/json",
-                        "Authorization": f"Bearer {self.judgment_api_key}",
-                        "X-Organization-Id": self.organization_id
-                    },
-                    verify=True
-                )
-                if response.status_code != 200:
-                    error(f"Server error during append: {response.json()}")
-                    raise Exception(f"Server error during append: {response.json()}")
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during append: {err.response.json()}")
-                else:
-                    error(f"HTTP error during append: {err}")
-
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-        return True
 
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
@@ -262,7 +202,6 @@ class EvalDatasetClient:
            info(f"Successfully pulled dataset with alias '{alias}'")
            payload = response.json()
            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-           dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
            dataset._alias = payload.get("alias")
            dataset._id = payload.get("id")
            progress.update(
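
Taken together, these hunks remove every sequence-specific path from the dataset client; the payload it builds now carries only examples. A minimal sketch of that examples-only payload, assuming the hunk above belongs to the client's push path (client construction is not shown in this diff):

    from judgeval.data import Example

    # Only Example fields visible in this diff are used; required-ness of the
    # remaining fields is not shown here.
    examples = [Example(name="greeting", tools_called=["search"])]

    content = {
        "dataset_alias": "my-dataset",
        "project_name": "demo",
        "examples": [e.to_dict() for e in examples],
        "overwrite": False,
    }
    # The client POSTs this JSON with the Authorization and X-Organization-Id
    # headers seen in the removed append_sequences method above.
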
judgeval/data/example.py CHANGED
@@ -8,6 +8,7 @@ from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+from judgeval.data.tool import Tool
 import time
 
 
@@ -31,13 +32,12 @@ class Example(BaseModel):
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[Dict[str, Any]]] = None
+    expected_tools: Optional[List[Tool]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
@@ -83,17 +83,17 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
-    @field_validator('expected_tools', mode='before')
+    @field_validator('expected_tools')
     @classmethod
     def validate_expected_tools(cls, v):
         if v is not None:
             if not isinstance(v, list):
-                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+                raise ValueError(f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}")
 
-            # Check that each item in the list is a dictionary
+            # Check that each item in the list is a Tool
             for i, item in enumerate(v):
-                if not isinstance(item, dict):
-                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+                if not isinstance(item, Tool):
+                    raise ValueError(f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}")
 
         return v
 
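For illustration, a minimal sketch of what expected_tools now accepts under the updated validator; only field names visible in the hunks above are used, and anything beyond them is an assumption:

    from judgeval.data import Example
    from judgeval.data.tool import Tool

    example = Example(
        tools_called=["web_search"],
        expected_tools=[Tool(tool_name="web_search", parameters={"query": "weather in SF"})],
    )
    # With pydantic v2's default (after-mode) validation, well-formed dicts are
    # coerced into Tool instances before the isinstance check runs; anything that
    # cannot be coerced fails the List[Tool] validation itself.
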
judgeval/data/result.py CHANGED
@@ -3,7 +3,7 @@ from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
-from judgeval.data.sequence import Sequence
+from judgeval.data.trace import TraceSpan
 
 
 class ScoringResult(BaseModel):
@@ -24,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
+    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -49,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    data_object: Union[Example, Sequence],
+    data_object: Union[Example, TraceSpan],
    scorers_data: List[ScorerData],
    run_duration: float,
    success: bool,
judgeval/data/tool.py ADDED
@@ -0,0 +1,19 @@
+from pydantic import BaseModel, field_validator
+from typing import Dict, Any, Optional
+import warnings
+
+class Tool(BaseModel):
+    tool_name: str
+    parameters: Optional[Dict[str, Any]] = None
+
+    @field_validator('tool_name')
+    def validate_tool_name(cls, v):
+        if not v:
+            warnings.warn("Tool name is empty or None", UserWarning)
+        return v
+
+    @field_validator('parameters')
+    def validate_parameters(cls, v):
+        if v is not None and not isinstance(v, dict):
+            warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
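
The new Tool model only warns, rather than raises, on suspect values; a quick sketch of that behavior using nothing beyond the file above and the standard library:

    import warnings
    from judgeval.data.tool import Tool

    tool = Tool(tool_name="calculator", parameters={"expression": "2 + 2"})
    print(tool.model_dump())  # {'tool_name': 'calculator', 'parameters': {'expression': '2 + 2'}}

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        Tool(tool_name="")  # empty name -> UserWarning, but the instance is still created
        assert any("Tool name is empty" in str(w.message) for w in caught)
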
judgeval/data/trace.py CHANGED
@@ -1,6 +1,7 @@
 from pydantic import BaseModel
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone
 
@@ -9,7 +10,7 @@ class TraceSpan(BaseModel):
     trace_id: str
     function: Optional[str] = None
     depth: int
-    created_at: Optional[float] = None
+    created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
@@ -17,6 +18,8 @@ class TraceSpan(BaseModel):
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
+    expected_tools: Optional[List[Tool]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
 
     def model_dump(self, **kwargs):
         return {
@@ -124,6 +127,7 @@ class Trace(BaseModel):
     duration: float
     entries: List[TraceSpan]
     overwrite: bool = False
+    offline_mode: bool = False
     rules: Optional[Dict[str, Any]] = None
     has_notification: Optional[bool] = False
 
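A rough sketch of a span carrying the two new fields; this is illustrative only, and any TraceSpan field not visible in this diff (for example a span identifier) is an assumption:

    from judgeval.data.trace import TraceSpan
    from judgeval.data.tool import Tool

    span = TraceSpan(
        span_id="span-1",        # assumed field, not shown in this diff
        trace_id="trace-123",
        depth=0,
        function="agent_step",
        expected_tools=[Tool(tool_name="web_search")],   # new field
        additional_metadata={"model": "gpt-4.1"},        # new field
    )
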
@@ -1,20 +1,20 @@
 
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
-from judgeval.data import Sequence
+from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule
 
 
-class SequenceRun(BaseModel):
+class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
 
     Args:
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
-        sequences (List[Sequence]): The sequences to evaluate
+        traces (List[Trace]): The traces to evaluate
        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: Optional[List[Sequence]] = None
+    traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
@@ -79,7 +79,7 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples', mode='before')
+    @field_validator('examples')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")