judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
judgeval/common/utils.py CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union
 
@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()
 
+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={}, # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")
 
 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
judgeval/constants.py CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -42,14 +42,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
+JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
+JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
@@ -57,6 +59,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
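
The new TOOL_ORDER member is a plain str-valued enum entry, and the _missing_ hook above is commented as a case-insensitive lookup. A small sketch, assuming that hook behaves as its comment says:

    from judgeval.constants import APIScorer

    assert APIScorer("tool_order") is APIScorer.TOOL_ORDER   # direct value lookup
    assert APIScorer("TOOL_ORDER") is APIScorer.TOOL_ORDER   # case-insensitive fallback via _missing_ (per its comment)
    assert APIScorer.TOOL_ORDER.value == "tool_order"
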
judgeval/data/__init__.py CHANGED
@@ -3,6 +3,8 @@ from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 from judgeval.data.sequence import Sequence
+from judgeval.data.trace import Trace, TraceSpan
+
 
 __all__ = [
     "Example",
@@ -13,4 +15,6 @@ __all__ = [
     "ScoringResult",
     "generate_scoring_result",
     "Sequence",
+    "Trace",
+    "TraceSpan",
 ]
@@ -224,6 +224,9 @@ class EvalDataset:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
 
+    def add_sequence(self, s: Sequence) -> None:
+        self.sequences = self.sequences + [s]
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -270,7 +273,6 @@ class EvalDataset:
                         None, # Example does not have comments
                         None, # Example does not have source file
                         True, # Adding an Example
-                        e.trace_id
                     ]
                 )
 
@@ -292,7 +294,6 @@ class EvalDataset:
                        "comments": None, # Example does not have comments
                        "source_file": None, # Example does not have source file
                        "example": True, # Adding an Example
-                       "trace_id": e.trace_id
                    }
                    for e in self.examples
                ],
@@ -6,7 +6,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
-    JUDGMENT_DATASETS_APPEND_API_URL,
+    JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
+    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
@@ -58,6 +59,8 @@ class EvalDatasetClient:
             "dataset_alias": alias,
             "project_name": project_name,
             "examples": [e.to_dict() for e in dataset.examples],
+            "sequences": [s.model_dump() for s in dataset.sequences],
+            "is_sequence": len(dataset.sequences) > 0,
             "overwrite": overwrite,
         }
         try:
@@ -92,7 +95,7 @@ class EvalDatasetClient:
            return True
 
 
-    def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+    def append_examples(self, alias: str, examples: List[Example], project_name: str) -> bool:
        debug(f"Appending dataset with alias '{alias}'")
        """
        Appends the dataset to Judgment platform
@@ -124,7 +127,7 @@ class EvalDatasetClient:
        }
        try:
            response = requests.post(
-                JUDGMENT_DATASETS_APPEND_API_URL,
+                JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL,
                json=content,
                headers={
                    "Content-Type": "application/json",
@@ -149,6 +152,63 @@ class EvalDatasetClient:
            )
            return True
 
+    def append_sequences(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..." # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "sequences": [s.model_dump() for s in sequences],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
        debug(f"Pulling dataset with alias '{alias}'")
        """
judgeval/data/example.py CHANGED
@@ -24,14 +24,14 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
@@ -50,8 +50,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None and (not v or not isinstance(v, str)):
-            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v
 
     @field_validator('actual_output', mode='before')
@@ -73,7 +83,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
-    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +151,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }
 
     def __str__(self):
@@ -144,5 +167,4 @@ class Example(BaseModel):
            f"example_id={self.example_id}, "
            f"example_index={self.example_index}, "
            f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
        )
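
With the relaxed validators, input may now be a non-empty dict and expected_tools a list of dicts. A construction sketch; the keys inside the expected_tools entries are illustrative, since the validator only requires each item to be a dictionary:

    from judgeval.data import Example

    example = Example(
        input={"question": "What's the weather in Paris?", "units": "celsius"},  # dict input now allowed
        actual_output="It is 21°C and sunny.",
        tools_called=["weather_lookup"],
        expected_tools=[{"tool_name": "weather_lookup", "parameters": {"city": "Paris"}}],  # keys are illustrative
    )
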
judgeval/data/sequence.py CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any
+from typing import List, Optional, Union, Any, Dict
 from judgeval.data.example import Example
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
@@ -12,13 +12,14 @@ class Sequence(BaseModel):
     sequence_id: str = Field(default_factory=lambda: str(uuid4()))
     name: Optional[str] = "Sequence"
     created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]]
+    items: List[Union["Sequence", Example]] = []
     scorers: Optional[Any] = None
     parent_sequence_id: Optional[str] = None
     sequence_order: Optional[int] = 0
     root_sequence_id: Optional[str] = None
-    inputs: Optional[str] = None
-    output: Optional[str] = None
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
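
Sequence now defaults items to an empty list and gains a structured inputs dict, a free-form output, and optional expected_tools. A construction sketch with illustrative values:

    from judgeval.data import Example, Sequence

    seq = Sequence(
        name="refund_flow",                               # illustrative name
        items=[Example(input="Refund order 123", actual_output="Refund issued")],
        inputs={"order_id": "123"},                       # now a dict rather than a string
        output="Refund issued",                           # now Any rather than a string
        expected_tools=[{"tool_name": "issue_refund"}],   # keys are illustrative
    )
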
@@ -1,6 +1,6 @@
 
 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Sequence
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: List[Sequence]
-    model: Union[str, List[str], JudgevalJudge]
+    sequences: Optional[List[Sequence]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
judgeval/data/trace.py ADDED
@@ -0,0 +1,129 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[float] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = "  " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
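
The overridden model_dump converts created_at from a POSIX timestamp to an ISO-8601 UTC string and JSON-sanitizes inputs and output. A quick sketch with illustrative values (Trace and TraceSpan are exported from judgeval.data per the __init__.py change above):

    import time
    from judgeval.data import TraceSpan

    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="lookup_weather",
        depth=1,
        created_at=time.time(),     # POSIX timestamp, converted to ISO-8601 in model_dump
        inputs={"city": "Paris"},
        output={"temperature_c": 21},
        duration=0.42,
    )
    span.print_span()               # prints "→ lookup_weather (id: span-1)" indented by depth
    payload = span.model_dump()     # JSON-safe dict with a UTC created_at string
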
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
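
Both EvaluationRun and SequenceRun now default model to "gpt-4.1", so it can be omitted. A sketch; FaithfulnessScorer and its threshold argument are assumed to be available from judgeval.scorers and are not part of this diff:

    from judgeval.data import Example
    from judgeval.evaluation_run import EvaluationRun
    from judgeval.scorers import FaithfulnessScorer  # assumed scorer import

    run = EvaluationRun(
        eval_name="smoke-test",
        examples=[Example(input="What is 2 + 2?", actual_output="4", retrieval_context=["2 + 2 = 4"])],
        scorers=[FaithfulnessScorer(threshold=0.5)],  # illustrative scorer and threshold
        # model omitted: defaults to "gpt-4.1" as of this release
    )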