judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,9 +22,8 @@ from langchain_core.documents import Document
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, tracer: Tracer):
         self.tracer = tracer
-        self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
         self.previous_spans = [] # stack of previous spans
-        self.finished = False
+        self.created_trace = False

         # Attributes for users to access
         self.previous_node = None
@@ -33,43 +32,58 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_tools = []

     def start_span(self, name: str, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         start_time = time.time()
+
+        # Generate a unique ID for *this specific span invocation*
+        span_id = str(uuid.uuid4())
+
+        parent_span_id = current_trace.get_current_span()
+        token = current_trace.set_current_span(span_id) # Set *this* span's ID as the current one
+
+        current_depth = 0
+        if parent_span_id and parent_span_id in current_trace._span_depths:
+            current_depth = current_trace._span_depths[parent_span_id] + 1

+        current_trace._span_depths[span_id] = current_depth # Store depth by span_id
         # Record span entry
-        self.trace_client.add_entry(TraceEntry(
+        current_trace.add_entry(TraceEntry(
             type="enter",
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            parent_span_id=parent_span_id,
             function=name,
-            depth=self.trace_client.tracer.depth,
+            depth=current_depth,
             message=name,
-            timestamp=start_time,
+            created_at=start_time,
             span_type=span_type
         ))

-        self.trace_client.tracer.depth += 1
-        self.previous_spans.append(self.trace_client._current_span)
-        self.trace_client._current_span = name
+        self.previous_spans.append(token)
         self._start_time = start_time

-    def end_span(self, name: str, span_type: SpanType = "span"):
-        self.trace_client.tracer.depth -= 1
+    def end_span(self, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         duration = time.time() - self._start_time
+        span_id = current_trace.get_current_span()
+        exit_depth = current_trace._span_depths.get(span_id, 0) # Get depth using this span's ID

         # Record span exit
-        self.trace_client.add_entry(TraceEntry(
+        current_trace.add_entry(TraceEntry(
             type="exit",
-            function=name,
-            depth=self.trace_client.tracer.depth,
-            message=f"{name}",
-            timestamp=time.time(),
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            depth=exit_depth,
+            created_at=time.time(),
             duration=duration,
             span_type=span_type
         ))
-        self.trace_client._current_span = self.previous_spans.pop()
-
-        if self.trace_client.tracer.depth == 0:
+        current_trace.reset_current_span(self.previous_spans.pop())
+        if exit_depth == 0:
             # Save the trace if we are the root, this is when users dont use any @observe decorators
-            self.trace_client.save(empty_save=False, overwrite=True)
-            self.trace_client._current_trace = None
+            trace_id, trace_data = current_trace.save(overwrite=True)
+            self._trace_id = trace_id
+            current_trace = None

     def on_retriever_start(
         self,
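The rewritten start_span/end_span above drop the shared tracer.depth counter in favor of a per-invocation span_id, a parent_span_id link, and a _span_depths map keyed by span ID, while previous_spans now holds the tokens returned by set_current_span. A minimal, self-contained sketch of that depth bookkeeping (illustrative names only, not judgeval's classes):

    import uuid

    _span_depths = {}   # depth per span id, mirroring the _span_depths map in the diff
    _token_stack = []   # stands in for the context tokens kept in previous_spans

    def start(parent_id):
        """Open a span under parent_id and record its depth."""
        span_id = str(uuid.uuid4())
        depth = _span_depths[parent_id] + 1 if parent_id in _span_depths else 0
        _span_depths[span_id] = depth
        _token_stack.append(span_id)
        return span_id

    root = start(None)         # depth 0
    child = start(root)        # depth 1
    grandchild = start(child)  # depth 2
    print([_span_depths[s] for s in (root, child, grandchild)])  # [0, 1, 2]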
@@ -85,9 +99,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "RETRIEVER_CALL"
         if serialized and "name" in serialized:
             name = f"RETRIEVER_{serialized['name'].upper()}"
-
+        current_trace = self.tracer.get_current_trace()
         self.start_span(name, span_type="retriever")
-        self.trace_client.record_input({
+        current_trace.record_input({
            'query': query,
            'tags': tags,
            'metadata': metadata,
@@ -103,6 +117,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> Any:
         # Process the retrieved documents into a format suitable for logging
+        current_trace = self.tracer.get_current_trace()
         doc_summary = []
         for i, doc in enumerate(documents):
             # Extract key information from each document
@@ -114,13 +129,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             doc_summary.append(doc_data)

         # Record the document data
-        self.trace_client.record_output({
+        current_trace.record_output({
            "document_count": len(documents),
            "documents": doc_summary
        })

         # End the retriever span
-        self.end_span(self.trace_client._current_span, span_type="retriever")
+        self.end_span(span_type="retriever")

     def on_chain_start(
         self,
@@ -134,29 +149,26 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> None:
         # If the user doesnt use any @observe decorators, the first action in LangGraph workflows seems tohave this attribute, so we intialize our trace client here
+        current_trace = self.tracer.get_current_trace()
         if kwargs.get('name') == 'LangGraph':
-            if not self.trace_client:
+            if not current_trace:
+                self.created_trace = True
                 trace_id = str(uuid.uuid4())
                 project = self.tracer.project_name
-                trace = TraceClient(self.tracer, trace_id, trace_id, project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
-                self.trace_client = trace
-                self.tracer._current_trace = trace # set the trace in the original tracer object
-                # Only save empty trace for the root call
-                self.trace_client.save(empty_save=True, overwrite=False)
-
-            self.start_span("LangGraph", span_type="Main Function")
+                trace = TraceClient(self.tracer, trace_id, "Langgraph", project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+                self.tracer.set_current_trace(trace)
+                self.start_span("LangGraph", span_type="Main Function")

-        metadata = kwargs.get("metadata", {})
-        if node := metadata.get("langgraph_node"):
-            if node != self.previous_node:
-                # Track node execution
-                self.trace_client.visited_nodes.append(node)
-                self.trace_client.executed_node_tools.append(node)
-            self.trace_client.record_input({
-                'args': inputs,
-                'kwargs': kwargs
-            })
-            self.previous_node = node
+        node = metadata.get("langgraph_node")
+        if node != None and node != self.previous_node:
+            self.start_span(node, span_type="node")
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+            current_trace.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+            self.previous_node = node

     def on_chain_end(
         self,
@@ -167,14 +179,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         tags: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Any:
-        if outputs == "__end__":
-            self.finished = True
+        current_trace = self.tracer.get_current_trace()
         if tags is not None and any("graph:step" in tag for tag in tags):
-            self.trace_client.record_output(outputs)
-            self.end_span(self.trace_client._current_span, span_type="node")
+            current_trace.record_output(outputs)
+            self.end_span(span_type="node")

-        if self.finished:
-            self.end_span(self.trace_client._current_span, span_type="Main Function")
+        if self.created_trace and (outputs == "__end__" or (not kwargs and not tags)):
+            self.end_span(span_type="Main Function")

     def on_chain_error(
         self,
@@ -184,9 +195,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Chain error: {error}")
-        self.trace_client.record_output(error)
-        self.end_span(self.trace_client._current_span, span_type="node")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="node")

     def on_tool_start(
         self,
@@ -199,19 +210,21 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
         if name:
             # Track tool execution
-            self.trace_client.executed_tools.append(name)
+            current_trace.executed_tools.append(name)
             node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
-            self.trace_client.executed_node_tools.append(node_tool)
-            self.trace_client.record_input({
-                'args': input_str,
-                'kwargs': kwargs
-            })
+            current_trace.executed_node_tools.append(node_tool)
+            current_trace.record_input({
+                'args': input_str,
+                'kwargs': kwargs
+            })

     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
-        self.trace_client.record_output(output)
-        self.end_span(self.trace_client._current_span, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(output)
+        self.end_span(span_type="tool")

     def on_tool_error(
         self,
@@ -221,9 +234,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Tool error: {error}")
-        self.trace_client.record_output(error)
-        self.end_span(self.trace_client._current_span, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="tool")

     def on_agent_action(
         self,
@@ -233,7 +246,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Agent action: {action}")
+        pass

     def on_agent_finish(
         self,
@@ -243,7 +256,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Agent finish: {finish}")
+
+        pass

     def on_llm_start(
         self,
@@ -256,14 +270,16 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         name = "LLM call"
         self.start_span(name, span_type="llm")
-        self.trace_client.record_input({
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
            'args': prompts,
            'kwargs': kwargs
        })

     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
-        self.trace_client.record_output(response.generations[0][0].text)
-        self.end_span(self.trace_client._current_span, span_type="llm")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(response.generations[0][0].text)
+        self.end_span(span_type="llm")

     def on_llm_error(
         self,
@@ -273,9 +289,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"LLM error: {error}")
-        self.trace_client.record_output(error)
-        self.end_span(self.trace_client._current_span, span_type="llm")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="llm")

     def on_chat_model_start(
         self,
@@ -297,7 +313,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "LLM call"

         self.start_span(name, span_type="llm")
-        self.trace_client.record_input({
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
            'args': str(messages),
            'kwargs': kwargs
        })
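For orientation, here is a hedged usage sketch of the updated handler with a toy LangGraph graph. The judgeval import paths and Tracer constructor arguments are assumptions, not taken from this diff; the LangGraph calls are standard.

    from typing import TypedDict
    from langgraph.graph import StateGraph, END

    from judgeval.common.tracer import Tracer                             # path assumed
    from judgeval.integrations.langgraph import JudgevalCallbackHandler   # path assumed

    class State(TypedDict):
        question: str
        answer: str

    def respond(state: State) -> State:
        # Trivial node so the graph is runnable end to end
        return {"question": state["question"], "answer": state["question"].upper()}

    builder = StateGraph(State)
    builder.add_node("respond", respond)
    builder.set_entry_point("respond")
    builder.add_edge("respond", END)
    graph = builder.compile()

    tracer = Tracer(api_key="...", project_name="my_project")   # constructor args assumed
    handler = JudgevalCallbackHandler(tracer)

    # In 0.0.32 the handler no longer caches a TraceClient in __init__; it creates one
    # on the first LangGraph chain start and, once the root span exits, saves the trace
    # and exposes its id as handler._trace_id.
    graph.invoke({"question": "what is judgeval?"}, config={"callbacks": [handler]})
    print(handler._trace_id)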
@@ -10,6 +10,8 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
+    CustomExample,
+    Sequence,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -20,8 +22,10 @@ from judgeval.scorers import (
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
-    assert_test
+    assert_test,
+    run_sequence_eval
 )
+from judgeval.data.sequence_run import SequenceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -78,66 +82,58 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)

-    def run_evaluation(
-        self,
-        examples: List[Example],
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+    def run_sequence_evaluation(
+        self,
+        sequences: List[Sequence],
         model: Union[str, List[str], JudgevalJudge],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
         aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
         project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
+        eval_run_name: str = "default_eval_sequence",
         use_judgment: bool = True,
+        log_results: bool = True,
+        append: bool = False,
+        override: bool = False,
         ignore_errors: bool = True,
-        async_execution: bool = False,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of `Example`s using one or more `Scorer`s
-
-        Args:
-            examples (List[Example]): The examples to evaluate
-            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
-            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
-            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-            log_results (bool): Whether to log the results to the Judgment API
-            project_name (str): The name of the project the evaluation results belong to
-            eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            use_judgment (bool): Whether to use Judgment API for evaluation
-            ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
-            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
-        Returns:
-            List[ScoringResult]: The results of the evaluation
-        """
         try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            loaded_scorers = []
             for scorer in scorers:
                 try:
                     if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                        loaded_scorers.append(scorer.load_implementation())
                     else:
                         loaded_scorers.append(scorer)
                 except Exception as e:
                     raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")

-            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+            def get_all_sequences(root: Sequence) -> List[Sequence]:
+                all_sequences = [root]
+
+                for item in root.items:
+                    if isinstance(item, Sequence):
+                        all_sequences.extend(get_all_sequences(item))
+
+                return all_sequences
+
+            def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
+                flattened = []
+                for seq in sequences:
+                    flattened.extend(get_all_sequences(seq))
+                return flattened
+
+            flattened_sequences = flatten_sequence_list(sequences)
+            for sequence in flattened_sequences:
+                sequence.scorers = loaded_scorers

-            # Convert ScorerWrapper in rules to their implementations
-            loaded_rules = None
             if rules:
                 loaded_rules = []
                 for rule in rules:
@@ -161,57 +157,64 @@ class JudgmentClient(metaclass=SingletonMeta):
                         loaded_rules.append(new_rule)
                     except Exception as e:
                         raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
-            eval = EvaluationRun(
-                log_results=log_results,
+
+            sequence_run = SequenceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
-                examples=examples,
-                scorers=loaded_scorers,
+                sequences=sequences,
                 model=model,
                 aggregator=aggregator,
-                metadata=metadata,
+                log_results=log_results,
+                append=append,
                 judgment_api_key=self.judgment_api_key,
-                rules=loaded_rules,
                 organization_id=self.organization_id
             )
-            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
+            return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
         except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
-    def evaluate_dataset(
+
+    def run_evaluation(
         self,
-        dataset: EvalDataset,
+        examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-        project_name: str = "",
-        eval_run_name: str = "",
         log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
+        override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
+        ignore_errors: bool = True,
+        async_execution: bool = False,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
-        Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
+        Executes an evaluation of `Example`s using one or more `Scorer`s

         Args:
-            dataset (EvalDataset): The dataset containing examples to evaluate
+            examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
             scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
             metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            log_results (bool): Whether to log the results to the Judgment API
+            override (bool): Whether to override an existing evaluation run with the same name
             use_judgment (bool): Whether to use Judgment API for evaluation
+            ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results

         Returns:
             List[ScoringResult]: The results of the evaluation
         """
+        if override and append:
+            raise ValueError("Cannot set both override and append to True. Please choose one.")
+
         try:
             # Load appropriate implementations for all scorers
             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
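The new run_sequence_evaluation path above flattens nested sequences, attaches the loaded scorers, and submits a SequenceRun via run_sequence_eval. A hedged call sketch, assuming Sequence exposes a name field alongside the items and scorers attributes used above, and using a placeholder scorer class:

    from judgeval import JudgmentClient                  # import path assumed
    from judgeval.data import Example, Sequence
    from judgeval.scorers import AnswerRelevancyScorer   # placeholder API scorer; name assumed

    client = JudgmentClient()
    seq = Sequence(
        name="qa_flow",                                  # field assumed
        items=[Example(input="What does judgeval do?", actual_output="It runs LLM evals.")],
    )
    results = client.run_sequence_evaluation(
        sequences=[seq],
        model="gpt-4o",
        scorers=[AnswerRelevancyScorer(threshold=0.7)],
        project_name="default_project",
        eval_run_name="default_eval_sequence",
        log_results=True,
    )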
@@ -253,12 +256,12 @@ class JudgmentClient(metaclass=SingletonMeta):
                         loaded_rules.append(new_rule)
                     except Exception as e:
                         raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
-            evaluation_run = EvaluationRun(
+            eval = EvaluationRun(
                 log_results=log_results,
+                append=append,
                 project_name=project_name,
                 eval_name=eval_run_name,
-                examples=dataset.examples,
+                examples=examples,
                 scorers=loaded_scorers,
                 model=model,
                 aggregator=aggregator,
@@ -267,7 +270,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 rules=loaded_rules,
                 organization_id=self.organization_id
             )
-            return run_eval(evaluation_run)
+            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
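run_evaluation now threads the new append flag into EvaluationRun and forwards override, ignore_errors, and async_execution to run_eval, raising when append and override are both set. A hedged call sketch (the scorer class and Example fields are placeholders not confirmed by this diff):

    from judgeval import JudgmentClient              # import path assumed
    from judgeval.data import Example
    from judgeval.scorers import FaithfulnessScorer  # scorer name assumed

    client = JudgmentClient()
    results = client.run_evaluation(
        examples=[Example(input="Q?", actual_output="A.", retrieval_context=["A."])],
        scorers=[FaithfulnessScorer(threshold=0.8)],
        model="gpt-4o",
        project_name="default_project",
        eval_run_name="default_eval_run",
        append=True,      # new in this version: add results to an existing run
        override=False,   # setting both append and override now raises ValueError
    )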
@@ -292,6 +295,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

+    def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        """
+        Appends an `EvalDataset` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append(alias, examples, project_name)
+
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
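A hedged sketch of the new append_dataset helper, whose signature appears above; the import path and Example fields are assumptions:

    from judgeval import JudgmentClient   # import path assumed
    from judgeval.data import Example

    client = JudgmentClient()
    added = client.append_dataset(
        alias="my_dataset",
        examples=[Example(input="New question?", actual_output="New answer.")],
        project_name="default_project",
    )
    print("appended:", added)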
@@ -355,14 +364,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")

-        eval_run_result = [{}]
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
+        return eval_run.json()

     def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """