judgeval 0.0.30__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,9 +22,8 @@ from langchain_core.documents import Document
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, tracer: Tracer):
         self.tracer = tracer
-        self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
         self.previous_spans = [] # stack of previous spans
-        self.finished = False
+        self.created_trace = False
 
         # Attributes for users to access
         self.previous_node = None
@@ -33,43 +32,58 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_tools = []
 
     def start_span(self, name: str, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         start_time = time.time()
+
+        # Generate a unique ID for *this specific span invocation*
+        span_id = str(uuid.uuid4())
+
+        parent_span_id = current_trace.get_current_span()
+        token = current_trace.set_current_span(span_id) # Set *this* span's ID as the current one
+
+        current_depth = 0
+        if parent_span_id and parent_span_id in current_trace._span_depths:
+            current_depth = current_trace._span_depths[parent_span_id] + 1
 
+        current_trace._span_depths[span_id] = current_depth # Store depth by span_id
         # Record span entry
-        self.trace_client.add_entry(TraceEntry(
+        current_trace.add_entry(TraceEntry(
             type="enter",
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            parent_span_id=parent_span_id,
             function=name,
-            depth=self.trace_client.tracer.depth,
+            depth=current_depth,
             message=name,
-            timestamp=start_time,
+            created_at=start_time,
             span_type=span_type
         ))
 
-        self.trace_client.tracer.depth += 1
-        self.previous_spans.append(self.trace_client._current_span)
-        self.trace_client._current_span = name
+        self.previous_spans.append(token)
        self._start_time = start_time
 
-    def end_span(self, name: str, span_type: SpanType = "span"):
-        self.trace_client.tracer.depth -= 1
+    def end_span(self, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         duration = time.time() - self._start_time
+        span_id = current_trace.get_current_span()
+        exit_depth = current_trace._span_depths.get(span_id, 0) # Get depth using this span's ID
 
         # Record span exit
-        self.trace_client.add_entry(TraceEntry(
+        current_trace.add_entry(TraceEntry(
             type="exit",
-            function=name,
-            depth=self.trace_client.tracer.depth,
-            message=f"{name}",
-            timestamp=time.time(),
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            depth=exit_depth,
+            created_at=time.time(),
             duration=duration,
             span_type=span_type
         ))
-        self.trace_client._current_span = self.previous_spans.pop()
-
-        if self.trace_client.tracer.depth == 0:
+        current_trace.reset_current_span(self.previous_spans.pop())
+        if exit_depth == 0:
            # Save the trace if we are the root, this is when users dont use any @observe decorators
-            self.trace_client.save(empty_save=False, overwrite=True)
-            self.trace_client._current_trace = None
+            trace_id, trace_data = current_trace.save(overwrite=True)
+            self._trace_id = trace_id
+            current_trace = None
 
     def on_retriever_start(
         self,
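
The reworked start_span/end_span above stop mutating a shared tracer.depth counter and instead key everything off per-invocation span IDs: set_current_span returns a token, end_span later hands it back through reset_current_span, and depths are looked up in _span_depths by span ID. The TraceClient internals are not part of this diff; the snippet below is a minimal, hypothetical sketch of that token pattern using only the standard library.

    import contextvars
    import uuid

    # Hypothetical names; the real TraceClient internals are not shown in this diff.
    _current_span_id = contextvars.ContextVar("current_span_id", default=None)
    _span_depths: dict[str, int] = {}

    def set_current_span(span_id: str):
        # Returns a token that can later restore the previous "current span".
        return _current_span_id.set(span_id)

    def reset_current_span(token) -> None:
        _current_span_id.reset(token)

    def open_span() -> tuple[str, contextvars.Token]:
        parent_id = _current_span_id.get()
        span_id = str(uuid.uuid4())
        # Child depth is parent depth + 1; root spans sit at depth 0.
        _span_depths[span_id] = _span_depths[parent_id] + 1 if parent_id in _span_depths else 0
        return span_id, set_current_span(span_id)

    def close_span(span_id: str, token) -> int:
        exit_depth = _span_depths.get(span_id, 0)
        reset_current_span(token)
        return exit_depth

Restoring the parent via a token keeps nesting correct even when spans close across nested or asynchronous callbacks, which a single integer depth counter cannot guarantee.
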
@@ -85,9 +99,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "RETRIEVER_CALL"
         if serialized and "name" in serialized:
             name = f"RETRIEVER_{serialized['name'].upper()}"
-
+        current_trace = self.tracer.get_current_trace()
         self.start_span(name, span_type="retriever")
-        self.trace_client.record_input({
+        current_trace.record_input({
             'query': query,
             'tags': tags,
             'metadata': metadata,
@@ -103,6 +117,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> Any:
         # Process the retrieved documents into a format suitable for logging
+        current_trace = self.tracer.get_current_trace()
         doc_summary = []
         for i, doc in enumerate(documents):
             # Extract key information from each document
@@ -114,13 +129,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             doc_summary.append(doc_data)
 
         # Record the document data
-        self.trace_client.record_output({
+        current_trace.record_output({
             "document_count": len(documents),
             "documents": doc_summary
         })
 
         # End the retriever span
-        self.end_span(self.trace_client._current_span, span_type="retriever")
+        self.end_span(span_type="retriever")
 
     def on_chain_start(
         self,
@@ -134,29 +149,26 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> None:
         # If the user doesnt use any @observe decorators, the first action in LangGraph workflows seems tohave this attribute, so we intialize our trace client here
+        current_trace = self.tracer.get_current_trace()
         if kwargs.get('name') == 'LangGraph':
-            if not self.trace_client:
+            if not current_trace:
+                self.created_trace = True
                 trace_id = str(uuid.uuid4())
                 project = self.tracer.project_name
-                trace = TraceClient(self.tracer, trace_id, trace_id, project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
-                self.trace_client = trace
-                self.tracer._current_trace = trace # set the trace in the original tracer object
-                # Only save empty trace for the root call
-                self.trace_client.save(empty_save=True, overwrite=False)
-
-            self.start_span("LangGraph", span_type="Main Function")
+                trace = TraceClient(self.tracer, trace_id, "Langgraph", project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+                self.tracer.set_current_trace(trace)
+            self.start_span("LangGraph", span_type="Main Function")
 
-        metadata = kwargs.get("metadata", {})
-        if node := metadata.get("langgraph_node"):
-            if node != self.previous_node:
-                # Track node execution
-                self.trace_client.visited_nodes.append(node)
-                self.trace_client.executed_node_tools.append(node)
-                self.trace_client.record_input({
-                    'args': inputs,
-                    'kwargs': kwargs
-                })
-            self.previous_node = node
+        node = metadata.get("langgraph_node")
+        if node != None and node != self.previous_node:
+            self.start_span(node, span_type="node")
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+            current_trace.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+            self.previous_node = node
 
     def on_chain_end(
         self,
@@ -167,14 +179,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         tags: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Any:
-        if outputs == "__end__":
-            self.finished = True
+        current_trace = self.tracer.get_current_trace()
         if tags is not None and any("graph:step" in tag for tag in tags):
-            self.trace_client.record_output(outputs)
-            self.end_span(self.trace_client._current_span, span_type="node")
+            current_trace.record_output(outputs)
+            self.end_span(span_type="node")
 
-        if self.finished:
-            self.end_span(self.trace_client._current_span, span_type="Main Function")
+        if self.created_trace and (outputs == "__end__" or (not kwargs and not tags)):
+            self.end_span(span_type="Main Function")
 
     def on_chain_error(
         self,
@@ -184,9 +195,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Chain error: {error}")
-        self.trace_client.record_output(error)
-        self.end_span(self.trace_client._current_span, span_type="node")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="node")
 
     def on_tool_start(
         self,
@@ -199,19 +210,21 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
         if name:
             # Track tool execution
-            self.trace_client.executed_tools.append(name)
+            current_trace.executed_tools.append(name)
             node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
-            self.trace_client.executed_node_tools.append(node_tool)
-            self.trace_client.record_input({
-                'args': input_str,
-                'kwargs': kwargs
-            })
+            current_trace.executed_node_tools.append(node_tool)
+            current_trace.record_input({
+                'args': input_str,
+                'kwargs': kwargs
+            })
 
     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
-        self.trace_client.record_output(output)
-        self.end_span(self.trace_client._current_span, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(output)
+        self.end_span(span_type="tool")
 
     def on_tool_error(
         self,
@@ -221,9 +234,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Tool error: {error}")
-        self.trace_client.record_output(error)
-        self.end_span(self.trace_client._current_span, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="tool")
 
     def on_agent_action(
         self,
@@ -233,7 +246,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Agent action: {action}")
+        pass
 
     def on_agent_finish(
         self,
@@ -243,7 +256,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"Agent finish: {finish}")
+
+        pass
 
     def on_llm_start(
         self,
@@ -256,14 +270,16 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         name = "LLM call"
         self.start_span(name, span_type="llm")
-        self.trace_client.record_input({
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': prompts,
             'kwargs': kwargs
         })
 
     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
-        self.trace_client.record_output(response.generations[0][0].text)
-        self.end_span(self.trace_client._current_span, span_type="llm")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(response.generations[0][0].text)
+        self.end_span(span_type="llm")
 
     def on_llm_error(
         self,
@@ -273,9 +289,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-        print(f"LLM error: {error}")
-        self.trace_client.record_output(error)
-        self.end_span(self.trace_client._current_span, span_type="llm")
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="llm")
 
     def on_chat_model_start(
         self,
@@ -297,7 +313,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             name = "LLM call"
 
         self.start_span(name, span_type="llm")
-        self.trace_client.record_input({
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': str(messages),
             'kwargs': kwargs
         })
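
Taken together, the handler hunks above replace the cached trace_client with a tracer.get_current_trace() lookup in every callback, and the handler now opens (and later saves) its own trace when no @observe decorator created one, recording the outcome in created_trace and _trace_id. A minimal wiring sketch follows; the import paths, Tracer arguments, and the toy graph are assumptions for illustration, not taken from this diff.

    # Hedged sketch: attaching the reworked handler to a LangGraph run.
    from typing import TypedDict

    from langgraph.graph import StateGraph, START, END

    from judgeval.common.tracer import Tracer                             # assumed module path
    from judgeval.integrations.langgraph import JudgevalCallbackHandler   # assumed module path

    class State(TypedDict):
        value: str

    def greet(state: State) -> State:
        return {"value": state["value"] + " -> greeted"}

    builder = StateGraph(State)
    builder.add_node("greet", greet)
    builder.add_edge(START, "greet")
    builder.add_edge("greet", END)
    graph = builder.compile()

    tracer = Tracer(project_name="my_project")   # assumed constructor arguments
    handler = JudgevalCallbackHandler(tracer)

    # With no @observe decorator in play, the handler creates its own trace
    # (created_trace=True) and saves it when the root "LangGraph" span exits.
    result = graph.invoke({"value": "hi"}, config={"callbacks": [handler]})
    print(handler.previous_node)
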
@@ -10,6 +10,8 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
+    CustomExample,
+    Sequence,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -20,8 +22,10 @@ from judgeval.scorers import (
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
-    assert_test
+    assert_test,
+    run_sequence_eval
 )
+from judgeval.data.sequence_run import SequenceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -78,15 +82,71 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)
+
+    def run_sequence_evaluation(
+        self,
+        sequences: List[Sequence],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_sequence",
+        use_judgment: bool = True,
+        log_results: bool = True,
+        override: bool = False,
+        ignore_errors: bool = True,
+        rules: Optional[List[Rule]] = None
+    ) -> List[ScoringResult]:
+        try:
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+
+            sequence_run = SequenceRun(
+                project_name=project_name,
+                eval_name=eval_run_name,
+                sequences=sequences,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                judgment_api_key=self.judgment_api_key,
+                organization_id=self.organization_id
+            )
+
+            return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
+        except ValueError as e:
+            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def run_evaluation(
         self,
-        examples: List[Example],
+        examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
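
The new run_sequence_evaluation method packages Sequence objects into a SequenceRun and hands it to run_sequence_eval, reusing the same rule-processing logic as run_evaluation. A hedged usage sketch is below; the Sequence constructor fields and the judge model name are assumptions, not taken from this diff.

    from judgeval import JudgmentClient
    from judgeval.data import Example, Sequence

    client = JudgmentClient()

    # Field names on Sequence are assumptions for illustration.
    sequence = Sequence(
        name="support_flow",
        items=[
            Example(input="Where is my order?", actual_output="It ships tomorrow."),
        ],
    )

    results = client.run_sequence_evaluation(
        sequences=[sequence],
        model="gpt-4o",                      # illustrative judge model
        project_name="default_project",
        eval_run_name="sequence_smoke_test",
    )
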
@@ -95,6 +155,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         async_execution: bool = False,
@@ -104,7 +165,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         Executes an evaluation of `Example`s using one or more `Scorer`s
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
             scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -120,6 +181,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
+        if override and append:
+            raise ValueError("Cannot set both override and append to True. Please choose one.")
+
         try:
             # Load appropriate implementations for all scorers
             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
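
run_evaluation (and evaluate_dataset, which forwards to it) now takes an append flag that is rejected when combined with override, so repeated runs can add results to an existing eval run instead of tripping the duplicate-name check. A hedged sketch, with illustrative example data and scorer choice:

    from judgeval import JudgmentClient
    from judgeval.data import Example
    from judgeval.scorers import AnswerRelevancyScorer   # illustrative scorer choice

    client = JudgmentClient()

    results = client.run_evaluation(
        examples=[Example(input="What is 2 + 2?", actual_output="4")],
        scorers=[AnswerRelevancyScorer(threshold=0.7)],
        model="gpt-4o",
        project_name="default_project",
        eval_run_name="nightly_run",
        append=True,   # add to the existing run; cannot be combined with override=True
    )
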
@@ -161,9 +225,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                        loaded_rules.append(new_rule)
                    except Exception as e:
                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
            eval = EvaluationRun(
                log_results=log_results,
+                append=append,
                project_name=project_name,
                eval_name=eval_run_name,
                examples=examples,
@@ -180,7 +244,7 @@ class JudgmentClient(metaclass=SingletonMeta):
            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
        except Exception as e:
            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
+
     def evaluate_dataset(
         self,
         dataset: EvalDataset,
@@ -292,6 +356,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
+    def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        """
+        Appends an `EvalDataset` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append(alias, examples, project_name)
+
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
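
append_dataset is a thin wrapper around eval_dataset_client.append for adding examples to a dataset that was already pushed under the given alias. A hedged sketch (the dataset alias and example fields are illustrative):

    from judgeval import JudgmentClient
    from judgeval.data import Example

    client = JudgmentClient()

    # Assumes a dataset named "qa_regression" was previously pushed for this project.
    ok = client.append_dataset(
        alias="qa_regression",
        examples=[Example(input="New edge case?", actual_output="Handled.")],
        project_name="default_project",
    )
    print("appended:", ok)
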
@@ -355,14 +425,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
-        eval_run_result = [{}]
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
+        return eval_run.json()
 
     def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """
@@ -4,14 +4,15 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
     ScorerData,
     ScoringResult,
-    Example
+    Example,
+    CustomExample
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -22,6 +23,7 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
+    JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
@@ -34,7 +36,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-
+from judgeval.data.sequence_run import SequenceRun
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -91,6 +93,36 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
+def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+    """
+    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+    """
+
+    try:
+        # submit API request to execute evals
+        payload = sequence_run.model_dump(warnings=False)
+        response = requests.post(
+            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
+                "X-Organization-Id": sequence_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        response_data = response.json()
+    except Exception as e:
+        error(f"Error: {e}")
+        details = response.json().get("detail", "No details provided")
+        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+    # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
+    if not response.ok:
+        error_message = response_data.get('detail', 'An unknown error occurred.')
+        error(f"Error: {error_message=}")
+        raise JudgmentAPIError(error_message)
+    return response_data
 
 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -197,8 +229,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         )
 
         if response.status_code == 409:
-            error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
-            raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
+            error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
+            raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
 
         if not response.ok:
             response_data = response.json()
@@ -211,7 +243,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
+def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -228,13 +260,12 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             JUDGMENT_EVAL_LOG_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id
+                "Authorization": f"Bearer {run.judgment_api_key}",
+                "X-Organization-Id": run.organization_id
             },
             json={
-                "results": [result.to_dict() for result in merged_results],
-                "project_name": evaluation_run.project_name,
-                "eval_name": evaluation_run.eval_name,
+                "results": [result.model_dump(warnings=False) for result in merged_results],
+                "run": run.model_dump(warnings=False)
             },
             verify=True
         )
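
With this change, log_evaluation_results accepts either an EvaluationRun or a SequenceRun and posts the serialized run object next to the results, rather than separate project_name/eval_name fields. Roughly, the request body now looks like the sketch below; names come from the hunk above, the field values and ScoringResult keys shown are illustrative assumptions.

    # Approximate shape of the JUDGMENT_EVAL_LOG_API_URL request body after this change.
    body = {
        "results": [
            {"success": True, "scorers_data": []},   # one entry per ScoringResult.model_dump(warnings=False)
        ],
        "run": {
            "project_name": "default_project",       # now carried inside the serialized run
            "eval_name": "nightly_run",
            # ...remaining EvaluationRun / SequenceRun fields...
        },
    }
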
@@ -303,6 +334,42 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and sequence_run.log_results:
+        check_eval_run_name_exists(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id
+        )
+
+    # Execute evaluation using Judgment API
+    info("Starting API evaluation")
+    try: # execute an EvaluationRun with just JudgmentScorers
+        debug("Sending request to Judgment API")
+        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+
+        info(f"Received {len(response_data['results'])} results from API")
+    except JudgmentAPIError as e:
+        error(f"An error occurred while executing the Judgment API request: {str(e)}")
+        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+
+    # Convert the response data to `ScoringResult` objects
+    debug("Processing API results")
+    api_results = []
+    for result in response_data["results"]:
+        api_results.append(ScoringResult(**result))
+
+    # TODO: allow for custom scorer on sequences
+    if sequence_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+        rprint(pretty_str)
+
+
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -329,7 +396,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     """
 
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and evaluation_run.log_results:
+    if not override and evaluation_run.log_results and not evaluation_run.append:
         check_eval_run_name_exists(
             evaluation_run.eval_name,
             evaluation_run.project_name,
@@ -373,12 +440,20 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             local_scorers.append(scorer)
             debug(f"Added local scorer: {type(scorer).__name__}")
 
+    custom_example_check = [scorer.custom_example for scorer in local_scorers]
+    if any(custom_example_check) and not all(custom_example_check):
+        error("All scorers must be custom scorers if using custom examples")
+        raise ValueError("All scorers must be custom scorers if using custom examples")
+
     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
 
     if async_execution:
+        if len(local_scorers) > 0:
+            error("Local scorers are not supported in async execution")
+
         check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting async evaluation")
         payload = evaluation_run.model_dump(warnings=False)
@@ -396,7 +471,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     else:
         if judgment_scorers:
             # Execute evaluation using Judgment API
-            check_examples(evaluation_run.examples, evaluation_run.scorers)
             info("Starting API evaluation")
             debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
             try: # execute an EvaluationRun with just JudgmentScorers
@@ -17,6 +17,7 @@ from judgeval.scorers.judgeval_scorers import (
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
+    DerailmentScorer,
 )
 
 __all__ = [
@@ -39,4 +40,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
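
The scorer exports now include DerailmentScorer. A hedged usage sketch follows; the constructor argument mirrors the pattern of the other judgeval API scorers and, like the example data and model name, is an assumption rather than something shown in this diff.

    from judgeval import JudgmentClient
    from judgeval.data import Example
    from judgeval.scorers import DerailmentScorer

    client = JudgmentClient()

    results = client.run_evaluation(
        examples=[Example(input="Help me reset my password", actual_output="Sure, click 'Forgot password' on the login page.")],
        scorers=[DerailmentScorer(threshold=0.5)],   # threshold kwarg assumed, mirroring other judgeval scorers
        model="gpt-4o",
        eval_run_name="derailment_check",
    )
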