judgeval 0.0.41__py3-none-any.whl → 0.0.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/s3_storage.py +3 -1
- judgeval/common/tracer.py +921 -103
- judgeval/common/utils.py +1 -1
- judgeval/constants.py +5 -0
- judgeval/data/trace.py +2 -1
- judgeval/integrations/langgraph.py +218 -34
- judgeval/rules.py +60 -50
- judgeval/run_evaluation.py +36 -26
- judgeval/utils/alerts.py +8 -0
- {judgeval-0.0.41.dist-info → judgeval-0.0.42.dist-info}/METADATA +35 -46
- {judgeval-0.0.41.dist-info → judgeval-0.0.42.dist-info}/RECORD +13 -13
- {judgeval-0.0.41.dist-info → judgeval-0.0.42.dist-info}/WHEEL +0 -0
- {judgeval-0.0.41.dist-info → judgeval-0.0.42.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -103,7 +103,7 @@ def validate_api_key(judgment_api_key: str):
|
|
103
103
|
Validates that the user api key is valid
|
104
104
|
"""
|
105
105
|
response = requests.post(
|
106
|
-
f"{ROOT_API}/validate_api_key/",
|
106
|
+
f"{ROOT_API}/auth/validate_api_key/",
|
107
107
|
headers={
|
108
108
|
"Content-Type": "application/json",
|
109
109
|
"Authorization": f"Bearer {judgment_api_key}",
|
judgeval/constants.py
CHANGED
@@ -58,8 +58,13 @@ JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
|
|
58
58
|
JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
|
59
59
|
JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
|
60
60
|
JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
|
61
|
+
JUDGMENT_TRACES_UPSERT_API_URL = f"{ROOT_API}/traces/upsert/"
|
62
|
+
JUDGMENT_TRACES_USAGE_CHECK_API_URL = f"{ROOT_API}/traces/usage/check/"
|
63
|
+
JUDGMENT_TRACES_USAGE_UPDATE_API_URL = f"{ROOT_API}/traces/usage/update/"
|
61
64
|
JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
|
62
65
|
JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
|
66
|
+
JUDGMENT_TRACES_SPANS_BATCH_API_URL = f"{ROOT_API}/traces/spans/batch/"
|
67
|
+
JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = f"{ROOT_API}/traces/evaluation_runs/batch/"
|
63
68
|
JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
|
64
69
|
JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
|
65
70
|
# RabbitMQ
|
judgeval/data/trace.py
CHANGED
@@ -54,7 +54,8 @@ class TraceSpan(BaseModel):
|
|
54
54
|
"has_evaluation": self.has_evaluation,
|
55
55
|
"agent_name": self.agent_name,
|
56
56
|
"state_before": self.state_before,
|
57
|
-
"state_after": self.state_after
|
57
|
+
"state_after": self.state_after,
|
58
|
+
"additional_metadata": self._serialize_value(self.additional_metadata)
|
58
59
|
}
|
59
60
|
|
60
61
|
def print_span(self):
|
judgeval/integrations/langgraph.py
CHANGED
@@ -3,9 +3,11 @@ from uuid import UUID
|
|
3
3
|
import time
|
4
4
|
import uuid
|
5
5
|
import contextvars # <--- Import contextvars
|
6
|
+
from datetime import datetime
|
6
7
|
|
7
|
-
from judgeval.common.tracer import TraceClient, TraceSpan, Tracer, SpanType, EvaluationConfig
|
8
|
+
from judgeval.common.tracer import TraceClient, TraceSpan, Tracer, SpanType, EvaluationConfig, cost_per_token
|
8
9
|
from judgeval.data import Example # Import Example
|
10
|
+
from judgeval.data.trace import TraceUsage
|
9
11
|
|
10
12
|
from langchain_core.callbacks import BaseCallbackHandler
|
11
13
|
from langchain_core.agents import AgentAction, AgentFinish
|
@@ -36,18 +38,48 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
36
38
|
def __init__(self, tracer: Tracer):
|
37
39
|
|
38
40
|
self.tracer = tracer
|
41
|
+
# Initialize tracking/logging variables (preserved across resets)
|
42
|
+
self.executed_nodes: List[str] = []
|
43
|
+
self.executed_tools: List[str] = []
|
44
|
+
self.executed_node_tools: List[str] = []
|
45
|
+
self.traces: List[Dict[str, Any]] = []
|
46
|
+
# Initialize execution state (reset between runs)
|
47
|
+
self._reset_state()
|
48
|
+
# --- END NEW __init__ ---
|
49
|
+
|
50
|
+
def _reset_state(self):
|
51
|
+
"""Reset only the critical execution state for reuse across multiple executions"""
|
52
|
+
# Reset core execution state that must be cleared between runs
|
39
53
|
self._trace_client: Optional[TraceClient] = None
|
40
54
|
self._run_id_to_span_id: Dict[UUID, str] = {}
|
41
55
|
self._span_id_to_start_time: Dict[str, float] = {}
|
42
56
|
self._span_id_to_depth: Dict[str, int] = {}
|
43
57
|
self._root_run_id: Optional[UUID] = None
|
44
|
-
self._trace_saved: bool = False
|
45
|
-
|
46
|
-
self.
|
58
|
+
self._trace_saved: bool = False
|
59
|
+
self.span_id_to_token: Dict[str, Any] = {}
|
60
|
+
self.trace_id_to_token: Dict[str, Any] = {}
|
61
|
+
|
62
|
+
# Add timestamp to track when we last reset
|
63
|
+
self._last_reset_time: float = time.time()
|
64
|
+
|
65
|
+
# Preserve tracking/logging variables across executions:
|
66
|
+
# - self.executed_nodes: List[str] = [] # Keep as running log
|
67
|
+
# - self.executed_tools: List[str] = [] # Keep as running log
|
68
|
+
# - self.executed_node_tools: List[str] = [] # Keep as running log
|
69
|
+
# - self.traces: List[Dict[str, Any]] = [] # Keep for collecting multiple traces
|
70
|
+
|
71
|
+
def reset(self):
|
72
|
+
"""Public method to manually reset handler execution state for reuse"""
|
73
|
+
self._reset_state()
|
74
|
+
|
75
|
+
def reset_all(self):
|
76
|
+
"""Public method to reset ALL handler state including tracking/logging data"""
|
77
|
+
self._reset_state()
|
78
|
+
# Also reset tracking/logging variables
|
79
|
+
self.executed_nodes: List[str] = []
|
47
80
|
self.executed_tools: List[str] = []
|
48
81
|
self.executed_node_tools: List[str] = []
|
49
82
|
self.traces: List[Dict[str, Any]] = []
|
50
|
-
# --- END NEW __init__ ---
|
51
83
|
|
52
84
|
# --- MODIFIED _ensure_trace_client ---
|
53
85
|
def _ensure_trace_client(self, run_id: UUID, parent_run_id: Optional[UUID], event_name: str) -> Optional[TraceClient]:
|
@@ -57,6 +89,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
57
89
|
Returns the client or None.
|
58
90
|
"""
|
59
91
|
|
92
|
+
# If this is a potential new root execution (no parent_run_id) and we had a previous trace saved,
|
93
|
+
# reset state to allow reuse of the handler
|
94
|
+
if parent_run_id is None and self._trace_saved:
|
95
|
+
self._reset_state()
|
96
|
+
|
60
97
|
# If a client already exists, return it.
|
61
98
|
if self._trace_client:
|
62
99
|
return self._trace_client
|
@@ -73,11 +110,25 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
73
110
|
enable_evaluations=self.tracer.enable_evaluations
|
74
111
|
)
|
75
112
|
self._trace_client = client_instance
|
113
|
+
token = self.tracer.set_current_trace(self._trace_client)
|
114
|
+
if token:
|
115
|
+
self.trace_id_to_token[trace_id] = token
|
76
116
|
if self._trace_client:
|
77
117
|
self._root_run_id = run_id # Assign the first run_id encountered as the tentative root
|
78
118
|
self._trace_saved = False # Ensure flag is reset
|
79
119
|
# Set active client on Tracer (important for potential fallbacks)
|
80
120
|
self.tracer._active_trace_client = self._trace_client
|
121
|
+
|
122
|
+
# NEW: Initial save for live tracking (follows the new practice)
|
123
|
+
try:
|
124
|
+
trace_id_saved, server_response = self._trace_client.save_with_rate_limiting(
|
125
|
+
overwrite=self._trace_client.overwrite,
|
126
|
+
final_save=False # Initial save for live tracking
|
127
|
+
)
|
128
|
+
except Exception as e:
|
129
|
+
import warnings
|
130
|
+
warnings.warn(f"Failed to save initial trace for live tracking: {e}")
|
131
|
+
|
81
132
|
return self._trace_client
|
82
133
|
else:
|
83
134
|
return None
|
@@ -112,12 +163,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
112
163
|
self._span_id_to_start_time[span_id] = start_time
|
113
164
|
self._span_id_to_depth[span_id] = current_depth
|
114
165
|
|
115
|
-
|
116
|
-
# --- Set SPAN context variable ONLY for chain (node) spans (Sync version) ---
|
117
|
-
if span_type == "chain":
|
118
|
-
self.tracer.set_current_span(span_id)
|
119
|
-
|
120
|
-
new_trace = TraceSpan(
|
166
|
+
new_span = TraceSpan(
|
121
167
|
span_id=span_id,
|
122
168
|
trace_id=trace_client.trace_id,
|
123
169
|
parent_span_id=parent_span_id,
|
@@ -127,9 +173,36 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
127
173
|
span_type=span_type
|
128
174
|
)
|
129
175
|
|
130
|
-
|
131
|
-
|
132
|
-
|
176
|
+
# Separate metadata from inputs
|
177
|
+
if inputs:
|
178
|
+
metadata = {}
|
179
|
+
clean_inputs = {}
|
180
|
+
|
181
|
+
# Extract metadata fields
|
182
|
+
metadata_fields = ['tags', 'metadata', 'kwargs', 'serialized']
|
183
|
+
for field in metadata_fields:
|
184
|
+
if field in inputs:
|
185
|
+
metadata[field] = inputs.pop(field)
|
186
|
+
|
187
|
+
# Store the remaining inputs
|
188
|
+
clean_inputs = inputs
|
189
|
+
|
190
|
+
# Set both fields on the span
|
191
|
+
new_span.inputs = clean_inputs
|
192
|
+
new_span.additional_metadata = metadata
|
193
|
+
else:
|
194
|
+
new_span.inputs = {}
|
195
|
+
new_span.additional_metadata = {}
|
196
|
+
|
197
|
+
trace_client.add_span(new_span)
|
198
|
+
|
199
|
+
# Queue span with initial state (input phase) through background service
|
200
|
+
if trace_client.background_span_service:
|
201
|
+
trace_client.background_span_service.queue_span(new_span, span_state="input")
|
202
|
+
|
203
|
+
token = self.tracer.set_current_span(span_id)
|
204
|
+
if token:
|
205
|
+
self.span_id_to_token[span_id] = token
|
133
206
|
|
134
207
|
def _end_span_tracking(
|
135
208
|
self,
|
@@ -142,6 +215,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
142
215
|
|
143
216
|
# Get span ID and check if it exists
|
144
217
|
span_id = self._run_id_to_span_id.get(run_id)
|
218
|
+
token = self.span_id_to_token.pop(span_id, None)
|
219
|
+
self.tracer.reset_current_span(token, span_id)
|
145
220
|
|
146
221
|
start_time = self._span_id_to_start_time.get(span_id) if span_id else None
|
147
222
|
duration = time.time() - start_time if start_time is not None else None
|
@@ -151,7 +226,38 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
151
226
|
trace_span = trace_client.span_id_to_span.get(span_id)
|
152
227
|
if trace_span:
|
153
228
|
trace_span.duration = duration
|
154
|
-
|
229
|
+
|
230
|
+
# Handle outputs and error
|
231
|
+
if error:
|
232
|
+
trace_span.output = error
|
233
|
+
elif outputs:
|
234
|
+
# Separate metadata from outputs
|
235
|
+
metadata = {}
|
236
|
+
clean_outputs = {}
|
237
|
+
|
238
|
+
# Extract metadata fields
|
239
|
+
metadata_fields = ['tags', 'kwargs']
|
240
|
+
if isinstance(outputs, dict):
|
241
|
+
for field in metadata_fields:
|
242
|
+
if field in outputs:
|
243
|
+
metadata[field] = outputs.pop(field)
|
244
|
+
|
245
|
+
# Store the remaining outputs
|
246
|
+
clean_outputs = outputs
|
247
|
+
else:
|
248
|
+
clean_outputs = outputs
|
249
|
+
|
250
|
+
# Set both fields on the span
|
251
|
+
trace_span.output = clean_outputs
|
252
|
+
if metadata:
|
253
|
+
# Merge with existing metadata
|
254
|
+
existing_metadata = trace_span.additional_metadata or {}
|
255
|
+
trace_span.additional_metadata = {**existing_metadata, **metadata}
|
256
|
+
|
257
|
+
# Queue span with completed state through background service
|
258
|
+
if trace_client.background_span_service:
|
259
|
+
span_state = "error" if error else "completed"
|
260
|
+
trace_client.background_span_service.queue_span(trace_span, span_state=span_state)
|
155
261
|
|
156
262
|
# Clean up dictionaries for this specific span
|
157
263
|
if span_id in self._span_id_to_start_time: del self._span_id_to_start_time[span_id]
|
@@ -165,9 +271,30 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
165
271
|
# Reset input storage for this handler instance
|
166
272
|
|
167
273
|
if self._trace_client and not self._trace_saved: # Check if not already saved
|
168
|
-
#
|
169
|
-
|
170
|
-
|
274
|
+
# Flush background spans before saving the final trace
|
275
|
+
|
276
|
+
complete_trace_data = {
|
277
|
+
"trace_id": self._trace_client.trace_id,
|
278
|
+
"name": self._trace_client.name,
|
279
|
+
"created_at": datetime.utcfromtimestamp(self._trace_client.start_time).isoformat(),
|
280
|
+
"duration": self._trace_client.get_duration(),
|
281
|
+
"trace_spans": [span.model_dump() for span in self._trace_client.trace_spans],
|
282
|
+
"overwrite": self._trace_client.overwrite,
|
283
|
+
"offline_mode": self.tracer.offline_mode,
|
284
|
+
"parent_trace_id": self._trace_client.parent_trace_id,
|
285
|
+
"parent_name": self._trace_client.parent_name
|
286
|
+
}
|
287
|
+
|
288
|
+
# NEW: Use save_with_rate_limiting with final_save=True for final save
|
289
|
+
trace_id, trace_data = self._trace_client.save_with_rate_limiting(
|
290
|
+
overwrite=self._trace_client.overwrite,
|
291
|
+
final_save=True # Final save with usage counter updates
|
292
|
+
)
|
293
|
+
token = self.trace_id_to_token.pop(trace_id, None)
|
294
|
+
self.tracer.reset_current_trace(token, trace_id)
|
295
|
+
|
296
|
+
# Store complete trace data instead of server response
|
297
|
+
self.tracer.traces.append(complete_trace_data)
|
171
298
|
self._trace_saved = True # Set flag only after successful save
|
172
299
|
finally:
|
173
300
|
# --- NEW: Consolidated Cleanup Logic ---
|
@@ -254,10 +381,26 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
254
381
|
# --- Root node cleanup (Existing logic - slightly modified save call) ---
|
255
382
|
if run_id == self._root_run_id:
|
256
383
|
if trace_client and not self._trace_saved:
|
257
|
-
#
|
258
|
-
|
259
|
-
|
260
|
-
|
384
|
+
# Store complete trace data instead of server response
|
385
|
+
complete_trace_data = {
|
386
|
+
"trace_id": trace_client.trace_id,
|
387
|
+
"name": trace_client.name,
|
388
|
+
"created_at": datetime.utcfromtimestamp(trace_client.start_time).isoformat(),
|
389
|
+
"duration": trace_client.get_duration(),
|
390
|
+
"trace_spans": [span.model_dump() for span in trace_client.trace_spans],
|
391
|
+
"overwrite": trace_client.overwrite,
|
392
|
+
"offline_mode": self.tracer.offline_mode,
|
393
|
+
"parent_trace_id": trace_client.parent_trace_id,
|
394
|
+
"parent_name": trace_client.parent_name
|
395
|
+
}
|
396
|
+
# NEW: Use save_with_rate_limiting with final_save=True for final save
|
397
|
+
trace_id_saved, trace_data = trace_client.save_with_rate_limiting(
|
398
|
+
overwrite=trace_client.overwrite,
|
399
|
+
final_save=True # Final save with usage counter updates
|
400
|
+
)
|
401
|
+
|
402
|
+
|
403
|
+
self.tracer.traces.append(complete_trace_data)
|
261
404
|
self._trace_saved = True
|
262
405
|
# Reset tracer's active client *after* successful save
|
263
406
|
if self.tracer._active_trace_client == trace_client:
|
@@ -333,11 +476,23 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
333
476
|
if not trace_client:
|
334
477
|
return
|
335
478
|
outputs = {"response": response, "kwargs": kwargs}
|
336
|
-
|
337
|
-
|
338
|
-
prompt_tokens = None
|
339
|
-
completion_tokens = None
|
479
|
+
|
480
|
+
# --- Token Usage Extraction and Cost Calculation ---
|
481
|
+
prompt_tokens = None
|
482
|
+
completion_tokens = None
|
340
483
|
total_tokens = None
|
484
|
+
model_name = None
|
485
|
+
|
486
|
+
# Extract model name from response if available
|
487
|
+
if hasattr(response, 'llm_output') and response.llm_output and isinstance(response.llm_output, dict):
|
488
|
+
model_name = response.llm_output.get('model_name') or response.llm_output.get('model')
|
489
|
+
|
490
|
+
# Try to get model from the first generation if available
|
491
|
+
if not model_name and response.generations and len(response.generations) > 0:
|
492
|
+
if hasattr(response.generations[0][0], 'generation_info') and response.generations[0][0].generation_info:
|
493
|
+
gen_info = response.generations[0][0].generation_info
|
494
|
+
model_name = gen_info.get('model') or gen_info.get('model_name')
|
495
|
+
|
341
496
|
if response.llm_output and isinstance(response.llm_output, dict):
|
342
497
|
# Check for OpenAI/standard 'token_usage' first
|
343
498
|
if 'token_usage' in response.llm_output:
|
@@ -356,14 +511,43 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
356
511
|
if prompt_tokens is not None and completion_tokens is not None:
|
357
512
|
total_tokens = prompt_tokens + completion_tokens
|
358
513
|
|
359
|
-
# ---
|
514
|
+
# --- Create TraceUsage object and set on span ---
|
360
515
|
if prompt_tokens is not None or completion_tokens is not None:
|
361
|
-
#
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
516
|
+
# Calculate costs if model name is available
|
517
|
+
prompt_cost = None
|
518
|
+
completion_cost = None
|
519
|
+
total_cost_usd = None
|
520
|
+
|
521
|
+
if model_name and prompt_tokens is not None and completion_tokens is not None:
|
522
|
+
try:
|
523
|
+
prompt_cost, completion_cost = cost_per_token(
|
524
|
+
model=model_name,
|
525
|
+
prompt_tokens=prompt_tokens,
|
526
|
+
completion_tokens=completion_tokens
|
527
|
+
)
|
528
|
+
total_cost_usd = (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
|
529
|
+
except Exception as e:
|
530
|
+
# If cost calculation fails, continue without costs
|
531
|
+
import warnings
|
532
|
+
warnings.warn(f"Failed to calculate token costs for model {model_name}: {e}")
|
533
|
+
|
534
|
+
# Create TraceUsage object
|
535
|
+
usage = TraceUsage(
|
536
|
+
prompt_tokens=prompt_tokens,
|
537
|
+
completion_tokens=completion_tokens,
|
538
|
+
total_tokens=total_tokens or (prompt_tokens + completion_tokens if prompt_tokens and completion_tokens else None),
|
539
|
+
prompt_tokens_cost_usd=prompt_cost,
|
540
|
+
completion_tokens_cost_usd=completion_cost,
|
541
|
+
total_cost_usd=total_cost_usd,
|
542
|
+
model_name=model_name
|
543
|
+
)
|
544
|
+
|
545
|
+
# Set usage on the actual span (not in outputs)
|
546
|
+
span_id = self._run_id_to_span_id.get(run_id)
|
547
|
+
if span_id and span_id in trace_client.span_id_to_span:
|
548
|
+
trace_span = trace_client.span_id_to_span[span_id]
|
549
|
+
trace_span.usage = usage
|
550
|
+
|
367
551
|
|
368
552
|
self._end_span_tracking(trace_client, run_id, outputs=outputs)
|
369
553
|
# --- End Token Usage ---
|
@@ -416,4 +600,4 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
416
600
|
if not trace_client: return
|
417
601
|
|
418
602
|
outputs = {'return_values': finish.return_values, 'log': finish.log, 'messages': finish.messages, 'kwargs': kwargs}
|
419
|
-
self._end_span_tracking(trace_client, run_id, outputs=outputs)
|
603
|
+
self._end_span_tracking(trace_client, run_id, outputs=outputs)
|
judgeval/rules.py
CHANGED
@@ -9,13 +9,13 @@ import asyncio
|
|
9
9
|
from concurrent.futures import ThreadPoolExecutor
|
10
10
|
import time
|
11
11
|
import uuid
|
12
|
+
import os
|
13
|
+
import re
|
14
|
+
import json
|
15
|
+
from datetime import datetime
|
12
16
|
|
13
17
|
from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
|
14
|
-
|
15
|
-
class AlertStatus(str, Enum):
|
16
|
-
"""Status of an alert evaluation."""
|
17
|
-
TRIGGERED = "triggered"
|
18
|
-
NOT_TRIGGERED = "not_triggered"
|
18
|
+
from judgeval.utils.alerts import AlertStatus, AlertResult
|
19
19
|
|
20
20
|
class Condition(BaseModel):
|
21
21
|
"""
|
@@ -68,6 +68,36 @@ class Condition(BaseModel):
|
|
68
68
|
# Fallback to default comparison (greater than or equal)
|
69
69
|
return value >= self.threshold if self.threshold is not None else False
|
70
70
|
|
71
|
+
class PagerDutyConfig(BaseModel):
|
72
|
+
"""
|
73
|
+
Configuration for PagerDuty notifications.
|
74
|
+
|
75
|
+
Attributes:
|
76
|
+
routing_key: PagerDuty integration routing key
|
77
|
+
severity: Severity level (critical, error, warning, info)
|
78
|
+
source: Source of the alert (defaults to "judgeval")
|
79
|
+
component: Optional component that triggered the alert
|
80
|
+
group: Optional logical grouping for the alert
|
81
|
+
class_type: Optional class/type of alert event
|
82
|
+
"""
|
83
|
+
routing_key: str
|
84
|
+
severity: str = "error" # critical, error, warning, info
|
85
|
+
source: str = "judgeval"
|
86
|
+
component: Optional[str] = None
|
87
|
+
group: Optional[str] = None
|
88
|
+
class_type: Optional[str] = None
|
89
|
+
|
90
|
+
def model_dump(self, **kwargs):
|
91
|
+
"""Convert the PagerDutyConfig to a dictionary for JSON serialization."""
|
92
|
+
return {
|
93
|
+
"routing_key": self.routing_key,
|
94
|
+
"severity": self.severity,
|
95
|
+
"source": self.source,
|
96
|
+
"component": self.component,
|
97
|
+
"group": self.group,
|
98
|
+
"class_type": self.class_type
|
99
|
+
}
|
100
|
+
|
71
101
|
class NotificationConfig(BaseModel):
|
72
102
|
"""
|
73
103
|
Configuration for notifications when a rule is triggered.
|
@@ -75,8 +105,12 @@ class NotificationConfig(BaseModel):
|
|
75
105
|
Example:
|
76
106
|
{
|
77
107
|
"enabled": true,
|
78
|
-
"communication_methods": ["email", "broadcast_slack", "broadcast_email"],
|
108
|
+
"communication_methods": ["email", "broadcast_slack", "broadcast_email", "pagerduty"],
|
79
109
|
"email_addresses": ["user1@example.com", "user2@example.com"],
|
110
|
+
"pagerduty_config": {
|
111
|
+
"routing_key": "R0ABCD1234567890123456789",
|
112
|
+
"severity": "error"
|
113
|
+
},
|
80
114
|
"send_at": 1632150000 # Unix timestamp (specific date/time)
|
81
115
|
}
|
82
116
|
|
@@ -84,10 +118,12 @@ class NotificationConfig(BaseModel):
|
|
84
118
|
- "email": Send emails to specified email addresses
|
85
119
|
- "broadcast_slack": Send broadcast notifications to all configured Slack channels
|
86
120
|
- "broadcast_email": Send broadcast emails to all organization emails
|
121
|
+
- "pagerduty": Send alerts to PagerDuty using the configured routing key
|
87
122
|
"""
|
88
123
|
enabled: bool = True
|
89
124
|
communication_methods: List[str] = []
|
90
125
|
email_addresses: Optional[List[str]] = None
|
126
|
+
pagerduty_config: Optional[PagerDutyConfig] = None
|
91
127
|
send_at: Optional[int] = None # Unix timestamp for scheduled notifications
|
92
128
|
|
93
129
|
def model_dump(self, **kwargs):
|
@@ -96,6 +132,7 @@ class NotificationConfig(BaseModel):
|
|
96
132
|
"enabled": self.enabled,
|
97
133
|
"communication_methods": self.communication_methods,
|
98
134
|
"email_addresses": self.email_addresses,
|
135
|
+
"pagerduty_config": self.pagerduty_config.model_dump() if self.pagerduty_config else None,
|
99
136
|
"send_at": self.send_at
|
100
137
|
}
|
101
138
|
|
@@ -144,7 +181,8 @@ class Rule(BaseModel):
|
|
144
181
|
# Create standardized metric representation needed by server API
|
145
182
|
metric_data = {
|
146
183
|
"score_type": "",
|
147
|
-
"threshold": 0.0
|
184
|
+
"threshold": 0.0,
|
185
|
+
"name": ""
|
148
186
|
}
|
149
187
|
|
150
188
|
# First try to use object's own serialization methods
|
@@ -182,6 +220,16 @@ class Rule(BaseModel):
|
|
182
220
|
# Use condition threshold if metric doesn't have one
|
183
221
|
metric_data['threshold'] = self.conditions[i].threshold
|
184
222
|
|
223
|
+
# Make sure name is set
|
224
|
+
if not metric_data.get('name'):
|
225
|
+
if hasattr(metric_obj, '__name__'):
|
226
|
+
metric_data['name'] = metric_obj.__name__
|
227
|
+
elif hasattr(metric_obj, 'name'):
|
228
|
+
metric_data['name'] = metric_obj.name
|
229
|
+
else:
|
230
|
+
# Fallback to score_type if available
|
231
|
+
metric_data['name'] = metric_data.get('score_type', str(metric_obj))
|
232
|
+
|
185
233
|
# Update the condition with our properly serialized metric
|
186
234
|
condition["metric"] = metric_data
|
187
235
|
|
@@ -199,47 +247,6 @@ class Rule(BaseModel):
|
|
199
247
|
raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
|
200
248
|
return v
|
201
249
|
|
202
|
-
class AlertResult(BaseModel):
|
203
|
-
"""
|
204
|
-
Result of evaluating a rule.
|
205
|
-
|
206
|
-
Example:
|
207
|
-
{
|
208
|
-
"status": "triggered",
|
209
|
-
"rule_name": "Quality Check",
|
210
|
-
"conditions_result": [
|
211
|
-
{"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
|
212
|
-
{"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
|
213
|
-
],
|
214
|
-
"rule_id": "123e4567-e89b-12d3-a456-426614174000",
|
215
|
-
"metadata": {
|
216
|
-
"example_id": "example_123",
|
217
|
-
"timestamp": "20240321_123456"
|
218
|
-
},
|
219
|
-
"notification": {
|
220
|
-
"enabled": true,
|
221
|
-
"communication_methods": ["slack", "email"],
|
222
|
-
"email_addresses": ["user1@example.com", "user2@example.com"]
|
223
|
-
}
|
224
|
-
}
|
225
|
-
"""
|
226
|
-
status: AlertStatus
|
227
|
-
rule_id: Optional[str] = None # The unique identifier of the rule
|
228
|
-
rule_name: str
|
229
|
-
conditions_result: List[Dict[str, Any]]
|
230
|
-
metadata: Dict[str, Any] = {}
|
231
|
-
notification: Optional[NotificationConfig] = None # Configuration for notifications
|
232
|
-
|
233
|
-
@property
|
234
|
-
def example_id(self) -> Optional[str]:
|
235
|
-
"""Get example_id from metadata for backward compatibility"""
|
236
|
-
return self.metadata.get("example_id")
|
237
|
-
|
238
|
-
@property
|
239
|
-
def timestamp(self) -> Optional[str]:
|
240
|
-
"""Get timestamp from metadata for backward compatibility"""
|
241
|
-
return self.metadata.get("timestamp")
|
242
|
-
|
243
250
|
class RulesEngine:
|
244
251
|
"""
|
245
252
|
Engine for creating and evaluating rules against metrics.
|
@@ -406,7 +413,7 @@ class RulesEngine:
|
|
406
413
|
# If rule has a notification config and the alert is triggered, include it in the result
|
407
414
|
notification_config = rule.notification
|
408
415
|
|
409
|
-
# Set the alert status based on whether the rule was triggered
|
416
|
+
# Set the alert status based on whether the rule was triggered using proper enum values
|
410
417
|
status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
|
411
418
|
|
412
419
|
# Create the alert result
|
@@ -416,7 +423,10 @@ class RulesEngine:
|
|
416
423
|
rule_name=rule.name,
|
417
424
|
conditions_result=condition_results,
|
418
425
|
notification=notification_config,
|
419
|
-
metadata=example_metadata or {}
|
426
|
+
metadata=example_metadata or {},
|
427
|
+
combine_type=rule.combine_type,
|
428
|
+
project_id=example_metadata.get("project_id") if example_metadata else None,
|
429
|
+
trace_span_id=example_metadata.get("trace_span_id") if example_metadata else None
|
420
430
|
)
|
421
431
|
|
422
432
|
results[rule_id] = alert_result
|