judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.7.1.dist-info/RECORD +0 -82
  94. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py DELETED
@@ -1,521 +0,0 @@
1
- """
2
- Rules system for Judgeval that enables alerts based on metric thresholds.
3
- """
4
-
5
- from typing import Dict, List, Optional, Union, Any, Tuple
6
- from pydantic import BaseModel, Field, ConfigDict
7
- import asyncio
8
- from concurrent.futures import ThreadPoolExecutor
9
- import uuid
10
-
11
- from judgeval.scorers import APIScorerConfig, BaseScorer
12
- from judgeval.utils.alerts import AlertStatus, AlertResult
13
-
14
-
15
- class Condition(BaseModel):
16
- """
17
- A single metric condition.
18
-
19
- Example:
20
- {
21
- "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIScorerConfig, BaseScorer
22
- }
23
-
24
- The Condition class uses the scorer's threshold and success function internally.
25
- """
26
-
27
- model_config = ConfigDict(arbitrary_types_allowed=True)
28
-
29
- metric: Union[APIScorerConfig, BaseScorer]
30
-
31
- @property
32
- def metric_name(self) -> str:
33
- """Get the name of the metric for lookups in scores dictionary."""
34
- if hasattr(self.metric, "score_type"):
35
- # Handle APIScorerConfig and BaseScorer which have score_type
36
- return self.metric.score_type
37
- elif hasattr(self.metric, "__name__"):
38
- # Handle cases where metric has a __name__ attribute
39
- return self.metric.__name__
40
- # Fallback to string representation
41
- return str(self.metric)
42
-
43
- @property
44
- def threshold(self) -> float:
45
- """Get the threshold from the metric."""
46
- return self.metric.threshold if hasattr(self.metric, "threshold") else 0.5
47
-
48
- def evaluate(self, value: float) -> bool:
49
- """
50
- Evaluate the condition against a value.
51
- Returns True if the condition passes, False otherwise.
52
- Uses the scorer's success check function if available.
53
- """
54
- # Store the value in the scorer
55
- if hasattr(self.metric, "score"):
56
- self.metric.score = value
57
-
58
- # Use the scorer's success check function if available
59
- if hasattr(self.metric, "success_check"):
60
- return self.metric.success_check()
61
- elif hasattr(self.metric, "success_check"):
62
- return self.metric.success_check()
63
- else:
64
- # Fallback to default comparison (greater than or equal)
65
- return value >= self.threshold if self.threshold is not None else False
66
-
67
-
68
- class PagerDutyConfig(BaseModel):
69
- """
70
- Configuration for PagerDuty notifications.
71
-
72
- Attributes:
73
- routing_key: PagerDuty integration routing key
74
- severity: Severity level (critical, error, warning, info)
75
- source: Source of the alert (defaults to "judgeval")
76
- component: Optional component that triggered the alert
77
- group: Optional logical grouping for the alert
78
- class_type: Optional class/type of alert event
79
- """
80
-
81
- routing_key: str
82
- severity: str = "error" # critical, error, warning, info
83
- source: str = "judgeval"
84
- component: Optional[str] = None
85
- group: Optional[str] = None
86
- class_type: Optional[str] = None
87
-
88
- def model_dump(self, **kwargs):
89
- """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
90
- return {
91
- "routing_key": self.routing_key,
92
- "severity": self.severity,
93
- "source": self.source,
94
- "component": self.component,
95
- "group": self.group,
96
- "class_type": self.class_type,
97
- }
98
-
99
-
100
- class NotificationConfig(BaseModel):
101
- """
102
- Configuration for notifications when a rule is triggered.
103
-
104
- Example:
105
- {
106
- "enabled": true,
107
- "communication_methods": ["email", "broadcast_slack", "broadcast_email", "pagerduty"],
108
- "email_addresses": ["user1@example.com", "user2@example.com"],
109
- "pagerduty_config": {
110
- "routing_key": "R0ABCD1234567890123456789",
111
- "severity": "error"
112
- },
113
- "send_at": 1632150000 # Unix timestamp (specific date/time)
114
- }
115
-
116
- Communication Methods:
117
- - "email": Send emails to specified email addresses
118
- - "broadcast_slack": Send broadcast notifications to all configured Slack channels
119
- - "broadcast_email": Send broadcast emails to all organization emails
120
- - "pagerduty": Send alerts to PagerDuty using the configured routing key
121
- """
122
-
123
- enabled: bool = True
124
- communication_methods: List[str] = []
125
- email_addresses: Optional[List[str]] = None
126
- pagerduty_config: Optional[PagerDutyConfig] = None
127
- send_at: Optional[int] = None # Unix timestamp for scheduled notifications
128
-
129
- def model_dump(self, **kwargs):
130
- """Convert the NotificationConfig to a dictionary for JSON serialization."""
131
- return {
132
- "enabled": self.enabled,
133
- "communication_methods": self.communication_methods,
134
- "email_addresses": self.email_addresses,
135
- "pagerduty_config": self.pagerduty_config.model_dump()
136
- if self.pagerduty_config
137
- else None,
138
- "send_at": self.send_at,
139
- }
140
-
141
-
142
- class Rule(BaseModel):
143
- """
144
- Configuration for a single rule.
145
-
146
- Example:
147
- {
148
- "rule_id": "123e4567-e89b-12d3-a456-426614174000",
149
- "name": "Quality Check",
150
- "description": "Check if quality metrics meet thresholds",
151
- "conditions": [
152
- {"metric": FaithfulnessScorer(threshold=0.7)},
153
- {"metric": AnswerRelevancyScorer(threshold=0.8)}
154
- ],
155
- "combine_type": "all", # "all" or "any"
156
- "notification": {
157
- "enabled": true,
158
- "communication_methods": ["slack", "email"],
159
- "email_addresses": ["user1@example.com", "user2@example.com"]
160
- }
161
- }
162
- """
163
-
164
- rule_id: str = Field(
165
- default_factory=lambda: str(uuid.uuid4())
166
- ) # Random UUID string as default value
167
- name: str
168
- description: Optional[str] = None
169
- conditions: List[Condition]
170
- combine_type: str = Field(..., pattern="^(all|any)$") # all = AND, any = OR
171
- notification: Optional[NotificationConfig] = None # Configuration for notifications
172
-
173
- def model_dump(self, **kwargs):
174
- """
175
- Custom serialization that properly handles condition serialization.
176
- """
177
- data = super().model_dump(**kwargs)
178
-
179
- # Special handling for conditions with complex metric objects
180
- if "conditions" in data:
181
- for i, condition in enumerate(data["conditions"]):
182
- if "metric" in condition:
183
- # Get the actual metric object
184
- metric_obj = self.conditions[i].metric
185
-
186
- # Create standardized metric representation needed by server API
187
- metric_data = {"score_type": "", "threshold": 0.0, "name": ""}
188
-
189
- # First try to use object's own serialization methods
190
- if hasattr(metric_obj, "to_dict"):
191
- orig_data = metric_obj.to_dict()
192
- # Copy any existing fields
193
- for key, value in orig_data.items():
194
- metric_data[key] = value
195
- elif hasattr(metric_obj, "model_dump"):
196
- orig_data = metric_obj.model_dump()
197
- # Copy any existing fields
198
- for key, value in orig_data.items():
199
- metric_data[key] = value
200
-
201
- # If we already have data from original serialization methods but missing required fields
202
- if "name" in metric_data and "score_type" not in metric_data:
203
- metric_data["score_type"] = metric_data["name"]
204
-
205
- # Ensure required fields have values by checking various sources
206
- if not metric_data["score_type"]:
207
- # Try to get score_type from different possible attributes
208
- if hasattr(metric_obj, "score_type"):
209
- metric_data["score_type"] = metric_obj.score_type
210
- elif hasattr(metric_obj, "name"):
211
- metric_data["score_type"] = metric_obj.name
212
- else:
213
- # Last resort: use string representation
214
- metric_data["score_type"] = str(metric_obj)
215
-
216
- # Make sure threshold is set
217
- if (
218
- not metric_data.get("threshold")
219
- and metric_data.get("threshold") != 0.0
220
- ):
221
- if hasattr(metric_obj, "threshold"):
222
- metric_data["threshold"] = metric_obj.threshold
223
- else:
224
- # Use condition threshold if metric doesn't have one
225
- metric_data["threshold"] = self.conditions[i].threshold
226
-
227
- # Make sure name is set
228
- if not metric_data.get("name"):
229
- if hasattr(metric_obj, "__name__"):
230
- metric_data["name"] = metric_obj.__name__
231
- elif hasattr(metric_obj, "name"):
232
- metric_data["name"] = metric_obj.name
233
- else:
234
- # Fallback to score_type if available
235
- metric_data["name"] = metric_data.get(
236
- "score_type", str(metric_obj)
237
- )
238
-
239
- # Update the condition with our properly serialized metric
240
- condition["metric"] = metric_data
241
-
242
- return data
243
-
244
-
245
- class RulesEngine:
246
- """
247
- Engine for creating and evaluating rules against metrics.
248
-
249
- Example:
250
- ```python
251
- # Define rules
252
- rules = {
253
- "1": Rule(
254
- name="Quality Check",
255
- description="Check if quality metrics meet thresholds",
256
- conditions=[
257
- Condition(metric=FaithfulnessScorer(threshold=0.7)),
258
- Condition(metric=AnswerRelevancyScorer(threshold=0.8))
259
- ],
260
- combine_type="all"
261
- )
262
- }
263
-
264
- # Create rules engine
265
- engine = RulesEngine(rules)
266
-
267
- # Configure notifications
268
- engine.configure_notification(
269
- rule_id="1",
270
- enabled=True,
271
- communication_methods=["slack", "email"],
272
- email_addresses=["user@example.com"]
273
- )
274
-
275
- # Evaluate rules
276
- scores = {"faithfulness": 0.65, "relevancy": 0.85}
277
- results = engine.evaluate_rules(scores, {"example_id": "example_123"})
278
- ```
279
- """
280
-
281
- def __init__(self, rules: Dict[str, Rule]):
282
- """
283
- Initialize the rules engine.
284
-
285
- Args:
286
- rules: Dictionary mapping rule IDs to Rule objects
287
- """
288
- self.rules = rules
289
-
290
- def configure_notification(
291
- self,
292
- rule_id: str,
293
- enabled: bool = True,
294
- communication_methods: List[str] | None = None,
295
- email_addresses: List[str] | None = None,
296
- send_at: Optional[int] = None,
297
- ) -> None:
298
- """
299
- Configure notification settings for a specific rule.
300
-
301
- Args:
302
- rule_id: ID of the rule to configure notifications for
303
- enabled: Whether notifications are enabled for this rule
304
- communication_methods: List of notification methods (e.g., ["slack", "email"])
305
- email_addresses: List of email addresses to send notifications to
306
- send_at: Optional Unix timestamp for when to send the notification
307
- """
308
- if rule_id not in self.rules:
309
- raise ValueError(f"Rule ID '{rule_id}' not found")
310
-
311
- rule = self.rules[rule_id]
312
-
313
- # Create notification configuration if it doesn't exist
314
- if rule.notification is None:
315
- rule.notification = NotificationConfig()
316
-
317
- # Set notification parameters
318
- rule.notification.enabled = enabled
319
-
320
- if communication_methods is not None:
321
- rule.notification.communication_methods = communication_methods
322
-
323
- if email_addresses is not None:
324
- rule.notification.email_addresses = email_addresses
325
-
326
- if send_at is not None:
327
- rule.notification.send_at = send_at
328
-
329
- def configure_all_notifications(
330
- self,
331
- enabled: bool = True,
332
- communication_methods: List[str] | None = None,
333
- email_addresses: List[str] | None = None,
334
- send_at: Optional[int] = None,
335
- ) -> None:
336
- """
337
- Configure notification settings for all rules.
338
-
339
- Args:
340
- enabled: Whether notifications are enabled
341
- communication_methods: List of notification methods (e.g., ["slack", "email"])
342
- email_addresses: List of email addresses to send notifications to
343
- send_at: Optional Unix timestamp for when to send the notification
344
- """
345
- for rule_id, rule in self.rules.items():
346
- self.configure_notification(
347
- rule_id=rule_id,
348
- enabled=enabled,
349
- communication_methods=communication_methods,
350
- email_addresses=email_addresses,
351
- send_at=send_at,
352
- )
353
-
354
- def evaluate_rules(
355
- self,
356
- scores: Dict[str, float],
357
- example_metadata: Optional[Dict[str, Any]] = None,
358
- ) -> Dict[str, AlertResult]:
359
- """
360
- Evaluate all rules against a set of scores.
361
- Returns mapping of rule IDs to their alert results.
362
-
363
- Args:
364
- scores: Dictionary of metric names to their score values
365
- example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
366
- """
367
- results = {}
368
-
369
- for rule_id, rule in self.rules.items():
370
- # Evaluate each condition
371
- condition_results = []
372
- passed_conditions = []
373
-
374
- for condition in rule.conditions:
375
- # Get the metric name for lookup
376
- metric_name = condition.metric_name
377
- value = scores.get(metric_name)
378
-
379
- if value is None:
380
- # Skip this condition instead of evaluating it as false
381
- condition_results.append(
382
- {
383
- "metric": metric_name,
384
- "value": None,
385
- "threshold": condition.threshold,
386
- "passed": None, # Using None to indicate the condition was skipped
387
- "skipped": True, # Add a flag to indicate this condition was skipped
388
- }
389
- )
390
- continue # Skip adding to passed_conditions
391
- else:
392
- passed = condition.evaluate(value)
393
- condition_results.append(
394
- {
395
- "metric": metric_name,
396
- "value": value,
397
- "threshold": condition.threshold,
398
- "passed": passed,
399
- "skipped": False, # Indicate this condition was evaluated
400
- }
401
- )
402
- passed_conditions.append(passed)
403
-
404
- # Determine if alert should trigger - only consider conditions that weren't skipped
405
- if not passed_conditions:
406
- # If all conditions were skipped, the rule doesn't trigger
407
- triggered = False
408
- else:
409
- if rule.combine_type == "all":
410
- # For "all" combine_type:
411
- # - All evaluated conditions must pass
412
- # - All conditions must have been evaluated (none skipped)
413
- all_conditions_passed = all(passed_conditions)
414
- all_conditions_evaluated = len(passed_conditions) == len(
415
- rule.conditions
416
- )
417
- triggered = all_conditions_passed and all_conditions_evaluated
418
- else:
419
- # For "any" combine_type, at least one condition must pass
420
- triggered = any(passed_conditions)
421
-
422
- # Create alert result with example metadata
423
- notification_config = None
424
- if triggered and rule.notification:
425
- # If rule has a notification config and the alert is triggered, include it in the result
426
- notification_config = rule.notification
427
-
428
- # Set the alert status based on whether the rule was triggered using proper enum values
429
- status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
430
-
431
- # Create the alert result
432
- alert_result = AlertResult(
433
- status=status,
434
- rule_id=rule.rule_id,
435
- rule_name=rule.name,
436
- conditions_result=condition_results,
437
- notification=notification_config,
438
- metadata=example_metadata or {},
439
- combine_type=rule.combine_type,
440
- project_id=example_metadata.get("project_id")
441
- if example_metadata
442
- else None,
443
- trace_span_id=example_metadata.get("trace_span_id")
444
- if example_metadata
445
- else None,
446
- )
447
-
448
- results[rule_id] = alert_result
449
-
450
- return results
451
-
452
- async def evaluate_rules_parallel(
453
- self,
454
- example_scores: Dict[str, Dict[str, float]],
455
- example_metadata: Dict[str, Dict[str, Any]],
456
- max_concurrent: int = 100,
457
- ) -> Dict[str, Dict[str, AlertResult]]:
458
- """
459
- Evaluate all rules against multiple examples in parallel.
460
-
461
- Args:
462
- example_scores: Dictionary mapping example_ids to their score dictionaries
463
- example_metadata: Dictionary mapping example_ids to their metadata
464
- max_concurrent: Maximum number of concurrent evaluations
465
-
466
- Returns:
467
- Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
468
- """
469
- # Create semaphore to limit concurrent executions
470
- semaphore = asyncio.Semaphore(max_concurrent)
471
- results = {}
472
- tasks = []
473
-
474
- # Create a task for each example
475
- for example_id, scores in example_scores.items():
476
- metadata = example_metadata.get(example_id, {})
477
- task = self._evaluate_with_semaphore(
478
- semaphore=semaphore,
479
- example_id=example_id,
480
- scores=scores,
481
- metadata=metadata,
482
- )
483
- tasks.append(task)
484
-
485
- # Run all tasks and collect results
486
- example_results = await asyncio.gather(*tasks)
487
-
488
- # Organize results by example_id
489
- for example_id, result in example_results:
490
- results[example_id] = result
491
-
492
- return results
493
-
494
- async def _evaluate_with_semaphore(
495
- self,
496
- semaphore: asyncio.Semaphore,
497
- example_id: str,
498
- scores: Dict[str, float],
499
- metadata: Dict[str, Any],
500
- ) -> Tuple[str, Dict[str, AlertResult]]:
501
- """
502
- Helper method to evaluate rules for an example with semaphore control.
503
-
504
- Args:
505
- semaphore: Semaphore to control concurrency
506
- example_id: ID of the example being evaluated
507
- scores: Dictionary of scores for this example
508
- metadata: Metadata for this example
509
-
510
- Returns:
511
- Tuple of (example_id, rule_results)
512
- """
513
- async with semaphore:
514
- # Run the evaluation in a thread pool to avoid blocking the event loop
515
- # for CPU-bound operations
516
- with ThreadPoolExecutor() as executor:
517
- rule_results = await asyncio.get_event_loop().run_in_executor(
518
- executor, self.evaluate_rules, scores, metadata
519
- )
520
-
521
- return (example_id, rule_results)
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py DELETED
@@ -1,52 +0,0 @@
1
- """
2
- `judgeval` tool correctness scorer
3
-
4
- TODO add link to docs page for this scorer
5
-
6
- """
7
-
8
- # Internal imports
9
- from judgeval.scorers.api_scorer import APIScorerConfig
10
- from judgeval.constants import APIScorerType
11
- from typing import Optional, Dict
12
- from judgeval.data import ExampleParams
13
-
14
-
15
- class ExecutionOrderScorer(APIScorerConfig):
16
- kwargs: Optional[Dict] = None
17
-
18
- def __init__(
19
- self,
20
- threshold: float,
21
- should_exact_match: bool = False,
22
- should_consider_ordering: bool = False,
23
- ):
24
- super().__init__(
25
- threshold=threshold,
26
- score_type=APIScorerType.EXECUTION_ORDER,
27
- required_params=[
28
- ExampleParams.ACTUAL_OUTPUT,
29
- ExampleParams.EXPECTED_OUTPUT,
30
- ],
31
- )
32
- self.kwargs = {
33
- "should_exact_match": should_exact_match,
34
- "should_consider_ordering": should_consider_ordering,
35
- }
36
-
37
- @property
38
- def __name__(self):
39
- return "Execution Order"
40
-
41
- def to_dict(self) -> dict:
42
- """
43
- Converts the scorer configuration to a dictionary format.
44
-
45
- Returns:
46
- dict: A dictionary containing the scorer's configuration
47
- """
48
- return {
49
- "score_type": self.score_type,
50
- "threshold": self.threshold,
51
- "kwargs": self.kwargs,
52
- }
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py DELETED
@@ -1,28 +0,0 @@
1
- """
2
- `judgeval` hallucination scorer
3
-
4
- TODO add link to docs page for this scorer
5
-
6
- """
7
-
8
- # Internal imports
9
- from judgeval.scorers.api_scorer import APIScorerConfig
10
- from judgeval.constants import APIScorerType
11
- from judgeval.data import ExampleParams
12
-
13
-
14
- class HallucinationScorer(APIScorerConfig):
15
- def __init__(self, threshold: float):
16
- super().__init__(
17
- threshold=threshold,
18
- score_type=APIScorerType.HALLUCINATION,
19
- required_params=[
20
- ExampleParams.INPUT,
21
- ExampleParams.ACTUAL_OUTPUT,
22
- ExampleParams.CONTEXT,
23
- ],
24
- )
25
-
26
- @property
27
- def __name__(self):
28
- return "Hallucination"
judgeval/utils/alerts.py DELETED
@@ -1,93 +0,0 @@
1
- """
2
- Handling alerts in Judgeval.
3
- """
4
-
5
- from enum import Enum
6
- from typing import Dict, Any, List, Optional
7
- from pydantic import BaseModel
8
-
9
-
10
- class AlertStatus(str, Enum):
11
- """Status of an alert evaluation."""
12
-
13
- TRIGGERED = "triggered"
14
- NOT_TRIGGERED = "not_triggered"
15
-
16
-
17
- class AlertResult(BaseModel):
18
- """
19
- Result of a rule evaluation.
20
-
21
- Attributes:
22
- rule_name: Name of the rule that was evaluated
23
- rule_id: Unique identifier of the rule
24
- status: Status of the alert (triggered or not)
25
- conditions_result: List of condition evaluation results
26
- metadata: Dictionary containing example_id, timestamp, and other metadata
27
- notification: Optional notification configuration for triggered alerts
28
- combine_type: The combination type used ("all" or "any")
29
- project_id: Optional project identifier
30
- trace_span_id: Optional trace span identifier
31
- """
32
-
33
- rule_name: str
34
- rule_id: Optional[str] = None # The unique identifier of the rule
35
- status: AlertStatus
36
- conditions_result: List[Dict[str, Any]] = []
37
- metadata: Dict[str, Any] = {}
38
- notification: Optional[Any] = (
39
- None # NotificationConfig when triggered, None otherwise
40
- )
41
- combine_type: Optional[str] = None # "all" or "any"
42
- project_id: Optional[str] = None # Project identifier
43
- trace_span_id: Optional[str] = None # Trace span identifier
44
-
45
- @property
46
- def example_id(self) -> Optional[str]:
47
- """Get example_id from metadata for backward compatibility"""
48
- return self.metadata.get("example_id")
49
-
50
- @property
51
- def timestamp(self) -> Optional[str]:
52
- """Get timestamp from metadata for backward compatibility"""
53
- return self.metadata.get("timestamp")
54
-
55
- @property
56
- def conditions_results(self) -> List[Dict[str, Any]]:
57
- """Backwards compatibility property for the conditions_result field"""
58
- return self.conditions_result
59
-
60
- def model_dump(self, **kwargs):
61
- """
62
- Convert the AlertResult to a dictionary for JSON serialization.
63
-
64
- Args:
65
- **kwargs: Additional arguments to pass to Pydantic's model_dump
66
-
67
- Returns:
68
- dict: Dictionary representation of the AlertResult
69
- """
70
- data = (
71
- super().model_dump(**kwargs)
72
- if hasattr(super(), "model_dump")
73
- else super().dict(**kwargs)
74
- )
75
-
76
- # Handle the NotificationConfig object if it exists
77
- if hasattr(self, "notification") and self.notification is not None:
78
- if hasattr(self.notification, "model_dump"):
79
- data["notification"] = self.notification.model_dump()
80
- elif hasattr(self.notification, "dict"):
81
- data["notification"] = self.notification.dict()
82
- else:
83
- # Manually convert the notification to a dictionary
84
- notif = self.notification
85
- data["notification"] = {
86
- "enabled": notif.enabled,
87
- "communication_methods": notif.communication_methods,
88
- "email_addresses": notif.email_addresses,
89
- "slack_channels": getattr(notif, "slack_channels", []),
90
- "send_at": notif.send_at,
91
- }
92
-
93
- return data