judgeval 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/METADATA +12 -14
  66. judgeval-0.9.1.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.8.0.dist-info/RECORD +0 -82
  94. {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/WHEEL +0 -0
  95. {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/api_types.py ADDED
@@ -0,0 +1,344 @@
+ # generated by datamodel-codegen:
+ #   filename:  .openapi.json
+ #   timestamp: 2025-08-29T04:49:39+00:00
+
+ from __future__ import annotations
+ from typing import Any, Dict, List, Optional, TypedDict, Union
+ from typing_extensions import NotRequired
+
+
+ TraceAndSpanId = List
+
+
+ class EvalResultsFetch(TypedDict):
+     experiment_run_id: str
+     project_name: str
+
+
+ class DatasetFetch(TypedDict):
+     dataset_alias: str
+     project_name: str
+
+
+ class TraceSave(TypedDict):
+     project_name: str
+     trace_id: str
+     name: str
+     created_at: str
+     duration: float
+     offline_mode: NotRequired[bool]
+     has_notification: NotRequired[bool]
+     customer_id: NotRequired[Optional[str]]
+     tags: NotRequired[List[str]]
+     metadata: NotRequired[Dict[str, Any]]
+     update_id: NotRequired[int]
+
+
+ class TraceFetch(TypedDict):
+     trace_id: str
+
+
+ class TraceAddToDataset(TypedDict):
+     trace_id: str
+     trace_span_id: str
+     dataset_alias: str
+     project_name: str
+
+
+ class EvaluationRunsBatchRequest(TypedDict):
+     organization_id: str
+     evaluation_entries: List[Dict[str, Any]]
+
+
+ class ProjectAdd(TypedDict):
+     project_name: str
+
+
+ class ProjectAddResponse(TypedDict):
+     project_id: str
+
+
+ class ProjectDeleteFromJudgevalResponse(TypedDict):
+     project_name: str
+
+
+ class ProjectDeleteResponse(TypedDict):
+     message: str
+
+
+ class ScorerExistsRequest(TypedDict):
+     name: str
+
+
+ class ScorerExistsResponse(TypedDict):
+     exists: bool
+
+
+ class SavePromptScorerRequest(TypedDict):
+     name: str
+     prompt: str
+     threshold: float
+     options: NotRequired[Optional[Dict[str, float]]]
+     is_trace: NotRequired[Optional[bool]]
+
+
+ class SavePromptScorerResponse(TypedDict):
+     message: str
+     name: str
+
+
+ class FetchPromptScorerRequest(TypedDict):
+     name: str
+
+
+ class CustomScorerUploadPayload(TypedDict):
+     scorer_name: str
+     scorer_code: str
+     requirements_text: str
+
+
+ class CustomScorerTemplateResponse(TypedDict):
+     scorer_name: str
+     status: str
+     message: str
+
+
+ class ResolveProjectNameRequest(TypedDict):
+     project_name: str
+
+
+ class ResolveProjectNameResponse(TypedDict):
+     project_id: str
+
+
+ class TraceIdRequest(TypedDict):
+     trace_id: str
+
+
+ class SpanScoreRequest(TypedDict):
+     span_id: str
+     trace_id: str
+
+
+ class BaseScorer(TypedDict):
+     score_type: str
+     threshold: NotRequired[float]
+     name: NotRequired[Optional[str]]
+     class_name: NotRequired[Optional[str]]
+     score: NotRequired[Optional[float]]
+     score_breakdown: NotRequired[Optional[Dict[str, Any]]]
+     reason: NotRequired[Optional[str]]
+     using_native_model: NotRequired[Optional[bool]]
+     success: NotRequired[Optional[bool]]
+     model: NotRequired[Optional[str]]
+     model_client: NotRequired[Any]
+     strict_mode: NotRequired[bool]
+     error: NotRequired[Optional[str]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+     user: NotRequired[Optional[str]]
+     server_hosted: NotRequired[bool]
+
+
+ class ScorerConfig(TypedDict):
+     score_type: str
+     name: NotRequired[Optional[str]]
+     threshold: NotRequired[float]
+     strict_mode: NotRequired[bool]
+     required_params: NotRequired[List[str]]
+     kwargs: NotRequired[Optional[Dict[str, Any]]]
+
+
+ class Example(TypedDict):
+     example_id: str
+     created_at: str
+     name: NotRequired[Optional[str]]
+
+
+ class ValidationError(TypedDict):
+     loc: List[Union[str, int]]
+     msg: str
+     type: str
+
+
+ class SpanBatchItem(TypedDict):
+     span_id: str
+     trace_id: str
+     function: str
+     created_at: NotRequired[Any]
+     parent_span_id: NotRequired[Optional[str]]
+     span_type: NotRequired[Optional[str]]
+     inputs: NotRequired[Optional[Dict[str, Any]]]
+     output: NotRequired[Any]
+     error: NotRequired[Optional[Dict[str, Any]]]
+     usage: NotRequired[Optional[Dict[str, Any]]]
+     duration: NotRequired[Optional[float]]
+     expected_tools: NotRequired[Optional[List[Dict[str, Any]]]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+     has_evaluation: NotRequired[Optional[bool]]
+     agent_name: NotRequired[Optional[str]]
+     class_name: NotRequired[Optional[str]]
+     state_before: NotRequired[Optional[Dict[str, Any]]]
+     state_after: NotRequired[Optional[Dict[str, Any]]]
+     span_state: str
+     update_id: NotRequired[int]
+     queued_at: float
+
+
+ class PromptScorer(TypedDict):
+     name: str
+     prompt: str
+     threshold: float
+     options: NotRequired[Optional[Dict[str, float]]]
+     created_at: NotRequired[Optional[str]]
+     updated_at: NotRequired[Optional[str]]
+     is_trace: NotRequired[Optional[bool]]
+
+
+ class ScorerData(TypedDict):
+     name: str
+     threshold: float
+     success: bool
+     score: NotRequired[Optional[float]]
+     reason: NotRequired[Optional[str]]
+     strict_mode: NotRequired[Optional[bool]]
+     evaluation_model: NotRequired[Union[List[str], str]]
+     error: NotRequired[Optional[str]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+
+
+ class TraceUsage(TypedDict):
+     prompt_tokens: NotRequired[Optional[int]]
+     completion_tokens: NotRequired[Optional[int]]
+     cache_creation_input_tokens: NotRequired[Optional[int]]
+     cache_read_input_tokens: NotRequired[Optional[int]]
+     total_tokens: NotRequired[Optional[int]]
+     prompt_tokens_cost_usd: NotRequired[Optional[float]]
+     completion_tokens_cost_usd: NotRequired[Optional[float]]
+     total_cost_usd: NotRequired[Optional[float]]
+     model_name: NotRequired[Optional[str]]
+
+
+ class Tool(TypedDict):
+     tool_name: str
+     parameters: NotRequired[Optional[Dict[str, Any]]]
+     agent_name: NotRequired[Optional[str]]
+     result_dependencies: NotRequired[Optional[List[Dict[str, Any]]]]
+     action_dependencies: NotRequired[Optional[List[Dict[str, Any]]]]
+     require_all: NotRequired[Optional[bool]]
+
+
+ class ExampleEvaluationRun(TypedDict):
+     id: NotRequired[Optional[str]]
+     project_name: NotRequired[Optional[str]]
+     eval_name: NotRequired[Optional[str]]
+     custom_scorers: NotRequired[List[BaseScorer]]
+     judgment_scorers: NotRequired[List[ScorerConfig]]
+     model: str
+     created_at: NotRequired[Optional[str]]
+     examples: List[Example]
+     trace_span_id: NotRequired[Optional[str]]
+     trace_id: NotRequired[Optional[str]]
+
+
+ class HTTPValidationError(TypedDict):
+     detail: NotRequired[List[ValidationError]]
+
+
+ class TraceEvaluationRun(TypedDict):
+     id: NotRequired[Optional[str]]
+     project_name: NotRequired[Optional[str]]
+     eval_name: NotRequired[Optional[str]]
+     custom_scorers: NotRequired[List[BaseScorer]]
+     judgment_scorers: NotRequired[List[ScorerConfig]]
+     model: str
+     created_at: NotRequired[Optional[str]]
+     trace_and_span_ids: List[TraceAndSpanId]
+     is_offline: NotRequired[bool]
+
+
+ class DatasetInsertExamples(TypedDict):
+     dataset_alias: str
+     examples: List[Example]
+     project_name: str
+
+
+ class SpansBatchRequest(TypedDict):
+     spans: List[SpanBatchItem]
+     organization_id: str
+
+
+ class FetchPromptScorerResponse(TypedDict):
+     scorer: PromptScorer
+
+
+ class TraceSpan(TypedDict):
+     span_id: str
+     trace_id: str
+     function: str
+     created_at: NotRequired[Any]
+     parent_span_id: NotRequired[Optional[str]]
+     span_type: NotRequired[Optional[str]]
+     inputs: NotRequired[Optional[Dict[str, Any]]]
+     error: NotRequired[Optional[Dict[str, Any]]]
+     output: NotRequired[Any]
+     usage: NotRequired[Optional[TraceUsage]]
+     duration: NotRequired[Optional[float]]
+     expected_tools: NotRequired[Optional[List[Tool]]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+     has_evaluation: NotRequired[Optional[bool]]
+     agent_name: NotRequired[Optional[str]]
+     class_name: NotRequired[Optional[str]]
+     state_before: NotRequired[Optional[Dict[str, Any]]]
+     state_after: NotRequired[Optional[Dict[str, Any]]]
+     update_id: NotRequired[int]
+
+
+ class Trace(TypedDict):
+     trace_id: str
+     name: str
+     created_at: str
+     duration: float
+     trace_spans: List[TraceSpan]
+     offline_mode: NotRequired[bool]
+     rules: NotRequired[Dict[str, Any]]
+     has_notification: NotRequired[bool]
+     customer_id: NotRequired[Optional[str]]
+     tags: NotRequired[List[str]]
+     metadata: NotRequired[Dict[str, Any]]
+     update_id: NotRequired[int]
+
+
+ class ScoringResult(TypedDict):
+     success: bool
+     scorers_data: Optional[List[ScorerData]]
+     name: NotRequired[Optional[str]]
+     data_object: NotRequired[Optional[Union[TraceSpan, Example]]]
+     trace_id: NotRequired[Optional[str]]
+     run_duration: NotRequired[Optional[float]]
+     evaluation_cost: NotRequired[Optional[float]]
+
+
+ class TraceRun(TypedDict):
+     project_name: NotRequired[Optional[str]]
+     eval_name: NotRequired[Optional[str]]
+     traces: List[Trace]
+     scorers: List[ScorerConfig]
+     model: str
+     trace_span_id: NotRequired[Optional[str]]
+     tools: NotRequired[Optional[List[Dict[str, Any]]]]
+
+
+ class EvalResults(TypedDict):
+     results: List[ScoringResult]
+     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+ class DatasetPush(TypedDict):
+     dataset_alias: str
+     comments: NotRequired[Optional[str]]
+     source_file: NotRequired[Optional[str]]
+     examples: NotRequired[Optional[List[Example]]]
+     traces: NotRequired[Optional[List[Trace]]]
+     is_trace: NotRequired[bool]
+     project_name: str
+     overwrite: NotRequired[Optional[bool]]
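
Everything in api_types.py is a generated TypedDict mirror of the backend OpenAPI schema, so request and response payloads are plain dictionaries with static key checking only. A minimal usage sketch, assuming the module is importable as judgeval.api.api_types; the values below are hypothetical and only the keys come from the definitions above:

from judgeval.api.api_types import ScorerData, TraceUsage

# NotRequired keys can simply be omitted; TypedDicts perform no runtime validation.
usage: TraceUsage = {
    "prompt_tokens": 120,
    "completion_tokens": 40,
    "total_tokens": 160,
    "model_name": "gpt-4.1",
}

scorer_data: ScorerData = {
    "name": "answer_quality",  # required keys: name, threshold, success
    "threshold": 0.7,
    "success": True,
    "score": 0.92,
}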
judgeval/cli.py CHANGED
@@ -3,8 +3,8 @@
  import typer
  from pathlib import Path
  from dotenv import load_dotenv
- from judgeval.common.logger import judgeval_logger
- from judgeval.judgment_client import JudgmentClient
+ from judgeval.logger import judgeval_logger
+ from judgeval import JudgmentClient
 
  load_dotenv()
 
@@ -61,5 +61,3 @@ def version():
 
  if __name__ == "__main__":
      app()
-
- # judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
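
The substantive change in cli.py is the import path: the logger moves from judgeval.common.logger to judgeval.logger, and JudgmentClient is now exposed from the package root. A short sketch of the 0.9.1-style imports; the client's constructor arguments are not shown in this diff and are left as an assumption:

from judgeval import JudgmentClient
from judgeval.logger import judgeval_logger

client = JudgmentClient()  # constructor arguments omitted; not covered by this diff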
judgeval/constants.py CHANGED
@@ -1,10 +1,8 @@
- """
- Constant variables used throughout source code
- """
+ from __future__ import annotations
 
  from enum import Enum
+ from typing import Set
  import litellm
- import os
 
 
  class APIScorerType(str, Enum):
@@ -23,30 +21,28 @@ class APIScorerType(str, Enum):
      EXECUTION_ORDER = "Execution Order"
      DERAILMENT = "Derailment"
      TOOL_ORDER = "Tool Order"
+     MOCK_TRACE_SCORER = "Mock Trace Scorer"
      CLASSIFIER = "Classifier"
      TOOL_DEPENDENCY = "Tool Dependency"
      CUSTOM = "Custom"
 
      @classmethod
-     def _missing_(cls, value):
+     def __missing__(cls, value: str) -> APIScorerType:
          # Handle case-insensitive lookup
          for member in cls:
              if member.value == value.lower():
                  return member
 
+         raise ValueError(f"Invalid scorer type: {value}")
 
- UNBOUNDED_SCORERS: set[APIScorerType] = (
+
+ UNBOUNDED_SCORERS: Set[APIScorerType] = (
      set()
  )  # scorers whose scores are not bounded between 0-1
 
- # RabbitMQ
- RABBITMQ_HOST = os.getenv(
-     "RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com"
- )
- RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
- RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
- # Models
- LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+ LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)
+
 
  TOGETHER_SUPPORTED_MODELS = [
      "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
@@ -104,20 +100,8 @@ TOGETHER_SUPPORTED_MODELS = [
      "mistralai/Mistral-7B-Instruct-v0.1",
  ]
 
- DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
- DEFAULT_GPT_MODEL = "gpt-4.1"
-
  JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
  ACCEPTABLE_MODELS = (
      set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
  )
-
- ## System settings
- MAX_WORKER_THREADS = 10
-
- # Maximum number of concurrent operations for evaluation runs
- MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
-
- # Span lifecycle management
- SPAN_LIFECYCLE_END_UPDATE_ID = 20  # Default ending number for completed spans
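
With the RabbitMQ settings and worker-pool limits removed, constants.py now carries only the scorer enum and the model lists, where ACCEPTABLE_MODELS is the union of the litellm, Together, and Judgment model sets. A small sketch of how the remaining constants compose; the helper function is hypothetical:

from judgeval.constants import ACCEPTABLE_MODELS, JUDGMENT_SUPPORTED_MODELS

def is_supported_judge_model(model: str) -> bool:
    # ACCEPTABLE_MODELS = litellm models | TOGETHER_SUPPORTED_MODELS | JUDGMENT_SUPPORTED_MODELS
    return model in ACCEPTABLE_MODELS

assert "osiris-mini" in JUDGMENT_SUPPORTED_MODELS
assert is_supported_judge_model("osiris-mini")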
judgeval/data/evaluation_run.py CHANGED
@@ -1,4 +1,5 @@
- from typing import List, Optional, Union
+ from typing import List, Optional, Union, Tuple
+ from litellm.files.main import BaseModel
  from pydantic import field_validator, model_validator, Field
  from datetime import datetime, timezone
  import uuid
@@ -6,29 +7,22 @@ import uuid
  from judgeval.data import Example
  from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.constants import ACCEPTABLE_MODELS
- from judgeval.data.judgment_types import EvaluationRunJudgmentType
+ from judgeval.data.judgment_types import (
+     ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
+     TraceEvaluationRun as TraceEvaluationRunJudgmentType,
+ )
 
 
- class EvaluationRun(EvaluationRunJudgmentType):
-     """
-     Stores example and evaluation scorers together for running an eval task
-
-     Args:
-         project_name (str): The name of the project the evaluation results belong to
-         eval_name (str): A name for this evaluation run
-         examples (List[Example]): The examples to evaluate
-         scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
-         model (str): The model used as a judge when using LLM as a Judge
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-     """
-
+ class EvaluationRun(BaseModel):
      id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
      created_at: Optional[str] = Field(
          default_factory=lambda: datetime.now(timezone.utc).isoformat()
      )
+     organization_id: Optional[str] = None
      custom_scorers: Optional[List[BaseScorer]] = None
      judgment_scorers: Optional[List[APIScorerConfig]] = None
-     organization_id: Optional[str] = None
+     scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None
+     model: str
 
      def __init__(
          self,
@@ -57,19 +51,9 @@ class EvaluationRun(EvaluationRunJudgmentType):
          data = super().model_dump(**kwargs)
          data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
          data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
-         data["examples"] = [example.model_dump() for example in self.examples]
 
          return data
 
-     @field_validator("examples")
-     def validate_examples(cls, v):
-         if not v:
-             raise ValueError("Examples cannot be empty.")
-         for item in v:
-             if not isinstance(item, Example):
-                 raise ValueError(f"Item of type {type(item)} is not a Example")
-         return v
-
      @model_validator(mode="after")
      @classmethod
      def validate_scorer_lists(cls, values):
@@ -102,3 +86,42 @@ class EvaluationRun(EvaluationRunJudgmentType):
                  f"Model name {v} not recognized. Please select a valid model name.)"
              )
          return v
+
+
+ class ExampleEvaluationRun(EvaluationRun, ExampleEvaluationRunJudgmentType):  # type: ignore
+     """
+     Stores example and evaluation scorers together for running an eval task
+
+     Args:
+         project_name (str): The name of the project the evaluation results belong to
+         eval_name (str): A name for this evaluation run
+         examples (List[Example]): The examples to evaluate
+         scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+         model (str): The model used as a judge when using LLM as a Judge
+     """
+
+     examples: List[Example]  # type: ignore
+
+     @field_validator("examples")
+     def validate_examples(cls, v):
+         if not v:
+             raise ValueError("Examples cannot be empty.")
+         for item in v:
+             if not isinstance(item, Example):
+                 raise ValueError(f"Item of type {type(item)} is not a Example")
+         return v
+
+     def model_dump(self, **kwargs):
+         data = super().model_dump(**kwargs)
+         data["examples"] = [example.model_dump() for example in self.examples]
+         return data
+
+
+ class TraceEvaluationRun(EvaluationRun, TraceEvaluationRunJudgmentType):  # type: ignore
+     trace_and_span_ids: List[Tuple[str, str]]  # type: ignore
+
+     @field_validator("trace_and_span_ids")
+     def validate_trace_and_span_ids(cls, v):
+         if not v:
+             raise ValueError("Trace and span IDs are required for trace evaluations.")
+         return v
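
EvaluationRun is reduced to a shared base model, and the example-specific and trace-specific behavior moves into ExampleEvaluationRun and TraceEvaluationRun. A hedged sketch of the validators shown above; the import path follows this file, but the exact constructor signature and pydantic error wrapping are assumptions:

from judgeval.data.evaluation_run import ExampleEvaluationRun, TraceEvaluationRun

try:
    ExampleEvaluationRun(model="gpt-4.1", examples=[])
except ValueError as err:
    print(err)  # validate_examples rejects an empty list: "Examples cannot be empty."

try:
    TraceEvaluationRun(model="gpt-4.1", trace_and_span_ids=[])
except ValueError as err:
    print(err)  # validate_trace_and_span_ids requires at least one (trace_id, span_id) pair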
judgeval/data/example.py CHANGED
@@ -5,7 +5,7 @@ Classes for representing examples in a dataset.
  from enum import Enum
  from datetime import datetime
  from typing import Dict, Any, Optional
- from judgeval.data.judgment_types import ExampleJudgmentType
+ from judgeval.data.judgment_types import Example as JudgmentExample
 
 
  class ExampleParams(str, Enum):
@@ -19,7 +19,7 @@ class ExampleParams(str, Enum):
      ADDITIONAL_METADATA = "additional_metadata"
 
 
- class Example(ExampleJudgmentType):
+ class Example(JudgmentExample):
      example_id: str = ""
      created_at: str = datetime.now().isoformat()
      name: Optional[str] = None
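
Example now subclasses the generated judgment_types.Example directly and supplies defaults for the fields the generated type requires. A minimal sketch, assuming Example stays importable from judgeval.data and accepts keyword fields as before:

from judgeval.data import Example

ex = Example(name="greeting-check")  # example_id defaults to "" and created_at to the current timestamp
print(ex.example_id, ex.created_at, ex.name)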