judgeval 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/METADATA +12 -14
- judgeval-0.9.1.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.8.0.dist-info/RECORD +0 -82
- {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/WHEEL +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/entry_points.txt +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/api_types.py
ADDED
@@ -0,0 +1,344 @@
+# generated by datamodel-codegen:
+#   filename: .openapi.json
+#   timestamp: 2025-08-29T04:49:39+00:00
+
+from __future__ import annotations
+from typing import Any, Dict, List, Optional, TypedDict, Union
+from typing_extensions import NotRequired
+
+
+TraceAndSpanId = List
+
+
+class EvalResultsFetch(TypedDict):
+    experiment_run_id: str
+    project_name: str
+
+
+class DatasetFetch(TypedDict):
+    dataset_alias: str
+    project_name: str
+
+
+class TraceSave(TypedDict):
+    project_name: str
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    offline_mode: NotRequired[bool]
+    has_notification: NotRequired[bool]
+    customer_id: NotRequired[Optional[str]]
+    tags: NotRequired[List[str]]
+    metadata: NotRequired[Dict[str, Any]]
+    update_id: NotRequired[int]
+
+
+class TraceFetch(TypedDict):
+    trace_id: str
+
+
+class TraceAddToDataset(TypedDict):
+    trace_id: str
+    trace_span_id: str
+    dataset_alias: str
+    project_name: str
+
+
+class EvaluationRunsBatchRequest(TypedDict):
+    organization_id: str
+    evaluation_entries: List[Dict[str, Any]]
+
+
+class ProjectAdd(TypedDict):
+    project_name: str
+
+
+class ProjectAddResponse(TypedDict):
+    project_id: str
+
+
+class ProjectDeleteFromJudgevalResponse(TypedDict):
+    project_name: str
+
+
+class ProjectDeleteResponse(TypedDict):
+    message: str
+
+
+class ScorerExistsRequest(TypedDict):
+    name: str
+
+
+class ScorerExistsResponse(TypedDict):
+    exists: bool
+
+
+class SavePromptScorerRequest(TypedDict):
+    name: str
+    prompt: str
+    threshold: float
+    options: NotRequired[Optional[Dict[str, float]]]
+    is_trace: NotRequired[Optional[bool]]
+
+
+class SavePromptScorerResponse(TypedDict):
+    message: str
+    name: str
+
+
+class FetchPromptScorerRequest(TypedDict):
+    name: str
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
+
+
+class ResolveProjectNameRequest(TypedDict):
+    project_name: str
+
+
+class ResolveProjectNameResponse(TypedDict):
+    project_id: str
+
+
+class TraceIdRequest(TypedDict):
+    trace_id: str
+
+
+class SpanScoreRequest(TypedDict):
+    span_id: str
+    trace_id: str
+
+
+class BaseScorer(TypedDict):
+    score_type: str
+    threshold: NotRequired[float]
+    name: NotRequired[Optional[str]]
+    class_name: NotRequired[Optional[str]]
+    score: NotRequired[Optional[float]]
+    score_breakdown: NotRequired[Optional[Dict[str, Any]]]
+    reason: NotRequired[Optional[str]]
+    using_native_model: NotRequired[Optional[bool]]
+    success: NotRequired[Optional[bool]]
+    model: NotRequired[Optional[str]]
+    model_client: NotRequired[Any]
+    strict_mode: NotRequired[bool]
+    error: NotRequired[Optional[str]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+    user: NotRequired[Optional[str]]
+    server_hosted: NotRequired[bool]
+
+
+class ScorerConfig(TypedDict):
+    score_type: str
+    name: NotRequired[Optional[str]]
+    threshold: NotRequired[float]
+    strict_mode: NotRequired[bool]
+    required_params: NotRequired[List[str]]
+    kwargs: NotRequired[Optional[Dict[str, Any]]]
+
+
+class Example(TypedDict):
+    example_id: str
+    created_at: str
+    name: NotRequired[Optional[str]]
+
+
+class ValidationError(TypedDict):
+    loc: List[Union[str, int]]
+    msg: str
+    type: str
+
+
+class SpanBatchItem(TypedDict):
+    span_id: str
+    trace_id: str
+    function: str
+    created_at: NotRequired[Any]
+    parent_span_id: NotRequired[Optional[str]]
+    span_type: NotRequired[Optional[str]]
+    inputs: NotRequired[Optional[Dict[str, Any]]]
+    output: NotRequired[Any]
+    error: NotRequired[Optional[Dict[str, Any]]]
+    usage: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[float]]
+    expected_tools: NotRequired[Optional[List[Dict[str, Any]]]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+    has_evaluation: NotRequired[Optional[bool]]
+    agent_name: NotRequired[Optional[str]]
+    class_name: NotRequired[Optional[str]]
+    state_before: NotRequired[Optional[Dict[str, Any]]]
+    state_after: NotRequired[Optional[Dict[str, Any]]]
+    span_state: str
+    update_id: NotRequired[int]
+    queued_at: float
+
+
+class PromptScorer(TypedDict):
+    name: str
+    prompt: str
+    threshold: float
+    options: NotRequired[Optional[Dict[str, float]]]
+    created_at: NotRequired[Optional[str]]
+    updated_at: NotRequired[Optional[str]]
+    is_trace: NotRequired[Optional[bool]]
+
+
+class ScorerData(TypedDict):
+    name: str
+    threshold: float
+    success: bool
+    score: NotRequired[Optional[float]]
+    reason: NotRequired[Optional[str]]
+    strict_mode: NotRequired[Optional[bool]]
+    evaluation_model: NotRequired[Union[List[str], str]]
+    error: NotRequired[Optional[str]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+
+
+class TraceUsage(TypedDict):
+    prompt_tokens: NotRequired[Optional[int]]
+    completion_tokens: NotRequired[Optional[int]]
+    cache_creation_input_tokens: NotRequired[Optional[int]]
+    cache_read_input_tokens: NotRequired[Optional[int]]
+    total_tokens: NotRequired[Optional[int]]
+    prompt_tokens_cost_usd: NotRequired[Optional[float]]
+    completion_tokens_cost_usd: NotRequired[Optional[float]]
+    total_cost_usd: NotRequired[Optional[float]]
+    model_name: NotRequired[Optional[str]]
+
+
+class Tool(TypedDict):
+    tool_name: str
+    parameters: NotRequired[Optional[Dict[str, Any]]]
+    agent_name: NotRequired[Optional[str]]
+    result_dependencies: NotRequired[Optional[List[Dict[str, Any]]]]
+    action_dependencies: NotRequired[Optional[List[Dict[str, Any]]]]
+    require_all: NotRequired[Optional[bool]]
+
+
+class ExampleEvaluationRun(TypedDict):
+    id: NotRequired[Optional[str]]
+    project_name: NotRequired[Optional[str]]
+    eval_name: NotRequired[Optional[str]]
+    custom_scorers: NotRequired[List[BaseScorer]]
+    judgment_scorers: NotRequired[List[ScorerConfig]]
+    model: str
+    created_at: NotRequired[Optional[str]]
+    examples: List[Example]
+    trace_span_id: NotRequired[Optional[str]]
+    trace_id: NotRequired[Optional[str]]
+
+
+class HTTPValidationError(TypedDict):
+    detail: NotRequired[List[ValidationError]]
+
+
+class TraceEvaluationRun(TypedDict):
+    id: NotRequired[Optional[str]]
+    project_name: NotRequired[Optional[str]]
+    eval_name: NotRequired[Optional[str]]
+    custom_scorers: NotRequired[List[BaseScorer]]
+    judgment_scorers: NotRequired[List[ScorerConfig]]
+    model: str
+    created_at: NotRequired[Optional[str]]
+    trace_and_span_ids: List[TraceAndSpanId]
+    is_offline: NotRequired[bool]
+
+
+class DatasetInsertExamples(TypedDict):
+    dataset_alias: str
+    examples: List[Example]
+    project_name: str
+
+
+class SpansBatchRequest(TypedDict):
+    spans: List[SpanBatchItem]
+    organization_id: str
+
+
+class FetchPromptScorerResponse(TypedDict):
+    scorer: PromptScorer
+
+
+class TraceSpan(TypedDict):
+    span_id: str
+    trace_id: str
+    function: str
+    created_at: NotRequired[Any]
+    parent_span_id: NotRequired[Optional[str]]
+    span_type: NotRequired[Optional[str]]
+    inputs: NotRequired[Optional[Dict[str, Any]]]
+    error: NotRequired[Optional[Dict[str, Any]]]
+    output: NotRequired[Any]
+    usage: NotRequired[Optional[TraceUsage]]
+    duration: NotRequired[Optional[float]]
+    expected_tools: NotRequired[Optional[List[Tool]]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+    has_evaluation: NotRequired[Optional[bool]]
+    agent_name: NotRequired[Optional[str]]
+    class_name: NotRequired[Optional[str]]
+    state_before: NotRequired[Optional[Dict[str, Any]]]
+    state_after: NotRequired[Optional[Dict[str, Any]]]
+    update_id: NotRequired[int]
+
+
+class Trace(TypedDict):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    trace_spans: List[TraceSpan]
+    offline_mode: NotRequired[bool]
+    rules: NotRequired[Dict[str, Any]]
+    has_notification: NotRequired[bool]
+    customer_id: NotRequired[Optional[str]]
+    tags: NotRequired[List[str]]
+    metadata: NotRequired[Dict[str, Any]]
+    update_id: NotRequired[int]
+
+
+class ScoringResult(TypedDict):
+    success: bool
+    scorers_data: Optional[List[ScorerData]]
+    name: NotRequired[Optional[str]]
+    data_object: NotRequired[Optional[Union[TraceSpan, Example]]]
+    trace_id: NotRequired[Optional[str]]
+    run_duration: NotRequired[Optional[float]]
+    evaluation_cost: NotRequired[Optional[float]]
+
+
+class TraceRun(TypedDict):
+    project_name: NotRequired[Optional[str]]
+    eval_name: NotRequired[Optional[str]]
+    traces: List[Trace]
+    scorers: List[ScorerConfig]
+    model: str
+    trace_span_id: NotRequired[Optional[str]]
+    tools: NotRequired[Optional[List[Dict[str, Any]]]]
+
+
+class EvalResults(TypedDict):
+    results: List[ScoringResult]
+    run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+class DatasetPush(TypedDict):
+    dataset_alias: str
+    comments: NotRequired[Optional[str]]
+    source_file: NotRequired[Optional[str]]
+    examples: NotRequired[Optional[List[Example]]]
+    traces: NotRequired[Optional[List[Trace]]]
+    is_trace: NotRequired[bool]
+    project_name: str
+    overwrite: NotRequired[Optional[bool]]
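The new module only declares request/response shapes for the Judgment API. A minimal usage sketch (not part of the package diff): field names come from the TypedDict definitions above, while the concrete values are illustrative assumptions.

```python
# Hedged sketch: populating two of the generated TypedDicts from api_types.py.
from judgeval.api.api_types import ScorerData, ScoringResult

scorer_data: ScorerData = {
    "name": "Faithfulness",   # illustrative value
    "threshold": 0.7,
    "success": True,
    "score": 0.92,            # NotRequired keys may simply be omitted
}

result: ScoringResult = {
    "success": True,
    "scorers_data": [scorer_data],
}
```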
judgeval/cli.py
CHANGED
@@ -3,8 +3,8 @@
 import typer
 from pathlib import Path
 from dotenv import load_dotenv
-from judgeval.
-from judgeval
+from judgeval.logger import judgeval_logger
+from judgeval import JudgmentClient
 
 load_dotenv()
 
@@ -61,5 +61,3 @@ def version():
 
 if __name__ == "__main__":
     app()
-
-# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
judgeval/constants.py
CHANGED
@@ -1,10 +1,8 @@
-"""
-Constant variables used throughout source code
-"""
+from __future__ import annotations
 
 from enum import Enum
+from typing import Set
 import litellm
-import os
 
 
 class APIScorerType(str, Enum):
@@ -23,30 +21,28 @@ class APIScorerType(str, Enum):
     EXECUTION_ORDER = "Execution Order"
     DERAILMENT = "Derailment"
     TOOL_ORDER = "Tool Order"
+    MOCK_TRACE_SCORER = "Mock Trace Scorer"
     CLASSIFIER = "Classifier"
     TOOL_DEPENDENCY = "Tool Dependency"
     CUSTOM = "Custom"
 
     @classmethod
-    def
+    def __missing__(cls, value: str) -> APIScorerType:
         # Handle case-insensitive lookup
         for member in cls:
             if member.value == value.lower():
                 return member
 
+        raise ValueError(f"Invalid scorer type: {value}")
 
-
+
+UNBOUNDED_SCORERS: Set[APIScorerType] = (
     set()
 )  # scorers whose scores are not bounded between 0-1
 
-
-
-
-)
-RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
-RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-# Models
-LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)
+
 
 TOGETHER_SUPPORTED_MODELS = [
     "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
@@ -104,20 +100,8 @@ TOGETHER_SUPPORTED_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
 ]
 
-DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
-DEFAULT_GPT_MODEL = "gpt-4.1"
-
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
 ACCEPTABLE_MODELS = (
     set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 )
-
-## System settings
-MAX_WORKER_THREADS = 10
-
-# Maximum number of concurrent operations for evaluation runs
-MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
-
-# Span lifecycle management
-SPAN_LIFECYCLE_END_UPDATE_ID = 20  # Default ending number for completed spans
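With the RabbitMQ, threading, and span-lifecycle settings removed, constants.py in 0.9.1 centres on model allow-lists. A small sketch of the validation pattern these sets support (the helper function is hypothetical, not part of the package; the same membership check and error message appear in evaluation_run.py below):

```python
# Hypothetical helper: shows how ACCEPTABLE_MODELS (litellm models |
# Together models | Judgment's osiris models) is typically consumed.
from judgeval.constants import ACCEPTABLE_MODELS

def check_model(name: str) -> str:
    if name not in ACCEPTABLE_MODELS:
        raise ValueError(f"Model name {name} not recognized. Please select a valid model name.")
    return name

check_model("osiris-mini")  # passes; an unknown name raises ValueError
```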
judgeval/data/evaluation_run.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Tuple
+from litellm.files.main import BaseModel
 from pydantic import field_validator, model_validator, Field
 from datetime import datetime, timezone
 import uuid
@@ -6,29 +7,22 @@ import uuid
 from judgeval.data import Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.constants import ACCEPTABLE_MODELS
-from judgeval.data.judgment_types import
+from judgeval.data.judgment_types import (
+    ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
+    TraceEvaluationRun as TraceEvaluationRunJudgmentType,
+)
 
 
-class EvaluationRun(
-    """
-    Stores example and evaluation scorers together for running an eval task
-
-    Args:
-        project_name (str): The name of the project the evaluation results belong to
-        eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
-        model (str): The model used as a judge when using LLM as a Judge
-        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-    """
-
+class EvaluationRun(BaseModel):
     id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
     created_at: Optional[str] = Field(
         default_factory=lambda: datetime.now(timezone.utc).isoformat()
     )
+    organization_id: Optional[str] = None
     custom_scorers: Optional[List[BaseScorer]] = None
     judgment_scorers: Optional[List[APIScorerConfig]] = None
-
+    scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None
+    model: str
 
     def __init__(
         self,
@@ -57,19 +51,9 @@ class EvaluationRun(EvaluationRunJudgmentType):
         data = super().model_dump(**kwargs)
         data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
         data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
-        data["examples"] = [example.model_dump() for example in self.examples]
 
         return data
 
-    @field_validator("examples")
-    def validate_examples(cls, v):
-        if not v:
-            raise ValueError("Examples cannot be empty.")
-        for item in v:
-            if not isinstance(item, Example):
-                raise ValueError(f"Item of type {type(item)} is not a Example")
-        return v
-
     @model_validator(mode="after")
     @classmethod
     def validate_scorer_lists(cls, values):
@@ -102,3 +86,42 @@ class EvaluationRun(EvaluationRunJudgmentType):
                 f"Model name {v} not recognized. Please select a valid model name.)"
             )
         return v
+
+
+class ExampleEvaluationRun(EvaluationRun, ExampleEvaluationRunJudgmentType):  # type: ignore
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        examples (List[Example]): The examples to evaluate
+        scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+    """
+
+    examples: List[Example]  # type: ignore
+
+    @field_validator("examples")
+    def validate_examples(cls, v):
+        if not v:
+            raise ValueError("Examples cannot be empty.")
+        for item in v:
+            if not isinstance(item, Example):
+                raise ValueError(f"Item of type {type(item)} is not a Example")
+        return v
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["examples"] = [example.model_dump() for example in self.examples]
+        return data
+
+
+class TraceEvaluationRun(EvaluationRun, TraceEvaluationRunJudgmentType):  # type: ignore
+    trace_and_span_ids: List[Tuple[str, str]]  # type: ignore
+
+    @field_validator("trace_and_span_ids")
+    def validate_trace_and_span_ids(cls, v):
+        if not v:
+            raise ValueError("Trace and span IDs are required for trace evaluations.")
+        return v
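EvaluationRun is now a plain base model, with the example- and trace-specific behaviour split into subclasses. A hedged sketch of what that split looks like from the caller's side; the class names and field types come from the diff, and the comments summarise the validators it adds (exact constructor arguments, such as which scorers are required, are not shown in the hunks and may differ):

```python
# Illustrative only: the inheritance split introduced in this diff.
from judgeval.data.evaluation_run import (
    EvaluationRun,
    ExampleEvaluationRun,
    TraceEvaluationRun,
)

assert issubclass(ExampleEvaluationRun, EvaluationRun)
assert issubclass(TraceEvaluationRun, EvaluationRun)

# ExampleEvaluationRun carries `examples: List[Example]`, which its validator
# requires to be a non-empty list of Example instances.
# TraceEvaluationRun carries `trace_and_span_ids: List[Tuple[str, str]]`,
# e.g. [("trace-123", "span-456")], which must likewise be non-empty.
```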
judgeval/data/example.py
CHANGED
@@ -5,7 +5,7 @@ Classes for representing examples in a dataset.
 from enum import Enum
 from datetime import datetime
 from typing import Dict, Any, Optional
-from judgeval.data.judgment_types import
+from judgeval.data.judgment_types import Example as JudgmentExample
 
 
 class ExampleParams(str, Enum):
@@ -19,7 +19,7 @@ class ExampleParams(str, Enum):
     ADDITIONAL_METADATA = "additional_metadata"
 
 
-class Example(
+class Example(JudgmentExample):
     example_id: str = ""
     created_at: str = datetime.now().isoformat()
     name: Optional[str] = None
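Example now inherits from the generated judgment_types model instead of defining its fields from scratch. A brief, hedged sketch of what that means when constructing one (the defaults shown come from the diff; any additional fields inherited from the generated base are not shown here and may impose further requirements):

```python
# Illustrative only; assumes the defaults visible in the diff above.
from judgeval.data.example import Example

ex = Example(name="capital-question")
# example_id defaults to "" and created_at to datetime.now().isoformat(),
# so only the fields you care about need to be supplied.
print(ex.example_id, ex.created_at, ex.name)
```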
|