judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Potentially problematic release.
This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/api_types.py
ADDED
@@ -0,0 +1,407 @@
+# generated by datamodel-codegen:
+# filename: .openapi.json
+# timestamp: 2025-10-25T22:30:20+00:00
+
+from __future__ import annotations
+from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
+from typing_extensions import NotRequired
+
+
+TraceAndSpanId = List
+
+
+class EvalResultsFetch(TypedDict):
+    experiment_run_id: str
+    project_name: str
+
+
+class DatasetFetch(TypedDict):
+    dataset_name: str
+    project_name: str
+
+
+class DatasetsFetch(TypedDict):
+    project_name: str
+
+
+class ProjectAdd(TypedDict):
+    project_name: str
+
+
+class ProjectAddResponse(TypedDict):
+    project_id: str
+
+
+class ProjectDeleteFromJudgevalResponse(TypedDict):
+    project_name: str
+
+
+class ProjectDeleteResponse(TypedDict):
+    message: str
+
+
+class ScorerExistsRequest(TypedDict):
+    name: str
+
+
+class ScorerExistsResponse(TypedDict):
+    exists: bool
+
+
+class SavePromptScorerRequest(TypedDict):
+    name: str
+    prompt: str
+    threshold: float
+    model: NotRequired[str]
+    is_trace: NotRequired[bool]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
+
+
+class FetchPromptScorersRequest(TypedDict):
+    names: NotRequired[Optional[List[str]]]
+    is_trace: NotRequired[Optional[bool]]
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+    overwrite: NotRequired[bool]
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
+
+
+class PromptInsertRequest(TypedDict):
+    project_id: str
+    name: str
+    prompt: str
+    tags: List[str]
+
+
+class PromptInsertResponse(TypedDict):
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+
+
+class PromptTagRequest(TypedDict):
+    project_id: str
+    name: str
+    commit_id: str
+    tags: List[str]
+
+
+class PromptTagResponse(TypedDict):
+    commit_id: str
+
+
+class PromptUntagRequest(TypedDict):
+    project_id: str
+    name: str
+    tags: List[str]
+
+
+class PromptUntagResponse(TypedDict):
+    commit_ids: List[str]
+
+
+class ResolveProjectNameRequest(TypedDict):
+    project_name: str
+
+
+class ResolveProjectNameResponse(TypedDict):
+    project_id: str
+
+
+class TraceIdRequest(TypedDict):
+    trace_id: str
+
+
+class SpanScoreRequest(TypedDict):
+    span_id: str
+    trace_id: str
+
+
+class BaseScorer(TypedDict):
+    score_type: str
+    threshold: NotRequired[float]
+    name: NotRequired[Optional[str]]
+    class_name: NotRequired[Optional[str]]
+    score: NotRequired[Optional[float]]
+    score_breakdown: NotRequired[Optional[Dict[str, Any]]]
+    reason: NotRequired[Optional[str]]
+    using_native_model: NotRequired[Optional[bool]]
+    success: NotRequired[Optional[bool]]
+    model: NotRequired[Optional[str]]
+    model_client: NotRequired[Any]
+    strict_mode: NotRequired[bool]
+    error: NotRequired[Optional[str]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+    user: NotRequired[Optional[str]]
+    server_hosted: NotRequired[bool]
+
+
+class ScorerConfig(TypedDict):
+    score_type: str
+    name: NotRequired[Optional[str]]
+    threshold: NotRequired[float]
+    model: NotRequired[Optional[str]]
+    strict_mode: NotRequired[bool]
+    required_params: NotRequired[List[str]]
+    kwargs: NotRequired[Optional[Dict[str, Any]]]
+
+
+class Example(TypedDict):
+    example_id: NotRequired[str]
+    created_at: NotRequired[str]
+    name: NotRequired[Optional[str]]
+
+
+class ValidationError(TypedDict):
+    loc: List[Union[str, int]]
+    msg: str
+    type: str
+
+
+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
+DatasetKind = Literal["trace", "example"]
+
+
+class PromptScorer(TypedDict):
+    id: str
+    user_id: str
+    organization_id: str
+    name: str
+    prompt: str
+    threshold: float
+    model: NotRequired[str]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
+    created_at: NotRequired[Optional[str]]
+    updated_at: NotRequired[Optional[str]]
+    is_trace: NotRequired[Optional[bool]]
+    is_bucket_rubric: NotRequired[Optional[bool]]
+
+
+class PromptCommitInfo(TypedDict):
+    name: str
+    prompt: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+    first_name: str
+    last_name: str
+    user_email: str
+
+
+class ScorerData(TypedDict):
+    id: NotRequired[str]
+    name: str
+    threshold: float
+    success: bool
+    score: NotRequired[Optional[float]]
+    reason: NotRequired[Optional[str]]
+    strict_mode: NotRequired[Optional[bool]]
+    evaluation_model: NotRequired[Optional[str]]
+    error: NotRequired[Optional[str]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+
+
+class OtelTraceSpan(TypedDict):
+    organization_id: str
+    project_id: NotRequired[Optional[str]]
+    user_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[int]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
+    links: NotRequired[Optional[List[Dict[str, Any]]]]
+
+
+class OtelSpanListItemScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+
+
+class OtelSpanDetailScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+    example_id: NotRequired[Optional[str]]
+
+
+class ExampleEvaluationRun(TypedDict):
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
+    custom_scorers: NotRequired[List[BaseScorer]]
+    judgment_scorers: NotRequired[List[ScorerConfig]]
+    model: NotRequired[Optional[str]]
+    created_at: NotRequired[str]
+    examples: List[Example]
+    trace_span_id: NotRequired[Optional[str]]
+    trace_id: NotRequired[Optional[str]]
+
+
+class HTTPValidationError(TypedDict):
+    detail: NotRequired[List[ValidationError]]
+
+
+class TraceEvaluationRun(TypedDict):
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
+    custom_scorers: NotRequired[List[BaseScorer]]
+    judgment_scorers: NotRequired[List[ScorerConfig]]
+    model: NotRequired[Optional[str]]
+    created_at: NotRequired[str]
+    trace_and_span_ids: List[TraceAndSpanId]
+    is_offline: NotRequired[bool]
+    is_bucket_run: NotRequired[bool]
+
+
+class DatasetInsertExamples(TypedDict):
+    dataset_name: str
+    examples: List[Example]
+    project_name: str
+
+
+class DatasetInfo(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: DatasetKind
+    entries: int
+    creator: str
+
+
+class DatasetCreate(TypedDict):
+    name: str
+    dataset_kind: DatasetKind
+    project_name: str
+    examples: List[Example]
+    overwrite: bool
+
+
+class SavePromptScorerResponse(TypedDict):
+    scorer_response: PromptScorer
+
+
+class FetchPromptScorersResponse(TypedDict):
+    scorers: List[PromptScorer]
+
+
+class PromptFetchResponse(TypedDict):
+    commit: NotRequired[Optional[PromptCommitInfo]]
+
+
+class PromptVersionsResponse(TypedDict):
+    versions: List[PromptCommitInfo]
+
+
+class ScoringResult(TypedDict):
+    success: bool
+    scorers_data: List[ScorerData]
+    name: NotRequired[Optional[str]]
+    data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
+    trace_id: NotRequired[Optional[str]]
+    run_duration: NotRequired[Optional[float]]
+    evaluation_cost: NotRequired[Optional[float]]
+
+
+class OtelTraceListItem(TypedDict):
+    organization_id: str
+    project_id: str
+    trace_id: str
+    created_at: str
+    duration: NotRequired[Optional[int]]
+    tags: NotRequired[Optional[List[str]]]
+    experiment_run_id: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
+    scores: NotRequired[List[OtelSpanListItemScores]]
+    rules_invoked: NotRequired[List[str]]
+    customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
+    input_preview: NotRequired[Optional[str]]
+    output_preview: NotRequired[Optional[str]]
+    annotation_count: NotRequired[int]
+    span_id: str
+    rule_id: NotRequired[Optional[str]]
+
+
+class OtelSpanDetail(TypedDict):
+    organization_id: str
+    project_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[int]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
+    links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    llm_cost: NotRequired[Optional[float]]
+    prompt_tokens: NotRequired[Optional[int]]
+    completion_tokens: NotRequired[Optional[int]]
+    scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
+class EvalResults(TypedDict):
+    results: List[ScoringResult]
+    run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+class DatasetTraceWithSpans(TypedDict):
+    dataset_id: str
+    trace_detail: OtelTraceListItem
+    spans: List[OtelSpanDetail]
+
+
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    dataset_kind: DatasetKind
+    examples: NotRequired[List[Example]]
+    traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
judgeval/cli.py
ADDED
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.logger import judgeval_logger
+from judgeval import JudgmentClient
+from judgeval.version import get_version
+from judgeval.exceptions import JudgmentAPIError
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+
+
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        "-o",
+        help="Overwrite existing scorer if it already exists",
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+
+    try:
+        client = JudgmentClient()
+
+        result = client.upload_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+            overwrite=overwrite,
+        )
+
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+
+        judgeval_logger.info("Custom scorer uploaded successfully!")
+        raise typer.Exit(0)
+    except Exception as e:
+        if isinstance(e, JudgmentAPIError) and e.status_code == 409:
+            judgeval_logger.error(
+                "Duplicate scorer detected. Use --overwrite flag to replace the existing scorer"
+            )
+            raise typer.Exit(1)
+        # Re-raise other exceptions
+        raise
+
+
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info(f"Judgeval CLI v{get_version()}")
+
+
+if __name__ == "__main__":
+    app()
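Because this is a standard Typer app (presumably wired up as a console script through the new entry_points.txt), it can be exercised in-process with Typer's test runner. A minimal sketch, with placeholder file paths:

from typer.testing import CliRunner

from judgeval.cli import app

runner = CliRunner()

# The two file paths are placeholders and must point at real files; assuming
# the console script is bound to `app`, this mirrors something like:
#   judgeval upload_scorer my_scorer.py requirements.txt --overwrite
result = runner.invoke(
    app, ["upload_scorer", "my_scorer.py", "requirements.txt", "--overwrite"]
)
print(result.exit_code)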
judgeval/constants.py
CHANGED
@@ -1,68 +1,97 @@
-"""
-Constant variables used throughout source code
-"""
+from __future__ import annotations
 
 from enum import Enum
+from typing import Set
 import litellm
-import os
 
-
+
+class APIScorerType(str, Enum):
     """
     Collection of proprietary scorers implemented by Judgment.
 
     These are ready-made evaluation scorers that can be used to evaluate
     Examples via the Judgment API.
     """
-
-
-
-
-
-
-
-
-
-    JSON_CORRECTNESS = "json_correctness"
+
+    PROMPT_SCORER = "Prompt Scorer"
+    TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
+    FAITHFULNESS = "Faithfulness"
+    ANSWER_RELEVANCY = "Answer Relevancy"
+    ANSWER_CORRECTNESS = "Answer Correctness"
+    INSTRUCTION_ADHERENCE = "Instruction Adherence"
+    EXECUTION_ORDER = "Execution Order"
+    CUSTOM = "Custom"
 
     @classmethod
-    def
-        # Handle case-insensitive lookup
+    def __missing__(cls, value: str) -> APIScorerType:
         for member in cls:
             if member.value == value.lower():
                 return member
 
-
-
-JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
-JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
-JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
-JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
-JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
-JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
-JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+        raise ValueError(f"Invalid scorer type: {value}")
+
 
-
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
-RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
-RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
+LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)
 
-# Models
-TOGETHER_SUPPORTED_MODELS = {
-    "QWEN": "Qwen/Qwen2-72B-Instruct",
-    "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-    "LLAMA3_405B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-    "LLAMA3_8B_INSTRUCT_TURBO": "meta-llama/Llama-3.2-3B-Instruct-Turbo",
-    "MISTRAL_8x22B_INSTRUCT": "mistralai/Mixtral-8x22B-Instruct-v0.1",
-    "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-}
 
-
+TOGETHER_SUPPORTED_MODELS = [
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1",
+]
 
-
+JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
-
-
+ACCEPTABLE_MODELS = (
+    set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
+)
judgeval/data/__init__.py
CHANGED
@@ -1,15 +1,15 @@
 from judgeval.data.example import Example, ExampleParams
-from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.trace import TraceUsage
+
 
 __all__ = [
     "Example",
     "ExampleParams",
-    "ProcessExample",
-    "create_process_example",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "TraceUsage",
 ]