judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/api_types.py
ADDED

@@ -0,0 +1,413 @@
+# generated by datamodel-codegen:
+# filename: .openapi.json
+# timestamp: 2025-11-18T18:52:11+00:00
+
+from __future__ import annotations
+from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
+from typing_extensions import NotRequired
+
+
+TraceAndSpanId = List
+
+
+class LogEvalResultsResponse(TypedDict):
+    ui_results_url: str
+
+
+class EvalResultsFetch(TypedDict):
+    experiment_run_id: str
+    project_name: str
+
+
+class FetchExperimentRunResponse(TypedDict):
+    results: NotRequired[Optional[List]]
+    ui_results_url: NotRequired[Optional[str]]
+
+
+class DatasetFetch(TypedDict):
+    dataset_name: str
+    project_name: str
+
+
+class DatasetsFetch(TypedDict):
+    project_name: str
+
+
+class ProjectAdd(TypedDict):
+    project_name: str
+
+
+class ProjectAddResponse(TypedDict):
+    project_id: str
+
+
+class ProjectDeleteFromJudgevalResponse(TypedDict):
+    project_name: str
+
+
+class ProjectDeleteResponse(TypedDict):
+    message: str
+
+
+class ScorerExistsRequest(TypedDict):
+    name: str
+
+
+class ScorerExistsResponse(TypedDict):
+    exists: bool
+
+
+class SavePromptScorerRequest(TypedDict):
+    name: str
+    prompt: str
+    threshold: float
+    model: NotRequired[str]
+    is_trace: NotRequired[bool]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
+
+
+class FetchPromptScorersRequest(TypedDict):
+    names: NotRequired[Optional[List[str]]]
+    is_trace: NotRequired[Optional[bool]]
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+    overwrite: NotRequired[bool]
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
+
+
+class PromptInsertRequest(TypedDict):
+    project_id: str
+    name: str
+    prompt: str
+    tags: List[str]
+
+
+class PromptInsertResponse(TypedDict):
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+
+
+class PromptTagRequest(TypedDict):
+    project_id: str
+    name: str
+    commit_id: str
+    tags: List[str]
+
+
+class PromptTagResponse(TypedDict):
+    commit_id: str
+
+
+class PromptUntagRequest(TypedDict):
+    project_id: str
+    name: str
+    tags: List[str]
+
+
+class PromptUntagResponse(TypedDict):
+    commit_ids: List[str]
+
+
+class ResolveProjectNameRequest(TypedDict):
+    project_name: str
+
+
+class ResolveProjectNameResponse(TypedDict):
+    project_id: str
+
+
+class TraceIdRequest(TypedDict):
+    trace_id: str
+
+
+class SpanScoreRequest(TypedDict):
+    span_id: str
+    trace_id: str
+
+
+class BaseScorer(TypedDict):
+    score_type: str
+    threshold: NotRequired[float]
+    name: NotRequired[Optional[str]]
+    class_name: NotRequired[Optional[str]]
+    score: NotRequired[Optional[float]]
+    score_breakdown: NotRequired[Optional[Dict[str, Any]]]
+    reason: NotRequired[Optional[str]]
+    using_native_model: NotRequired[Optional[bool]]
+    success: NotRequired[Optional[bool]]
+    model: NotRequired[Optional[str]]
+    model_client: NotRequired[Any]
+    strict_mode: NotRequired[bool]
+    error: NotRequired[Optional[str]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+    user: NotRequired[Optional[str]]
+    server_hosted: NotRequired[bool]
+
+
+class ScorerConfig(TypedDict):
+    score_type: str
+    name: NotRequired[Optional[str]]
+    threshold: NotRequired[float]
+    strict_mode: NotRequired[bool]
+    required_params: NotRequired[List[str]]
+    kwargs: NotRequired[Optional[Dict[str, Any]]]
+
+
+class Example(TypedDict):
+    example_id: NotRequired[str]
+    created_at: NotRequired[str]
+    name: NotRequired[Optional[str]]
+
+
+class ValidationError(TypedDict):
+    loc: List[Union[str, int]]
+    msg: str
+    type: str
+
+
+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
+DatasetKind = Literal["trace", "example"]
+
+
+class PromptScorer(TypedDict):
+    id: str
+    user_id: str
+    organization_id: str
+    name: str
+    prompt: str
+    threshold: float
+    model: NotRequired[str]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
+    created_at: NotRequired[Optional[str]]
+    updated_at: NotRequired[Optional[str]]
+    is_trace: NotRequired[Optional[bool]]
+    is_bucket_rubric: NotRequired[Optional[bool]]
+
+
+class PromptCommitInfo(TypedDict):
+    name: str
+    prompt: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: NotRequired[Optional[str]]
+    created_at: str
+    first_name: str
+    last_name: str
+    user_email: str
+
+
+class ScorerData(TypedDict):
+    id: NotRequired[str]
+    name: str
+    threshold: float
+    success: bool
+    score: NotRequired[Optional[float]]
+    reason: NotRequired[Optional[str]]
+    strict_mode: NotRequired[Optional[bool]]
+    evaluation_model: NotRequired[Optional[str]]
+    error: NotRequired[Optional[str]]
+    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+
+
+class OtelTraceSpan(TypedDict):
+    organization_id: str
+    project_id: NotRequired[Optional[str]]
+    user_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[int]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
+    links: NotRequired[Optional[List[Dict[str, Any]]]]
+
+
+class OtelSpanListItemScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+
+
+class OtelSpanDetailScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+    example_id: NotRequired[Optional[str]]
+
+
+class ExampleEvaluationRun(TypedDict):
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
+    custom_scorers: NotRequired[List[BaseScorer]]
+    judgment_scorers: NotRequired[List[ScorerConfig]]
+    created_at: NotRequired[str]
+    examples: List[Example]
+    trace_span_id: NotRequired[Optional[str]]
+    trace_id: NotRequired[Optional[str]]
+
+
+class HTTPValidationError(TypedDict):
+    detail: NotRequired[List[ValidationError]]
+
+
+class TraceEvaluationRun(TypedDict):
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
+    custom_scorers: NotRequired[List[BaseScorer]]
+    judgment_scorers: NotRequired[List[ScorerConfig]]
+    created_at: NotRequired[str]
+    trace_and_span_ids: List[TraceAndSpanId]
+    is_offline: NotRequired[bool]
+    is_bucket_run: NotRequired[bool]
+
+
+class DatasetInsertExamples(TypedDict):
+    dataset_name: str
+    examples: List[Example]
+    project_name: str
+
+
+class DatasetInfo(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: DatasetKind
+    entries: int
+    creator: str
+
+
+class DatasetCreate(TypedDict):
+    name: str
+    dataset_kind: DatasetKind
+    project_name: str
+    examples: List[Example]
+    overwrite: bool
+
+
+class SavePromptScorerResponse(TypedDict):
+    scorer_response: PromptScorer
+
+
+class FetchPromptScorersResponse(TypedDict):
+    scorers: List[PromptScorer]
+
+
+class PromptFetchResponse(TypedDict):
+    commit: NotRequired[Optional[PromptCommitInfo]]
+
+
+class PromptVersionsResponse(TypedDict):
+    versions: List[PromptCommitInfo]
+
+
+class ScoringResult(TypedDict):
+    success: bool
+    scorers_data: List[ScorerData]
+    name: NotRequired[Optional[str]]
+    data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
+    trace_id: NotRequired[Optional[str]]
+    run_duration: NotRequired[Optional[float]]
+    evaluation_cost: NotRequired[Optional[float]]
+
+
+class OtelTraceListItem(TypedDict):
+    organization_id: str
+    project_id: str
+    trace_id: str
+    created_at: str
+    duration: NotRequired[Optional[int]]
+    tags: NotRequired[Optional[List[str]]]
+    experiment_run_id: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
+    scores: NotRequired[List[OtelSpanListItemScores]]
+    rules_invoked: NotRequired[List[str]]
+    customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
+    input_preview: NotRequired[Optional[str]]
+    output_preview: NotRequired[Optional[str]]
+    annotation_count: NotRequired[int]
+    span_id: str
+    rule_id: NotRequired[Optional[str]]
+
+
+class OtelSpanDetail(TypedDict):
+    organization_id: str
+    project_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[int]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
+    links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    llm_cost: NotRequired[Optional[float]]
+    prompt_tokens: NotRequired[Optional[int]]
+    completion_tokens: NotRequired[Optional[int]]
+    scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
+class EvalResults(TypedDict):
+    results: List[ScoringResult]
+    run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+class DatasetTraceWithSpans(TypedDict):
+    dataset_id: str
+    trace_detail: OtelTraceListItem
+    spans: List[OtelSpanDetail]
+
+
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    dataset_kind: DatasetKind
+    examples: NotRequired[List[Example]]
+    traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
judgeval/cli.py
ADDED
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+import sys
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.logger import judgeval_logger
+from judgeval import JudgmentClient
+from judgeval.version import get_version
+from judgeval.exceptions import JudgmentAPIError
+from judgeval.utils.project import _resolve_project_id
+from judgeval.utils.url import url_for
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+    rich_help_panel=None,
+    rich_markup_mode=None,
+)
+
+
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
+)
+def load_otel_env(
+    ctx: typer.Context,
+    project_name: str = typer.Argument(help="Project name to send telemetry to"),
+    api_key: str = typer.Option(None, envvar="JUDGMENT_API_KEY"),
+    organization_id: str = typer.Option(None, envvar="JUDGMENT_ORG_ID"),
+):
+    """Run command with OpenTelemetry environment variables configured for Judgment."""
+    if not api_key or not organization_id:
+        raise typer.BadParameter("JUDGMENT_API_KEY and JUDGMENT_ORG_ID required")
+
+    project_id = _resolve_project_id(project_name, api_key, organization_id)
+    if not project_id:
+        raise typer.BadParameter(f"Project '{project_name}' not found")
+
+    if not ctx.args:
+        raise typer.BadParameter(
+            "No command provided. Usage: judgeval load_otel_env PROJECT_NAME -- COMMAND"
+        )
+
+    env = os.environ.copy()
+    env["OTEL_TRACES_EXPORTER"] = "otlp"
+    env["OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"] = "http/protobuf"
+    env["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = url_for("/otel/v1/traces")
+    env["OTEL_EXPORTER_OTLP_HEADERS"] = (
+        f"Authorization=Bearer {api_key},X-Organization-Id={organization_id},X-Project-Id={project_id}"
+    )
+
+    result = subprocess.run(ctx.args, env=env)
+    sys.exit(result.returncode)
+
+
+@app.command()
+def upload_scorer(
+    scorer_file_path: str = typer.Argument(help="Path to scorer Python file"),
+    requirements_file_path: str = typer.Argument(help="Path to requirements.txt file"),
+    unique_name: str = typer.Option(
+        None, help="Custom scorer name (auto-detected if not provided)"
+    ),
+    overwrite: bool = typer.Option(
+        False, "--overwrite", "-o", help="Overwrite if exists"
+    ),
+    api_key: str = typer.Option(None, envvar="JUDGMENT_API_KEY"),
+    organization_id: str = typer.Option(None, envvar="JUDGMENT_ORG_ID"),
+):
+    """Upload custom scorer to Judgment."""
+    scorer_path = Path(scorer_file_path)
+    requirements_path = Path(requirements_file_path)
+
+    if not scorer_path.exists():
+        raise typer.BadParameter(f"Scorer file not found: {scorer_file_path}")
+    if not requirements_path.exists():
+        raise typer.BadParameter(
+            f"Requirements file not found: {requirements_file_path}"
+        )
+
+    client = JudgmentClient(api_key=api_key, organization_id=organization_id)
+
+    try:
+        result = client.upload_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+            overwrite=overwrite,
+        )
+        if not result:
+            raise typer.Abort()
+        judgeval_logger.info("Custom scorer uploaded successfully!")
+    except JudgmentAPIError as e:
+        if e.status_code == 409:
+            judgeval_logger.error("Scorer exists. Use --overwrite to replace")
+            raise typer.Exit(1)
+        raise
+
+
+@app.command()
+def version():
+    """Show Judgeval CLI version."""
+    typer.echo(f"Judgeval CLI v{get_version()}")
+
+
+if __name__ == "__main__":
+    app()
judgeval/constants.py
CHANGED
@@ -1,10 +1,8 @@
-"""
-Constant variables used throughout source code
-"""
+from __future__ import annotations

 from enum import Enum
+from typing import Set
 import litellm
-import os


 class APIScorerType(str, Enum):
@@ -16,37 +14,25 @@ class APIScorerType(str, Enum):
     """

     PROMPT_SCORER = "Prompt Scorer"
+    TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
     FAITHFULNESS = "Faithfulness"
     ANSWER_RELEVANCY = "Answer Relevancy"
     ANSWER_CORRECTNESS = "Answer Correctness"
     INSTRUCTION_ADHERENCE = "Instruction Adherence"
     EXECUTION_ORDER = "Execution Order"
-    DERAILMENT = "Derailment"
-    TOOL_ORDER = "Tool Order"
-    CLASSIFIER = "Classifier"
-    TOOL_DEPENDENCY = "Tool Dependency"
     CUSTOM = "Custom"

     @classmethod
-    def
-        # Handle case-insensitive lookup
+    def __missing__(cls, value: str) -> APIScorerType:
         for member in cls:
             if member.value == value.lower():
                 return member

+        raise ValueError(f"Invalid scorer type: {value}")

-UNBOUNDED_SCORERS: set[APIScorerType] = (
-    set()
-)  # scorers whose scores are not bounded between 0-1

-
-RABBITMQ_HOST = os.getenv(
-    "RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com"
-)
-RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
-RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-# Models
-LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)
+

 TOGETHER_SUPPORTED_MODELS = [
     "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
@@ -109,12 +95,3 @@ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 ACCEPTABLE_MODELS = (
     set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 )
-
-## System settings
-MAX_WORKER_THREADS = 10
-
-# Maximum number of concurrent operations for evaluation runs
-MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
-
-# Span lifecycle management
-SPAN_LIFECYCLE_END_UPDATE_ID = 20  # Default ending number for completed spans
judgeval/data/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from judgeval.data.example import Example, ExampleParams
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan, TraceUsage
+from judgeval.data.trace import TraceUsage


 __all__ = [
@@ -11,7 +11,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Trace",
-    "TraceSpan",
     "TraceUsage",
 ]
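After this change the package no longer re-exports Trace and TraceSpan from judgeval.data; what remains importable matches the imports at the top of the file (a quick sketch, assuming judgeval 0.23.0 is installed):

from judgeval.data import (
    Example,
    ExampleParams,
    ScorerData,
    create_scorer_data,
    ScoringResult,
    generate_scoring_result,
    TraceUsage,
)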
|