judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/together_judge.py CHANGED

@@ -3,14 +3,77 @@ Implementation of using TogetherAI inference for judges.
 """
 
 from pydantic import BaseModel
-from typing import List, Union
-
+from typing import Dict, List, Union, Any, cast
 from judgeval.judges import JudgevalJudge
-from judgeval.
-
-
+from judgeval.logger import judgeval_logger
+from judgeval.env import (
+    JUDGMENT_DEFAULT_TOGETHER_MODEL,
+    TOGETHERAI_API_KEY,
+    TOGETHER_API_KEY,
 )
-
+
+together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+if together_api_key:
+    try:
+        from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+        together_client = Together(api_key=together_api_key)
+        async_together_client = AsyncTogether(api_key=together_api_key)
+    except Exception:
+        pass
+
+
+def fetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+
+
+async def afetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -18,44 +81,52 @@ BASE_CONVERSATION = [
 
 
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
-
-
+    def generate(
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
-                self.model, convo, response_format=
+                self.model, convo, response_format=response_format
             )
         elif isinstance(input, list):
-
+            messages = [dict(msg) for msg in input]
             return fetch_together_api_response(
-                self.model,
+                self.model, messages, response_format=response_format
            )
         else:
             judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     async def a_generate(
-        self,
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             res = await afetch_together_api_response(
-                self.model, convo, response_format=
+                self.model, convo, response_format=response_format
             )
             return res
         elif isinstance(input, list):
-
+            messages = [dict(msg) for msg in input]
            res = await afetch_together_api_response(
-                self.model,
+                self.model, messages, response_format=response_format
            )
             return res
         else:
-            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     def load_model(self) -> str:
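The rewritten judge pulls its API key and default model from `judgeval.env` and lets callers request structured output by passing a Pydantic model, whose `model_json_schema()` is forwarded to the Together client as `response_format`. Below is a minimal usage sketch based only on the signatures in this hunk; the `Verdict` schema and the prompt text are illustrative, and `TOGETHERAI_API_KEY` (or `TOGETHER_API_KEY`) must be set or the module-level clients are never created.

```python
# Sketch against the 0.23.0 TogetherJudge shown above; Verdict and the prompts
# are illustrative, not part of judgeval. Requires a Together API key in the env.
from pydantic import BaseModel

from judgeval.judges import TogetherJudge


class Verdict(BaseModel):
    score: float
    reason: str


judge = TogetherJudge()  # model defaults to JUDGMENT_DEFAULT_TOGETHER_MODEL

# A plain string is appended to BASE_CONVERSATION as the user turn.
plain = judge.generate("Summarize the retrieved context in one sentence.")

# Passing a schema forwards Verdict.model_json_schema() as response_format;
# the annotation expects a BaseModel instance, hence the placeholder values.
structured = judge.generate(
    [{"role": "user", "content": "Score the answer between 0 and 1."}],
    schema=Verdict(score=0.0, reason=""),
)
```

The async path mirrors this: `a_generate` builds the same conversation and awaits `afetch_together_api_response` instead.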
judgeval/judges/utils.py CHANGED

@@ -3,21 +3,21 @@ This module contains utility functions for judge models.
 """
 
 import litellm
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple
 
-from judgeval.
-from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+from judgeval.exceptions import InvalidJudgeModelError
+from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
-    ACCEPTABLE_MODELS,
 )
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
 
 def create_judge(
-    model: Optional[Union[str,
+    model: Optional[Union[str, JudgevalJudge]] = None,
 ) -> Tuple[JudgevalJudge, bool]:
     """
     Creates a judge model from string(s) or a judgeval judge object.
@@ -30,28 +30,15 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model=
+        return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(
             f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
         )
     # If model is already a valid judge type, return it and mark native
-    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge
+    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
         return model, True
 
-    # Either string or List[str]
-    if isinstance(model, list):
-        for m in model:
-            if m in JUDGMENT_SUPPORTED_MODELS:
-                raise NotImplementedError(
-                    """Judgment models are not yet supported for local scoring.
-                    Please either set the `use_judgment` flag to True or use
-                    non-Judgment models."""
-                )
-            if m not in ACCEPTABLE_MODELS:
-                raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-        return MixtureOfJudges(models=model), True
-    # If model is a string, check that it corresponds to a valid model
     if model in LITELLM_SUPPORTED_MODELS:
         return LiteLLMJudge(model=model), True
     if model in TOGETHER_SUPPORTED_MODELS:
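With the list-of-models branch (and `MixtureOfJudges`) removed, `create_judge` now resolves to one of three outcomes: the default LiteLLM judge, a judge object passed through unchanged, or a provider picked by membership in the LiteLLM/Together model lists. A short sketch of that resolution order; the model string is illustrative and assumed to appear in `litellm.model_list`.

```python
# Resolution sketch for the 0.23.0 create_judge above; "gpt-4.1" is an
# illustrative model string, assumed to be present in litellm.model_list.
from judgeval.judges import TogetherJudge
from judgeval.judges.utils import create_judge

default_judge, is_native = create_judge()            # LiteLLMJudge(JUDGMENT_DEFAULT_GPT_MODEL), True
litellm_judge, is_native = create_judge("gpt-4.1")   # matched against LITELLM_SUPPORTED_MODELS
custom_judge, is_native = create_judge(TogetherJudge())  # returned as-is, marked native
```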
judgeval/judgment_attribute_keys.py ADDED

@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from enum import Enum
+
+
+class AttributeKeys(str, Enum):
+    JUDGMENT_SPAN_KIND = "judgment.span_kind"
+    JUDGMENT_INPUT = "judgment.input"
+    JUDGMENT_OUTPUT = "judgment.output"
+    JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
+    JUDGMENT_UPDATE_ID = "judgment.update_id"
+    JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
+    JUDGMENT_AGENT_ID = "judgment.agent_id"
+    JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
+    JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
+    JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
+    JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
+    JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
+    JUDGMENT_STATE_BEFORE = "judgment.state_before"
+    JUDGMENT_STATE_AFTER = "judgment.state_after"
+    JUDGMENT_PENDING_TRACE_EVAL = "judgment.pending_trace_eval"
+    JUDGMENT_USAGE_METADATA = "judgment.usage.metadata"
+
+    JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
+    JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
+    JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
+    JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+        "judgment.usage.cache_creation_input_tokens"
+    )
+    JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
+    JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
+    JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
+
+    GEN_AI_PROMPT = "gen_ai.prompt"
+    GEN_AI_COMPLETION = "gen_ai.completion"
+    GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
+    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+    GEN_AI_SYSTEM = "gen_ai.system"
+    GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
+    GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
+    GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+        "gen_ai.usage.cache_creation_input_tokens"
+    )
+    GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
+    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+    GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
+
+
+class ResourceKeys(str, Enum):
+    SERVICE_NAME = "service.name"
+    TELEMETRY_SDK_LANGUAGE = "telemetry.sdk.language"
+    TELEMETRY_SDK_NAME = "telemetry.sdk.name"
+    TELEMETRY_SDK_VERSION = "telemetry.sdk.version"
+    JUDGMENT_PROJECT_ID = "judgment.project_id"
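If, as the file list suggests, this hunk is the new `judgeval/judgment_attribute_keys.py`, it centralizes the OpenTelemetry-style attribute names that the tracer writes onto spans and resources. The sketch below only illustrates what consuming these constants can look like on a plain OpenTelemetry span; how judgeval's own tracer applies them lives under `judgeval/tracer/` and is not shown in this hunk.

```python
# Illustration only: applies the attribute names to a vanilla OpenTelemetry span.
# The real wiring is done by judgeval's tracer/processors, which are not shown here.
from opentelemetry import trace

from judgeval.judgment_attribute_keys import AttributeKeys

tracer = trace.get_tracer("example-app")

with tracer.start_as_current_span("llm_call") as span:
    span.set_attribute(AttributeKeys.JUDGMENT_SPAN_KIND.value, "llm")
    span.set_attribute(AttributeKeys.GEN_AI_REQUEST_MODEL.value, "gpt-4.1")
    span.set_attribute(AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS.value, 128)
    span.set_attribute(AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD.value, 0.0004)
```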
judgeval/{common/logger.py → logger.py} CHANGED

@@ -1,10 +1,9 @@
-# logger.py
-
 import logging
 import sys
-import os
 
-
+from judgeval.env import JUDGMENT_NO_COLOR, JUDGMENT_LOG_LEVEL
+from judgeval.utils.decorators.use_once import use_once
+
 RESET = "\033[0m"
 RED = "\033[31m"
 YELLOW = "\033[33m"
@@ -38,10 +37,25 @@ class ColorFormatter(logging.Formatter):
         return message
 
 
+def _parse_log_level(level_str: str) -> int:
+    level_map = {
+        "debug": logging.DEBUG,
+        "info": logging.INFO,
+        "warning": logging.WARNING,
+        "warn": logging.WARNING,
+        "error": logging.ERROR,
+        "critical": logging.CRITICAL,
+    }
+    return level_map.get(level_str.lower(), logging.WARNING)
+
+
+@use_once
 def _setup_judgeval_logger():
-    use_color = sys.stdout.isatty() and
+    use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
+    log_level = _parse_log_level(JUDGMENT_LOG_LEVEL)
+
     handler = logging.StreamHandler(sys.stdout)
-    handler.setLevel(
+    handler.setLevel(log_level)
     handler.setFormatter(
         ColorFormatter(
             fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -51,10 +65,12 @@ def _setup_judgeval_logger():
     )
 
     logger = logging.getLogger("judgeval")
-    logger.setLevel(
+    logger.setLevel(log_level)
     logger.addHandler(handler)
     return logger
 
 
-# Global logger you can import elsewhere
 judgeval_logger = _setup_judgeval_logger()
+
+
+__all__ = ("judgeval_logger",)
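The logger now takes its level and color behavior from `judgeval.env` (`JUDGMENT_LOG_LEVEL`, `JUDGMENT_NO_COLOR`) instead of hard-coded values, and `@use_once` keeps setup from running twice. A configuration sketch, assuming those constants mirror environment variables of the same name (their definitions live in the new `judgeval/env.py`, which is not part of this hunk):

```python
# Assumes JUDGMENT_LOG_LEVEL / JUDGMENT_NO_COLOR in judgeval.env are read from
# environment variables of the same name; set them before the first import.
import os

os.environ["JUDGMENT_LOG_LEVEL"] = "debug"  # unrecognized values fall back to WARNING
os.environ["JUDGMENT_NO_COLOR"] = "1"       # presence disables ANSI colors

from judgeval.logger import judgeval_logger  # noqa: E402

judgeval_logger.debug("handler and level come from _setup_judgeval_logger()")
```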
judgeval/prompt/__init__.py ADDED

@@ -0,0 +1,330 @@
+from typing import List, Optional, Dict
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
+from judgeval.api.api_types import (
+    PromptCommitInfo,
+    PromptTagResponse,
+    PromptUntagResponse,
+    PromptVersionsResponse,
+)
+from dataclasses import dataclass, field
+import re
+from string import Template
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+from judgeval.utils.project import _resolve_project_id
+
+
+def push_prompt(
+    project_name: str,
+    name: str,
+    prompt: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> tuple[str, Optional[str], str]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        r = client.prompts_insert(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "prompt": prompt,
+                "tags": tags,
+            }
+        )
+        return r["commit_id"], r.get("parent_commit_id"), r["created_at"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt: {e.detail}",
+            response=e.response,
+        )
+
+
+def fetch_prompt(
+    project_name: str,
+    name: str,
+    commit_id: Optional[str] = None,
+    tag: Optional[str] = None,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> Optional[PromptCommitInfo]:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_fetch(
+            name=name,
+            project_id=project_id,
+            commit_id=commit_id,
+            tag=tag,
+        )
+        return prompt_config["commit"]
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def tag_prompt(
+    project_name: str,
+    name: str,
+    commit_id: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptTagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_tag(
+            payload={
+                "project_id": project_id,
+                "name": name,
+                "commit_id": commit_id,
+                "tags": tags,
+            }
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to tag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def untag_prompt(
+    project_name: str,
+    name: str,
+    tags: List[str],
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptUntagResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_untag(
+            payload={"project_id": project_id, "name": name, "tags": tags}
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to untag prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+def list_prompt(
+    project_name: str,
+    name: str,
+    judgment_api_key: str | None = JUDGMENT_API_KEY,
+    organization_id: str | None = JUDGMENT_ORG_ID,
+) -> PromptVersionsResponse:
+    if not judgment_api_key or not organization_id:
+        raise ValueError("Judgment API key and organization ID are required")
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
+    try:
+        project_id = _resolve_project_id(
+            project_name, judgment_api_key, organization_id
+        )
+        if not project_id:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Project '{project_name}' not found",
+                response=None,  # type: ignore
+            )
+        prompt_config = client.prompts_get_prompt_versions(
+            project_id=project_id, name=name
+        )
+        return prompt_config
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to list prompt '{name}': {e.detail}",
+            response=e.response,
+        )
+
+
+@dataclass
+class Prompt:
+    name: str
+    prompt: str
+    created_at: str
+    tags: List[str]
+    commit_id: str
+    parent_commit_id: Optional[str] = None
+    metadata: Dict[str, str] = field(default_factory=dict)
+    _template: Template = field(init=False, repr=False)
+
+    def __post_init__(self):
+        template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+        self._template = Template(template_str)
+
+    @classmethod
+    def create(
+        cls,
+        project_name: str,
+        name: str,
+        prompt: str,
+        tags: Optional[List[str]] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if tags is None:
+            tags = []
+        commit_id, parent_commit_id, created_at = push_prompt(
+            project_name, name, prompt, tags, judgment_api_key, organization_id
+        )
+        return cls(
+            name=name,
+            prompt=prompt,
+            created_at=created_at,
+            tags=tags,
+            commit_id=commit_id,
+            parent_commit_id=parent_commit_id,
+        )
+
+    @classmethod
+    def get(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: Optional[str] = None,
+        tag: Optional[str] = None,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        if commit_id is not None and tag is not None:
+            raise ValueError(
+                "You cannot fetch a prompt by both commit_id and tag at the same time"
+            )
+        prompt_config = fetch_prompt(
+            project_name, name, commit_id, tag, judgment_api_key, organization_id
+        )
+        if prompt_config is None:
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt '{name}' not found in project '{project_name}'",
+                response=None,  # type: ignore
+            )
+        return cls(
+            name=prompt_config["name"],
+            prompt=prompt_config["prompt"],
+            created_at=prompt_config["created_at"],
+            tags=prompt_config["tags"],
+            commit_id=prompt_config["commit_id"],
+            parent_commit_id=prompt_config.get("parent_commit_id"),
+            metadata={
+                "creator_first_name": prompt_config["first_name"],
+                "creator_last_name": prompt_config["last_name"],
+                "creator_email": prompt_config["user_email"],
+            },
+        )
+
+    @classmethod
+    def tag(
+        cls,
+        project_name: str,
+        name: str,
+        commit_id: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = tag_prompt(
+            project_name, name, commit_id, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_id"]
+
+    @classmethod
+    def untag(
+        cls,
+        project_name: str,
+        name: str,
+        tags: List[str],
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_config = untag_prompt(
+            project_name, name, tags, judgment_api_key, organization_id
+        )
+        return prompt_config["commit_ids"]
+
+    @classmethod
+    def list(
+        cls,
+        project_name: str,
+        name: str,
+        judgment_api_key: str | None = JUDGMENT_API_KEY,
+        organization_id: str | None = JUDGMENT_ORG_ID,
+    ):
+        prompt_configs = list_prompt(
+            project_name, name, judgment_api_key, organization_id
+        )["versions"]
+        return [
+            cls(
+                name=prompt_config["name"],
+                prompt=prompt_config["prompt"],
+                tags=prompt_config["tags"],
+                created_at=prompt_config["created_at"],
+                commit_id=prompt_config["commit_id"],
+                parent_commit_id=prompt_config.get("parent_commit_id"),
+                metadata={
+                    "creator_first_name": prompt_config["first_name"],
+                    "creator_last_name": prompt_config["last_name"],
+                    "creator_email": prompt_config["user_email"],
+                },
+            )
+            for prompt_config in prompt_configs
+        ]
+
+    def compile(self, **kwargs) -> str:
+        try:
+            return self._template.substitute(**kwargs)
+        except KeyError as e:
+            missing_var = str(e).strip("'")
+            raise ValueError(f"Missing required variable: {missing_var}")
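Assuming this hunk is the new `judgeval/prompt/__init__.py` from the file list, it adds versioned prompt management: `Prompt.create` pushes a commit, `Prompt.get` fetches by commit or tag, and `compile` fills `{{variable}}` placeholders via `string.Template`. A usage sketch with illustrative project, prompt, and variable names; `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` must be configured.

```python
# Illustrative names throughout; requires JUDGMENT_API_KEY / JUDGMENT_ORG_ID.
from judgeval.prompt import Prompt

# Push a new version; {{ticket}} is rewritten to a $ticket Template placeholder.
created = Prompt.create(
    project_name="support-bot",
    name="triage",
    prompt="Classify the following ticket: {{ticket}}",
    tags=["staging"],
)

# Fetch by tag (commit_id and tag are mutually exclusive) and render it.
staged = Prompt.get(project_name="support-bot", name="triage", tag="staging")
text = staged.compile(ticket="The invoice total looks wrong.")

# A placeholder left unfilled raises ValueError("Missing required variable: ...").
```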
judgeval/scorers/__init__.py CHANGED

@@ -1,29 +1,29 @@
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
 from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ExecutionOrderScorer,
-    HallucinationScorer,
     FaithfulnessScorer,
     AnswerRelevancyScorer,
     AnswerCorrectnessScorer,
     InstructionAdherenceScorer,
-
-    ToolOrderScorer,
+    TracePromptScorer,
     PromptScorer,
-    ToolDependencyScorer,
 )
 
 __all__ = [
     "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
     "BaseScorer",
+    "ExampleScorer",
+    "TracePromptScorer",
     "PromptScorer",
-    "ExecutionOrderScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
     "InstructionAdherenceScorer",
-    "DerailmentScorer",
-    "ToolOrderScorer",
-    "ToolDependencyScorer",
 ]
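The net effect for callers: `ExampleScorer`, `TracePromptScorer`, and the split `ExampleAPIScorerConfig`/`TraceAPIScorerConfig` configs are now importable from `judgeval.scorers`, while the execution-order, hallucination, derailment, and tool scorers are no longer exported. A quick sketch of the new import surface; constructor arguments are not shown in this diff, so only imports are illustrated.

```python
# Imports valid against the 0.23.0 export list above.
from judgeval.scorers import (
    AnswerCorrectnessScorer,
    ExampleScorer,
    FaithfulnessScorer,
    TracePromptScorer,
)

# No longer exported in 0.23.0; these imports now fail:
# from judgeval.scorers import HallucinationScorer, ToolOrderScorer
```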