judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/integrations/openlit/__init__.py
ADDED

@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
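For orientation, the new integration expects the judgeval Tracer to exist before Openlit is initialized, since initialize() reads the API key, organization, and project from the live Tracer instance. A minimal usage sketch follows; the Tracer constructor arguments are assumptions for illustration and do not appear in this diff.

# Hedged sketch: construct the judgeval Tracer first, then let openlit export
# through it. Only Tracer.get_instance() and Openlit.initialize(**kwargs) are
# confirmed by the diff; the Tracer(...) keyword arguments are assumptions.
from judgeval.tracer import Tracer
from judgeval.integrations.openlit import Openlit

tracer = Tracer(project_name="my-project")  # hypothetical constructor kwargs
Openlit.initialize()  # extra kwargs are forwarded to openlit.init()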
judgeval/judges/__init__.py
CHANGED
@@ -1,7 +1,6 @@
-from pydantic import BaseModel
 from judgeval.judges.base_judge import JudgevalJudge
 from judgeval.judges.litellm_judge import LiteLLMJudge
 from judgeval.judges.together_judge import TogetherJudge
-from judgeval.judges.mixture_of_judges import MixtureOfJudges

-
+
+__all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge"]
judgeval/judges/base_judge.py
CHANGED
@@ -3,7 +3,7 @@ Implements the base class for all Judgeval Judge models.
 """

 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional


 class JudgevalJudge(ABC):
@@ -37,8 +37,7 @@ class JudgevalJudge(ABC):
         A string.
         """
         pass
-
+
     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str:
         pass
-
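The base-class change here is whitespace only, but as orientation for the judge rewrites below: concrete judges subclass JudgevalJudge, call super().__init__(model_name=...), and implement generate, a_generate, load_model, and get_model_name. The sketch below is inferred from the LiteLLMJudge and TogetherJudge implementations in this diff, not from documented guidance.

# Sketch of a custom judge, mirroring the surface the bundled judges implement.
# The exact abstract-method set of JudgevalJudge is an assumption from context.
from judgeval.judges import JudgevalJudge


class EchoJudge(JudgevalJudge):
    def __init__(self, model: str = "echo"):
        self.model = model
        super().__init__(model_name=model)

    def generate(self, input, schema=None) -> str:
        return input if isinstance(input, str) else str(input)

    async def a_generate(self, input, schema=None) -> str:
        return self.generate(input, schema)

    def load_model(self):
        return self.model

    def get_model_name(self) -> str:
        return self.model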
judgeval/judges/litellm_judge.py
CHANGED
@@ -1,47 +1,127 @@
 import pydantic
-from typing import List, Union, Mapping
+from typing import Dict, List, Union, Mapping, Any

-from judgeval import
+from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.judges import JudgevalJudge
-from judgeval.
-
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
+try:
+    import litellm
+except ImportError:
+    raise ImportError(
+        "Litellm is not installed and required for the litellm judge. Please install it with `pip install litellm`."
+    )
+
+
+def fetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if response_format is not None:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+
+
+async def afetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(
+            f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+        )
+
+    if response_format is not None:
+        response = await litellm.acompletion(
+            model=model, messages=messages, response_format=response_format
+        )
+    else:
+        response = await litellm.acompletion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
-]
+]


 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str =
-        debug(f"Initializing LiteLLMJudge with model={model}")
+    def __init__(self, model: str = JUDGMENT_DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)

-    def generate(
-
+    def generate(
+        self,
+        input: Union[str, List[Mapping[str, str]]],
+        schema: Union[pydantic.BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            return fetch_litellm_api_response(
+            return fetch_litellm_api_response(
+                model=self.model, messages=convo, response_format=response_format
+            )
         elif isinstance(input, list):
-
+            messages = [dict(msg) for msg in input]
+            return fetch_litellm_api_response(
+                model=self.model, messages=messages, response_format=response_format
+            )
         else:
-
-
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
+
+    async def a_generate(
+        self,
+        input: Union[str, List[Mapping[str, str]]],
+        schema: Union[pydantic.BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None

-    async def a_generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
-        debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            response = await afetch_litellm_api_response(
+            response = await afetch_litellm_api_response(
+                model=self.model, messages=convo, response_format=response_format
+            )
             return response
         elif isinstance(input, list):
-
+            messages = [dict(msg) for msg in input]
+            response = await afetch_litellm_api_response(
+                model=self.model, messages=messages, response_format=response_format
+            )
             return response
         else:
-
-
-
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
+
     def load_model(self):
         return self.model

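A short usage sketch for the rewritten judge: schema is an optional pydantic model whose JSON schema is passed to litellm as response_format. The model name and Verdict schema below are illustrative assumptions.

# Hedged usage sketch for LiteLLMJudge; "gpt-4o" and Verdict are illustrative.
import pydantic
from judgeval.judges import LiteLLMJudge


class Verdict(pydantic.BaseModel):
    score: float
    reason: str


judge = LiteLLMJudge(model="gpt-4o")
raw = judge.generate(
    "Rate the answer's faithfulness to the context from 0 to 1 and explain.",
    schema=Verdict,
)
print(Verdict.model_validate_json(raw))  # assumes the provider returned valid JSON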
judgeval/judges/together_judge.py
CHANGED

@@ -3,48 +3,130 @@ Implementation of using TogetherAI inference for judges.
 """

 from pydantic import BaseModel
-from typing import List, Union,
-from judgeval.common.logger import debug, error
-
+from typing import Dict, List, Union, Any, cast
 from judgeval.judges import JudgevalJudge
-from judgeval.
+from judgeval.logger import judgeval_logger
+from judgeval.env import (
+    JUDGMENT_DEFAULT_TOGETHER_MODEL,
+    TOGETHERAI_API_KEY,
+    TOGETHER_API_KEY,
+)
+
+together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+if together_api_key:
+    try:
+        from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+        together_client = Together(api_key=together_api_key)
+        async_together_client = AsyncTogether(api_key=together_api_key)
+    except Exception:
+        pass
+
+
+def fetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+
+
+async def afetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
 ]

+
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str =
-        debug(f"Initializing TogetherJudge with model={model}")
+    def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)

-
-
-
+    def generate(
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            return fetch_together_api_response(
+            return fetch_together_api_response(
+                self.model, convo, response_format=response_format
+            )
         elif isinstance(input, list):
-
-            return fetch_together_api_response(
+            messages = [dict(msg) for msg in input]
+            return fetch_together_api_response(
+                self.model, messages, response_format=response_format
+            )
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")

-    async def a_generate(
-
+    async def a_generate(
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            res = await afetch_together_api_response(
+            res = await afetch_together_api_response(
+                self.model, convo, response_format=response_format
+            )
             return res
         elif isinstance(input, list):
-
-            res = await afetch_together_api_response(
+            messages = [dict(msg) for msg in input]
+            res = await afetch_together_api_response(
+                self.model, messages, response_format=response_format
+            )
             return res
         else:
-            error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")

     def load_model(self) -> str:
@@ -52,4 +134,3 @@ class TogetherJudge(JudgevalJudge):

     def get_model_name(self) -> str:
         return self.model
-
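The Together-backed judge now creates its clients at import time only when TOGETHERAI_API_KEY or TOGETHER_API_KEY is set, and routes errors through judgeval_logger. A hedged async usage sketch follows; the model identifier is an assumption.

# Hedged sketch: async scoring via TogetherJudge. Requires TOGETHERAI_API_KEY
# or TOGETHER_API_KEY in the environment so the module-level clients exist;
# the model string below is an assumption, not taken from this diff.
import asyncio
from judgeval.judges import TogetherJudge


async def main() -> None:
    judge = TogetherJudge(model="meta-llama/Llama-3.3-70B-Instruct-Turbo")
    print(await judge.a_generate("In one sentence, what is retrieval-augmented generation?"))


asyncio.run(main())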
judgeval/judges/utils.py
CHANGED
@@ -1,48 +1,44 @@
 """
 This module contains utility functions for judge models.
 """
+
 import litellm
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple

-from judgeval.
-from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
-from judgeval.
+from judgeval.exceptions import InvalidJudgeModelError
+from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from judgeval.constants import (
+    TOGETHER_SUPPORTED_MODELS,
+    JUDGMENT_SUPPORTED_MODELS,
+)

 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)

+
 def create_judge(
-    model: Optional[Union[str,
+    model: Optional[Union[str, JudgevalJudge]] = None,
+) -> Tuple[JudgevalJudge, bool]:
     """
     Creates a judge model from string(s) or a judgeval judge object.

     If `model` is a single string, it is assumed to be a judge model name.
     If `model` is a list of strings, it is assumed to be a list of judge model names (for MixtureOfJudges).
-    If `model` is a judgeval judge object, it is returned as is.
+    If `model` is a judgeval judge object, it is returned as is.

     Returns a tuple of (initialized judgevalBaseLLM, using_native_model boolean)
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model=
+        return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
-        raise InvalidJudgeModelError(
+        raise InvalidJudgeModelError(
+            f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
+        )
     # If model is already a valid judge type, return it and mark native
-    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge
-        return model, True
-
-    # Either string or List[str]
-    if isinstance(model, list):
-        for m in model:
-            if m in JUDGMENT_SUPPORTED_MODELS:
-                raise NotImplementedError(
-                    """Judgment models are not yet supported for local scoring.
-                    Please either set the `use_judgment` flag to True or use
-                    non-Judgment models."""
-                )
-            if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
-                raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-        return MixtureOfJudges(models=model), True
-    # If model is a string, check that it corresponds to a valid model
+    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
+        return model, True
+
     if model in LITELLM_SUPPORTED_MODELS:
         return LiteLLMJudge(model=model), True
     if model in TOGETHER_SUPPORTED_MODELS:
judgeval/logger.py
ADDED
@@ -0,0 +1,62 @@
+import logging
+import sys
+
+from judgeval.env import JUDGMENT_NO_COLOR
+from judgeval.utils.decorators.use_once import use_once
+
+RESET = "\033[0m"
+RED = "\033[31m"
+YELLOW = "\033[33m"
+BLUE = "\033[34m"
+GRAY = "\033[90m"
+
+
+class ColorFormatter(logging.Formatter):
+    """
+    Wrap the final formatted log record in ANSI color codes based on level.
+    """
+
+    COLORS = {
+        logging.DEBUG: GRAY,
+        logging.INFO: GRAY,
+        logging.WARNING: YELLOW,
+        logging.ERROR: RED,
+        logging.CRITICAL: RED,
+    }
+
+    def __init__(self, fmt=None, datefmt=None, use_color=True):
+        super().__init__(fmt=fmt, datefmt=datefmt)
+        self.use_color = use_color and sys.stdout.isatty()
+
+    def format(self, record):
+        message = super().format(record)
+        if self.use_color:
+            color = self.COLORS.get(record.levelno, "")
+            if color:
+                message = f"{color}{message}{RESET}"
+        return message
+
+
+@use_once
+def _setup_judgeval_logger():
+    use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setLevel(logging.DEBUG)
+    handler.setFormatter(
+        ColorFormatter(
+            fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            use_color=use_color,
+        )
+    )
+
+    logger = logging.getLogger("judgeval")
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(handler)
+    return logger
+
+
+judgeval_logger = _setup_judgeval_logger()
+
+
+__all__ = ("judgeval_logger",)