judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0

judgeval/integrations/openlit/__init__.py ADDED
@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
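
For context, a minimal usage sketch of the new OpenLIT integration. It assumes a Tracer has already been created and initialized with Judgment credentials; the constructor keyword shown here is illustrative and not confirmed by this diff.

from judgeval.tracer import Tracer
from judgeval.integrations.openlit import Openlit

# The tracer must exist first: Openlit.initialize() reads api_key, organization_id,
# and project_name from the singleton returned by Tracer.get_instance().
tracer = Tracer(project_name="my-project")  # hypothetical constructor arguments

# Resolves the project ID and points openlit.init() at the Judgment OTLP endpoint.
Openlit.initialize()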

judgeval/judges/__init__.py CHANGED
@@ -1,7 +1,6 @@
-from pydantic import BaseModel
 from judgeval.judges.base_judge import JudgevalJudge
 from judgeval.judges.litellm_judge import LiteLLMJudge
 from judgeval.judges.together_judge import TogetherJudge
-from judgeval.judges.mixture_of_judges import MixtureOfJudges
 
-__all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
+
+__all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge"]

judgeval/judges/base_judge.py CHANGED
@@ -3,7 +3,7 @@ Implements the base class for all Judgeval Judge models.
 """
 
 from abc import ABC, abstractmethod
-from typing import Optional, List
+from typing import Optional
 
 
 class JudgevalJudge(ABC):
@@ -37,8 +37,7 @@ class JudgevalJudge(ABC):
         A string.
         """
         pass
-
+
     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str:
         pass
-
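
For orientation, a minimal sketch of a custom judge built on the abstract base. It assumes the same abstract surface that LiteLLMJudge and TogetherJudge implement below (generate, a_generate, load_model, get_model_name); the class itself is illustrative.

from judgeval.judges import JudgevalJudge


class EchoJudge(JudgevalJudge):
    """Illustrative judge that simply echoes its input back."""

    def __init__(self, model: str = "echo"):
        self.model = model
        super().__init__(model_name=model)

    def load_model(self):
        return self.model

    def generate(self, input, schema=None) -> str:
        return input if isinstance(input, str) else str(input)

    async def a_generate(self, input, schema=None) -> str:
        return self.generate(input, schema)

    def get_model_name(self) -> str:
        return self.model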

judgeval/judges/litellm_judge.py CHANGED
@@ -1,47 +1,127 @@
 import pydantic
-from typing import List, Union, Mapping
+from typing import Dict, List, Union, Mapping, Any
 
-from judgeval import *
+from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.judges import JudgevalJudge
-from judgeval.common.utils import afetch_litellm_api_response, fetch_litellm_api_response
-from judgeval.common.logger import debug, error
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
+try:
+    import litellm
+except ImportError:
+    raise ImportError(
+        "Litellm is not installed and required for the litellm judge. Please install it with `pip install litellm`."
+    )
+
+
+def fetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if response_format is not None:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+
+
+async def afetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(
+            f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+        )
+
+    if response_format is not None:
+        response = await litellm.acompletion(
+            model=model, messages=messages, response_format=response_format
+        )
+    else:
+        response = await litellm.acompletion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
-]  # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
+]
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str = "gpt-4o-mini", **kwargs):
-        debug(f"Initializing LiteLLMJudge with model={model}")
+    def __init__(self, model: str = JUDGMENT_DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
-    def generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
-        debug(f"Generating response for input type: {type(input)}")
+    def generate(
+        self,
+        input: Union[str, List[Mapping[str, str]]],
+        schema: Union[pydantic.BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            return fetch_litellm_api_response(model=self.model, messages=convo, response_format=schema)
+            return fetch_litellm_api_response(
+                model=self.model, messages=convo, response_format=response_format
+            )
         elif isinstance(input, list):
-            return fetch_litellm_api_response(model=self.model, messages=input, response_format=schema)
+            messages = [dict(msg) for msg in input]
+            return fetch_litellm_api_response(
+                model=self.model, messages=messages, response_format=response_format
+            )
         else:
-            error(f"Invalid input type received: {type(input)}")
-            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
+
+    async def a_generate(
+        self,
+        input: Union[str, List[Mapping[str, str]]],
+        schema: Union[pydantic.BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
 
-    async def a_generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
-        debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            response = await afetch_litellm_api_response(model=self.model, messages=convo, response_format=schema)
+            response = await afetch_litellm_api_response(
+                model=self.model, messages=convo, response_format=response_format
+            )
             return response
         elif isinstance(input, list):
-            response = await afetch_litellm_api_response(model=self.model, messages=input, response_format=schema)
+            messages = [dict(msg) for msg in input]
+            response = await afetch_litellm_api_response(
+                model=self.model, messages=messages, response_format=response_format
+            )
             return response
         else:
-            error(f"Invalid input type received: {type(input)}")
-            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
-
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
+
     def load_model(self):
         return self.model
 
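
A short usage sketch of the reworked LiteLLM judge, assuming litellm is installed and the model name appears in litellm's model list; parsing the returned string back into the schema is left to the caller.

from pydantic import BaseModel
from judgeval.judges.litellm_judge import LiteLLMJudge


class Verdict(BaseModel):
    score: float
    reason: str


judge = LiteLLMJudge(model="gpt-4o-mini")  # example model name

# A plain string is wrapped in BASE_CONVERSATION; the pydantic schema is converted
# to a JSON schema and forwarded to litellm as response_format.
raw = judge.generate("Rate this answer from 0 to 1 and explain why: ...", schema=Verdict)
print(raw)  # raw model output; validate with Verdict.model_validate_json(raw) if desired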

judgeval/judges/together_judge.py CHANGED
@@ -3,48 +3,130 @@ Implementation of using TogetherAI inference for judges.
 """
 
 from pydantic import BaseModel
-from typing import List, Union, Mapping
-from judgeval.common.logger import debug, error
-
+from typing import Dict, List, Union, Any, cast
 from judgeval.judges import JudgevalJudge
-from judgeval.common.utils import fetch_together_api_response, afetch_together_api_response
+from judgeval.logger import judgeval_logger
+from judgeval.env import (
+    JUDGMENT_DEFAULT_TOGETHER_MODEL,
+    TOGETHERAI_API_KEY,
+    TOGETHER_API_KEY,
+)
+
+together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+if together_api_key:
+    try:
+        from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+        together_client = Together(api_key=together_api_key)
+        async_together_client = AsyncTogether(api_key=together_api_key)
+    except Exception:
+        pass
+
+
+def fetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+
+
+async def afetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
 ]
 
+
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str = "QWEN", **kwargs):
-        debug(f"Initializing TogetherJudge with model={model}")
+    def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
-    # TODO: Fix cost for generate and a_generate
-    def generate(self, input: Union[str, List[Mapping[str, str]]], schema: BaseModel = None) -> str:
-        debug(f"Generating response for input type: {type(input)}")
+    def generate(
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            return fetch_together_api_response(self.model, convo, response_format=schema)
+            return fetch_together_api_response(
+                self.model, convo, response_format=response_format
+            )
         elif isinstance(input, list):
-            convo = input
-            return fetch_together_api_response(self.model, convo, response_format=schema)
+            messages = [dict(msg) for msg in input]
+            return fetch_together_api_response(
+                self.model, messages, response_format=response_format
+            )
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
-    async def a_generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
-        debug(f"Async generating response for input type: {type(input)}")
+    async def a_generate(
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
+    ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            res = await afetch_together_api_response(self.model, convo, response_format=schema)
+            res = await afetch_together_api_response(
+                self.model, convo, response_format=response_format
+            )
             return res
         elif isinstance(input, list):
-            convo = input
-            res = await afetch_together_api_response(self.model, convo, response_format=schema)
+            messages = [dict(msg) for msg in input]
+            res = await afetch_together_api_response(
+                self.model, messages, response_format=response_format
+            )
             return res
         else:
-            error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     def load_model(self) -> str:
@@ -52,4 +134,3 @@ class TogetherJudge(JudgevalJudge):
 
     def get_model_name(self) -> str:
         return self.model
-
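
A comparable sketch for the TogetherAI judge. It assumes TOGETHERAI_API_KEY (or TOGETHER_API_KEY) is set in the environment, since the module-level Together clients are only created when a key is present.

from judgeval.judges.together_judge import TogetherJudge

judge = TogetherJudge()  # defaults to JUDGMENT_DEFAULT_TOGETHER_MODEL

# Chat-style input is normalized to a list of plain dicts before the API call;
# a bare string would instead be appended to BASE_CONVERSATION.
answer = judge.generate([
    {"role": "system", "content": "You are a concise reviewer."},
    {"role": "user", "content": "Summarize the judge refactor in one sentence."},
])
print(answer)
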
judgeval/judges/utils.py CHANGED
@@ -1,48 +1,44 @@
 """
 This module contains utility functions for judge models.
 """
+
 import litellm
-from typing import Optional, Union, Tuple, List
+from typing import Optional, Union, Tuple
 
-from judgeval.common.exceptions import InvalidJudgeModelError
-from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
-from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
+from judgeval.exceptions import InvalidJudgeModelError
+from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from judgeval.constants import (
+    TOGETHER_SUPPORTED_MODELS,
+    JUDGMENT_SUPPORTED_MODELS,
+)
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
+
 def create_judge(
-    model: Optional[Union[str, List[str], JudgevalJudge]] = None) -> Tuple[JudgevalJudge, bool]:
+    model: Optional[Union[str, JudgevalJudge]] = None,
+) -> Tuple[JudgevalJudge, bool]:
     """
     Creates a judge model from string(s) or a judgeval judge object.
 
     If `model` is a single string, it is assumed to be a judge model name.
     If `model` is a list of strings, it is assumed to be a list of judge model names (for MixtureOfJudges).
-    If `model` is a judgeval judge object, it is returned as is.
+    If `model` is a judgeval judge object, it is returned as is.
 
     Returns a tuple of (initialized judgevalBaseLLM, using_native_model boolean)
    If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-4o"), True
+        return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
-        raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
+        raise InvalidJudgeModelError(
+            f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
+        )
     # If model is already a valid judge type, return it and mark native
-    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
-        return model, True
-
-    # Either string or List[str]
-    if isinstance(model, list):
-        for m in model:
-            if m in JUDGMENT_SUPPORTED_MODELS:
-                raise NotImplementedError(
-                    """Judgment models are not yet supported for local scoring.
-                    Please either set the `use_judgment` flag to True or use
-                    non-Judgment models."""
-                )
-            if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
-                raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-        return MixtureOfJudges(models=model), True
-    # If model is a string, check that it corresponds to a valid model
+    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
+        return model, True
+
     if model in LITELLM_SUPPORTED_MODELS:
         return LiteLLMJudge(model=model), True
     if model in TOGETHER_SUPPORTED_MODELS:
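
With the MixtureOfJudges path removed, the factory now resolves a string, a judge instance, or nothing at all; a small sketch (model names are examples, matched against litellm's and Together's supported lists at runtime).

from judgeval.judges.utils import create_judge
from judgeval.judges import LiteLLMJudge

# No argument: a LiteLLMJudge backed by JUDGMENT_DEFAULT_GPT_MODEL.
default_judge, is_native = create_judge()

# A string is checked against LITELLM_SUPPORTED_MODELS, then TOGETHER_SUPPORTED_MODELS.
gpt_judge, _ = create_judge("gpt-4o-mini")

# An existing judge instance is returned unchanged and marked native.
same_judge, _ = create_judge(LiteLLMJudge(model="gpt-4o-mini"))
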
judgeval/logger.py ADDED
@@ -0,0 +1,62 @@
+import logging
+import sys
+
+from judgeval.env import JUDGMENT_NO_COLOR
+from judgeval.utils.decorators.use_once import use_once
+
+RESET = "\033[0m"
+RED = "\033[31m"
+YELLOW = "\033[33m"
+BLUE = "\033[34m"
+GRAY = "\033[90m"
+
+
+class ColorFormatter(logging.Formatter):
+    """
+    Wrap the final formatted log record in ANSI color codes based on level.
+    """
+
+    COLORS = {
+        logging.DEBUG: GRAY,
+        logging.INFO: GRAY,
+        logging.WARNING: YELLOW,
+        logging.ERROR: RED,
+        logging.CRITICAL: RED,
+    }
+
+    def __init__(self, fmt=None, datefmt=None, use_color=True):
+        super().__init__(fmt=fmt, datefmt=datefmt)
+        self.use_color = use_color and sys.stdout.isatty()
+
+    def format(self, record):
+        message = super().format(record)
+        if self.use_color:
+            color = self.COLORS.get(record.levelno, "")
+            if color:
+                message = f"{color}{message}{RESET}"
+        return message
+
+
+@use_once
+def _setup_judgeval_logger():
+    use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setLevel(logging.DEBUG)
+    handler.setFormatter(
+        ColorFormatter(
+            fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            use_color=use_color,
+        )
+    )
+
+    logger = logging.getLogger("judgeval")
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(handler)
+    return logger
+
+
+judgeval_logger = _setup_judgeval_logger()
+
+
+__all__ = ("judgeval_logger",)
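
The module-level logger can be used directly; a brief usage sketch (JUDGMENT_NO_COLOR comes from judgeval.env and, when set, is expected to disable the ANSI colors).

from judgeval.logger import judgeval_logger

# A single colorized stdout handler is attached exactly once (guarded by @use_once);
# DEBUG/INFO render gray, WARNING yellow, ERROR/CRITICAL red when stdout is a TTY.
judgeval_logger.info("starting evaluation run")
judgeval_logger.warning("project not found, falling back to default")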