judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/together_judge.py CHANGED
@@ -3,14 +3,77 @@ Implementation of using TogetherAI inference for judges.
  """

  from pydantic import BaseModel
- from typing import List, Union
-
+ from typing import Dict, List, Union, Any, cast
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     fetch_together_api_response,
-     afetch_together_api_response,
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import (
+     JUDGMENT_DEFAULT_TOGETHER_MODEL,
+     TOGETHERAI_API_KEY,
+     TOGETHER_API_KEY,
  )
- from judgeval.common.logger import judgeval_logger
+
+ together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+ if together_api_key:
+     try:
+         from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+         together_client = Together(api_key=together_api_key)
+         async_together_client = AsyncTogether(api_key=together_api_key)
+     except Exception:
+         pass
+
+
+ def fetch_together_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if response_format is not None:
+         response = together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from TogetherAI")
+     return cast(str, content)
+
+
+ async def afetch_together_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if response_format is not None:
+         response = await async_together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = await async_together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from TogetherAI")
+     return cast(str, content)
+

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
@@ -18,44 +81,52 @@ BASE_CONVERSATION = [


  class TogetherJudge(JudgevalJudge):
-     def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
+     def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)

-     # TODO: Fix cost for generate and a_generate
-     def generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
+     def generate(
+         self,
+         input: Union[str, List[Dict[str, str]]],
+         schema: Union[BaseModel, None] = None,
+     ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              return fetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, convo, response_format=response_format
              )
          elif isinstance(input, list):
-             convo = input
+             messages = [dict(msg) for msg in input]
              return fetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, messages, response_format=response_format
              )
          else:
              judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError("Input must be a string or a list of dictionaries.")

      async def a_generate(
-         self, input: Union[str, List[dict]], schema: BaseModel = None
+         self,
+         input: Union[str, List[Dict[str, str]]],
+         schema: Union[BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              res = await afetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, convo, response_format=response_format
              )
              return res
          elif isinstance(input, list):
-             convo = input
+             messages = [dict(msg) for msg in input]
              res = await afetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, messages, response_format=response_format
              )
              return res
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError("Input must be a string or a list of dictionaries.")

      def load_model(self) -> str:
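For orientation, here is a minimal usage sketch of the reworked TogetherJudge. It assumes TOGETHERAI_API_KEY or TOGETHER_API_KEY is set so the module-level Together clients get created; AnswerSchema is a hypothetical model used only for illustration.

from pydantic import BaseModel

from judgeval.judges.together_judge import TogetherJudge


class AnswerSchema(BaseModel):
    # Hypothetical schema; generate() converts it via model_json_schema()
    # into the response_format passed to TogetherAI.
    verdict: str
    reason: str


judge = TogetherJudge()  # defaults to JUDGMENT_DEFAULT_TOGETHER_MODEL
# A plain string is wrapped into BASE_CONVERSATION as a user turn.
text = judge.generate("Is the sky blue? Answer briefly.")
# Passing a pydantic model requests schema-constrained output.
structured = judge.generate("Is the sky blue?", schema=AnswerSchema)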
judgeval/judges/utils.py CHANGED
@@ -3,21 +3,21 @@ This module contains utility functions for judge models.
  """

  import litellm
- from typing import Optional, Union, Tuple, List
+ from typing import Optional, Union, Tuple

- from judgeval.common.exceptions import InvalidJudgeModelError
- from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+ from judgeval.exceptions import InvalidJudgeModelError
+ from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
  from judgeval.constants import (
      TOGETHER_SUPPORTED_MODELS,
      JUDGMENT_SUPPORTED_MODELS,
-     ACCEPTABLE_MODELS,
  )

  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)


  def create_judge(
-     model: Optional[Union[str, List[str], JudgevalJudge]] = None,
+     model: Optional[Union[str, JudgevalJudge]] = None,
  ) -> Tuple[JudgevalJudge, bool]:
      """
      Creates a judge model from string(s) or a judgeval judge object.
@@ -30,28 +30,15 @@ def create_judge(
      If no model is provided, uses GPT4o as the default judge.
      """
      if model is None:  # default option
-         return LiteLLMJudge(model="gpt-4.1"), True
+         return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
      if not isinstance(model, (str, list, JudgevalJudge)):
          raise InvalidJudgeModelError(
              f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
          )
      # If model is already a valid judge type, return it and mark native
-     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
+     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
          return model, True

-     # Either string or List[str]
-     if isinstance(model, list):
-         for m in model:
-             if m in JUDGMENT_SUPPORTED_MODELS:
-                 raise NotImplementedError(
-                     """Judgment models are not yet supported for local scoring.
-                     Please either set the `use_judgment` flag to True or use
-                     non-Judgment models."""
-                 )
-             if m not in ACCEPTABLE_MODELS:
-                 raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-         return MixtureOfJudges(models=model), True
-     # If model is a string, check that it corresponds to a valid model
      if model in LITELLM_SUPPORTED_MODELS:
          return LiteLLMJudge(model=model), True
      if model in TOGETHER_SUPPORTED_MODELS:
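As a quick illustration of the narrowed create_judge signature, a short sketch (the model name is an example only and is routed by membership in the supported-model sets):

from judgeval.judges.utils import create_judge

# No model: falls back to LiteLLMJudge with JUDGMENT_DEFAULT_GPT_MODEL.
judge, is_native = create_judge()

# A string is matched against LITELLM_SUPPORTED_MODELS / TOGETHER_SUPPORTED_MODELS;
# lists of models (the old MixtureOfJudges path) are no longer accepted.
litellm_judge, _ = create_judge("gpt-4o-mini")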
judgeval/judgment_attribute_keys.py ADDED
@@ -0,0 +1,55 @@
+ from __future__ import annotations
+
+ from enum import Enum
+
+
+ class AttributeKeys(str, Enum):
+     JUDGMENT_SPAN_KIND = "judgment.span_kind"
+     JUDGMENT_INPUT = "judgment.input"
+     JUDGMENT_OUTPUT = "judgment.output"
+     JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
+     JUDGMENT_UPDATE_ID = "judgment.update_id"
+     JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
+     JUDGMENT_AGENT_ID = "judgment.agent_id"
+     JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
+     JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
+     JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
+     JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
+     JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
+     JUDGMENT_STATE_BEFORE = "judgment.state_before"
+     JUDGMENT_STATE_AFTER = "judgment.state_after"
+     JUDGMENT_PENDING_TRACE_EVAL = "judgment.pending_trace_eval"
+     JUDGMENT_USAGE_METADATA = "judgment.usage.metadata"
+
+     JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
+     JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
+     JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
+     JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+         "judgment.usage.cache_creation_input_tokens"
+     )
+     JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
+     JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
+     JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
+
+     GEN_AI_PROMPT = "gen_ai.prompt"
+     GEN_AI_COMPLETION = "gen_ai.completion"
+     GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
+     GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+     GEN_AI_SYSTEM = "gen_ai.system"
+     GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
+     GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
+     GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+         "gen_ai.usage.cache_creation_input_tokens"
+     )
+     GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
+     GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+     GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+     GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
+
+
+ class ResourceKeys(str, Enum):
+     SERVICE_NAME = "service.name"
+     TELEMETRY_SDK_LANGUAGE = "telemetry.sdk.language"
+     TELEMETRY_SDK_NAME = "telemetry.sdk.name"
+     TELEMETRY_SDK_VERSION = "telemetry.sdk.version"
+     JUDGMENT_PROJECT_ID = "judgment.project_id"
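The new enums are plain string-valued constants; a small sketch of how they might be consumed as span attribute names (attaching them to an attribute dict is illustrative, not a documented judgeval API):

from judgeval.judgment_attribute_keys import AttributeKeys, ResourceKeys

# str-Enum members compare equal to their raw attribute names.
assert AttributeKeys.JUDGMENT_SPAN_KIND == "judgment.span_kind"
assert ResourceKeys.SERVICE_NAME.value == "service.name"

# e.g. building an attribute mapping for a tracing span:
attributes = {AttributeKeys.JUDGMENT_CUSTOMER_ID.value: "customer-123"}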
judgeval/{common/logger.py → logger.py} RENAMED
@@ -1,10 +1,9 @@
- # logger.py
-
  import logging
  import sys
- import os

- # ANSI escape sequences
+ from judgeval.env import JUDGMENT_NO_COLOR, JUDGMENT_LOG_LEVEL
+ from judgeval.utils.decorators.use_once import use_once
+
  RESET = "\033[0m"
  RED = "\033[31m"
  YELLOW = "\033[33m"
@@ -38,10 +37,25 @@ class ColorFormatter(logging.Formatter):
          return message


+ def _parse_log_level(level_str: str) -> int:
+     level_map = {
+         "debug": logging.DEBUG,
+         "info": logging.INFO,
+         "warning": logging.WARNING,
+         "warn": logging.WARNING,
+         "error": logging.ERROR,
+         "critical": logging.CRITICAL,
+     }
+     return level_map.get(level_str.lower(), logging.WARNING)
+
+
+ @use_once
  def _setup_judgeval_logger():
-     use_color = sys.stdout.isatty() and os.getenv("NO_COLOR") is None
+     use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
+     log_level = _parse_log_level(JUDGMENT_LOG_LEVEL)
+
      handler = logging.StreamHandler(sys.stdout)
-     handler.setLevel(logging.DEBUG)
+     handler.setLevel(log_level)
      handler.setFormatter(
          ColorFormatter(
              fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -51,10 +65,12 @@ def _setup_judgeval_logger():
      )

      logger = logging.getLogger("judgeval")
-     logger.setLevel(logging.DEBUG)
+     logger.setLevel(log_level)
      logger.addHandler(handler)
      return logger


- # Global logger you can import elsewhere
  judgeval_logger = _setup_judgeval_logger()
+
+
+ __all__ = ("judgeval_logger",)
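A sketch of how the environment-driven logger configuration could be exercised. The environment variable names are assumed to mirror the constants in judgeval.env; that mapping is not shown in this diff.

import os

# Assumed env-var names corresponding to JUDGMENT_LOG_LEVEL / JUDGMENT_NO_COLOR.
os.environ["JUDGMENT_LOG_LEVEL"] = "debug"  # parsed by _parse_log_level
os.environ["JUDGMENT_NO_COLOR"] = "1"       # disables ANSI colors

# Import after setting the environment: @use_once means the logger is
# configured exactly once, on first import.
from judgeval.logger import judgeval_logger

judgeval_logger.debug("emitted if the level resolved to DEBUG")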
judgeval/prompt/__init__.py ADDED
@@ -0,0 +1,330 @@
+ from typing import List, Optional, Dict
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.exceptions import JudgmentAPIError
+ from judgeval.api.api_types import (
+     PromptCommitInfo,
+     PromptTagResponse,
+     PromptUntagResponse,
+     PromptVersionsResponse,
+ )
+ from dataclasses import dataclass, field
+ import re
+ from string import Template
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+ from judgeval.utils.project import _resolve_project_id
+
+
+ def push_prompt(
+     project_name: str,
+     name: str,
+     prompt: str,
+     tags: List[str],
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> tuple[str, Optional[str], str]:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         r = client.prompts_insert(
+             payload={
+                 "project_id": project_id,
+                 "name": name,
+                 "prompt": prompt,
+                 "tags": tags,
+             }
+         )
+         return r["commit_id"], r.get("parent_commit_id"), r["created_at"]
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to save prompt: {e.detail}",
+             response=e.response,
+         )
+
+
+ def fetch_prompt(
+     project_name: str,
+     name: str,
+     commit_id: Optional[str] = None,
+     tag: Optional[str] = None,
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> Optional[PromptCommitInfo]:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_fetch(
+             name=name,
+             project_id=project_id,
+             commit_id=commit_id,
+             tag=tag,
+         )
+         return prompt_config["commit"]
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to fetch prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ def tag_prompt(
+     project_name: str,
+     name: str,
+     commit_id: str,
+     tags: List[str],
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> PromptTagResponse:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_tag(
+             payload={
+                 "project_id": project_id,
+                 "name": name,
+                 "commit_id": commit_id,
+                 "tags": tags,
+             }
+         )
+         return prompt_config
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to tag prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ def untag_prompt(
+     project_name: str,
+     name: str,
+     tags: List[str],
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> PromptUntagResponse:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_untag(
+             payload={"project_id": project_id, "name": name, "tags": tags}
+         )
+         return prompt_config
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to untag prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ def list_prompt(
+     project_name: str,
+     name: str,
+     judgment_api_key: str | None = JUDGMENT_API_KEY,
+     organization_id: str | None = JUDGMENT_ORG_ID,
+ ) -> PromptVersionsResponse:
+     if not judgment_api_key or not organization_id:
+         raise ValueError("Judgment API key and organization ID are required")
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
+     try:
+         project_id = _resolve_project_id(
+             project_name, judgment_api_key, organization_id
+         )
+         if not project_id:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Project '{project_name}' not found",
+                 response=None,  # type: ignore
+             )
+         prompt_config = client.prompts_get_prompt_versions(
+             project_id=project_id, name=name
+         )
+         return prompt_config
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to list prompt '{name}': {e.detail}",
+             response=e.response,
+         )
+
+
+ @dataclass
+ class Prompt:
+     name: str
+     prompt: str
+     created_at: str
+     tags: List[str]
+     commit_id: str
+     parent_commit_id: Optional[str] = None
+     metadata: Dict[str, str] = field(default_factory=dict)
+     _template: Template = field(init=False, repr=False)
+
+     def __post_init__(self):
+         template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+         self._template = Template(template_str)
+
+     @classmethod
+     def create(
+         cls,
+         project_name: str,
+         name: str,
+         prompt: str,
+         tags: Optional[List[str]] = None,
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         if tags is None:
+             tags = []
+         commit_id, parent_commit_id, created_at = push_prompt(
+             project_name, name, prompt, tags, judgment_api_key, organization_id
+         )
+         return cls(
+             name=name,
+             prompt=prompt,
+             created_at=created_at,
+             tags=tags,
+             commit_id=commit_id,
+             parent_commit_id=parent_commit_id,
+         )
+
+     @classmethod
+     def get(
+         cls,
+         project_name: str,
+         name: str,
+         commit_id: Optional[str] = None,
+         tag: Optional[str] = None,
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         if commit_id is not None and tag is not None:
+             raise ValueError(
+                 "You cannot fetch a prompt by both commit_id and tag at the same time"
+             )
+         prompt_config = fetch_prompt(
+             project_name, name, commit_id, tag, judgment_api_key, organization_id
+         )
+         if prompt_config is None:
+             raise JudgmentAPIError(
+                 status_code=404,
+                 detail=f"Prompt '{name}' not found in project '{project_name}'",
+                 response=None,  # type: ignore
+             )
+         return cls(
+             name=prompt_config["name"],
+             prompt=prompt_config["prompt"],
+             created_at=prompt_config["created_at"],
+             tags=prompt_config["tags"],
+             commit_id=prompt_config["commit_id"],
+             parent_commit_id=prompt_config.get("parent_commit_id"),
+             metadata={
+                 "creator_first_name": prompt_config["first_name"],
+                 "creator_last_name": prompt_config["last_name"],
+                 "creator_email": prompt_config["user_email"],
+             },
+         )
+
+     @classmethod
+     def tag(
+         cls,
+         project_name: str,
+         name: str,
+         commit_id: str,
+         tags: List[str],
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         prompt_config = tag_prompt(
+             project_name, name, commit_id, tags, judgment_api_key, organization_id
+         )
+         return prompt_config["commit_id"]
+
+     @classmethod
+     def untag(
+         cls,
+         project_name: str,
+         name: str,
+         tags: List[str],
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         prompt_config = untag_prompt(
+             project_name, name, tags, judgment_api_key, organization_id
+         )
+         return prompt_config["commit_ids"]
+
+     @classmethod
+     def list(
+         cls,
+         project_name: str,
+         name: str,
+         judgment_api_key: str | None = JUDGMENT_API_KEY,
+         organization_id: str | None = JUDGMENT_ORG_ID,
+     ):
+         prompt_configs = list_prompt(
+             project_name, name, judgment_api_key, organization_id
+         )["versions"]
+         return [
+             cls(
+                 name=prompt_config["name"],
+                 prompt=prompt_config["prompt"],
+                 tags=prompt_config["tags"],
+                 created_at=prompt_config["created_at"],
+                 commit_id=prompt_config["commit_id"],
+                 parent_commit_id=prompt_config.get("parent_commit_id"),
+                 metadata={
+                     "creator_first_name": prompt_config["first_name"],
+                     "creator_last_name": prompt_config["last_name"],
+                     "creator_email": prompt_config["user_email"],
+                 },
+             )
+             for prompt_config in prompt_configs
+         ]
+
+     def compile(self, **kwargs) -> str:
+         try:
+             return self._template.substitute(**kwargs)
+         except KeyError as e:
+             missing_var = str(e).strip("'")
+             raise ValueError(f"Missing required variable: {missing_var}")
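To make the new prompt module concrete, a minimal sketch of the round trip; the project and prompt names are placeholders, and it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are configured.

from judgeval.prompt import Prompt

# Push a new version; {{variable}} placeholders become string.Template fields.
prompt = Prompt.create(
    project_name="my-project",
    name="greeting",
    prompt="Hello {{name}}, welcome to {{product}}!",
    tags=["v1"],
)

# Fetch by tag (or commit_id, but not both) and render it.
fetched = Prompt.get(project_name="my-project", name="greeting", tag="v1")
text = fetched.compile(name="Ada", product="Judgment")
# A missing variable raises ValueError("Missing required variable: ...").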
judgeval/scorers/__init__.py CHANGED
@@ -1,29 +1,29 @@
- from judgeval.scorers.api_scorer import APIScorerConfig
+ from judgeval.scorers.api_scorer import (
+     APIScorerConfig,
+     ExampleAPIScorerConfig,
+     TraceAPIScorerConfig,
+ )
  from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
-     ExecutionOrderScorer,
-     HallucinationScorer,
      FaithfulnessScorer,
      AnswerRelevancyScorer,
      AnswerCorrectnessScorer,
      InstructionAdherenceScorer,
-     DerailmentScorer,
-     ToolOrderScorer,
+     TracePromptScorer,
      PromptScorer,
-     ToolDependencyScorer,
  )

  __all__ = [
      "APIScorerConfig",
+     "ExampleAPIScorerConfig",
+     "TraceAPIScorerConfig",
      "BaseScorer",
+     "ExampleScorer",
+     "TracePromptScorer",
      "PromptScorer",
-     "ExecutionOrderScorer",
-     "HallucinationScorer",
      "FaithfulnessScorer",
      "AnswerRelevancyScorer",
      "AnswerCorrectnessScorer",
      "InstructionAdherenceScorer",
-     "DerailmentScorer",
-     "ToolOrderScorer",
-     "ToolDependencyScorer",
  ]
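Finally, a short sketch of the resulting import surface (illustrative only; it simply reflects the exports shown above):

# Still importable from judgeval.scorers in 0.23.0:
from judgeval.scorers import (
    AnswerRelevancyScorer,
    ExampleScorer,       # new export
    TracePromptScorer,   # new export
)

# Removed from the package exports, so this now fails:
# from judgeval.scorers import HallucinationScorer  # ImportError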