judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -1,28 +1,51 @@
- from judgeval.scorers.api_scorer import APIScorerConfig
+ from judgeval.scorers.api_scorer import (
+     APIScorerConfig,
+     ExampleAPIScorerConfig,
+     TraceAPIScorerConfig,
+ )
  from judgeval.constants import APIScorerType
- from typing import Mapping, Dict, Any
- from judgeval.common.api import JudgmentApiClient, JudgmentAPIException
+ from typing import Dict, Any, Optional
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.exceptions import JudgmentAPIError
  import os
- from judgeval.common.exceptions import JudgmentAPIError
+ from judgeval.logger import judgeval_logger
+ from abc import ABC
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+ from copy import copy
+ from judgeval.utils.decorators.dont_throw import dont_throw


  def push_prompt_scorer(
      name: str,
      prompt: str,
-     options: Mapping[str, float],
+     threshold: float,
+     options: Optional[Dict[str, float]] = None,
+     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+     description: Optional[str] = None,
      judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
      organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+     is_trace: bool = False,
  ) -> str:
-     client = JudgmentApiClient(judgment_api_key, organization_id)
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
      try:
-         r = client.save_scorer(name, prompt, dict(options))
-     except JudgmentAPIException as e:
-         if e.status_code == 500:
-             raise JudgmentAPIError(
-                 f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
-             )
-         raise JudgmentAPIError(f"Failed to save classifier scorer: {e.error_detail}")
-     return r["name"]
+         r = client.save_scorer(
+             payload={
+                 "name": name,
+                 "prompt": prompt,
+                 "threshold": threshold,
+                 "options": options,
+                 "model": model,
+                 "description": description,
+                 "is_trace": is_trace,
+             }
+         )
+     except JudgmentAPIError as e:
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to save prompt scorer: {e.detail}",
+             response=e.response,
+         )
+     return r["scorer_response"]["name"]


  def fetch_prompt_scorer(
@@ -30,19 +53,26 @@ def fetch_prompt_scorer(
      judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
      organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
  ):
-     client = JudgmentApiClient(judgment_api_key, organization_id)
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
      try:
-         scorer_config = client.fetch_scorer(name)
-         scorer_config.pop("created_at")
-         scorer_config.pop("updated_at")
-         return scorer_config
-     except JudgmentAPIException as e:
-         if e.status_code == 500:
+         fetched_scorers = client.fetch_scorers({"names": [name]})
+         if len(fetched_scorers["scorers"]) == 0:
+             judgeval_logger.error(f"Prompt scorer '{name}' not found")
              raise JudgmentAPIError(
-                 f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
+                 status_code=404,
+                 detail=f"Prompt scorer '{name}' not found",
+                 response=None,  # type: ignore
              )
+         else:
+             scorer_config = fetched_scorers["scorers"][0]
+             scorer_config.pop("created_at")
+             scorer_config.pop("updated_at")
+             return scorer_config
+     except JudgmentAPIError as e:
          raise JudgmentAPIError(
-             f"Failed to fetch classifier scorer '{name}': {e.error_detail}"
+             status_code=e.status_code,
+             detail=f"Failed to fetch prompt scorer '{name}': {e.detail}",
+             response=e.response,
          )


@@ -51,33 +81,33 @@ def scorer_exists(
      judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
      organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
  ):
-     client = JudgmentApiClient(judgment_api_key, organization_id)
+     client = JudgmentSyncClient(judgment_api_key, organization_id)
      try:
-         return client.scorer_exists(name)["exists"]
-     except JudgmentAPIException as e:
+         return client.scorer_exists({"name": name})["exists"]
+     except JudgmentAPIError as e:
          if e.status_code == 500:
              raise JudgmentAPIError(
-                 f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
+                 status_code=e.status_code,
+                 detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                 response=e.response,
              )
-         raise JudgmentAPIError(f"Failed to check if scorer exists: {e.error_detail}")
-
-
- class PromptScorer(APIScorerConfig):
-     """
-     In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-     1. a system role that may involve the Example object
-     2. options for scores on the example
+         raise JudgmentAPIError(
+             status_code=e.status_code,
+             detail=f"Failed to check if scorer exists: {e.detail}",
+             response=e.response,
+         )

-     and uses a judge to execute the evaluation from the system role and classify into one of the options
-     """

+ class BasePromptScorer(ABC, APIScorerConfig):
+     score_type: APIScorerType
      prompt: str
-     options: Mapping[str, float]
-     score_type: APIScorerType = APIScorerType.PROMPT_SCORER
+     options: Optional[Dict[str, float]] = None
+     description: Optional[str] = None
      judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
      organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

      @classmethod
+     @dont_throw
      def get(
          cls,
          name: str,
@@ -85,10 +115,24 @@ class PromptScorer(APIScorerConfig):
          organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
      ):
          scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+         if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
+             raise JudgmentAPIError(
+                 status_code=400,
+                 detail=f"Scorer with name {name} is not a {cls.__name__}",
+                 response=None,  # type: ignore
+             )
+         if issubclass(cls, TracePromptScorer):
+             score_type = APIScorerType.TRACE_PROMPT_SCORER
+         else:
+             score_type = APIScorerType.PROMPT_SCORER
          return cls(
+             score_type=score_type,
              name=name,
              prompt=scorer_config["prompt"],
-             options=scorer_config["options"],
+             threshold=scorer_config["threshold"],
+             options=scorer_config.get("options"),
+             model=scorer_config.get("model"),
+             description=scorer_config.get("description"),
              judgment_api_key=judgment_api_key,
              organization_id=organization_id,
          )
@@ -98,32 +142,51 @@ class PromptScorer(APIScorerConfig):
          cls,
          name: str,
          prompt: str,
-         options: Mapping[str, float],
+         threshold: float = 0.5,
+         options: Optional[Dict[str, float]] = None,
+         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+         description: Optional[str] = None,
          judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
          organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
      ):
          if not scorer_exists(name, judgment_api_key, organization_id):
-             push_prompt_scorer(name, prompt, options, judgment_api_key, organization_id)
+             if issubclass(cls, TracePromptScorer):
+                 is_trace = True
+                 score_type = APIScorerType.TRACE_PROMPT_SCORER
+             else:
+                 is_trace = False
+                 score_type = APIScorerType.PROMPT_SCORER
+             push_prompt_scorer(
+                 name,
+                 prompt,
+                 threshold,
+                 options,
+                 model,
+                 description,
+                 judgment_api_key,
+                 organization_id,
+                 is_trace,
+             )
+             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
              return cls(
+                 score_type=score_type,
                  name=name,
                  prompt=prompt,
+                 threshold=threshold,
                  options=options,
+                 model=model,
+                 description=description,
                  judgment_api_key=judgment_api_key,
                  organization_id=organization_id,
              )
          else:
              raise JudgmentAPIError(
-                 f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name."
+                 status_code=400,
+                 detail=f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name.",
+                 response=None,  # type: ignore
              )

      # Setter functions. Each setter function pushes the scorer to the DB.
-     def set_name(self, name: str):
-         """
-         Updates the name of the scorer.
-         """
-         self.name = name
-         self.push_prompt_scorer()
-
      def set_threshold(self, threshold: float):
          """
          Updates the threshold of the scorer.
@@ -140,16 +203,31 @@ class PromptScorer(APIScorerConfig):
          """
          self.prompt = prompt
          self.push_prompt_scorer()
+         judgeval_logger.info(f"Successfully updated prompt for {self.name}")

-     def set_options(self, options: Mapping[str, float]):
+     def set_model(self, model: str):
          """
-         Updates the options with the new options.
+         Updates the model of the scorer.
+         """
+         self.model = model
+         self.push_prompt_scorer()
+         judgeval_logger.info(f"Successfully updated model for {self.name}")

-         Sample options:
-         {"yes": 1, "no": 0}
+     def set_options(self, options: Optional[Dict[str, float]]):
+         """
+         Updates the options of the scorer.
          """
          self.options = options
          self.push_prompt_scorer()
+         judgeval_logger.info(f"Successfully updated options for {self.name}")
+
+     def set_description(self, description: Optional[str]):
+         """
+         Updates the description of the scorer.
+         """
+         self.description = description
+         self.push_prompt_scorer()
+         judgeval_logger.info(f"Successfully updated description for {self.name}")

      def append_to_prompt(self, prompt_addition: str):
          """
@@ -157,21 +235,40 @@ class PromptScorer(APIScorerConfig):
          """
          self.prompt += prompt_addition
          self.push_prompt_scorer()
+         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

      # Getters
-     def get_prompt(self) -> str | None:
+     def get_threshold(self) -> float:
+         """
+         Returns the threshold of the scorer.
+         """
+         return self.threshold
+
+     def get_prompt(self) -> str:
          """
          Returns the prompt of the scorer.
          """
          return self.prompt

-     def get_options(self) -> Mapping[str, float] | None:
+     def get_model(self) -> str:
+         """
+         Returns the model of the scorer.
+         """
+         return self.model
+
+     def get_options(self) -> Dict[str, float] | None:
          """
          Returns the options of the scorer.
          """
-         return self.options
+         return copy(self.options) if self.options is not None else None
+
+     def get_description(self) -> str | None:
+         """
+         Returns the description of the scorer.
+         """
+         return self.description

-     def get_name(self) -> str | None:
+     def get_name(self) -> str:
          """
          Returns the name of the scorer.
          """
@@ -183,8 +280,11 @@ class PromptScorer(APIScorerConfig):
          """
          return {
              "name": self.name,
+             "model": self.model,
              "prompt": self.prompt,
+             "threshold": self.threshold,
              "options": self.options,
+             "description": self.description,
          }

      def push_prompt_scorer(self):
@@ -194,13 +294,17 @@ class PromptScorer(APIScorerConfig):
          push_prompt_scorer(
              self.name,
              self.prompt,
+             self.threshold,
              self.options,
+             self.model,
+             self.description,
              self.judgment_api_key,
              self.organization_id,
+             isinstance(self, TracePromptScorer),
          )

      def __str__(self):
-         return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+         return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

      def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
          base = super().model_dump(*args, **kwargs)
@@ -213,3 +317,11 @@ class PromptScorer(APIScorerConfig):
              k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
          }
          return base
+
+
+ class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+     pass
+
+
+ class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+     pass
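
The rewritten module above splits the old PromptScorer into BasePromptScorer plus two concrete variants: PromptScorer (example-level) and TracePromptScorer (trace-level, saved with is_trace=True). A minimal usage sketch against 0.23.0, assuming JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment and that the second classmethod (whose def line is not visible in this diff) is named create(); the scorer name, prompt, and options below are illustrative placeholders, not part of the package:

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
    PromptScorer,
    TracePromptScorer,
)

# Register a new example-level prompt scorer with the Judgment backend.
# Name, prompt, and options are hypothetical; threshold defaults to 0.5 and
# model to JUDGMENT_DEFAULT_GPT_MODEL when omitted.
scorer = PromptScorer.create(
    name="helpfulness",
    prompt="Does the response directly answer the user's question?",
    options={"yes": 1.0, "no": 0.0},
)

# On later runs, fetch the existing scorer by name instead of recreating it.
# get() compares the stored is_trace flag against the class it is called on,
# so an example-level name cannot be loaded as a TracePromptScorer.
scorer = PromptScorer.get(name="helpfulness")

# Trace-level scorers use the same interface but are pushed with is_trace=True.
trace_scorer = TracePromptScorer.create(
    name="trace-helpfulness",
    prompt="Did the agent's full trace follow the user's instructions?",
)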
judgeval/scorers/score.py CHANGED
@@ -1,5 +1,5 @@
  """
- Infrastructure for executing evaluations of `Example`s using one or more `BaseScorer`s.
+ Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
  """

  import asyncio
@@ -13,61 +13,67 @@ from judgeval.data import (
      generate_scoring_result,
      create_scorer_data,
  )
- from judgeval.scorers import BaseScorer
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.scorers.utils import clone_scorers
- from judgeval.common.logger import judgeval_logger
+ from judgeval.logger import judgeval_logger
  from judgeval.judges import JudgevalJudge
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


  async def safe_a_score_example(
-     scorer: BaseScorer,
+     scorer: ExampleScorer,
      example: Example,
  ):
      """
      Scoring task function when not using a progress indicator!
-     "Safely" scores an `Example` using a `BaseScorer` by gracefully handling any exceptions that may occur.
+     "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.

      Args:
-         scorer (BaseScorer): The `BaseScorer` to use for scoring the example.
+         scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
          example (Example): The `Example` to be scored.
-
-         ignore_errors (bool): Whether to ignore errors during the evaluation.
-             If set to false, any error will be raised and stop the evaluation.
-             If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.
-
-         skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
      """
      try:
-         scorer.score = await scorer.a_score_example(example)
+         score = await scorer.a_score_example(example)
+         if score is None:
+             raise Exception("a_score_example need to return a score")
+         elif score < 0:
+             judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+             score = 0
+         elif score > 1:
+             judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+             score = 1
+         else:
+             scorer.score = score
          scorer.success = scorer.success_check()
      except Exception as e:
          judgeval_logger.error(f"Error during scoring: {str(e)}")
          scorer.error = str(e)
          scorer.success = False
+         scorer.score = 0
      return


  async def a_execute_scoring(
      examples: List[Example],
-     scorers: List[BaseScorer],
-     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+     scorers: List[ExampleScorer],
+     model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
      ignore_errors: bool = False,
      throttle_value: int = 0,
      max_concurrent: int = 100,
+     show_progress: bool = True,
  ) -> List[ScoringResult]:
      """
-     Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
-     Each `Example` will be evaluated by all of the `BaseScorer`s in the `scorers` list.
+     Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+     Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.

      Args:
          examples (List[Example]): A list of `Example` objects to be evaluated.
-         scorers (List[BaseScorer]): A list of `BaseScorer` objects to evaluate the examples.
+         scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
          model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
          ignore_errors (bool): Whether to ignore errors during evaluation.
          throttle_value (int): The amount of time to wait between starting each task.
          max_concurrent (int): The maximum number of concurrent tasks.
-
-         _use_bar_indicator (bool): Whether to use a progress bar indicator.
+         show_progress (bool): Whether to show the progress bar indicator.

      Returns:
          List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
@@ -82,33 +88,50 @@ async def a_execute_scoring(
          except Exception as e:
              judgeval_logger.error(f"Error executing function: {e}")
              if kwargs.get("ignore_errors", False):
-                 # Simply return None when ignoring errors, as expected by the test
                  return None
-             # If we're not ignoring errors, propagate the exception
              raise

-     # Add model to scorers
      for scorer in scorers:
-         if not scorer.model:
+         if not scorer.model and isinstance(model, str):
              scorer._add_model(model)

-     scoring_results: List[ScoringResult] = [None for _ in examples]
+     scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
      tasks = []
-     cloned_scorers: List[BaseScorer]
-
-     with tqdm_asyncio(
-         desc=f"Evaluating {len(examples)} example(s) in parallel",
-         unit="Example",
-         total=len(examples),
-         bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
-     ) as pbar:
+
+     if show_progress:
+         with tqdm_asyncio(
+             desc=f"Evaluating {len(examples)} example(s) in parallel",
+             unit="Example",
+             total=len(examples),
+             bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+         ) as pbar:
+             for i, ex in enumerate(examples):
+                 if isinstance(ex, Example):
+                     if len(scorers) == 0:
+                         pbar.update(1)
+                         continue
+
+                     cloned_scorers = clone_scorers(scorers)  # type: ignore
+                     task = execute_with_semaphore(
+                         func=a_eval_examples_helper,
+                         scorers=cloned_scorers,
+                         example=ex,
+                         scoring_results=scoring_results,
+                         score_index=i,
+                         ignore_errors=ignore_errors,
+                         pbar=pbar,
+                     )
+                     tasks.append(asyncio.create_task(task))
+
+                 await asyncio.sleep(throttle_value)
+             await asyncio.gather(*tasks)
+     else:
          for i, ex in enumerate(examples):
              if isinstance(ex, Example):
                  if len(scorers) == 0:
-                     pbar.update(1)
                      continue

-                 cloned_scorers = clone_scorers(scorers)
+                 cloned_scorers = clone_scorers(scorers)  # type: ignore
                  task = execute_with_semaphore(
                      func=a_eval_examples_helper,
                      scorers=cloned_scorers,
@@ -116,19 +139,19 @@ async def a_execute_scoring(
                      scoring_results=scoring_results,
                      score_index=i,
                      ignore_errors=ignore_errors,
-                     pbar=pbar,
+                     pbar=None,
                  )
                  tasks.append(asyncio.create_task(task))

              await asyncio.sleep(throttle_value)
          await asyncio.gather(*tasks)
-     return scoring_results
+     return [result for result in scoring_results if result is not None]


  async def a_eval_examples_helper(
-     scorers: List[BaseScorer],
+     scorers: List[ExampleScorer],
      example: Example,
-     scoring_results: List[ScoringResult],
+     scoring_results: List[Optional[ScoringResult]],
      score_index: int,
      ignore_errors: bool,
      pbar: Optional[tqdm_asyncio] = None,
@@ -137,7 +160,7 @@ async def a_eval_examples_helper(
      Evaluate a single example asynchronously using a list of scorers.

      Args:
-         scorers (List[BaseScorer]): List of BaseScorer objects to evaluate the example.
+         scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
          example (Example): The example to be evaluated.
          scoring_results (List[ScoringResult]): List to store the scoring results.
          score_index (int): Index at which the result should be stored in scoring_results.
@@ -147,24 +170,18 @@ async def a_eval_examples_helper(
          None
      """

-     # scoring the Example
      scoring_start_time = time.perf_counter()

      tasks = [safe_a_score_example(scorer, example) for scorer in scorers]

      await asyncio.gather(*tasks)

-     # Now that all the scoring functions of each scorer have executed, we collect
-     # the results and update the ScoringResult with the scorer data
      success = True
      scorer_data_list = []
      for scorer in scorers:
-         # At this point, the scorer has been executed and already contains data.
          if getattr(scorer, "skipped", False):
              continue
-         scorer_data = create_scorer_data(
-             scorer
-         )  # Fetch scorer data from completed scorer evaluation
+         scorer_data = create_scorer_data(scorer)
          for s in scorer_data:
              success = success and s.success
          scorer_data_list.extend(scorer_data)
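
The score.py changes above narrow the scoring path to ExampleScorer, clamp custom scores to the [0, 1] range, add a show_progress flag, and drop entries that never produced a result from the returned list. A rough sketch of driving a_execute_scoring with a custom scorer, assuming an ExampleScorer subclass only needs a name and an a_score_example() coroutine and that Example still exposes input/actual_output fields (both assumptions based on earlier judgeval releases, not confirmed by this diff):

import asyncio

from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer
from judgeval.scorers.score import a_execute_scoring


class BrevityScorer(ExampleScorer):
    # Hypothetical scorer: rewards short answers. a_score_example must return
    # a value; anything outside [0, 1] is clamped by safe_a_score_example.
    name: str = "brevity"

    async def a_score_example(self, example: Example) -> float:
        answer = example.actual_output or ""
        return 1.0 if len(answer) <= 200 else 0.0


async def main():
    examples = [
        Example(input="What is judgeval?", actual_output="An SDK for evaluating agents."),
    ]
    # show_progress=False skips the tqdm bar; examples that never produced a
    # ScoringResult are filtered out of the returned list.
    results = await a_execute_scoring(examples, [BrevityScorer()], show_progress=False)
    for result in results:
        print(result)


asyncio.run(main())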