judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic. Click here for more details.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/utils.py CHANGED
@@ -2,208 +2,13 @@
2
2
  Util functions for Scorer objects
3
3
  """
4
4
 
5
- import asyncio
6
- import nest_asyncio
7
- import inspect
8
- import json
9
- import sys
10
- import re
11
- from contextlib import contextmanager
12
- from rich.progress import Progress, SpinnerColumn, TextColumn
13
- from rich.console import Console
14
- from typing import List, Optional, Any
5
+ from typing import List
15
6
 
16
- from judgeval.scorers import JudgevalScorer
17
- from judgeval.data import Example, ExampleParams
18
- from judgeval.scorers.exceptions import MissingExampleParamsError
7
+ from judgeval.scorers import BaseScorer
19
8
 
20
9
 
21
- def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
10
+ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
22
11
  """
23
12
  Creates duplicates of the scorers passed as argument.
24
13
  """
25
- cloned_scorers = []
26
- for s in scorers:
27
- scorer_class = type(s)
28
- args = vars(s)
29
-
30
- signature = inspect.signature(scorer_class.__init__)
31
- valid_params = signature.parameters.keys()
32
- valid_args = {key: args[key] for key in valid_params if key in args}
33
-
34
- cloned_scorer = scorer_class(**valid_args)
35
- # kinda hacky, but in case the class inheriting from JudgevalScorer doesn't have `model` in its __init__,
36
- # we need to explicitly include it here so that we can add the judge model to the cloned scorer
37
- cloned_scorer._add_model(model=args.get("model"))
38
- cloned_scorers.append(cloned_scorer)
39
- return cloned_scorers
40
-
41
-
42
- def scorer_console_msg(
43
- scorer: JudgevalScorer,
44
- async_mode: Optional[bool] = None,
45
- ):
46
- """
47
- Renders a message to be displayed to console when a scorer is being executed.
48
- """
49
- if async_mode is None:
50
- run_async = scorer.async_mode
51
- else:
52
- run_async = async_mode
53
-
54
- return f"🔨 Executing Judgment's [rgb(106,0,255)]{scorer.__name__} Scorer[/rgb(106,0,255)]! \
55
- [rgb(55,65,81)](using {scorer.evaluation_model}, async_mode={run_async})...[/rgb(55,65,81)]"
56
-
57
-
58
- @contextmanager
59
- def scorer_progress_meter(
60
- scorer: JudgevalScorer,
61
- async_mode: Optional[bool] = None,
62
- display_meter: bool = True,
63
- total: int = 100,
64
- transient: bool = True,
65
- ):
66
- """
67
- Context manager to display a progress indicator (spinner) while a scorer is being run.
68
- """
69
- console = Console(file=sys.stderr)
70
- if display_meter:
71
- with Progress(
72
- SpinnerColumn(style="rgb(106,0,255)"),
73
- TextColumn("[progress.description]{task.description}"),
74
- console=console,
75
- transient=transient,
76
- ) as progress:
77
- progress.add_task(
78
- description=scorer_console_msg(scorer, async_mode),
79
- total=total,
80
- )
81
- yield
82
- else:
83
- yield
84
-
85
-
86
- def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = None) -> dict:
87
- """
88
- Extracts JSON output from an LLM response and returns it as a dictionary.
89
-
90
- If the JSON is invalid, the error is forwarded to the `scorer`, if provided.
91
-
92
- Args:
93
- llm_response (str): The response from an LLM.
94
- scorer (JudgevalScorer, optional): The scorer object to forward errors to (if any).
95
- """
96
- start = llm_response.find("{") # opening bracket
97
- end = llm_response.rfind("}") + 1 # closing bracket
98
-
99
- if end == 0 and start != -1: # add the closing bracket if it's missing
100
- llm_response = llm_response + "}"
101
- end = len(llm_response)
102
-
103
- json_str = llm_response[start:end] if start != -1 and end != 0 else "" # extract the JSON string
104
- json_str = re.sub(r",\s*([\]}])", r"\1", json_str) # Remove trailing comma if present
105
-
106
- try:
107
- return json.loads(json_str)
108
- except json.JSONDecodeError:
109
- error_str = "Evaluation LLM outputted an invalid JSON. Please use a stronger evaluation model."
110
- if scorer is not None:
111
- scorer.error = error_str
112
- raise ValueError(error_str)
113
- except Exception as e:
114
- raise Exception(f"An unexpected error occurred: {str(e)}")
115
-
116
-
117
- def print_verbose_logs(metric: str, logs: str):
118
- print("*" * 50)
119
- print(f"{metric} Verbose Logs")
120
- print("*" * 50)
121
- print("")
122
- print(logs)
123
- print("")
124
- print("=" * 70)
125
-
126
-
127
- def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
128
- """
129
- Creates verbose logs for a scorer object.
130
-
131
- Args:
132
- metric (JudgevalScorer): The scorer object.
133
- steps (List[str]): The steps to be included in the verbose logs.
134
-
135
- Returns:
136
- str: The verbose logs (Concatenated steps).
137
- """
138
-
139
- verbose_logs = ""
140
- for i in range(len(steps) - 1):
141
- verbose_logs += steps[i]
142
- if i < len(steps) - 2: # don't add new line for penultimate step
143
- verbose_logs += " \n \n"
144
- if metric.verbose_mode:
145
- print_verbose_logs(metric.__name__, verbose_logs + f"\n \n{steps[-1]}")
146
- return verbose_logs
147
-
148
-
149
- def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
150
- """
151
- Get or create an asyncio event loop.
152
-
153
- This function attempts to retrieve the current event loop using `asyncio.get_event_loop()`.
154
- If the event loop is already running, it applies the `nest_asyncio` patch to allow nested
155
- asynchronous execution. If the event loop is closed or not found, it creates a new event loop
156
- and sets it as the current event loop.
157
-
158
- Returns:
159
- asyncio.AbstractEventLoop: The current or newly created event loop.
160
-
161
- Raises:
162
- RuntimeError: If the event loop is closed.
163
- """
164
- try:
165
- loop = asyncio.get_event_loop()
166
- if loop.is_running():
167
- print(
168
- "Event loop is already running. Applying nest_asyncio patch to allow async execution..."
169
- )
170
- nest_asyncio.apply()
171
-
172
- if loop.is_closed():
173
- raise RuntimeError
174
- except RuntimeError:
175
- loop = asyncio.new_event_loop()
176
- asyncio.set_event_loop(loop)
177
- return loop
178
-
179
-
180
- def check_example_params(
181
- example: Example,
182
- example_params: List[ExampleParams],
183
- scorer: JudgevalScorer,
184
- ):
185
- if isinstance(example, Example) is False:
186
- error_str = f"in check_example_params(): Expected example to be of type 'Example', but got {type(example)}"
187
- scorer.error = error_str
188
- raise MissingExampleParamsError(error_str)
189
-
190
- missing_params = []
191
- for param in example_params:
192
- if getattr(example, param.value) is None:
193
- missing_params.append(f"'{param.value}'")
194
-
195
- if missing_params:
196
- if len(missing_params) == 1:
197
- missing_params_str = missing_params[0]
198
- elif len(missing_params) == 2:
199
- missing_params_str = " and ".join(missing_params)
200
- else:
201
- missing_params_str = (
202
- ", ".join(missing_params[:-1]) + ", and " + missing_params[-1]
203
- )
204
-
205
- error_str = f"{missing_params_str} fields in example cannot be None for the '{scorer.__name__}' scorer"
206
- scorer.error = error_str
207
- raise MissingExampleParamsError(error_str)
208
-
209
-
14
+ return [s.model_copy(deep=True) for s in scorers]