judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/datasets/__init__.py DELETED
@@ -1,4 +0,0 @@
- from judgeval.data.datasets.dataset import EvalDataset
- from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
-
- __all__ = ["EvalDataset", "EvalDatasetClient"]
judgeval/data/datasets/dataset.py DELETED
@@ -1,341 +0,0 @@
- import ast
- import csv
- import datetime
- import json
- import os
- import yaml
- from dataclasses import dataclass, field
- from typing import List, Union, Literal, Optional
-
- from judgeval.data import Example, Trace
- from judgeval.common.logger import judgeval_logger
- from judgeval.utils.file_utils import get_examples_from_yaml
-
-
- @dataclass
- class EvalDataset:
-     examples: List[Example]
-     traces: List[Trace]
-     _alias: Union[str, None] = field(default=None)
-     _id: Union[str, None] = field(default=None)
-     judgment_api_key: str = field(default="")
-     organization_id: str = field(default="")
-
-     def __init__(
-         self,
-         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY", ""),
-         organization_id: str = os.getenv("JUDGMENT_ORG_ID", ""),
-         examples: Optional[List[Example]] = None,
-         traces: Optional[List[Trace]] = None,
-     ):
-         if not judgment_api_key:
-             judgeval_logger.error("No judgment_api_key provided")
-         self.examples = examples or []
-         self.traces = traces or []
-         self._alias = None
-         self._id = None
-         self.judgment_api_key = judgment_api_key
-         self.organization_id = organization_id
-
-     def add_from_json(self, file_path: str) -> None:
-         """
-         Adds examples from a JSON file.
-
-         The format of the JSON file is expected to be a dictionary with one key: "examples".
-         The value of the key is a list of dictionaries, where each dictionary represents an example.
-
-         The JSON file is expected to have the following format:
-         {
-             "examples": [
-                 {
-                     "input": "test input",
-                     "actual_output": "test output",
-                     "expected_output": "expected output",
-                     "context": [
-                         "context1",
-                         "context2"
-                     ],
-                     "retrieval_context": [
-                         "retrieval1"
-                     ],
-                     "additional_metadata": {
-                         "key": "value"
-                     },
-                     "tools_called": [
-                         "tool1"
-                     ],
-                     "expected_tools": [
-                         "tool1",
-                         "tool2"
-                     ],
-                     "name": "test example",
-                     "example_id": null,
-                     "timestamp": "20241230_160117",
-                     "trace_id": "123"
-                 }
-             ]
-         }
-         """
-         try:
-             with open(file_path, "r") as file:
-                 payload = json.load(file)
-                 examples = payload.get("examples", [])
-         except FileNotFoundError:
-             judgeval_logger.error(f"JSON file not found: {file_path}")
-             raise FileNotFoundError(f"The file {file_path} was not found.")
-         except json.JSONDecodeError:
-             judgeval_logger.error(f"Invalid JSON file: {file_path}")
-             raise ValueError(f"The file {file_path} is not a valid JSON file.")
-
-         new_examples = [Example(**e) for e in examples]
-         for e in new_examples:
-             self.add_example(e)
-
-     def add_from_csv(
-         self,
-         file_path: str,
-         header_mapping: dict,
-         primary_delimiter: str = ",",
-         secondary_delimiter: str = ";",
-     ) -> None:
-         """
-         Add Examples from a CSV file.
-
-         Args:
-             file_path (str): Path to the CSV file
-             header_mapping (dict): Dictionary mapping Example headers to custom headers
-             primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
-             secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
-         """
-         try:
-             import pandas as pd
-         except ModuleNotFoundError:
-             raise ModuleNotFoundError(
-                 "Please install pandas to use this method. 'pip install pandas'"
-             )
-
-         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-         df = pd.read_csv(file_path, dtype={"trace_id": str}, sep=primary_delimiter)
-         """
-         The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
-         Available headers for Example objects are as follows:
-
-         "input", "actual_output", "expected_output", "context", \
-         "retrieval_context", "additional_metadata", "tools_called", \
-         "expected_tools", "name", "comments", "source_file", "example", \
-         "trace_id"
-
-         We want to collect the examples separately which can
-         be determined by the "example" column. If the value is True, then it is an
-         example, and we expect the `input` and `actual_output` fields to be non-null.
-
-         We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
-         This can be adjusted using the `secondary_delimiter` parameter.
-         """
-         examples = []
-
-         def process_csv_row(value, header):
-             """
-             Maps a singular value in the CSV file to the appropriate type based on the header.
-             If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
-             """
-             # check that the CSV value is not null for entry
-             null_replacement = dict() if header == "additional_metadata" else None
-             if pd.isna(value) or value == "":
-                 return null_replacement
-             try:
-                 value = (
-                     ast.literal_eval(value)
-                     if header == "additional_metadata"
-                     else str(value)
-                 )
-             except (ValueError, SyntaxError):
-                 value = str(value)
-             if header in [
-                 "context",
-                 "retrieval_context",
-                 "tools_called",
-                 "expected_tools",
-             ]:
-                 # attempt to split the value by the secondary delimiter
-                 value = value.split(secondary_delimiter)
-
-             return value
-
-         for _, row in df.iterrows():
-             data = {
-                 header: process_csv_row(row[header_mapping[header]], header)
-                 for header in header_mapping
-             }
-             if "example" in header_mapping and row[header_mapping["example"]]:
-                 if "name" in header_mapping:
-                     data["name"] = (
-                         row[header_mapping["name"]]
-                         if pd.notna(row[header_mapping["name"]])
-                         else None
-                     )
-                 # every Example has `input` and `actual_output` fields
-                 if data["input"] is not None and data["actual_output"] is not None:
-                     e = Example(**data)
-                     examples.append(e)
-                 else:
-                     raise ValueError(
-                         "Every example must have an 'input' and 'actual_output' field."
-                     )
-
-         for e in examples:
-             self.add_example(e)
-
-     def add_from_yaml(self, file_path: str) -> None:
-         """
-         Adds examples from a YAML file.
-
-         The format of the YAML file is expected to be a dictionary with one key: "examples".
-         The value of the key is a list of dictionaries, where each dictionary represents an example.
-
-         The YAML file is expected to have the following format:
-         examples:
-           - input: "test input"
-             actual_output: "test output"
-             expected_output: "expected output"
-             context:
-               - "context1"
-               - "context2"
-             retrieval_context:
-               - "retrieval1"
-             additional_metadata:
-               key: "value"
-             tools_called:
-               - "tool1"
-             expected_tools:
-               - "tool1"
-               - "tool2"
-             name: "test example"
-             example_id: null
-             timestamp: "20241230_160117"
-             trace_id: "123"
-         """
-         examples = get_examples_from_yaml(file_path)
-
-         for e in examples:
-             self.add_example(e)
-
-     def add_example(self, e: Example) -> None:
-         self.examples.append(e)
-         # TODO if we need to add rank, then we need to do it here
-
-     def add_trace(self, t: Trace) -> None:
-         self.traces.append(t)
-
-     def save_as(
-         self,
-         file_type: Literal["json", "csv", "yaml"],
-         dir_path: str,
-         save_name: str | None = None,
-     ) -> None:
-         """
-         Saves the dataset as a file. Save only the examples.
-
-         Args:
-             file_type (Literal["json", "csv"]): The file type to save the dataset as.
-             dir_path (str): The directory path to save the file to.
-             save_name (str, optional): The name of the file to save. Defaults to None.
-         """
-         if not os.path.exists(dir_path):
-             os.makedirs(dir_path)
-         file_name = (
-             datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-             if save_name is None
-             else save_name
-         )
-         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
-         if file_type == "json":
-             with open(complete_path, "w") as file:
-                 json.dump(
-                     {
-                         "examples": [e.to_dict() for e in self.examples],
-                     },
-                     file,
-                     indent=4,
-                 )
-         elif file_type == "csv":
-             with open(complete_path, "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerow(
-                     [
-                         "input",
-                         "actual_output",
-                         "expected_output",
-                         "context",
-                         "retrieval_context",
-                         "additional_metadata",
-                         "tools_called",
-                         "expected_tools",
-                         "name",
-                         "comments",
-                         "source_file",
-                         "example",
-                         "trace_id",
-                     ]
-                 )
-                 for e in self.examples:
-                     writer.writerow(
-                         [
-                             e.input,
-                             e.actual_output,
-                             e.expected_output,
-                             ";".join(e.context),
-                             ";".join(e.retrieval_context),
-                             e.additional_metadata,
-                             ";".join(e.tools_called),
-                             ";".join(e.expected_tools),
-                             e.name,
-                             None,  # Example does not have comments
-                             None,  # Example does not have source file
-                             True,  # Adding an Example
-                         ]
-                     )
-
-         elif file_type == "yaml":
-             with open(complete_path, "w") as file:
-                 yaml_data = {
-                     "examples": [
-                         {
-                             "input": e.input,
-                             "actual_output": e.actual_output,
-                             "expected_output": e.expected_output,
-                             "context": e.context,
-                             "retrieval_context": e.retrieval_context,
-                             "additional_metadata": e.additional_metadata,
-                             "tools_called": e.tools_called,
-                             "expected_tools": e.expected_tools,
-                             "name": e.name,
-                             "comments": None,  # Example does not have comments
-                             "source_file": None,  # Example does not have source file
-                             "example": True,  # Adding an Example
-                         }
-                         for e in self.examples
-                     ],
-                 }
-                 yaml.dump(yaml_data, file, default_flow_style=False)
-         else:
-             ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
-             raise TypeError(
-                 f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
-             )
-
-     def __iter__(self):
-         return iter(self.examples)
-
-     def __len__(self):
-         return len(self.examples)
-
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}("
-             f"examples={self.examples}, "
-             f"traces={self.traces}, "
-             f"_alias={self._alias}, "
-             f"_id={self._id}"
-             f")"
-         )
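For readers migrating off this class, a minimal usage sketch of the removed EvalDataset, based only on the methods shown above (file paths are placeholders; the 0.23.0 replacement lives in judgeval/dataset/__init__.py and is not shown in this diff):

    from judgeval.data.datasets import EvalDataset  # import path removed in 0.23.0

    dataset = EvalDataset()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment by default
    dataset.add_from_json("examples.json")  # expects a top-level {"examples": [...]} payload, per the docstring above
    dataset.save_as("yaml", dir_path="./out", save_name="my_dataset")  # writes ./out/my_dataset.yaml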
judgeval/data/datasets/eval_dataset_client.py DELETED
@@ -1,214 +0,0 @@
- from typing import Optional, List
- from rich.progress import Progress, SpinnerColumn, TextColumn
- from judgeval.common.logger import judgeval_logger
- from judgeval.common.api import JudgmentApiClient
- from judgeval.data import Example, Trace
- from judgeval.data.datasets import EvalDataset
-
-
- class EvalDatasetClient:
-     def __init__(self, judgment_api_key: str, organization_id: str):
-         self.api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-     def create_dataset(self) -> EvalDataset:
-         return EvalDataset(judgment_api_key=self.api_client.api_key)
-
-     def push(
-         self,
-         dataset: EvalDataset,
-         alias: str,
-         project_name: str,
-         overwrite: Optional[bool] = False,
-     ) -> bool:
-         if overwrite:
-             judgeval_logger.warning(f"Overwrite enabled for alias '{alias}'")
-         """
-         Pushes the dataset to Judgment platform
-
-         Mock request:
-         dataset = {
-             "alias": alias,
-             "examples": [...],
-             "overwrite": overwrite
-         } ==>
-         {
-             "_alias": alias,
-             "_id": "..."  # ID of the dataset
-         }
-         """
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                 total=100,
-             )
-             try:
-                 payload = self.api_client.push_dataset(
-                     dataset_alias=alias,
-                     project_name=project_name,
-                     examples=[e.to_dict() for e in dataset.examples],
-                     traces=[t.model_dump() for t in dataset.traces],
-                     overwrite=overwrite or False,
-                 )
-             except Exception as e:
-                 judgeval_logger.error(f"Error during push: {e}")
-                 raise
-             dataset._alias = payload.get("_alias")
-             dataset._id = payload.get("_id")
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-             return True
-
-     def append_examples(
-         self, alias: str, examples: List[Example], project_name: str
-     ) -> bool:
-         """
-         Appends the dataset to Judgment platform
-
-         Mock request:
-         dataset = {
-             "alias": alias,
-             "examples": [...],
-             "project_name": project_name
-         } ==>
-         {
-             "_alias": alias,
-             "_id": "..."  # ID of the dataset
-         }
-         """
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
-                 total=100,
-             )
-             try:
-                 self.api_client.append_examples(
-                     dataset_alias=alias,
-                     project_name=project_name,
-                     examples=[e.to_dict() for e in examples],
-                 )
-             except Exception as e:
-                 judgeval_logger.error(f"Error during append: {e}")
-                 raise
-
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-             return True
-
-     def pull(self, alias: str, project_name: str) -> EvalDataset:
-         """
-         Pulls the dataset from Judgment platform
-
-         Mock request:
-         {
-             "alias": alias,
-             "project_name": project_name
-         }
-         ==>
-         {
-             "examples": [...],
-             "_alias": alias,
-             "_id": "..."  # ID of the dataset
-         }
-         """
-         # Make a POST request to the Judgment API to get the dataset
-         dataset = self.create_dataset()
-
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             try:
-                 payload = self.api_client.pull_dataset(
-                     dataset_alias=alias,
-                     project_name=project_name,
-                 )
-             except Exception as e:
-                 judgeval_logger.error(f"Error pulling dataset: {str(e)}")
-                 raise
-             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-             dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
-             dataset._alias = payload.get("alias")
-             dataset._id = payload.get("id")
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-
-         return dataset
-
-     def delete(self, alias: str, project_name: str) -> bool:
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             progress.add_task(
-                 f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             try:
-                 self.api_client.delete_dataset(
-                     dataset_alias=alias,
-                     project_name=project_name,
-                 )
-             except Exception as e:
-                 judgeval_logger.error(f"Error deleting dataset: {str(e)}")
-                 raise
-
-         return True
-
-     def pull_project_dataset_stats(self, project_name: str) -> dict:
-         """
-         Pulls the project datasets stats from Judgment platform
-
-         Mock request:
-         {
-             "project_name": project_name
-         }
-         ==>
-         {
-             "test_dataset_1": {"examples_count": len(dataset1.examples)},
-             "test_dataset_2": {"examples_count": len(dataset2.examples)},
-             ...
-         }
-         """
-         # Make a POST request to the Judgment API to get the dataset
-
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 "Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             try:
-                 payload = self.api_client.get_project_dataset_stats(project_name)
-             except Exception as e:
-                 judgeval_logger.error(f"Error pulling dataset: {str(e)}")
-                 raise
-
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-
-         return payload
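For context, a hedged sketch of the removed EvalDatasetClient round trip, using only the signatures shown above; the key, organization ID, alias, and project name are placeholders:

    from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient  # removed in 0.23.0

    client = EvalDatasetClient(judgment_api_key="sk-...", organization_id="org-...")
    dataset = client.create_dataset()                 # empty EvalDataset bound to the client's API key
    client.push(dataset, alias="qa-regression", project_name="my-project", overwrite=False)
    pulled = client.pull(alias="qa-regression", project_name="my-project")
    stats = client.pull_project_dataset_stats("my-project")  # {dataset_alias: {"examples_count": ...}, ...}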
judgeval/data/tool.py DELETED
@@ -1,5 +0,0 @@
- from judgeval.data.judgment_types import ToolJudgmentType
-
-
- class Tool(ToolJudgmentType):
-     pass
judgeval/data/trace_run.py DELETED
@@ -1,37 +0,0 @@
- from pydantic import BaseModel
- from typing import List, Optional, Dict, Any, Union
- from judgeval.data import Trace
- from judgeval.scorers import APIScorerConfig, BaseScorer
- from judgeval.rules import Rule
-
-
- class TraceRun(BaseModel):
-     """
-     Stores example and evaluation scorers together for running an eval task
-
-     Args:
-         project_name (str): The name of the project the evaluation results belong to
-         eval_name (str): A name for this evaluation run
-         traces (List[Trace]): The traces to evaluate
-         scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
-         model (str): The model used as a judge when using LLM as a Judge
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-         append (Optional[bool]): Whether to append to existing evaluation results
-         tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
-     """
-
-     organization_id: Optional[str] = None
-     project_name: Optional[str] = None
-     eval_name: Optional[str] = None
-     traces: Optional[List[Trace]] = None
-     scorers: List[Union[APIScorerConfig, BaseScorer]]
-     model: Optional[str] = "gpt-4.1"
-     trace_span_id: Optional[str] = None
-     append: Optional[bool] = False
-     override: Optional[bool] = False
-     rules: Optional[List[Rule]] = None
-     tools: Optional[List[Dict[str, Any]]] = None
-
-     class Config:
-         arbitrary_types_allowed = True
judgeval/evaluation_run.py DELETED
@@ -1,75 +0,0 @@
- from typing import List, Optional, Union
- from pydantic import BaseModel, field_validator, Field
-
- from judgeval.data import Example
- from judgeval.scorers import BaseScorer, APIScorerConfig
- from judgeval.constants import ACCEPTABLE_MODELS
-
-
- class EvaluationRun(BaseModel):
-     """
-     Stores example and evaluation scorers together for running an eval task
-
-     Args:
-         project_name (str): The name of the project the evaluation results belong to
-         eval_name (str): A name for this evaluation run
-         examples (List[Example]): The examples to evaluate
-         scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
-         model (str): The model used as a judge when using LLM as a Judge
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-     """
-
-     organization_id: Optional[str] = None
-     project_name: Optional[str] = Field(default=None, validate_default=True)
-     eval_name: Optional[str] = Field(default=None, validate_default=True)
-     examples: List[Example]
-     scorers: List[Union[APIScorerConfig, BaseScorer]]
-     model: Optional[str] = "gpt-4.1"
-     trace_span_id: Optional[str] = None
-     # API Key will be "" until user calls client.run_eval(), then API Key will be set
-     override: Optional[bool] = False
-     append: Optional[bool] = False
-
-     def model_dump(self, **kwargs):
-         data = super().model_dump(**kwargs)
-
-         data["scorers"] = [
-             scorer.model_dump() for scorer in self.scorers
-         ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
-
-         return data
-
-     @field_validator("examples")
-     def validate_examples(cls, v):
-         if not v:
-             raise ValueError("Examples cannot be empty.")
-         return v
-
-     @field_validator("scorers", mode="before")
-     def validate_scorers(cls, v):
-         if not v:
-             raise ValueError("Scorers cannot be empty.")
-         if not all(
-             isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
-             for scorer in v
-         ):
-             raise ValueError(
-                 "All scorers must be of type BaseScorer or APIScorerConfig."
-             )
-         return v
-
-     @field_validator("model")
-     def validate_model(cls, v, values):
-         if not v:
-             raise ValueError("Model cannot be empty.")
-
-         # Check if model is string or list of strings
-         if isinstance(v, str):
-             if v not in ACCEPTABLE_MODELS:
-                 raise ValueError(
-                     f"Model name {v} not recognized. Please select a valid model name.)"
-                 )
-             return v
-
-     class Config:
-         arbitrary_types_allowed = True
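For context, a minimal sketch of how the removed EvaluationRun was constructed. The Example fields follow the JSON layout documented in the deleted dataset.py above, `some_scorer` is a placeholder for any BaseScorer or APIScorerConfig instance, and both list validators above reject empty lists:

    from judgeval.data import Example
    from judgeval.evaluation_run import EvaluationRun  # module removed in 0.23.0

    run = EvaluationRun(
        project_name="my-project",
        eval_name="smoke-test",
        examples=[Example(input="What is 2 + 2?", actual_output="4")],
        scorers=[some_scorer],  # placeholder: any BaseScorer / APIScorerConfig instance
        model="gpt-4.1",        # must appear in ACCEPTABLE_MODELS, per validate_model above
    )
    payload = run.model_dump()  # scorers are serialized manually, as noted in the code above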