judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/api_example.py
@@ -1,111 +0,0 @@
- from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, Field, ConfigDict, model_validator
-
- from judgeval.data.example import Example
- from judgeval.data.scorer_data import ScorerData
- from judgeval.common.logger import debug, error
-
- class ProcessExample(BaseModel):
-     """
-     ProcessExample is an `Example` object that contains intermediate information
-     about an undergoing evaluation on the original `Example`. It is used purely for
-     internal operations and keeping track of the evaluation process.
-     """
-     name: str
-     input: Optional[str] = None
-     actual_output: Optional[str] = None
-     expected_output: Optional[str] = None
-     context: Optional[list] = None
-     retrieval_context: Optional[list] = None
-     tools_called: Optional[list] = None
-     expected_tools: Optional[list] = None
-
-     # make these optional, not all test cases in a conversation will be evaluated
-     success: Optional[bool] = None
-     scorers_data: Optional[List[ScorerData]] = None
-     run_duration: Optional[float] = None
-     evaluation_cost: Optional[float] = None
-
-     order: Optional[int] = None
-     # These should map 1 to 1 from golden
-     additional_metadata: Optional[Dict] = None
-     comments: Optional[str] = None
-     trace_id: Optional[str] = None
-     model_config = ConfigDict(arbitrary_types_allowed=True)
-
-     def update_scorer_data(self, scorer_data: ScorerData):
-         """
-         Updates scorer data field of test case after the scorers have been
-         evaluated on this test case.
-         """
-         debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-         # self.scorers_data is a list of ScorerData objects that contain the
-         # evaluation results of each scorer on this test case
-         if self.scorers_data is None:
-             self.scorers_data = [scorer_data]
-         else:
-             self.scorers_data.append(scorer_data)
-
-         if self.success is None:
-             # self.success will be None when it is a message
-             # in that case we will be setting success for the first time
-             self.success = scorer_data.success
-         else:
-             if scorer_data.success is False:
-                 debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                 self.success = False
-
-     def update_run_duration(self, run_duration: float):
-         self.run_duration = run_duration
-
-     @model_validator(mode="before")
-     def check_input(cls, values: Dict[str, Any]):
-         input = values.get("input")
-         actual_output = values.get("actual_output")
-
-         if (input is None or actual_output is None):
-             error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-             raise ValueError(
-                 "'input' and 'actual_output' must be provided."
-             )
-
-         return values
-
-
- def create_process_example(
-     example: Example,
- ) -> ProcessExample:
-     """
-     When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-     This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-     """
-     success = True
-     if example.name is not None:
-         name = example.name
-     else:
-         name = "Test Case Placeholder"
-         debug(f"No name provided for example, using default name: {name}")
-     order = None
-     scorers_data = []
-
-     debug(f"Creating ProcessExample for: {name}")
-     process_ex = ProcessExample(
-         name=name,
-         input=example.input,
-         actual_output=example.actual_output,
-         expected_output=example.expected_output,
-         context=example.context,
-         retrieval_context=example.retrieval_context,
-         tools_called=example.tools_called,
-         expected_tools=example.expected_tools,
-         success=success,
-         scorers_data=scorers_data,
-         run_duration=None,
-         evaluation_cost=None,
-         order=order,
-         additional_metadata=example.additional_metadata,
-         trace_id=example.trace_id
-     )
-     return process_ex
-
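Note: for orientation, a minimal usage sketch of the removed 0.0.11 module above. The Example fields and the ScorerData type come from elsewhere in the old package; the literal values are illustrative only.

# Sketch only: exercises the ProcessExample helpers removed in this diff (judgeval 0.0.11).
from judgeval.data import Example
from judgeval.data.api_example import create_process_example

# The model validator above requires both `input` and `actual_output`.
example = Example(input="What is 2 + 2?", actual_output="4", expected_output="4")

process_ex = create_process_example(example)   # wraps the Example for progress tracking
process_ex.update_run_duration(0.42)           # seconds spent evaluating this example

# As each scorer finishes, its ScorerData result would be appended; overall
# success flips to False as soon as any scorer reports success=False:
#   process_ex.update_scorer_data(scorer_data)
print(process_ex.success, process_ex.run_duration)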
judgeval/data/datasets/__init__.py
@@ -1,5 +0,0 @@
- from judgeval.data.datasets.dataset import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
-
- __all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
judgeval/data/datasets/dataset.py
@@ -1,286 +0,0 @@
- import ast
- import csv
- import datetime
- import json
- from dataclasses import dataclass, field
- import os
- from typing import List, Optional, Union, Literal
-
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
- from judgeval.common.logger import debug, error, warning, info
-
- @dataclass
- class EvalDataset:
-     ground_truths: List[GroundTruthExample]
-     examples: List[Example]
-     _alias: Union[str, None] = field(default=None)
-     _id: Union[str, None] = field(default=None)
-     judgment_api_key: str = field(default="")
-
-     def __init__(self,
-                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
-                  ground_truths: List[GroundTruthExample] = [],
-                  examples: List[Example] = [],
-                  ):
-         debug(f"Initializing EvalDataset with {len(ground_truths)} ground truths and {len(examples)} examples")
-         if not judgment_api_key:
-             warning("No judgment_api_key provided")
-         self.ground_truths = ground_truths
-         self.examples = examples
-         self._alias = None
-         self._id = None
-         self.judgment_api_key = judgment_api_key
-
-
-     def add_from_json(self, file_path: str) -> None:
-         debug(f"Loading dataset from JSON file: {file_path}")
-         """
-         Adds examples and ground truths from a JSON file.
-
-         The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths".
-         The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
-
-         The JSON file is expected to have the following format:
-         {
-             "ground_truths": [
-                 {
-                     "input": "test input",
-                     "actual_output": null,
-                     "expected_output": "expected output",
-                     "context": [
-                         "context1"
-                     ],
-                     "retrieval_context": [
-                         "retrieval1"
-                     ],
-                     "additional_metadata": {
-                         "key": "value"
-                     },
-                     "comments": "test comment",
-                     "tools_called": [
-                         "tool1"
-                     ],
-                     "expected_tools": [
-                         "tool1"
-                     ],
-                     "source_file": "test.py",
-                     "trace_id": "094121"
-                 }
-             ],
-             "examples": [
-                 {
-                     "input": "test input",
-                     "actual_output": "test output",
-                     "expected_output": "expected output",
-                     "context": [
-                         "context1",
-                         "context2"
-                     ],
-                     "retrieval_context": [
-                         "retrieval1"
-                     ],
-                     "additional_metadata": {
-                         "key": "value"
-                     },
-                     "tools_called": [
-                         "tool1"
-                     ],
-                     "expected_tools": [
-                         "tool1",
-                         "tool2"
-                     ],
-                     "name": "test example",
-                     "example_id": null,
-                     "timestamp": "20241230_160117",
-                     "trace_id": "123"
-                 }
-             ]
-         }
-         """
-         try:
-             with open(file_path, "r") as file:
-                 payload = json.load(file)
-                 examples = payload.get("examples", [])
-                 ground_truths = payload.get("ground_truths", [])
-         except FileNotFoundError:
-             error(f"JSON file not found: {file_path}")
-             raise FileNotFoundError(f"The file {file_path} was not found.")
-         except json.JSONDecodeError:
-             error(f"Invalid JSON file: {file_path}")
-             raise ValueError(f"The file {file_path} is not a valid JSON file.")
-
-         info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from JSON")
-         new_examples = [Example(**e) for e in examples]
-         for e in new_examples:
-             self.add_example(e)
-
-         new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
-         for g in new_ground_truths:
-             self.add_ground_truth(g)
-
-     def add_from_csv(
-         self,
-         file_path: str,
-     ) -> None:
-         """
-         Add Examples and GroundTruthExamples from a CSV file.
-         """
-         try:
-             import pandas as pd
-         except ModuleNotFoundError:
-             raise ModuleNotFoundError(
-                 "Please install pandas to use this method. 'pip install pandas'"
-             )
-
-         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-         df = pd.read_csv(file_path, dtype={'trace_id': str})
-         """
-         Expect the CSV to have headers
-
-         "input", "actual_output", "expected_output", "context", \
-         "retrieval_context", "additional_metadata", "tools_called", \
-         "expected_tools", "name", "comments", "source_file", "example", \
-         "trace_id"
-
-         We want to collect the examples and ground truths separately which can
-         be determined by the "example" column. If the value is True, then it is an
-         example, otherwise it is a ground truth.
-
-         We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
-         This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
-         """
-         examples, ground_truths = [], []
-
-         for _, row in df.iterrows():
-             data = {
-                 "input": row["input"],
-                 "actual_output": row["actual_output"] if pd.notna(row["actual_output"]) else None,
-                 "expected_output": row["expected_output"] if pd.notna(row["expected_output"]) else None,
-                 "context": row["context"].split(";") if pd.notna(row["context"]) else [],
-                 "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                 "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                 "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                 "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                 "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
-             }
-             if row["example"]:
-                 data["name"] = row["name"] if pd.notna(row["name"]) else None
-                 # every Example has `input` and `actual_output` fields
-                 if data["input"] is not None and data["actual_output"] is not None:
-                     e = Example(**data)
-                     examples.append(e)
-                 else:
-                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-             else:
-                 # GroundTruthExample has `comments` and `source_file` fields
-                 data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
-                 data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
-                 # every GroundTruthExample has `input` field
-                 if data["input"] is not None:
-                     g = GroundTruthExample(**data)
-                     ground_truths.append(g)
-                 else:
-                     raise ValueError("Every ground truth must have an 'input' field.")
-
-         for e in examples:
-             self.add_example(e)
-
-         for g in ground_truths:
-             self.add_ground_truth(g)
-
-     def add_example(self, e: Example) -> None:
-         self.examples = self.examples + [e]
-         # TODO if we need to add rank, then we need to do it here
-
-     def add_ground_truth(self, g: GroundTruthExample) -> None:
-         self.ground_truths = self.ground_truths + [g]
-
-     def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
-         """
-         Saves the dataset as a file. Save both the ground truths and examples.
-
-         Args:
-             file_type (Literal["json", "csv"]): The file type to save the dataset as.
-             dir_path (str): The directory path to save the file to.
-             save_name (str, optional): The name of the file to save. Defaults to None.
-         """
-         if not os.path.exists(dir_path):
-             os.makedirs(dir_path)
-         file_name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") if save_name is None else save_name
-         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
-         if file_type == "json":
-             with open(complete_path, "w") as file:
-                 json.dump(
-                     {
-                         "ground_truths": [g.to_dict() for g in self.ground_truths],
-                         "examples": [e.to_dict() for e in self.examples],
-                     },
-                     file,
-                     indent=4,
-                 )
-         elif file_type == "csv":
-             with open(complete_path, "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerow([
-                     "input", "actual_output", "expected_output", "context", \
-                     "retrieval_context", "additional_metadata", "tools_called", \
-                     "expected_tools", "name", "comments", "source_file", "example", \
-                     "trace_id"
-                 ])
-                 for e in self.examples:
-                     writer.writerow(
-                         [
-                             e.input,
-                             e.actual_output,
-                             e.expected_output,
-                             ";".join(e.context),
-                             ";".join(e.retrieval_context),
-                             e.additional_metadata,
-                             ";".join(e.tools_called),
-                             ";".join(e.expected_tools),
-                             e.name,
-                             None, # Example does not have comments
-                             None, # Example does not have source file
-                             True, # Adding an Example
-                             e.trace_id
-                         ]
-                     )
-
-                 for g in self.ground_truths:
-                     writer.writerow(
-                         [
-                             g.input,
-                             g.actual_output,
-                             g.expected_output,
-                             ";".join(g.context),
-                             ";".join(g.retrieval_context),
-                             g.additional_metadata,
-                             ";".join(g.tools_called),
-                             ";".join(g.expected_tools),
-                             None, # GroundTruthExample does not have name
-                             g.comments,
-                             g.source_file,
-                             False, # Adding a GroundTruthExample, not an Example
-                             g.trace_id
-                         ]
-                     )
-         else:
-             ACCEPTABLE_FILE_TYPES = ["json", "csv"]
-             raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
-
-     def __iter__(self):
-         return iter(self.examples)
-
-     def __len__(self):
-         return len(self.examples)
-
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}("
-             f"ground_truths={self.ground_truths}, "
-             f"examples={self.examples}, "
-             f"_alias={self._alias}, "
-             f"_id={self._id}"
-             f")"
-         )
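Note: a short sketch of the removed EvalDataset workflow, assuming the 0.0.11 layout shown above. The file paths, API key, and example values are placeholders.

# Sketch only: the EvalDataset class removed in this diff (judgeval 0.0.11).
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset

dataset = EvalDataset(judgment_api_key="YOUR_JUDGMENT_API_KEY")  # placeholder key
dataset.add_example(Example(input="test input", actual_output="test output"))

# Bulk-load more records using the JSON layout documented in add_from_json above,
# then persist examples and ground truths together.
dataset.add_from_json("existing_dataset.json")
dataset.save_as("json", dir_path="./datasets", save_name="my_dataset")
print(len(dataset))  # __len__ counts examples only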
judgeval/data/datasets/eval_dataset_client.py
@@ -1,193 +0,0 @@
-
- from typing import Optional
- import requests
- from rich.progress import Progress, SpinnerColumn, TextColumn
-
- from judgeval.common.logger import debug, error, warning, info
- from judgeval.constants import (
-     JUDGMENT_DATASETS_PUSH_API_URL,
-     JUDGMENT_DATASETS_PULL_API_URL,
-     JUDGMENT_DATASETS_PULL_ALL_API_URL
- )
- from judgeval.data import Example
- from judgeval.data.datasets import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
-
-
-
-
- class EvalDatasetClient:
-     def __init__(self, judgment_api_key: str):
-         self.judgment_api_key = judgment_api_key
-
-     def create_dataset(self) -> EvalDataset:
-         return EvalDataset(judgment_api_key=self.judgment_api_key)
-
-     def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
-         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-         if overwrite:
-             warning(f"Overwrite enabled for alias '{alias}'")
-         """
-         Pushes the dataset to Judgment platform
-
-         Mock request:
-         dataset = {
-             "alias": alias,
-             "ground_truths": [...],
-             "examples": [...],
-             "overwrite": overwrite
-         } ==>
-         {
-             "_alias": alias,
-             "_id": "..." # ID of the dataset
-         }
-         """
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                 total=100,
-             )
-             content = {
-                 "alias": alias,
-                 "ground_truths": [g.to_dict() for g in dataset.ground_truths],
-                 "examples": [e.to_dict() for e in dataset.examples],
-                 "overwrite": overwrite,
-                 "judgment_api_key": dataset.judgment_api_key
-             }
-             try:
-                 response = requests.post(
-                     JUDGMENT_DATASETS_PUSH_API_URL,
-                     json=content
-                 )
-                 if response.status_code == 500:
-                     error(f"Server error during push: {content.get('message')}")
-                     return False
-                 response.raise_for_status()
-             except requests.exceptions.HTTPError as err:
-                 if response.status_code == 422:
-                     error(f"Validation error during push: {err.response.json()}")
-                 else:
-                     error(f"HTTP error during push: {err}")
-
-             info(f"Successfully pushed dataset with alias '{alias}'")
-             payload = response.json()
-             dataset._alias = payload.get("_alias")
-             dataset._id = payload.get("_id")
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-             return True
-
-     def pull(self, alias: str) -> EvalDataset:
-         debug(f"Pulling dataset with alias '{alias}'")
-         """
-         Pulls the dataset from Judgment platform
-
-         Mock request:
-         {
-             "alias": alias,
-             "user_id": user_id
-         }
-         ==>
-         {
-             "ground_truths": [...],
-             "examples": [...],
-             "_alias": alias,
-             "_id": "..." # ID of the dataset
-         }
-         """
-         # Make a POST request to the Judgment API to get the dataset
-         dataset = self.create_dataset()
-
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             request_body = {
-                 "alias": alias,
-                 "judgment_api_key": self.judgment_api_key
-             }
-
-             try:
-                 response = requests.post(
-                     JUDGMENT_DATASETS_PULL_API_URL,
-                     json=request_body
-                 )
-                 response.raise_for_status()
-             except requests.exceptions.RequestException as e:
-                 error(f"Error pulling dataset: {str(e)}")
-                 raise
-
-             info(f"Successfully pulled dataset with alias '{alias}'")
-             payload = response.json()
-             dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-             dataset._alias = payload.get("_alias")
-             dataset._id = payload.get("_id")
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-
-         return dataset
-
-     def pull_all_user_dataset_stats(self) -> dict:
-         debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
-         """
-         Pulls the user datasets stats from Judgment platform
-
-         Mock request:
-         {
-             "user_id": user_id
-         }
-         ==>
-         {
-             "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
-             "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
-             ...
-         }
-         """
-         # Make a POST request to the Judgment API to get the dataset
-
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             request_body = {
-                 "judgment_api_key": self.judgment_api_key
-             }
-
-             try:
-                 response = requests.post(
-                     JUDGMENT_DATASETS_PULL_ALL_API_URL,
-                     json=request_body
-                 )
-                 response.raise_for_status()
-             except requests.exceptions.RequestException as e:
-                 error(f"Error pulling dataset: {str(e)}")
-                 raise
-
-             info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
-             payload = response.json()
-
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-
-         return payload
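Note: the corresponding client-side flow that the removed EvalDatasetClient provided, as a sketch. The alias and API key are placeholders; the endpoint URLs come from judgeval.constants in 0.0.11.

# Sketch only: push/pull against the Judgment platform as implemented in 0.0.11.
from judgeval.data.datasets import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="YOUR_JUDGMENT_API_KEY")
dataset = client.create_dataset()
# ... populate dataset.examples / dataset.ground_truths here ...

client.push(dataset, alias="qa-regression-set", overwrite=False)   # upload, returns bool
pulled = client.pull("qa-regression-set")                          # rebuilds an EvalDataset
stats = client.pull_all_user_dataset_stats()                       # per-dataset example counts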
judgeval/data/datasets/ground_truth.py
@@ -1,54 +0,0 @@
- from pydantic import BaseModel
- from typing import Optional, Dict, List
-
-
- class GroundTruthExample(BaseModel):
-     """
-     GroundTruthExample is the atomic unit of a `Dataset`. It is essentially the same
-     as an `Example`, but the `actual_output` field is optional to enable users to
-     run their workflow on the `input` field at test-time to evaluate their current
-     workflow's performance.
-     """
-     input: str
-     actual_output: Optional[str] = None
-     expected_output: Optional[str] = None
-     context: Optional[List[str]] = None
-     retrieval_context: Optional[List[str]] = None
-     additional_metadata: Optional[Dict] = None
-     comments: Optional[str] = None
-     tools_called: Optional[List[str]] = None
-     expected_tools: Optional[List[str]] = None
-     source_file: Optional[str] = None
-     trace_id: Optional[str] = None
-
-     def to_dict(self):
-         return {
-             "input": self.input,
-             "actual_output": self.actual_output,
-             "expected_output": self.expected_output,
-             "context": self.context,
-             "retrieval_context": self.retrieval_context,
-             "additional_metadata": self.additional_metadata,
-             "comments": self.comments,
-             "tools_called": self.tools_called,
-             "expected_tools": self.expected_tools,
-             "source_file": self.source_file,
-             "trace_id": self.trace_id,
-         }
-
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}("
-             f"input={self.input}, "
-             f"actual_output={self.actual_output}, "
-             f"expected_output={self.expected_output}, "
-             f"context={self.context}, "
-             f"retrieval_context={self.retrieval_context}, "
-             f"additional_metadata={self.additional_metadata}, "
-             f"comments={self.comments}, "
-             f"tools_called={self.tools_called}, "
-             f"expected_tools={self.expected_tools}, "
-             f"source_file={self.source_file}, "
-             f"trace_id={self.trace_id}"
-             f")"
-         )
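Note: finally, a small sketch of the removed GroundTruthExample, whose actual_output is deliberately left unset so the workflow under test can fill it in at evaluation time. The values are illustrative.

# Sketch only: GroundTruthExample as defined in 0.0.11 above.
from judgeval.data.datasets import GroundTruthExample

gt = GroundTruthExample(
    input="Summarize the refund policy.",
    expected_output="Refunds are issued within 30 days of purchase.",
    context=["Refund policy: customers may request a refund within 30 days."],
)
print(gt.to_dict()["actual_output"])  # None until the workflow produces an output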