judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/constants.py CHANGED
@@ -41,18 +41,21 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
- JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
- JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+ JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
- JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
+ JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
  JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
- JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_eval_queue/"
+ JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
+ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
  # RabbitMQ
  RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
  RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
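All of these endpoints hang off ROOT_API, so the routes added in 0.0.27 pick up whatever JUDGMENT_API_URL points at. A minimal sketch (the self-hosted URL below is hypothetical); the variable must be set before judgeval.constants is first imported, since ROOT_API is read at import time:

    import os

    # Hypothetical self-hosted deployment; set before importing judgeval.constants
    os.environ["JUDGMENT_API_URL"] = "https://judgment.example.internal"

    from judgeval import constants

    # New endpoints introduced in 0.0.27, all derived from ROOT_API
    print(constants.JUDGMENT_DATASETS_DELETE_API_URL)       # .../datasets/delete/
    print(constants.JUDGMENT_DATASETS_INSERT_API_URL)       # .../datasets/insert_examples/
    print(constants.JUDGMENT_PROJECT_CREATE_API_URL)        # .../projects/add/
    print(constants.JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL) # .../add_to_run_eval_queue/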
judgeval/data/__init__.py CHANGED
@@ -1,13 +1,10 @@
  from judgeval.data.example import Example, ExampleParams
- from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result

  __all__ = [
      "Example",
      "ExampleParams",
-     "ProcessExample",
-     "create_process_example",
      "ScorerData",
      "create_scorer_data",
      "ScoringResult",
@@ -90,9 +90,18 @@ class EvalDataset:
      def add_from_csv(
          self,
          file_path: str,
+         header_mapping: dict,
+         primary_delimiter: str = ",",
+         secondary_delimiter: str = ";"
      ) -> None:
          """
          Add Examples from a CSV file.
+
+         Args:
+             file_path (str): Path to the CSV file
+             header_mapping (dict): Dictionary mapping Example headers to custom headers
+             primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
+             secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
          """
          try:
              import pandas as pd
@@ -102,9 +111,10 @@ class EvalDataset:
              )

          # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-         df = pd.read_csv(file_path, dtype={'trace_id': str})
+         df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
          """
-         Expect the CSV to have headers
+         The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
+         Available headers for Example objects are as follows:

          "input", "actual_output", "expected_output", "context", \
          "retrieval_context", "additional_metadata", "tools_called", \
@@ -113,35 +123,48 @@

          We want to collect the examples separately which can
          be determined by the "example" column. If the value is True, then it is an
-         example
+         example, and we expect the `input` and `actual_output` fields to be non-null.

-         We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
-         This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
+         We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
+         This can be adjusted using the `secondary_delimiter` parameter.
          """
          examples = []
-
+
+         def process_csv_row(value, header):
+             """
+             Maps a singular value in the CSV file to the appropriate type based on the header.
+             If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
+             """
+             # check that the CSV value is not null for entry
+             null_replacement = dict() if header == 'additional_metadata' else None
+             if pd.isna(value) or value == '':
+                 return null_replacement
+             try:
+                 value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
+             except (ValueError, SyntaxError):
+                 value = str(value)
+             if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
+                 # attempt to split the value by the secondary delimiter
+                 value = value.split(secondary_delimiter)
+
+             return value
+
          for _, row in df.iterrows():
              data = {
-                 "input": row["input"],
-                 "actual_output": row["actual_output"] if pd.notna(row["actual_output"]) else None,
-                 "expected_output": row["expected_output"] if pd.notna(row["expected_output"]) else None,
-                 "context": row["context"].split(";") if pd.notna(row["context"]) else [],
-                 "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                 "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                 "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                 "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                 "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
-                 "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
+                 header: process_csv_row(
+                     row[header_mapping[header]], header
+                 )
+                 for header in header_mapping
              }
-             if row["example"]:
-                 data["name"] = row["name"] if pd.notna(row["name"]) else None
+             if "example" in header_mapping and row[header_mapping["example"]]:
+                 if "name" in header_mapping:
+                     data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
                  # every Example has `input` and `actual_output` fields
                  if data["input"] is not None and data["actual_output"] is not None:
                      e = Example(**data)
                      examples.append(e)
                  else:
                      raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-

          for e in examples:
              self.add_example(e)
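add_from_csv no longer assumes fixed column names; the caller now describes the CSV schema through header_mapping and can override both delimiters. A hedged usage sketch, assuming `dataset` is an EvalDataset and with CSV column names made up for illustration:

    # Keys are Example fields, values are the CSV's own column names (hypothetical)
    header_mapping = {
        "input": "question",
        "actual_output": "model_answer",
        "expected_output": "reference_answer",
        "retrieval_context": "chunks",   # list field, split on secondary_delimiter
        "example": "is_example",         # rows flagged truthy become Examples
        "name": "case_name",
    }

    dataset.add_from_csv(
        "eval_cases.csv",
        header_mapping=header_mapping,
        primary_delimiter=",",    # passed to pandas.read_csv as sep
        secondary_delimiter=";",  # splits list-valued cells such as retrieval_context
    )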
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
  from judgeval.constants import (
      JUDGMENT_DATASETS_PUSH_API_URL,
      JUDGMENT_DATASETS_PULL_API_URL,
-     JUDGMENT_DATASETS_PULL_ALL_API_URL,
-     JUDGMENT_DATASETS_EDIT_API_URL,
+     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+     JUDGMENT_DATASETS_DELETE_API_URL,
+     JUDGMENT_DATASETS_INSERT_API_URL,
      JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
  from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
      def create_dataset(self) -> EvalDataset:
          return EvalDataset(judgment_api_key=self.judgment_api_key)

-     def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+     def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
          debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
          if overwrite:
              warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
                  total=100,
              )
              content = {
-                 "alias": alias,
+                 "dataset_alias": alias,
+                 "project_name": project_name,
                  "examples": [e.to_dict() for e in dataset.examples],
                  "overwrite": overwrite,
              }
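Datasets are now pushed into a named project, and the payload field is dataset_alias rather than alias. A minimal sketch, assuming `client` is an already-constructed EvalDatasetClient:

    dataset = client.create_dataset()
    # ... dataset.add_example(...) / dataset.add_from_csv(...) ...

    # 0.0.25: client.push(dataset, alias="qa-regression")
    # 0.0.27: a project name is required
    client.push(dataset, alias="qa-regression", project_name="my-project", overwrite=False)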
@@ -88,7 +90,7 @@ class EvalDatasetClient:
              )
              return True

-     def pull(self, alias: str) -> EvalDataset:
+     def pull(self, alias: str, project_name: str) -> EvalDataset:
          debug(f"Pulling dataset with alias '{alias}'")
          """
          Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
          Mock request:
          {
              "alias": alias,
-             "user_id": user_id
+             "project_name": project_name
          }
          ==>
          {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
                  total=100,
              )
              request_body = {
-                 "alias": alias,
+                 "dataset_alias": alias,
+                 "project_name": project_name
              }

              try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:

              info(f"Successfully pulled dataset with alias '{alias}'")
              payload = response.json()
+
              dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-             dataset._alias = payload.get("_alias")
-             dataset._id = payload.get("_id")
+             dataset._alias = payload.get("alias")
+             dataset._id = payload.get("id")
              progress.update(
                  task_id,
                  description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
              )

              return dataset
+
+     def delete(self, alias: str, project_name: str) -> bool:
+         with Progress(
+             SpinnerColumn(style="rgb(106,0,255)"),
+             TextColumn("[progress.description]{task.description}"),
+             transient=False,
+         ) as progress:
+             task_id = progress.add_task(
+                 f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                 total=100,
+             )
+             request_body = {
+                 "dataset_alias": alias,
+                 "project_name": project_name
+             }

-     def pull_all_user_dataset_stats(self) -> dict:
-         debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+             try:
+                 response = requests.post(
+                     JUDGMENT_DATASETS_DELETE_API_URL,
+                     json=request_body,
+                     headers={
+                         "Content-Type": "application/json",
+                         "Authorization": f"Bearer {self.judgment_api_key}",
+                         "X-Organization-Id": self.organization_id
+                     },
+                     verify=True
+                 )
+                 response.raise_for_status()
+             except requests.exceptions.RequestException as e:
+                 error(f"Error deleting dataset: {str(e)}")
+                 raise
+
+             return True
+
+     def pull_project_dataset_stats(self, project_name: str) -> dict:
+         debug(f"Pulling project datasets stats for project_name: {project_name}'")
          """
-         Pulls the user datasets stats from Judgment platform
+         Pulls the project datasets stats from Judgment platform

          Mock request:
          {
-             "user_id": user_id
+             "project_name": project_name
          }
          ==>
          {
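pull is now project-scoped, delete is new, and the old pull_all_user_dataset_stats is replaced by a per-project stats call. A hedged sketch with the same assumed `client`:

    # Pull a dataset from a specific project
    dataset = client.pull(alias="qa-regression", project_name="my-project")
    print(len(dataset.examples), dataset._alias)

    # New in 0.0.27: delete a dataset by alias within a project
    client.delete(alias="qa-regression", project_name="my-project")

    # Replaces pull_all_user_dataset_stats(); stats are now fetched per project
    stats = client.pull_project_dataset_stats("my-project")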
@@ -177,11 +214,12 @@ class EvalDatasetClient:
                  total=100,
              )
              request_body = {
+                 "project_name": project_name
              }

              try:
                  response = requests.post(
-                     JUDGMENT_DATASETS_PULL_ALL_API_URL,
+                     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
                      json=request_body,
                      headers={
                          "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:

              return payload

-     def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+     def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
          """
          Edits the dataset on Judgment platform by adding new examples

@@ -213,7 +251,7 @@ class EvalDatasetClient:
          {
              "alias": alias,
              "examples": [...],
-             "judgment_api_key": self.judgment_api_key
+             "project_name": project_name
          }
          """
          with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
              )

              content = {
-                 "alias": alias,
+                 "dataset_alias": alias,
                  "examples": [e.to_dict() for e in examples],
+                 "project_name": project_name
              }

              try:
                  response = requests.post(
-                     JUDGMENT_DATASETS_EDIT_API_URL,
+                     JUDGMENT_DATASETS_INSERT_API_URL,
                      json=content,
                      headers={
                          "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
              info(f"Successfully edited dataset '{alias}'")
              return True

-     def export_jsonl(self, alias: str) -> requests.Response:
+     def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
          """Export dataset in JSONL format from Judgment platform"""
          debug(f"Exporting dataset with alias '{alias}' as JSONL")
          with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
              try:
                  response = requests.post(
                      JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-                     json={"alias": alias},
+                     json={"dataset_alias": alias, "project_name": project_name},
                      headers={
                          "Content-Type": "application/json",
                          "Authorization": f"Bearer {self.judgment_api_key}",
judgeval/data/result.py CHANGED
@@ -1,10 +1,11 @@
  from dataclasses import dataclass
  from typing import List, Union, Optional, Dict, Any, Union
+ from judgeval.common.logger import debug, error
+ from pydantic import BaseModel
+ from judgeval.data import ScorerData, Example

- from judgeval.data import ScorerData, ProcessExample

- @dataclass
- class ScoringResult:
+ class ScoringResult(BaseModel):
      """
      A ScoringResult contains the output of one or more scorers applied to a single example.
      Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
          success (bool): Whether the evaluation was successful.
              This means that all scorers applied to this example returned a success.
          scorer_data (List[ScorerData]): The scorers data for the evaluated example
-         input (Optional[str]): The input to the example
-         actual_output (Optional[str]): The actual output of the example
-         expected_output (Optional[str]): The expected output of the example
-         context (Optional[List[str]]): The context of the example
-         retrieval_context (Optional[List[str]]): The retrieval context of the example
-         additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
-         tools_called (Optional[List[str]]): The tools called by the example
-         expected_tools (Optional[List[str]]): The expected tools of the example
-         trace_id (Optional[str]): The trace id of the example
+         data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)

      """
      # Fields for scoring outputs
      success: bool # used for unit testing
      scorers_data: Union[List[ScorerData], None]
+     name: Optional[str] = None

-     # Inputs from the original example
-     input: Optional[str] = None
-     actual_output: Optional[Union[str, List[str]]] = None
-     expected_output: Optional[Union[str, List[str]]] = None
-     context: Optional[List[str]] = None
-     retrieval_context: Optional[List[str]] = None
-     additional_metadata: Optional[Dict[str, Any]] = None
-     tools_called: Optional[List[str]] = None
-     expected_tools: Optional[List[str]] = None
+     # The original example object that was used to create the ScoringResult
+     data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
      trace_id: Optional[str] = None

-     example_id: Optional[str] = None
-     eval_run_name: Optional[str] = None
+     # Additional fields for internal use
+     run_duration: Optional[float] = None
+     evaluation_cost: Optional[float] = None

      def to_dict(self) -> dict:
          """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
          return {
              "success": self.success,
              "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
-             "input": self.input,
-             "actual_output": self.actual_output,
-             "expected_output": self.expected_output,
-             "context": self.context,
-             "retrieval_context": self.retrieval_context,
-             "additional_metadata": self.additional_metadata,
-             "tools_called": self.tools_called,
-             "expected_tools": self.expected_tools,
-             "trace_id": self.trace_id,
-             "example_id": self.example_id
+             "data_object": self.data_object.to_dict() if self.data_object else None,
          }
-
+
      def __str__(self) -> str:
          return f"ScoringResult(\
              success={self.success}, \
              scorer_data={self.scorers_data}, \
-             input={self.input}, \
-             actual_output={self.actual_output}, \
-             expected_output={self.expected_output}, \
-             context={self.context}, \
-             retrieval_context={self.retrieval_context}, \
-             additional_metadata={self.additional_metadata}, \
-             tools_called={self.tools_called}, \
-             expected_tools={self.expected_tools}, \
-             trace_id={self.trace_id})"
+             data_object={self.data_object}, \
+             run_duration={self.run_duration}, \
+             evaluation_cost={self.evaluation_cost})"


  def generate_scoring_result(
-     process_example: ProcessExample,
+     example: Example,
+     success: bool,
+     scorers_data: List[ScorerData],
+     run_duration: float,
  ) -> ScoringResult:
      """
      Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
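ScoringResult is now a pydantic model that carries the original Example as data_object instead of flattening its fields, plus run_duration and evaluation_cost for internal use. A hedged sketch of constructing and serializing one, assuming Example accepts the keyword fields that add_from_csv above builds:

    from judgeval.data import Example, ScoringResult

    example = Example(input="What is 2 + 2?", actual_output="4")

    result = ScoringResult(
        success=True,
        scorers_data=None,        # would normally hold a List[ScorerData]
        name="arithmetic-check",
        data_object=example,      # the full Example travels with the result now
        run_duration=0.42,
    )

    # to_dict() serializes the nested Example via its own to_dict()
    print(result.to_dict()["data_object"])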
@@ -83,16 +59,18 @@ def generate_scoring_result(
      When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
      At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
      """
-     return ScoringResult(
-         success=process_example.success,
-         scorers_data=process_example.scorers_data,
-         input=process_example.input,
-         actual_output=process_example.actual_output,
-         expected_output=process_example.expected_output,
-         context=process_example.context,
-         retrieval_context=process_example.retrieval_context,
-         additional_metadata=process_example.additional_metadata,
-         tools_called=process_example.tools_called,
-         expected_tools=process_example.expected_tools,
-         trace_id=process_example.trace_id
+     if example.name is not None:
+         name = example.name
+     else:
+         name = "Test Case Placeholder"
+         debug(f"No name provided for example, using default name: {name}")
+     debug(f"Creating ScoringResult for: {name}")
+     scoring_result = ScoringResult(
+         name=name,
+         data_object=example,
+         success=success,
+         scorers_data=scorers_data,
+         run_duration=run_duration,
+         evaluation_cost=None,
      )
+     return scoring_result
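generate_scoring_result no longer consumes a ProcessExample; the caller passes the Example plus the aggregate success flag, scorer data, and run duration directly. A minimal sketch under the same assumptions as above:

    # 0.0.25: generate_scoring_result(process_example)
    # 0.0.27:
    result = generate_scoring_result(
        example=example,     # falls back to the name "Test Case Placeholder" if example.name is None
        success=True,
        scorers_data=[],     # List[ScorerData] produced by the scorers
        run_duration=0.42,
    )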
@@ -146,16 +146,17 @@ class JudgevalCallbackHandler(BaseCallbackHandler):

          self.start_span("LangGraph", span_type="Main Function")

-         node = metadata.get("langgraph_node")
-         if node != None and node != self.previous_node:
-             self.start_span(node, span_type="node")
-             self.executed_node_tools.append(node)
-             self.executed_nodes.append(node)
-             self.trace_client.record_input({
-                 'args': inputs,
-                 'kwargs': kwargs
-             })
-             self.previous_node = node
+         metadata = kwargs.get("metadata", {})
+         if node := metadata.get("langgraph_node"):
+             if node != self.previous_node:
+                 # Track node execution
+                 self.trace_client.visited_nodes.append(node)
+                 self.trace_client.executed_node_tools.append(node)
+                 self.trace_client.record_input({
+                     'args': inputs,
+                     'kwargs': kwargs
+                 })
+                 self.previous_node = node

      def on_chain_end(
          self,
@@ -198,8 +199,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
      ):
          name = serialized["name"]
          self.start_span(name, span_type="tool")
-         self.executed_node_tools.append(f"{self.previous_node}:{name}")
-         self.executed_tools.append(name)
+         if name:
+             # Track tool execution
+             self.trace_client.executed_tools.append(name)
+             node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
+             self.trace_client.executed_node_tools.append(node_tool)
          self.trace_client.record_input({
              'args': input_str,
              'kwargs': kwargs
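The handler now records visited nodes and executed tools on the trace client rather than on itself, and guards against a missing previous node. How the handler is constructed is not shown in this diff, so the wiring below is an assumption: a standard LangChain/LangGraph callbacks list.

    # Assumed wiring - the handler's constructor arguments are not shown in this diff
    handler = JudgevalCallbackHandler(trace_client)  # hypothetical construction
    graph.invoke({"messages": [...]}, config={"callbacks": [handler]})

    # After the run, node/tool tracking lives on the trace client (per this diff)
    print(trace_client.visited_nodes)
    print(trace_client.executed_node_tools)
    print(trace_client.executed_tools)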