judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/constants.py CHANGED
@@ -26,7 +26,8 @@ class APIScorer(str, Enum):
     JSON_CORRECTNESS = "json_correctness"
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
-
+    DERAILMENT = "derailment"
+
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -39,8 +40,10 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
+JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
-JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
@@ -54,7 +57,6 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
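For orientation, here is a small sketch (not part of the diff) showing how the reworked URL constants expand under the default host; it simply evaluates the f-strings above with JUDGMENT_API_URL unset:

    import os
    os.environ.pop("JUDGMENT_API_URL", None)  # fall back to the default api.judgmentlabs.ai host

    from judgeval.constants import (
        JUDGMENT_SEQUENCE_EVAL_API_URL,
        JUDGMENT_DATASETS_APPEND_API_URL,
        JUDGMENT_DATASETS_PULL_API_URL,
    )

    print(JUDGMENT_SEQUENCE_EVAL_API_URL)    # https://api.judgmentlabs.ai/evaluate_sequence/
    print(JUDGMENT_DATASETS_APPEND_API_URL)  # https://api.judgmentlabs.ai/datasets/insert_examples/
    print(JUDGMENT_DATASETS_PULL_API_URL)    # https://api.judgmentlabs.ai/datasets/pull_for_judgeval/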
judgeval/data/__init__.py CHANGED
@@ -1,12 +1,16 @@
 from judgeval.data.example import Example, ExampleParams
+from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.sequence import Sequence
 
 __all__ = [
     "Example",
     "ExampleParams",
+    "CustomExample",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "Sequence",
 ]
judgeval/data/custom_example.py ADDED
@@ -0,0 +1,18 @@
+from pydantic import BaseModel, Field
+from typing import Optional, Union, List, Dict, Any
+from uuid import uuid4
+
+class CustomExample(BaseModel):
+    input: Optional[Dict[str, Any]] = None
+    actual_output: Optional[Dict[str, Any]] = None
+    expected_output: Optional[Dict[str, Any]] = None
+    context: Optional[List[str]] = None
+    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
+    name: Optional[str] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
+    timestamp: Optional[str] = None
+    trace_id: Optional[str] = None
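Because CustomExample is now re-exported from judgeval.data (see the __init__.py change above), it can be constructed directly; a minimal sketch, noting that its input/actual_output/expected_output fields are dictionaries rather than strings:

    from judgeval.data import CustomExample

    example = CustomExample(
        input={"question": "What is the capital of France?"},
        actual_output={"answer": "Paris"},
        expected_output={"answer": "Paris"},
        name="capital-question",
    )
    print(example.example_id)  # auto-generated UUID string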
@@ -7,12 +7,13 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example
+from judgeval.data import Example, Sequence
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
+    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -21,11 +22,13 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
+        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
+        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -309,6 +312,7 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
+            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
@@ -6,13 +6,14 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_APPEND_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
     JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
     JUDGMENT_DATASETS_DELETE_API_URL,
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example, Sequence
 from judgeval.data.datasets import EvalDataset
 
 
@@ -70,9 +71,9 @@ class EvalDatasetClient:
                 },
                 verify=True
             )
-            if response.status_code == 500:
-                error(f"Server error during push: {content.get('message')}")
-                return False
+            if response.status_code != 200:
+                error(f"Server error during push: {response.json()}")
+                raise Exception(f"Server error during push: {response.json()}")
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
             if response.status_code == 422:
@@ -90,6 +91,64 @@ class EvalDatasetClient:
             )
         return True
 
+
+    def append(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        debug(f"Appending dataset with alias '{alias}'")
+        """
+        Appends the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "examples": [...],
+            "project_name": project_name
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..." # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "dataset_alias": alias,
+                "project_name": project_name,
+                "examples": [e.to_dict() for e in examples],
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_APPEND_API_URL,
+                    json=content,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                if response.status_code != 200:
+                    error(f"Server error during append: {response.json()}")
+                    raise Exception(f"Server error during append: {response.json()}")
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during append: {err.response.json()}")
+                else:
+                    error(f"HTTP error during append: {err}")
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+        return True
+
     def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
@@ -142,8 +201,8 @@ class EvalDatasetClient:
 
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
-
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
             dataset._alias = payload.get("alias")
             dataset._id = payload.get("id")
             progress.update(
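The new append() mirrors push(): it posts examples to JUDGMENT_DATASETS_APPEND_API_URL and, like push(), now raises on any non-200 response instead of returning False. A hedged usage sketch; the client's module path and constructor arguments are assumptions inferred from the headers it sends:

    from judgeval.data import Example
    from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient  # assumed import path

    client = EvalDatasetClient(judgment_api_key="...", organization_id="...")
    client.append(
        alias="my-dataset",
        examples=[Example(input="Hi", actual_output="Hello!")],
        project_name="my-project",
    )

    dataset = client.pull(alias="my-dataset", project_name="my-project")
    print(len(dataset.examples), len(dataset.sequences))  # sequences are now populated from the payload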
judgeval/data/example.py CHANGED
@@ -37,6 +37,7 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
+    sequence_order: Optional[int] = 0
 
     def __init__(self, **data):
         if 'example_id' not in data:
judgeval/data/result.py CHANGED
@@ -2,7 +2,8 @@ from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
 from judgeval.common.logger import debug, error
 from pydantic import BaseModel
-from judgeval.data import ScorerData, Example
+from judgeval.data import ScorerData, Example, CustomExample
+from judgeval.data.sequence import Sequence
 
 
 class ScoringResult(BaseModel):
@@ -23,7 +24,7 @@ class ScoringResult(BaseModel):
     name: Optional[str] = None
 
     # The original example object that was used to create the ScoringResult
-    data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
+    data_object: Optional[Union[Sequence, CustomExample, Example]] = None
     trace_id: Optional[str] = None
 
     # Additional fields for internal use
@@ -48,7 +49,7 @@ class ScoringResult(BaseModel):
 
 
 def generate_scoring_result(
-    example: Example,
+    data_object: Union[Example, Sequence],
     scorers_data: List[ScorerData],
     run_duration: float,
     success: bool,
@@ -59,15 +60,15 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if example.name is not None:
-        name = example.name
+    if data_object.name is not None:
+        name = data_object.name
     else:
         name = "Test Case Placeholder"
         debug(f"No name provided for example, using default name: {name}")
     debug(f"Creating ScoringResult for: {name}")
     scoring_result = ScoringResult(
         name=name,
-        data_object=example,
+        data_object=data_object,
         success=success,
         scorers_data=scorers_data,
         run_duration=run_duration,
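generate_scoring_result() now accepts a generic data_object, so a Sequence can flow through where only an Example was allowed before. A minimal sketch, assuming the parameters visible in the hunk are the full signature and that Example accepts input/actual_output keywords as in prior releases:

    from judgeval.data import Example, Sequence
    from judgeval.data.result import generate_scoring_result

    seq = Sequence(name="support-flow", items=[Example(input="Hi", actual_output="Hello!")])
    result = generate_scoring_result(
        data_object=seq,
        scorers_data=[],
        run_duration=0.42,
        success=True,
    )
    print(result.name)                        # "support-flow"
    print(type(result.data_object).__name__)  # Sequence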
judgeval/data/sequence.py ADDED
@@ -0,0 +1,55 @@
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing import List, Optional, Union, Any
+from judgeval.data.example import Example
+from judgeval.scorers import ScorerWrapper, JudgevalScorer
+from uuid import uuid4
+from datetime import datetime, timezone
+
+class Sequence(BaseModel):
+    """
+    A sequence is a list of either Examples or nested Sequence objects.
+    """
+    sequence_id: str = Field(default_factory=lambda: str(uuid4()))
+    name: Optional[str] = "Sequence"
+    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
+    items: List[Union["Sequence", Example]]
+    scorers: Optional[Any] = None
+    parent_sequence_id: Optional[str] = None
+    sequence_order: Optional[int] = 0
+    root_sequence_id: Optional[str] = None
+    inputs: Optional[str] = None
+    output: Optional[str] = None
+
+    @field_validator("scorers")
+    def validate_scorer(cls, v):
+        loaded_scorers = []
+        for scorer in v or []:
+            try:
+                if isinstance(scorer, ScorerWrapper):
+                    loaded_scorers.append(scorer.load_implementation())
+                else:
+                    loaded_scorers.append(scorer)
+            except Exception as e:
+                raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+        return loaded_scorers
+
+    @model_validator(mode="after")
+    def populate_sequence_metadata(self) -> "Sequence":
+        """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
+        # If root_sequence_id isn't already set, assign it to self
+        if self.root_sequence_id is None:
+            self.root_sequence_id = self.sequence_id
+
+        for idx, item in enumerate(self.items):
+            item.sequence_order = idx
+            if isinstance(item, Sequence):
+                item.parent_sequence_id = self.sequence_id
+                item.root_sequence_id = self.root_sequence_id
+                item.populate_sequence_metadata()
+        return self
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# Update forward references so that "Sequence" inside items is resolved.
+Sequence.model_rebuild()
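Because populate_sequence_metadata() runs as an after-validator, nesting sequences stamps parent/root ids and per-item order automatically at construction time; a small sketch of that propagation (the Example keyword fields are illustrative):

    from judgeval.data import Example, Sequence

    inner = Sequence(name="inner", items=[Example(input="a", actual_output="b")])
    outer = Sequence(name="outer", items=[inner, Example(input="c", actual_output="d")])

    print(outer.root_sequence_id == outer.sequence_id)    # True: the root points at itself
    print(inner.parent_sequence_id == outer.sequence_id)  # True: set by the outer validator
    print(inner.root_sequence_id == outer.sequence_id)    # True: root id is pushed down recursively
    print([item.sequence_order for item in outer.items])  # [0, 1]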
@@ -0,0 +1,44 @@
+
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any, Union
+from judgeval.data import Sequence
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
+
+
+class SequenceRun(BaseModel):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        sequences (List[Sequence]): The sequences to evaluate
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+        append (Optional[bool]): Whether to append to existing evaluation results
+    """
+
+    # The user will specify whether they want log_results when they call run_eval
+    log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+    organization_id: Optional[str] = None
+    project_name: Optional[str] = None
+    eval_name: Optional[str] = None
+    sequences: List[Sequence]
+    model: Union[str, List[str], JudgevalJudge]
+    aggregator: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+    trace_span_id: Optional[str] = None
+    append: Optional[bool] = False
+    # API Key will be "" until user calls client.run_eval(), then API Key will be set
+    judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
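SequenceRun parallels EvaluationRun but carries sequences instead of examples. A hedged construction sketch; the module path and the judge model name are assumptions, since neither is shown in this diff:

    from judgeval.data import Example, Sequence
    from judgeval.data.sequence_run import SequenceRun  # assumed import path

    run = SequenceRun(
        project_name="my-project",
        eval_name="sequence-smoke-test",
        sequences=[Sequence(name="flow", items=[Example(input="Hi", actual_output="Hello!")])],
        model="gpt-4o",  # illustrative judge model
    )
    print(run.append, run.override)  # False False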
@@ -1,7 +1,7 @@
 from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, field_validator
 
-from judgeval.data import Example
+from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        examples (List[Example]): The examples to evaluate
+        examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -29,7 +29,7 @@ class EvaluationRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    examples: List[Example]
+    examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
@@ -38,6 +38,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    append: Optional[bool] = False
     rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
@@ -78,13 +79,17 @@ class EvaluationRun(BaseModel):
             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
         return v
 
-    @field_validator('examples')
+    @field_validator('examples', mode='before')
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-        for ex in v:
-            if not isinstance(ex, Example):
-                raise ValueError(f"Invalid type for Example: {type(ex)}")
+
+        first_type = type(v[0])
+        if first_type not in (Example, CustomExample):
+            raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
+        if not all(isinstance(ex, first_type) for ex in v):
+            raise ValueError("All examples must be of the same type, either all Example or all CustomExample.")
+
         return v
 
     @field_validator('scorers')
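The examples validator now runs in 'before' mode and enforces a homogeneous list: either all Example or all CustomExample. A sketch of the implied behavior; the import path, scorer, and model name are illustrative assumptions, not taken from this diff:

    from judgeval.data import Example, CustomExample
    from judgeval.evaluation_run import EvaluationRun    # assumed import path
    from judgeval.scorers import AnswerRelevancyScorer   # illustrative scorer

    scorers = [AnswerRelevancyScorer(threshold=0.5)]

    # Homogeneous list: accepted.
    EvaluationRun(examples=[Example(input="Hi", actual_output="Hello!")],
                  scorers=scorers, model="gpt-4o")

    # Mixed list: rejected by the new validator.
    try:
        EvaluationRun(examples=[Example(input="Hi", actual_output="Hello!"),
                                CustomExample(input={"q": "Hi"}, actual_output={"a": "Hello!"})],
                      scorers=scorers, model="gpt-4o")
    except ValueError as err:
        print(err)  # "All examples must be of the same type, either all Example or all CustomExample."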