judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.7.1.dist-info/RECORD +0 -82
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/{common/trainer → trainer}/trainable_model.py RENAMED

@@ -2,7 +2,7 @@ from fireworks import LLM
 from .config import TrainerConfig, ModelConfig
 from typing import Optional, Dict, Any, Callable
 from .console import _model_spinner_progress, _print_model_progress
-from judgeval.
+from judgeval.exceptions import JudgmentRuntimeError
 
 
 class TrainableModel:
@@ -14,6 +14,12 @@ class TrainableModel:
     abstracting away manual snapshot management from users.
     """
 
+    config: TrainerConfig
+    current_step: int
+    _current_model: LLM
+    _tracer_wrapper_func: Optional[Callable]
+    _base_model: LLM
+
     def __init__(self, config: TrainerConfig):
         """
         Initialize the TrainableModel.
@@ -24,13 +30,12 @@ class TrainableModel:
         try:
             self.config = config
             self.current_step = 0
-            self._current_model = None
             self._tracer_wrapper_func = None
 
             self._base_model = self._create_base_model()
             self._current_model = self._base_model
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(
                 f"Failed to initialize TrainableModel: {str(e)}"
             ) from e
 
@@ -80,7 +85,7 @@ class TrainableModel:
             _print_model_progress("Base model deployment ready")
             return base_model
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(
                 f"Failed to create and deploy base model '{self.config.base_model_name}': {str(e)}"
             ) from e
 
@@ -103,7 +108,7 @@ class TrainableModel:
             if self._tracer_wrapper_func:
                 self._tracer_wrapper_func(self._current_model)
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(
                 f"Failed to load and deploy trained model '{model_name}': {str(e)}"
             ) from e
 
@@ -150,7 +155,7 @@ class TrainableModel:
             if self._tracer_wrapper_func:
                 self._tracer_wrapper_func(self._current_model)
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(
                 f"Failed to advance to training step {step}: {str(e)}"
             ) from e
 
@@ -176,7 +181,7 @@ class TrainableModel:
                 accelerator_type=self.config.accelerator_type,
             )
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(
                 f"Failed to start reinforcement learning step {step + 1}: {str(e)}"
             ) from e
 
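The recurring change in trainable_model.py swaps each failure path over to the new `judgeval.exceptions.JudgmentRuntimeError` (the exception class previously raised is truncated in this view), keeping `from e` so the original error stays attached as `__cause__`. A minimal, self-contained sketch of that wrap-and-chain pattern, using a stand-in exception class and a simulated deployment failure:

```python
# Stand-in for judgeval.exceptions.JudgmentRuntimeError; the deployment call is simulated.
class JudgmentRuntimeError(RuntimeError):
    pass


def create_base_model(base_model_name: str):
    try:
        raise ConnectionError("deployment endpoint unreachable")  # simulated failure
    except Exception as e:
        # Wrap in a single judgeval-facing exception type, chaining the original error.
        raise JudgmentRuntimeError(
            f"Failed to create and deploy base model '{base_model_name}': {e}"
        ) from e


try:
    create_base_model("my-base-model")
except JudgmentRuntimeError as err:
    print(err)            # caller-facing message
    print(err.__cause__)  # original ConnectionError, preserved by `from e`
```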
judgeval/{common/trainer → trainer}/trainer.py RENAMED

@@ -1,15 +1,19 @@
 import asyncio
+import json
 import time
-from typing import Optional, Callable, Any, List, Union
+from typing import Optional, Callable, Any, List, Union, Dict
 from fireworks import Dataset
 from .config import TrainerConfig, ModelConfig
 from .trainable_model import TrainableModel
 from judgeval.tracer import Tracer
-from judgeval.
+from judgeval.tracer.exporters.store import SpanStore
+from judgeval.tracer.exporters import InMemorySpanExporter
+from judgeval.tracer.keys import AttributeKeys
+from judgeval import JudgmentClient
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.data import Example
 from .console import _spinner_progress, _print_progress, _print_progress_update
-from judgeval.
+from judgeval.exceptions import JudgmentRuntimeError
 
 
 class JudgmentTrainer:
@@ -39,20 +43,114 @@ class JudgmentTrainer:
         try:
             self.config = config
             self.tracer = tracer
-            self.tracer.show_trace_urls = False
             self.project_name = project_name or "judgment_training"
-
-            if trainable_model is None:
-                self.trainable_model = TrainableModel(self.config)
-            else:
-                self.trainable_model = trainable_model
+            self.trainable_model = trainable_model
 
             self.judgment_client = JudgmentClient()
+            self.span_store = SpanStore()
+            self.span_exporter = InMemorySpanExporter(self.span_store)
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(
                 f"Failed to initialize JudgmentTrainer: {str(e)}"
             ) from e
 
+    def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
+        """
+        Extract message history from spans in the span store for training purposes.
+
+        This method processes trace spans to reconstruct the conversation flow,
+        extracting messages in chronological order from LLM, user, and tool spans.
+
+        Returns:
+            List of message dictionaries with 'role' and 'content' keys
+        """
+        spans = self.span_store.get_all()
+        if not spans:
+            return []
+
+        messages = []
+        first_found = False
+
+        for span in sorted(spans, key=lambda s: getattr(s, "start_time", 0)):
+            span_attributes = span.attributes or {}
+            span_type = span_attributes.get(AttributeKeys.JUDGMENT_SPAN_KIND, "span")
+
+            if (
+                not span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                and span_type != "llm"
+            ):
+                continue
+
+            if span_type == "llm":
+                if not first_found and span_attributes.get(
+                    AttributeKeys.JUDGMENT_INPUT
+                ):
+                    input_data = span_attributes.get(AttributeKeys.JUDGMENT_INPUT, {})
+                    if isinstance(input_data, dict) and "messages" in input_data:
+                        input_messages = input_data["messages"]
+                        if input_messages:
+                            first_found = True
+                            for msg in input_messages:
+                                if (
+                                    isinstance(msg, dict)
+                                    and "role" in msg
+                                    and "content" in msg
+                                ):
+                                    messages.append(
+                                        {"role": msg["role"], "content": msg["content"]}
+                                    )
+
+                # Add assistant response from span output
+                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                if output is not None:
+                    content = str(output)
+                    try:
+                        parsed = json.loads(content)
+                        if isinstance(parsed, dict) and "messages" in parsed:
+                            # Extract the actual assistant message content
+                            for msg in parsed["messages"]:
+                                if (
+                                    isinstance(msg, dict)
+                                    and msg.get("role") == "assistant"
+                                ):
+                                    content = msg.get("content", content)
+                                    break
+                    except (json.JSONDecodeError, KeyError):
+                        pass
+                    messages.append({"role": "assistant", "content": content})
+
+            elif span_type == "user":
+                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                if output is not None:
+                    content = str(output)
+                    try:
+                        parsed = json.loads(content)
+                        if isinstance(parsed, dict) and "messages" in parsed:
+                            for msg in parsed["messages"]:
+                                if isinstance(msg, dict) and msg.get("role") == "user":
+                                    content = msg.get("content", content)
+                                    break
+                    except (json.JSONDecodeError, KeyError):
+                        pass
+                    messages.append({"role": "user", "content": content})
+
+            elif span_type == "tool":
+                output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+                if output is not None:
+                    content = str(output)
+                    try:
+                        parsed = json.loads(content)
+                        if isinstance(parsed, dict) and "messages" in parsed:
+                            for msg in parsed["messages"]:
+                                if isinstance(msg, dict) and msg.get("role") == "user":
+                                    content = msg.get("content", content)
+                                    break
+                    except (json.JSONDecodeError, KeyError):
+                        pass
+                    messages.append({"role": "user", "content": content})
+
+        return messages
+
     async def generate_rollouts_and_rewards(
         self,
         agent_function: Callable[[Any], Any],
@@ -95,13 +193,16 @@ class JudgmentTrainer:
                     messages = response_data.get("messages", [])
 
                     try:
-                        traced_messages = self.
+                        traced_messages = self._extract_message_history_from_spans()
                         if traced_messages:
                            messages = traced_messages
                    except Exception as e:
                        print(f"Warning: Failed to get message history from trace: {e}")
                        pass
+                    finally:
+                        self.span_store.spans = []
 
                    example = Example(
                        input=prompt_input,
                        messages=messages,
@@ -113,14 +214,15 @@ class JudgmentTrainer:
                        scorers=scorers,
                        project_name=self.project_name,
                        eval_run_name=f"training_step_{self.trainable_model.current_step}_prompt_{prompt_id}_gen_{generation_id}",
-                        show_url=False,
                    )
 
                    if scoring_results and scoring_results[0].scorers_data:
-
+                        scores = [
                            scorer_data.score
                            for scorer_data in scoring_results[0].scorers_data
-
+                            if scorer_data.score is not None
+                        ]
+                        reward = sum(scores) / len(scores) if scores else 0.0
                    else:
                        reward = 0.0
 
@@ -246,7 +348,7 @@ class JudgmentTrainer:
                 time.sleep(10)
                 job = job.get()
                 if job is None:
-                    raise
+                    raise JudgmentRuntimeError(
                         "Training job was deleted while waiting for completion"
                     )
 
@@ -294,8 +396,8 @@ class JudgmentTrainer:
             return await self.run_reinforcement_learning(
                 agent_function, scorers, prompts
             )
-        except
+        except JudgmentRuntimeError:
             # Re-raise JudgmentAPIError as-is
             raise
         except Exception as e:
-            raise
+            raise JudgmentRuntimeError(f"Training process failed: {str(e)}") from e
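The new `_extract_message_history_from_spans` walks the collected spans in start-time order and rebuilds a chat transcript: the first LLM span contributes its input messages, LLM outputs become assistant turns, and user/tool span outputs become user turns, with JSON-encoded outputs unwrapped where possible. A self-contained sketch of that flow on plain dictionaries (the attribute names here are illustrative stand-ins for the `AttributeKeys` constants and real OpenTelemetry spans):

```python
import json

# Illustrative span records; the real code reads JUDGMENT_SPAN_KIND / JUDGMENT_INPUT /
# JUDGMENT_OUTPUT attributes from exported spans.
spans = [
    {
        "kind": "llm",
        "input": {"messages": [{"role": "user", "content": "What is 2 + 2?"}]},
        "output": json.dumps({"messages": [{"role": "assistant", "content": "4"}]}),
    },
    {
        "kind": "tool",
        "output": json.dumps({"messages": [{"role": "user", "content": "calculator returned 4"}]}),
    },
]

messages = []
seeded = False
for attrs in spans:
    kind = attrs.get("kind")
    # Seed the transcript with the first LLM span's input messages.
    if kind == "llm" and not seeded and attrs.get("input"):
        messages.extend(attrs["input"]["messages"])
        seeded = True
    output = attrs.get("output")
    if output is None:
        continue
    role = "assistant" if kind == "llm" else "user"
    content = output
    try:
        parsed = json.loads(output)
        if isinstance(parsed, dict):
            # Unwrap JSON-encoded outputs down to the matching message's content.
            for msg in parsed.get("messages", []):
                if msg.get("role") == role:
                    content = msg["content"]
                    break
    except json.JSONDecodeError:
        pass
    messages.append({"role": role, "content": content})

print(messages)
# [{'role': 'user', 'content': 'What is 2 + 2?'},
#  {'role': 'assistant', 'content': '4'},
#  {'role': 'user', 'content': 'calculator returned 4'}]
```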
judgeval/utils/async_utils.py CHANGED

@@ -5,7 +5,6 @@ import concurrent.futures
 from typing import Awaitable, TypeVar
 
 
-# Generic type variable for coroutine return type
 T = TypeVar("T")
 
 
@@ -14,8 +13,8 @@ def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
 
     This helper handles two common situations:
 
-    1. **No running event loop**
-    2. **Existing running loop**
+    1. **No running event loop** - Simply delegates to ``asyncio.run``.
+    2. **Existing running loop** - Executes the coroutine in a separate
        thread so that we don't attempt to nest event loops (which would raise
        ``RuntimeError``).
 
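For context, the docstring describes exactly two cases: run the coroutine with `asyncio.run` when no loop is active, otherwise hand it to a worker thread that owns its own loop. A hedged, self-contained sketch of that behavior (not the library's exact implementation):

```python
import asyncio
import concurrent.futures
from typing import Awaitable, TypeVar

T = TypeVar("T")


def safe_run_async_sketch(coro: Awaitable[T]) -> T:
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # Case 1: no running event loop, asyncio.run is safe to call directly.
        return asyncio.run(coro)
    # Case 2: already inside a loop, run the coroutine in a separate thread
    # so we never try to nest event loops.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


async def add(a: int, b: int) -> int:
    return a + b


print(safe_run_async_sketch(add(2, 3)))  # -> 5
```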
judgeval/utils/decorators.py ADDED

@@ -0,0 +1,24 @@
+from functools import lru_cache, wraps
+from typing import Callable, TypeVar
+
+T = TypeVar("T")
+
+
+def use_once(func: Callable[..., T]) -> Callable[..., T]:
+    @lru_cache(maxsize=1)
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        return func(*args, **kwargs)
+
+    return wrapper
+
+
+def dont_throw(func: Callable[..., T]) -> Callable[..., T | None]:
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception:
+            pass
+
+    return wrapper
judgeval/utils/file_utils.py CHANGED

@@ -1,12 +1,14 @@
+import importlib.util
 import yaml
 import orjson
+from pathlib import Path
 from typing import List
-from judgeval.
+from judgeval.logger import judgeval_logger
 
-from judgeval.data import Example
+from judgeval.data.example import Example
 
 
-def get_examples_from_yaml(file_path: str) -> List[Example] | None:
+def get_examples_from_yaml(file_path: str) -> List[Example]:
     """
     Adds examples from a YAML file.
 
@@ -34,7 +36,7 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
     return new_examples
 
 
-def get_examples_from_json(file_path: str) -> List[Example] | None:
+def get_examples_from_json(file_path: str) -> List[Example]:
     """
     Adds examples from a JSON file.
 
@@ -64,3 +66,34 @@ def get_examples_from_json(file_path: str) -> List[Example] | None:
 
     new_examples = [Example(**e) for e in payload]
     return new_examples
+
+
+def extract_scorer_name(scorer_file_path: str) -> str:
+    try:
+        spec = importlib.util.spec_from_file_location("scorer_module", scorer_file_path)
+        if spec is None or spec.loader is None:
+            raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+
+        for attr_name in dir(module):
+            attr = getattr(module, attr_name)
+            if (
+                isinstance(attr, type)
+                and any("Scorer" in str(base) for base in attr.__mro__)
+                and attr.__module__ == "scorer_module"
+            ):
+                try:
+                    # Instantiate the scorer and get its name
+                    scorer_instance = attr()
+                    if hasattr(scorer_instance, "name"):
+                        return scorer_instance.name
+                except Exception:
+                    # Skip if instantiation fails
+                    continue
+
+        raise AttributeError("No scorer class found or could be instantiated")
+    except Exception as e:
+        judgeval_logger.warning(f"Could not extract scorer name: {e}")
+        return Path(scorer_file_path).stem
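`extract_scorer_name` loads a scorer file as a throwaway module, looks for a class whose MRO mentions `Scorer`, instantiates it, and falls back to the file stem on any failure. A self-contained sketch of the same `importlib.util` loading technique (the scorer file and class here are made up, not part of judgeval):

```python
import importlib.util
import pathlib
import tempfile
import textwrap

# Write a throwaway "scorer" file; the file and class names are hypothetical.
scorer_path = pathlib.Path(tempfile.mkdtemp()) / "custom_scorer.py"
scorer_path.write_text(textwrap.dedent("""
    class MyScorer:
        name = "my-custom-scorer"
"""))

# Load it under a synthetic module name, mirroring spec_from_file_location above.
spec = importlib.util.spec_from_file_location("scorer_module", scorer_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

print(module.MyScorer().name)  # -> "my-custom-scorer"
print(scorer_path.stem)        # fallback value: "custom_scorer"
```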
judgeval/utils/guards.py ADDED

@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import TypeVar
+
+    T = TypeVar("T")
+
+
+def expect_exists(value: T | None, message: str) -> T:
+    if value is None:
+        raise ValueError(message)
+
+    return value
+
+
+def expect_api_key(api_key: str | None) -> str:
+    return expect_exists(
+        api_key,
+        "API Key is not set, please set JUDGMENT_API_KEY in the environment variables or pass it as `api_key`",
+    )
+
+
+def expect_organization_id(organization_id: str | None) -> str:
+    return expect_exists(
+        organization_id,
+        "Organization ID is not set, please set JUDGMENT_ORG_ID in the environment variables or pass it as `organization_id`",
+    )
+
+
+__all__ = ("expect_exists", "expect_api_key", "expect_organization_id")
judgeval/utils/meta.py ADDED

@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+
+class SingletonMeta(type):
+    """
+    Metaclass for creating singleton classes.
+    """
+
+    _instances: dict[type, object] = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super().__call__(*args, **kwargs)
+        return cls._instances[cls]
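`SingletonMeta` caches one instance per class, so every construction returns the same object. A small self-contained demo (the metaclass body is copied from the diff above; `ClientRegistry` is a hypothetical consumer class):

```python
class SingletonMeta(type):
    _instances: dict[type, object] = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]


class ClientRegistry(metaclass=SingletonMeta):
    def __init__(self):
        self.clients = []


assert ClientRegistry() is ClientRegistry()  # every construction returns the same instance
```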
judgeval/{common/api/json_encoder.py → utils/serialize.py} RENAMED

@@ -17,6 +17,7 @@ from uuid import UUID
 
 from pydantic import BaseModel
 from pydantic.types import SecretBytes, SecretStr
+import orjson
 
 
 """
@@ -60,7 +61,7 @@ def json_encoder(
 
     # Dataclasses
     if dataclasses.is_dataclass(obj):
-        obj_dict = dataclasses.asdict(obj)
+        obj_dict = dataclasses.asdict(obj)  # type: ignore[arg-type]
         return json_encoder(
             obj_dict,
         )
@@ -239,3 +240,8 @@ def generate_encoders_by_class_tuples(
 
 # Mapping of encoders to a tuple of classes that they can encode
 encoders_by_class_tuples = generate_encoders_by_class_tuples(ENCODERS_BY_TYPE)
+
+
+# Seralize arbitrary object to a json string
+def safe_serialize(obj: Any) -> str:
+    return orjson.dumps(json_encoder(obj)).decode()
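`safe_serialize` composes the existing `json_encoder` (which already handles pydantic models, dataclasses, UUIDs, and similar types) with `orjson.dumps` to produce a JSON string. A hedged usage sketch (assumes judgeval 0.9.0; the payload type is made up and the exact output formatting is not guaranteed here):

```python
from dataclasses import dataclass
from uuid import UUID, uuid4

from judgeval.utils.serialize import safe_serialize  # path per the rename above


@dataclass
class SpanRecord:  # hypothetical payload type
    name: str
    run_id: UUID


record = SpanRecord(name="llm-call", run_id=uuid4())
print(safe_serialize(record))  # e.g. '{"name":"llm-call","run_id":"<uuid-as-string>"}'
```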
judgeval/utils/testing.py ADDED

@@ -0,0 +1,88 @@
+from rich import print as rprint
+
+from typing import List
+from judgeval.evaluation import ScoringResult
+from judgeval.data import ScorerData
+from judgeval.exceptions import JudgmentTestError
+
+
+def assert_test_results(scoring_results: List[ScoringResult]) -> None:
+    """
+    Collects all failed scorers from the scoring results.
+
+    Args:
+        ScoringResults (List[ScoringResult]): List of scoring results to check
+
+    Returns:
+        None. Raises exceptions for any failed test cases.
+    """
+    failed_cases: List[List[ScorerData]] = []
+
+    for result in scoring_results:
+        if not result.success:
+            # Create a test case context with all relevant fields
+            test_case = []
+            if result.scorers_data:
+                # If the result was not successful, check each scorer_data
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        if scorer_data.name == "Tool Order":
+                            # Remove threshold, evaluation model for Tool Order scorer
+                            scorer_data.threshold = None
+                            scorer_data.evaluation_model = None
+                        test_case.append(scorer_data)
+            failed_cases.append(test_case)
+
+    if failed_cases:
+        error_msg = "The following test cases failed: \n"
+        for fail_case in failed_cases:
+            for fail_scorer in fail_case:
+                error_msg += (
+                    f"\nScorer Name: {fail_scorer.name}\n"
+                    f"Threshold: {fail_scorer.threshold}\n"
+                    f"Success: {fail_scorer.success}\n"
+                    f"Score: {fail_scorer.score}\n"
+                    f"Reason: {fail_scorer.reason}\n"
+                    f"Strict Mode: {fail_scorer.strict_mode}\n"
+                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                    f"Error: {fail_scorer.error}\n"
+                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                )
+                error_msg += "-" * 100
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "=" * 80)
+    if failed_tests == 0:
+        rprint(
+            f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+        )
+    else:
+        rprint(
+            f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+        )
+    rprint("=" * 80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f"    [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f"    [red]  Score: {scorer_data.score}[/red]")
+                        rprint(f"    [red]  Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f"    [red]  Error: {scorer_data.error}[/red]")
+                        rprint("    " + "-" * 40)
+
+    rprint("\n" + "=" * 80)
+    if failed_tests > 0:
+        raise JudgmentTestError(failed_cases)
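In practice `assert_test_results` is the test-suite-facing hook: pass it a list of `ScoringResult` objects, it prints the rich summary, and it raises `JudgmentTestError` when anything failed. A hedged sketch of a pytest-style test that uses it (how the results are produced is left as a placeholder and is not specified by this diff):

```python
from typing import List

from judgeval.evaluation import ScoringResult
from judgeval.utils.testing import assert_test_results


def run_my_evaluation() -> List[ScoringResult]:
    """Placeholder for however you produce ScoringResult objects (e.g. a JudgmentClient run)."""
    raise NotImplementedError


def test_agent_quality():
    results = run_my_evaluation()
    assert_test_results(results)  # prints the summary, raises JudgmentTestError on any failure
```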
judgeval/utils/url.py ADDED

judgeval/{version_check.py → utils/version_check.py} RENAMED

@@ -1,14 +1,14 @@
 import importlib.metadata
-
+import httpx
 import threading
-from judgeval.
+from judgeval.logger import judgeval_logger
 
 
 def check_latest_version(package_name: str = "judgeval"):
     def _check():
         try:
             current_version = importlib.metadata.version(package_name)
-            response =
+            response = httpx.get(
                 f"https://pypi.org/pypi/{package_name}/json", timeout=2
             )
             latest_version = response.json()["info"]["version"]
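The renamed module now queries PyPI with `httpx` instead of `requests`. The check itself boils down to a couple of lines; a self-contained sketch (requires network access and an installed judgeval):

```python
import importlib.metadata

import httpx


def latest_pypi_version(package_name: str = "judgeval") -> str:
    response = httpx.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
    return response.json()["info"]["version"]


installed = importlib.metadata.version("judgeval")
print(installed, "->", latest_pypi_version())  # compare installed vs latest on PyPI
```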
judgeval/version.py ADDED

judgeval/warnings.py ADDED

{judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.7.1
+Version: 0.9.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,25 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: boto3
+Requires-Dist: boto3>=1.40.11
 Requires-Dist: click<8.2.0
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: nest-asyncio>=1.6.0
-Requires-Dist: opentelemetry-api>=1.34.1
-Requires-Dist: opentelemetry-sdk>=1.34.1
+Requires-Dist: dotenv
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: litellm<1.75.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+Requires-Dist: opentelemetry-sdk>=1.36.0
+Requires-Dist: opentelemetry-semantic-conventions>=0.57b0
 Requires-Dist: orjson>=3.9.0
-Requires-Dist: python-dotenv
-Requires-Dist: requests
-Requires-Dist: rich
 Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
 Requires-Dist: langchain-huggingface; extra == 'langchain'
 Requires-Dist: langchain-openai; extra == 'langchain'
+Provides-Extra: s3
+Requires-Dist: boto3>=1.40.11; extra == 's3'
+Provides-Extra: trainer
+Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
 Description-Content-Type: text/markdown
 
 <div align="center">
|