judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.8.0.dist-info/RECORD +0 -82
  94. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -2,7 +2,7 @@ from fireworks import LLM
  from .config import TrainerConfig, ModelConfig
  from typing import Optional, Dict, Any, Callable
  from .console import _model_spinner_progress, _print_model_progress
- from judgeval.common.exceptions import JudgmentAPIError
+ from judgeval.exceptions import JudgmentRuntimeError


  class TrainableModel:
@@ -14,6 +14,12 @@ class TrainableModel:
  abstracting away manual snapshot management from users.
  """

+ config: TrainerConfig
+ current_step: int
+ _current_model: LLM
+ _tracer_wrapper_func: Optional[Callable]
+ _base_model: LLM
+
  def __init__(self, config: TrainerConfig):
  """
  Initialize the TrainableModel.
@@ -24,13 +30,12 @@ class TrainableModel:
  try:
  self.config = config
  self.current_step = 0
- self._current_model = None
  self._tracer_wrapper_func = None

  self._base_model = self._create_base_model()
  self._current_model = self._base_model
  except Exception as e:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  f"Failed to initialize TrainableModel: {str(e)}"
  ) from e

@@ -80,7 +85,7 @@ class TrainableModel:
  _print_model_progress("Base model deployment ready")
  return base_model
  except Exception as e:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  f"Failed to create and deploy base model '{self.config.base_model_name}': {str(e)}"
  ) from e

@@ -103,7 +108,7 @@ class TrainableModel:
  if self._tracer_wrapper_func:
  self._tracer_wrapper_func(self._current_model)
  except Exception as e:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  f"Failed to load and deploy trained model '{model_name}': {str(e)}"
  ) from e

@@ -150,7 +155,7 @@ class TrainableModel:
  if self._tracer_wrapper_func:
  self._tracer_wrapper_func(self._current_model)
  except Exception as e:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  f"Failed to advance to training step {step}: {str(e)}"
  ) from e

@@ -176,7 +181,7 @@ class TrainableModel:
  accelerator_type=self.config.accelerator_type,
  )
  except Exception as e:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  f"Failed to start reinforcement learning step {step + 1}: {str(e)}"
  ) from e

@@ -1,15 +1,19 @@
  import asyncio
+ import json
  import time
- from typing import Optional, Callable, Any, List, Union
+ from typing import Optional, Callable, Any, List, Union, Dict
  from fireworks import Dataset
  from .config import TrainerConfig, ModelConfig
  from .trainable_model import TrainableModel
  from judgeval.tracer import Tracer
- from judgeval.judgment_client import JudgmentClient
+ from judgeval.tracer.exporters.store import SpanStore
+ from judgeval.tracer.exporters import InMemorySpanExporter
+ from judgeval.tracer.keys import AttributeKeys
+ from judgeval import JudgmentClient
  from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.data import Example
  from .console import _spinner_progress, _print_progress, _print_progress_update
- from judgeval.common.exceptions import JudgmentAPIError
+ from judgeval.exceptions import JudgmentRuntimeError


  class JudgmentTrainer:
@@ -39,20 +43,114 @@ class JudgmentTrainer:
  try:
  self.config = config
  self.tracer = tracer
- self.tracer.show_trace_urls = False
  self.project_name = project_name or "judgment_training"
-
- if trainable_model is None:
- self.trainable_model = TrainableModel(self.config)
- else:
- self.trainable_model = trainable_model
+ self.trainable_model = trainable_model

  self.judgment_client = JudgmentClient()
+ self.span_store = SpanStore()
+ self.span_exporter = InMemorySpanExporter(self.span_store)
  except Exception as e:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  f"Failed to initialize JudgmentTrainer: {str(e)}"
  ) from e

+ def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
+ """
+ Extract message history from spans in the span store for training purposes.
+
+ This method processes trace spans to reconstruct the conversation flow,
+ extracting messages in chronological order from LLM, user, and tool spans.
+
+ Returns:
+ List of message dictionaries with 'role' and 'content' keys
+ """
+ spans = self.span_store.get_all()
+ if not spans:
+ return []
+
+ messages = []
+ first_found = False
+
+ for span in sorted(spans, key=lambda s: getattr(s, "start_time", 0)):
+ span_attributes = span.attributes or {}
+ span_type = span_attributes.get(AttributeKeys.JUDGMENT_SPAN_KIND, "span")
+
+ if (
+ not span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+ and span_type != "llm"
+ ):
+ continue
+
+ if span_type == "llm":
+ if not first_found and span_attributes.get(
+ AttributeKeys.JUDGMENT_INPUT
+ ):
+ input_data = span_attributes.get(AttributeKeys.JUDGMENT_INPUT, {})
+ if isinstance(input_data, dict) and "messages" in input_data:
+ input_messages = input_data["messages"]
+ if input_messages:
+ first_found = True
+ for msg in input_messages:
+ if (
+ isinstance(msg, dict)
+ and "role" in msg
+ and "content" in msg
+ ):
+ messages.append(
+ {"role": msg["role"], "content": msg["content"]}
+ )
+
+ # Add assistant response from span output
+ output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+ if output is not None:
+ content = str(output)
+ try:
+ parsed = json.loads(content)
+ if isinstance(parsed, dict) and "messages" in parsed:
+ # Extract the actual assistant message content
+ for msg in parsed["messages"]:
+ if (
+ isinstance(msg, dict)
+ and msg.get("role") == "assistant"
+ ):
+ content = msg.get("content", content)
+ break
+ except (json.JSONDecodeError, KeyError):
+ pass
+ messages.append({"role": "assistant", "content": content})
+
+ elif span_type == "user":
+ output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+ if output is not None:
+ content = str(output)
+ try:
+ parsed = json.loads(content)
+ if isinstance(parsed, dict) and "messages" in parsed:
+ for msg in parsed["messages"]:
+ if isinstance(msg, dict) and msg.get("role") == "user":
+ content = msg.get("content", content)
+ break
+ except (json.JSONDecodeError, KeyError):
+ pass
+ messages.append({"role": "user", "content": content})
+
+ elif span_type == "tool":
+ output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
+ if output is not None:
+ content = str(output)
+ try:
+ parsed = json.loads(content)
+ if isinstance(parsed, dict) and "messages" in parsed:
+ for msg in parsed["messages"]:
+ if isinstance(msg, dict) and msg.get("role") == "user":
+ content = msg.get("content", content)
+ break
+ except (json.JSONDecodeError, KeyError):
+ pass
+ messages.append({"role": "user", "content": content})
+
+ return messages
+
  async def generate_rollouts_and_rewards(
  self,
  agent_function: Callable[[Any], Any],
@@ -95,13 +193,16 @@ class JudgmentTrainer:
  messages = response_data.get("messages", [])

  try:
- traced_messages = self.tracer.get_current_message_history()
+ traced_messages = self._extract_message_history_from_spans()
  if traced_messages:
  messages = traced_messages
  except Exception as e:
  print(f"Warning: Failed to get message history from trace: {e}")
  pass

+ finally:
+ self.span_store.spans = []
+
  example = Example(
  input=prompt_input,
  messages=messages,
@@ -113,14 +214,15 @@ class JudgmentTrainer:
  scorers=scorers,
  project_name=self.project_name,
  eval_run_name=f"training_step_{self.trainable_model.current_step}_prompt_{prompt_id}_gen_{generation_id}",
- show_url=False,
  )

  if scoring_results and scoring_results[0].scorers_data:
- reward = sum(
+ scores = [
  scorer_data.score
  for scorer_data in scoring_results[0].scorers_data
- ) / len(scoring_results[0].scorers_data)
+ if scorer_data.score is not None
+ ]
+ reward = sum(scores) / len(scores) if scores else 0.0
  else:
  reward = 0.0

@@ -246,7 +348,7 @@ class JudgmentTrainer:
  time.sleep(10)
  job = job.get()
  if job is None:
- raise JudgmentAPIError(
+ raise JudgmentRuntimeError(
  "Training job was deleted while waiting for completion"
  )

@@ -294,8 +396,8 @@ class JudgmentTrainer:
  return await self.run_reinforcement_learning(
  agent_function, scorers, prompts
  )
- except JudgmentAPIError:
+ except JudgmentRuntimeError:
  # Re-raise JudgmentAPIError as-is
  raise
  except Exception as e:
- raise JudgmentAPIError(f"Training process failed: {str(e)}") from e
+ raise JudgmentRuntimeError(f"Training process failed: {str(e)}") from e
@@ -5,7 +5,6 @@ import concurrent.futures
  from typing import Awaitable, TypeVar


- # Generic type variable for coroutine return type
  T = TypeVar("T")


@@ -14,8 +13,8 @@ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]

  This helper handles two common situations:

- 1. **No running event loop** Simply delegates to ``asyncio.run``.
- 2. **Existing running loop** Executes the coroutine in a separate
+ 1. **No running event loop** - Simply delegates to ``asyncio.run``.
+ 2. **Existing running loop** - Executes the coroutine in a separate
  thread so that we don't attempt to nest event loops (which would raise
  ``RuntimeError``).

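A minimal usage sketch of safe_run_async, assuming judgeval 0.9.0 is installed; the coroutine below is illustrative:

    import asyncio
    from judgeval.utils.async_utils import safe_run_async

    async def fetch_answer() -> str:
        # Stand-in coroutine; any awaitable works here.
        await asyncio.sleep(0)
        return "done"

    # With no running event loop, this simply delegates to asyncio.run.
    print(safe_run_async(fetch_answer()))

    async def caller() -> None:
        # Inside a running loop, the coroutine is executed on a separate
        # thread, avoiding a nested-event-loop RuntimeError.
        print(safe_run_async(fetch_answer()))

    asyncio.run(caller())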
@@ -0,0 +1,24 @@
+ from functools import lru_cache, wraps
+ from typing import Callable, TypeVar
+
+ T = TypeVar("T")
+
+
+ def use_once(func: Callable[..., T]) -> Callable[..., T]:
+ @lru_cache(maxsize=1)
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ return func(*args, **kwargs)
+
+ return wrapper
+
+
+ def dont_throw(func: Callable[..., T]) -> Callable[..., T | None]:
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except Exception:
+ pass
+
+ return wrapper
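A minimal sketch of the two new decorators, assuming judgeval 0.9.0 is installed; the decorated functions are illustrative:

    from judgeval.utils.decorators import use_once, dont_throw

    @use_once
    def load_settings() -> dict:
        # Wrapped in lru_cache(maxsize=1): repeat calls with the same
        # arguments reuse the first result instead of re-running the body.
        print("loading settings...")
        return {"api_url": "https://example.invalid"}

    @dont_throw
    def parse_port(raw: str) -> int:
        # Exceptions are swallowed and None is returned instead.
        return int(raw)

    load_settings()            # prints "loading settings..."
    load_settings()            # cached; body does not run again
    print(parse_port("8080"))  # 8080
    print(parse_port("oops"))  # None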
@@ -1,12 +1,14 @@
+ import importlib.util
  import yaml
  import orjson
+ from pathlib import Path
  from typing import List
- from judgeval.common.logger import judgeval_logger
+ from judgeval.logger import judgeval_logger

- from judgeval.data import Example
+ from judgeval.data.example import Example


- def get_examples_from_yaml(file_path: str) -> List[Example] | None:
+ def get_examples_from_yaml(file_path: str) -> List[Example]:
  """
  Adds examples from a YAML file.

@@ -34,7 +36,7 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
  return new_examples


- def get_examples_from_json(file_path: str) -> List[Example] | None:
+ def get_examples_from_json(file_path: str) -> List[Example]:
  """
  Adds examples from a JSON file.

@@ -64,3 +66,34 @@ def get_examples_from_json(file_path: str) -> List[Example] | None:

  new_examples = [Example(**e) for e in payload]
  return new_examples
+
+
+ def extract_scorer_name(scorer_file_path: str) -> str:
+ try:
+ spec = importlib.util.spec_from_file_location("scorer_module", scorer_file_path)
+ if spec is None or spec.loader is None:
+ raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+
+ for attr_name in dir(module):
+ attr = getattr(module, attr_name)
+ if (
+ isinstance(attr, type)
+ and any("Scorer" in str(base) for base in attr.__mro__)
+ and attr.__module__ == "scorer_module"
+ ):
+ try:
+ # Instantiate the scorer and get its name
+ scorer_instance = attr()
+ if hasattr(scorer_instance, "name"):
+ return scorer_instance.name
+ except Exception:
+ # Skip if instantiation fails
+ continue
+
+ raise AttributeError("No scorer class found or could be instantiated")
+ except Exception as e:
+ judgeval_logger.warning(f"Could not extract scorer name: {e}")
+ return Path(scorer_file_path).stem
@@ -0,0 +1,32 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+ from typing import TypeVar
+
+ T = TypeVar("T")
+
+
+ def expect_exists(value: T | None, message: str) -> T:
+ if value is None:
+ raise ValueError(message)
+
+ return value
+
+
+ def expect_api_key(api_key: str | None) -> str:
+ return expect_exists(
+ api_key,
+ "API Key is not set, please set JUDGMENT_API_KEY in the environment variables or pass it as `api_key`",
+ )
+
+
+ def expect_organization_id(organization_id: str | None) -> str:
+ return expect_exists(
+ organization_id,
+ "Organization ID is not set, please set JUDGMENT_ORG_ID in the environment variables or pass it as `organization_id`",
+ )
+
+
+ __all__ = ("expect_exists", "expect_api_key", "expect_organization_id")
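A short sketch of the guard helpers, assuming judgeval 0.9.0 is installed; the key value is a placeholder:

    import os
    from judgeval.utils.guards import expect_api_key, expect_organization_id

    # Returns the value when it is set...
    api_key = expect_api_key(os.environ.get("JUDGMENT_API_KEY", "placeholder-key"))

    # ...and raises ValueError with the message above when it is None.
    try:
        expect_organization_id(None)
    except ValueError as err:
        print(err)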
judgeval/utils/meta.py ADDED
@@ -0,0 +1,14 @@
+ from __future__ import annotations
+
+
+ class SingletonMeta(type):
+ """
+ Metaclass for creating singleton classes.
+ """
+
+ _instances: dict[type, object] = {}
+
+ def __call__(cls, *args, **kwargs):
+ if cls not in cls._instances:
+ cls._instances[cls] = super().__call__(*args, **kwargs)
+ return cls._instances[cls]
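A sketch of the metaclass in use; the Config class is illustrative and not part of the package:

    from judgeval.utils.meta import SingletonMeta

    class Config(metaclass=SingletonMeta):
        def __init__(self) -> None:
            # The metaclass constructs the first instance and caches it;
            # later calls return the same object.
            self.loaded = True

    assert Config() is Config()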
@@ -17,6 +17,7 @@ from uuid import UUID

  from pydantic import BaseModel
  from pydantic.types import SecretBytes, SecretStr
+ import orjson


  """
@@ -60,7 +61,7 @@ def json_encoder(

  # Dataclasses
  if dataclasses.is_dataclass(obj):
- obj_dict = dataclasses.asdict(obj)
+ obj_dict = dataclasses.asdict(obj) # type: ignore[arg-type]
  return json_encoder(
  obj_dict,
  )
@@ -239,3 +240,8 @@ def generate_encoders_by_class_tuples(

  # Mapping of encoders to a tuple of classes that they can encode
  encoders_by_class_tuples = generate_encoders_by_class_tuples(ENCODERS_BY_TYPE)
+
+
+ # Serialize arbitrary object to a JSON string
+ def safe_serialize(obj: Any) -> str:
+ return orjson.dumps(json_encoder(obj)).decode()
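A short sketch of the new helper, assuming judgeval 0.9.0 is installed; the dataclass is illustrative:

    from dataclasses import dataclass
    from judgeval.utils.serialize import safe_serialize

    @dataclass
    class Turn:
        role: str
        content: str

    # json_encoder converts the dataclass to plain types, orjson dumps the result.
    print(safe_serialize(Turn(role="user", content="hello")))
    # prints a JSON string like {"role":"user","content":"hello"}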
@@ -0,0 +1,88 @@
+ from rich import print as rprint
+
+ from typing import List
+ from judgeval.evaluation import ScoringResult
+ from judgeval.data import ScorerData
+ from judgeval.exceptions import JudgmentTestError
+
+
+ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
+ """
+ Collects all failed scorers from the scoring results.
+
+ Args:
+ ScoringResults (List[ScoringResult]): List of scoring results to check
+
+ Returns:
+ None. Raises exceptions for any failed test cases.
+ """
+ failed_cases: List[List[ScorerData]] = []
+
+ for result in scoring_results:
+ if not result.success:
+ # Create a test case context with all relevant fields
+ test_case = []
+ if result.scorers_data:
+ # If the result was not successful, check each scorer_data
+ for scorer_data in result.scorers_data:
+ if not scorer_data.success:
+ if scorer_data.name == "Tool Order":
+ # Remove threshold, evaluation model for Tool Order scorer
+ scorer_data.threshold = None
+ scorer_data.evaluation_model = None
+ test_case.append(scorer_data)
+ failed_cases.append(test_case)
+
+ if failed_cases:
+ error_msg = "The following test cases failed: \n"
+ for fail_case in failed_cases:
+ for fail_scorer in fail_case:
+ error_msg += (
+ f"\nScorer Name: {fail_scorer.name}\n"
+ f"Threshold: {fail_scorer.threshold}\n"
+ f"Success: {fail_scorer.success}\n"
+ f"Score: {fail_scorer.score}\n"
+ f"Reason: {fail_scorer.reason}\n"
+ f"Strict Mode: {fail_scorer.strict_mode}\n"
+ f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+ f"Error: {fail_scorer.error}\n"
+ f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+ )
+ error_msg += "-" * 100
+
+ total_tests = len(scoring_results)
+ failed_tests = len(failed_cases)
+ passed_tests = total_tests - failed_tests
+
+ # Print summary with colors
+ rprint("\n" + "=" * 80)
+ if failed_tests == 0:
+ rprint(
+ f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+ )
+ else:
+ rprint(
+ f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+ )
+ rprint("=" * 80 + "\n")
+
+ # Print individual test cases
+ for i, result in enumerate(scoring_results):
+ test_num = i + 1
+ if result.success:
+ rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+ else:
+ rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+ if result.scorers_data:
+ for scorer_data in result.scorers_data:
+ if not scorer_data.success:
+ rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+ rprint(f" [red] Score: {scorer_data.score}[/red]")
+ rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+ if scorer_data.error:
+ rprint(f" [red] Error: {scorer_data.error}[/red]")
+ rprint(" " + "-" * 40)
+
+ rprint("\n" + "=" * 80)
+ if failed_tests > 0:
+ raise JudgmentTestError(failed_cases)
judgeval/utils/url.py ADDED
@@ -0,0 +1,10 @@
+ from urllib.parse import urljoin
+
+ from judgeval.env import JUDGMENT_API_URL
+
+
+ def url_for(path: str, base: str = JUDGMENT_API_URL) -> str:
+ return urljoin(base, path)
+
+
+ __all__ = ("url_for",)
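A sketch of the helper; the path below is illustrative rather than a documented endpoint, and JUDGMENT_API_URL comes from the new judgeval/env.py:

    from judgeval.utils.url import url_for

    # Joins the path onto the configured base URL via urljoin.
    print(url_for("otel/v1/traces"))

    # An explicit base can also be passed.
    print(url_for("otel/v1/traces", base="https://api.example.invalid/"))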
@@ -1,14 +1,14 @@
  import importlib.metadata
- from judgeval.utils.requests import requests
+ import httpx
  import threading
- from judgeval.common.logger import judgeval_logger
+ from judgeval.logger import judgeval_logger


  def check_latest_version(package_name: str = "judgeval"):
  def _check():
  try:
  current_version = importlib.metadata.version(package_name)
- response = requests.get(
+ response = httpx.get(
  f"https://pypi.org/pypi/{package_name}/json", timeout=2
  )
  latest_version = response.json()["info"]["version"]
judgeval/version.py ADDED
@@ -0,0 +1,5 @@
+ __version__ = "0.0.0"
+
+
+ def get_version() -> str:
+ return __version__
judgeval/warnings.py ADDED
@@ -0,0 +1,4 @@
+ from __future__ import annotations
+
+
+ class JudgmentWarning(Warning): ...
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.8.0
+ Version: 0.9.0
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,25 @@ License-File: LICENSE.md
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
- Requires-Dist: boto3
+ Requires-Dist: boto3>=1.40.11
  Requires-Dist: click<8.2.0
- Requires-Dist: fireworks-ai>=0.19.18
- Requires-Dist: langchain-anthropic
- Requires-Dist: langchain-core
- Requires-Dist: langchain-huggingface
- Requires-Dist: langchain-openai
- Requires-Dist: litellm>=1.61.15
- Requires-Dist: nest-asyncio>=1.6.0
- Requires-Dist: opentelemetry-api>=1.34.1
- Requires-Dist: opentelemetry-sdk>=1.34.1
+ Requires-Dist: dotenv
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: litellm<1.75.0
+ Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+ Requires-Dist: opentelemetry-sdk>=1.36.0
+ Requires-Dist: opentelemetry-semantic-conventions>=0.57b0
  Requires-Dist: orjson>=3.9.0
- Requires-Dist: python-dotenv
- Requires-Dist: requests
- Requires-Dist: rich
  Requires-Dist: typer>=0.9.0
  Provides-Extra: langchain
  Requires-Dist: langchain-anthropic; extra == 'langchain'
  Requires-Dist: langchain-core; extra == 'langchain'
  Requires-Dist: langchain-huggingface; extra == 'langchain'
  Requires-Dist: langchain-openai; extra == 'langchain'
+ Provides-Extra: s3
+ Requires-Dist: boto3>=1.40.11; extra == 's3'
+ Provides-Extra: trainer
+ Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
  Description-Content-Type: text/markdown

  <div align="center">