judgeval 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their registry. The information is provided for informational purposes only and reflects the changes between those package versions as they appear in the public registry.
Files changed (43)
  1. judgeval/__init__.py +5 -5
  2. judgeval/api/api_types.py +81 -12
  3. judgeval/cli.py +2 -1
  4. judgeval/constants.py +0 -6
  5. judgeval/data/evaluation_run.py +7 -8
  6. judgeval/data/judgment_types.py +97 -12
  7. judgeval/data/trace.py +108 -1
  8. judgeval/dataset/__init__.py +72 -23
  9. judgeval/env.py +5 -20
  10. judgeval/integrations/langgraph/__init__.py +9 -785
  11. judgeval/scorers/__init__.py +6 -0
  12. judgeval/scorers/api_scorer.py +15 -12
  13. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  14. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  15. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  17. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -35
  18. judgeval/scorers/score.py +1 -1
  19. judgeval/scorers/utils.py +1 -4
  20. judgeval/tracer/__init__.py +181 -162
  21. judgeval/tracer/exporters/__init__.py +4 -1
  22. judgeval/tracer/keys.py +15 -25
  23. judgeval/tracer/llm/__init__.py +0 -1
  24. judgeval/tracer/llm/anthropic/__init__.py +20 -0
  25. judgeval/tracer/llm/google/__init__.py +21 -0
  26. judgeval/tracer/llm/groq/__init__.py +20 -0
  27. judgeval/tracer/llm/openai/__init__.py +32 -0
  28. judgeval/tracer/llm/providers.py +28 -79
  29. judgeval/tracer/llm/together/__init__.py +20 -0
  30. judgeval/tracer/managers.py +23 -48
  31. judgeval/tracer/processors/__init__.py +36 -75
  32. judgeval/tracer/utils.py +3 -4
  33. judgeval/trainer/trainer.py +4 -4
  34. judgeval/utils/file_utils.py +0 -2
  35. judgeval/utils/meta.py +18 -5
  36. judgeval/utils/testing.py +0 -14
  37. judgeval/utils/version_check.py +2 -0
  38. judgeval/version.py +1 -1
  39. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
  40. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +43 -38
  41. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
  42. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
  43. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/google/__init__.py ADDED
@@ -0,0 +1,21 @@
+ from __future__ import annotations
+ 
+ HAS_GOOGLE_GENAI = False
+ google_genai_Client = None
+ google_genai_AsyncClient = None
+ 
+ try:
+     from google.genai import Client  # type: ignore[import-untyped]
+     from google.genai.client import AsyncClient  # type: ignore[import-untyped]
+ 
+     google_genai_Client = Client
+     google_genai_AsyncClient = AsyncClient
+     HAS_GOOGLE_GENAI = True
+ except ImportError:
+     pass
+ 
+ __all__ = [
+     "HAS_GOOGLE_GENAI",
+     "google_genai_Client",
+     "google_genai_AsyncClient",
+ ]
judgeval/tracer/llm/groq/__init__.py ADDED
@@ -0,0 +1,20 @@
+ from __future__ import annotations
+ 
+ HAS_GROQ = False
+ groq_Groq = None
+ groq_AsyncGroq = None
+ 
+ try:
+     from groq import Groq, AsyncGroq  # type: ignore[import-untyped]
+ 
+     groq_Groq = Groq
+     groq_AsyncGroq = AsyncGroq
+     HAS_GROQ = True
+ except ImportError:
+     pass
+ 
+ __all__ = [
+     "HAS_GROQ",
+     "groq_Groq",
+     "groq_AsyncGroq",
+ ]
judgeval/tracer/llm/openai/__init__.py ADDED
@@ -0,0 +1,32 @@
+ from __future__ import annotations
+ 
+ HAS_OPENAI = False
+ openai_OpenAI = None
+ openai_AsyncOpenAI = None
+ openai_ChatCompletion = None
+ openai_Response = None
+ openai_ParsedChatCompletion = None
+ 
+ try:
+     from openai import OpenAI, AsyncOpenAI
+     from openai.types.chat.chat_completion import ChatCompletion
+     from openai.types.responses.response import Response
+     from openai.types.chat import ParsedChatCompletion
+ 
+     openai_OpenAI = OpenAI
+     openai_AsyncOpenAI = AsyncOpenAI
+     openai_ChatCompletion = ChatCompletion
+     openai_Response = Response
+     openai_ParsedChatCompletion = ParsedChatCompletion
+     HAS_OPENAI = True
+ except ImportError:
+     pass
+ 
+ __all__ = [
+     "HAS_OPENAI",
+     "openai_OpenAI",
+     "openai_AsyncOpenAI",
+     "openai_ChatCompletion",
+     "openai_Response",
+     "openai_ParsedChatCompletion",
+ ]
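
Note: these new provider modules all follow the same optional-import shim — the HAS_* flag records whether the SDK imported, and the aliased names stay None when it did not. A minimal consumption sketch (the wrapper function below is illustrative, not part of the package):

    from judgeval.tracer.llm.openai import HAS_OPENAI, openai_OpenAI

    def maybe_instrument(client: object) -> object:
        # Check HAS_OPENAI before using openai_OpenAI, which is None
        # whenever the openai package is not installed.
        if HAS_OPENAI and isinstance(client, openai_OpenAI):
            pass  # attach tracing hooks here
        return client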
judgeval/tracer/llm/providers.py CHANGED
@@ -1,85 +1,34 @@
  from __future__ import annotations
  from typing import Any, TypeAlias
  
- 
- HAS_OPENAI = False
- openai_OpenAI = None
- openai_AsyncOpenAI = None
- openai_ChatCompletion = None
- openai_Response = None
- openai_ParsedChatCompletion = None
- 
- try:
-     from openai import OpenAI, AsyncOpenAI
-     from openai.types.chat.chat_completion import ChatCompletion
-     from openai.types.responses.response import Response
-     from openai.types.chat import ParsedChatCompletion
- 
-     openai_OpenAI = OpenAI
-     openai_AsyncOpenAI = AsyncOpenAI
-     openai_ChatCompletion = ChatCompletion
-     openai_Response = Response
-     openai_ParsedChatCompletion = ParsedChatCompletion
-     HAS_OPENAI = True
- except ImportError:
-     pass
- 
- 
- HAS_TOGETHER = False
- together_Together = None
- together_AsyncTogether = None
- 
- try:
-     from together import Together, AsyncTogether  # type: ignore[import-untyped]
- 
-     together_Together = Together
-     together_AsyncTogether = AsyncTogether
-     HAS_TOGETHER = True
- except ImportError:
-     pass
- 
- 
- HAS_ANTHROPIC = False
- anthropic_Anthropic = None
- anthropic_AsyncAnthropic = None
- 
- try:
-     from anthropic import Anthropic, AsyncAnthropic  # type: ignore[import-untyped]
- 
-     anthropic_Anthropic = Anthropic
-     anthropic_AsyncAnthropic = AsyncAnthropic
-     HAS_ANTHROPIC = True
- except ImportError:
-     pass
- 
- 
- HAS_GOOGLE_GENAI = False
- google_genai_Client = None
- google_genai_cleint_AsyncClient = None
- 
- try:
-     from google.genai import Client  # type: ignore[import-untyped]
-     from google.genai.client import AsyncClient  # type: ignore[import-untyped]
- 
-     google_genai_Client = Client
-     google_genai_AsyncClient = AsyncClient
-     HAS_GOOGLE_GENAI = True
- except ImportError:
-     pass
- 
- 
- HAS_GROQ = False
- groq_Groq = None
- groq_AsyncGroq = None
- 
- try:
-     from groq import Groq, AsyncGroq  # type: ignore[import-untyped]
- 
-     groq_Groq = Groq
-     groq_AsyncGroq = AsyncGroq
-     HAS_GROQ = True
- except ImportError:
-     pass
+ from judgeval.tracer.llm.openai import (
+     HAS_OPENAI,
+     openai_OpenAI,
+     openai_AsyncOpenAI,
+     openai_ChatCompletion,
+     openai_Response,
+     openai_ParsedChatCompletion,
+ )
+ from judgeval.tracer.llm.together import (
+     HAS_TOGETHER,
+     together_Together,
+     together_AsyncTogether,
+ )
+ from judgeval.tracer.llm.anthropic import (
+     HAS_ANTHROPIC,
+     anthropic_Anthropic,
+     anthropic_AsyncAnthropic,
+ )
+ from judgeval.tracer.llm.google import (
+     HAS_GOOGLE_GENAI,
+     google_genai_Client,
+     google_genai_AsyncClient,
+ )
+ from judgeval.tracer.llm.groq import (
+     HAS_GROQ,
+     groq_Groq,
+     groq_AsyncGroq,
+ )
  
  
  # TODO: if we support dependency groups we can have this better type, but during runtime, we do
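
With this change providers.py is purely an aggregation point: the try/except import guards live in the per-provider submodules, and providers.py re-exports the same names, so existing imports from judgeval.tracer.llm.providers keep working. A small downstream sketch (the helper function is illustrative):

    from judgeval.tracer.llm.providers import HAS_ANTHROPIC, HAS_GROQ, HAS_OPENAI

    def available_providers() -> list[str]:
        # Report which optional LLM SDKs imported successfully.
        flags = {"openai": HAS_OPENAI, "anthropic": HAS_ANTHROPIC, "groq": HAS_GROQ}
        return [name for name, present in flags.items() if present]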
judgeval/tracer/llm/together/__init__.py ADDED
@@ -0,0 +1,20 @@
+ from __future__ import annotations
+ 
+ HAS_TOGETHER = False
+ together_Together = None
+ together_AsyncTogether = None
+ 
+ try:
+     from together import Together, AsyncTogether  # type: ignore[import-untyped]
+ 
+     together_Together = Together
+     together_AsyncTogether = AsyncTogether
+     HAS_TOGETHER = True
+ except ImportError:
+     pass
+ 
+ __all__ = [
+     "HAS_TOGETHER",
+     "together_Together",
+     "together_AsyncTogether",
+ ]
judgeval/tracer/managers.py CHANGED
@@ -2,10 +2,9 @@ from __future__ import annotations
  
  from contextlib import asynccontextmanager, contextmanager
  from typing import TYPE_CHECKING, Dict, Optional, List, Any
- from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys
+ from judgeval.tracer.keys import InternalAttributeKeys
  import uuid
  from judgeval.exceptions import JudgmentRuntimeError
- from judgeval.tracer.utils import set_span_attribute
  
  if TYPE_CHECKING:
      from judgeval.tracer import Tracer
@@ -21,29 +20,17 @@ def sync_span_context(
      if span_attributes is None:
          span_attributes = {}
  
-     current_cost_context = tracer.get_current_cost_context()
- 
-     cost_context = {"cumulative_cost": 0.0}
- 
-     cost_token = current_cost_context.set(cost_context)
- 
-     try:
-         with tracer.get_tracer().start_as_current_span(
-             name=name,
-             attributes=span_attributes,
-         ) as span:
-             set_span_attribute(span, AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
-             if disable_partial_emit:
-                 tracer.judgment_processor.set_internal_attribute(
-                     span_context=span.get_span_context(),
-                     key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-                     value=True,
-                 )
-             yield span
-     finally:
-         current_cost_context.reset(cost_token)
-         child_cost = float(cost_context.get("cumulative_cost", 0.0))
-         tracer.add_cost_to_current_context(child_cost)
+     with tracer.get_tracer().start_as_current_span(
+         name=name,
+         attributes=span_attributes,
+     ) as span:
+         if disable_partial_emit:
+             tracer.judgment_processor.set_internal_attribute(
+                 span_context=span.get_span_context(),
+                 key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                 value=True,
+             )
+         yield span
  
  
  @asynccontextmanager
@@ -56,29 +43,17 @@ async def async_span_context(
      if span_attributes is None:
          span_attributes = {}
  
-     current_cost_context = tracer.get_current_cost_context()
- 
-     cost_context = {"cumulative_cost": 0.0}
- 
-     cost_token = current_cost_context.set(cost_context)
- 
-     try:
-         with tracer.get_tracer().start_as_current_span(
-             name=name,
-             attributes=span_attributes,
-         ) as span:
-             set_span_attribute(span, AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
-             if disable_partial_emit:
-                 tracer.judgment_processor.set_internal_attribute(
-                     span_context=span.get_span_context(),
-                     key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-                     value=True,
-                 )
-             yield span
-     finally:
-         current_cost_context.reset(cost_token)
-         child_cost = float(cost_context.get("cumulative_cost", 0.0))
-         tracer.add_cost_to_current_context(child_cost)
+     with tracer.get_tracer().start_as_current_span(
+         name=name,
+         attributes=span_attributes,
+     ) as span:
+         if disable_partial_emit:
+             tracer.judgment_processor.set_internal_attribute(
+                 span_context=span.get_span_context(),
+                 key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                 value=True,
+             )
+         yield span
  
  
  def create_agent_context(
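
Both context managers lose the cost-context bookkeeping (the ContextVar token dance and the JUDGMENT_CUMULATIVE_LLM_COST attribute) and reduce to thin wrappers over the OpenTelemetry tracer. A standalone sketch of the resulting shape, written against the OpenTelemetry API directly (names here are illustrative, not judgeval's):

    from contextlib import contextmanager
    from typing import Any, Dict, Optional

    from opentelemetry import trace

    @contextmanager
    def span_context(name: str, attributes: Optional[Dict[str, Any]] = None):
        # Start a span as the current span and yield it to the caller.
        tracer = trace.get_tracer("example")
        with tracer.start_as_current_span(name, attributes=attributes or {}) as span:
            yield span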
judgeval/tracer/processors/__init__.py CHANGED
@@ -2,16 +2,15 @@ from __future__ import annotations
  from typing import Optional, TYPE_CHECKING, Any
  from collections import defaultdict
  from opentelemetry.context import Context
- from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor, SpanContext
+ from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+ from opentelemetry.trace.span import SpanContext
  from opentelemetry.sdk.trace.export import (
      BatchSpanProcessor,
  )
- from opentelemetry.sdk.resources import Resource
  from judgeval.tracer.exporters import JudgmentSpanExporter
  from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys, ResourceKeys
- from judgeval.api import JudgmentSyncClient
- from judgeval.logger import judgeval_logger
  from judgeval.utils.url import url_for
+ from judgeval.utils.decorators import dont_throw
  from judgeval.version import get_version
  
  if TYPE_CHECKING:
@@ -33,75 +32,50 @@ class NoOpSpanProcessor(SpanProcessor):
  
  
  class JudgmentSpanProcessor(BatchSpanProcessor):
+     __slots__ = ("tracer", "resource_attributes", "_internal_attributes")
+ 
      def __init__(
          self,
          tracer: Tracer,
          project_name: str,
+         project_id: str,
          api_key: str,
          organization_id: str,
          /,
          *,
-         max_queue_size: int = 2**18,
-         export_timeout_millis: int = 30000,
+         max_queue_size: int | None = None,
+         schedule_delay_millis: float | None = None,
+         max_export_batch_size: int | None = None,
+         export_timeout_millis: float | None = None,
          resource_attributes: Optional[dict[str, Any]] = None,
      ):
          self.tracer = tracer
-         self.project_name = project_name
-         self.api_key = api_key
-         self.organization_id = organization_id
- 
-         # Resolve project_id
-         self.project_id = self._resolve_project_id()
  
-         # Set up resource attributes with project_id
-         self._setup_resource_attributes(resource_attributes or {})
+         attrs = {
+             ResourceKeys.SERVICE_NAME: project_name,
+             ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
+             ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
+             ResourceKeys.JUDGMENT_PROJECT_ID: project_id,
+             **(resource_attributes or {}),
+         }
+         self.resource_attributes = attrs
  
-         endpoint = url_for("/otel/v1/traces")
          super().__init__(
              JudgmentSpanExporter(
-                 endpoint=endpoint,
+                 endpoint=url_for("/otel/v1/traces"),
                  api_key=api_key,
                  organization_id=organization_id,
+                 project_id=project_id,
              ),
              max_queue_size=max_queue_size,
+             schedule_delay_millis=schedule_delay_millis,
+             max_export_batch_size=max_export_batch_size,
              export_timeout_millis=export_timeout_millis,
          )
          self._internal_attributes: defaultdict[tuple[int, int], dict[str, Any]] = (
              defaultdict(dict)
          )
  
-     def _resolve_project_id(self) -> str | None:
-         """Resolve project_id from project_name using the API."""
-         try:
-             client = JudgmentSyncClient(
-                 api_key=self.api_key,
-                 organization_id=self.organization_id,
-             )
-             return client.projects_resolve({"project_name": self.project_name})[
-                 "project_id"
-             ]
-         except Exception:
-             return None
- 
-     def _setup_resource_attributes(self, resource_attributes: dict[str, Any]) -> None:
-         """Set up resource attributes including project_id."""
-         resource_attributes.update(
-             {
-                 ResourceKeys.SERVICE_NAME: self.project_name,
-                 ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
-                 ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
-             }
-         )
- 
-         if self.project_id is not None:
-             resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
-         else:
-             judgeval_logger.error(
-                 f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
-             )
- 
-         self.resource_attributes = resource_attributes
- 
      def _get_span_key(self, span_context: SpanContext) -> tuple[int, int]:
          return (span_context.trace_id, span_context.span_id)
  
@@ -132,38 +106,32 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
      def _cleanup_span_state(self, span_key: tuple[int, int]) -> None:
          self._internal_attributes.pop(span_key, None)
  
+     @dont_throw
      def emit_partial(self) -> None:
          current_span = self.tracer.get_current_span()
-         if not current_span or not current_span.is_recording():
-             return
- 
-         if not isinstance(current_span, ReadableSpan):
+         if (
+             not current_span
+             or not current_span.is_recording()
+             or not isinstance(current_span, ReadableSpan)
+         ):
              return
  
          span_context = current_span.get_span_context()
          if self.get_internal_attribute(
-             span_context=span_context,
-             key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-             default=False,
+             span_context, InternalAttributeKeys.DISABLE_PARTIAL_EMIT, False
          ):
              return
  
-         current_update_id = self.increment_update_id(span_context=span_context)
- 
          attributes = dict(current_span.attributes or {})
-         attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = current_update_id
- 
-         existing_resource_attrs = (
-             dict(current_span.resource.attributes) if current_span.resource else {}
+         attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = self.increment_update_id(
+             span_context
          )
-         merged_resource_attrs = {**existing_resource_attrs, **self.resource_attributes}
-         merged_resource = Resource.create(merged_resource_attrs)
  
          partial_span = ReadableSpan(
              name=current_span.name,
              context=span_context,
              parent=current_span.parent,
-             resource=merged_resource,
+             resource=current_span.resource,
              attributes=attributes,
              events=current_span.events,
              links=current_span.links,
@@ -193,20 +161,11 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
          attributes = dict(span.attributes or {})
          attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = 20
  
-         existing_resource_attrs = (
-             dict(span.resource.attributes) if span.resource else {}
-         )
-         merged_resource_attrs = {
-             **existing_resource_attrs,
-             **self.resource_attributes,
-         }
-         merged_resource = Resource.create(merged_resource_attrs)
- 
          final_span = ReadableSpan(
              name=span.name,
              context=span.context,
              parent=span.parent,
-             resource=merged_resource,
+             resource=span.resource,
              attributes=attributes,
              events=span.events,
              links=span.links,
@@ -224,8 +183,10 @@
  
  
  class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
+     __slots__ = ("resource_attributes",)
+ 
      def __init__(self):
-         pass
+         self.resource_attributes = {}
  
      def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
          pass
judgeval/tracer/utils.py CHANGED
@@ -2,8 +2,7 @@ from typing import Any
  from opentelemetry.trace import Span
  from pydantic import BaseModel
  from typing import Callable, Optional
- from judgeval.scorers.api_scorer import APIScorerConfig
- from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+ from judgeval.scorers.api_scorer import TraceAPIScorerConfig
  
  
  def set_span_attribute(span: Span, name: str, value: Any):
@@ -14,7 +13,7 @@ def set_span_attribute(span: Span, name: str, value: Any):
  
  
  class TraceScorerConfig(BaseModel):
-     scorer: APIScorerConfig
-     model: str = JUDGMENT_DEFAULT_GPT_MODEL
+     scorer: TraceAPIScorerConfig
+     model: Optional[str] = None
      sampling_rate: float = 1.0
      run_condition: Optional[Callable[..., bool]] = None
judgeval/trainer/trainer.py CHANGED
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
  from judgeval.tracer.exporters import InMemorySpanExporter
  from judgeval.tracer.keys import AttributeKeys
  from judgeval import JudgmentClient
- from judgeval.scorers import BaseScorer, APIScorerConfig
+ from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
  from judgeval.data import Example
  from .console import _spinner_progress, _print_progress, _print_progress_update
  from judgeval.exceptions import JudgmentRuntimeError
@@ -156,7 +156,7 @@ class JudgmentTrainer:
      async def generate_rollouts_and_rewards(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          num_prompts_per_step: Optional[int] = None,
          num_generations_per_prompt: Optional[int] = None,
@@ -266,7 +266,7 @@ class JudgmentTrainer:
      async def run_reinforcement_learning(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
      ) -> ModelConfig:
          """
@@ -372,7 +372,7 @@ class JudgmentTrainer:
      async def train(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          rft_provider: Optional[str] = None,
      ) -> ModelConfig:
judgeval/utils/file_utils.py CHANGED
@@ -85,12 +85,10 @@ def extract_scorer_name(scorer_file_path: str) -> str:
              and attr.__module__ == "scorer_module"
          ):
              try:
-                 # Instantiate the scorer and get its name
                  scorer_instance = attr()
                  if hasattr(scorer_instance, "name"):
                      return scorer_instance.name
              except Exception:
-                 # Skip if instantiation fails
                  continue
  
      raise AttributeError("No scorer class found or could be instantiated")
judgeval/utils/meta.py CHANGED
@@ -1,4 +1,7 @@
  from __future__ import annotations
+ from typing import TypeVar, Dict, cast, Type
+ 
+ T = TypeVar("T")
  
  
  class SingletonMeta(type):
@@ -6,9 +9,19 @@ class SingletonMeta(type):
      Metaclass for creating singleton classes.
      """
  
-     _instances: dict[type, object] = {}
+     _instances: Dict[type, object] = {}
+ 
+     def __call__(cls, *args, **kwargs) -> object:
+         if cls not in SingletonMeta._instances:
+             SingletonMeta._instances[cls] = super(SingletonMeta, cls).__call__(
+                 *args, **kwargs
+             )
+         return SingletonMeta._instances[cls]
+ 
+     def get_instance(cls: Type[T]) -> T | None:
+         """Get the singleton instance if it exists, otherwise return None"""
+         instance = SingletonMeta._instances.get(cls, None)
+         return cast(T, instance) if instance is not None else None
+ 
  
-     def __call__(cls, *args, **kwargs):
-         if cls not in cls._instances:
-             cls._instances[cls] = super().__call__(*args, **kwargs)
-         return cls._instances[cls]
+ __all__ = ("SingletonMeta",)
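
The metaclass now also exposes get_instance, which returns the cached singleton (or None) without constructing one. A usage sketch against the SingletonMeta defined above (the Config class is illustrative):

    class Config(metaclass=SingletonMeta):
        def __init__(self, value: int = 0):
            self.value = value

    first = Config(1)
    second = Config(2)  # returns the cached instance; __init__ does not run again
    assert first is second and first.value == 1
    assert Config.get_instance() is first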
judgeval/utils/testing.py CHANGED
@@ -7,23 +7,11 @@ from judgeval.exceptions import JudgmentTestError
  
  
  def assert_test_results(scoring_results: List[ScoringResult]) -> None:
-     """
-     Collects all failed scorers from the scoring results.
- 
-     Args:
-         ScoringResults (List[ScoringResult]): List of scoring results to check
- 
-     Returns:
-         None. Raises exceptions for any failed test cases.
-     """
      failed_cases: List[List[ScorerData]] = []
- 
      for result in scoring_results:
          if not result.success:
-             # Create a test case context with all relevant fields
              test_case = []
              if result.scorers_data:
-                 # If the result was not successful, check each scorer_data
                  for scorer_data in result.scorers_data:
                      if not scorer_data.success:
                          test_case.append(scorer_data)
@@ -50,7 +38,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
      failed_tests = len(failed_cases)
      passed_tests = total_tests - failed_tests
  
-     # Print summary with colors
      rprint("\n" + "=" * 80)
      if failed_tests == 0:
          rprint(
@@ -62,7 +49,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
          )
      rprint("=" * 80 + "\n")
  
-     # Print individual test cases
      for i, result in enumerate(scoring_results):
          test_num = i + 1
          if result.success:
judgeval/utils/version_check.py CHANGED
@@ -2,8 +2,10 @@ import importlib.metadata
  import httpx
  import threading
  from judgeval.logger import judgeval_logger
+ from judgeval.utils.decorators import use_once
  
  
+ @use_once
  def check_latest_version(package_name: str = "judgeval"):
      def _check():
          try:
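
The body of use_once is not part of this diff; judging from the name, it makes check_latest_version run at most once per process. A plausible minimal equivalent, offered only as an assumption about the decorator's behavior:

    import functools

    def use_once(fn):
        # Hypothetical stand-in: invoke fn on the first call, no-op afterwards.
        called = False

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            nonlocal called
            if called:
                return None
            called = True
            return fn(*args, **kwargs)

        return wrapper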
judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.0.0"
+ __version__ = "0.13.0"
  
  
  def get_version() -> str: