judgeval 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +2 -5
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/api_scorer.py +7 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +175 -156
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +1 -2
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +40 -35
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/groq/__init__.py ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+HAS_GROQ = False
+groq_Groq = None
+groq_AsyncGroq = None
+
+try:
+    from groq import Groq, AsyncGroq  # type: ignore[import-untyped]
+
+    groq_Groq = Groq
+    groq_AsyncGroq = AsyncGroq
+    HAS_GROQ = True
+except ImportError:
+    pass
+
+__all__ = [
+    "HAS_GROQ",
+    "groq_Groq",
+    "groq_AsyncGroq",
+]
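Each of the new provider modules follows the same optional-import pattern: module-level `None` fallbacks, a guarded import, and a `HAS_*` flag. A minimal consumer sketch, assuming only the names exported above (the wrapper function itself is illustrative, not part of the SDK):

```python
# Sketch of consuming the optional-import pattern above; only HAS_GROQ and
# groq_Groq come from the diff, make_groq_client is a hypothetical helper.
from judgeval.tracer.llm.groq import HAS_GROQ, groq_Groq


def make_groq_client(api_key: str):
    if not HAS_GROQ or groq_Groq is None:
        # groq is an optional dependency; callers must handle its absence.
        raise RuntimeError("groq is not installed; `pip install groq` to enable it")
    return groq_Groq(api_key=api_key)
```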
judgeval/tracer/llm/openai/__init__.py ADDED
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+HAS_OPENAI = False
+openai_OpenAI = None
+openai_AsyncOpenAI = None
+openai_ChatCompletion = None
+openai_Response = None
+openai_ParsedChatCompletion = None
+
+try:
+    from openai import OpenAI, AsyncOpenAI
+    from openai.types.chat.chat_completion import ChatCompletion
+    from openai.types.responses.response import Response
+    from openai.types.chat import ParsedChatCompletion
+
+    openai_OpenAI = OpenAI
+    openai_AsyncOpenAI = AsyncOpenAI
+    openai_ChatCompletion = ChatCompletion
+    openai_Response = Response
+    openai_ParsedChatCompletion = ParsedChatCompletion
+    HAS_OPENAI = True
+except ImportError:
+    pass
+
+__all__ = [
+    "HAS_OPENAI",
+    "openai_OpenAI",
+    "openai_AsyncOpenAI",
+    "openai_ChatCompletion",
+    "openai_Response",
+    "openai_ParsedChatCompletion",
+]
judgeval/tracer/llm/providers.py CHANGED
@@ -1,85 +1,34 @@
 from __future__ import annotations
 from typing import Any, TypeAlias
 
-
-HAS_OPENAI
-openai_OpenAI
-openai_AsyncOpenAI
-openai_ChatCompletion
-openai_Response
-openai_ParsedChatCompletion
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-try:
-    from together import Together, AsyncTogether  # type: ignore[import-untyped]
-
-    together_Together = Together
-    together_AsyncTogether = AsyncTogether
-    HAS_TOGETHER = True
-except ImportError:
-    pass
-
-
-HAS_ANTHROPIC = False
-anthropic_Anthropic = None
-anthropic_AsyncAnthropic = None
-
-try:
-    from anthropic import Anthropic, AsyncAnthropic  # type: ignore[import-untyped]
-
-    anthropic_Anthropic = Anthropic
-    anthropic_AsyncAnthropic = AsyncAnthropic
-    HAS_ANTHROPIC = True
-except ImportError:
-    pass
-
-
-HAS_GOOGLE_GENAI = False
-google_genai_Client = None
-google_genai_cleint_AsyncClient = None
-
-try:
-    from google.genai import Client  # type: ignore[import-untyped]
-    from google.genai.client import AsyncClient  # type: ignore[import-untyped]
-
-    google_genai_Client = Client
-    google_genai_AsyncClient = AsyncClient
-    HAS_GOOGLE_GENAI = True
-except ImportError:
-    pass
-
-
-HAS_GROQ = False
-groq_Groq = None
-groq_AsyncGroq = None
-
-try:
-    from groq import Groq, AsyncGroq  # type: ignore[import-untyped]
-
-    groq_Groq = Groq
-    groq_AsyncGroq = AsyncGroq
-    HAS_GROQ = True
-except ImportError:
-    pass
+from judgeval.tracer.llm.openai import (
+    HAS_OPENAI,
+    openai_OpenAI,
+    openai_AsyncOpenAI,
+    openai_ChatCompletion,
+    openai_Response,
+    openai_ParsedChatCompletion,
+)
+from judgeval.tracer.llm.together import (
+    HAS_TOGETHER,
+    together_Together,
+    together_AsyncTogether,
+)
+from judgeval.tracer.llm.anthropic import (
+    HAS_ANTHROPIC,
+    anthropic_Anthropic,
+    anthropic_AsyncAnthropic,
+)
+from judgeval.tracer.llm.google import (
+    HAS_GOOGLE_GENAI,
+    google_genai_Client,
+    google_genai_AsyncClient,
+)
+from judgeval.tracer.llm.groq import (
+    HAS_GROQ,
+    groq_Groq,
+    groq_AsyncGroq,
+)
 
 
 # TODO: if we support dependency groups we can have this better type, but during runtime, we do
judgeval/tracer/llm/together/__init__.py ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+HAS_TOGETHER = False
+together_Together = None
+together_AsyncTogether = None
+
+try:
+    from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+    together_Together = Together
+    together_AsyncTogether = AsyncTogether
+    HAS_TOGETHER = True
+except ImportError:
+    pass
+
+__all__ = [
+    "HAS_TOGETHER",
+    "together_Together",
+    "together_AsyncTogether",
+]
judgeval/tracer/managers.py CHANGED
@@ -2,10 +2,9 @@ from __future__ import annotations
 
 from contextlib import asynccontextmanager, contextmanager
 from typing import TYPE_CHECKING, Dict, Optional, List, Any
-from judgeval.tracer.keys import
+from judgeval.tracer.keys import InternalAttributeKeys
 import uuid
 from judgeval.exceptions import JudgmentRuntimeError
-from judgeval.tracer.utils import set_span_attribute
 
 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -21,29 +20,17 @@ def sync_span_context(
     if span_attributes is None:
         span_attributes = {}
 
-
-
-
-
-
-
-
-
-
-
-
-            set_span_attribute(span, AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
-            if disable_partial_emit:
-                tracer.judgment_processor.set_internal_attribute(
-                    span_context=span.get_span_context(),
-                    key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-                    value=True,
-                )
-            yield span
-    finally:
-        current_cost_context.reset(cost_token)
-        child_cost = float(cost_context.get("cumulative_cost", 0.0))
-        tracer.add_cost_to_current_context(child_cost)
+    with tracer.get_tracer().start_as_current_span(
+        name=name,
+        attributes=span_attributes,
+    ) as span:
+        if disable_partial_emit:
+            tracer.judgment_processor.set_internal_attribute(
+                span_context=span.get_span_context(),
+                key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                value=True,
+            )
+        yield span
 
 
 @asynccontextmanager
@@ -56,29 +43,17 @@ async def async_span_context(
     if span_attributes is None:
         span_attributes = {}
 
-
-
-
-
-
-
-
-
-
-
-
-            set_span_attribute(span, AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
-            if disable_partial_emit:
-                tracer.judgment_processor.set_internal_attribute(
-                    span_context=span.get_span_context(),
-                    key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-                    value=True,
-                )
-            yield span
-    finally:
-        current_cost_context.reset(cost_token)
-        child_cost = float(cost_context.get("cumulative_cost", 0.0))
-        tracer.add_cost_to_current_context(child_cost)
+    with tracer.get_tracer().start_as_current_span(
+        name=name,
+        attributes=span_attributes,
+    ) as span:
+        if disable_partial_emit:
+            tracer.judgment_processor.set_internal_attribute(
+                span_context=span.get_span_context(),
+                key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                value=True,
+            )
+        yield span
 
 
 def create_agent_context(
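Both helpers remain generator-based context managers, so call sites are unchanged by the refactor. A hedged usage sketch, assuming an already-configured judgeval `Tracer` instance named `tracer`; the parameter names come from the diff, but the exact signature order is not shown there:

```python
# Hedged sketch: `tracer` is an assumed, pre-configured judgeval Tracer.
# name/span_attributes are the parameter names visible in the diff above.
from judgeval.tracer.managers import sync_span_context

with sync_span_context(tracer, name="retrieve-docs", span_attributes={"k": 5}) as span:
    span.set_attribute("judgment.custom_flag", True)  # standard OTel span API
```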
judgeval/tracer/processors/__init__.py CHANGED
@@ -2,16 +2,15 @@ from __future__ import annotations
 from typing import Optional, TYPE_CHECKING, Any
 from collections import defaultdict
 from opentelemetry.context import Context
-from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+from opentelemetry.trace.span import SpanContext
 from opentelemetry.sdk.trace.export import (
     BatchSpanProcessor,
 )
-from opentelemetry.sdk.resources import Resource
 from judgeval.tracer.exporters import JudgmentSpanExporter
 from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys, ResourceKeys
-from judgeval.api import JudgmentSyncClient
-from judgeval.logger import judgeval_logger
 from judgeval.utils.url import url_for
+from judgeval.utils.decorators import dont_throw
 from judgeval.version import get_version
 
 if TYPE_CHECKING:
@@ -33,75 +32,50 @@ class NoOpSpanProcessor(SpanProcessor):
 
 
 class JudgmentSpanProcessor(BatchSpanProcessor):
+    __slots__ = ("tracer", "resource_attributes", "_internal_attributes")
+
     def __init__(
         self,
         tracer: Tracer,
         project_name: str,
+        project_id: str,
         api_key: str,
         organization_id: str,
         /,
         *,
-        max_queue_size: int =
-
+        max_queue_size: int | None = None,
+        schedule_delay_millis: float | None = None,
+        max_export_batch_size: int | None = None,
+        export_timeout_millis: float | None = None,
        resource_attributes: Optional[dict[str, Any]] = None,
    ):
        self.tracer = tracer
-        self.project_name = project_name
-        self.api_key = api_key
-        self.organization_id = organization_id
-
-        # Resolve project_id
-        self.project_id = self._resolve_project_id()
 
-
-
+        attrs = {
+            ResourceKeys.SERVICE_NAME: project_name,
+            ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
+            ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
+            ResourceKeys.JUDGMENT_PROJECT_ID: project_id,
+            **(resource_attributes or {}),
+        }
+        self.resource_attributes = attrs
 
-        endpoint = url_for("/otel/v1/traces")
        super().__init__(
            JudgmentSpanExporter(
-                endpoint=
+                endpoint=url_for("/otel/v1/traces"),
                api_key=api_key,
                organization_id=organization_id,
+                project_id=project_id,
            ),
            max_queue_size=max_queue_size,
+            schedule_delay_millis=schedule_delay_millis,
+            max_export_batch_size=max_export_batch_size,
            export_timeout_millis=export_timeout_millis,
        )
        self._internal_attributes: defaultdict[tuple[int, int], dict[str, Any]] = (
            defaultdict(dict)
        )
 
-    def _resolve_project_id(self) -> str | None:
-        """Resolve project_id from project_name using the API."""
-        try:
-            client = JudgmentSyncClient(
-                api_key=self.api_key,
-                organization_id=self.organization_id,
-            )
-            return client.projects_resolve({"project_name": self.project_name})[
-                "project_id"
-            ]
-        except Exception:
-            return None
-
-    def _setup_resource_attributes(self, resource_attributes: dict[str, Any]) -> None:
-        """Set up resource attributes including project_id."""
-        resource_attributes.update(
-            {
-                ResourceKeys.SERVICE_NAME: self.project_name,
-                ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
-                ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
-            }
-        )
-
-        if self.project_id is not None:
-            resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
-        else:
-            judgeval_logger.error(
-                f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
-            )
-
-        self.resource_attributes = resource_attributes
-
    def _get_span_key(self, span_context: SpanContext) -> tuple[int, int]:
        return (span_context.trace_id, span_context.span_id)
 
@@ -132,38 +106,32 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
    def _cleanup_span_state(self, span_key: tuple[int, int]) -> None:
        self._internal_attributes.pop(span_key, None)
 
+    @dont_throw
    def emit_partial(self) -> None:
        current_span = self.tracer.get_current_span()
-        if
-
-
-
+        if (
+            not current_span
+            or not current_span.is_recording()
+            or not isinstance(current_span, ReadableSpan)
+        ):
            return
 
        span_context = current_span.get_span_context()
        if self.get_internal_attribute(
-            span_context
-            key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-            default=False,
+            span_context, InternalAttributeKeys.DISABLE_PARTIAL_EMIT, False
        ):
            return
 
-        current_update_id = self.increment_update_id(span_context=span_context)
-
        attributes = dict(current_span.attributes or {})
-        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] =
-
-        existing_resource_attrs = (
-            dict(current_span.resource.attributes) if current_span.resource else {}
+        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = self.increment_update_id(
+            span_context
        )
-        merged_resource_attrs = {**existing_resource_attrs, **self.resource_attributes}
-        merged_resource = Resource.create(merged_resource_attrs)
 
        partial_span = ReadableSpan(
            name=current_span.name,
            context=span_context,
            parent=current_span.parent,
-            resource=
+            resource=current_span.resource,
            attributes=attributes,
            events=current_span.events,
            links=current_span.links,
@@ -193,20 +161,11 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
        attributes = dict(span.attributes or {})
        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = 20
 
-        existing_resource_attrs = (
-            dict(span.resource.attributes) if span.resource else {}
-        )
-        merged_resource_attrs = {
-            **existing_resource_attrs,
-            **self.resource_attributes,
-        }
-        merged_resource = Resource.create(merged_resource_attrs)
-
        final_span = ReadableSpan(
            name=span.name,
            context=span.context,
            parent=span.parent,
-            resource=
+            resource=span.resource,
            attributes=attributes,
            events=span.events,
            links=span.links,
@@ -224,8 +183,10 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
 
 
 class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
+    __slots__ = ("resource_attributes",)
+
    def __init__(self):
-
+        self.resource_attributes = {}
 
    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        pass
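The processor keys its per-span bookkeeping by `(trace_id, span_id)` (see `_get_span_key` and `_internal_attributes` above). A standalone sketch of that pattern, with illustrative names rather than the SDK's:

```python
# Standalone illustration of per-span state keyed by (trace_id, span_id),
# mirroring the _internal_attributes/_get_span_key pattern in the diff above.
from collections import defaultdict
from typing import Any, Dict, Tuple

SpanKey = Tuple[int, int]


class SpanStateStore:
    def __init__(self) -> None:
        self._state: Dict[SpanKey, Dict[str, Any]] = defaultdict(dict)

    def set(self, key: SpanKey, name: str, value: Any) -> None:
        self._state[key][name] = value

    def get(self, key: SpanKey, name: str, default: Any = None) -> Any:
        return self._state.get(key, {}).get(name, default)

    def cleanup(self, key: SpanKey) -> None:
        # Drop state when the span ends so the map does not grow without bound.
        self._state.pop(key, None)
```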
judgeval/tracer/utils.py CHANGED
@@ -3,7 +3,6 @@ from opentelemetry.trace import Span
 from pydantic import BaseModel
 from typing import Callable, Optional
 from judgeval.scorers.api_scorer import TraceAPIScorerConfig
-from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 def set_span_attribute(span: Span, name: str, value: Any):
@@ -15,6 +14,6 @@ def set_span_attribute(span: Span, name: str, value: Any):
 
 class TraceScorerConfig(BaseModel):
     scorer: TraceAPIScorerConfig
-    model: str =
+    model: Optional[str] = None
     sampling_rate: float = 1.0
     run_condition: Optional[Callable[..., bool]] = None
judgeval/utils/file_utils.py CHANGED
@@ -85,12 +85,10 @@ def extract_scorer_name(scorer_file_path: str) -> str:
             and attr.__module__ == "scorer_module"
         ):
             try:
-                # Instantiate the scorer and get its name
                 scorer_instance = attr()
                 if hasattr(scorer_instance, "name"):
                     return scorer_instance.name
             except Exception:
-                # Skip if instantiation fails
                 continue
 
     raise AttributeError("No scorer class found or could be instantiated")
judgeval/utils/meta.py CHANGED
@@ -1,4 +1,7 @@
 from __future__ import annotations
+from typing import TypeVar, Dict, cast, Type
+
+T = TypeVar("T")
 
 
 class SingletonMeta(type):
@@ -6,9 +9,19 @@ class SingletonMeta(type):
     Metaclass for creating singleton classes.
     """
 
-    _instances:
+    _instances: Dict[type, object] = {}
+
+    def __call__(cls, *args, **kwargs) -> object:
+        if cls not in SingletonMeta._instances:
+            SingletonMeta._instances[cls] = super(SingletonMeta, cls).__call__(
+                *args, **kwargs
+            )
+        return SingletonMeta._instances[cls]
+
+    def get_instance(cls: Type[T]) -> T | None:
+        """Get the singleton instance if it exists, otherwise return None"""
+        instance = SingletonMeta._instances.get(cls, None)
+        return cast(T, instance) if instance is not None else None
+
 
-
-        if cls not in cls._instances:
-            cls._instances[cls] = super().__call__(*args, **kwargs)
-        return cls._instances[cls]
+__all__ = ("SingletonMeta",)
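The reworked `SingletonMeta` stores instances on the metaclass itself and adds a `get_instance` lookup that returns `None` before first construction. A small usage sketch; the `Config` class is hypothetical:

```python
# Usage sketch for SingletonMeta as shown in the diff; Config is a made-up example.
from judgeval.utils.meta import SingletonMeta


class Config(metaclass=SingletonMeta):
    def __init__(self, value: int = 0) -> None:
        self.value = value


a = Config(value=1)
b = Config(value=2)                # second call returns the cached instance
assert a is b and a.value == 1     # __init__ is not run again
assert Config.get_instance() is a  # would be None before the first Config(...) call
```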
judgeval/utils/testing.py CHANGED
@@ -7,23 +7,11 @@ from judgeval.exceptions import JudgmentTestError
 
 
 def assert_test_results(scoring_results: List[ScoringResult]) -> None:
-    """
-    Collects all failed scorers from the scoring results.
-
-    Args:
-        ScoringResults (List[ScoringResult]): List of scoring results to check
-
-    Returns:
-        None. Raises exceptions for any failed test cases.
-    """
     failed_cases: List[List[ScorerData]] = []
-
     for result in scoring_results:
         if not result.success:
-            # Create a test case context with all relevant fields
             test_case = []
             if result.scorers_data:
-                # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
                         test_case.append(scorer_data)
@@ -50,7 +38,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
     failed_tests = len(failed_cases)
     passed_tests = total_tests - failed_tests
 
-    # Print summary with colors
     rprint("\n" + "=" * 80)
     if failed_tests == 0:
         rprint(
@@ -62,7 +49,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
         )
     rprint("=" * 80 + "\n")
 
-    # Print individual test cases
     for i, result in enumerate(scoring_results):
         test_num = i + 1
         if result.success:
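With the docstring and comments stripped, the behavior of `assert_test_results` is unchanged: it prints a summary and raises when any scorer failed. A hedged sketch of wiring it into a test, assuming `scoring_results` comes from an evaluation run elsewhere:

```python
# Hedged sketch: scoring_results is assumed to come from a judgeval evaluation
# run; assert_test_results prints the summary and raises on failures (the module
# imports JudgmentTestError for that path, as the hunk header above shows).
from judgeval.utils.testing import assert_test_results


def test_agent_quality(scoring_results):
    assert_test_results(scoring_results)  # the test fails if any scorer failed
```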
judgeval/utils/version_check.py CHANGED
judgeval/version.py CHANGED
{judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.12.0
+Version: 0.13.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -17,14 +17,8 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: litellm<1.75.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
 Requires-Dist: opentelemetry-sdk>=1.36.0
-Requires-Dist: opentelemetry-semantic-conventions>=0.57b0
 Requires-Dist: orjson>=3.9.0
 Requires-Dist: typer>=0.9.0
-Provides-Extra: langchain
-Requires-Dist: langchain-anthropic; extra == 'langchain'
-Requires-Dist: langchain-core; extra == 'langchain'
-Requires-Dist: langchain-huggingface; extra == 'langchain'
-Requires-Dist: langchain-openai; extra == 'langchain'
 Provides-Extra: s3
 Requires-Dist: boto3>=1.40.11; extra == 's3'
 Provides-Extra: trainer