judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/api/api.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Literal, List, Dict, Any
+from typing import Literal, List, Dict, Any, Union
 from requests import exceptions
 from judgeval.common.api.constants import (
     JUDGMENT_TRACES_FETCH_API_URL,
@@ -25,6 +25,8 @@ from judgeval.common.api.constants import (
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
+    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -48,9 +50,12 @@ from judgeval.common.api.constants import (
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
+    CheckExampleKeysPayload,
 )
 from judgeval.utils.requests import requests
 
+import orjson
+
 
 class JudgmentAPIException(exceptions.HTTPError):
     """
@@ -65,7 +70,7 @@ class JudgmentAPIException(exceptions.HTTPError):
         self.request = request
 
     @property
-    def status_code(self) -> int
+    def status_code(self) -> Union[int, None]:
         """Get the HTTP status code from the response."""
         return self.response.status_code if self.response else None
 
@@ -114,8 +119,15 @@ class JudgmentApiClient:
         try:
             r.raise_for_status()
         except exceptions.HTTPError as e:
+            try:
+                detail = r.json().get("detail", "")
+            except Exception:
+                detail = r.text
+
             raise JudgmentAPIException(
-                f"HTTP {r.status_code}: {r.reason}
+                f"HTTP {r.status_code}: {r.reason}, {detail}",
+                response=r,
+                request=e.request,
             )
 
         return r.json()
@@ -218,6 +230,14 @@ class JudgmentApiClient:
         }
         return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
 
+    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
+        payload: CheckExampleKeysPayload = {
+            "keys": keys,
+            "eval_name": eval_name,
+            "project_name": project_name,
+        }
+        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
+
     def save_scorer(self, name: str, prompt: str, options: dict):
         payload: ScorerSavePayload = {
             "name": name,
@@ -279,7 +299,7 @@ class JudgmentApiClient:
         project_name: str,
         examples: List[Dict[str, Any]],
         traces: List[Dict[str, Any]],
-        overwrite: bool,
+        overwrite: bool = False,
     ):
         payload: DatasetPushPayload = {
             "dataset_alias": dataset_alias,
@@ -302,6 +322,18 @@ class JudgmentApiClient:
             "POST", JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL, payload
         )
 
+    def append_traces(
+        self, dataset_alias: str, project_name: str, traces: List[Dict[str, Any]]
+    ):
+        payload: DatasetAppendPayload = {
+            "dataset_alias": dataset_alias,
+            "project_name": project_name,
+            "traces": traces,
+        }
+        return self._do_request(
+            "POST", JUDGMENT_DATASETS_APPEND_TRACES_API_URL, payload
+        )
+
     def pull_dataset(self, dataset_alias: str, project_name: str):
         payload: DatasetPullPayload = {
             "dataset_alias": dataset_alias,
@@ -347,6 +379,5 @@ class JudgmentApiClient:
         except Exception as e:
            return f"<Unserializable object of type {type(obj).__name__}: {e}>"
 
-
-
-        return json.dumps(data, default=fallback_encoder)
+        # orjson returns bytes, so we need to decode to str
+        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
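The serialization change at the end of this file swaps `json.dumps` for `orjson.dumps`, which returns `bytes` rather than `str`, hence the added `.decode("utf-8")`. A minimal standalone illustration of that difference (not judgeval code):

```python
import json

import orjson

data = {"keys": ["input", "expected_output"], "eval_name": "demo"}

# json.dumps returns a str; orjson.dumps returns compact UTF-8 bytes
assert isinstance(json.dumps(data), str)
assert isinstance(orjson.dumps(data), bytes)

# decoding the bytes yields the string the rest of the code expects
assert orjson.dumps(data).decode("utf-8") == '{"keys":["input","expected_output"],"eval_name":"demo"}'
```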
judgeval/common/api/constants.py
CHANGED
@@ -51,6 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
 JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
+JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
 
 
 # Evaluation API Payloads
@@ -90,9 +91,16 @@ class EvalRunNameExistsPayload(TypedDict):
     judgment_api_key: str
 
 
+class CheckExampleKeysPayload(TypedDict):
+    keys: List[str]
+    eval_name: str
+    project_name: str
+
+
 # Datasets API
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -134,7 +142,7 @@ class DatasetStatsPayload(TypedDict):
 
 
 # Projects API
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval/"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 
 
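A short sketch of how the new endpoint constant and payload type fit together. The field values below are invented for illustration; in judgeval the request itself is issued by `JudgmentApiClient.check_example_keys()` shown in the api.py diff above:

```python
from judgeval.common.api.constants import (
    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
    CheckExampleKeysPayload,
)

# TypedDicts are plain dicts at runtime; the annotation only drives static checks
payload: CheckExampleKeysPayload = {
    "keys": ["input", "expected_output"],  # hypothetical example keys
    "eval_name": "demo-eval-run",          # hypothetical eval run name
    "project_name": "demo-project",        # hypothetical project name
}

# JudgmentApiClient.check_example_keys() POSTs this payload to
# JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL via its _do_request helper.
```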
judgeval/common/storage/s3_storage.py
CHANGED
@@ -1,6 +1,6 @@
 import os
-import json
 import boto3
+import orjson
 from typing import Optional
 from datetime import datetime, UTC
 from botocore.exceptions import ClientError
@@ -85,8 +85,7 @@ class S3Storage:
         timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
 
-
-        trace_json = json.dumps(trace_data)
+        trace_json = orjson.dumps(trace_data).decode("utf-8")
 
         self.s3_client.put_object(
             Bucket=self.bucket_name,
judgeval/common/tracer/core.py
CHANGED
@@ -32,6 +32,7 @@ from typing import (
 )
 import types
 
+
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
@@ -45,6 +46,7 @@ from openai.types.chat import ParsedChatCompletion
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
+from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
@@ -67,6 +69,8 @@ ApiClient: TypeAlias = Union[
     AsyncTogether,
     genai.Client,
     genai.client.AsyncClient,
+    Groq,
+    AsyncGroq,
 ]
 SpanType: TypeAlias = str
 
@@ -79,7 +83,7 @@ class TraceClient:
         tracer: Tracer,
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str
+        project_name: Union[str, None] = None,
         enable_monitoring: bool = True,
         enable_evaluations: bool = True,
         parent_trace_id: Optional[str] = None,
@@ -414,8 +418,6 @@ class TraceClient:
                 self.start_time or time.time(), timezone.utc
             ).isoformat(),
             "duration": total_duration,
-            "trace_spans": [span.model_dump() for span in self.trace_spans],
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name,
@@ -850,9 +852,9 @@ class Tracer:
 
     def __init__(
         self,
-        api_key: str
-        organization_id: str
-        project_name: str
+        api_key: Union[str, None] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Union[str, None] = os.getenv("JUDGMENT_ORG_ID"),
+        project_name: Union[str, None] = None,
         deep_tracing: bool = False,  # Deep tracing is disabled by default
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower()
         == "true",
@@ -905,8 +907,8 @@ class Tracer:
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
-        self.span_id_to_previous_span_id: Dict[str, str
-        self.trace_id_to_previous_trace: Dict[str, TraceClient
+        self.span_id_to_previous_span_id: Dict[str, Union[str, None]] = {}
+        self.trace_id_to_previous_trace: Dict[str, Union[TraceClient, None]] = {}
         self.current_span_id: Optional[str] = None
         self.current_trace: Optional[TraceClient] = None
         self.trace_across_async_contexts: bool = trace_across_async_contexts
@@ -958,7 +960,9 @@ class Tracer:
             self.enable_monitoring = False
             self.enable_evaluations = False
 
-    def set_current_span(
+    def set_current_span(
+        self, span_id: str
+    ) -> Optional[contextvars.Token[Union[str, None]]]:
         self.span_id_to_previous_span_id[span_id] = self.current_span_id
         self.current_span_id = span_id
         Tracer.current_span_id = span_id
@@ -981,7 +985,7 @@ class Tracer:
 
     def reset_current_span(
         self,
-        token: Optional[contextvars.Token[str
+        token: Optional[contextvars.Token[Union[str, None]]] = None,
         span_id: Optional[str] = None,
     ):
         try:
@@ -997,7 +1001,7 @@ class Tracer:
 
     def set_current_trace(
         self, trace: TraceClient
-    ) -> Optional[contextvars.Token[TraceClient
+    ) -> Optional[contextvars.Token[Union[TraceClient, None]]]:
         """
         Set the current trace context in contextvars
         """
@@ -1030,7 +1034,7 @@ class Tracer:
 
     def reset_current_trace(
         self,
-        token: Optional[contextvars.Token[TraceClient
+        token: Optional[contextvars.Token[Union[TraceClient, None]]] = None,
        trace_id: Optional[str] = None,
     ):
         try:
@@ -1046,7 +1050,7 @@ class Tracer:
 
     @contextmanager
     def trace(
-        self, name: str, project_name: str
+        self, name: str, project_name: Union[str, None] = None
     ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
@@ -1692,25 +1696,31 @@ def wrap(
         return wrapper
 
     if isinstance(client, (OpenAI)):
-        client.chat.completions
-        client.responses
-        client.beta.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
+        setattr(client.responses, "create", wrapped(original_responses_create))
+        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
     elif isinstance(client, (AsyncOpenAI)):
-        client.chat.completions
-        client.responses
-
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
+        setattr(client.responses, "create", wrapped_async(original_responses_create))
+        setattr(
+            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+        )
     elif isinstance(client, (Together)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (AsyncTogether)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
     elif isinstance(client, (Anthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped(original_create))
     elif isinstance(client, (AsyncAnthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped_async(original_create))
     elif isinstance(client, (genai.Client)):
-        client.models
+        setattr(client.models, "generate_content", wrapped(original_create))
     elif isinstance(client, (genai.client.AsyncClient)):
-        client.models
+        setattr(client.models, "generate_content", wrapped_async(original_create))
+    elif isinstance(client, (Groq)):
+        setattr(client.chat.completions, "create", wrapped(original_create))
+    elif isinstance(client, (AsyncGroq)):
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
@@ -1745,6 +1755,8 @@ def _get_client_config(
             None,
             client.beta.chat.completions.parse,
         )
+    elif isinstance(client, (Groq, AsyncGroq)):
+        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Together, AsyncTogether)):
         return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
@@ -1783,9 +1795,17 @@ def _format_output_data(
     if isinstance(client, (OpenAI, AsyncOpenAI)):
         if isinstance(response, ChatCompletion):
             model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens =
-
+            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+            completion_tokens = (
+                response.usage.completion_tokens if response.usage else 0
+            )
+            cache_read_input_tokens = (
+                response.usage.prompt_tokens_details.cached_tokens
+                if response.usage
+                and response.usage.prompt_tokens_details
+                and response.usage.prompt_tokens_details.cached_tokens
+                else 0
+            )
 
             if isinstance(response, ParsedChatCompletion):
                 message_content = response.choices[0].message.parsed
@@ -1793,10 +1813,19 @@ def _format_output_data(
                 message_content = response.choices[0].message.content
         elif isinstance(response, Response):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens =
-
+            prompt_tokens = response.usage.input_tokens if response.usage else 0
+            completion_tokens = response.usage.output_tokens if response.usage else 0
+            cache_read_input_tokens = (
+                response.usage.input_tokens_details.cached_tokens
+                if response.usage and response.usage.input_tokens_details
+                else 0
+            )
+            if hasattr(response.output[0], "content"):
+                message_content = "".join(
+                    seg.text
+                    for seg in response.output[0].content
+                    if hasattr(seg, "text")
+                )
 
     # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
     elif isinstance(client, (Together, AsyncTogether)):
@@ -1821,6 +1850,11 @@ def _format_output_data(
         cache_read_input_tokens = response.usage.cache_read_input_tokens
         cache_creation_input_tokens = response.usage.cache_creation_input_tokens
         message_content = response.content[0].text
+    elif isinstance(client, (Groq, AsyncGroq)):
+        model_name = "groq/" + response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     else:
         judgeval_logger.warning(f"Unsupported client type: {type(client)}")
         return None, None
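A sketch of what the new Groq branches enable, assuming `wrap` and `Tracer` are imported from the module this diff modifies (`judgeval.common.tracer.core`); the model id, environment variables, and project name below are illustrative, not taken from judgeval documentation:

```python
import os

from groq import Groq

from judgeval.common.tracer.core import Tracer, wrap

# Tracer now falls back to JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment
tracer = Tracer(project_name="demo-project")  # hypothetical project name

# wrap() now recognizes Groq/AsyncGroq and swaps chat.completions.create for the
# traced wrapper; calls are classified as "GROQ_API_CALL" by _get_client_config
client = wrap(Groq(api_key=os.environ["GROQ_API_KEY"]))

response = client.chat.completions.create(
    model="llama-3.1-8b-instant",  # illustrative model id
    messages=[{"role": "user", "content": "Say hello"}],
)
print(response.choices[0].message.content)
```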
judgeval/common/tracer/otel_span_processor.py
CHANGED
@@ -11,11 +11,10 @@ import threading
 from typing import Any, Dict, Optional
 
 from opentelemetry.context import Context
-from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.sdk.trace import ReadableSpan, Span
 from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanProcessor
-from opentelemetry.trace import
+from opentelemetry.trace import Status, StatusCode, SpanContext, TraceFlags
 from opentelemetry.trace.span import TraceState, INVALID_SPAN_CONTEXT
-from opentelemetry.util.types import Attributes
 
 from judgeval.common.logger import judgeval_logger
 from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
@@ -51,8 +50,8 @@ class SimpleReadableSpan(ReadableSpan):
             Status(StatusCode.ERROR) if trace_span.error else Status(StatusCode.OK)
         )
 
-        self._attributes =
-            trace_span, span_state
+        self._attributes: Dict[str, Any] = (
+            SpanTransformer.trace_span_to_otel_attributes(trace_span, span_state)
         )
 
         try:
@@ -81,53 +80,8 @@ class SimpleReadableSpan(ReadableSpan):
         self._parent: Optional[SpanContext] = None
         self._events: list[Any] = []
         self._links: list[Any] = []
-        self._resource: Optional[Any] = None
         self._instrumentation_info: Optional[Any] = None
 
-    @property
-    def name(self) -> str:
-        return self._name
-
-    @property
-    def context(self) -> SpanContext:
-        return self._context
-
-    @property
-    def parent(self) -> Optional[SpanContext]:
-        return self._parent
-
-    @property
-    def start_time(self) -> Optional[int]:
-        return self._start_time
-
-    @property
-    def end_time(self) -> Optional[int]:
-        return self._end_time
-
-    @property
-    def status(self) -> Status:
-        return self._status
-
-    @property
-    def attributes(self) -> Optional[Attributes]:
-        return self._attributes
-
-    @property
-    def events(self):
-        return self._events
-
-    @property
-    def links(self):
-        return self._links
-
-    @property
-    def resource(self) -> Optional[Any]:
-        return self._resource
-
-    @property
-    def instrumentation_info(self) -> Optional[Any]:
-        return self._instrumentation_info
-
 
 class JudgmentSpanProcessor(SpanProcessor, SpanProcessorBase):
     """
judgeval/common/tracer/span_transformer.py
CHANGED
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
-import json
 import time
 import uuid
+import orjson
 from datetime import datetime, timezone
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Mapping, Optional, Union
 
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel
@@ -16,11 +16,15 @@ from judgeval.evaluation_run import EvaluationRun
 class SpanTransformer:
     @staticmethod
     def _needs_json_serialization(value: Any) -> bool:
+        """
+        Check if the value needs JSON serialization.
+        Returns True if the value is complex and needs serialization.
+        """
         if value is None:
             return False
 
-
-        if isinstance(value,
+        # Basic JSON-serializable types don't need serialization
+        if isinstance(value, (str, int, float, bool)):
             return False
 
         complex_types = (dict, list, tuple, set, BaseModel)
@@ -28,7 +32,7 @@ class SpanTransformer:
             return True
 
         try:
-
+            orjson.dumps(value)
             return False
         except (TypeError, ValueError):
             return True
@@ -39,15 +43,15 @@ class SpanTransformer:
             if obj is None:
                 return None
             try:
-                return
+                return orjson.dumps(obj, default=str).decode("utf-8")
             except Exception:
-                return
+                return orjson.dumps(str(obj)).decode("utf-8")
         else:
             if not isinstance(obj, str):
                 return obj
             try:
-                return
-            except (
+                return orjson.loads(obj)
+            except (orjson.JSONDecodeError, TypeError, ValueError):
                 return obj
 
     @staticmethod
@@ -99,7 +103,9 @@ class SpanTransformer:
         return attributes
 
     @staticmethod
-    def otel_attributes_to_judgment_data(
+    def otel_attributes_to_judgment_data(
+        attributes: Mapping[str, Any],
+    ) -> Dict[str, Any]:
         judgment_data: Dict[str, Any] = {}
 
         for key, value in attributes.items():
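The transformer now round-trips complex attribute values through orjson, stringifying anything orjson cannot encode via `default=str` and parsing attribute strings back with `orjson.loads`. A standalone sketch of that behavior (not judgeval code):

```python
import orjson


class Opaque:
    def __repr__(self) -> str:
        return "Opaque()"


value = {"count": 3, "obj": Opaque()}

# default=str mirrors the serializer above: unknown objects are stringified
encoded = orjson.dumps(value, default=str).decode("utf-8")
print(encoded)  # {"count":3,"obj":"Opaque()"}

# reading attributes back applies the inverse with orjson.loads
decoded = orjson.loads(encoded)
assert decoded == {"count": 3, "obj": "Opaque()"}
```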