judgeval 0.0.31__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +3 -1
- judgeval/common/tracer.py +262 -65
- judgeval/constants.py +1 -1
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +2 -2
- judgeval/data/sequence.py +12 -16
- judgeval/data/sequence_run.py +2 -0
- judgeval/judgment_client.py +32 -93
- judgeval/run_evaluation.py +1 -1
- judgeval/version_check.py +22 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.32.dist-info}/METADATA +1 -1
- {judgeval-0.0.31.dist-info → judgeval-0.0.32.dist-info}/RECORD +14 -13
- {judgeval-0.0.31.dist-info → judgeval-0.0.32.dist-info}/WHEEL +0 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.32.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# Import key components that should be publicly accessible
|
2
2
|
from judgeval.clients import client, together_client
|
3
3
|
from judgeval.judgment_client import JudgmentClient
|
4
|
+
from judgeval.version_check import check_latest_version
|
5
|
+
check_latest_version()
|
4
6
|
|
5
7
|
__all__ = [
|
6
8
|
# Clients
|
7
9
|
'client',
|
8
10
|
'together_client',
|
9
11
|
'JudgmentClient',
|
10
|
-
]
|
12
|
+
]
|
judgeval/common/tracer.py
CHANGED
@@ -11,11 +11,12 @@ import time
|
|
11
11
|
import uuid
|
12
12
|
import warnings
|
13
13
|
import contextvars
|
14
|
+
import sys
|
14
15
|
from contextlib import contextmanager
|
15
16
|
from dataclasses import dataclass, field
|
16
17
|
from datetime import datetime
|
17
18
|
from http import HTTPStatus
|
18
|
-
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union, Callable, Awaitable
|
19
|
+
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union, Callable, Awaitable, Set
|
19
20
|
from rich import print as rprint
|
20
21
|
|
21
22
|
# Third-party imports
|
@@ -27,6 +28,7 @@ from rich import print as rprint
|
|
27
28
|
from openai import OpenAI, AsyncOpenAI
|
28
29
|
from together import Together, AsyncTogether
|
29
30
|
from anthropic import Anthropic, AsyncAnthropic
|
31
|
+
from google import genai
|
30
32
|
|
31
33
|
# Local application/library-specific imports
|
32
34
|
from judgeval.constants import (
|
@@ -50,10 +52,11 @@ import concurrent.futures
|
|
50
52
|
|
51
53
|
# Define context variables for tracking the current trace and the current span within a trace
|
52
54
|
current_trace_var = contextvars.ContextVar('current_trace', default=None)
|
53
|
-
current_span_var = contextvars.ContextVar('current_span', default=None) #
|
55
|
+
current_span_var = contextvars.ContextVar('current_span', default=None) # ContextVar for the active span name
|
56
|
+
in_traced_function_var = contextvars.ContextVar('in_traced_function', default=False) # Track if we're in a traced function
|
54
57
|
|
55
58
|
# Define type aliases for better code readability and maintainability
|
56
|
-
ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether] # Supported API clients
|
59
|
+
ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic, AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.Client, genai.client.AsyncClient] # Supported API clients
|
57
60
|
TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
|
58
61
|
SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
|
59
62
|
@dataclass
|
@@ -888,6 +891,13 @@ class TraceClient:
|
|
888
891
|
"parent_trace_id": self.parent_trace_id,
|
889
892
|
"parent_name": self.parent_name
|
890
893
|
}
|
894
|
+
# --- Log trace data before saving ---
|
895
|
+
try:
|
896
|
+
rprint(f"[TraceClient.save] Saving trace data for trace_id {self.trace_id}:")
|
897
|
+
rprint(json.dumps(trace_data, indent=2))
|
898
|
+
except Exception as log_e:
|
899
|
+
rprint(f"[TraceClient.save] Error logging trace data: {log_e}")
|
900
|
+
# --- End logging ---
|
891
901
|
self.trace_manager_client.save_trace(trace_data)
|
892
902
|
|
893
903
|
return self.trace_id, trace_data
|
@@ -910,7 +920,8 @@ class Tracer:
|
|
910
920
|
rules: Optional[List[Rule]] = None, # Added rules parameter
|
911
921
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
|
912
922
|
enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
|
913
|
-
enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
|
923
|
+
enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true",
|
924
|
+
deep_tracing: bool = True # NEW: Enable deep tracing by default
|
914
925
|
):
|
915
926
|
if not hasattr(self, 'initialized'):
|
916
927
|
if not api_key:
|
@@ -927,6 +938,7 @@ class Tracer:
|
|
927
938
|
self.initialized: bool = True
|
928
939
|
self.enable_monitoring: bool = enable_monitoring
|
929
940
|
self.enable_evaluations: bool = enable_evaluations
|
941
|
+
self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
|
930
942
|
elif hasattr(self, 'project_name') and self.project_name != project_name:
|
931
943
|
warnings.warn(
|
932
944
|
f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
|
@@ -941,12 +953,52 @@ class Tracer:
|
|
941
953
|
"""
|
942
954
|
current_trace_var.set(trace)
|
943
955
|
|
944
|
-
def get_current_trace(self):
|
956
|
+
def get_current_trace(self) -> Optional[TraceClient]:
|
945
957
|
"""
|
946
958
|
Get the current trace context from contextvars
|
947
959
|
"""
|
948
960
|
return current_trace_var.get()
|
949
961
|
|
962
|
+
def _apply_deep_tracing(self, func, span_type="span"):
|
963
|
+
"""
|
964
|
+
Apply deep tracing to all functions in the same module as the given function.
|
965
|
+
|
966
|
+
Args:
|
967
|
+
func: The function being traced
|
968
|
+
span_type: Type of span to use for traced functions
|
969
|
+
|
970
|
+
Returns:
|
971
|
+
A tuple of (module, original_functions_dict) where original_functions_dict
|
972
|
+
contains the original functions that were replaced with traced versions.
|
973
|
+
"""
|
974
|
+
module = inspect.getmodule(func)
|
975
|
+
if not module:
|
976
|
+
return None, {}
|
977
|
+
|
978
|
+
# Save original functions
|
979
|
+
original_functions = {}
|
980
|
+
|
981
|
+
# Find all functions in the module
|
982
|
+
for name, obj in inspect.getmembers(module, inspect.isfunction):
|
983
|
+
# Skip already wrapped functions
|
984
|
+
if hasattr(obj, '_judgment_traced'):
|
985
|
+
continue
|
986
|
+
|
987
|
+
# Create a traced version of the function
|
988
|
+
# Always use default span type "span" for child functions
|
989
|
+
traced_func = _create_deep_tracing_wrapper(obj, self, "span")
|
990
|
+
|
991
|
+
# Mark the function as traced to avoid double wrapping
|
992
|
+
traced_func._judgment_traced = True
|
993
|
+
|
994
|
+
# Save the original function
|
995
|
+
original_functions[name] = obj
|
996
|
+
|
997
|
+
# Replace with traced version
|
998
|
+
setattr(module, name, traced_func)
|
999
|
+
|
1000
|
+
return module, original_functions
|
1001
|
+
|
950
1002
|
@contextmanager
|
951
1003
|
def trace(
|
952
1004
|
self,
|
@@ -992,14 +1044,8 @@ class Tracer:
|
|
992
1044
|
finally:
|
993
1045
|
# Reset the context variable
|
994
1046
|
current_trace_var.reset(token)
|
995
|
-
|
996
|
-
def
|
997
|
-
"""
|
998
|
-
Get the current trace context from contextvars
|
999
|
-
"""
|
1000
|
-
return current_trace_var.get()
|
1001
|
-
|
1002
|
-
def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
|
1047
|
+
|
1048
|
+
def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False, deep_tracing: bool = None):
|
1003
1049
|
"""
|
1004
1050
|
Decorator to trace function execution with detailed entry/exit information.
|
1005
1051
|
|
@@ -1009,20 +1055,37 @@ class Tracer:
|
|
1009
1055
|
span_type: Type of span (default "span")
|
1010
1056
|
project_name: Optional project name override
|
1011
1057
|
overwrite: Whether to overwrite existing traces
|
1058
|
+
deep_tracing: Whether to enable deep tracing for this function and all nested calls.
|
1059
|
+
If None, uses the tracer's default setting.
|
1012
1060
|
"""
|
1013
1061
|
# If monitoring is disabled, return the function as is
|
1014
1062
|
if not self.enable_monitoring:
|
1015
1063
|
return func if func else lambda f: f
|
1016
1064
|
|
1017
1065
|
if func is None:
|
1018
|
-
return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name,
|
1066
|
+
return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name,
|
1067
|
+
overwrite=overwrite, deep_tracing=deep_tracing)
|
1019
1068
|
|
1020
1069
|
# Use provided name or fall back to function name
|
1021
1070
|
span_name = name or func.__name__
|
1022
1071
|
|
1072
|
+
# Store custom attributes on the function object
|
1073
|
+
func._judgment_span_name = span_name
|
1074
|
+
func._judgment_span_type = span_type
|
1075
|
+
|
1076
|
+
# Use the provided deep_tracing value or fall back to the tracer's default
|
1077
|
+
use_deep_tracing = deep_tracing if deep_tracing is not None else self.deep_tracing
|
1078
|
+
|
1023
1079
|
if asyncio.iscoroutinefunction(func):
|
1024
1080
|
@functools.wraps(func)
|
1025
1081
|
async def async_wrapper(*args, **kwargs):
|
1082
|
+
# Check if we're already in a traced function
|
1083
|
+
if in_traced_function_var.get():
|
1084
|
+
return await func(*args, **kwargs)
|
1085
|
+
|
1086
|
+
# Set in_traced_function_var to True
|
1087
|
+
token = in_traced_function_var.set(True)
|
1088
|
+
|
1026
1089
|
# Get current trace from context
|
1027
1090
|
current_trace = current_trace_var.get()
|
1028
1091
|
|
@@ -1057,9 +1120,18 @@ class Tracer:
|
|
1057
1120
|
'kwargs': kwargs
|
1058
1121
|
})
|
1059
1122
|
|
1123
|
+
# If deep tracing is enabled, apply monkey patching
|
1124
|
+
if use_deep_tracing:
|
1125
|
+
module, original_functions = self._apply_deep_tracing(func, span_type)
|
1126
|
+
|
1060
1127
|
# Execute function
|
1061
1128
|
result = await func(*args, **kwargs)
|
1062
1129
|
|
1130
|
+
# Restore original functions if deep tracing was enabled
|
1131
|
+
if use_deep_tracing and module and 'original_functions' in locals():
|
1132
|
+
for name, obj in original_functions.items():
|
1133
|
+
setattr(module, name, obj)
|
1134
|
+
|
1063
1135
|
# Record output
|
1064
1136
|
span.record_output(result)
|
1065
1137
|
|
@@ -1069,29 +1141,52 @@ class Tracer:
|
|
1069
1141
|
finally:
|
1070
1142
|
# Reset trace context (span context resets automatically)
|
1071
1143
|
current_trace_var.reset(trace_token)
|
1144
|
+
# Reset in_traced_function_var
|
1145
|
+
in_traced_function_var.reset(token)
|
1072
1146
|
else:
|
1073
1147
|
# Already have a trace context, just create a span in it
|
1074
1148
|
# The span method handles current_span_var
|
1075
|
-
|
1076
|
-
|
1077
|
-
span
|
1078
|
-
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1086
|
-
|
1149
|
+
|
1150
|
+
try:
|
1151
|
+
with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
|
1152
|
+
# Record inputs
|
1153
|
+
span.record_input({
|
1154
|
+
'args': str(args),
|
1155
|
+
'kwargs': kwargs
|
1156
|
+
})
|
1157
|
+
|
1158
|
+
# If deep tracing is enabled, apply monkey patching
|
1159
|
+
if use_deep_tracing:
|
1160
|
+
module, original_functions = self._apply_deep_tracing(func, span_type)
|
1161
|
+
|
1162
|
+
# Execute function
|
1163
|
+
result = await func(*args, **kwargs)
|
1164
|
+
|
1165
|
+
# Restore original functions if deep tracing was enabled
|
1166
|
+
if use_deep_tracing and module and 'original_functions' in locals():
|
1167
|
+
for name, obj in original_functions.items():
|
1168
|
+
setattr(module, name, obj)
|
1169
|
+
|
1170
|
+
# Record output
|
1171
|
+
span.record_output(result)
|
1087
1172
|
|
1088
1173
|
return result
|
1089
|
-
|
1174
|
+
finally:
|
1175
|
+
# Reset in_traced_function_var
|
1176
|
+
in_traced_function_var.reset(token)
|
1177
|
+
|
1090
1178
|
return async_wrapper
|
1091
1179
|
else:
|
1092
|
-
# Non-async function implementation
|
1180
|
+
# Non-async function implementation with deep tracing
|
1093
1181
|
@functools.wraps(func)
|
1094
1182
|
def wrapper(*args, **kwargs):
|
1183
|
+
# Check if we're already in a traced function
|
1184
|
+
if in_traced_function_var.get():
|
1185
|
+
return func(*args, **kwargs)
|
1186
|
+
|
1187
|
+
# Set in_traced_function_var to True
|
1188
|
+
token = in_traced_function_var.set(True)
|
1189
|
+
|
1095
1190
|
# Get current trace from context
|
1096
1191
|
current_trace = current_trace_var.get()
|
1097
1192
|
|
@@ -1126,9 +1221,18 @@ class Tracer:
|
|
1126
1221
|
'kwargs': kwargs
|
1127
1222
|
})
|
1128
1223
|
|
1224
|
+
# If deep tracing is enabled, apply monkey patching
|
1225
|
+
if use_deep_tracing:
|
1226
|
+
module, original_functions = self._apply_deep_tracing(func, span_type)
|
1227
|
+
|
1129
1228
|
# Execute function
|
1130
1229
|
result = func(*args, **kwargs)
|
1131
1230
|
|
1231
|
+
# Restore original functions if deep tracing was enabled
|
1232
|
+
if use_deep_tracing and module and 'original_functions' in locals():
|
1233
|
+
for name, obj in original_functions.items():
|
1234
|
+
setattr(module, name, obj)
|
1235
|
+
|
1132
1236
|
# Record output
|
1133
1237
|
span.record_output(result)
|
1134
1238
|
|
@@ -1138,24 +1242,40 @@ class Tracer:
|
|
1138
1242
|
finally:
|
1139
1243
|
# Reset trace context (span context resets automatically)
|
1140
1244
|
current_trace_var.reset(trace_token)
|
1245
|
+
# Reset in_traced_function_var
|
1246
|
+
in_traced_function_var.reset(token)
|
1141
1247
|
else:
|
1142
1248
|
# Already have a trace context, just create a span in it
|
1143
1249
|
# The span method handles current_span_var
|
1144
|
-
|
1145
|
-
|
1146
|
-
span
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1250
|
+
|
1251
|
+
try:
|
1252
|
+
with current_trace.span(span_name, span_type=span_type) as span: # MODIFIED: Use span_name directly
|
1253
|
+
# Record inputs
|
1254
|
+
span.record_input({
|
1255
|
+
'args': str(args),
|
1256
|
+
'kwargs': kwargs
|
1257
|
+
})
|
1258
|
+
|
1259
|
+
# If deep tracing is enabled, apply monkey patching
|
1260
|
+
if use_deep_tracing:
|
1261
|
+
module, original_functions = self._apply_deep_tracing(func, span_type)
|
1262
|
+
|
1263
|
+
# Execute function
|
1264
|
+
result = func(*args, **kwargs)
|
1265
|
+
|
1266
|
+
# Restore original functions if deep tracing was enabled
|
1267
|
+
if use_deep_tracing and module and 'original_functions' in locals():
|
1268
|
+
for name, obj in original_functions.items():
|
1269
|
+
setattr(module, name, obj)
|
1270
|
+
|
1271
|
+
# Record output
|
1272
|
+
span.record_output(result)
|
1156
1273
|
|
1157
1274
|
return result
|
1158
|
-
|
1275
|
+
finally:
|
1276
|
+
# Reset in_traced_function_var
|
1277
|
+
in_traced_function_var.reset(token)
|
1278
|
+
|
1159
1279
|
return wrapper
|
1160
1280
|
|
1161
1281
|
def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
|
@@ -1206,7 +1326,7 @@ def wrap(client: Any) -> Any:
|
|
1206
1326
|
span_name, original_create = _get_client_config(client)
|
1207
1327
|
|
1208
1328
|
# Handle async clients differently than synchronous clients (need an async function for async clients)
|
1209
|
-
if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether))):
|
1329
|
+
if (isinstance(client, (AsyncOpenAI, AsyncAnthropic, AsyncTogether, genai.client.AsyncClient))):
|
1210
1330
|
async def traced_create(*args, **kwargs):
|
1211
1331
|
# Get the current trace from contextvars
|
1212
1332
|
current_trace = current_trace_var.get()
|
@@ -1265,6 +1385,8 @@ def wrap(client: Any) -> Any:
|
|
1265
1385
|
client.chat.completions.create = traced_create
|
1266
1386
|
elif isinstance(client, (Anthropic, AsyncAnthropic)):
|
1267
1387
|
client.messages.create = traced_create
|
1388
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1389
|
+
client.models.generate_content = traced_create
|
1268
1390
|
|
1269
1391
|
return client
|
1270
1392
|
|
@@ -1290,6 +1412,8 @@ def _get_client_config(client: ApiClient) -> tuple[str, callable]:
|
|
1290
1412
|
return "TOGETHER_API_CALL", client.chat.completions.create
|
1291
1413
|
elif isinstance(client, (Anthropic, AsyncAnthropic)):
|
1292
1414
|
return "ANTHROPIC_API_CALL", client.messages.create
|
1415
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1416
|
+
return "GOOGLE_API_CALL", client.models.generate_content
|
1293
1417
|
raise ValueError(f"Unsupported client type: {type(client)}")
|
1294
1418
|
|
1295
1419
|
def _format_input_data(client: ApiClient, **kwargs) -> dict:
|
@@ -1303,6 +1427,11 @@ def _format_input_data(client: ApiClient, **kwargs) -> dict:
|
|
1303
1427
|
"model": kwargs.get("model"),
|
1304
1428
|
"messages": kwargs.get("messages"),
|
1305
1429
|
}
|
1430
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1431
|
+
return {
|
1432
|
+
"model": kwargs.get("model"),
|
1433
|
+
"contents": kwargs.get("contents")
|
1434
|
+
}
|
1306
1435
|
# Anthropic requires additional max_tokens parameter
|
1307
1436
|
return {
|
1308
1437
|
"model": kwargs.get("model"),
|
@@ -1330,6 +1459,15 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
|
|
1330
1459
|
"total_tokens": response.usage.total_tokens
|
1331
1460
|
}
|
1332
1461
|
}
|
1462
|
+
elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
|
1463
|
+
return {
|
1464
|
+
"content": response.candidates[0].content.parts[0].text,
|
1465
|
+
"usage": {
|
1466
|
+
"prompt_tokens": response.usage_metadata.prompt_token_count,
|
1467
|
+
"completion_tokens": response.usage_metadata.candidates_token_count,
|
1468
|
+
"total_tokens": response.usage_metadata.total_token_count
|
1469
|
+
}
|
1470
|
+
}
|
1333
1471
|
# Anthropic has a different response structure
|
1334
1472
|
return {
|
1335
1473
|
"content": response.content[0].text,
|
@@ -1340,29 +1478,88 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
|
|
1340
1478
|
}
|
1341
1479
|
}
|
1342
1480
|
|
1343
|
-
# Add a
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1348
|
-
|
1349
|
-
|
1350
|
-
|
1351
|
-
|
1352
|
-
|
1353
|
-
|
1354
|
-
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1358
|
-
|
1359
|
-
|
1360
|
-
|
1361
|
-
#
|
1362
|
-
|
1363
|
-
|
1364
|
-
#
|
1365
|
-
|
1481
|
+
# Add a new function for deep tracing at the module level
|
1482
|
+
def _create_deep_tracing_wrapper(func, tracer, span_type="span"):
|
1483
|
+
"""
|
1484
|
+
Creates a wrapper for a function that automatically traces it when called within a traced function.
|
1485
|
+
This enables deep tracing without requiring explicit @observe decorators on every function.
|
1486
|
+
|
1487
|
+
Args:
|
1488
|
+
func: The function to wrap
|
1489
|
+
tracer: The Tracer instance
|
1490
|
+
span_type: Type of span (default "span")
|
1491
|
+
|
1492
|
+
Returns:
|
1493
|
+
A wrapped function that will be traced when called
|
1494
|
+
"""
|
1495
|
+
# Skip wrapping if the function is not callable or is a built-in
|
1496
|
+
if not callable(func) or isinstance(func, type) or func.__module__ == 'builtins':
|
1497
|
+
return func
|
1498
|
+
|
1499
|
+
# Get function name for the span - check for custom name set by @observe
|
1500
|
+
func_name = getattr(func, '_judgment_span_name', func.__name__)
|
1501
|
+
|
1502
|
+
# Check for custom span_type set by @observe
|
1503
|
+
func_span_type = getattr(func, '_judgment_span_type', "span")
|
1504
|
+
|
1505
|
+
# Store original function to prevent losing reference
|
1506
|
+
original_func = func
|
1507
|
+
|
1508
|
+
# Create appropriate wrapper based on whether the function is async or not
|
1509
|
+
if asyncio.iscoroutinefunction(func):
|
1510
|
+
@functools.wraps(func)
|
1511
|
+
async def async_deep_wrapper(*args, **kwargs):
|
1512
|
+
# Get current trace from context
|
1513
|
+
current_trace = current_trace_var.get()
|
1514
|
+
|
1515
|
+
# If no trace context, just call the function
|
1516
|
+
if not current_trace:
|
1517
|
+
return await original_func(*args, **kwargs)
|
1518
|
+
|
1519
|
+
# Create a span for this function call - use custom span_type if available
|
1520
|
+
with current_trace.span(func_name, span_type=func_span_type) as span:
|
1521
|
+
# Record inputs
|
1522
|
+
span.record_input({
|
1523
|
+
'args': str(args),
|
1524
|
+
'kwargs': kwargs
|
1525
|
+
})
|
1526
|
+
|
1527
|
+
# Execute function
|
1528
|
+
result = await original_func(*args, **kwargs)
|
1529
|
+
|
1530
|
+
# Record output
|
1531
|
+
span.record_output(result)
|
1532
|
+
|
1533
|
+
return result
|
1534
|
+
|
1535
|
+
return async_deep_wrapper
|
1536
|
+
else:
|
1537
|
+
@functools.wraps(func)
|
1538
|
+
def deep_wrapper(*args, **kwargs):
|
1539
|
+
# Get current trace from context
|
1540
|
+
current_trace = current_trace_var.get()
|
1541
|
+
|
1542
|
+
# If no trace context, just call the function
|
1543
|
+
if not current_trace:
|
1544
|
+
return original_func(*args, **kwargs)
|
1545
|
+
|
1546
|
+
# Create a span for this function call - use custom span_type if available
|
1547
|
+
with current_trace.span(func_name, span_type=func_span_type) as span:
|
1548
|
+
# Record inputs
|
1549
|
+
span.record_input({
|
1550
|
+
'args': str(args),
|
1551
|
+
'kwargs': kwargs
|
1552
|
+
})
|
1553
|
+
|
1554
|
+
# Execute function
|
1555
|
+
result = original_func(*args, **kwargs)
|
1556
|
+
|
1557
|
+
# Record output
|
1558
|
+
span.record_output(result)
|
1559
|
+
|
1560
|
+
return result
|
1561
|
+
|
1562
|
+
return deep_wrapper
|
1366
1563
|
|
1367
1564
|
# Add the new TraceThreadPoolExecutor class
|
1368
1565
|
class TraceThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
|
judgeval/constants.py
CHANGED
@@ -43,7 +43,7 @@ JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
|
|
43
43
|
JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
|
44
44
|
JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
|
45
45
|
JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
|
46
|
-
JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/
|
46
|
+
JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
|
47
47
|
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
|
48
48
|
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
|
49
49
|
JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
|
judgeval/data/datasets/dataset.py
CHANGED
@@ -7,12 +7,13 @@ import yaml
|
|
7
7
|
from dataclasses import dataclass, field
|
8
8
|
from typing import List, Union, Literal
|
9
9
|
|
10
|
-
from judgeval.data import Example
|
10
|
+
from judgeval.data import Example, Sequence
|
11
11
|
from judgeval.common.logger import debug, error, warning, info
|
12
12
|
|
13
13
|
@dataclass
|
14
14
|
class EvalDataset:
|
15
15
|
examples: List[Example]
|
16
|
+
sequences: List[Sequence]
|
16
17
|
_alias: Union[str, None] = field(default=None)
|
17
18
|
_id: Union[str, None] = field(default=None)
|
18
19
|
judgment_api_key: str = field(default="")
|
@@ -21,11 +22,13 @@ class EvalDataset:
|
|
21
22
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
|
22
23
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
|
23
24
|
examples: List[Example] = [],
|
25
|
+
sequences: List[Sequence] = []
|
24
26
|
):
|
25
27
|
debug(f"Initializing EvalDataset with {len(examples)} examples")
|
26
28
|
if not judgment_api_key:
|
27
29
|
warning("No judgment_api_key provided")
|
28
30
|
self.examples = examples
|
31
|
+
self.sequences = sequences
|
29
32
|
self._alias = None
|
30
33
|
self._id = None
|
31
34
|
self.judgment_api_key = judgment_api_key
|
@@ -309,6 +312,7 @@ class EvalDataset:
|
|
309
312
|
return (
|
310
313
|
f"{self.__class__.__name__}("
|
311
314
|
f"examples={self.examples}, "
|
315
|
+
f"sequences={self.sequences}, "
|
312
316
|
f"_alias={self._alias}, "
|
313
317
|
f"_id={self._id}"
|
314
318
|
f")"
|
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
|
|
13
13
|
JUDGMENT_DATASETS_INSERT_API_URL,
|
14
14
|
JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
|
15
15
|
)
|
16
|
-
from judgeval.data import Example
|
16
|
+
from judgeval.data import Example, Sequence
|
17
17
|
from judgeval.data.datasets import EvalDataset
|
18
18
|
|
19
19
|
|
@@ -201,8 +201,8 @@ class EvalDatasetClient:
|
|
201
201
|
|
202
202
|
info(f"Successfully pulled dataset with alias '{alias}'")
|
203
203
|
payload = response.json()
|
204
|
-
|
205
204
|
dataset.examples = [Example(**e) for e in payload.get("examples", [])]
|
205
|
+
dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
|
206
206
|
dataset._alias = payload.get("alias")
|
207
207
|
dataset._id = payload.get("id")
|
208
208
|
progress.update(
|
judgeval/data/sequence.py
CHANGED
@@ -16,6 +16,9 @@ class Sequence(BaseModel):
|
|
16
16
|
scorers: Optional[Any] = None
|
17
17
|
parent_sequence_id: Optional[str] = None
|
18
18
|
sequence_order: Optional[int] = 0
|
19
|
+
root_sequence_id: Optional[str] = None
|
20
|
+
inputs: Optional[str] = None
|
21
|
+
output: Optional[str] = None
|
19
22
|
|
20
23
|
@field_validator("scorers")
|
21
24
|
def validate_scorer(cls, v):
|
@@ -30,28 +33,21 @@ class Sequence(BaseModel):
|
|
30
33
|
raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
|
31
34
|
return loaded_scorers
|
32
35
|
|
33
|
-
@model_validator(mode=
|
34
|
-
def
|
35
|
-
"""Recursively set
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
# Recurse into deeper nested sequences
|
40
|
-
item.set_parent_sequence_ids()
|
41
|
-
return self
|
36
|
+
@model_validator(mode="after")
|
37
|
+
def populate_sequence_metadata(self) -> "Sequence":
|
38
|
+
"""Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
|
39
|
+
# If root_sequence_id isn't already set, assign it to self
|
40
|
+
if self.root_sequence_id is None:
|
41
|
+
self.root_sequence_id = self.sequence_id
|
42
42
|
|
43
|
-
@model_validator(mode='after')
|
44
|
-
def set_parent_and_order(self) -> "Sequence":
|
45
|
-
"""Set parent_sequence_id and sequence_order for all items."""
|
46
43
|
for idx, item in enumerate(self.items):
|
47
|
-
# Set sequence_order for both Example and Sequence objects
|
48
44
|
item.sequence_order = idx
|
49
|
-
|
50
45
|
if isinstance(item, Sequence):
|
51
46
|
item.parent_sequence_id = self.sequence_id
|
52
|
-
item.
|
47
|
+
item.root_sequence_id = self.root_sequence_id
|
48
|
+
item.populate_sequence_metadata()
|
53
49
|
return self
|
54
|
-
|
50
|
+
|
55
51
|
class Config:
|
56
52
|
arbitrary_types_allowed = True
|
57
53
|
|
judgeval/data/sequence_run.py
CHANGED
@@ -21,6 +21,7 @@ class SequenceRun(BaseModel):
|
|
21
21
|
metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
|
22
22
|
judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
|
23
23
|
rules (Optional[List[Rule]]): Rules to evaluate against scoring results
|
24
|
+
append (Optional[bool]): Whether to append to existing evaluation results
|
24
25
|
"""
|
25
26
|
|
26
27
|
# The user will specify whether they want log_results when they call run_eval
|
@@ -33,6 +34,7 @@ class SequenceRun(BaseModel):
|
|
33
34
|
aggregator: Optional[str] = None
|
34
35
|
metadata: Optional[Dict[str, Any]] = None
|
35
36
|
trace_span_id: Optional[str] = None
|
37
|
+
append: Optional[bool] = False
|
36
38
|
# API Key will be "" until user calls client.run_eval(), then API Key will be set
|
37
39
|
judgment_api_key: Optional[str] = ""
|
38
40
|
override: Optional[bool] = False
|
judgeval/judgment_client.py
CHANGED
@@ -93,16 +93,47 @@ class JudgmentClient(metaclass=SingletonMeta):
|
|
93
93
|
self,
|
94
94
|
sequences: List[Sequence],
|
95
95
|
model: Union[str, List[str], JudgevalJudge],
|
96
|
+
scorers: List[Union[ScorerWrapper, JudgevalScorer]],
|
96
97
|
aggregator: Optional[str] = None,
|
97
98
|
project_name: str = "default_project",
|
98
99
|
eval_run_name: str = "default_eval_sequence",
|
99
100
|
use_judgment: bool = True,
|
100
101
|
log_results: bool = True,
|
102
|
+
append: bool = False,
|
101
103
|
override: bool = False,
|
102
104
|
ignore_errors: bool = True,
|
103
105
|
rules: Optional[List[Rule]] = None
|
104
106
|
) -> List[ScoringResult]:
|
105
107
|
try:
|
108
|
+
loaded_scorers = []
|
109
|
+
for scorer in scorers:
|
110
|
+
try:
|
111
|
+
if isinstance(scorer, ScorerWrapper):
|
112
|
+
loaded_scorers.append(scorer.load_implementation())
|
113
|
+
else:
|
114
|
+
loaded_scorers.append(scorer)
|
115
|
+
except Exception as e:
|
116
|
+
raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
|
117
|
+
|
118
|
+
def get_all_sequences(root: Sequence) -> List[Sequence]:
|
119
|
+
all_sequences = [root]
|
120
|
+
|
121
|
+
for item in root.items:
|
122
|
+
if isinstance(item, Sequence):
|
123
|
+
all_sequences.extend(get_all_sequences(item))
|
124
|
+
|
125
|
+
return all_sequences
|
126
|
+
|
127
|
+
def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
|
128
|
+
flattened = []
|
129
|
+
for seq in sequences:
|
130
|
+
flattened.extend(get_all_sequences(seq))
|
131
|
+
return flattened
|
132
|
+
|
133
|
+
flattened_sequences = flatten_sequence_list(sequences)
|
134
|
+
for sequence in flattened_sequences:
|
135
|
+
sequence.scorers = loaded_scorers
|
136
|
+
|
106
137
|
if rules:
|
107
138
|
loaded_rules = []
|
108
139
|
for rule in rules:
|
@@ -134,10 +165,10 @@ class JudgmentClient(metaclass=SingletonMeta):
|
|
134
165
|
model=model,
|
135
166
|
aggregator=aggregator,
|
136
167
|
log_results=log_results,
|
168
|
+
append=append,
|
137
169
|
judgment_api_key=self.judgment_api_key,
|
138
170
|
organization_id=self.organization_id
|
139
171
|
)
|
140
|
-
|
141
172
|
return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
|
142
173
|
except ValueError as e:
|
143
174
|
raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
|
@@ -244,98 +275,6 @@ class JudgmentClient(metaclass=SingletonMeta):
|
|
244
275
|
raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
|
245
276
|
except Exception as e:
|
246
277
|
raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
|
247
|
-
|
248
|
-
def evaluate_dataset(
|
249
|
-
self,
|
250
|
-
dataset: EvalDataset,
|
251
|
-
scorers: List[Union[ScorerWrapper, JudgevalScorer]],
|
252
|
-
model: Union[str, List[str], JudgevalJudge],
|
253
|
-
aggregator: Optional[str] = None,
|
254
|
-
metadata: Optional[Dict[str, Any]] = None,
|
255
|
-
project_name: str = "",
|
256
|
-
eval_run_name: str = "",
|
257
|
-
log_results: bool = True,
|
258
|
-
use_judgment: bool = True,
|
259
|
-
rules: Optional[List[Rule]] = None
|
260
|
-
) -> List[ScoringResult]:
|
261
|
-
"""
|
262
|
-
Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
|
263
|
-
|
264
|
-
Args:
|
265
|
-
dataset (EvalDataset): The dataset containing examples to evaluate
|
266
|
-
scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
|
267
|
-
model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
|
268
|
-
aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
|
269
|
-
metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
|
270
|
-
project_name (str): The name of the project the evaluation results belong to
|
271
|
-
eval_run_name (str): A name for this evaluation run
|
272
|
-
log_results (bool): Whether to log the results to the Judgment API
|
273
|
-
use_judgment (bool): Whether to use Judgment API for evaluation
|
274
|
-
rules (Optional[List[Rule]]): Rules to evaluate against scoring results
|
275
|
-
|
276
|
-
Returns:
|
277
|
-
List[ScoringResult]: The results of the evaluation
|
278
|
-
"""
|
279
|
-
try:
|
280
|
-
# Load appropriate implementations for all scorers
|
281
|
-
loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
|
282
|
-
for scorer in scorers:
|
283
|
-
try:
|
284
|
-
if isinstance(scorer, ScorerWrapper):
|
285
|
-
loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
|
286
|
-
else:
|
287
|
-
loaded_scorers.append(scorer)
|
288
|
-
except Exception as e:
|
289
|
-
raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
|
290
|
-
|
291
|
-
# Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
|
292
|
-
if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
|
293
|
-
raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
|
294
|
-
|
295
|
-
# Convert ScorerWrapper in rules to their implementations
|
296
|
-
loaded_rules = None
|
297
|
-
if rules:
|
298
|
-
loaded_rules = []
|
299
|
-
for rule in rules:
|
300
|
-
try:
|
301
|
-
processed_conditions = []
|
302
|
-
for condition in rule.conditions:
|
303
|
-
# Convert metric if it's a ScorerWrapper
|
304
|
-
if isinstance(condition.metric, ScorerWrapper):
|
305
|
-
try:
|
306
|
-
condition_copy = condition.model_copy()
|
307
|
-
condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
|
308
|
-
processed_conditions.append(condition_copy)
|
309
|
-
except Exception as e:
|
310
|
-
raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
|
311
|
-
else:
|
312
|
-
processed_conditions.append(condition)
|
313
|
-
|
314
|
-
# Create new rule with processed conditions
|
315
|
-
new_rule = rule.model_copy()
|
316
|
-
new_rule.conditions = processed_conditions
|
317
|
-
loaded_rules.append(new_rule)
|
318
|
-
except Exception as e:
|
319
|
-
raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
|
320
|
-
|
321
|
-
evaluation_run = EvaluationRun(
|
322
|
-
log_results=log_results,
|
323
|
-
project_name=project_name,
|
324
|
-
eval_name=eval_run_name,
|
325
|
-
examples=dataset.examples,
|
326
|
-
scorers=loaded_scorers,
|
327
|
-
model=model,
|
328
|
-
aggregator=aggregator,
|
329
|
-
metadata=metadata,
|
330
|
-
judgment_api_key=self.judgment_api_key,
|
331
|
-
rules=loaded_rules,
|
332
|
-
organization_id=self.organization_id
|
333
|
-
)
|
334
|
-
return run_eval(evaluation_run)
|
335
|
-
except ValueError as e:
|
336
|
-
raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
|
337
|
-
except Exception as e:
|
338
|
-
raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
|
339
278
|
|
340
279
|
def create_dataset(self) -> EvalDataset:
|
341
280
|
return self.eval_dataset_client.create_dataset()
|
judgeval/run_evaluation.py
CHANGED
@@ -336,7 +336,7 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
|
|
336
336
|
|
337
337
|
def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
|
338
338
|
# Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
|
339
|
-
if not override and sequence_run.log_results:
|
339
|
+
if not override and sequence_run.log_results and not sequence_run.append:
|
340
340
|
check_eval_run_name_exists(
|
341
341
|
sequence_run.eval_name,
|
342
342
|
sequence_run.project_name,
|
@@ -0,0 +1,22 @@
|
|
1
|
+
import importlib.metadata
|
2
|
+
import requests
|
3
|
+
import threading
|
4
|
+
|
5
|
+
def _release_tuple(version):
    """Return the leading numeric release segment of *version* as a tuple.

    ``"0.0.32rc1"`` becomes ``(0, 0, 32)``.  Trailing zeros are dropped so
    that ``"1.0"`` and ``"1.0.0"`` yield the same tuple.  Returns ``None``
    when the string has an epoch (``"1!2.0"``) or no numeric prefix, so the
    caller can fall back to a plain string comparison.
    """
    import re  # local import: only needed by this helper

    if "!" in version:
        return None
    match = re.match(r"\s*v?(\d+(?:\.\d+)*)", version)
    if match is None:
        return None
    parts = [int(piece) for piece in match.group(1).split(".")]
    while parts and parts[-1] == 0:
        parts.pop()
    return tuple(parts)


def _update_available(current_version, latest_version):
    """Return True when *latest_version* should be advertised as an upgrade.

    Identical strings never trigger a notice.  When both versions have a
    numeric release prefix, the notice is suppressed only if the installed
    release is strictly ahead of the published one (e.g. a local or
    unpublished build).  Every other difference is reported.
    """
    if current_version == latest_version:
        return False
    current = _release_tuple(current_version)
    latest = _release_tuple(latest_version)
    if current is None or latest is None:
        # Unparseable version string: report any difference.
        return True
    return latest >= current


def check_latest_version(package_name: str = "judgeval"):
    """Start a background check for a newer release of *package_name* on PyPI.

    The check runs in a daemon thread so it never blocks the caller and never
    keeps the interpreter alive.  It reads the installed version via
    ``importlib.metadata``, fetches ``https://pypi.org/pypi/<package>/json``
    with a 2 second timeout, and prints an upgrade hint to stdout when the
    published version should be advertised (see ``_update_available``).

    Args:
        package_name: Distribution name to look up, both locally and on PyPI.

    Returns:
        None.  The function returns as soon as the thread has been started.
    """
    def _check():
        try:
            current_version = importlib.metadata.version(package_name)
            response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
            latest_version = response.json()["info"]["version"]

            # Do not nag when the installed build is ahead of what PyPI
            # serves; a plain inequality test would report that as an update.
            if _update_available(current_version, latest_version):
                print(
                    f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
                    f"but the latest version is '{latest_version}'. While this version is still supported, "
                    f"we recommend upgrading to avoid potential issues or missing features: "
                    f"`pip install --upgrade {package_name}`"
                )
        except Exception:
            # Deliberately best-effort: a missing distribution, no network,
            # a timeout or an unexpected payload must not affect the caller.
            pass

    threading.Thread(target=_check, daemon=True).start()
|
@@ -1,25 +1,26 @@
|
|
1
|
-
judgeval/__init__.py,sha256=
|
1
|
+
judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
|
2
2
|
judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
|
3
|
-
judgeval/constants.py,sha256=
|
3
|
+
judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
|
4
4
|
judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
|
5
|
-
judgeval/judgment_client.py,sha256=
|
5
|
+
judgeval/judgment_client.py,sha256=k0q2s5A0RkhF9ElD9o-KWN10H36t3Of2PrvNF-silf8,26141
|
6
6
|
judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
|
7
|
-
judgeval/run_evaluation.py,sha256=
|
7
|
+
judgeval/run_evaluation.py,sha256=hnEY8QckEviXYNJutf-6tLFq2DWCzqWV1EVyPvrVXyA,28512
|
8
|
+
judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
|
8
9
|
judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
|
9
10
|
judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
|
10
11
|
judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
|
11
|
-
judgeval/common/tracer.py,sha256=
|
12
|
+
judgeval/common/tracer.py,sha256=owRRfIZXPUOVCCn0macygnf18mcp8am1eULGnZXD0Kk,68876
|
12
13
|
judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
|
13
14
|
judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
|
14
15
|
judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
|
15
16
|
judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
|
16
17
|
judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
|
17
18
|
judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
|
18
|
-
judgeval/data/sequence.py,sha256=
|
19
|
-
judgeval/data/sequence_run.py,sha256=
|
19
|
+
judgeval/data/sequence.py,sha256=Fkk2HJGnPboH-Fvwgxub_ryG0eUXa3cbsj7ZD0qkeBo,2204
|
20
|
+
judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
|
20
21
|
judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
|
21
|
-
judgeval/data/datasets/dataset.py,sha256=
|
22
|
-
judgeval/data/datasets/eval_dataset_client.py,sha256=
|
22
|
+
judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
|
23
|
+
judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
|
23
24
|
judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
|
24
25
|
judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
|
25
26
|
judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
|
@@ -90,7 +91,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
|
|
90
91
|
judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
|
91
92
|
judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
|
92
93
|
judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
|
93
|
-
judgeval-0.0.
|
94
|
-
judgeval-0.0.
|
95
|
-
judgeval-0.0.
|
96
|
-
judgeval-0.0.
|
94
|
+
judgeval-0.0.32.dist-info/METADATA,sha256=RJzqlHJwfYiOXEcyEEO5WQBM0DC1zQDuoN-Plix6U38,5418
|
95
|
+
judgeval-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
96
|
+
judgeval-0.0.32.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
97
|
+
judgeval-0.0.32.dist-info/RECORD,,
|
File without changes
|
File without changes
|