judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/scripts/openapi_transform.py
ADDED
@@ -0,0 +1,123 @@
+import json
+import sys
+from typing import Any, Dict, Generator, List
+import requests
+
+spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"
+
+if spec_file.startswith("http"):
+    r = requests.get(spec_file)
+    r.raise_for_status()
+    SPEC = r.json()
+else:
+    with open(spec_file, "r") as f:
+        SPEC = json.load(f)
+
+JUDGEVAL_PATHS: List[str] = [
+    "/log_eval_results/",
+]
+
+
+def resolve_ref(ref: str) -> str:
+    assert ref.startswith("#/components/schemas/"), (
+        "Reference must start with #/components/schemas/"
+    )
+    return ref.replace("#/components/schemas/", "")
+
+
+def walk(obj: Any) -> Generator[Any, None, None]:
+    yield obj
+    if isinstance(obj, list):
+        for item in obj:
+            yield from walk(item)
+    elif isinstance(obj, dict):
+        for value in obj.values():
+            yield from walk(value)
+
+
+def get_referenced_schemas(obj: Any) -> Generator[str, None, None]:
+    for value in walk(obj):
+        if isinstance(value, dict) and "$ref" in value:
+            ref = value["$ref"]
+            resolved = resolve_ref(ref)
+            assert isinstance(ref, str), "Reference must be a string"
+            # Strip the _JudgmentType suffix if it exists to get the original schema name
+            if resolved.endswith("_JudgmentType"):
+                resolved = resolved[: -len("_JudgmentType")]
+            yield resolved
+
+
+def transform_schema_refs(obj: Any) -> Any:
+    """Transform all $ref values in a schema to use the _JudgmentType suffix"""
+    if isinstance(obj, dict):
+        result = {}
+        for key, value in obj.items():
+            if (
+                key == "$ref"
+                and isinstance(value, str)
+                and value.startswith("#/components/schemas/")
+            ):
+                # Update the reference to use the suffixed name
+                original_name = resolve_ref(value)
+                suffixed_name = f"{original_name}_JudgmentType"
+                result[key] = f"#/components/schemas/{suffixed_name}"
+            else:
+                result[key] = transform_schema_refs(value)
+        return result
+    elif isinstance(obj, list):
+        return [transform_schema_refs(item) for item in obj]
+    else:
+        return obj
+
+
+filtered_paths = {
+    path: spec_data
+    for path, spec_data in SPEC["paths"].items()
+    if path in JUDGEVAL_PATHS
+}
+
+
+def filter_schemas() -> Dict[str, Any]:
+    result: Dict[str, Any] = {}
+    processed_original_names: set[str] = set()
+    schemas_to_scan: Any = filtered_paths
+
+    while True:
+        to_commit: Dict[str, Any] = {}
+        for original_schema_name in get_referenced_schemas(schemas_to_scan):
+            if original_schema_name in processed_original_names:
+                continue
+
+            assert original_schema_name in SPEC["components"]["schemas"], (
+                f"Schema {original_schema_name} not found in components.schemas"
+            )
+            # Transform the schema to update any internal references
+            original_schema = SPEC["components"]["schemas"][original_schema_name]
+            transformed_schema = transform_schema_refs(original_schema)
+            suffixed_name = f"{original_schema_name}_JudgmentType"
+            to_commit[suffixed_name] = transformed_schema
+            processed_original_names.add(original_schema_name)
+
+        if not to_commit:
+            break
+
+        result.update(to_commit)
+        schemas_to_scan = to_commit
+
+    return result
+
+
+# Transform the filtered paths to update schema references
+transformed_paths = transform_schema_refs(filtered_paths)
+
+spec = {
+    "openapi": SPEC["openapi"],
+    "info": SPEC["info"],
+    "paths": transformed_paths,
+    "components": {
+        **SPEC["components"],
+        "schemas": filter_schemas(),
+    },
+}
+
+print(json.dumps(spec, indent=4))
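The script above is new in 0.0.54: it reads an OpenAPI spec from a URL or local path passed as the first argument (defaulting to http://localhost:8000/openapi.json), keeps only the paths listed in JUDGEVAL_PATHS, rewrites every referenced schema to a `_JudgmentType`-suffixed copy, and prints the filtered spec to stdout. A minimal standalone sketch of the `$ref` rewriting it performs, using a made-up schema name purely for illustration:

# Toy illustration of the $ref rewriting done by transform_schema_refs above.
# "ScoringResult" is an illustrative schema name, not necessarily one from the real spec.
fragment = {
    "requestBody": {
        "content": {
            "application/json": {
                "schema": {"$ref": "#/components/schemas/ScoringResult"}
            }
        }
    }
}

def suffix_refs(obj):
    if isinstance(obj, dict):
        return {
            key: (value + "_JudgmentType"
                  if key == "$ref"
                  and isinstance(value, str)
                  and value.startswith("#/components/schemas/")
                  else suffix_refs(value))
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [suffix_refs(item) for item in obj]
    return obj

print(suffix_refs(fragment)["requestBody"]["content"]["application/json"]["schema"]["$ref"])
# -> #/components/schemas/ScoringResult_JudgmentType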
judgeval/data/tool.py
CHANGED
@@ -1,56 +1,5 @@
-from
-from typing import Dict, Any, Optional, List
-import warnings
+from judgeval.data.judgment_types import ToolJudgmentType
 
 
-class Tool(
-
-    parameters: Optional[Dict[str, Any]] = None
-    agent_name: Optional[str] = None
-    result_dependencies: Optional[List[Dict[str, Any]]] = None
-    action_dependencies: Optional[List[Dict[str, Any]]] = None
-    require_all: Optional[bool] = None
-
-    @field_validator("tool_name")
-    def validate_tool_name(cls, v):
-        if not v:
-            warnings.warn("Tool name is empty or None", UserWarning)
-        return v
-
-    @field_validator("parameters")
-    def validate_parameters(cls, v):
-        if v is not None and not isinstance(v, dict):
-            warnings.warn(
-                f"Parameters should be a dictionary, got {type(v)}", UserWarning
-            )
-        return v
-
-    @field_validator("agent_name")
-    def validate_agent_name(cls, v):
-        if v is not None and not isinstance(v, str):
-            warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
-        return v
-
-    @field_validator("result_dependencies")
-    def validate_result_dependencies(cls, v):
-        if v is not None and not isinstance(v, list):
-            warnings.warn(
-                f"Result dependencies should be a list, got {type(v)}", UserWarning
-            )
-        return v
-
-    @field_validator("action_dependencies")
-    def validate_action_dependencies(cls, v):
-        if v is not None and not isinstance(v, list):
-            warnings.warn(
-                f"Action dependencies should be a list, got {type(v)}", UserWarning
-            )
-        return v
-
-    @field_validator("require_all")
-    def validate_require_all(cls, v):
-        if v is not None and not isinstance(v, bool):
-            warnings.warn(
-                f"Require all should be a boolean, got {type(v)}", UserWarning
-            )
-        return v
+class Tool(ToolJudgmentType):
+    pass
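The hand-written pydantic model and its warning-only field validators are replaced by a thin subclass of the generated ToolJudgmentType from judgeval/data/judgment_types.py. A hedged usage sketch, assuming the generated type keeps the old field names such as tool_name and parameters (not verified here):

# Sketch only: assumes ToolJudgmentType still exposes tool_name / parameters fields.
from judgeval.data.tool import Tool

tool = Tool(tool_name="search_docs", parameters={"query": "retry policy"})
print(tool.model_dump())
# Field validation is now whatever the generated base class enforces;
# the old warning-only validators are gone.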
judgeval/data/trace.py
CHANGED
@@ -1,44 +1,21 @@
-from
-from typing import Optional, Dict, Any, List
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.tool import Tool
+from typing import Any
 import json
 import sys
+import threading
 from datetime import datetime, timezone
+from judgeval.data.judgment_types import (
+    TraceUsageJudgmentType,
+    TraceSpanJudgmentType,
+    TraceJudgmentType,
+)
+from pydantic import BaseModel
 
 
-class TraceUsage(
-
-    completion_tokens: Optional[int] = None
-    total_tokens: Optional[int] = None
-    prompt_tokens_cost_usd: Optional[float] = None
-    completion_tokens_cost_usd: Optional[float] = None
-    total_cost_usd: Optional[float] = None
-    model_name: Optional[str] = None
-
-
-class TraceSpan(BaseModel):
-    span_id: str
-    trace_id: str
-    function: str
-    depth: int
-    created_at: Optional[Any] = None
-    parent_span_id: Optional[str] = None
-    span_type: Optional[str] = "span"
-    inputs: Optional[Dict[str, Any]] = None
-    error: Optional[Dict[str, Any]] = None
-    output: Optional[Any] = None
-    usage: Optional[TraceUsage] = None
-    duration: Optional[float] = None
-    annotation: Optional[List[Dict[str, Any]]] = None
-    evaluation_runs: Optional[List[EvaluationRun]] = []
-    expected_tools: Optional[List[Tool]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    has_evaluation: Optional[bool] = False
-    agent_name: Optional[str] = None
-    state_before: Optional[Dict[str, Any]] = None
-    state_after: Optional[Dict[str, Any]] = None
+class TraceUsage(TraceUsageJudgmentType):
+    pass
 
+
+class TraceSpan(TraceSpanJudgmentType):
     def model_dump(self, **kwargs):
         return {
             "span_id": self.span_id,
@@ -50,9 +27,6 @@ class TraceSpan(BaseModel):
             "inputs": self._serialize_value(self.inputs),
             "output": self._serialize_value(self.output),
             "error": self._serialize_value(self.error),
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
-            if self.evaluation_runs
-            else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
@@ -63,8 +37,24 @@ class TraceSpan(BaseModel):
             "state_before": self.state_before,
             "state_after": self.state_after,
             "additional_metadata": self._serialize_value(self.additional_metadata),
+            "update_id": self.update_id,
         }
 
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Initialize thread lock for thread-safe update_id increment
+        self._update_id_lock = threading.Lock()
+
+    def increment_update_id(self) -> int:
+        """
+        Thread-safe method to increment the update_id counter.
+        Returns:
+            int: The new update_id value after incrementing
+        """
+        with self._update_id_lock:
+            self.update_id += 1
+            return self.update_id
+
     def print_span(self):
         """Print the span with proper formatting and parent relationship information."""
         indent = " " * self.depth
@@ -94,6 +84,7 @@ class TraceSpan(BaseModel):
                 return repr(output)
             except (TypeError, OverflowError, ValueError):
                 pass
+
            return None
 
     def _serialize_value(self, value: Any) -> Any:
@@ -140,15 +131,5 @@ class TraceSpan(BaseModel):
            return {"error": "Unable to serialize"}
 
 
-class Trace(
-
-    name: str
-    created_at: str
-    duration: float
-    trace_spans: List[TraceSpan]
-    overwrite: bool = False
-    offline_mode: bool = False
-    rules: Dict[str, Any] = Field(default_factory=dict)
-    has_notification: Optional[bool] = False
-    customer_id: Optional[str] = None
-    tags: List[str] = Field(default_factory=list)
+class Trace(TraceJudgmentType):
+    pass
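TraceSpan now carries an update_id counter plus a private threading.Lock, and mutations are expected to go through increment_update_id() (the langgraph handler below calls it after each span modification). A minimal standalone sketch of the same lock-guarded counter pattern, independent of the judgeval classes:

import threading

class SpanLike:
    """Minimal stand-in for the lock-guarded update_id counter on TraceSpan."""

    def __init__(self):
        self.update_id = 0
        self._update_id_lock = threading.Lock()

    def increment_update_id(self) -> int:
        # The lock ensures concurrent callers never lose an increment.
        with self._update_id_lock:
            self.update_id += 1
            return self.update_id

span = SpanLike()
threads = [threading.Thread(target=span.increment_update_id) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(span.update_id)  # always 8, regardless of thread interleaving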
judgeval/data/trace_run.py
CHANGED
@@ -1,7 +1,7 @@
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union
 from judgeval.data import Trace
-from judgeval.scorers import
+from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.rules import Rule
 
 
@@ -13,7 +13,7 @@ class TraceRun(BaseModel):
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         traces (List[Trace]): The traces to evaluate
-        scorers (List[Union[JudgmentScorer,
+        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
@@ -26,7 +26,7 @@
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
     traces: Optional[List[Trace]] = None
-    scorers: List[Union[
+    scorers: List[Union[APIScorerConfig, BaseScorer]]
     model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     append: Optional[bool] = False
judgeval/evaluation_run.py
CHANGED
@@ -1,8 +1,8 @@
 from typing import List, Optional, Union
 from pydantic import BaseModel, field_validator, Field
 
-from judgeval.data import Example
-from judgeval.scorers import
+from judgeval.data import Example
+from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.constants import ACCEPTABLE_MODELS
 
 
@@ -13,8 +13,8 @@ class EvaluationRun(BaseModel):
     Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
-        examples (
-        scorers (List[Union[JudgmentScorer,
+        examples (List[Example]): The examples to evaluate
+        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
@@ -23,8 +23,8 @@ class EvaluationRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = Field(default=None, validate_default=True)
     eval_name: Optional[str] = Field(default=None, validate_default=True)
-    examples:
-    scorers: List[Union[
+    examples: List[Example]
+    scorers: List[Union[APIScorerConfig, BaseScorer]]
     model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -36,13 +36,8 @@ class EvaluationRun(BaseModel):
         data = super().model_dump(**kwargs)
 
         data["scorers"] = [
-            scorer.
-
-            else scorer.model_dump()
-            if hasattr(scorer, "model_dump")
-            else {"score_type": scorer.score_type, "threshold": scorer.threshold}
-            for scorer in self.scorers
-        ]
+            scorer.model_dump() for scorer in self.scorers
+        ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
 
         return data
 
@@ -50,21 +45,19 @@ class EvaluationRun(BaseModel):
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-
-        first_type = type(v[0])
-        if first_type not in (Example, CustomExample):
-            raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
-        if not all(isinstance(ex, first_type) for ex in v):
-            raise ValueError(
-                "All examples must be of the same type, either all Example or all CustomExample."
-            )
-
         return v
 
-    @field_validator("scorers")
+    @field_validator("scorers", mode="before")
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
+        if not all(
+            isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
+            for scorer in v
+        ):
+            raise ValueError(
+                "All scorers must be of type BaseScorer or APIScorerConfig."
+            )
         return v
 
     @field_validator("model")
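The examples field is now a plain List[Example] (the CustomExample branch is gone), and the scorers validator runs with mode="before" so values that are not BaseScorer or APIScorerConfig instances are rejected before pydantic attempts coercion. A generic pydantic sketch of that "validate before coercion" pattern, using stand-in scorer classes rather than the judgeval ones:

from typing import List, Union
from pydantic import BaseModel, field_validator

class StandInAPIScorer(BaseModel):
    score_type: str = "faithfulness"
    threshold: float = 0.5

class StandInCustomScorer(BaseModel):
    name: str = "my_scorer"

class Run(BaseModel):
    scorers: List[Union[StandInAPIScorer, StandInCustomScorer]]

    @field_validator("scorers", mode="before")
    def validate_scorers(cls, v):
        # Runs before coercion, so raw dicts/strings are rejected up front.
        if not v:
            raise ValueError("Scorers cannot be empty.")
        if not all(isinstance(s, (StandInAPIScorer, StandInCustomScorer)) for s in v):
            raise ValueError("All scorers must be scorer instances.")
        return v

Run(scorers=[StandInAPIScorer()])    # accepted
# Run(scorers=[{"threshold": 0.9}])  # rejected by the before-validator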
judgeval/integrations/langgraph.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence
 from uuid import UUID
 import time
 import uuid
-from datetime import datetime
+from datetime import datetime, timezone
 
 from judgeval.common.tracer import (
     TraceClient,
@@ -120,8 +120,6 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                 trace_id,
                 event_name,
                 project_name=project,
-                overwrite=False,
-                rules=self.tracer.rules,
                 enable_monitoring=self.tracer.enable_monitoring,
                 enable_evaluations=self.tracer.enable_evaluations,
             )
@@ -140,7 +138,6 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
            # NEW: Initial save for live tracking (follows the new practice)
            try:
                trace_id_saved, server_response = self._trace_client.save(
-                    overwrite=self._trace_client.overwrite,
                    final_save=False,  # Initial save for live tracking
                )
            except Exception as e:
@@ -210,6 +207,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
            # Set both fields on the span
            new_span.inputs = clean_inputs
            new_span.additional_metadata = metadata
+            new_span.increment_update_id()  # Thread-safe increment for span modification
        else:
            new_span.inputs = {}
            new_span.additional_metadata = {}
@@ -249,10 +247,12 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
        trace_span = trace_client.span_id_to_span.get(span_id)
        if trace_span:
            trace_span.duration = duration
+            trace_span.increment_update_id()  # Thread-safe increment for span modification
 
            # Handle outputs and error
            if error:
                trace_span.output = error
+                trace_span.increment_update_id()  # Thread-safe increment for span modification
            elif outputs:
                # Separate metadata from outputs
                metadata = {}
@@ -272,6 +272,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
                # Set both fields on the span
                trace_span.output = clean_outputs
+                trace_span.increment_update_id()  # Thread-safe increment for span modification
                if metadata:
                    # Merge with existing metadata
                    existing_metadata = trace_span.additional_metadata or {}
@@ -279,6 +280,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                        **existing_metadata,
                        **metadata,
                    }
+                    trace_span.increment_update_id()  # Thread-safe increment for span modification
 
        # Queue span with completed state through background service
        if trace_client.background_span_service:
@@ -308,20 +310,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                complete_trace_data = {
                    "trace_id": self._trace_client.trace_id,
                    "name": self._trace_client.name,
-                    "created_at": datetime.
-                        self._trace_client.start_time
+                    "created_at": datetime.fromtimestamp(
+                        self._trace_client.start_time, timezone.utc
                    ).isoformat(),
                    "duration": self._trace_client.get_duration(),
                    "trace_spans": [
                        span.model_dump() for span in self._trace_client.trace_spans
                    ],
-                    "overwrite": self._trace_client.overwrite,
                    "offline_mode": self.tracer.offline_mode,
                    "parent_trace_id": self._trace_client.parent_trace_id,
                    "parent_name": self._trace_client.parent_name,
                }
                trace_id, trace_data = self._trace_client.save(
-                    overwrite=self._trace_client.overwrite,
                    final_save=True,  # Final save with usage counter updates
                )
                token = self.trace_id_to_token.pop(trace_id, None)
@@ -518,20 +518,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
            complete_trace_data = {
                "trace_id": trace_client.trace_id,
                "name": trace_client.name,
-                "created_at": datetime.
-                    trace_client.start_time
+                "created_at": datetime.fromtimestamp(
+                    trace_client.start_time, timezone.utc
                ).isoformat(),
                "duration": trace_client.get_duration(),
                "trace_spans": [
                    span.model_dump() for span in trace_client.trace_spans
                ],
-                "overwrite": trace_client.overwrite,
                "offline_mode": self.tracer.offline_mode,
                "parent_trace_id": trace_client.parent_trace_id,
                "parent_name": trace_client.parent_name,
            }
            trace_id_saved, trace_data = trace_client.save(
-                overwrite=trace_client.overwrite,
                final_save=True,
            )
 
@@ -815,6 +813,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
        if span_id and span_id in trace_client.span_id_to_span:
            trace_span = trace_client.span_id_to_span[span_id]
            trace_span.usage = usage
+            trace_span.increment_update_id()  # Thread-safe increment for span modification
 
        self._end_span_tracking(trace_client, run_id, outputs=outputs)
        # --- End Token Usage ---
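Two behavioural changes stand out in the handler: the overwrite/rules plumbing is removed from trace creation and saving, and created_at is now built with an explicitly UTC-aware datetime.fromtimestamp(..., timezone.utc) instead of a naive conversion. A small sketch of the timestamp change (the start_time value here is made up):

from datetime import datetime, timezone

start_time = 1720000000.0  # illustrative time.time()-style float, not a real trace value
created_at = datetime.fromtimestamp(start_time, timezone.utc).isoformat()
print(created_at)  # 2024-07-03T09:46:40+00:00 -- carries an explicit UTC offset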
judgeval/judges/litellm_judge.py
CHANGED
@@ -6,7 +6,7 @@ from judgeval.common.utils import (
     afetch_litellm_api_response,
     fetch_litellm_api_response,
 )
-from judgeval.common.logger import
+from judgeval.common.logger import judgeval_logger
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -15,7 +15,6 @@ BASE_CONVERSATION = [
 
 class LiteLLMJudge(JudgevalJudge):
     def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
-        debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
@@ -25,7 +24,6 @@ class LiteLLMJudge(JudgevalJudge):
         input: Union[str, List[Mapping[str, str]]],
         schema: pydantic.BaseModel = None,
     ) -> str:
-        debug(f"Generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_litellm_api_response(
@@ -36,7 +34,7 @@ class LiteLLMJudge(JudgevalJudge):
                 model=self.model, messages=input, response_format=schema
             )
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -46,7 +44,6 @@ class LiteLLMJudge(JudgevalJudge):
         input: Union[str, List[Mapping[str, str]]],
         schema: pydantic.BaseModel = None,
     ) -> str:
-        debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             response = await afetch_litellm_api_response(
@@ -59,7 +56,7 @@ class LiteLLMJudge(JudgevalJudge):
             )
             return response
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
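The free debug()/error() helpers from judgeval.common.logger are dropped in favour of a single judgeval_logger object: per-call debug lines disappear and only the error paths keep logging. A generic sketch of that module-level logger pattern follows; this is an assumption about the shape of the API, not necessarily how judgeval.common.logger constructs judgeval_logger:

import logging

# Hypothetical construction; the real judgeval.common.logger may configure this differently.
judgeval_logger = logging.getLogger("judgeval")
judgeval_logger.addHandler(logging.StreamHandler())
judgeval_logger.setLevel(logging.INFO)

judgeval_logger.error("Invalid input type received: %s", type([]))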