judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
  47. judgeval-0.0.54.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.52.dist-info/RECORD +0 -69
  59. {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/scripts/openapi_transform.py ADDED
@@ -0,0 +1,123 @@
+ import json
+ import sys
+ from typing import Any, Dict, Generator, List
+ import requests
+
+ spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"
+
+ if spec_file.startswith("http"):
+     r = requests.get(spec_file)
+     r.raise_for_status()
+     SPEC = r.json()
+ else:
+     with open(spec_file, "r") as f:
+         SPEC = json.load(f)
+
+ JUDGEVAL_PATHS: List[str] = [
+     "/log_eval_results/",
+ ]
+
+
+ def resolve_ref(ref: str) -> str:
+     assert ref.startswith("#/components/schemas/"), (
+         "Reference must start with #/components/schemas/"
+     )
+     return ref.replace("#/components/schemas/", "")
+
+
+ def walk(obj: Any) -> Generator[Any, None, None]:
+     yield obj
+     if isinstance(obj, list):
+         for item in obj:
+             yield from walk(item)
+     elif isinstance(obj, dict):
+         for value in obj.values():
+             yield from walk(value)
+
+
+ def get_referenced_schemas(obj: Any) -> Generator[str, None, None]:
+     for value in walk(obj):
+         if isinstance(value, dict) and "$ref" in value:
+             ref = value["$ref"]
+             resolved = resolve_ref(ref)
+             assert isinstance(ref, str), "Reference must be a string"
+             # Strip the _JudgmentType suffix if it exists to get the original schema name
+             if resolved.endswith("_JudgmentType"):
+                 resolved = resolved[: -len("_JudgmentType")]
+             yield resolved
+
+
+ def transform_schema_refs(obj: Any) -> Any:
+     """Transform all $ref values in a schema to use the _JudgmentType suffix"""
+     if isinstance(obj, dict):
+         result = {}
+         for key, value in obj.items():
+             if (
+                 key == "$ref"
+                 and isinstance(value, str)
+                 and value.startswith("#/components/schemas/")
+             ):
+                 # Update the reference to use the suffixed name
+                 original_name = resolve_ref(value)
+                 suffixed_name = f"{original_name}_JudgmentType"
+                 result[key] = f"#/components/schemas/{suffixed_name}"
+             else:
+                 result[key] = transform_schema_refs(value)
+         return result
+     elif isinstance(obj, list):
+         return [transform_schema_refs(item) for item in obj]
+     else:
+         return obj
+
+
+ filtered_paths = {
+     path: spec_data
+     for path, spec_data in SPEC["paths"].items()
+     if path in JUDGEVAL_PATHS
+ }
+
+
+ def filter_schemas() -> Dict[str, Any]:
+     result: Dict[str, Any] = {}
+     processed_original_names: set[str] = set()
+     schemas_to_scan: Any = filtered_paths
+
+     while True:
+         to_commit: Dict[str, Any] = {}
+         for original_schema_name in get_referenced_schemas(schemas_to_scan):
+             if original_schema_name in processed_original_names:
+                 continue
+
+             assert original_schema_name in SPEC["components"]["schemas"], (
+                 f"Schema {original_schema_name} not found in components.schemas"
+             )
+             # Transform the schema to update any internal references
+             original_schema = SPEC["components"]["schemas"][original_schema_name]
+             transformed_schema = transform_schema_refs(original_schema)
+             suffixed_name = f"{original_schema_name}_JudgmentType"
+             to_commit[suffixed_name] = transformed_schema
+             processed_original_names.add(original_schema_name)
+
+         if not to_commit:
+             break
+
+         result.update(to_commit)
+         schemas_to_scan = to_commit
+
+     return result
+
+
+ # Transform the filtered paths to update schema references
+ transformed_paths = transform_schema_refs(filtered_paths)
+
+ spec = {
+     "openapi": SPEC["openapi"],
+     "info": SPEC["info"],
+     "paths": transformed_paths,
+     "components": {
+         **SPEC["components"],
+         "schemas": filter_schemas(),
+     },
+ }
+
+ print(json.dumps(spec, indent=4))
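
Note: the new script above narrows an OpenAPI spec to the paths in JUDGEVAL_PATHS, pulls in every schema those paths reference (transitively, via filter_schemas), rewrites each $ref to a *_JudgmentType name, and prints the reduced spec; judgeval/data/judgment_types.py (also added in this release) appears to be generated from that output. A minimal, self-contained sketch of the $ref-suffixing idea, on hypothetical data and without importing the script itself:

# Toy version of the transform_schema_refs() rewriting shown above (illustrative only).
def suffix_refs(obj):
    if isinstance(obj, dict):
        return {
            k: (v + "_JudgmentType"
                if k == "$ref" and isinstance(v, str) and v.startswith("#/components/schemas/")
                else suffix_refs(v))
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [suffix_refs(item) for item in obj]
    return obj

fragment = {"items": {"$ref": "#/components/schemas/Tool"}}
print(suffix_refs(fragment))  # {'items': {'$ref': '#/components/schemas/Tool_JudgmentType'}}
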
judgeval/data/tool.py CHANGED
@@ -1,56 +1,5 @@
- from pydantic import BaseModel, field_validator
- from typing import Dict, Any, Optional, List
- import warnings
+ from judgeval.data.judgment_types import ToolJudgmentType
 
 
- class Tool(BaseModel):
-     tool_name: str
-     parameters: Optional[Dict[str, Any]] = None
-     agent_name: Optional[str] = None
-     result_dependencies: Optional[List[Dict[str, Any]]] = None
-     action_dependencies: Optional[List[Dict[str, Any]]] = None
-     require_all: Optional[bool] = None
-
-     @field_validator("tool_name")
-     def validate_tool_name(cls, v):
-         if not v:
-             warnings.warn("Tool name is empty or None", UserWarning)
-         return v
-
-     @field_validator("parameters")
-     def validate_parameters(cls, v):
-         if v is not None and not isinstance(v, dict):
-             warnings.warn(
-                 f"Parameters should be a dictionary, got {type(v)}", UserWarning
-             )
-         return v
-
-     @field_validator("agent_name")
-     def validate_agent_name(cls, v):
-         if v is not None and not isinstance(v, str):
-             warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
-         return v
-
-     @field_validator("result_dependencies")
-     def validate_result_dependencies(cls, v):
-         if v is not None and not isinstance(v, list):
-             warnings.warn(
-                 f"Result dependencies should be a list, got {type(v)}", UserWarning
-             )
-         return v
-
-     @field_validator("action_dependencies")
-     def validate_action_dependencies(cls, v):
-         if v is not None and not isinstance(v, list):
-             warnings.warn(
-                 f"Action dependencies should be a list, got {type(v)}", UserWarning
-             )
-         return v
-
-     @field_validator("require_all")
-     def validate_require_all(cls, v):
-         if v is not None and not isinstance(v, bool):
-             warnings.warn(
-                 f"Require all should be a boolean, got {type(v)}", UserWarning
-             )
-         return v
+ class Tool(ToolJudgmentType):
+     pass
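
Note: Tool is now a thin subclass of the generated ToolJudgmentType, so the old warning-only field validators are gone and validation comes from the generated model instead. A quick sketch, assuming the generated type keeps the field names the removed hand-written model declared (tool_name, parameters, and so on):

# Sketch only: field names assumed to match the removed hand-written model.
from judgeval.data.tool import Tool

tool = Tool(tool_name="web_search", parameters={"query": "weather in SF"})
print(tool.model_dump())
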
judgeval/data/trace.py CHANGED
@@ -1,44 +1,21 @@
- from pydantic import BaseModel, Field
- from typing import Optional, Dict, Any, List
- from judgeval.evaluation_run import EvaluationRun
- from judgeval.data.tool import Tool
+ from typing import Any
  import json
  import sys
+ import threading
  from datetime import datetime, timezone
+ from judgeval.data.judgment_types import (
+     TraceUsageJudgmentType,
+     TraceSpanJudgmentType,
+     TraceJudgmentType,
+ )
+ from pydantic import BaseModel
 
 
- class TraceUsage(BaseModel):
-     prompt_tokens: Optional[int] = None
-     completion_tokens: Optional[int] = None
-     total_tokens: Optional[int] = None
-     prompt_tokens_cost_usd: Optional[float] = None
-     completion_tokens_cost_usd: Optional[float] = None
-     total_cost_usd: Optional[float] = None
-     model_name: Optional[str] = None
-
-
- class TraceSpan(BaseModel):
-     span_id: str
-     trace_id: str
-     function: str
-     depth: int
-     created_at: Optional[Any] = None
-     parent_span_id: Optional[str] = None
-     span_type: Optional[str] = "span"
-     inputs: Optional[Dict[str, Any]] = None
-     error: Optional[Dict[str, Any]] = None
-     output: Optional[Any] = None
-     usage: Optional[TraceUsage] = None
-     duration: Optional[float] = None
-     annotation: Optional[List[Dict[str, Any]]] = None
-     evaluation_runs: Optional[List[EvaluationRun]] = []
-     expected_tools: Optional[List[Tool]] = None
-     additional_metadata: Optional[Dict[str, Any]] = None
-     has_evaluation: Optional[bool] = False
-     agent_name: Optional[str] = None
-     state_before: Optional[Dict[str, Any]] = None
-     state_after: Optional[Dict[str, Any]] = None
+ class TraceUsage(TraceUsageJudgmentType):
+     pass
 
+
+ class TraceSpan(TraceSpanJudgmentType):
      def model_dump(self, **kwargs):
          return {
              "span_id": self.span_id,
@@ -50,9 +27,6 @@ class TraceSpan(BaseModel):
              "inputs": self._serialize_value(self.inputs),
              "output": self._serialize_value(self.output),
              "error": self._serialize_value(self.error),
-             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
-             if self.evaluation_runs
-             else [],
              "parent_span_id": self.parent_span_id,
              "function": self.function,
              "duration": self.duration,
@@ -63,8 +37,24 @@ class TraceSpan(BaseModel):
              "state_before": self.state_before,
              "state_after": self.state_after,
              "additional_metadata": self._serialize_value(self.additional_metadata),
+             "update_id": self.update_id,
          }
 
+     def __init__(self, **data):
+         super().__init__(**data)
+         # Initialize thread lock for thread-safe update_id increment
+         self._update_id_lock = threading.Lock()
+
+     def increment_update_id(self) -> int:
+         """
+         Thread-safe method to increment the update_id counter.
+         Returns:
+             int: The new update_id value after incrementing
+         """
+         with self._update_id_lock:
+             self.update_id += 1
+             return self.update_id
+
      def print_span(self):
          """Print the span with proper formatting and parent relationship information."""
          indent = "  " * self.depth
@@ -94,6 +84,7 @@ class TraceSpan(BaseModel):
              return repr(output)
          except (TypeError, OverflowError, ValueError):
              pass
+
          return None
 
      def _serialize_value(self, value: Any) -> Any:
@@ -140,15 +131,5 @@
          return {"error": "Unable to serialize"}
 
 
- class Trace(BaseModel):
-     trace_id: str
-     name: str
-     created_at: str
-     duration: float
-     trace_spans: List[TraceSpan]
-     overwrite: bool = False
-     offline_mode: bool = False
-     rules: Dict[str, Any] = Field(default_factory=dict)
-     has_notification: Optional[bool] = False
-     customer_id: Optional[str] = None
-     tags: List[str] = Field(default_factory=list)
+ class Trace(TraceJudgmentType):
+     pass
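
Note: TraceSpan now carries an update_id counter plus a per-instance threading.Lock, and increment_update_id() is what callers (for example the LangGraph handler further down) invoke whenever they mutate a span. A small sketch, assuming update_id defaults to 0 and the old required fields (span_id, trace_id, function, depth) are unchanged in the generated type:

# Sketch: several threads bumping the same span's update_id under its lock.
import threading
from judgeval.data.trace import TraceSpan

span = TraceSpan(span_id="s1", trace_id="t1", function="call_llm", depth=0)
threads = [threading.Thread(target=span.increment_update_id) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(span.update_id)  # expected 8: each increment happens while holding the lock
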
judgeval/data/trace_run.py CHANGED
@@ -1,7 +1,7 @@
  from pydantic import BaseModel
  from typing import List, Optional, Dict, Any, Union
  from judgeval.data import Trace
- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+ from judgeval.scorers import APIScorerConfig, BaseScorer
  from judgeval.rules import Rule
 
 
@@ -13,7 +13,7 @@ class TraceRun(BaseModel):
          project_name (str): The name of the project the evaluation results belong to
          eval_name (str): A name for this evaluation run
          traces (List[Trace]): The traces to evaluate
-         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+         scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
          model (str): The model used as a judge when using LLM as a Judge
          metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
          judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
@@ -26,7 +26,7 @@ class TraceRun(BaseModel):
      project_name: Optional[str] = None
      eval_name: Optional[str] = None
      traces: Optional[List[Trace]] = None
-     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+     scorers: List[Union[APIScorerConfig, BaseScorer]]
      model: Optional[str] = "gpt-4.1"
      trace_span_id: Optional[str] = None
      append: Optional[bool] = False
judgeval/evaluation_run.py CHANGED
@@ -1,8 +1,8 @@
  from typing import List, Optional, Union
  from pydantic import BaseModel, field_validator, Field
 
- from judgeval.data import Example, CustomExample
- from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
+ from judgeval.data import Example
+ from judgeval.scorers import BaseScorer, APIScorerConfig
  from judgeval.constants import ACCEPTABLE_MODELS
 
 
@@ -13,8 +13,8 @@ class EvaluationRun(BaseModel):
      Args:
          project_name (str): The name of the project the evaluation results belong to
          eval_name (str): A name for this evaluation run
-         examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
-         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+         examples (List[Example]): The examples to evaluate
+         scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
          model (str): The model used as a judge when using LLM as a Judge
          metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
          judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
@@ -23,8 +23,8 @@ class EvaluationRun(BaseModel):
      organization_id: Optional[str] = None
      project_name: Optional[str] = Field(default=None, validate_default=True)
      eval_name: Optional[str] = Field(default=None, validate_default=True)
-     examples: Union[List[Example], List[CustomExample]]
-     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+     examples: List[Example]
+     scorers: List[Union[APIScorerConfig, BaseScorer]]
      model: Optional[str] = "gpt-4.1"
      trace_span_id: Optional[str] = None
      # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -36,13 +36,8 @@ class EvaluationRun(BaseModel):
          data = super().model_dump(**kwargs)
 
          data["scorers"] = [
-             scorer.to_dict()
-             if hasattr(scorer, "to_dict")
-             else scorer.model_dump()
-             if hasattr(scorer, "model_dump")
-             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
-             for scorer in self.scorers
-         ]
+             scorer.model_dump() for scorer in self.scorers
+         ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
 
          return data
 
@@ -50,21 +45,19 @@
      def validate_examples(cls, v):
          if not v:
              raise ValueError("Examples cannot be empty.")
-
-         first_type = type(v[0])
-         if first_type not in (Example, CustomExample):
-             raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
-         if not all(isinstance(ex, first_type) for ex in v):
-             raise ValueError(
-                 "All examples must be of the same type, either all Example or all CustomExample."
-             )
-
          return v
 
-     @field_validator("scorers")
+     @field_validator("scorers", mode="before")
      def validate_scorers(cls, v):
          if not v:
              raise ValueError("Scorers cannot be empty.")
+         if not all(
+             isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
+             for scorer in v
+         ):
+             raise ValueError(
+                 "All scorers must be of type BaseScorer or APIScorerConfig."
+             )
          return v
 
      @field_validator("model")
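
Note: EvaluationRun now takes plain Example lists (CustomExample is removed) and validates scorers up front as BaseScorer / APIScorerConfig, while model_dump() serializes each scorer via its own model_dump(). A hedged construction sketch; the scorer class and Example field names are assumed from the rest of the package rather than shown in this diff:

# Sketch only: FaithfulnessScorer and the Example fields are assumed to exist as in 0.0.54.
from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun
from judgeval.scorers import FaithfulnessScorer

run = EvaluationRun(
    project_name="demo",
    eval_name="smoke_test",
    examples=[Example(input="What is 2 + 2?", actual_output="4")],
    scorers=[FaithfulnessScorer(threshold=0.7)],
)
payload = run.model_dump()  # scorers serialized with scorer.model_dump()
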
judgeval/integrations/langgraph.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional, Sequence
  from uuid import UUID
  import time
  import uuid
- from datetime import datetime
+ from datetime import datetime, timezone
 
  from judgeval.common.tracer import (
      TraceClient,
@@ -120,8 +120,6 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                  trace_id,
                  event_name,
                  project_name=project,
-                 overwrite=False,
-                 rules=self.tracer.rules,
                  enable_monitoring=self.tracer.enable_monitoring,
                  enable_evaluations=self.tracer.enable_evaluations,
              )
@@ -140,7 +138,6 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          # NEW: Initial save for live tracking (follows the new practice)
          try:
              trace_id_saved, server_response = self._trace_client.save(
-                 overwrite=self._trace_client.overwrite,
                  final_save=False,  # Initial save for live tracking
              )
          except Exception as e:
@@ -210,6 +207,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
              # Set both fields on the span
              new_span.inputs = clean_inputs
              new_span.additional_metadata = metadata
+             new_span.increment_update_id()  # Thread-safe increment for span modification
          else:
              new_span.inputs = {}
              new_span.additional_metadata = {}
@@ -249,10 +247,12 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          trace_span = trace_client.span_id_to_span.get(span_id)
          if trace_span:
              trace_span.duration = duration
+             trace_span.increment_update_id()  # Thread-safe increment for span modification
 
              # Handle outputs and error
              if error:
                  trace_span.output = error
+                 trace_span.increment_update_id()  # Thread-safe increment for span modification
              elif outputs:
                  # Separate metadata from outputs
                  metadata = {}
@@ -272,6 +272,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
                  # Set both fields on the span
                  trace_span.output = clean_outputs
+                 trace_span.increment_update_id()  # Thread-safe increment for span modification
                  if metadata:
                      # Merge with existing metadata
                      existing_metadata = trace_span.additional_metadata or {}
@@ -279,6 +280,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                          **existing_metadata,
                          **metadata,
                      }
+                     trace_span.increment_update_id()  # Thread-safe increment for span modification
 
          # Queue span with completed state through background service
          if trace_client.background_span_service:
@@ -308,20 +310,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          complete_trace_data = {
              "trace_id": self._trace_client.trace_id,
              "name": self._trace_client.name,
-             "created_at": datetime.utcfromtimestamp(
-                 self._trace_client.start_time
+             "created_at": datetime.fromtimestamp(
+                 self._trace_client.start_time, timezone.utc
              ).isoformat(),
              "duration": self._trace_client.get_duration(),
              "trace_spans": [
                  span.model_dump() for span in self._trace_client.trace_spans
              ],
-             "overwrite": self._trace_client.overwrite,
              "offline_mode": self.tracer.offline_mode,
              "parent_trace_id": self._trace_client.parent_trace_id,
              "parent_name": self._trace_client.parent_name,
          }
          trace_id, trace_data = self._trace_client.save(
-             overwrite=self._trace_client.overwrite,
              final_save=True,  # Final save with usage counter updates
          )
          token = self.trace_id_to_token.pop(trace_id, None)
@@ -518,20 +518,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          complete_trace_data = {
              "trace_id": trace_client.trace_id,
              "name": trace_client.name,
-             "created_at": datetime.utcfromtimestamp(
-                 trace_client.start_time
+             "created_at": datetime.fromtimestamp(
+                 trace_client.start_time, timezone.utc
              ).isoformat(),
              "duration": trace_client.get_duration(),
              "trace_spans": [
                  span.model_dump() for span in trace_client.trace_spans
              ],
-             "overwrite": trace_client.overwrite,
              "offline_mode": self.tracer.offline_mode,
              "parent_trace_id": trace_client.parent_trace_id,
              "parent_name": trace_client.parent_name,
          }
          trace_id_saved, trace_data = trace_client.save(
-             overwrite=trace_client.overwrite,
              final_save=True,
          )
 
@@ -815,6 +813,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          if span_id and span_id in trace_client.span_id_to_span:
              trace_span = trace_client.span_id_to_span[span_id]
              trace_span.usage = usage
+             trace_span.increment_update_id()  # Thread-safe increment for span modification
 
          self._end_span_tracking(trace_client, run_id, outputs=outputs)
          # --- End Token Usage ---
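
Note: the created_at change replaces datetime.utcfromtimestamp(), which returns a naive datetime and is deprecated since Python 3.12, with an explicitly UTC-aware timestamp. The difference in the serialized value:

from datetime import datetime, timezone

ts = 1_700_000_000.0
print(datetime.utcfromtimestamp(ts).isoformat())             # 2023-11-14T22:13:20 (naive)
print(datetime.fromtimestamp(ts, timezone.utc).isoformat())   # 2023-11-14T22:13:20+00:00 (aware)
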
judgeval/judges/litellm_judge.py CHANGED
@@ -6,7 +6,7 @@ from judgeval.common.utils import (
      afetch_litellm_api_response,
      fetch_litellm_api_response,
  )
- from judgeval.common.logger import debug, error
+ from judgeval.common.logger import judgeval_logger
 
  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
@@ -15,7 +15,6 @@ BASE_CONVERSATION = [
 
  class LiteLLMJudge(JudgevalJudge):
      def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
-         debug(f"Initializing LiteLLMJudge with model={model}")
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)
@@ -25,7 +24,6 @@ class LiteLLMJudge(JudgevalJudge):
          input: Union[str, List[Mapping[str, str]]],
          schema: pydantic.BaseModel = None,
      ) -> str:
-         debug(f"Generating response for input type: {type(input)}")
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              return fetch_litellm_api_response(
@@ -36,7 +34,7 @@ class LiteLLMJudge(JudgevalJudge):
                  model=self.model, messages=input, response_format=schema
              )
          else:
-             error(f"Invalid input type received: {type(input)}")
+             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError(
                  f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
              )
@@ -46,7 +44,6 @@ class LiteLLMJudge(JudgevalJudge):
          input: Union[str, List[Mapping[str, str]]],
          schema: pydantic.BaseModel = None,
      ) -> str:
-         debug(f"Async generating response for input type: {type(input)}")
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              response = await afetch_litellm_api_response(
@@ -59,7 +56,7 @@ class LiteLLMJudge(JudgevalJudge):
              )
              return response
          else:
-             error(f"Invalid input type received: {type(input)}")
+             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError(
                  f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
              )
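
Note: the module-level debug()/error() helpers are replaced by a single judgeval_logger across the judges. Assuming judgeval_logger behaves like a standard logging.Logger, call sites migrate along these lines:

# Hypothetical call-site migration; judgeval_logger assumed to be a logging.Logger.
from judgeval.common.logger import judgeval_logger

# before: error(f"Invalid input type received: {type(input)}")
judgeval_logger.error("Invalid input type received: %s", type(42))
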