judgeval-0.0.27-py3-none-any.whl → judgeval-0.0.28-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -73,8 +73,9 @@ class TraceEntry:
  span_id: str # Unique ID for this specific span instance
  depth: int # Indentation level for nested calls
  message: str # Human-readable description
- timestamp: float # Unix timestamp when entry was created
+ # created_at: Unix timestamp when entry was created, replacing the deprecated 'timestamp' field
  duration: Optional[float] = None # Time taken (for exit/evaluation entries)
+ trace_id: str = None # ID of the trace this entry belongs to
  output: Any = None # Function output value
  # Use field() for mutable defaults to avoid shared state issues
  inputs: dict = field(default_factory=dict)
@@ -161,9 +162,10 @@ class TraceEntry:
  "type": self.type,
  "function": self.function,
  "span_id": self.span_id,
+ "trace_id": self.trace_id,
  "depth": self.depth,
  "message": self.message,
- "timestamp": self.timestamp,
+ "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
  "duration": self.duration,
  "output": self._serialize_output(),
  "inputs": self._serialize_inputs(),
@@ -228,13 +230,12 @@ class TraceManagerClient:
 
  return response.json()
 
- def save_trace(self, trace_data: dict, empty_save: bool):
+ def save_trace(self, trace_data: dict):
  """
  Saves a trace to the database
 
  Args:
  trace_data: The trace data to save
- empty_save: Whether to save an empty trace
  NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
  """
  response = requests.post(
@@ -253,7 +254,7 @@ class TraceManagerClient:
  elif response.status_code != HTTPStatus.OK:
  raise ValueError(f"Failed to save trace data: {response.text}")
 
- if not empty_save and "ui_results_url" in response.json():
+ if "ui_results_url" in response.json():
  pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
  rprint(pretty_str)
 
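Call sites simply drop the removed flag. A minimal sketch of the 0.0.28 call; the TraceManagerClient constructor arguments are not shown in this diff and are assumptions:

    from judgeval.common.tracer import TraceManagerClient

    trace_data = {"trace_id": "7f2a41d8-...", "entries": []}  # minimal illustrative payload
    client = TraceManagerClient(judgment_api_key="sk-...", organization_id="org_...")  # constructor args assumed
    client.save_trace(trace_data)                      # 0.0.28: the empty_save flag is gone
    # client.save_trace(trace_data, empty_save=False)  # 0.0.27 call shape, now a TypeError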
@@ -377,9 +378,10 @@ class TraceClient:
  type="enter",
  function=name,
  span_id=span_id, # Use the generated span_id
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=current_depth,
  message=name,
- timestamp=start_time,
+ created_at=start_time,
  span_type=span_type,
  parent_span_id=parent_span_id # Use the parent_id from context var
  )
@@ -394,9 +396,10 @@ class TraceClient:
  type="exit",
  function=name,
  span_id=span_id, # Use the same span_id for exit
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=exit_depth,
  message=f"← {name}",
- timestamp=time.time(),
+ created_at=time.time(),
  duration=duration,
  span_type=span_type
  ))
@@ -496,6 +499,7 @@ class TraceClient:
  metadata={},
  judgment_api_key=self.tracer.api_key,
  override=self.overwrite,
+ trace_span_id=current_span_var.get(),
  rules=loaded_rules # Use the combined rules
  )
 
@@ -524,9 +528,10 @@ class TraceClient:
  type="evaluation",
  function=function_name,
  span_id=current_span_id, # Associate with current span
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=current_depth,
  message=f"Evaluation results for {function_name}",
- timestamp=time.time(),
+ created_at=time.time(),
  evaluation_runs=[eval_run],
  duration=duration,
  span_type="evaluation"
@@ -548,9 +553,10 @@ class TraceClient:
  type="input",
  function=function_name,
  span_id=current_span_id, # Use current span_id
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=current_depth,
  message=f"Inputs to {function_name}",
- timestamp=time.time(),
+ created_at=time.time(),
  inputs=inputs,
  span_type=entry_span_type
  ))
@@ -583,7 +589,7 @@ class TraceClient:
  span_id=current_span_id, # Use current span_id
  depth=current_depth,
  message=f"Output from {function_name}",
- timestamp=time.time(),
+ created_at=time.time(),
  output="<pending>" if inspect.iscoroutine(output) else output,
  span_type=entry_span_type
  )
@@ -666,6 +672,7 @@ class TraceClient:
  preserving parent-child span relationships using span_id and parent_span_id.
  """
  spans_by_id: Dict[str, dict] = {}
+ evaluation_runs: List[EvaluationRun] = []
 
  # First pass: Group entries by span_id and gather data
  for entry in entries:
@@ -679,7 +686,8 @@ class TraceClient:
  "span_id": span_id,
  "function": entry["function"],
  "depth": entry["depth"], # Use the depth recorded at entry time
- "timestamp": entry["timestamp"],
+ "created_at": entry["created_at"],
+ "trace_id": entry["trace_id"],
  "parent_span_id": entry.get("parent_span_id"),
  "span_type": entry.get("span_type", "span"),
  "inputs": None,
@@ -704,14 +712,14 @@ class TraceClient:
  current_span_data["output"] = entry["output"]
 
  elif entry["type"] == "evaluation" and entry.get("evaluation_runs"):
- if current_span_data.get("evaluation_runs") is None:
- current_span_data["evaluation_runs"] = []
- current_span_data["evaluation_runs"].extend(entry["evaluation_runs"])
+ if current_span_data.get("evaluation_runs") is not None:
+ evaluation_runs.extend(entry["evaluation_runs"])
 
  elif entry["type"] == "exit":
  if current_span_data["duration"] is None: # Calculate duration only once
- start_time = current_span_data.get("timestamp", entry["timestamp"])
- current_span_data["duration"] = entry["timestamp"] - start_time
+ start_time = datetime.fromisoformat(current_span_data.get("created_at", entry["created_at"]))
+ end_time = datetime.fromisoformat(entry["created_at"])
+ current_span_data["duration"] = (end_time - start_time).total_seconds()
  # Update depth if exit depth is different (though current span() implementation keeps it same)
  # current_span_data["depth"] = entry["depth"]
 
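Because `created_at` is carried through as an ISO-8601 string, the exit handler now parses both endpoints before subtracting. A small self-contained illustration of the same arithmetic, with made-up values:

    from datetime import datetime

    start = datetime.fromisoformat("2025-01-01T12:00:00.000000")
    end = datetime.fromisoformat("2025-01-01T12:00:01.250000")
    duration = (end - start).total_seconds()  # 1.25, matching the new exit-entry calculation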
@@ -733,7 +741,7 @@ class TraceClient:
  children_map[parent_id].append(span)
 
  # Sort roots by timestamp
- roots.sort(key=lambda x: x.get("timestamp", 0))
+ roots.sort(key=lambda x: datetime.fromisoformat(x.get("created_at", "1970-01-01T00:00:00")))
 
  # Perform depth-first traversal to get the final sorted list
  sorted_condensed_list = []
@@ -747,9 +755,9 @@ class TraceClient:
 
  sorted_condensed_list.append(span_data) # Add parent before children
 
- # Get children, sort them by timestamp, and visit them
+ # Get children, sort them by created_at, and visit them
  span_children = children_map.get(span_id, [])
- span_children.sort(key=lambda x: x.get("timestamp", 0))
+ span_children.sort(key=lambda x: datetime.fromisoformat(x.get("created_at", "1970-01-01T00:00:00")))
  for child in span_children:
  # Ensure the child exists in our map before recursing
  if child['span_id'] in span_map:
@@ -777,9 +785,9 @@ class TraceClient:
  sorted_condensed_list.append(span_data)
 
 
- return sorted_condensed_list
+ return sorted_condensed_list, evaluation_runs
 
- def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
+ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
  """
  Save the current trace to the database.
  Returns a tuple of (trace_id, trace_data) where trace_data is the trace data that was saved.
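`condense_trace` now returns a pair and `save` no longer accepts `empty_save`. A hedged sketch of updated caller code, assuming `trace` is an existing TraceClient instance:

    raw_entries = [entry.to_dict() for entry in trace.entries]
    condensed_entries, evaluation_runs = trace.condense_trace(raw_entries)  # now a 2-tuple
    trace_id, trace_data = trace.save(overwrite=True)                       # empty_save= is no longer accepted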
@@ -789,7 +797,7 @@ class TraceClient:
 
  raw_entries = [entry.to_dict() for entry in self.entries]
 
- condensed_entries = self.condense_trace(raw_entries)
+ condensed_entries, evaluation_runs = self.condense_trace(raw_entries)
 
  # Calculate total token counts from LLM API calls
  total_prompt_tokens = 0
@@ -862,32 +870,32 @@ class TraceClient:
  "total_cost_usd": total_cost
  },
  "entries": condensed_entries,
- "empty_save": empty_save,
+ "evaluation_runs": evaluation_runs,
  "overwrite": overwrite,
  "parent_trace_id": self.parent_trace_id,
  "parent_name": self.parent_name
  }
  # Execute asynchrous evaluation in the background
- if not empty_save: # Only send to RabbitMQ if the trace is not empty
- # Send trace data to evaluation queue via API
- try:
- response = requests.post(
- JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
- json=trace_data,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.tracer.api_key}",
- "X-Organization-Id": self.tracer.organization_id
- },
- verify=True
- )
+ # if not empty_save: # Only send to RabbitMQ if the trace is not empty
+ # # Send trace data to evaluation queue via API
+ # try:
+ # response = requests.post(
+ # JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
+ # json=trace_data,
+ # headers={
+ # "Content-Type": "application/json",
+ # "Authorization": f"Bearer {self.tracer.api_key}",
+ # "X-Organization-Id": self.tracer.organization_id
+ # },
+ # verify=True
+ # )
 
- if response.status_code != HTTPStatus.OK:
- warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
- except Exception as e:
- warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
+ # if response.status_code != HTTPStatus.OK:
+ # warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
+ # except Exception as e:
+ # warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
 
- self.trace_manager_client.save_trace(trace_data, empty_save)
+ self.trace_manager_client.save_trace(trace_data)
 
  return self.trace_id, trace_data
 
@@ -975,7 +983,6 @@ class Tracer:
  with trace.span(name or "unnamed_trace") as span:
  try:
  # Save the trace to the database to handle Evaluations' trace_id referential integrity
- trace.save(empty_save=True, overwrite=overwrite)
  yield trace
  finally:
  # Reset the context variable
@@ -1032,7 +1039,7 @@ class Tracer:
  )
 
  # Save empty trace and set trace context
- current_trace.save(empty_save=True, overwrite=overwrite)
+ # current_trace.save(empty_save=True, overwrite=overwrite)
  trace_token = current_trace_var.set(current_trace)
 
  try:
@@ -1052,7 +1059,7 @@ class Tracer:
  span.record_output(result)
 
  # Save the completed trace
- current_trace.save(empty_save=False, overwrite=overwrite)
+ current_trace.save(overwrite=overwrite)
  return result
  finally:
  # Reset trace context (span context resets automatically)
@@ -1101,7 +1108,7 @@ class Tracer:
  )
 
  # Save empty trace and set trace context
- current_trace.save(empty_save=True, overwrite=overwrite)
+ # current_trace.save(empty_save=True, overwrite=overwrite)
  trace_token = current_trace_var.set(current_trace)
 
  try:
@@ -1121,7 +1128,7 @@ class Tracer:
  span.record_output(result)
 
  # Save the completed trace
- current_trace.save(empty_save=False, overwrite=overwrite)
+ current_trace.save(overwrite=overwrite)
  return result
  finally:
  # Reset trace context (span context resets automatically)
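Net effect of the decorator changes above: the up-front empty save is gone, and a trace is persisted once, when the wrapped function completes. A comment-only sketch of the before/after flow inside the wrapper (the intermediate call to the wrapped function is paraphrased):

    # 0.0.27: current_trace.save(empty_save=True, overwrite=overwrite)   # placeholder row created up front
    #         result = func(*args, **kwargs)
    #         current_trace.save(empty_save=False, overwrite=overwrite)  # final save
    #
    # 0.0.28: result = func(*args, **kwargs)
    #         current_trace.save(overwrite=overwrite)                    # single save on completion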
judgeval/data/custom_api_example.py ADDED
@@ -0,0 +1,91 @@
+ from typing import List, Optional, Dict, Any, Union
+ from pydantic import BaseModel, ConfigDict, model_validator
+
+ from judgeval.data.example import Example
+ from judgeval.data.custom_example import CustomExample
+ from judgeval.data.scorer_data import ScorerData
+ from judgeval.common.logger import debug, error
+
+ class ProcessExample(BaseModel):
+ """
+ ProcessExample is an `Example` object that contains intermediate information
+ about an undergoing evaluation on the original `Example`. It is used purely for
+ internal operations and keeping track of the evaluation process.
+ """
+ name: str
+ # input: Optional[str] = None
+ # actual_output: Optional[Union[str, List[str]]] = None
+ # expected_output: Optional[Union[str, List[str]]] = None
+ # context: Optional[list] = None
+ # retrieval_context: Optional[list] = None
+ # tools_called: Optional[list] = None
+ # expected_tools: Optional[list] = None
+
+ # make these optional, not all test cases in a conversation will be evaluated
+ success: Optional[bool] = None
+ scorers_data: Optional[List[ScorerData]] = None
+ run_duration: Optional[float] = None
+ evaluation_cost: Optional[float] = None
+
+ order: Optional[int] = None
+ # These should map 1 to 1 from golden
+ additional_metadata: Optional[Dict] = None
+ comments: Optional[str] = None
+ trace_id: Optional[str] = None
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
+ def update_scorer_data(self, scorer_data: ScorerData):
+ """
+ Updates scorer data field of test case after the scorers have been
+ evaluated on this test case.
+ """
+ debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
+ # self.scorers_data is a list of ScorerData objects that contain the
+ # evaluation results of each scorer on this test case
+ if self.scorers_data is None:
+ self.scorers_data = [scorer_data]
+ else:
+ self.scorers_data.append(scorer_data)
+
+ if self.success is None:
+ # self.success will be None when it is a message
+ # in that case we will be setting success for the first time
+ self.success = scorer_data.success
+ else:
+ if scorer_data.success is False:
+ debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
+ self.success = False
+
+ def update_run_duration(self, run_duration: float):
+ self.run_duration = run_duration
+
+
+ def create_process_custom_example(
+ example: CustomExample,
+ ) -> ProcessExample:
+ """
+ When an LLM Test Case is executed, we track its progress using an ProcessExample.
+
+ This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
+ """
+ success = True
+ if example.name is not None:
+ name = example.name
+ else:
+ name = "Test Case Placeholder"
+ debug(f"No name provided for example, using default name: {name}")
+ order = None
+ scorers_data = []
+
+ debug(f"Creating ProcessExample for: {name}")
+ process_ex = ProcessExample(
+ name=name,
+ success=success,
+ scorers_data=scorers_data,
+ run_duration=None,
+ evaluation_cost=None,
+ order=order,
+ additional_metadata=example.additional_metadata,
+ trace_id=example.trace_id
+ )
+ return process_ex
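A hedged usage sketch of the new helper; the module path is inferred from the new RECORD entry further down, and the CustomExample constructor fields shown are assumptions beyond what this diff includes:

    from judgeval.data.custom_example import CustomExample
    from judgeval.data.custom_api_example import create_process_custom_example  # path inferred from RECORD

    example = CustomExample(name="checkout flow", additional_metadata={"suite": "smoke"}, trace_id=None)  # fields assumed
    process_ex = create_process_custom_example(example)
    process_ex.update_run_duration(0.42)
    print(process_ex.success, process_ex.scorers_data)  # True, []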
judgeval/data/result.py CHANGED
@@ -49,9 +49,9 @@ class ScoringResult(BaseModel):
 
  def generate_scoring_result(
  example: Example,
- success: bool,
  scorers_data: List[ScorerData],
  run_duration: float,
+ success: bool,
  ) -> ScoringResult:
  """
  Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
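Because `success` moved to the end of the parameter list, 0.0.27-style positional callers would bind arguments to the wrong parameters. Calling with keywords sidesteps the reorder; a sketch with illustrative values:

    result = generate_scoring_result(
        example=example,                               # an existing judgeval Example
        scorers_data=scorers_data,                     # List[ScorerData]
        run_duration=1.8,
        success=all(s.success for s in scorers_data),  # illustrative way to derive the flag
    )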
judgeval/evaluation_run.py CHANGED
@@ -34,6 +34,7 @@ class EvaluationRun(BaseModel):
  model: Union[str, List[str], JudgevalJudge]
  aggregator: Optional[str] = None
  metadata: Optional[Dict[str, Any]] = None
+ trace_span_id: Optional[str] = None
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
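The new `trace_span_id` field lets an evaluation run point back at the span that triggered it; the tracer passes `current_span_var.get()` when it builds the run. A hedged construction sketch, with the remaining required fields and their names assumed rather than taken from this diff:

    eval_run = EvaluationRun(
        # ...other required fields (e.g. examples, scorers) omitted; exact names assumed...
        model="gpt-4o",
        trace_span_id=current_span_var.get(),  # new in 0.0.28: ties the run to the active span
        override=False,
    )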
judgeval-0.0.27.dist-info/METADATA → judgeval-0.0.28.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.27
+ Version: 0.0.28
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.27.dist-info/RECORD → judgeval-0.0.28.dist-info/RECORD RENAMED
@@ -1,18 +1,19 @@
  judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
  judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
- judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
+ judgeval/evaluation_run.py,sha256=6Kft3wZDWkdBDZoMwOhWf7zSAOF4naI7Pcg_YlZaZY4,6394
  judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
  judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=L6JkCHj6kxhtDzf9OPg5ZC-NUUH4VDvDcV4utPi_I38,57544
+ judgeval/common/tracer.py,sha256=Qpn2m6LCpRq1OOWRd1z16JtmeS7ITIWaQNJOddmAfQY,58178
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
  judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
+ judgeval/data/custom_api_example.py,sha256=uW_ZBzkDLWumtudmfRHAJQkVYpm2qWgcDf7vBNLpS-o,3444
  judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
- judgeval/data/result.py,sha256=YHD-dVYJN4JFpM-YCGgBtSdFcGAOyWYL41sf0TE9Hzg,3122
+ judgeval/data/result.py,sha256=BT4f2FF5EFuiRjOmS4vuIXsrEwSlG16Vw3QaWi6PZzc,3122
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
  judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
@@ -86,7 +87,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
- judgeval-0.0.27.dist-info/METADATA,sha256=yoUWIaLIDPksMYQSxDIbVFjtFVCxim6-5LSQ2P13a-U,5418
- judgeval-0.0.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.27.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.27.dist-info/RECORD,,
+ judgeval-0.0.28.dist-info/METADATA,sha256=GSGf7_cb7FkKdQ_PFPf4nw9hlMrKyD3Tv6X8m2uo3EY,5418
+ judgeval-0.0.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.28.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.28.dist-info/RECORD,,