judgeval-0.0.27-py3-none-any.whl → judgeval-0.0.28-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -73,8 +73,9 @@ class TraceEntry:
  span_id: str # Unique ID for this specific span instance
  depth: int # Indentation level for nested calls
  message: str # Human-readable description
- timestamp: float # Unix timestamp when entry was created
+ # created_at: Unix timestamp when entry was created, replacing the deprecated 'timestamp' field
  duration: Optional[float] = None # Time taken (for exit/evaluation entries)
+ trace_id: str = None # ID of the trace this entry belongs to
  output: Any = None # Function output value
  # Use field() for mutable defaults to avoid shared state issues
  inputs: dict = field(default_factory=dict)
@@ -161,9 +162,10 @@ class TraceEntry:
  "type": self.type,
  "function": self.function,
  "span_id": self.span_id,
+ "trace_id": self.trace_id,
  "depth": self.depth,
  "message": self.message,
- "timestamp": self.timestamp,
+ "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
  "duration": self.duration,
  "output": self._serialize_output(),
  "inputs": self._serialize_inputs(),
@@ -228,13 +230,12 @@ class TraceManagerClient:
 
  return response.json()
 
- def save_trace(self, trace_data: dict, empty_save: bool):
+ def save_trace(self, trace_data: dict):
  """
  Saves a trace to the database
 
  Args:
  trace_data: The trace data to save
- empty_save: Whether to save an empty trace
  NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
  """
  response = requests.post(
@@ -253,7 +254,7 @@ class TraceManagerClient:
  elif response.status_code != HTTPStatus.OK:
  raise ValueError(f"Failed to save trace data: {response.text}")
 
- if not empty_save and "ui_results_url" in response.json():
+ if "ui_results_url" in response.json():
  pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
  rprint(pretty_str)
 
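Call sites simply drop the removed flag. A minimal sketch of the 0.0.28 call; the TraceManagerClient constructor arguments are not shown in this diff and are assumptions:

    from judgeval.common.tracer import TraceManagerClient

    trace_data = {"trace_id": "7f2a41d8-...", "entries": []}  # minimal illustrative payload
    client = TraceManagerClient(judgment_api_key="sk-...", organization_id="org_...")  # constructor args assumed
    client.save_trace(trace_data)                      # 0.0.28: the empty_save flag is gone
    # client.save_trace(trace_data, empty_save=False)  # 0.0.27 call shape, now a TypeError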
@@ -377,9 +378,10 @@ class TraceClient:
  type="enter",
  function=name,
  span_id=span_id, # Use the generated span_id
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=current_depth,
  message=name,
- timestamp=start_time,
+ created_at=start_time,
  span_type=span_type,
  parent_span_id=parent_span_id # Use the parent_id from context var
  )
@@ -394,9 +396,10 @@ class TraceClient:
  type="exit",
  function=name,
  span_id=span_id, # Use the same span_id for exit
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=exit_depth,
  message=f"← {name}",
- timestamp=time.time(),
+ created_at=time.time(),
  duration=duration,
  span_type=span_type
  ))
@@ -496,6 +499,7 @@ class TraceClient:
  metadata={},
  judgment_api_key=self.tracer.api_key,
  override=self.overwrite,
+ trace_span_id=current_span_var.get(),
  rules=loaded_rules # Use the combined rules
  )
 
@@ -524,9 +528,10 @@ class TraceClient:
  type="evaluation",
  function=function_name,
  span_id=current_span_id, # Associate with current span
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=current_depth,
  message=f"Evaluation results for {function_name}",
- timestamp=time.time(),
+ created_at=time.time(),
  evaluation_runs=[eval_run],
  duration=duration,
  span_type="evaluation"
@@ -548,9 +553,10 @@ class TraceClient:
  type="input",
  function=function_name,
  span_id=current_span_id, # Use current span_id
+ trace_id=self.trace_id, # Use the trace_id from the trace client
  depth=current_depth,
  message=f"Inputs to {function_name}",
- timestamp=time.time(),
+ created_at=time.time(),
  inputs=inputs,
  span_type=entry_span_type
  ))
@@ -583,7 +589,7 @@ class TraceClient:
  span_id=current_span_id, # Use current span_id
  depth=current_depth,
  message=f"Output from {function_name}",
- timestamp=time.time(),
+ created_at=time.time(),
  output="<pending>" if inspect.iscoroutine(output) else output,
  span_type=entry_span_type
  )
@@ -666,6 +672,7 @@ class TraceClient:
  preserving parent-child span relationships using span_id and parent_span_id.
  """
  spans_by_id: Dict[str, dict] = {}
+ evaluation_runs: List[EvaluationRun] = []
 
  # First pass: Group entries by span_id and gather data
  for entry in entries:
@@ -679,7 +686,8 @@ class TraceClient:
  "span_id": span_id,
  "function": entry["function"],
  "depth": entry["depth"], # Use the depth recorded at entry time
- "timestamp": entry["timestamp"],
+ "created_at": entry["created_at"],
+ "trace_id": entry["trace_id"],
  "parent_span_id": entry.get("parent_span_id"),
  "span_type": entry.get("span_type", "span"),
  "inputs": None,
@@ -704,14 +712,14 @@ class TraceClient:
  current_span_data["output"] = entry["output"]
 
  elif entry["type"] == "evaluation" and entry.get("evaluation_runs"):
- if current_span_data.get("evaluation_runs") is None:
- current_span_data["evaluation_runs"] = []
- current_span_data["evaluation_runs"].extend(entry["evaluation_runs"])
+ if current_span_data.get("evaluation_runs") is not None:
+ evaluation_runs.extend(entry["evaluation_runs"])
 
  elif entry["type"] == "exit":
  if current_span_data["duration"] is None: # Calculate duration only once
- start_time = current_span_data.get("timestamp", entry["timestamp"])
- current_span_data["duration"] = entry["timestamp"] - start_time
+ start_time = datetime.fromisoformat(current_span_data.get("created_at", entry["created_at"]))
+ end_time = datetime.fromisoformat(entry["created_at"])
+ current_span_data["duration"] = (end_time - start_time).total_seconds()
  # Update depth if exit depth is different (though current span() implementation keeps it same)
  # current_span_data["depth"] = entry["depth"]
 
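Because `created_at` is carried through as an ISO-8601 string, the exit handler now parses both endpoints before subtracting. A small self-contained illustration of the same arithmetic, with made-up values:

    from datetime import datetime

    start = datetime.fromisoformat("2025-01-01T12:00:00.000000")
    end = datetime.fromisoformat("2025-01-01T12:00:01.250000")
    duration = (end - start).total_seconds()  # 1.25, matching the new exit-entry calculation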
@@ -733,7 +741,7 @@ class TraceClient:
  children_map[parent_id].append(span)
 
  # Sort roots by timestamp
- roots.sort(key=lambda x: x.get("timestamp", 0))
+ roots.sort(key=lambda x: datetime.fromisoformat(x.get("created_at", "1970-01-01T00:00:00")))
 
  # Perform depth-first traversal to get the final sorted list
  sorted_condensed_list = []
@@ -747,9 +755,9 @@ class TraceClient:
 
  sorted_condensed_list.append(span_data) # Add parent before children
 
- # Get children, sort them by timestamp, and visit them
+ # Get children, sort them by created_at, and visit them
  span_children = children_map.get(span_id, [])
- span_children.sort(key=lambda x: x.get("timestamp", 0))
+ span_children.sort(key=lambda x: datetime.fromisoformat(x.get("created_at", "1970-01-01T00:00:00")))
  for child in span_children:
  # Ensure the child exists in our map before recursing
  if child['span_id'] in span_map:
@@ -777,9 +785,9 @@ class TraceClient:
  sorted_condensed_list.append(span_data)
 
 
- return sorted_condensed_list
+ return sorted_condensed_list, evaluation_runs
 
- def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
+ def save(self, overwrite: bool = False) -> Tuple[str, dict]:
  """
  Save the current trace to the database.
  Returns a tuple of (trace_id, trace_data) where trace_data is the trace data that was saved.
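`condense_trace` now returns a pair and `save` no longer accepts `empty_save`. A hedged sketch of updated caller code, assuming `trace` is an existing TraceClient instance:

    raw_entries = [entry.to_dict() for entry in trace.entries]
    condensed_entries, evaluation_runs = trace.condense_trace(raw_entries)  # now a 2-tuple
    trace_id, trace_data = trace.save(overwrite=True)                       # empty_save= is no longer accepted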
@@ -789,7 +797,7 @@ class TraceClient:
 
  raw_entries = [entry.to_dict() for entry in self.entries]
 
- condensed_entries = self.condense_trace(raw_entries)
+ condensed_entries, evaluation_runs = self.condense_trace(raw_entries)
 
  # Calculate total token counts from LLM API calls
  total_prompt_tokens = 0
@@ -862,32 +870,32 @@ class TraceClient:
  "total_cost_usd": total_cost
  },
  "entries": condensed_entries,
- "empty_save": empty_save,
+ "evaluation_runs": evaluation_runs,
  "overwrite": overwrite,
  "parent_trace_id": self.parent_trace_id,
  "parent_name": self.parent_name
  }
  # Execute asynchrous evaluation in the background
- if not empty_save: # Only send to RabbitMQ if the trace is not empty
- # Send trace data to evaluation queue via API
- try:
- response = requests.post(
- JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
- json=trace_data,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.tracer.api_key}",
- "X-Organization-Id": self.tracer.organization_id
- },
- verify=True
- )
+ # if not empty_save: # Only send to RabbitMQ if the trace is not empty
+ # # Send trace data to evaluation queue via API
+ # try:
+ # response = requests.post(
+ # JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
+ # json=trace_data,
+ # headers={
+ # "Content-Type": "application/json",
+ # "Authorization": f"Bearer {self.tracer.api_key}",
+ # "X-Organization-Id": self.tracer.organization_id
+ # },
+ # verify=True
+ # )
 
- if response.status_code != HTTPStatus.OK:
- warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
- except Exception as e:
- warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
+ # if response.status_code != HTTPStatus.OK:
+ # warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
+ # except Exception as e:
+ # warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
 
- self.trace_manager_client.save_trace(trace_data, empty_save)
+ self.trace_manager_client.save_trace(trace_data)
 
  return self.trace_id, trace_data
 
@@ -975,7 +983,6 @@ class Tracer:
  with trace.span(name or "unnamed_trace") as span:
  try:
  # Save the trace to the database to handle Evaluations' trace_id referential integrity
- trace.save(empty_save=True, overwrite=overwrite)
  yield trace
  finally:
  # Reset the context variable
@@ -1032,7 +1039,7 @@ class Tracer:
  )
 
  # Save empty trace and set trace context
- current_trace.save(empty_save=True, overwrite=overwrite)
+ # current_trace.save(empty_save=True, overwrite=overwrite)
  trace_token = current_trace_var.set(current_trace)
 
  try:
@@ -1052,7 +1059,7 @@ class Tracer:
  span.record_output(result)
 
  # Save the completed trace
- current_trace.save(empty_save=False, overwrite=overwrite)
+ current_trace.save(overwrite=overwrite)
  return result
  finally:
  # Reset trace context (span context resets automatically)
@@ -1101,7 +1108,7 @@ class Tracer:
  )
 
  # Save empty trace and set trace context
- current_trace.save(empty_save=True, overwrite=overwrite)
+ # current_trace.save(empty_save=True, overwrite=overwrite)
  trace_token = current_trace_var.set(current_trace)
 
  try:
@@ -1121,7 +1128,7 @@ class Tracer:
  span.record_output(result)
 
  # Save the completed trace
- current_trace.save(empty_save=False, overwrite=overwrite)
+ current_trace.save(overwrite=overwrite)
  return result
  finally:
  # Reset trace context (span context resets automatically)
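Net effect of the decorator changes above: the up-front empty save is gone, and a trace is persisted once, when the wrapped function completes. A comment-only sketch of the before/after flow inside the wrapper (the intermediate call to the wrapped function is paraphrased):

    # 0.0.27: current_trace.save(empty_save=True, overwrite=overwrite)   # placeholder row created up front
    #         result = func(*args, **kwargs)
    #         current_trace.save(empty_save=False, overwrite=overwrite)  # final save
    #
    # 0.0.28: result = func(*args, **kwargs)
    #         current_trace.save(overwrite=overwrite)                    # single save on completion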
judgeval/data/custom_api_example.py ADDED
@@ -0,0 +1,91 @@
+ from typing import List, Optional, Dict, Any, Union
+ from pydantic import BaseModel, ConfigDict, model_validator
+
+ from judgeval.data.example import Example
+ from judgeval.data.custom_example import CustomExample
+ from judgeval.data.scorer_data import ScorerData
+ from judgeval.common.logger import debug, error
+
+ class ProcessExample(BaseModel):
+ """
+ ProcessExample is an `Example` object that contains intermediate information
+ about an undergoing evaluation on the original `Example`. It is used purely for
+ internal operations and keeping track of the evaluation process.
+ """
+ name: str
+ # input: Optional[str] = None
+ # actual_output: Optional[Union[str, List[str]]] = None
+ # expected_output: Optional[Union[str, List[str]]] = None
+ # context: Optional[list] = None
+ # retrieval_context: Optional[list] = None
+ # tools_called: Optional[list] = None
+ # expected_tools: Optional[list] = None
+
+ # make these optional, not all test cases in a conversation will be evaluated
+ success: Optional[bool] = None
+ scorers_data: Optional[List[ScorerData]] = None
+ run_duration: Optional[float] = None
+ evaluation_cost: Optional[float] = None
+
+ order: Optional[int] = None
+ # These should map 1 to 1 from golden
+ additional_metadata: Optional[Dict] = None
+ comments: Optional[str] = None
+ trace_id: Optional[str] = None
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
+ def update_scorer_data(self, scorer_data: ScorerData):
+ """
+ Updates scorer data field of test case after the scorers have been
+ evaluated on this test case.
+ """
+ debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
+ # self.scorers_data is a list of ScorerData objects that contain the
+ # evaluation results of each scorer on this test case
+ if self.scorers_data is None:
+ self.scorers_data = [scorer_data]
+ else:
+ self.scorers_data.append(scorer_data)
+
+ if self.success is None:
+ # self.success will be None when it is a message
+ # in that case we will be setting success for the first time
+ self.success = scorer_data.success
+ else:
+ if scorer_data.success is False:
+ debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
+ self.success = False
+
+ def update_run_duration(self, run_duration: float):
+ self.run_duration = run_duration
+
+
+ def create_process_custom_example(
+ example: CustomExample,
+ ) -> ProcessExample:
+ """
+ When an LLM Test Case is executed, we track its progress using an ProcessExample.
+
+ This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
+ """
+ success = True
+ if example.name is not None:
+ name = example.name
+ else:
+ name = "Test Case Placeholder"
+ debug(f"No name provided for example, using default name: {name}")
+ order = None
+ scorers_data = []
+
+ debug(f"Creating ProcessExample for: {name}")
+ process_ex = ProcessExample(
+ name=name,
+ success=success,
+ scorers_data=scorers_data,
+ run_duration=None,
+ evaluation_cost=None,
+ order=order,
+ additional_metadata=example.additional_metadata,
+ trace_id=example.trace_id
+ )
+ return process_ex
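A hedged usage sketch of the new helper; the module path is inferred from the new RECORD entry further down, and the CustomExample constructor fields shown are assumptions beyond what this diff includes:

    from judgeval.data.custom_example import CustomExample
    from judgeval.data.custom_api_example import create_process_custom_example  # path inferred from RECORD

    example = CustomExample(name="checkout flow", additional_metadata={"suite": "smoke"}, trace_id=None)  # fields assumed
    process_ex = create_process_custom_example(example)
    process_ex.update_run_duration(0.42)
    print(process_ex.success, process_ex.scorers_data)  # True, []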
judgeval/data/result.py CHANGED
@@ -49,9 +49,9 @@ class ScoringResult(BaseModel):
 
  def generate_scoring_result(
  example: Example,
- success: bool,
  scorers_data: List[ScorerData],
  run_duration: float,
+ success: bool,
  ) -> ScoringResult:
  """
  Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
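Because `success` moved to the end of the parameter list, 0.0.27-style positional callers would bind arguments to the wrong parameters. Calling with keywords sidesteps the reorder; a sketch with illustrative values:

    result = generate_scoring_result(
        example=example,                               # an existing judgeval Example
        scorers_data=scorers_data,                     # List[ScorerData]
        run_duration=1.8,
        success=all(s.success for s in scorers_data),  # illustrative way to derive the flag
    )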
judgeval/evaluation_run.py CHANGED
@@ -34,6 +34,7 @@ class EvaluationRun(BaseModel):
  model: Union[str, List[str], JudgevalJudge]
  aggregator: Optional[str] = None
  metadata: Optional[Dict[str, Any]] = None
+ trace_span_id: Optional[str] = None
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
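The new `trace_span_id` field lets an evaluation run point back at the span that triggered it; the tracer passes `current_span_var.get()` when it builds the run. A hedged construction sketch, with the remaining required fields and their names assumed rather than taken from this diff:

    eval_run = EvaluationRun(
        # ...other required fields (e.g. examples, scorers) omitted; exact names assumed...
        model="gpt-4o",
        trace_span_id=current_span_var.get(),  # new in 0.0.28: ties the run to the active span
        override=False,
    )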
judgeval-0.0.27.dist-info/METADATA → judgeval-0.0.28.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.27
+ Version: 0.0.28
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.27.dist-info/RECORD → judgeval-0.0.28.dist-info/RECORD RENAMED
@@ -1,18 +1,19 @@
  judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
  judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
- judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
+ judgeval/evaluation_run.py,sha256=6Kft3wZDWkdBDZoMwOhWf7zSAOF4naI7Pcg_YlZaZY4,6394
  judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
  judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=L6JkCHj6kxhtDzf9OPg5ZC-NUUH4VDvDcV4utPi_I38,57544
+ judgeval/common/tracer.py,sha256=Qpn2m6LCpRq1OOWRd1z16JtmeS7ITIWaQNJOddmAfQY,58178
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
  judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
+ judgeval/data/custom_api_example.py,sha256=uW_ZBzkDLWumtudmfRHAJQkVYpm2qWgcDf7vBNLpS-o,3444
  judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
- judgeval/data/result.py,sha256=YHD-dVYJN4JFpM-YCGgBtSdFcGAOyWYL41sf0TE9Hzg,3122
+ judgeval/data/result.py,sha256=BT4f2FF5EFuiRjOmS4vuIXsrEwSlG16Vw3QaWi6PZzc,3122
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
  judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
@@ -86,7 +87,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
- judgeval-0.0.27.dist-info/METADATA,sha256=yoUWIaLIDPksMYQSxDIbVFjtFVCxim6-5LSQ2P13a-U,5418
- judgeval-0.0.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.27.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.27.dist-info/RECORD,,
+ judgeval-0.0.28.dist-info/METADATA,sha256=GSGf7_cb7FkKdQ_PFPf4nw9hlMrKyD3Tv6X8m2uo3EY,5418
+ judgeval-0.0.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.28.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.28.dist-info/RECORD,,