judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- judgeval/common/tracer.py +93 -55
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/eval_dataset_client.py +62 -3
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +59 -0
- judgeval/data/sequence_run.py +42 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +77 -14
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/METADATA +1 -1
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/RECORD +23 -20
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/WHEEL +0 -0
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/licenses/LICENSE.md +0 -0
```diff
@@ -22,9 +22,8 @@ from langchain_core.documents import Document
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, tracer: Tracer):
         self.tracer = tracer
-        self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
         self.previous_spans = []  # stack of previous spans
-        self.
+        self.created_trace = False
 
         # Attributes for users to access
         self.previous_node = None
@@ -33,43 +32,58 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_tools = []
 
     def start_span(self, name: str, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         start_time = time.time()
+
+        # Generate a unique ID for *this specific span invocation*
+        span_id = str(uuid.uuid4())
+
+        parent_span_id = current_trace.get_current_span()
+        token = current_trace.set_current_span(span_id)  # Set *this* span's ID as the current one
+
+        current_depth = 0
+        if parent_span_id and parent_span_id in current_trace._span_depths:
+            current_depth = current_trace._span_depths[parent_span_id] + 1
 
+        current_trace._span_depths[span_id] = current_depth  # Store depth by span_id
         # Record span entry
-
+        current_trace.add_entry(TraceEntry(
             type="enter",
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            parent_span_id=parent_span_id,
             function=name,
-            depth=
+            depth=current_depth,
             message=name,
-
+            created_at=start_time,
             span_type=span_type
         ))
 
-        self.
-        self.previous_spans.append(self.trace_client._current_span)
-        self.trace_client._current_span = name
+        self.previous_spans.append(token)
         self._start_time = start_time
 
-    def end_span(self,
-        self.
+    def end_span(self, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         duration = time.time() - self._start_time
+        span_id = current_trace.get_current_span()
+        exit_depth = current_trace._span_depths.get(span_id, 0)  # Get depth using this span's ID
 
         # Record span exit
-
+        current_trace.add_entry(TraceEntry(
             type="exit",
-
-
-
-
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            depth=exit_depth,
+            created_at=time.time(),
             duration=duration,
             span_type=span_type
         ))
-
-
-        if self.trace_client.tracer.depth == 0:
+        current_trace.reset_current_span(self.previous_spans.pop())
+        if exit_depth == 0:
             # Save the trace if we are the root, this is when users dont use any @observe decorators
-
-            self.
+            trace_id, trace_data = current_trace.save(overwrite=True)
+            self._trace_id = trace_id
+            current_trace = None
 
     def on_retriever_start(
         self,
@@ -85,9 +99,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "RETRIEVER_CALL"
         if serialized and "name" in serialized:
             name = f"RETRIEVER_{serialized['name'].upper()}"
-
+        current_trace = self.tracer.get_current_trace()
         self.start_span(name, span_type="retriever")
-
+        current_trace.record_input({
             'query': query,
             'tags': tags,
             'metadata': metadata,
@@ -103,6 +117,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> Any:
         # Process the retrieved documents into a format suitable for logging
+        current_trace = self.tracer.get_current_trace()
         doc_summary = []
         for i, doc in enumerate(documents):
             # Extract key information from each document
@@ -114,13 +129,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             doc_summary.append(doc_data)
 
         # Record the document data
-
+        current_trace.record_output({
             "document_count": len(documents),
             "documents": doc_summary
         })
 
         # End the retriever span
-        self.end_span(
+        self.end_span(span_type="retriever")
 
     def on_chain_start(
         self,
@@ -134,29 +149,26 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> None:
         # If the user doesnt use any @observe decorators, the first action in LangGraph workflows seems tohave this attribute, so we intialize our trace client here
+        current_trace = self.tracer.get_current_trace()
         if kwargs.get('name') == 'LangGraph':
-            if not
+            if not current_trace:
+                self.created_trace = True
                 trace_id = str(uuid.uuid4())
                 project = self.tracer.project_name
-                trace = TraceClient(self.tracer, trace_id,
-                self.
-                self.
-                # Only save empty trace for the root call
-                self.trace_client.save(empty_save=True, overwrite=False)
-
-            self.start_span("LangGraph", span_type="Main Function")
+                trace = TraceClient(self.tracer, trace_id, "Langgraph", project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+                self.tracer.set_current_trace(trace)
+            self.start_span("LangGraph", span_type="Main Function")
 
-
-        if node
-
-
-
-
-
-
-
-            self.previous_node = node
+        node = metadata.get("langgraph_node")
+        if node != None and node != self.previous_node:
+            self.start_span(node, span_type="node")
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+            current_trace.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+        self.previous_node = node
 
     def on_chain_end(
         self,
@@ -167,14 +179,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         tags: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Any:
-
-        self.finished = True
+        current_trace = self.tracer.get_current_trace()
         if tags is not None and any("graph:step" in tag for tag in tags):
-
-            self.end_span(
+            current_trace.record_output(outputs)
+            self.end_span(span_type="node")
 
-
-
+        if self.created_trace and (outputs == "__end__" or (not kwargs and not tags)):
+            self.end_span(span_type="Main Function")
 
     def on_chain_error(
         self,
@@ -184,9 +195,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="node")
 
     def on_tool_start(
         self,
@@ -199,19 +210,21 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
         if name:
             # Track tool execution
-
+            current_trace.executed_tools.append(name)
             node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
-
-
-
-
-
+            current_trace.executed_node_tools.append(node_tool)
+            current_trace.record_input({
+                'args': input_str,
+                'kwargs': kwargs
+            })
 
     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
-        self.
-
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(output)
+        self.end_span(span_type="tool")
 
     def on_tool_error(
         self,
@@ -221,9 +234,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="tool")
 
     def on_agent_action(
         self,
@@ -233,7 +246,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
+        pass
 
     def on_agent_finish(
         self,
@@ -243,7 +256,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
+
+        pass
 
     def on_llm_start(
         self,
@@ -256,14 +270,16 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         name = "LLM call"
         self.start_span(name, span_type="llm")
-        self.
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': prompts,
             'kwargs': kwargs
         })
 
     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
-        self.
-
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(response.generations[0][0].text)
+        self.end_span(span_type="llm")
 
     def on_llm_error(
         self,
@@ -273,9 +289,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="llm")
 
     def on_chat_model_start(
         self,
@@ -297,7 +313,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "LLM call"
 
         self.start_span(name, span_type="llm")
-        self.
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': str(messages),
             'kwargs': kwargs
         })
```
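The callback handler changes above (judgeval/integrations/langgraph.py) make the handler resolve the active trace through `tracer.get_current_trace()` on every callback, and create and save its own trace when a LangGraph run starts without any `@observe` decorator. A minimal usage sketch, assuming a configured `Tracer` and a compiled LangGraph graph; `build_graph`, the `Tracer` constructor arguments, and the input payload are placeholders, not part of this diff:

```python
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler

tracer = Tracer(project_name="my_project")   # assumed constructor arguments
handler = JudgevalCallbackHandler(tracer)

graph = build_graph()  # any compiled LangGraph graph (placeholder)

# Standard LangChain/LangGraph callback wiring: on_chain_start sees
# name == "LangGraph", finds no current trace, and creates one itself.
result = graph.invoke({"input": "hello"}, config={"callbacks": [handler]})

# Attributes the handler tracks during the run (per the diff above)
print(handler.executed_nodes, handler.executed_node_tools)
```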
judgeval/judgment_client.py
CHANGED
```diff
@@ -10,6 +10,8 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
+    CustomExample,
+    Sequence,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -20,8 +22,10 @@ from judgeval.scorers import (
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
-    assert_test
+    assert_test,
+    run_sequence_eval
 )
+from judgeval.data.sequence_run import SequenceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -78,15 +82,71 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)
+
+    def run_sequence_evaluation(
+        self,
+        sequences: List[Sequence],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_sequence",
+        use_judgment: bool = True,
+        log_results: bool = True,
+        override: bool = False,
+        ignore_errors: bool = True,
+        rules: Optional[List[Rule]] = None
+    ) -> List[ScoringResult]:
+        try:
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+
+            sequence_run = SequenceRun(
+                project_name=project_name,
+                eval_name=eval_run_name,
+                sequences=sequences,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                judgment_api_key=self.judgment_api_key,
+                organization_id=self.organization_id
+            )
+
+            return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
+        except ValueError as e:
+            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def run_evaluation(
         self,
-        examples: List[Example],
+        examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
@@ -95,6 +155,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         async_execution: bool = False,
@@ -104,7 +165,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         Executes an evaluation of `Example`s using one or more `Scorer`s
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
             scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -120,6 +181,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
+        if override and append:
+            raise ValueError("Cannot set both override and append to True. Please choose one.")
+
         try:
             # Load appropriate implementations for all scorers
             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
@@ -161,9 +225,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                        loaded_rules.append(new_rule)
                    except Exception as e:
                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
            eval = EvaluationRun(
                log_results=log_results,
+                append=append,
                project_name=project_name,
                eval_name=eval_run_name,
                examples=examples,
@@ -180,7 +244,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
+
     def evaluate_dataset(
         self,
         dataset: EvalDataset,
@@ -292,6 +356,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
+    def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        """
+        Appends an `EvalDataset` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append(alias, examples, project_name)
+
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -355,14 +425,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
-
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
+        return eval_run.json()
 
     def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """
```
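Taken together, the client changes add an `append` flag alongside `override`, a new `run_sequence_evaluation` entry point, and an `append_dataset` helper. A rough usage sketch of the new `JudgmentClient` surface; `examples`, `scorers`, `sequences`, and the judge model name are placeholders, and the client is assumed to pick up credentials from its usual configuration:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()  # credentials assumed to be configured elsewhere

# `append=True` adds results to an existing run; combining it with
# `override=True` now raises a ValueError.
results = client.run_evaluation(
    examples=examples,            # List[Example] or List[CustomExample] (placeholder)
    scorers=scorers,              # placeholder list of scorers
    model="gpt-4o",               # placeholder judge model
    eval_run_name="nightly_run",
    append=True,
)

# New sequence-level evaluation; `sequences` is a List[Sequence] whose schema
# lives in judgeval/data/sequence.py, which this diff only summarizes.
seq_results = client.run_sequence_evaluation(
    sequences=sequences,
    model="gpt-4o",
    eval_run_name="sequence_run",
)

# New dataset helper: append examples to an existing dataset alias.
client.append_dataset(alias="my_dataset", examples=examples, project_name="default_project")
```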
judgeval/run_evaluation.py
CHANGED
```diff
@@ -4,14 +4,15 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
     ScorerData,
     ScoringResult,
-    Example
+    Example,
+    CustomExample
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -22,6 +23,7 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
+    JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
@@ -34,7 +36,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-
+from judgeval.data.sequence_run import SequenceRun
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -91,6 +93,36 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
+def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+    """
+    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+    """
+
+    try:
+        # submit API request to execute evals
+        payload = sequence_run.model_dump(warnings=False)
+        response = requests.post(
+            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
+                "X-Organization-Id": sequence_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        response_data = response.json()
+    except Exception as e:
+        error(f"Error: {e}")
+        details = response.json().get("detail", "No details provided")
+        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+    # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
+    if not response.ok:
+        error_message = response_data.get('detail', 'An unknown error occurred.')
+        error(f"Error: {error_message=}")
+        raise JudgmentAPIError(error_message)
+    return response_data
 
 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -197,8 +229,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     )
 
     if response.status_code == 409:
-        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
-        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
+        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
+        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
 
     if not response.ok:
         response_data = response.json()
@@ -211,7 +243,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult],
+def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -228,13 +260,12 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             JUDGMENT_EVAL_LOG_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {
-                "X-Organization-Id":
+                "Authorization": f"Bearer {run.judgment_api_key}",
+                "X-Organization-Id": run.organization_id
             },
             json={
-                "results": [result.
-                "
-                "eval_name": evaluation_run.eval_name,
+                "results": [result.model_dump(warnings=False) for result in merged_results],
+                "run": run.model_dump(warnings=False)
             },
             verify=True
         )
@@ -303,6 +334,42 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
             # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and sequence_run.log_results:
+        check_eval_run_name_exists(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id
+        )
+
+    # Execute evaluation using Judgment API
+    info("Starting API evaluation")
+    try:  # execute an EvaluationRun with just JudgmentScorers
+        debug("Sending request to Judgment API")
+        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+
+        info(f"Received {len(response_data['results'])} results from API")
+    except JudgmentAPIError as e:
+        error(f"An error occurred while executing the Judgment API request: {str(e)}")
+        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+
+    # Convert the response data to `ScoringResult` objects
+    debug("Processing API results")
+    api_results = []
+    for result in response_data["results"]:
+        api_results.append(ScoringResult(**result))
+
+    # TODO: allow for custom scorer on sequences
+    if sequence_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+        rprint(pretty_str)
+
+
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -329,7 +396,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     """
 
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and evaluation_run.log_results:
+    if not override and evaluation_run.log_results and not evaluation_run.append:
        check_eval_run_name_exists(
            evaluation_run.eval_name,
            evaluation_run.project_name,
@@ -373,12 +440,20 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             local_scorers.append(scorer)
             debug(f"Added local scorer: {type(scorer).__name__}")
 
+    custom_example_check = [scorer.custom_example for scorer in local_scorers]
+    if any(custom_example_check) and not all(custom_example_check):
+        error("All scorers must be custom scorers if using custom examples")
+        raise ValueError("All scorers must be custom scorers if using custom examples")
+
     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
 
     if async_execution:
+        if len(local_scorers) > 0:
+            error("Local scorers are not supported in async execution")
+
         check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting async evaluation")
         payload = evaluation_run.model_dump(warnings=False)
@@ -396,7 +471,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     else:
         if judgment_scorers:
             # Execute evaluation using Judgment API
-            check_examples(evaluation_run.examples, evaluation_run.scorers)
             info("Starting API evaluation")
             debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
             try:  # execute an EvaluationRun with just JudgmentScorers
```
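For callers that bypass `JudgmentClient`, the new `run_sequence_eval` can be driven directly with a `SequenceRun`, mirroring how `run_eval` takes an `EvaluationRun`. A sketch using only the fields the client passes in this diff; `sequences`, `api_key`, and `org_id` are placeholders, and `SequenceRun`'s full schema lives in judgeval/data/sequence_run.py, which this diff only summarizes:

```python
from judgeval.data.sequence_run import SequenceRun
from judgeval.run_evaluation import run_sequence_eval

sequence_run = SequenceRun(
    project_name="default_project",
    eval_name="sequence_smoke_test",
    sequences=sequences,          # List[Sequence], built elsewhere (placeholder)
    model="gpt-4o",               # placeholder judge model
    aggregator=None,
    log_results=True,
    judgment_api_key=api_key,     # placeholder credentials
    organization_id=org_id,
)

# Checks for a duplicate run name (unless override=True), calls the new
# JUDGMENT_SEQUENCE_EVAL_API_URL endpoint, and logs results when log_results=True.
run_sequence_eval(sequence_run, override=False)
```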
judgeval/scorers/__init__.py
CHANGED
```diff
@@ -17,6 +17,7 @@ from judgeval.scorers.judgeval_scorers import (
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
+    DerailmentScorer,
 )
 
 __all__ = [
@@ -39,4 +40,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
```