judgeval 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff compares the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
judgeval/common/tracer.py CHANGED
@@ -146,7 +146,7 @@ class TraceManagerClient:
 
         return response.json()
 
-    def save_trace(self, trace_data: dict):
+    def save_trace(self, trace_data: dict, offline_mode: bool = False):
         """
         Saves a trace to the Judgment Supabase and optionally to S3 if configured.
 
@@ -183,7 +183,7 @@ class TraceManagerClient:
         except Exception as e:
             warnings.warn(f"Failed to save trace to S3: {str(e)}")
 
-        if "ui_results_url" in response.json():
+        if not offline_mode and "ui_results_url" in response.json():
             pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
             rprint(pretty_str)
 
@@ -314,6 +314,7 @@ class TraceClient:
         self.executed_tools = []
         self.executed_node_tools = []
         self._span_depths: Dict[str, int] = {} # NEW: To track depth of active spans
+
     def get_current_span(self):
         """Get the current span from the context var"""
         return current_span_var.get()
@@ -428,7 +429,8 @@ class TraceClient:
         # span_id_at_eval_call = current_span_var.get()
         # print(f"[TraceClient.async_evaluate] Captured span ID at eval call: {span_id_at_eval_call}")
         # Prioritize explicitly passed span_id, fallback to context var
-        span_id_to_use = span_id if span_id is not None else current_span_var.get()
+        current_span_ctx_var = current_span_var.get()
+        span_id_to_use = span_id if span_id is not None else current_span_ctx_var if current_span_ctx_var is not None else self.tracer.get_current_span()
         # print(f"[TraceClient.async_evaluate] Using span_id: {span_id_to_use}")
         # --- End Modification ---
 
@@ -438,7 +440,7 @@ class TraceClient:
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
-                      f"{current_span_var.get()}-" # Keep original eval name format using context var if available
+                      f"{span_id_to_use}-" # Keep original eval name format using context var if available
                       f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
             examples=[example],
             scorers=scorers,
@@ -658,11 +660,12 @@ class TraceClient:
             "entries": [span.model_dump() for span in self.trace_spans],
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "overwrite": overwrite,
+            "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name
         }
         # --- Log trace data before saving ---
-        self.trace_manager_client.save_trace(trace_data)
+        self.trace_manager_client.save_trace(trace_data, offline_mode=self.tracer.offline_mode)
 
         # upload annotations
         # TODO: batch to the log endpoint
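Taken together, these hunks thread the new flag through the whole save path: it is stored in the payload and also passed to TraceManagerClient.save_trace, which (per the hunk at line 186) only prints the "View Trace" link when the flag is off. A minimal sketch of the resulting behavior, assuming a configured TraceManagerClient named manager (hypothetical variable):

    # Online (default): prints the "View Trace" link if the server returns one
    manager.save_trace(trace_data)

    # Offline: the trace is still saved (and mirrored to S3 if configured),
    # but the console link is suppressed
    manager.save_trace(trace_data, offline_mode=True)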
@@ -928,6 +931,7 @@ class Tracer:
         s3_aws_access_key_id: Optional[str] = None,
         s3_aws_secret_access_key: Optional[str] = None,
         s3_region_name: Optional[str] = None,
+        offline_mode: bool = False,
         deep_tracing: bool = True # Deep tracing is enabled by default
     ):
         if not hasattr(self, 'initialized'):
@@ -968,6 +972,7 @@ class Tracer:
                 aws_secret_access_key=s3_aws_secret_access_key,
                 region_name=s3_region_name
             )
+            self.offline_mode: bool = offline_mode
             self.deep_tracing: bool = deep_tracing # NEW: Store deep tracing setting
 
         elif hasattr(self, 'project_name') and self.project_name != project_name:
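With the two additions above, offline_mode is accepted at construction time and stored on the singleton. A hedged usage sketch (the project name is illustrative; other keyword arguments keep their defaults):

    from judgeval.common.tracer import Tracer

    tracer = Tracer(
        project_name="my_project",  # illustrative
        offline_mode=True,          # suppress UI links; also sent in the saved trace payload
    )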
@@ -977,6 +982,12 @@ class Tracer:
                 "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
                 RuntimeWarning
             )
+
+    def set_current_span(self, span_id: str):
+        self.current_span_id = span_id
+
+    def get_current_span(self) -> Optional[str]:
+        return getattr(self, 'current_span_id', None)
 
     def set_current_trace(self, trace: TraceClient):
         """
@@ -1263,64 +1274,94 @@ class Tracer:
     else:
         warnings.warn("No trace found (context var or fallback), skipping evaluation") # Modified warning
 
-
 def wrap(client: Any) -> Any:
     """
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, Anthropic, and Google GenAI clients.
     Patches both '.create' and Anthropic's '.stream' methods using a wrapper class.
     """
-    span_name, original_create, responses_create, original_stream = _get_client_config(client)
+    span_name, original_create, original_responses_create, original_stream = _get_client_config(client)
+
+    def _record_input_and_check_streaming(span, kwargs, is_responses=False):
+        """Record input and check for streaming"""
+        is_streaming = kwargs.get("stream", False)
 
-    # --- Define Traced Async Functions ---
+        # Record input based on whether this is a responses endpoint
+        if is_responses:
+            span.record_input(kwargs)
+        else:
+            input_data = _format_input_data(client, **kwargs)
+            span.record_input(input_data)
+
+        # Warn about token counting limitations with streaming
+        if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
+            if not kwargs.get("stream_options", {}).get("include_usage"):
+                warnings.warn(
+                    "OpenAI streaming calls don't include token counts by default. "
+                    "To enable token counting with streams, set stream_options={'include_usage': True} "
+                    "in your API call arguments.",
+                    UserWarning
+                )
+
+        return is_streaming
+
+    def _format_and_record_output(span, response, is_streaming, is_async, is_responses):
+        """Format and record the output in the span"""
+        if is_streaming:
+            output_entry = span.record_output("<pending stream>")
+            wrapper_func = _async_stream_wrapper if is_async else _sync_stream_wrapper
+            return wrapper_func(response, client, output_entry)
+        else:
+            format_func = _format_response_output_data if is_responses else _format_output_data
+            output_data = format_func(client, response)
+            span.record_output(output_data)
+            return response
+
+    def _handle_error(span, e, is_async):
+        """Handle and record errors"""
+        call_type = "async" if is_async else "sync"
+        print(f"Error during wrapped {call_type} API call ({span_name}): {e}")
+        span.record_output({"error": str(e)})
+        raise
+
+    # --- Traced Async Functions ---
     async def traced_create_async(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            if asyncio.iscoroutinefunction(original_create):
-                return await original_create(*args, **kwargs)
-            else:
-                return original_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
-
+            return await original_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            input_data = _format_input_data(client, **kwargs)
-            span.record_input(input_data)
-
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+
             try:
-                if is_streaming:
-                    stream_iterator = await original_create(*args, **kwargs)
-                    output_entry = span.record_output("<pending stream>")
-                    return _async_stream_wrapper(stream_iterator, client, output_entry)
-                else:
-                    awaited_response = await original_create(*args, **kwargs)
-                    output_data = _format_output_data(client, awaited_response)
-                    span.record_output(output_data)
-                    return awaited_response
+                response_or_iterator = await original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, False)
             except Exception as e:
-                print(f"Error during wrapped async API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-
-
-    # Function replacing .stream() - NOW returns the wrapper class instance
+                return _handle_error(span, e, True)
+
+    # Async responses for OpenAI clients
+    async def traced_response_create_async(*args, **kwargs):
+        current_trace = current_trace_var.get()
+        if not current_trace:
+            return await original_responses_create(*args, **kwargs)
+
+        with current_trace.span(span_name, span_type="llm") as span:
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+
+            try:
+                response_or_iterator = await original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, True, True)
+            except Exception as e:
+                return _handle_error(span, e, True)
+
+    # Function replacing .stream() for async clients
     def traced_stream_async(*args, **kwargs):
         current_trace = current_trace_var.get()
         if not current_trace or not original_stream:
             return original_stream(*args, **kwargs)
+
         original_manager = original_stream(*args, **kwargs)
-        wrapper_manager = _TracedAsyncStreamManagerWrapper(
+        return _TracedAsyncStreamManagerWrapper(
             original_manager=original_manager,
             client=client,
             span_name=span_name,
@@ -1328,139 +1369,74 @@ def wrap(client: Any) -> Any:
             stream_wrapper_func=_async_stream_wrapper,
             input_kwargs=kwargs
         )
-        return wrapper_manager
-
-    # --- Define Traced Sync Functions ---
+
+    # --- Traced Sync Functions ---
     def traced_create_sync(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            return original_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
-
+            return original_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            input_data = _format_input_data(client, **kwargs)
-            span.record_input(input_data)
-
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
-            try:
-                response_or_iterator = original_create(*args, **kwargs)
-            except Exception as e:
-                print(f"Error during wrapped sync API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-
-            if is_streaming:
-                output_entry = span.record_output("<pending stream>")
-                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-            else:
-                output_data = _format_output_data(client, response_or_iterator)
-                span.record_output(output_data)
-                return response_or_iterator
-
-    # --- Define Traced Sync Functions ---
+            is_streaming = _record_input_and_check_streaming(span, kwargs)
+
+            try:
+                response_or_iterator = original_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, False)
+            except Exception as e:
+                return _handle_error(span, e, False)
+
     def traced_response_create_sync(*args, **kwargs):
-        # [Existing logic - unchanged]
         current_trace = current_trace_var.get()
         if not current_trace:
-            return responses_create(*args, **kwargs)
-
-        is_streaming = kwargs.get("stream", False)
+            return original_responses_create(*args, **kwargs)
+
         with current_trace.span(span_name, span_type="llm") as span:
-            span.record_input(kwargs)
-
-            # Warn about token counting limitations with streaming
-            if isinstance(client, (AsyncOpenAI, OpenAI)) and is_streaming:
-                if not kwargs.get("stream_options", {}).get("include_usage"):
-                    warnings.warn(
-                        "OpenAI streaming calls don't include token counts by default. "
-                        "To enable token counting with streams, set stream_options={'include_usage': True} "
-                        "in your API call arguments.",
-                        UserWarning
-                    )
-
-            try:
-                response_or_iterator = responses_create(*args, **kwargs)
-            except Exception as e:
-                print(f"Error during wrapped sync API call ({span_name}): {e}")
-                span.record_output({"error": str(e)})
-                raise
-            if is_streaming:
-                output_entry = span.record_output("<pending stream>")
-                return _sync_stream_wrapper(response_or_iterator, client, output_entry)
-            else:
-                output_data = _format_response_output_data(client, response_or_iterator)
-                span.record_output(output_data)
-                return response_or_iterator
-
+            is_streaming = _record_input_and_check_streaming(span, kwargs, is_responses=True)
+
+            try:
+                response_or_iterator = original_responses_create(*args, **kwargs)
+                return _format_and_record_output(span, response_or_iterator, is_streaming, False, True)
+            except Exception as e:
+                return _handle_error(span, e, False)
+
     # Function replacing sync .stream()
     def traced_stream_sync(*args, **kwargs):
-        current_trace = current_trace_var.get()
-        if not current_trace or not original_stream:
-            return original_stream(*args, **kwargs)
-        original_manager = original_stream(*args, **kwargs)
-        wrapper_manager = _TracedSyncStreamManagerWrapper(
-            original_manager=original_manager,
-            client=client,
-            span_name=span_name,
-            trace_client=current_trace,
-            stream_wrapper_func=_sync_stream_wrapper,
-            input_kwargs=kwargs
-        )
-        return wrapper_manager
-
-
+        current_trace = current_trace_var.get()
+        if not current_trace or not original_stream:
+            return original_stream(*args, **kwargs)
+
+        original_manager = original_stream(*args, **kwargs)
+        return _TracedSyncStreamManagerWrapper(
+            original_manager=original_manager,
+            client=client,
+            span_name=span_name,
+            trace_client=current_trace,
+            stream_wrapper_func=_sync_stream_wrapper,
+            input_kwargs=kwargs
+        )
+
     # --- Assign Traced Methods to Client Instance ---
-    # [Assignment logic remains the same]
     if isinstance(client, (AsyncOpenAI, AsyncTogether)):
         client.chat.completions.create = traced_create_async
-        # Wrap the Responses API endpoint for OpenAI clients
         if hasattr(client, "responses") and hasattr(client.responses, "create"):
-            # Capture the original responses.create
-            original_responses_create = client.responses.create
-            def traced_responses(*args, **kwargs):
-                # Get the current trace from contextvars
-                current_trace = current_trace_var.get()
-                # If no active trace, call the original
-                if not current_trace:
-                    return original_responses_create(*args, **kwargs)
-                # Trace this responses.create call
-                with current_trace.span(span_name, span_type="llm") as span:
-                    # Record raw input kwargs
-                    span.record_input(kwargs)
-                    # Make the actual API call
-                    response = original_responses_create(*args, **kwargs)
-                    # Record the output object
-                    span.record_output(response)
-                    return response
-            # Assign the traced wrapper
-            client.responses.create = traced_responses
+            client.responses.create = traced_response_create_async
     elif isinstance(client, AsyncAnthropic):
         client.messages.create = traced_create_async
         if original_stream:
-            client.messages.stream = traced_stream_async
+            client.messages.stream = traced_stream_async
     elif isinstance(client, genai.client.AsyncClient):
         client.models.generate_content = traced_create_async
     elif isinstance(client, (OpenAI, Together)):
-        client.chat.completions.create = traced_create_sync
-        client.responses.create = traced_response_create_sync
+        client.chat.completions.create = traced_create_sync
+        if hasattr(client, "responses") and hasattr(client.responses, "create"):
+            client.responses.create = traced_response_create_sync
     elif isinstance(client, Anthropic):
-        client.messages.create = traced_create_sync
-        if original_stream:
-            client.messages.stream = traced_stream_sync
+        client.messages.create = traced_create_sync
+        if original_stream:
+            client.messages.stream = traced_stream_sync
     elif isinstance(client, genai.Client):
-        client.models.generate_content = traced_create_sync
-
+        client.models.generate_content = traced_create_sync
+
     return client
 
 # Helper functions for client-specific operations
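For orientation, the refactor does not change how wrap is used: it still patches the client instance in place and returns it. A hedged usage sketch (model name and prompt are illustrative):

    from openai import OpenAI
    from judgeval.common.tracer import wrap

    client = wrap(OpenAI())  # patches chat.completions.create, and responses.create when present

    # Inside an active trace this call is recorded as an "llm" span;
    # with no active trace it falls through to the original method.
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": "Hello"}],
    )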
@@ -1896,128 +1872,3 @@ class _TracedSyncStreamManagerWrapper(_BaseStreamManagerWrapper, AbstractContext
         current_span_var.reset(self._span_context_token)
         delattr(self, '_span_context_token')
         return self._original_manager.__exit__(exc_type, exc_val, exc_tb)
-
-# --- NEW Generalized Helper Function (Moved from demo) ---
-def prepare_evaluation_for_state(
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    example: Optional[Example] = None,
-    # --- Individual components (alternative to 'example') ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> Optional[EvaluationConfig]:
-    """
-    Prepares an EvaluationConfig object, similar to TraceClient.async_evaluate.
-
-    Accepts either a pre-made Example object or individual components to construct one.
-    Returns the EvaluationConfig object ready to be placed in the state, or None.
-    """
-    final_example = example
-
-    # If example is not provided, try to construct one from individual parts
-    if final_example is None:
-        # Basic validation: Ensure at least actual_output is present for most scorers
-        if actual_output is None:
-            # print("[prepare_evaluation_for_state] Warning: 'actual_output' is required when 'example' is not provided. Skipping evaluation setup.")
-            return None
-        try:
-            final_example = Example(
-                input=input,
-                actual_output=actual_output,
-                expected_output=expected_output,
-                context=context,
-                retrieval_context=retrieval_context,
-                tools_called=tools_called,
-                expected_tools=expected_tools,
-                additional_metadata=additional_metadata,
-                # trace_id will be set by the handler later if needed
-            )
-            # print("[prepare_evaluation_for_state] Constructed Example from individual components.")
-        except Exception as e:
-            # print(f"[prepare_evaluation_for_state] Error constructing Example: {e}. Skipping evaluation setup.")
-            return None
-
-    # If we have a valid example (provided or constructed) and scorers
-    if final_example and scorers:
-        # TODO: Add validation like check_examples if needed here,
-        # although the handler might implicitly handle some checks via TraceClient.
-        return EvaluationConfig(
-            scorers=scorers,
-            example=final_example,
-            model=model,
-            log_results=log_results
-        )
-    elif not scorers:
-        # print("[prepare_evaluation_for_state] No scorers provided. Skipping evaluation setup.")
-        return None
-    else: # No valid example
-        # print("[prepare_evaluation_for_state] No valid Example available. Skipping evaluation setup.")
-        return None
-# --- End NEW Helper Function ---
-
-# --- NEW: Helper function to simplify adding eval config to state ---
-def add_evaluation_to_state(
-    state: Dict[str, Any], # The LangGraph state dictionary
-    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-    # --- Evaluation components (same as prepare_evaluation_for_state) ---
-    input: Optional[str] = None,
-    actual_output: Optional[Union[str, List[str]]] = None,
-    expected_output: Optional[Union[str, List[str]]] = None,
-    context: Optional[List[str]] = None,
-    retrieval_context: Optional[List[str]] = None,
-    tools_called: Optional[List[str]] = None,
-    expected_tools: Optional[List[str]] = None,
-    additional_metadata: Optional[Dict[str, Any]] = None,
-    # --- Other eval parameters ---
-    model: Optional[str] = None,
-    log_results: Optional[bool] = True
-) -> None:
-    """
-    Prepares an EvaluationConfig and adds it to the state dictionary
-    under the '_judgeval_eval' key if successful.
-
-    This simplifies the process of setting up evaluations within LangGraph nodes.
-
-    Args:
-        state: The LangGraph state dictionary to modify.
-        scorers: List of scorer instances.
-        input: Input for the evaluation example.
-        actual_output: Actual output for the evaluation example.
-        expected_output: Expected output for the evaluation example.
-        context: Context for the evaluation example.
-        retrieval_context: Retrieval context for the evaluation example.
-        tools_called: Tools called for the evaluation example.
-        expected_tools: Expected tools for the evaluation example.
-        additional_metadata: Additional metadata for the evaluation example.
-        model: Model name used for generation (optional).
-        log_results: Whether to log evaluation results (optional, defaults to True).
-    """
-    eval_config = prepare_evaluation_for_state(
-        scorers=scorers,
-        input=input,
-        actual_output=actual_output,
-        expected_output=expected_output,
-        context=context,
-        retrieval_context=retrieval_context,
-        tools_called=tools_called,
-        expected_tools=expected_tools,
-        additional_metadata=additional_metadata,
-        model=model,
-        log_results=log_results
-    )
-
-    if eval_config:
-        state["_judgeval_eval"] = eval_config
-        # print(f"[_judgeval_eval added to state for node]") # Optional: Log confirmation
-
-    # print("[Skipped adding _judgeval_eval to state: prepare_evaluation_for_state failed]")
-# --- End NEW Helper ---
-
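With the LangGraph state helpers removed from this module, trace-time evaluation goes through TraceClient.async_evaluate (whose span-id resolution changed above). A hedged sketch of that surviving path, assuming an active trace_client and the example-based call form described in the removed docstring (scorer choice is illustrative):

    from judgeval.data import Example
    from judgeval.scorers import AnswerRelevancyScorer  # illustrative scorer

    trace_client.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.7)],
        example=Example(input="What is the capital of France?", actual_output="Paris"),
        model="gpt-4.1-mini",
        log_results=True,
    )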
judgeval/common/utils.py CHANGED
@@ -765,7 +765,7 @@ if __name__ == "__main__":
     # Batched single completion to multiple models
     pprint.pprint(get_completion_multiple_models(
         models=[
-            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4o-mini"
+            "LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"
         ],
         messages=[
             [
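For reference, a self-contained sketch of the batched demo call after the model swap (message content is illustrative; per the nested-list shape in the hunk, one message list is assumed per model):

    import pprint
    from judgeval.common.utils import get_completion_multiple_models

    pprint.pprint(get_completion_multiple_models(
        models=["LLAMA3_70B_INSTRUCT_TURBO", "LLAMA3_405B_INSTRUCT_TURBO", "gpt-4.1-mini"],
        messages=[
            [{"role": "user", "content": "What is the capital of France?"}],
        ] * 3,  # one copy per model
    ))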
judgeval/constants.py CHANGED
@@ -40,17 +40,15 @@ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON]) # scorers whose scores are not
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
-JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
+JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_APPEND_SEQUENCES_API_URL = f"{ROOT_API}/datasets/insert_sequences/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
-JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
@@ -61,6 +59,7 @@ JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
+JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
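All of these endpoints, including the new JUDGMENT_GET_EVAL_STATUS_API_URL, are composed from ROOT_API, so they follow the JUDGMENT_API_URL override. A small sketch (the self-hosted URL is hypothetical; the override must be set before judgeval.constants is first imported, since the URLs are computed at import time):

    import os
    os.environ["JUDGMENT_API_URL"] = "https://judgment.internal.example.com"

    from judgeval import constants
    # constants.JUDGMENT_GET_EVAL_STATUS_API_URL
    #   -> "https://judgment.internal.example.com/get_evaluation_status/"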
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,6 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.sequence import Sequence
 from judgeval.data.trace import Trace, TraceSpan
 
 
@@ -14,7 +13,6 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Sequence",
     "Trace",
     "TraceSpan",
 ]
@@ -7,13 +7,12 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, Sequence
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
-    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -22,13 +21,11 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
-        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
         self.examples = examples
-        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -223,10 +220,7 @@ class EvalDataset:
     def add_example(self, e: Example) -> None:
         self.examples = self.examples + [e]
         # TODO if we need to add rank, then we need to do it here
-
-    def add_sequence(self, s: Sequence) -> None:
-        self.sequences = self.sequences + [s]
-
+
     def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
         """
         Saves the dataset as a file. Save only the examples.
@@ -313,7 +307,6 @@ class EvalDataset:
         return (
             f"{self.__class__.__name__}("
             f"examples={self.examples}, "
-            f"sequences={self.sequences}, "
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"