deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
1
- __version__: str = "3.6.6"
1
+ __version__: str = "3.6.8"
@@ -121,6 +121,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
121
121
  score = metric.measure(
122
122
  LLMTestCase(input=golden.input, actual_output=prediction),
123
123
  _show_indicator=False,
124
+ _log_metric_to_confident=False,
124
125
  )
125
126
  flipped_score = (
126
127
  1 - metric.score if metric.score in [0, 1] else metric.score
deepeval/cli/main.py CHANGED
@@ -328,6 +328,31 @@ def set_debug(
328
328
  "--trace-flush/--no-trace-flush",
329
329
  help="Enable / disable CONFIDENT_TRACE_FLUSH.",
330
330
  ),
331
+ trace_sample_rate: Optional[float] = typer.Option(
332
+ None,
333
+ "--trace-sample-rate",
334
+ help="Set CONFIDENT_TRACE_SAMPLE_RATE.",
335
+ ),
336
+ metric_logging_verbose: Optional[bool] = typer.Option(
337
+ None,
338
+ "--metric-logging-verbose/--no-metric-logging-verbose",
339
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_VERBOSE.",
340
+ ),
341
+ metric_logging_flush: Optional[bool] = typer.Option(
342
+ None,
343
+ "--metric-logging-flush/--no-metric-logging-flush",
344
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_FLUSH.",
345
+ ),
346
+ metric_logging_sample_rate: Optional[float] = typer.Option(
347
+ None,
348
+ "--metric-logging-sample-rate",
349
+ help="Set CONFIDENT_METRIC_LOGGING_SAMPLE_RATE.",
350
+ ),
351
+ metric_logging_enabled: Optional[bool] = typer.Option(
352
+ None,
353
+ "--metric-logging-enabled/--no-metric-logging-enabled",
354
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_ENABLED.",
355
+ ),
331
356
  # Advanced / potentially surprising
332
357
  error_reporting: Optional[bool] = typer.Option(
333
358
  None,
@@ -387,6 +412,20 @@ def set_debug(
387
412
  settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
388
413
  if trace_flush is not None:
389
414
  settings.CONFIDENT_TRACE_FLUSH = trace_flush
415
+ if trace_sample_rate is not None:
416
+ settings.CONFIDENT_TRACE_SAMPLE_RATE = trace_sample_rate
417
+
418
+ # Confident metrics
419
+ if metric_logging_verbose is not None:
420
+ settings.CONFIDENT_METRIC_LOGGING_VERBOSE = metric_logging_verbose
421
+ if metric_logging_flush is not None:
422
+ settings.CONFIDENT_METRIC_LOGGING_FLUSH = metric_logging_flush
423
+ if metric_logging_sample_rate is not None:
424
+ settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = (
425
+ metric_logging_sample_rate
426
+ )
427
+ if metric_logging_enabled is not None:
428
+ settings.CONFIDENT_METRIC_LOGGING_ENABLED = metric_logging_enabled
390
429
 
391
430
  # Advanced
392
431
  if error_reporting is not None:
@@ -438,6 +477,8 @@ def unset_debug(
438
477
  settings.LOG_LEVEL = "info"
439
478
  settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
440
479
  settings.CONFIDENT_TRACE_VERBOSE = True
480
+ settings.CONFIDENT_METRIC_LOGGING_VERBOSE = True
481
+ settings.CONFIDENT_METRIC_LOGGING_ENABLED = True
441
482
 
442
483
  # Clear optional toggles/overrides
443
484
  settings.DEEPEVAL_VERBOSE_MODE = None
@@ -449,6 +490,7 @@ def unset_debug(
449
490
  settings.GRPC_TRACE = None
450
491
 
451
492
  settings.CONFIDENT_TRACE_FLUSH = None
493
+ settings.CONFIDENT_METRIC_LOGGING_FLUSH = None
452
494
 
453
495
  settings.ERROR_REPORTING = None
454
496
  settings.IGNORE_DEEPEVAL_ERRORS = None
deepeval/confident/api.py CHANGED
@@ -87,6 +87,7 @@ class Endpoints(Enum):
87
87
  DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"
88
88
 
89
89
  TEST_RUN_ENDPOINT = "/v1/test-run"
90
+ METRIC_DATA_ENDPOINT = "/v1/metric-data"
90
91
  TRACES_ENDPOINT = "/v1/traces"
91
92
  ANNOTATIONS_ENDPOINT = "/v1/annotations"
92
93
  PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
@@ -337,10 +337,17 @@ class Settings(BaseSettings):
337
337
  SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
338
338
  DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
339
339
  ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
340
+
340
341
  CONFIDENT_TRACE_FLUSH: Optional[bool] = None
341
342
  CONFIDENT_TRACE_ENVIRONMENT: Optional[str] = "development"
342
343
  CONFIDENT_TRACE_VERBOSE: Optional[bool] = True
343
- CONFIDENT_SAMPLE_RATE: Optional[float] = 1.0
344
+ CONFIDENT_TRACE_SAMPLE_RATE: Optional[float] = 1.0
345
+
346
+ CONFIDENT_METRIC_LOGGING_FLUSH: Optional[bool] = None
347
+ CONFIDENT_METRIC_LOGGING_VERBOSE: Optional[bool] = True
348
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE: Optional[float] = 1.0
349
+ CONFIDENT_METRIC_LOGGING_ENABLED: Optional[bool] = True
350
+
344
351
  OTEL_EXPORTER_OTLP_ENDPOINT: Optional[AnyUrl] = None
345
352
 
346
353
  #
@@ -355,6 +362,12 @@ class Settings(BaseSettings):
355
362
  None # per-attempt timeout. Set 0/None to disable
356
363
  )
357
364
 
365
+ #
366
+ # Async Document Pipelines
367
+ #
368
+
369
+ DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING: conint(ge=1) = 2
370
+
358
371
  #
359
372
  # Async Task Configuration
360
373
  #
@@ -484,7 +497,8 @@ class Settings(BaseSettings):
484
497
  "OPENAI_COST_PER_INPUT_TOKEN",
485
498
  "OPENAI_COST_PER_OUTPUT_TOKEN",
486
499
  "TEMPERATURE",
487
- "CONFIDENT_SAMPLE_RATE",
500
+ "CONFIDENT_TRACE_SAMPLE_RATE",
501
+ "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
488
502
  mode="before",
489
503
  )
490
504
  @classmethod
@@ -496,13 +510,17 @@ class Settings(BaseSettings):
496
510
  return None
497
511
  return float(v)
498
512
 
499
- @field_validator("CONFIDENT_SAMPLE_RATE")
513
+ @field_validator(
514
+ "CONFIDENT_TRACE_SAMPLE_RATE", "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
515
+ )
500
516
  @classmethod
501
517
  def _validate_sample_rate(cls, v):
502
518
  if v is None:
503
519
  return None
504
520
  if not (0.0 <= float(v) <= 1.0):
505
- raise ValueError("CONFIDENT_SAMPLE_RATE must be between 0 and 1")
521
+ raise ValueError(
522
+ "CONFIDENT_TRACE_SAMPLE_RATE or CONFIDENT_METRIC_LOGGING_SAMPLE_RATE must be between 0 and 1"
523
+ )
506
524
  return float(v)
507
525
 
508
526
  @field_validator("DEEPEVAL_DEFAULT_SAVE", mode="before")
deepeval/constants.py CHANGED
@@ -9,9 +9,16 @@ LOGIN_PROMPT = "\n✨👀 Looking for a place for your LLM test data to live
9
9
 
10
10
  CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE"
11
11
  CONFIDENT_TRACE_FLUSH = "CONFIDENT_TRACE_FLUSH"
12
- CONFIDENT_SAMPLE_RATE = "CONFIDENT_SAMPLE_RATE"
12
+ CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE"
13
13
  CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT"
14
14
  CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED"
15
+
16
+ CONFIDENT_METRIC_LOGGING_VERBOSE = "CONFIDENT_METRIC_LOGGING_VERBOSE"
17
+ CONFIDENT_METRIC_LOGGING_FLUSH = "CONFIDENT_METRIC_LOGGING_FLUSH"
18
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
19
+ CONFIDENT_METRIC_LOGGING_ENABLED = "CONFIDENT_METRIC_LOGGING_ENABLED"
20
+
21
+
15
22
  CONFIDENT_OPEN_BROWSER = "CONFIDENT_OPEN_BROWSER"
16
23
  CONFIDENT_TEST_CASE_BATCH_SIZE = "CONFIDENT_TEST_CASE_BATCH_SIZE"
17
24
 
@@ -49,7 +49,7 @@ from deepeval.utils import (
49
49
  from deepeval.test_run import (
50
50
  global_test_run_manager,
51
51
  )
52
- from deepeval.openai.utils import openai_test_case_pairs
52
+
53
53
  from deepeval.tracing import trace_manager
54
54
  from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME
55
55
 
@@ -1248,16 +1248,7 @@ class EvaluationDataset:
1248
1248
  display_config.file_output_dir,
1249
1249
  )
1250
1250
 
1251
- # update hyperparameters
1252
- test_run = global_test_run_manager.get_test_run()
1253
- if len(openai_test_case_pairs) > 0:
1254
- raw_hyperparameters = openai_test_case_pairs[-1].hyperparameters
1255
- test_run.hyperparameters = process_hyperparameters(
1256
- raw_hyperparameters
1257
- )
1258
-
1259
- # clean up
1260
- openai_test_case_pairs.clear()
1251
+ # save test run
1261
1252
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
1262
1253
 
1263
1254
  # sandwich end trace for OTEL
deepeval/dataset/utils.py CHANGED
@@ -120,7 +120,7 @@ def format_turns(turns: List[Turn]) -> str:
120
120
  }
121
121
  res.append(cur_turn)
122
122
  try:
123
- return json.dumps(res)
123
+ return json.dumps(res, ensure_ascii=False)
124
124
  except Exception as e:
125
125
  raise ValueError(f"Error serializing turns: {e}")
126
126
 
deepeval/errors.py CHANGED
@@ -1,6 +1,24 @@
1
- class MissingTestCaseParamsError(Exception):
1
+ class DeepEvalError(Exception):
2
+ """Base class for framework-originated errors.
3
+ If raised and not handled, it will abort the current operation.
4
+ We may also stringify instances of this class and attach them to traces or spans to surface
5
+ non-fatal diagnostics while allowing the run to continue.
6
+ """
7
+
8
+
9
+ class UserAppError(Exception):
10
+ """Represents exceptions thrown by user LLM apps/tools.
11
+ We record these on traces or spans and keep the overall evaluation run alive.
12
+ """
13
+
14
+
15
+ class MissingTestCaseParamsError(DeepEvalError):
16
+ """Required test case fields are missing."""
17
+
2
18
  pass
3
19
 
4
20
 
5
- class MismatchedTestCaseInputsError(Exception):
21
+ class MismatchedTestCaseInputsError(DeepEvalError):
22
+ """Inputs provided to a metric or test case are inconsistent or invalid."""
23
+
6
24
  pass
@@ -28,7 +28,10 @@ from deepeval.evaluate.utils import (
28
28
  from deepeval.dataset import Golden
29
29
  from deepeval.prompt import Prompt
30
30
  from deepeval.test_case.utils import check_valid_test_cases_type
31
- from deepeval.test_run.hyperparameters import process_hyperparameters
31
+ from deepeval.test_run.hyperparameters import (
32
+ process_hyperparameters,
33
+ process_prompts,
34
+ )
32
35
  from deepeval.test_run.test_run import TEMP_FILE_PATH
33
36
  from deepeval.utils import (
34
37
  get_or_create_event_loop,
@@ -267,6 +270,7 @@ def evaluate(
267
270
 
268
271
  test_run = global_test_run_manager.get_test_run()
269
272
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
273
+ test_run.prompts = process_prompts(hyperparameters)
270
274
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
271
275
  res = global_test_run_manager.wrap_up_test_run(
272
276
  run_duration, display_table=False