deepeval 3.4.8__py3-none-any.whl → 3.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +8 -5
- deepeval/_version.py +1 -1
- deepeval/benchmarks/drop/drop.py +2 -3
- deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
- deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
- deepeval/benchmarks/math_qa/math_qa.py +2 -2
- deepeval/benchmarks/mmlu/mmlu.py +2 -2
- deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
- deepeval/cli/main.py +561 -727
- deepeval/confident/api.py +30 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/integrations/pydantic_ai/__init__.py +2 -4
- deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
- deepeval/integrations/pydantic_ai/patcher.py +376 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/kimi_model.py +1 -1
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/perf_epoch_bridge.py +4 -4
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA +16 -13
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/RECORD +45 -40
- deepeval/env.py +0 -35
- deepeval/integrations/pydantic_ai/agent.py +0 -364
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
deepeval/__init__.py
CHANGED
@@ -3,9 +3,9 @@ import warnings
 import re
 
 # load environment variables before other imports
-from .
+from deepeval.config.settings import autoload_dotenv, get_settings
 
-
+autoload_dotenv()
 
 from ._version import __version__
 from deepeval.evaluate import evaluate, assert_test
@@ -14,9 +14,12 @@ from deepeval.test_run import on_test_run_end, log_hyperparameters
 from deepeval.utils import login
 from deepeval.telemetry import *
 
-
-
-
+
+settings = get_settings()
+if not settings.DEEPEVAL_GRPC_LOGGING:
+    os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
+    os.environ.setdefault("GRPC_TRACE", "")
+
 
 __all__ = [
     "login",
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.4.8"
+__version__: str = "3.5.0"
deepeval/benchmarks/drop/drop.py
CHANGED
@@ -1,6 +1,5 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
-from typing import Union
 
 from deepeval.dataset import Golden
 from deepeval.benchmarks.base_benchmark import (
@@ -50,7 +49,7 @@ class DROP(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
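The substantive change here, repeated for HellaSwag, LogiQA, MathQA, MMLU, and TruthfulQA below, is that evaluate() now declares batch_size as Union[int, None] = None, so the keyword can simply be omitted. A rough sketch of a call under the new signature follows; the deepeval.benchmarks import path, the DeepEvalBaseLLM method names, and the zero-argument DROP() constructor are assumptions not shown in this diff.

from deepeval.benchmarks import DROP                     # assumed public import path
from deepeval.models.base_model import DeepEvalBaseLLM   # assumed base-class location


class CannedModel(DeepEvalBaseLLM):
    """Stand-in model that always answers the same thing."""

    def load_model(self):
        return None

    def generate(self, prompt: str) -> str:
        return "42"

    async def a_generate(self, prompt: str) -> str:
        return "42"

    def get_model_name(self) -> str:
        return "canned-model"


benchmark = DROP()  # assumed to be constructible with defaults

# 3.5.0 gives batch_size an explicit default of None, so it can be omitted...
results = benchmark.evaluate(model=CannedModel())

# ...or still passed explicitly, typically for models that implement
# batched generation.
results = benchmark.evaluate(model=CannedModel(), batch_size=8)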
deepeval/benchmarks/hellaswag/hellaswag.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -51,7 +51,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/logi_qa/logi_qa.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
 import requests
 import json
@@ -52,7 +52,7 @@ class LogiQA(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/math_qa/math_qa.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -50,7 +50,7 @@ class MathQA(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/mmlu/mmlu.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -49,7 +49,7 @@ class MMLU(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
deepeval/benchmarks/truthful_qa/truthful_qa.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Union
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
@@ -59,7 +59,7 @@ class TruthfulQA(DeepEvalBaseBenchmark):
         self,
         model: DeepEvalBaseLLM,
         *args,
-        batch_size: int
+        batch_size: Union[int, None] = None,
         **kwargs,
     ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd