validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
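
Several of the listed changes are renames (for example `DFGLSArch.py → DickeyFullerGLS.py`, `AspectCritique.py → AspectCritic.py`, `AnswerRelevance.py → ResponseRelevancy.py`, `AnswerSimilarity.py → SemanticSimilarity.py`), so code that refers to the old test names needs updating after the upgrade. As a rough illustration only — the dotted test-ID strings below are inferred from the file paths above and are an assumption, not taken from the package — a migration shim might look like this:

```python
# Hypothetical mapping from 2.5.x test IDs to their 2.6.x names, based on the
# renamed files listed above. The dotted test-ID format is an assumption.
RENAMED_TESTS = {
    "validmind.data_validation.DFGLSArch": "validmind.data_validation.DickeyFullerGLS",
    "validmind.model_validation.ragas.AspectCritique": "validmind.model_validation.ragas.AspectCritic",
    "validmind.model_validation.ragas.AnswerRelevance": "validmind.model_validation.ragas.ResponseRelevancy",
    "validmind.model_validation.ragas.AnswerSimilarity": "validmind.model_validation.ragas.SemanticSimilarity",
}


def migrate_test_id(test_id: str) -> str:
    """Return the renamed test ID if the test was moved, otherwise the original."""
    return RENAMED_TESTS.get(test_id, test_id)


print(migrate_test_id("validmind.model_validation.ragas.AspectCritique"))
# -> validmind.model_validation.ragas.AspectCritic
```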
validmind/tests/model_validation/ragas/ContextEntityRecall.py

````diff
@@ -14,22 +14,25 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_entity_recall
+    from ragas.metrics import ContextEntityRecall as context_entity_recall
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextEntityRecall. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def ContextEntityRecall(
     dataset,
-
-
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):
     """
     Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -37,18 +40,18 @@ def ContextEntityRecall(
     ### Overview
 
     This metric gives the measure of recall of the retrieved context, based on the
-    number of entities present in both `
-    number of entities present in the `
-    of what fraction of entities are recalled from `
+    number of entities present in both `reference` and `retrieved_contexts` relative to the
+    number of entities present in the `reference` alone. Simply put, it is a measure
+    of what fraction of entities are recalled from `reference`. This metric is
     useful in fact-based use cases like tourism help desk, historical QA, etc. This
     metric can help evaluate the retrieval mechanism for entities, based on comparison
-    with entities present in `
-    we need the `
+    with entities present in `reference`, because in cases where entities matter,
+    we need the `retrieved_contexts` which cover them.
 
     ### Formula
 
     To compute this metric, we use two sets, $GE$ and $CE$, representing the set of
-    entities present in `
+    entities present in `reference` and set of entities present in `retrieved_contexts`
     respectively. We then take the number of elements in intersection of these sets and
     divide it by the number of elements present in the $GE$, given by the formula:
 
@@ -60,20 +63,20 @@ def ContextEntityRecall(
 
     This metric requires the following columns in your dataset:
 
-    - `
-      sure if they contain the entities present in the
-    - `
-      extracted and compared with the entities in the `
+    - `retrieved_contexts` (List[str]): A list of text contexts which will be evaluated to make
+      sure if they contain the entities present in the `reference`.
+    - `reference` (str): The ground truth text from which the entities will be
+      extracted and compared with the entities in the `retrieved_contexts`.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `
+    names for these fields using the parameters `retrieved_contexts_column`, and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "
-        "
+        "retrieved_contexts_column": "context_info",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -82,8 +85,8 @@ def ContextEntityRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": f"{pred_col}.contexts",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -91,8 +94,8 @@ def ContextEntityRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -103,37 +106,37 @@ def ContextEntityRecall(
     )
 
     required_columns = {
-        "
-        "
+        "reference": reference_column,
+        "retrieved_contexts": retrieved_contexts_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config()
     ).to_pandas()
 
-
-
-    )
-    fig_box = px.box(x=result_df[
+    score_column = "context_entity_recall"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
             #     [
-            #         "
-            #         "
+            #         "retrieved_contexts",
+            #         "reference",
             #         "context_entity_recall",
             #     ]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/ContextPrecision.py

````diff
@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_precision
+    from ragas.metrics import LLMContextPrecisionWithReference as context_precision
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextPrecision. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
 def ContextPrecision(
     dataset,
-
-
-
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):  # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
@@ -53,22 +56,22 @@ def ContextPrecision(
 
     This metric requires the following columns in your dataset:
 
-    - `
-    - `
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain relevant info in the correct order.
-    - `
+    - `reference` (str): The ground truth text to compare with the retrieved contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `
-    and `
+    names for these fields using the parameters `user_input_column`, `retrieved_contexts_column`
+    and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "
-        "
-        "
+        "user_input_column": "question",
+        "retrieved_contexts_column": "context_info",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -77,8 +80,8 @@ def ContextPrecision(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -86,8 +89,8 @@ def ContextPrecision(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -98,32 +101,34 @@ def ContextPrecision(
     )
 
     required_columns = {
-        "
-        "
-        "
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
     ).to_pandas()
 
-
-
+    score_column = "llm_context_precision_with_reference"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to
-            #     ["
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "reference", "llm_context_precision_with_reference"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py (new file)

````diff
@@ -0,0 +1,133 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import LLMContextPrecisionWithoutReference as context_precision
+except ImportError as e:
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextPrecision. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextPrecisionWithoutReference(
+    dataset,
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    response_column: str = "response",
+):  # noqa: B950
+    """
+    Context Precision Without Reference is a metric used to evaluate the relevance of
+    retrieved contexts compared to the expected response for a given user input. This
+    metric compares each retrieved context (or chunk) with the response to estimate
+    if the retrieved context is relevant.
+
+    This metric can be used when you have both retrieved contexts and associated
+    reference contexts for a `user_input`. Using a Language Model (LLM), it determines
+    the relevance of each retrieved context by comparing it directly with the response,
+    producing scores between 0 and 1, where higher scores indicate better precision in
+    retrieving relevant contexts.
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `user_input` (str): The user query or input to the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts retrieved for the
+      user input that will be evaluated for relevance.
+    - `response` (str): The model’s output response associated with the user input.
+
+    If your dataset stores this data in different columns, you can specify alternate
+    column names using the parameters `user_input_column`, `retrieved_contexts_column`,
+    and `response_column`.
+
+    Example configuration for custom column names:
+    ```python
+    {
+        "user_input_column": "user_query",
+        "retrieved_contexts_column": "retrieved_texts",
+        "response_column": "model_output",
+    }
+    ```
+
+    For datasets with data stored as dictionaries in other columns, specify the
+    column and key like so:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "retrieved_contexts_column": f"{pred_col}.contexts",
+        "response_column": f"{pred_col}.response",
+    }
+    ```
+
+    Alternatively, for complex situations, you may use a function to extract data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "retrieved_contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "response_column": "my_response_col",
+    }
+    ```
+    """
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "response": response_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
+    ).to_pandas()
+
+    score_column = "llm_context_precision_without_reference"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "response", "llm_context_precision_without_reference"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
````
validmind/tests/model_validation/ragas/ContextRecall.py

````diff
@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_recall
+    from ragas.metrics import LLMContextRecall as context_recall
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextRecall. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
 def ContextRecall(
     dataset,
-
-
-
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):
     """
     Context recall measures the extent to which the retrieved context aligns with the
@@ -53,22 +56,22 @@ def ContextRecall(
 
     This metric requires the following columns in your dataset:
 
-    - `
-    - `
-      will be evaluated to make sure they contain all items in the ground truth.
-    - `
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved and
+      which will be evaluated to make sure they contain all items in the ground truth.
+    - `reference` (str): The ground truth text to compare with the retrieved contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `
-    and `
+    names for these fields using the parameters `user_input_column`,
+    `retrieved_contexts_column` and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "
-        "
-        "
+        "user_input_column": "user_input",
+        "retrieved_contexts_column": "retrieved_contexts",
+        "reference_column": "reference",
     }
     ```
 
@@ -77,8 +80,8 @@ def ContextRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "reference_column": f"{pred_col}.reference",
     }
     ```
 
@@ -86,8 +89,8 @@ def ContextRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": lambda x: [x[pred_col]["retrieved_contexts"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -98,32 +101,34 @@ def ContextRecall(
     )
 
     required_columns = {
-        "
-        "
-        "
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config()
     ).to_pandas()
 
-
-
+    score_column = "context_recall"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
        {
-            # "Scores (will not be uploaded to
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
             #     ["question", "contexts", "ground_truth", "context_recall"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
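
All four updated ragas tests now return the same shape of result: a table of aggregate statistics over a single score column plus a histogram and a box plot. The self-contained sketch below reproduces that aggregation pattern with made-up scores; it mirrors the code in the diffs above but is not part of the package.

```python
import pandas as pd
import plotly.express as px

# Stand-in scores; in the real tests this column comes from ragas' evaluate().
result_df = pd.DataFrame({"context_recall": [0.2, 0.5, 0.75, 0.9, 1.0]})
score_column = "context_recall"

aggregate_scores = [
    {
        "Mean Score": result_df[score_column].mean(),
        "Median Score": result_df[score_column].median(),
        "Max Score": result_df[score_column].max(),
        "Min Score": result_df[score_column].min(),
        "Standard Deviation": result_df[score_column].std(),
        "Count": result_df.shape[0],
    }
]

fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
fig_box = px.box(x=result_df[score_column].to_list())

print(aggregate_scores)
```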