validmind 2.9.1__py3-none-any.whl → 2.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +16 -5
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +24 -17
- validmind/ai/utils.py +2 -2
- validmind/api_client.py +0 -2
- validmind/datasets/credit_risk/lending_club.py +13 -1
- validmind/datasets/nlp/cnn_dailymail.py +15 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
- validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
- validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
- validmind/tests/data_validation/ScoreBandDefaultRates.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +14 -2
- validmind/tests/data_validation/ShapiroWilk.py +14 -1
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
- validmind/tests/data_validation/WOEBinPlots.py +14 -1
- validmind/tests/data_validation/WOEBinTable.py +13 -2
- validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
- validmind/tests/data_validation/nlp/CommonWords.py +14 -2
- validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
- validmind/tests/data_validation/nlp/Sentiment.py +13 -1
- validmind/tests/data_validation/nlp/StopWords.py +14 -2
- validmind/tests/data_validation/nlp/TextDescription.py +14 -2
- validmind/tests/data_validation/nlp/Toxicity.py +13 -1
- validmind/tests/model_validation/BertScore.py +13 -2
- validmind/tests/model_validation/BleuScore.py +13 -2
- validmind/tests/model_validation/ContextualRecall.py +13 -1
- validmind/tests/model_validation/MeteorScore.py +13 -2
- validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
- validmind/tests/model_validation/RegardScore.py +13 -2
- validmind/tests/model_validation/RougeScore.py +14 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
- validmind/tests/model_validation/ToxicityScore.py +13 -1
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +1 -1
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +1 -1
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +1 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +1 -1
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +1 -1
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +1 -1
- validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +15 -2
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +1 -1
- validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +15 -2
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
- validmind/tests/plots/BoxPlot.py +2 -2
- validmind/tests/plots/HistogramPlot.py +4 -4
- validmind/tests/stats/DescriptiveStats.py +2 -2
- validmind/vm_models/result/pii_filter.py +202 -0
- validmind/vm_models/result/result.py +34 -8
- validmind/vm_models/result/utils.py +0 -27
- validmind-2.9.3.dist-info/METADATA +848 -0
- {validmind-2.9.1.dist-info → validmind-2.9.3.dist-info}/RECORD +59 -58
- validmind-2.9.1.dist-info/METADATA +0 -137
- {validmind-2.9.1.dist-info → validmind-2.9.3.dist-info}/LICENSE +0 -0
- {validmind-2.9.1.dist-info → validmind-2.9.3.dist-info}/WHEEL +0 -0
- {validmind-2.9.1.dist-info → validmind-2.9.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,202 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
"""
|
6
|
+
PII filtering utilities using Microsoft Presidio for detecting and masking
|
7
|
+
personally identifiable information in test result data.
|
8
|
+
"""
|
9
|
+
|
10
|
+
import os
|
11
|
+
from enum import Enum
|
12
|
+
from typing import Dict
|
13
|
+
|
14
|
+
import pandas as pd
|
15
|
+
|
16
|
+
from ...logging import get_logger
|
17
|
+
|
18
|
+
logger = get_logger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class PIIDetectionMode(Enum):
    """Available PII detection modes.

    Each member's value is the lower-case string that selects it, e.g. via
    the ``VALIDMIND_PII_DETECTION`` environment variable.
    """

    # PII scanning is switched off
    DISABLED = "disabled"

    # Scope limited to test results
    TEST_RESULTS = "test_results"

    # Scope limited to test descriptions
    TEST_DESCRIPTIONS = "test_descriptions"

    # Every supported scope
    ALL = "all"
|
28
|
+
|
29
|
+
|
30
|
+
# Presidio entity types that are looked for when scanning free text
DEFAULT_ENTITIES = [
    # People and contact details
    "PERSON",
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    # Financial and government identifiers
    "CREDIT_CARD",
    "US_SSN",
    "US_DRIVER_LICENSE",
    # Network, place and time information
    "IP_ADDRESS",
    "LOCATION",
    "DATE_TIME",
    # Travel and banking identifiers
    "US_PASSPORT",
    "US_BANK_NUMBER",
    "IBAN_CODE",
]

# Minimum confidence score a detection needs before it is treated as PII
DEFAULT_THRESHOLD = 0.5

# Upper bound on the number of leading DataFrame rows that get scanned
SAMPLE_SIZE = 100
|
51
|
+
|
52
|
+
|
53
|
+
def get_pii_detection_mode() -> PIIDetectionMode:
    """
    Resolve the active PII detection mode from the environment.

    The mode is read from the ``VALIDMIND_PII_DETECTION`` environment
    variable (case-insensitive, defaulting to "disabled").

    Returns:
        PIIDetectionMode.DISABLED when the variable is unset, set to
        "disabled", holds an unrecognised value, or when Presidio is reported
        as unavailable. Otherwise the requested mode (test_results,
        test_descriptions, or all).
    """
    requested = os.getenv("VALIDMIND_PII_DETECTION", "disabled").lower()

    try:
        selected = PIIDetectionMode(requested)
    except ValueError:
        valid_options = ", ".join([member.value for member in PIIDetectionMode])
        logger.warning(
            f"Invalid PII detection mode '{requested}'. "
            f"Valid options: {valid_options}. "
            f"Defaulting to 'disabled'."
        )
        return PIIDetectionMode.DISABLED

    # Nothing to verify when scanning is switched off
    if selected == PIIDetectionMode.DISABLED:
        return selected

    # An enabled mode is only honoured when Presidio can actually be used
    if _is_presidio_available():
        return selected

    logger.warning(
        f"PII detection mode '{selected.value}' requested but Presidio not available. "
        "Falling back to 'disabled' mode. Install with: pip install validmind[pii-detection]"
    )
    return PIIDetectionMode.DISABLED
|
88
|
+
|
89
|
+
|
90
|
+
def _is_presidio_available() -> bool:
|
91
|
+
"""Check if any Presidio components are available."""
|
92
|
+
return _get_presidio_text() is not None or _get_presidio_df() is not None
|
93
|
+
|
94
|
+
|
95
|
+
def _get_presidio_text():
|
96
|
+
"""Get Presidio analyzer for text analysis."""
|
97
|
+
from presidio_analyzer import AnalyzerEngine
|
98
|
+
|
99
|
+
return AnalyzerEngine()
|
100
|
+
|
101
|
+
|
102
|
+
def _get_presidio_df():
|
103
|
+
"""Get Presidio Structured PandasAnalysisBuilder for DataFrame analysis."""
|
104
|
+
from presidio_structured import PandasAnalysisBuilder
|
105
|
+
|
106
|
+
return PandasAnalysisBuilder()
|
107
|
+
|
108
|
+
|
109
|
+
def scan_text(text: str) -> bool:
    """
    Scan text for PII content. Raises ValueError if PII is found.

    Args:
        text: The text to scan for PII

    Returns:
        True if no PII is found, if PII detection is disabled, or if there is
        no text to scan

    Raises:
        ValueError: If PII is detected with a score of at least
            DEFAULT_THRESHOLD. The message lists the detected entity types in
            sorted order; the matched text itself is never included.
    """
    # sanity check
    if get_pii_detection_mode() == PIIDetectionMode.DISABLED:
        return True

    # Nothing to analyze; avoid constructing the analyzer for empty input
    if not text:
        return True

    analyzer = _get_presidio_text()
    results = analyzer.analyze(text=text, entities=DEFAULT_ENTITIES, language="en")

    # Keep only the entity types of sufficiently confident detections. The
    # matched substrings are deliberately not collected, and the types are
    # sorted so the error message is the same on every run.
    entity_types = sorted(
        {result.entity_type for result in results if result.score >= DEFAULT_THRESHOLD}
    )

    if entity_types:
        raise ValueError(
            f"PII detected in text content. Entity types found: {', '.join(entity_types)}."
        )

    return True
|
150
|
+
|
151
|
+
|
152
|
+
def scan_df(df: pd.DataFrame) -> bool:
    """
    Scan a pandas DataFrame for PII content. Raises ValueError if PII is found.

    Only the first SAMPLE_SIZE rows are analyzed, and only columns holding
    object or pandas string data are reported.

    Args:
        df: The DataFrame to scan

    Returns:
        True if no PII is found, if PII detection is disabled, or if the
        DataFrame has no string/object columns

    Raises:
        ValueError: If PII is detected. The message names the offending
            columns and the entity type mapped to each of them.
    """
    # sanity check
    if get_pii_detection_mode() == PIIDetectionMode.DISABLED:
        return True

    # Scan all string/object columns. Iterating over ``df.dtypes`` (instead of
    # ``df[col].dtype``) also works when column labels are duplicated, and the
    # StringDtype check covers pandas' dedicated string columns.
    columns = [
        col
        for col, dtype in df.dtypes.items()
        if dtype == "object" or isinstance(dtype, pd.StringDtype)
    ]

    if not columns:
        return True

    # Limit the number of rows to scan for performance; ``head`` already
    # returns every row when the frame is shorter than SAMPLE_SIZE.
    sample_df = df.head(SAMPLE_SIZE)

    # Use structured analysis
    builder = _get_presidio_df()
    tabular_analysis = builder.generate_analysis(
        sample_df,
        selection_strategy="mixed",
        mixed_strategy_threshold=DEFAULT_THRESHOLD,
    )

    entity_mapping: Dict[str, str] = getattr(tabular_analysis, "entity_mapping", {})

    # Columns for which the analysis reported a (non-empty) entity type
    pii_columns = [column for column in columns if entity_mapping.get(column)]

    if pii_columns:
        # Column labels are not guaranteed to be strings (e.g. integer
        # labels), so coerce before joining to avoid a TypeError.
        column_names = ", ".join(str(column) for column in pii_columns)
        entity_types = ", ".join(str(entity_mapping[column]) for column in pii_columns)
        raise ValueError(
            f"PII detected in DataFrame columns: {column_names}. "
            f"Entity types found: {entity_types}."
        )

    return True
|
@@ -31,10 +31,10 @@ from ...utils import (
|
|
31
31
|
)
|
32
32
|
from ..figure import Figure, create_figure
|
33
33
|
from ..input import VMInput
|
34
|
+
from .pii_filter import PIIDetectionMode, get_pii_detection_mode, scan_df, scan_text
|
34
35
|
from .utils import (
|
35
36
|
AI_REVISION_NAME,
|
36
37
|
DEFAULT_REVISION_NAME,
|
37
|
-
check_for_sensitive_data,
|
38
38
|
figures_to_widgets,
|
39
39
|
get_result_template,
|
40
40
|
tables_to_widgets,
|
@@ -222,8 +222,10 @@ class TestResult(Result):
|
|
222
222
|
description = super().__getattribute__("description")
|
223
223
|
|
224
224
|
if isinstance(description, DescriptionFuture):
|
225
|
-
|
226
|
-
|
225
|
+
(
|
226
|
+
self.description,
|
227
|
+
self._was_description_generated,
|
228
|
+
) = description.get_description()
|
227
229
|
|
228
230
|
return super().__getattribute__(name)
|
229
231
|
|
@@ -465,8 +467,10 @@ class TestResult(Result):
|
|
465
467
|
)
|
466
468
|
)
|
467
469
|
|
468
|
-
|
469
|
-
|
470
|
+
# Only log unit metrics when the metric is a scalar value.
|
471
|
+
# Some tests may assign a list/array of per-row metrics to `self.metric`.
|
472
|
+
# Those should not be sent to the unit-metric endpoint which expects scalars.
|
473
|
+
if self.metric is not None and not hasattr(self.metric, "__len__"):
|
470
474
|
tasks.append(
|
471
475
|
api_client.alog_metric(
|
472
476
|
key=self.result_id,
|
@@ -521,7 +525,7 @@ class TestResult(Result):
|
|
521
525
|
|
522
526
|
return await asyncio.gather(*tasks)
|
523
527
|
|
524
|
-
def log(
|
528
|
+
def log( # noqa: C901
|
525
529
|
self,
|
526
530
|
section_id: str = None,
|
527
531
|
content_id: str = None,
|
@@ -552,9 +556,15 @@ class TestResult(Result):
|
|
552
556
|
|
553
557
|
self.check_result_id_exist()
|
554
558
|
|
555
|
-
if not unsafe
|
559
|
+
if not unsafe and get_pii_detection_mode() in [
|
560
|
+
PIIDetectionMode.TEST_RESULTS,
|
561
|
+
PIIDetectionMode.ALL,
|
562
|
+
]:
|
556
563
|
for table in self.tables or []:
|
557
|
-
|
564
|
+
scan_df(table.data)
|
565
|
+
|
566
|
+
if self.description:
|
567
|
+
scan_text(self.description)
|
558
568
|
|
559
569
|
if section_id:
|
560
570
|
self._validate_section_id_for_block(section_id, position)
|
@@ -701,6 +711,22 @@ class TextGenerationResult(Result):
|
|
701
711
|
position (int): The position (index) within the section to insert the test
|
702
712
|
result.
|
703
713
|
"""
|
714
|
+
# Check description text for PII when available
|
715
|
+
if self.description:
|
716
|
+
try:
|
717
|
+
from .pii_filter import check_text_for_pii
|
718
|
+
|
719
|
+
check_text_for_pii(self.description, raise_on_detection=True)
|
720
|
+
except ImportError:
|
721
|
+
logger.debug(
|
722
|
+
"PII detection not available - skipping PII check for description"
|
723
|
+
)
|
724
|
+
except ValueError:
|
725
|
+
# Re-raise PII detection errors
|
726
|
+
raise
|
727
|
+
except Exception as e:
|
728
|
+
logger.warning(f"PII detection failed for description: {e}")
|
729
|
+
|
704
730
|
run_async(
|
705
731
|
self.log_async,
|
706
732
|
content_id=content_id,
|
@@ -5,15 +5,12 @@
|
|
5
5
|
import os
|
6
6
|
from typing import TYPE_CHECKING, Dict, List, Union
|
7
7
|
|
8
|
-
import pandas as pd
|
9
8
|
from ipywidgets import HTML, GridBox, Layout
|
10
9
|
from jinja2 import Template
|
11
10
|
|
12
11
|
from ... import api_client
|
13
12
|
from ...logging import get_logger
|
14
|
-
from ..dataset import VMDataset
|
15
13
|
from ..figure import Figure
|
16
|
-
from ..input import VMInput
|
17
14
|
|
18
15
|
if TYPE_CHECKING:
|
19
16
|
from .result import ResultTable
|
@@ -52,30 +49,6 @@ async def update_metadata(content_id: str, text: str, _json: Union[Dict, List] =
|
|
52
49
|
await api_client.alog_metadata(content_id, text, _json)
|
53
50
|
|
54
51
|
|
55
|
-
def check_for_sensitive_data(data: pd.DataFrame, inputs: List[VMInput]):
|
56
|
-
"""Check if the data contains sensitive information from input datasets."""
|
57
|
-
dataset_columns = {
|
58
|
-
col: len(input_obj.df)
|
59
|
-
for input_obj in inputs
|
60
|
-
if isinstance(input_obj, VMDataset)
|
61
|
-
for col in input_obj.columns
|
62
|
-
}
|
63
|
-
|
64
|
-
table_columns = {col: len(data) for col in data.columns}
|
65
|
-
|
66
|
-
offending_columns = [
|
67
|
-
col
|
68
|
-
for col in table_columns
|
69
|
-
if col in dataset_columns and table_columns[col] == dataset_columns[col]
|
70
|
-
]
|
71
|
-
|
72
|
-
if offending_columns:
|
73
|
-
raise ValueError(
|
74
|
-
f"Raw input data found in table, pass `unsafe=True` "
|
75
|
-
f"or remove the offending columns: {offending_columns}"
|
76
|
-
)
|
77
|
-
|
78
|
-
|
79
52
|
def tables_to_widgets(tables: List["ResultTable"]):
|
80
53
|
"""Convert a list of tables to ipywidgets."""
|
81
54
|
widgets = [
|