protegrity-ai-developer-python 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- appython/__init__.py +12 -0
- appython/protector.py +554 -0
- appython/service/auth_provider.py +273 -0
- appython/service/auth_token_provider.py +45 -0
- appython/service/config.py +209 -0
- appython/service/payload_builder.py +141 -0
- appython/service/request_handler.py +115 -0
- appython/service/response_handler.py +78 -0
- appython/stats/__init__.py +3 -0
- appython/stats/collector.py +90 -0
- appython/stats/writer.py +185 -0
- appython/utils/codec_helper.py +86 -0
- appython/utils/constants.py +246 -0
- appython/utils/exceptions.py +141 -0
- appython/utils/input_preprocessor.py +325 -0
- appython/utils/output_postprocessor.py +99 -0
- protegrity_ai_developer_python-1.2.1.dist-info/METADATA +428 -0
- protegrity_ai_developer_python-1.2.1.dist-info/RECORD +53 -0
- protegrity_ai_developer_python-1.2.1.dist-info/WHEEL +5 -0
- protegrity_ai_developer_python-1.2.1.dist-info/entry_points.txt +2 -0
- protegrity_ai_developer_python-1.2.1.dist-info/licenses/LICENSE +21 -0
- protegrity_ai_developer_python-1.2.1.dist-info/top_level.txt +3 -0
- protegrity_developer_python/__init__.py +4 -0
- protegrity_developer_python/scan.py +37 -0
- protegrity_developer_python/securefind.py +83 -0
- protegrity_developer_python/utils/ccn_processing.py +59 -0
- protegrity_developer_python/utils/config.py +60 -0
- protegrity_developer_python/utils/constants.py +123 -0
- protegrity_developer_python/utils/discover.py +49 -0
- protegrity_developer_python/utils/logger.py +23 -0
- protegrity_developer_python/utils/pii_processing.py +291 -0
- protegrity_developer_python/utils/protector.py +23 -0
- protegrity_developer_python/utils/semantic_guardrails.py +240 -0
- protegrity_developer_python/utils/transform.py +66 -0
- pty_migrate/__init__.py +1 -0
- pty_migrate/check_cmd.py +871 -0
- pty_migrate/cli.py +93 -0
- pty_migrate/config.py +127 -0
- pty_migrate/create_policy_cmd.py +795 -0
- pty_migrate/payloads/__init__.py +51 -0
- pty_migrate/payloads/alphabets.json +42 -0
- pty_migrate/payloads/dataelements.json +342 -0
- pty_migrate/payloads/datastores.json +7 -0
- pty_migrate/payloads/deploy_policy_ta.json +1 -0
- pty_migrate/payloads/masks.json +18 -0
- pty_migrate/payloads/members.json +62 -0
- pty_migrate/payloads/policies.json +13 -0
- pty_migrate/payloads/roles.json +32 -0
- pty_migrate/payloads/rules.json +1639 -0
- pty_migrate/payloads/sources.json +10 -0
- pty_migrate/payloads/trusted_apps.json +8 -0
- pty_migrate/ppc_client.py +371 -0
- pty_migrate/stats_cmd.py +87 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for processing credit card numbers (CCNs) by removing and reconstructing separators.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def clean_ccn(ccn_string: str) -> tuple[str, dict]:
    """
    Split a credit card number into its digits and its separators.

    Args:
        ccn_string (str): Credit card number, possibly containing separators.

    Returns:
        tuple: (cleaned_string, separator_map)
            - cleaned_string: the digit characters of the input, in order
            - separator_map: {index_in_input: non_digit_char} for every
              character that ``str.isdigit`` rejects
    """
    digits = []
    separator_map = {}

    for position, symbol in enumerate(ccn_string):
        if not symbol.isdigit():
            # Remember where the separator sat so it can be re-inserted later
            separator_map[position] = symbol
            continue
        digits.append(symbol)

    return "".join(digits), separator_map
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def reconstruct_ccn(cleaned_ccn: str, separator_map: dict) -> str:
    """
    Re-insert separators into a digit string at their recorded positions.

    Args:
        cleaned_ccn (str): CCN characters without separators.
        separator_map (dict): {position: separator_char} as produced by clean_ccn.

    Returns:
        str: The characters of cleaned_ccn interleaved with the separators.
        A separator whose position is not reached contiguously (for example
        when cleaned_ccn is shorter than the original digit count) is not emitted.
    """
    pieces = []
    cursor = 0  # index in the reconstructed (original-layout) string

    for digit in cleaned_ccn:
        # Emit every separator that occupies the slots before this digit
        while cursor in separator_map:
            pieces.append(separator_map[cursor])
            cursor += 1
        pieces.append(digit)
        cursor += 1

    # Emit separators that directly follow the last digit
    while cursor in separator_map:
        pieces.append(separator_map[cursor])
        cursor += 1

    return "".join(pieces)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for configuring the protegrity_developer_python package (data-discovery and logging settings).
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from protegrity_developer_python.utils.constants import CONFIG as _root_config, get_config
|
|
6
|
+
from typing import Dict, Optional
|
|
7
|
+
from protegrity_developer_python.utils.logger import get_logger
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
# Get logger instance
|
|
11
|
+
logger = get_logger()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def configure(
    endpoint_url: Optional[str] = None,
    named_entity_map: Optional[Dict[str, str]] = None,
    masking_char: Optional[str] = None,
    classification_score_threshold: Optional[float] = None,
    method: Optional[str] = None,
    enable_logging: Optional[bool] = None,
    log_level: Optional[str] = None,
) -> None:
    """
    Configure the protegrity_developer_python module.

    Only arguments that are supplied are applied; omitted arguments leave the
    current configuration untouched.

    Args:
        endpoint_url (str): URL of the discovery classification API.
        named_entity_map (dict): Mapping of entity types to labels.
        masking_char (str): Character used for masking PII.
        classification_score_threshold (float): Minimum score to consider classification.
        method (str): Either 'redact' or 'mask'. Any other non-empty value is
            ignored and a warning is logged.
        enable_logging (bool): Enable or disable logging.
        log_level (str): Name of a logging level (case-insensitive). An
            unrecognised name is reported with a warning and INFO is applied.
    """

    # Configure data-discovery settings
    dd_config = get_config("data-discovery")
    if endpoint_url:
        dd_config["endpoint_url"] = endpoint_url
    if named_entity_map:
        dd_config["named_entity_map"] = named_entity_map
    if masking_char:
        dd_config["masking_char"] = masking_char
    if classification_score_threshold is not None:
        dd_config["classification_score_threshold"] = classification_score_threshold
    if method in ("redact", "mask"):
        dd_config["method"] = method
    elif method:
        logger.warning(
            "Invalid method specified: %s. Must be 'redact' or 'mask'.", method
        )

    # Configure global logging settings
    if enable_logging is not None:
        _root_config["enable_logging"] = enable_logging
        logger.disabled = not enable_logging
    if log_level:
        _root_config["log_level"] = log_level
        level = getattr(logging, log_level.upper(), None)
        # getattr can also hit non-level module attributes (e.g. BASIC_FORMAT),
        # which setLevel would reject; accept integer level constants only.
        if not isinstance(level, int):
            logger.warning(
                "Invalid log level specified: %s. Falling back to INFO.", log_level
            )
            level = logging.INFO
        logger.setLevel(level)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for constants and default configurations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
# Mapping of entity types to Protegrity data elements
|
|
8
|
+
DATA_ELEMENT_MAPPING = {
|
|
9
|
+
# Account Information
|
|
10
|
+
"ACCOUNT_NAME": "string",
|
|
11
|
+
"ACCOUNT_NUMBER": "number",
|
|
12
|
+
|
|
13
|
+
# Personal Information
|
|
14
|
+
"AGE": "number",
|
|
15
|
+
"GENDER": "string",
|
|
16
|
+
"TITLE": "string",
|
|
17
|
+
"PERSON": "string",
|
|
18
|
+
|
|
19
|
+
# Financial
|
|
20
|
+
"AMOUNT": "number",
|
|
21
|
+
"BANK_ACCOUNT": "number",
|
|
22
|
+
"CREDIT_CARD": "ccn",
|
|
23
|
+
"CURRENCY": "string",
|
|
24
|
+
"CURRENCY_CODE": "string",
|
|
25
|
+
"CURRENCY_NAME": "string",
|
|
26
|
+
"CURRENCY_SYMBOL": "string",
|
|
27
|
+
|
|
28
|
+
# Cryptocurrency
|
|
29
|
+
"CRYPTO_ADDRESS": "address",
|
|
30
|
+
|
|
31
|
+
# Temporal
|
|
32
|
+
"DATETIME": "datetime",
|
|
33
|
+
"DOB": "datetime",
|
|
34
|
+
"DATE_OF_BIRTH": "datetime",
|
|
35
|
+
|
|
36
|
+
# Identification Documents
|
|
37
|
+
"DRIVER_LICENSE": "number",
|
|
38
|
+
"PASSPORT": "passport",
|
|
39
|
+
"NATIONAL_ID": "nin",
|
|
40
|
+
"SOCIAL_SECURITY_ID": "ssn",
|
|
41
|
+
|
|
42
|
+
# Contact Information
|
|
43
|
+
"EMAIL_ADDRESS": "email",
|
|
44
|
+
"PHONE_NUMBER": "phone",
|
|
45
|
+
|
|
46
|
+
# Healthcare
|
|
47
|
+
"HEALTH_CARE_ID": "number",
|
|
48
|
+
|
|
49
|
+
# Location Information
|
|
50
|
+
"LOCATION": "address",
|
|
51
|
+
"ADDRESS": "address",
|
|
52
|
+
|
|
53
|
+
# Network
|
|
54
|
+
"IP_ADDRESS": "address",
|
|
55
|
+
"MAC_ADDRESS": "address",
|
|
56
|
+
"URL": "address",
|
|
57
|
+
|
|
58
|
+
# Tax
|
|
59
|
+
"TAX_ID": "number",
|
|
60
|
+
|
|
61
|
+
# India-specific
|
|
62
|
+
"IN_VEHICLE_REGISTRATION": "number",
|
|
63
|
+
"IN_VOTER": "number",
|
|
64
|
+
"IN_GSTIN": "string",
|
|
65
|
+
|
|
66
|
+
# Security
|
|
67
|
+
"PASSWORD": "string",
|
|
68
|
+
"USERNAME": "string",
|
|
69
|
+
"USER_NAME": "string",
|
|
70
|
+
|
|
71
|
+
# Organization
|
|
72
|
+
"ORGANIZATION": "string",
|
|
73
|
+
|
|
74
|
+
# National Registration
|
|
75
|
+
"NRP": "number",
|
|
76
|
+
|
|
77
|
+
# Korea
|
|
78
|
+
"KR_RRN": "number",
|
|
79
|
+
|
|
80
|
+
# Thailand
|
|
81
|
+
"TH_TNIN": "nin",
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
CONFIG = {
|
|
85
|
+
"enable_logging": True,
|
|
86
|
+
"log_level": "INFO",
|
|
87
|
+
"data-discovery": {
|
|
88
|
+
"endpoint_url": os.getenv(
|
|
89
|
+
"DISCOVER_URL", "http://localhost:8580/pty/data-discovery/v2/classify/text"
|
|
90
|
+
), "transform_url": os.getenv(
|
|
91
|
+
"TRANSFORM_URL", "http://localhost:8580/pty/data-discovery/v2/transform/label"
|
|
92
|
+
), "named_entity_map": {},
|
|
93
|
+
"masking_char": "#",
|
|
94
|
+
"classification_score_threshold": 0.6,
|
|
95
|
+
"method": "redact", # or "mask"
|
|
96
|
+
},
|
|
97
|
+
"semantic-guardrails": {
|
|
98
|
+
"endpoint_url": os.getenv(
|
|
99
|
+
"SEMANTIC_GUARDRAILS_URL",
|
|
100
|
+
"http://localhost:8001/pty/semantic-guardrail/v1.1",
|
|
101
|
+
),
|
|
102
|
+
},
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_config(section: str = None) -> dict:
    """
    Look up a configuration section, falling back to the whole configuration.

    Args:
        section (str, optional): Top-level key of CONFIG, e.g. 'data-discovery'
            or 'semantic-guardrails'.

    Returns:
        dict: CONFIG[section] when section is truthy and present in CONFIG;
        otherwise the root CONFIG object itself (also for unknown names).
        The returned object is the live configuration, not a copy.

    Example:
        >>> config = get_config('data-discovery')
        >>> method = config['method']
    """
    if not section or section not in CONFIG:
        return CONFIG
    return CONFIG[section]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for discovering PII entities using a discovery API.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Dict
|
|
6
|
+
import json
|
|
7
|
+
import requests
|
|
8
|
+
from protegrity_developer_python.utils.constants import get_config
|
|
9
|
+
from protegrity_developer_python.utils.logger import get_logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
_config = get_config("data-discovery")
|
|
13
|
+
# Get logger instance
|
|
14
|
+
logger = get_logger()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def discover(text: str) -> Dict:
    """
    Discover PII entities in the input text using the configured REST endpoint.

    The text is POSTed as UTF-8 plain text to the configured ``endpoint_url``
    with the configured ``classification_score_threshold`` as the
    ``score_threshold`` query parameter.

    Args:
        text (str): Input text to classify.

    Returns:
        dict: The ``classifications`` member of the API's JSON response, or an
        empty dict when the response has no such member.

    Raises:
        requests.exceptions.RequestException: On transport errors or a non-2xx status.
        json.JSONDecodeError: When the response body is not valid JSON.
    """
    headers = {"Content-Type": "text/plain; charset=utf-8"}
    params = {"score_threshold": _config["classification_score_threshold"]}

    try:
        response = requests.post(
            _config["endpoint_url"],
            headers=headers,
            data=text.encode("utf-8"),
            params=params,
            timeout=30,
        )
        response.raise_for_status()
        return response.json().get("classifications", {})
    except json.JSONDecodeError as e:
        # Checked before RequestException: recent requests releases raise a JSON
        # error that subclasses both, and it should be reported as a decode failure.
        logger.error("Failed to decode JSON response: %s", e)
        raise
    except requests.exceptions.RequestException as e:
        logger.error("HTTP request failed: %s", e)
        raise
    except Exception as e:
        logger.error("Unexpected error: %s", e)
        raise
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for logging configuration and logger retrieval.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from protegrity_developer_python.utils.constants import CONFIG as _config
|
|
7
|
+
|
|
8
|
+
# Package-wide logger writing to stderr through its own stream handler.
logger = logging.getLogger("protegrity_developer_python")
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Level and propagation are applied even when logging starts disabled, so that
# flipping logger.disabled later yields the configured level and no duplicate
# output through the root logger.
level = getattr(logging, _config["log_level"].upper(), logging.INFO)
if not isinstance(level, int):
    # The name matched a non-level attribute of the logging module
    level = logging.INFO
logger.setLevel(level)
logger.propagate = False
logger.disabled = not _config["enable_logging"]


def get_logger():
    """Return the shared ``protegrity_developer_python`` logger configured above."""
    return logger
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for processing PII entities in text, including redaction, masking, and protection.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Dict, Tuple
|
|
7
|
+
from protegrity_developer_python.utils.ccn_processing import clean_ccn, reconstruct_ccn
|
|
8
|
+
from protegrity_developer_python.utils.protector import get_protector_session
|
|
9
|
+
from protegrity_developer_python.utils.logger import get_logger
|
|
10
|
+
from protegrity_developer_python.utils.constants import (
|
|
11
|
+
DATA_ELEMENT_MAPPING as entity_endpoint_mapped,
|
|
12
|
+
get_config,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Get logger instance
|
|
16
|
+
logger = get_logger()
|
|
17
|
+
|
|
18
|
+
# Get data-discovery sub-config for backward compatibility
|
|
19
|
+
_config = get_config("data-discovery")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _merge_overlapping_entities(
|
|
23
|
+
entity_spans: Dict[Tuple[int, int], Tuple[str, float]],
|
|
24
|
+
) -> Dict[Tuple[int, int], Tuple[str, float]]:
|
|
25
|
+
"""
|
|
26
|
+
Merge overlapping entity spans in a dictionary.
|
|
27
|
+
|
|
28
|
+
Given a dictionary with (start, end) index tuples as keys and (entity, score) tuples as values,
|
|
29
|
+
this function merges overlapping or adjacent spans by combining entity labels with '|' and
|
|
30
|
+
retaining the highest score.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
entity_dict (dict): Mapping of index spans to (entity, score).
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
dict: Merged spans with combined entities and max scores.
|
|
37
|
+
|
|
38
|
+
Example:
|
|
39
|
+
Input:
|
|
40
|
+
{
|
|
41
|
+
(10, 20): ("PERSON", 0.9),
|
|
42
|
+
(15, 25): ("LOCATION", 0.85)
|
|
43
|
+
}
|
|
44
|
+
Output:
|
|
45
|
+
{
|
|
46
|
+
(10, 25): ("PERSON|LOCATION", 0.9)
|
|
47
|
+
}
|
|
48
|
+
"""
|
|
49
|
+
merged = []
|
|
50
|
+
for (start, end), (entity, score) in entity_spans.items():
|
|
51
|
+
if not merged:
|
|
52
|
+
merged.append([(start, end), entity, score])
|
|
53
|
+
else:
|
|
54
|
+
last_start, last_end = merged[-1][0]
|
|
55
|
+
# Check for overlap
|
|
56
|
+
if start <= last_end and end >= last_start:
|
|
57
|
+
# Merge ranges
|
|
58
|
+
new_start = min(start, last_start)
|
|
59
|
+
new_end = max(end, last_end)
|
|
60
|
+
# Merge entities (sorted by score descending, then lexicographically)
|
|
61
|
+
if entity != merged[-1][1]:
|
|
62
|
+
last_entity = merged[-1][1]
|
|
63
|
+
last_score = merged[-1][2]
|
|
64
|
+
# Sort by score (descending), then lexicographically
|
|
65
|
+
if last_score > score:
|
|
66
|
+
entities = [last_entity, entity]
|
|
67
|
+
elif last_score < score:
|
|
68
|
+
entities = [entity, last_entity]
|
|
69
|
+
else:
|
|
70
|
+
# Scores equal, sort lexicographically
|
|
71
|
+
entities = sorted([last_entity, entity])
|
|
72
|
+
new_entity = "|".join(entities)
|
|
73
|
+
else:
|
|
74
|
+
new_entity = entity
|
|
75
|
+
# Take max score
|
|
76
|
+
new_score = max(score, merged[-1][2])
|
|
77
|
+
merged[-1] = [(new_start, new_end), new_entity, new_score]
|
|
78
|
+
else:
|
|
79
|
+
merged.append([(start, end), entity, score])
|
|
80
|
+
|
|
81
|
+
# Convert back to dictionary
|
|
82
|
+
merged_entities = {tuple(k): (v, s) for k, v, s in merged}
|
|
83
|
+
logger.debug("Before Merging Entity spans: \n%s", entity_spans)
|
|
84
|
+
logger.debug("Merged Entity spans: \n%s", merged_entities)
|
|
85
|
+
return merged_entities
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def collect_entity_spans(entities: Dict) -> Dict[Tuple[int, int], Tuple[str, float]]:
    """
    Build the span table used for redaction, masking or protection.

    For every detection, the (start_index, end_index) pair from its "location"
    is used as the key (both default to 0 when absent). When several detections
    share the same span, the one with the strictly highest score is kept. The
    table is ordered by descending start index so that replacing one span does
    not shift the positions of spans processed afterwards, and is then passed
    through _merge_overlapping_entities.

    Args:
        entities (dict): Mapping of entity name to a list of detection dicts.

    Returns:
        dict: (start_index, end_index) -> (entity_name, score), after merging.
    """
    best_by_span = {}
    for label, detections in entities.items():
        for detection in detections:
            confidence = detection.get("score")
            location = detection.get("location", {})
            span = (location.get("start_index", 0), location.get("end_index", 0))
            known = best_by_span.get(span)
            # First sighting of the span, or a better-scoring entity for it
            if known is None or confidence > known[1]:
                best_by_span[span] = (label, confidence)

    # Reverse start order avoids index shifting during in-place replacement
    entity_spans = dict(
        sorted(best_by_span.items(), key=lambda item: item[0][0], reverse=True)
    )

    logger.debug("Raw Entity spans collected: \n%s", entity_spans)
    return _merge_overlapping_entities(entity_spans)
|
|
122
|
+
|
|
123
|
+
def unprotect_data(text: str) -> str:
    """
    Replace every ``[ENTITY]value[/ENTITY]`` tag pair in the text with its unprotected value.

    Tag names are resolved to data elements through the entity mapping. A tag
    pair whose name is not in that mapping is not one written by protect_data
    and is left in the text unchanged. When unprotecting a recognised tag
    fails, the tags are removed and the protected value is kept as-is.

    Args:
        text (str): Text containing protected, tagged values.

    Returns:
        str: Text with recognised tag pairs replaced by unprotected values.

    Raises:
        Exception: Any error raised while scanning the text is logged and re-raised.
    """
    session = get_protector_session()
    try:
        # Regex pattern to match [entity]data[/entity]
        pattern = r'\[([^\]]+)\]([^[]*)\[\/\1\]'

        def replace_protected_data(match):
            entity = match.group(1)
            data = match.group(2)
            data_element = entity_endpoint_mapped.get(entity)
            if data_element is None:
                # Unrelated bracket markup (e.g. "[b]x[/b]"): keep it verbatim
                # instead of failing the whole call with a KeyError.
                return match.group(0)
            try:
                if entity == "CREDIT_CARD":
                    # Separators are not part of the protected value; strip
                    # them for the call and restore them afterwards.
                    input_data, separator_map = clean_ccn(data)
                    unprotected = session.unprotect(input_data, data_element)
                    unprotected = reconstruct_ccn(unprotected, separator_map)
                else:
                    unprotected = session.unprotect(data, data_element)
                logger.info("Unprotected %s: %s", data_element, data)
                return unprotected  # Return just the data, removing the tags
            except Exception:
                logger.warning("Failed to unprotect %s: %s.", data_element, data)
                return data  # Return original data if unprotection fails

        # Replace all matches with unprotected data
        return re.sub(pattern, replace_protected_data, text)
    except Exception as e:
        logger.error("Failed to process text: %s", e)
        raise
|
|
153
|
+
|
|
154
|
+
def protect_data(
    merged_entities: Dict[Tuple[int, int], Tuple[str, float]], text: str
) -> str:
    """
    Protects the identified PII entities in the input text.

    Each span is replaced by ``[ENTITY]protected[/ENTITY]``. For merged labels
    such as ``"A|B"`` the first entity is the one used for protection and
    tagging. Entities without an entry in the configured ``named_entity_map``
    are skipped with a warning. If protection fails for a span (including an
    entity with no known data element), a warning is logged and the span is
    left exactly as it was.

    Args:
        merged_entities (dict): (start, end) -> (entity, score). Spans are
            applied in iteration order, so they must be ordered by descending
            start index for later replacements to stay aligned.
        text (str): Original input text.

    Returns:
        text (str): Protected PII text.
    """
    session = get_protector_session()
    for (start, end), (entity, _) in merged_entities.items():
        entity_selected = entity.split("|")[0]
        if "|" in entity:
            logger.debug("Multiple entities '%s' detected at span [%d:%d], using first entity for protection - %s.", entity, start, end, entity_selected)

        if not _config["named_entity_map"].get(entity_selected, None):
            logger.warning(
                "Entity '%s' detected at span [%d:%d] but no mapping found in named_entity_map - skipping protection",
                entity,
                start,
                end,
            )
            continue

        original_value = text[start:end]
        try:
            # Looked up inside the try so an unknown entity type is handled
            # like any other protection failure instead of aborting the loop.
            data_element = entity_endpoint_mapped[entity_selected]
            if entity_selected == "CREDIT_CARD":
                # Decide on the selected entity (the tag unprotect_data will
                # see), not the merged label, so "CREDIT_CARD|X" is cleaned too.
                digits, separator_map = clean_ccn(original_value)
                protected = session.protect(digits, data_element)
                protected = reconstruct_ccn(protected, separator_map)
            else:
                protected = session.protect(original_value, data_element)

            logger.info(
                "Entity '%s' at span [%d:%d] protected successfully.",
                entity,
                start,
                end,
            )

            text = text[:start] + f"[{entity_selected}]" + protected + f"[/{entity_selected}]" + text[end:]
            logger.debug(
                "Entity '%s' at span [%d:%d] protected as [%s]%s[/%s].",
                entity,
                start,
                end,
                entity_selected,
                protected,
                entity_selected,
            )
        except Exception:
            # The cleartext value is deliberately not logged, and the text is
            # left untouched (no separator-stripped CCN is written back).
            logger.warning(
                "Failed to protect entity '%s' at span [%d:%d].",
                entity,
                start,
                end,
            )

    return text
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def redact_data(
    merged_entities: Dict[Tuple[int, int], Tuple[str, float]], text: str
) -> str:
    """
    Redacts or masks the identified PII entities in the input text.

    With method "redact", a span is replaced by ``[label]``. For a single
    entity the label comes from ``named_entity_map`` and the span is skipped
    with a warning when no label is mapped. For a merged entity such as
    ``"A|B"`` each part is mapped individually (unmapped parts keep their
    entity name) and the parts are joined with ``|``. With method "mask", the
    span is overwritten with the masking character, preserving its length.

    Args:
        merged_entities (dict): (start, end) -> (entity, score). Spans are
            applied in iteration order, so they must be ordered by descending
            start index for later replacements to stay aligned.
        text (str): Original input text.

    Returns:
        text (str): Redacted PII text.
    """
    for (start, end), val in merged_entities.items():
        entity_name = val[0]
        logger.debug(
            "Entity '%s' found at span [%d:%d] with score %f",
            entity_name,
            start,
            end,
            val[1],
        )
        if _config["method"] == "redact":
            if "|" in entity_name:
                # Split on the merge delimiter so every part is mapped
                label = "|".join(
                    _config["named_entity_map"].get(entity, entity)
                    for entity in entity_name.split("|")
                )
            else:
                label = _config["named_entity_map"].get(entity_name, "")
            if label:
                label = f"[{label}]"
                logger.info(
                    "Entity '%s' found at span [%d:%d]... redacted as %s",
                    entity_name,
                    start,
                    end,
                    label,
                )
                text = text[:start] + label + text[end:]
            else:
                logger.warning(
                    "Entity '%s' detected at span [%d:%d] but no mapping found in named_entity_map - skipping redaction",
                    entity_name,
                    start,
                    end,
                )
        elif _config["method"] == "mask":
            logger.info(
                "Entity '%s' found at span [%d:%d]... masked",
                entity_name,
                start,
                end,
            )
            text = text[:start] + _config["masking_char"] * (end - start) + text[end:]

    return text
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module for interfacing with Protegrity's appython Protector for data protection.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from appython import Protector
|
|
6
|
+
|
|
7
|
+
instance, session = None, None
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Initialize the Protector instance
|
|
11
|
+
def _initialize_protector():
    """Create the module-level Protector instance and return it."""
    global instance
    instance = Protector()
    return instance


# Get or create a Protector session
def get_protector_session():
    """
    Return the shared Protector session, creating it on first use.

    The session is created for the "superuser" user and cached in the
    module-level ``session`` variable. Creation is retried on the next call if
    an earlier attempt failed after the Protector instance had already been
    built, instead of returning ``None`` from then on.
    """
    global instance, session
    if session is None:
        # Reuse an existing Protector left behind by a failed first attempt
        protector = instance if instance is not None else _initialize_protector()
        session = protector.create_session("superuser")
    return session
|