idscrub 1.1.2-py3-none-any.whl → 2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +692 -525
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/METADATA +58 -12
- idscrub-2.0.0.dist-info/RECORD +24 -0
- notebooks/basic_usage.ipynb +294 -351
- test/conftest.py +36 -0
- test/test_dataframe.py +8 -8
- test/test_errors.py +32 -0
- test/test_exclude.py +22 -0
- test/test_group.py +9 -0
- test/test_huggingface.py +3 -3
- test/test_id.py +8 -7
- test/test_label.py +22 -7
- test/test_overlap.py +86 -0
- test/test_phonenumbers.py +2 -2
- test/test_presidio.py +13 -6
- test/test_regex.py +110 -59
- test/test_scrub.py +22 -12
- test/test_scrub_text.py +22 -0
- test/test_spacy.py +14 -10
- idscrub-1.1.2.dist-info/RECORD +0 -22
- test/test_all.py +0 -39
- test/test_chain.py +0 -54
- test/test_log.py +0 -17
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/WHEEL +0 -0
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/top_level.txt +0 -0
idscrub/scrub.py
CHANGED
@@ -2,8 +2,9 @@ import logging
 import os
 import re
 import warnings
+from collections import defaultdict
 from collections.abc import Iterable
-from
+from dataclasses import asdict, dataclass

 import pandas as pd
 import phonenumbers
@@ -11,8 +12,6 @@ import spacy
 from huggingface_hub.utils import HFValidationError
 from presidio_analyzer import AnalyzerEngine
 from presidio_analyzer.nlp_engine import SpacyNlpEngine
-from presidio_anonymizer import AnonymizerEngine
-from presidio_anonymizer.entities import OperatorConfig
 from spacy.cli import download
 from spacy.language import Language
 from tqdm import tqdm
@@ -29,12 +28,44 @@ trf_logging.set_verbosity_error()


 class IDScrub:
+    """
+    Class for identifying and scrubbing entities in text.
+    """
+
+    @dataclass
+    class IDEnt:
+        """
+        Structured representation of an identified entity (ident) within text.
+
+        Attributes:
+            text_id (str | int | float): A unique identifier for the original text.
+            text (str): The exact substring extracted from the original text.
+            start (int): The starting character offset of the ident within the original text.
+            end (int): The ending character offset of the ident within the original text.
+            label (str): The ident type (e.g. 'person').
+            replacement (str): The text that should replace this ident during scrubbing.
+            priority (float): Priority score for overlapping idents.
+                Higher scored idents are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.
+            source (str): The source model or method that identified the ident.
+        """
+
+        text_id: str | int | float
+        text: str
+        start: int
+        end: int
+        label: str
+        replacement: str
+        priority: float
+        source: str
+
     def __init__(
         self,
-        texts: list[str] =
+        texts: list[str] = None,
         text_ids: list | Iterable = None,
         text_id_name: str = "text_id",
-
+        replacement: str = None,
+        exclude: list[str] = [],
         verbose: bool = True,
     ):
         """
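The new nested `IDEnt` dataclass is the unit every 2.0.0 detector returns. A minimal sketch of one record, built only from the fields shown in this hunk (the `idscrub.scrub` module path and all values are illustrative assumptions):

from idscrub.scrub import IDScrub  # assumed module path for the class in this file

ident = IDScrub.IDEnt(
    text_id=1,
    text="John Smith",     # the matched substring
    start=6,
    end=16,                # i.e. original_text[6:16] == "John Smith"
    label="person",
    replacement="[PERSON]",
    priority=1.0,          # relative score used when spans overlap
    source="spacy",
)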
@@ -46,32 +77,37 @@ class IDScrub:
                 such as the ID column in a DataFrame. If None, an integer index starting at 1 is applied.
                 This is used to identify texts in get_scrubbed_data().
             text_id_name (str): Name of the ID column in get_scrubbed_data(). Default is `text_id`.
-
-
+            replacement (str): A global string to replace every scrubbed string with.
+            exclude (list[str]): A list of strings that will not be scrubbed if identified.
             verbose (bool): Whether to show all log messages or only warnings.
         """

-
-            "`texts`
-        )
+        if not isinstance(texts, list):
+            raise TypeError("`texts` must be a list.")
+        if not all(isinstance(text, str) for text in texts):
+            raise TypeError("`texts` must be a list of strings.")

-
-            "`
-        )
+        if replacement is not None and not isinstance(replacement, str):
+            raise TypeError("`replacement` must be a string or None.")

         self.texts = texts

-        if text_ids:
-
-
-
+        if text_ids is None:
+            text_ids = range(1, len(self.texts) + 1)
+
+        if not len(self.texts) == len(text_ids):
+            raise ValueError("Length of texts is different to the length of text IDs.")

-
+        self.text_ids = text_ids

+        self.replacement = replacement
         self.text_id_name = text_id_name
-        self.
-        self.
-        self.
+        self.exclude = exclude
+        self.scrubbed_texts = []
+        self.idents: list[IDScrub.IDEnt] = []
+
+        self.hf_ner = None
+        self.spacy_docs = None

         self.logger = logging.getLogger(self.__class__.__name__)
         self.logger.setLevel(logging.DEBUG if verbose else logging.WARNING)
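A hedged sketch of the 2.0.0 constructor with the two new arguments, `replacement` and `exclude`, as they appear in this hunk (texts and IDs are illustrative):

from idscrub.scrub import IDScrub  # assumed module path

scrub = IDScrub(
    texts=["Email me at johnsmith@mail.com", "Call 020 7946 0000"],
    text_ids=["a", "b"],               # optional; defaults to 1..n
    exclude=["johnsmith@mail.com"],    # strings never scrubbed even if identified
    # replacement="[REDACTED]",        # optional global override for every detector
)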
@@ -84,284 +120,295 @@ class IDScrub:

         self.logger.info("Texts loaded.")

-    def
-
-
-
-
-
-
-
-
-        Returns:
-            A Pandas DataFrame with text_id
-            and scrubbed in a list format.
-        """
-        if self.cleaned_texts:
-            texts = self.cleaned_texts
-        else:
-            texts = self.texts
-
-        return texts
-
-    def get_scrubbed_data(self) -> pd.DataFrame:
+    def find_regex(
+        self,
+        texts: list[str],
+        text_ids: list,
+        pattern: str,
+        replacement: str,
+        label: str,
+        priority: float,
+    ) -> list[IDEnt]:
         """
-
+        General method to clean text using a regex pattern.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            pattern (str): Regex pattern to apply.
+            replacement (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            A
-            and scrubbed in a list format.
+            list[IDEnt]: A list of IDEnt objects.
         """
-        df = pd.DataFrame(self.scrubbed_data)

-        if self.
-
-
-
-
-
-
-
-        .
-
-
-
-
-
-
-
-
-
+        if self.replacement:
+            replacement = self.replacement
+
+        compiled = re.compile(pattern, re.IGNORECASE)
+        idents = []
+
+        for text_id, text in zip(text_ids, texts):
+            for match in compiled.finditer(text):
+                idents.append(
+                    self.IDEnt(
+                        text_id=text_id,
+                        text=match.group(),
+                        start=match.start(),
+                        end=match.end(),
+                        label=label,
+                        replacement=replacement,
+                        priority=priority,
+                        source="regex",
+                    )
+                )

-        return
+        return idents

-    def
+    def custom_regex(
+        self, texts: list[str] = None, text_ids: list = None, patterns: dict = None, source: str = "custom_regex"
+    ) -> list[IDEnt]:
         """
-
+        Remove text matching a custom regex pattern.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+            patterns (dict): {"name": {"pattern": r"John", "replacement": "[NAME]", "priority": 0.5}}
+            source (str): The methodological source of the scrubbed ident.
         Returns:
-
-        """
-
-        if any(label in key for key in self.scrubbed_data):
-            scrubbed_data = self.get_scrubbed_data()
-            count = scrubbed_data[label].dropna().apply(len).sum()
-        else:
-            count = 0
+            list[IDEnt]: A list of IDEnt objects.

-        self.logger.info(f"{count} {label} scrubbed.")
-
-        return count
-
-    def scrub_and_collect(self, match, text, replacement_text, i, label) -> str:
-        """
-        Scrub pattern match and collect scrubbed name.
-
-        Args:
-            match (str): The regex match passed from `re.sub()`.
-            i (int): the enumerate id of the string.
-            label (str): Label for the personal data removed.
-
-        Returns:
-            str: The replacement text.
         """

-
+        idents = []
+
+        for text, text_id in zip(texts, text_ids):
+            for label, params in patterns.items():
+                pattern = params["pattern"]
+                replacement = params.get("replacement", "[REDACTED]")
+                priority = params.get("priority", 0.5)
+
+                compiled = re.compile(pattern, flags=re.IGNORECASE)
+
+                for match in compiled.finditer(text):
+                    idents.append(
+                        self.IDEnt(
+                            text_id=text_id,
+                            text=match.group(),
+                            start=match.start(),
+                            end=match.end(),
+                            label=label,
+                            replacement=replacement,
+                            priority=priority,
+                            source=source,
+                        )
+                    )

-        return
+        return idents

-    def
+    def email_addresses(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[EMAIL_ADDRESS]",
+        label: str = "email_address",
+        priority: float = 0.7,
+    ) -> list[IDEnt]:
         """
-
+        Remove email addresses using regex e.g. `johnsmith@mail.com` identified.

         Args:
-
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-
-
-
-
-        if self.replacement_text:
-            replacement_text = self.replacement_text
-
-        cleaned_texts = [
-            compiled_pattern.sub(
-                partial(
-                    self.scrub_and_collect,
-                    text=text,
-                    replacement_text=replacement_text,
-                    i=i,
-                    label=label,
-                ),
-                text,
-            )
-            for i, text in zip(self.text_ids, texts)
-        ]
-
-        self.cleaned_texts = cleaned_texts
-
-        self.log_message(label)
-
-        return cleaned_texts
+        pattern = r"\b\S+@\S+\.\S+\b"
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

-    def
+    def urls(
         self,
-
-
-
-
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[URL]",
+        label: str = "url",
+        priority: float = 0.3,
+    ) -> list[IDEnt]:
         """
-        Remove
+        Remove `http`, `https` and `www` URLs using regex e.g. `www.google.com` identified.

-
-            custom_regex_patterns list[str]: Regex(s) pattern to apply.
-            custom_replacement_texts list[str]: The replacement texts for the removed text.
-                Defaults to '[REDACTED]' for all.
-            labels list[str]: Labels for patterns removed.
-
-        Returns:
-            list[str]: Cleaned texts.
-
-        """
-        self.logger.info("Scrubbing custom regex...")
-
-        if custom_replacement_texts:
-            assert len(custom_regex_patterns) == len(custom_replacement_texts), (
-                "There must be a replacement text for each pattern."
-            )
-        else:
-            custom_replacement_texts = ["[REDACTED]"] * len(custom_regex_patterns)
-
-        for i, (pattern, replacement_text) in enumerate(zip(custom_regex_patterns, custom_replacement_texts)):
-            if labels:
-                assert len(custom_regex_patterns) == len(labels), "There must be a label for each pattern."
-                self.scrub_regex(pattern, replacement_text, label=f"{labels[i]}")
-            else:
-                self.scrub_regex(pattern, replacement_text, label=f"custom_regex_{i + 1}")
-
-        return self.cleaned_texts
-
-    def email_addresses(self, replacement_text: str = "[EMAIL_ADDRESS]", label: str = "email_address") -> list[str]:
-        """
-        Remove email addresses using regex.
-        e.g. `johnsmith@gmail.com` scrubbed
+        `example.com` will not be scrubbed by this method.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-
-
-
-
+        pattern = r"\b(?:https?://|www\.)[^\s<>()\"']+"
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

-    def handles(
+    def handles(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[HANDLE]",
+        label: str = "handle",
+        priority: float = 0.4,
+    ) -> list[IDEnt]:
         """
-        Remove `@` user handles using regex
-        e.g. `@username` scrubbed
+        Remove `@` user handles using regex e.g. `@username` identified.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.logger.info("Scrubbing @user handles using regex...")
         pattern = r"@[\w.-]+(?=\b)"
-
-
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

     def google_phone_numbers(
-        self,
-
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        region: str = "GB",
+        replacement: str = "[PHONENO]",
+        label: str = "phone_number",
+        priority: float = 0.8,
+    ) -> list[IDEnt]:
         """
-        Remove phone numbers using Google's `phonenumbers
-        e.g. `+441234567891` scrubbed
+        Remove phone numbers using Google's `phonenumbers` e.g. `+441234567891` identified.

         Args:
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
             region (str): The region to find phone numbers for. See `phonenumbers` regions.
-
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.
+        if self.replacement:
+            replacement = self.replacement

-
+        idents = []

-
-            replacement_text = self.replacement_text
-
-        cleaned_texts = []
-
-        for i, text in zip(self.text_ids, texts):
+        for text, text_id in zip(texts, text_ids):
             matches = list(phonenumbers.PhoneNumberMatcher(text, region))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.log_message(label)
+            for match in matches:
+                idents.append(
+                    self.IDEnt(
+                        text_id=text_id,
+                        text=match.raw_string,
+                        start=match.start,
+                        end=match.end,
+                        priority=priority,
+                        replacement=replacement,
+                        label="phone_no",
+                        source="google_phone_numbers",
+                    )
+                )

-        return
+        return idents

-    def uk_phone_numbers(
+    def uk_phone_numbers(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[PHONENO]",
+        label: str = "uk_phone_number",
+        priority: float = 0.8,
+    ) -> list[IDEnt]:
         """
-        Remove phone numbers using regex.
-        e.g. `+441234567891` scrubbed
+        Remove phone numbers using regex e.g. `+441234567891` identified.

         Args:
-
+            texts (list[str]): Strings to scrub.
+                If None, current cleaned state of `text` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.logger.info("Scrubbing phone numbers using regex...")
         pattern = r"(\+?\d[\d\s]{7,}\d)"
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

-
-
-
+    def titles(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        strict: bool = False,
+        replacement: str = "[TITLE]",
+        label: str = "title",
+        priority: float = 0.4,
+    ) -> list[IDEnt]:
         """
         Remove titles using regex.

         Args:
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `text` passed at Class initiation used.
             strict (bool): Whether to use all of the titles or only essential titles.
                 If strict, you may find scrubbing of common words, such as general.
-
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

         titles = [
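Every 2.0.0 detector now returns `IDEnt` records instead of mutating the texts in place. A hedged sketch of the detector API using only the signatures in this hunk (module path and sample text are illustrative):

from idscrub.scrub import IDScrub  # assumed module path

texts = ["Write to jane@example.org or @jane_doe"]
scrub = IDScrub(texts=texts)

emails = scrub.email_addresses(texts=texts, text_ids=[1])   # priority 0.7
handles = scrub.handles(texts=texts, text_ids=[1])          # priority 0.4
# Note: the handle pattern also matches "@example.org" inside the email;
# the higher-priority email ident wins later in resolve_overlaps().

custom = scrub.custom_regex(
    texts=texts,
    text_ids=[1],
    patterns={"name": {"pattern": r"Jane", "replacement": "[NAME]", "priority": 0.5}},
)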
@@ -413,103 +460,109 @@ class IDScrub:
         titles += [title + "." for title in titles]
         titles += [title + ":" for title in titles]

-        self.logger.info("Scrubbing titles using regex...")
         pattern = r"\b(?:{})\b".format("|".join(re.escape(t) for t in titles))
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

-
-
-
+    def ip_addresses(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[IPADDRESS]",
+        label: str = "ip_address",
+        priority: float = 0.5,
+    ) -> list[IDEnt]:
         """
-        Removes IP addresses.
-        e.g. `192.168.1.1` scrubbed
+        Removes IP addresses e.g. `192.168.1.1` identified.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.logger.info("Scrubbing IP addresses using regex...")
         pattern = r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

-
-
-
+    def uk_postcodes(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[POSTCODE]",
+        label: str = "uk_postcode",
+        priority: float = 0.5,
+    ) -> list[IDEnt]:
         """
-        Removes postcodes.
-        e.g. `A11 1AA` scrubbed
+        Removes postcodes e.g. `A11 1AA` identified.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.logger.info("Scrubbing postcodes using regex...")
         pattern = r"\b(?:(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)[ \t]*[0-9][A-Z]{2}|GIR[ \t]*0A{2}|SAN[ \t]*TA1|ASCN[ \t]*1ZZ|STHL[ \t]*1ZZ|TDCU[ \t]*1ZZ|BBND[ \t]*1ZZ|[BFS]IQ{2}[ \t]*1ZZ|GX11[ \t]*1AA|PCRN[ \t]*1ZZ|TKCA[ \t]*1ZZ|AI-?[0-9]{4}|BFPO[ \t-]?[0-9]{2,4}|MSR[ \t-]?1(?:1[12]|[23][135])0|VG[ \t-]?11[1-6]0|KY[1-3][ \t-]?[0-2][0-9]{3})\b"
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

-
-
-
+    def uk_addresses(
+        self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        replacement: str = "[ADDRESS]",
+        label: str = "uk_address",
+        priority: float = 0.8,
+    ) -> list[IDEnt]:
         """
         Removes addresses.
-        e.g. `10 Downing Street`
+        e.g. `10 Downing Street` and `10, Downing Street` identified.

         Args:
-
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

-        Returns:
-            list[str]: The input list of text with postcodes replaced.
-        """
-
-        self.logger.info("Scrubbing addresses using regex...")
-        pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
-
-        return self.scrub_regex(pattern, replacement_text, label)
-
-    def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
-        """
-        Removes claimant names from employment tribunal texts.
-        e.g. `Claimant: Jim Smith` scrubbed

-        Args:
-            None
         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.
-
-
-
-
-
-        cleaned_texts = []
-
-        for i, text in zip(self.text_ids, texts):
-
-            def replace_claimant(match):
-                nonlocal claimant_name
-                claimant_name = match.group(2).strip()
-                return f"{match.group(1)}[CLAIMANT] "
-
-            cleaned = re.sub(r"[\r\n]", " ", text)
-
-            cleaned = re.sub(r"(Claimant\s*:\s*)(.*?)(?=\bRespondents?\s*:)", replace_claimant, cleaned)
-
-            if claimant_name:
-                cleaned = re.sub(re.escape(claimant_name), replacement_text, cleaned)
-                self.scrubbed_data.append({self.text_id_name: i, label: claimant_name})
-
-            cleaned_texts.append(cleaned)
-
-        self.cleaned_texts = cleaned_texts
+        if self.texts and self.text_ids:
+            texts = self.texts
+            text_ids = self.text_ids
+        else:
+            texts = texts
+            text_ids = text_ids

-
+        pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?,?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
+        return self.find_regex(
+            texts=texts, text_ids=text_ids, pattern=pattern, label=label, replacement=replacement, priority=priority
+        )

     def get_spacy_model(self, model_name: str = "en_core_web_trf") -> Language:
         """
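The UK-specific detectors are thin wrappers over `find_regex`. To see the core of the postcode pattern in isolation, here is a standalone sketch using only Python's `re` (a simplified mainland-format subset of the full pattern above):

import re

# Simplified core of the UK postcode regex from this diff (mainland format only).
pattern = re.compile(r"\b[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?[ \t]*[0-9][A-Z]{2}\b")
print(pattern.findall("Send it to A11 1AA or SW1A 2AA."))  # ['A11 1AA', 'SW1A 2AA']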
@@ -548,86 +601,69 @@ class IDScrub:

     def spacy_entities(
         self,
+        texts: list[str] = None,
+        text_ids: list = None,
         model_name: str = "en_core_web_trf",
-
-        replacement_map:
-
+        entity_types: list[str] = ["PERSON", "ORG", "NORP"],
+        replacement_map: dict = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
+        priority: float = 1.0,
         n_process: int = 1,
         batch_size: int = 1000,
-    ) -> list[
+    ) -> list[IDEnt]:
         """
-        Remove SpaCy
+        Remove SpaCy idents using a given SpaCy model.
         Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
         Note: only "en_core_web_trf" has been evaluated.

         Args:
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
             model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
-
-            replacement_map (str): The replacement texts for the removed text.
+            entity_types (list[str]): Which SpaCy idents to scrub (based on SpaCy entity keys).
+            replacement_map (str): The replacement texts for the removed text. Key is entity type, value is replacement.
             label_prefix (str): Prefix for the Spacy entity removed, e.g. `{label}_person`.
             n_process (int): Number of parallel processes.
             batch_size (int): The number of texts in each batch.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[
+            list[IDEnt]: A list of IDEnt objects.
         """

-        self.logger.info(
-            f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
-        )
-
-        texts = self.get_texts()
-
-        cleaned_texts = []
-        labels = []
-
         nlp = self.get_spacy_model(model_name)
         stripped_texts = [s.strip() if s.isspace() else s for s in texts]
-
-
-        for i, (ids, doc, stripped_text) in tqdm(
-            enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
-        ):
-            if not stripped_text:
-                cleaned_texts.append(texts[i])
-                continue
-
-            all_found_entities = []
-
-            for entity_type in entities:
-                found = [
-                    ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
-                ]
-
-                for ent in found:
-                    label = ent.label_.lower()
-                    if label_prefix:
-                        label = f"{label_prefix}_{label}"
-                    labels.append(label)
-                    self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
-
-                if self.replacement_text:
-                    all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
-                elif replacement_map:
-                    all_found_entities.extend(
-                        (ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
-                    )
-                else:
-                    all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
-
-            cleaned = stripped_text
-
-            for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
-                cleaned = cleaned[:start] + repl + cleaned[end:]
+        docs = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)

-
+        idents = []

-
-
-
-
+        for doc, text_id in zip(docs, text_ids):
+            for ent in doc.ents:
+                if ent.label_ not in entity_types:
+                    continue
+                if self.replacement:
+                    replacement = self.replacement
+                elif replacement_map:
+                    replacement = replacement_map.get(ent.label_, "[REDACTED]")
+                else:
+                    replacement = f"[{ent.label_}]"
+
+                idents.append(
+                    self.IDEnt(
+                        text_id=text_id,
+                        text=ent.text,
+                        start=ent.start_char,
+                        end=ent.end_char,
+                        priority=priority,
+                        replacement=replacement,
+                        label=ent.label_.lower(),
+                        source="spacy",
+                    )
+                )

-        return
+        return idents
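A standalone sketch of the span-collection pattern `spacy_entities` now uses, written against the public spaCy API only (en_core_web_sm is swapped in for the evaluated en_core_web_trf, and must already be installed):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: model downloaded beforehand
entity_types = {"PERSON", "ORG", "NORP"}

for doc in nlp.pipe(["Jane Smith works at Acme Corp."]):
    for ent in doc.ents:
        if ent.label_ in entity_types:
            # character offsets, not token indices, feed the IDEnt record
            print(ent.label_, ent.start_char, ent.end_char, ent.text)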

     def get_hf_model(
         self,
@@ -666,41 +702,46 @@ class IDScrub:

     def huggingface_entities(
         self,
+        texts: list[str] = None,
+        text_ids: list = None,
+        entity_type="PER",
+        replacement: str = "[PERSON]",
+        label: str = "person",
+        priority: float = 1.0,
         hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
         download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
-
-        replacement_text: str = "[PERSON]",
-        label: str = "person",
-        batch_size: int = 8,
-    ) -> list[str]:
+    ) -> list[IDEnt]:
         """
-        Remove
+        Remove idents using a Hugging Face model. Default is a PERSON entity identifier.
         Note: No Hugging Face models have been evaluated for performance.

         Args:
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+            entity_type (str): Which entity to scrub (based on particular model keys).
+                If None, current cleaned state of `texts` passed at Class initiation used.
             hf_model_path (str): Path to the Hugging Face model.
                 Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
             download_directory (str): Directory in which to save the model.
                 Default is current working directory.
-
+            replacement (str): The replacement text for the removed text.
             label (str): Label for the personal data removed.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.
             batch_size (int): Number of texts passed to the model in each batch.
                 Memory (instance size) dependent.

         Returns:
-            list[str]: The input list of text with PERSON
+            list[str]: The input list of text with PERSON idents replaced.

         """

-        self.
+        if self.replacement:
+            replacement = self.replacement

         tokenizer = self.get_hf_model(hf_model_path=hf_model_path, download_directory=download_directory)

-        texts = self.get_texts()
-
-        if self.replacement_text:
-            replacement_text = self.replacement_text
-
         try:
             names_model = AutoModelForTokenClassification.from_pretrained(hf_model_path)
         except OSError:
@@ -708,74 +749,72 @@ class IDScrub:
                 f"Hugging Face model `{hf_model_path}` does has not been downloaded correctly. Please delete `huggingface/` and retry."
             )

-
-
-
-
-
-
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.cleaned_texts = cleaned_texts
-
-        self.log_message(label)
+        ner = pipeline(task="ner", model=names_model, tokenizer=tokenizer, aggregation_strategy="simple")
+
+        idents = []
+
+        results = ner(texts)
+
+        for ents, text_id in zip(results, text_ids):
+            for ent in ents:
+                if ent["entity_group"] != entity_type:
+                    continue
+                idents.append(
+                    self.IDEnt(
+                        text_id=text_id,
+                        text=ent["word"],
+                        start=ent["start"],
+                        end=ent["end"],
+                        priority=priority,
+                        replacement=replacement,
+                        label=label,
+                        source="huggingface",
+                    )
+                )

-        return
+        return idents
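A standalone sketch of the aggregation step the new `huggingface_entities` relies on, using the transformers API directly (model name as tested in this diff; downloading it is left to the first run):

from transformers import pipeline

ner = pipeline(
    task="ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",  # merges word pieces into whole entities
)
for ent in ner("My name is Wolfgang and I live in Berlin"):
    # each result dict carries entity_group, word, start, end
    print(ent["entity_group"], ent["word"], ent["start"], ent["end"])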
     def presidio_entities(
         self,
+        texts: list[str] = None,
+        text_ids: list = None,
         model_name: str = "en_core_web_trf",
-
+        entity_types: list[str] = [
             "PERSON",
+            "EMAIL_ADDRESS",
             "UK_NINO",
             "UK_NHS",
             "CREDIT_CARD",
             "CRYPTO",
             "MEDICAL_LICENSE",
-            "
+            "SWIFT_CODE",
             "IBAN_CODE",
+            "LOCATION",
+            "NRP",
         ],
-        replacement_map:
-
-    ) -> list[
+        replacement_map: dict = {},
+        priority: float = 1.0,
+    ) -> list[IDEnt]:
         """
-        Scrub specified
+        Scrub specified idents from texts using Presidio.

         See https://microsoft.github.io/presidio/supported_entities/ for further detail.

         Args:
+            texts (list[str]): Strings to scrub.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+                If None, current cleaned state of `texts` passed at Class initiation used.
             model_name (str): spaCy model to use
-
-            replacement_map (
-
-
+            entity_types (list[str]): entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
+            replacement_map (str): The replacement texts for the removed text. Key is entity type, value is replacement.
+            priority (float): Priority score for overlapping entities.
+                Higher scored entities are scrubbed where an overlap occurs.
+                The scores are relative e.g. 0.2 beats 0.1.

         Returns:
-            list[str]: The input list of text with
+            list[str]: The input list of text with idents replaced.
         """

-        self.logger.info(
-            f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
-        )
-
-        texts = self.get_texts()
-
-        cleaned_texts = []
-
         class LoadedSpacyNlpEngine(SpacyNlpEngine):
             def __init__(self, loaded_spacy_model):
                 super().__init__()
@@ -785,199 +824,320 @@ class IDScrub:
         loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)

         analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine)
-        anonymizer = AnonymizerEngine()

-
-        all_labels = []
+        idents = []

-
+        for text, text_id in zip(texts, text_ids):
+            results = analyzer.analyze(text=text, language="en", entities=entity_types)
+            for res in results:
+                if res.entity_type not in entity_types:
+                    continue
+
+                if self.replacement:
+                    replacement = self.replacement
+                elif replacement_map:
+                    replacement = replacement_map.get(res.entity_type, "[REDACTED]")
+                else:
+                    replacement = f"[{res.entity_type}]"
+
+                idents.append(
+                    self.IDEnt(
+                        text_id=text_id,
+                        text=text[res.start : res.end],
+                        start=res.start,
+                        end=res.end,
+                        priority=priority,
+                        replacement=replacement,
+                        label=res.entity_type.lower(),
+                        source="presidio",
+                    )
+                )

-
-        if stripped_text == "":
-            cleaned_texts.append(texts[i])
-            continue
+        return idents

-
-
+    def group_idents(self, idents: list[IDEnt]) -> dict[int | str | float, list[IDEnt]]:
+        """
+        Group a list of IDEnt objects by `text_id`.

-
-
-        else:
-            labels = [f"{res.entity_type.lower()}" for res in results]
+        Each unique `text_id` becomes a dictionary key,
+        and its value is a list of all IDEnt objects associated with that ID.

-
-
+        Args:
+            idents (list[IDEnt]) A list of IDEnt objects.

-
-
-
-        )
+        Returns:
+            dict[int | str | float, list[IDEnt]]: A dictionary mapping each text_id to a list of IDEnt objects.
+        """

-
-        operators = {
-            res.entity_type: OperatorConfig("replace", {"new_value": self.replacement_text}) for res in results
-        }
-        elif replacement_map:
-            operators = {
-                res.entity_type: OperatorConfig("replace", {"new_value": replacement_map.get(res.entity_type)})
-                for res in results
-            }
-        else:
-            operators = {
-                res.entity_type: OperatorConfig("replace", {"new_value": f"[{res.entity_type}]"}) for res in results
-            }
+        idents_grouped = defaultdict(list)

-
+        for ident in idents:
+            idents_grouped[ident.text_id].append(ident)

-
+        return idents_grouped

-
+    def resolve_overlaps(self, idents: list[IDEnt]) -> list[IDEnt]:
+        """
+        Select the highest-scoring non-overlapping idents.

-
-
+        Resolves conflicts between idents that overlap in their
+        character ranges. Entities are first sorted by descending priority and then by
+        start position to ensure a priority order.

-
+        Each IDEnt is accepted only if it does not overlap with any IDEnt
+        already selected. The resulting set of idents is returned in ascending
+        document order.

-
-
-        Use all regex methods to remove personal information from text.
+        A IDEnt is considered overlapping if:
+            IDEnt.start <= other.end and IDEnt.end >= other.start

         Args:
-
+            idents (list[IDEnt]) A list of IDEnt objects.

         Returns:
-            list[
-
+            list[IDEnt]: A list of non-overlapping idents, sorted by their start position.
         """

-        self.
-        self.handles()
-        self.ip_addresses()
-        self.uk_phone_numbers()
-        self.uk_addresses()
-        self.uk_postcodes()
-        self.titles()
+        idents_grouped = self.group_idents(idents)

-
+        resolved = []

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ) -> list[str]:
+        for text_id, idents in idents_grouped.items():
+            if not idents:
+                return []
+
+            idents_by_score = sorted(idents, key=lambda ident: (-ident.priority, ident.start))
+
+            kept_idents = []
+
+            for current_ident in idents_by_score:
+                has_overlap = any(
+                    current_ident.start <= existing_ident.end and current_ident.end >= existing_ident.start
+                    for existing_ident in kept_idents
+                )
+
+                if not has_overlap:
+                    kept_idents.append(current_ident)
+
+            resolved.extend(kept_idents)
+
+        return resolved
+
+    def scrub_text(self, texts: str = None, text_ids: list = None, idents: list[IDEnt] = None):
         """
-
+        Apply a set of non-overlapping replacement idents to a text.
+
+        Each IDEnt specifies a character range to replace (`IDEnt.start` to `IDEnt.end`)
+        and a `replacement` string that will be inserted in place of that range.

         Args:
-
-
-
-
-            batch_size (int): The number of texts in each batch.
+            texts list[str]: The original input text with overlaps resolved.
+            text_ids (list): A list of identifiers that correspond to each string in `texts`.
+            idents list[IDEnt]: a list of IDEnt objects. Must be non-overlapping.
+                See `resolve_conflicts`.

-
-
+        Return:
+            str: A scrubbed string with all replacements applied.
         """

-        if
-        self
-
-
-
+        if texts is None:
+            texts = getattr(self, "texts", None)
+        if text_ids is None:
+            text_ids = getattr(self, "text_ids", None)
+        if idents is None:
+            idents = getattr(self, "idents", None)

-
-
-        self.google_phone_numbers()
-        self.all_regex()
+        if texts is None or text_ids is None or idents is None:
+            raise ValueError("texts, text_ids, and idents must be provided or set on self.")

-
+        if len(texts) != len(text_ids):
+            raise ValueError("texts and text_ids must be the same length.")

-
-
-        Scrubs text using given methods (in order).
-        Uses default values for the given scrub method.
+        scrubbed_texts = list(texts)
+        idents_grouped = self.group_idents(idents)

-
+        for i, text_id in enumerate(text_ids):
+            text = texts[i]

-
-
-            "titles", "presidio"
+            group = idents_grouped.get(text_id, [])
+            sorted_group = sorted(group, key=lambda ident: ident.start, reverse=True)

-
+            for ident in sorted_group:
+                text = text[: ident.start] + ident.replacement + text[ident.end :]

-
+            scrubbed_texts[i] = text

-
+        return scrubbed_texts

-
+    def scrub(
+        self,
+        pipeline: list[dict] = [
+            {"method": "presidio_entities"},
+            {"method": "spacy_entities"},
+            {"method": "email_addresses"},
+            {"method": "handles"},
+            {"method": "ip_addresses"},
+            {"method": "uk_addresses"},
+            {"method": "uk_phone_numbers"},
+            {"method": "google_phone_numbers"},
+            {"method": "uk_postcodes"},
+            {"method": "urls"},
+            {"method": "titles"},
+        ],
+    ):
+        """
+        Scrubs text using given methods.
+        Uses default values for the given scrub method.

         Args:
-
+            pipeline (list[dict]): Scrub methods and their method parameters to apply.
+                Methods are specified with "method" key.
+                Parameters are specified with argument name as "key" and argument value as value.
+
+                Example: IDScrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"])
+
+                See associated method docstring for further parameters e.g. ?IDScrub.spacy_entities.
+
+                Methods available:
+
+                "spacy_entities", "huggingface_entities", "email_addresses", "handles",
+                "ip_addresses", "uk_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes"
+                "titles", "presidio_entities"
+
+                Each method takes a `priority` argument. Higher priority scored entities
+                are scrubbed where an overlap occurs. The scores are relative.

         Returns:
-            list[str]: The input
+            list[str]: The input texts scrubbed of personal data.

         """

-
+        if not isinstance(pipeline, list):
+            raise TypeError("Argument `pipeline` must be a list of dicts.")
+
+        self.idents_all = []
+        self.idents = []
+
+        for step in pipeline:
+            scrub_method = step["method"]
+            args = {k: v for k, v in step.items() if k != "method"}
+
+            if args:
+                self.logger.info(f"Scrubbing using {scrub_method} with parameters {args}...")
+            else:
+                self.logger.info(f"Scrubbing using {scrub_method} with default parameters...")
+
             try:
                 method = getattr(self, scrub_method)
-                method()
             except AttributeError:
                 self.logger.warning("Not a scrub method.")

-
+            self.idents_all.extend(method(texts=self.texts, text_ids=self.text_ids, **args))
+
+        idents_exclude = [ident for ident in self.idents_all if ident.text not in self.exclude]
+        idents_resolved = self.resolve_overlaps(idents=idents_exclude)
+        self.idents.extend(idents_resolved)
+        self.scrubbed_texts = self.scrub_text(texts=self.texts, text_ids=self.text_ids, idents=self.idents)
+
+        return self.scrubbed_texts
+
+    def get_all_identified_data(self) -> pd.DataFrame:
+        """
+        Get all of the identified data before overlaps have been resolved.
+
+        Each row is a identified entity. Columns are the IDEnt attributes.
+
+        Args:
+            None
+        Return:
+            pd.DataFrame: All identified data and their attributes.
+        """
+        all_idents = pd.DataFrame([asdict(ident) for ident in self.idents_all])
+        return all_idents
+
+    def get_scrubbed_data(self) -> pd.DataFrame:
+        """
+        Create a DataFrame summarising scrubbed text idents grouped by text ID and label.
+
+        Each row corresponds to a unique `text_id`, and each column represents a IDEnt label.
+        The cell values are lists of the IDEnt text values associated with that label for the given text ID.
+        Args:
+            None
+        Return:
+            pd.DataFrame: All data scrubbed from text.
+        """
+        data = defaultdict(lambda: defaultdict(list))
+
+        for ident in self.idents:
+            data[ident.text_id][ident.label].append(ident.text)
+
+        df = pd.DataFrame.from_dict(data, orient="index")
+        df = df.reset_index().rename(columns={"index": self.text_id_name})
+        df = df.where(pd.notna(df), None)
+
+        return df

     @staticmethod
     def dataframe(
         df: pd.DataFrame = None,
         id_col: str = None,
         exclude_cols: list = None,
-
+        pipeline: list[dict] = [
+            {"method": "presidio_entities"},
+            {"method": "spacy_entities"},
+            {"method": "email_addresses"},
+            {"method": "handles"},
+            {"method": "ip_addresses"},
+            {"method": "uk_addresses"},
+            {"method": "uk_phone_numbers"},
+            {"method": "google_phone_numbers"},
+            {"method": "uk_postcodes"},
+            {"method": "urls"},
+            {"method": "titles"},
+        ],
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         """
         Scrubs all personal data from a Pandas Dataframe.

         Args:
             df (pd.DataFrame): A Pandas dataframe to scrub.
-            id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `
+            id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `text_id` is applied.
             exclude_cols (list): Columns to exclude from scrubbing. if None all columns are scrubbed.
-
-
+            pipeline (list[dict]): Scrub methods and their method parameters to apply.
+                Methods are specified with "method" key.
+
+                Example: IDScrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"])
+
+                See associated method docstring for further parameters e.g. ?IDScrub.spacy_entities.
+
+                Methods available:
+
+                "spacy_entities", "huggingface_entities", "email_addresses", "handles",
+                "ip_addresses", "uk_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes"
+                "titles", "presidio_entities"
+
+                Each method takes a `priority` argument. Higher priority scored entities
+                are scrubbed where an overlap occurs. The scores are relative.

         Returns:
             tuple[pd.DataFrame, pd.DataFrame]: The input dataframe with all personal data removed and a dataframe with the personal data that has been removed.

         """

-
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError("`df` must be a Pandas DataFrame.")

-        if id_col:
-            ids = df[id_col].to_list()
-        if not id_col:
-            id_col = "id"
+        if id_col is None:
             ids = range(1, len(df) + 1)
+            id_col = "id"
+        else:
+            if id_col not in df.columns:
+                raise ValueError(f"`id_col` '{id_col}' is not a column in df.")

-
-
+            ids = df[id_col].tolist()
+
+        if not len(df) == len(ids):
+            raise ValueError("Length of dataframe is different to the length of IDs.")

         if exclude_cols is None:
             cols_to_scrub = df.columns.to_list()
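Taken together, 2.0.0 replaces the old in-place mutation flow with identify → exclude → resolve overlaps → replace. A hedged end-to-end sketch, assuming the API exactly as shown in this hunk (regex-only pipeline, so no models need downloading; output shown is the expected result, not a captured run):

from idscrub.scrub import IDScrub  # assumed module path

scrub = IDScrub(texts=["Email johnsmith@mail.com or visit www.example.com"])

# Regex-only pipeline; model-backed steps (presidio/spacy/huggingface) omitted.
clean = scrub.scrub(pipeline=[
    {"method": "email_addresses"},
    {"method": "urls", "priority": 0.9},  # per-step parameter override
])
print(clean)                      # expected: ['Email [EMAIL_ADDRESS] or visit [URL]']
print(scrub.get_scrubbed_data())  # one row per text_id, one column per label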
@@ -994,16 +1154,17 @@ class IDScrub:
             original_dtype = scrubbed_df[col].dtype
             scrubbed_df[col] = scrubbed_df[col].astype(str)

-            scrub = IDScrub(texts=scrubbed_df[col].to_list(),
+            scrub = IDScrub(texts=scrubbed_df[col].to_list(), text_ids=ids)
             scrub.logger.info(f"Scrubbing column `{col}`...")

-            scrubbed_texts = scrub.scrub(
+            scrubbed_texts = scrub.scrub(pipeline=pipeline)
             scrubbed_df[col] = scrubbed_texts

             scrubbed_data = scrub.get_scrubbed_data()

             if scrubbed_data is not None:
                 scrubbed_data.insert(1, "column", col)
+                scrubbed_data.rename(columns={"text_id": id_col}, inplace=True)
                 all_scrubbed_data.append(scrubbed_data)

             try:
@@ -1013,8 +1174,14 @@ class IDScrub:
             pass

         all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
+        all_scrubbed_data["column"] = pd.Categorical(
+            all_scrubbed_data["column"], categories=cols_to_scrub, ordered=True
+        )
+        all_scrubbed_data = all_scrubbed_data.sort_values(by=["column", id_col]).reset_index(drop=True)
+        all_scrubbed_data["column"] = all_scrubbed_data["column"].astype(str)
         all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)

-
+        if not df.shape == scrubbed_df.shape:
+            raise ValueError("Original and scrubbed dataframe not the same shape. Check input DataFrame.")

         return scrubbed_df, all_scrubbed_data