idscrub 0.2.2.tar.gz → 1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {idscrub-0.2.2 → idscrub-1.0.0}/PKG-INFO +2 -2
- {idscrub-0.2.2 → idscrub-1.0.0}/README.md +1 -1
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub/scrub.py +73 -88
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/PKG-INFO +2 -2
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/SOURCES.txt +1 -1
- {idscrub-0.2.2 → idscrub-1.0.0}/notebooks/basic_usage.ipynb +153 -161
- idscrub-1.0.0/test/conftest.py +22 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_all.py +3 -3
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_chain.py +7 -7
- idscrub-1.0.0/test/test_dataframe.py +160 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_huggingface.py +1 -1
- idscrub-1.0.0/test/test_label.py +17 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_log.py +3 -3
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_persidio.py +2 -2
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_regex.py +8 -8
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_scrub.py +4 -4
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_spacy.py +1 -3
- idscrub-0.2.2/SECURITY.md +0 -47
- idscrub-0.2.2/test/conftest.py +0 -12
- idscrub-0.2.2/test/test_dataframe.py +0 -51
- {idscrub-0.2.2 → idscrub-1.0.0}/.github/pull_request_template.md +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/.github/workflows/cd.yml +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/.github/workflows/ci.yml +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/.gitignore +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/.pre-commit-config.yaml +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/CODEOWNERS +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/LICENSE +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/Makefile +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/SECURITY_CHECKLIST.md +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub/__init__.py +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub/locations.py +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/dependency_links.txt +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/requires.txt +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/top_level.txt +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/pyproject.toml +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/setup.cfg +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_id.py +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/test/test_phonenumbers.py +0 -0
- {idscrub-0.2.2 → idscrub-1.0.0}/uv.lock +0 -0
{idscrub-0.2.2 → idscrub-1.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: idscrub
-Version: 0.2.2
+Version: 1.0.0
 Author: Department for Business and Trade
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -21,7 +21,7 @@ Dynamic: license-file
 
 # idscrub 🧽✨
 
-* Names and other personally identifying information are often present in text.
+* Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
 * This information may need to be removed prior to further analysis in many cases.
 * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
 
{idscrub-0.2.2 → idscrub-1.0.0}/README.md

@@ -1,6 +1,6 @@
 # idscrub 🧽✨
 
-* Names and other personally identifying information are often present in text.
+* Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
 * This information may need to be removed prior to further analysis in many cases.
 * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
 
{idscrub-0.2.2 → idscrub-1.0.0}/idscrub/scrub.py

@@ -138,52 +138,51 @@ class IDScrub:
 
         return grouped
 
-    def log_message(self,
+    def log_message(self, label) -> None:
         """
         Log message with count of PII-type scrubbed.
 
         Args:
-
+            label (str): Label for the personal data removed.
         Returns:
             int: The count of PII-type scrubbed.
         """
 
-        if any(
+        if any(label in key for key in self.scrubbed_data):
            scrubbed_data = self.get_scrubbed_data()
-            count = scrubbed_data[
+            count = scrubbed_data[label].dropna().apply(len).sum()
         else:
             count = 0
 
-
-        self.logger.info(f"{count} {label_name} scrubbed.")
+        self.logger.info(f"{count} {label} scrubbed.")
 
         return count
 
-    def scrub_and_collect(self, match, text, replacement_text, i,
+    def scrub_and_collect(self, match, text, replacement_text, i, label) -> str:
         """
         Scrub pattern match and collect scrubbed name.
 
         Args:
             match (str): The regex match passed from `re.sub()`.
             i (int): the enumerate id of the string.
-
+            label (str): Label for the personal data removed.
 
         Returns:
             str: The replacement text.
         """
 
-        self.scrubbed_data.append({self.text_id_name: i,
+        self.scrubbed_data.append({self.text_id_name: i, label: match.group()})
 
         return replacement_text
 
-    def scrub_regex(self, pattern, replacement_text,
+    def scrub_regex(self, pattern, replacement_text, label) -> list[str]:
         """
         General method to clean text using a regex pattern.
 
         Args:
             pattern (str): Regex pattern to apply.
             replacement_text (str): The replacement text for the removed text.
-
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: Cleaned texts.
@@ -203,7 +202,7 @@ class IDScrub:
                     text=text,
                     replacement_text=replacement_text,
                     i=i,
-
+                    label=label,
                 ),
                 text,
             )
@@ -212,7 +211,7 @@ class IDScrub:
 
         self.cleaned_texts = cleaned_texts
 
-        self.log_message(
+        self.log_message(label)
 
         return cleaned_texts
 
@@ -220,6 +219,7 @@ class IDScrub:
         self,
         custom_regex_patterns: list[str] = None,
         custom_replacement_texts: list[str] = None,
+        labels: list[str] = None,
     ) -> list[str]:
         """
         Remove text matching a custom regex pattern.
@@ -228,6 +228,7 @@ class IDScrub:
             custom_regex_patterns list[str]: Regex(s) pattern to apply.
             custom_replacement_texts list[str]: The replacement texts for the removed text.
                 Defaults to '[REDACTED]' for all.
+            labels list[str]: Labels for patterns removed.
 
         Returns:
             list[str]: Cleaned texts.
@@ -243,17 +244,22 @@ class IDScrub:
             custom_replacement_texts = ["[REDACTED]"] * len(custom_regex_patterns)
 
         for i, (pattern, replacement_text) in enumerate(zip(custom_regex_patterns, custom_replacement_texts)):
-
+            if labels:
+                assert len(custom_regex_patterns) == len(labels), "There must be a label for each pattern."
+                self.scrub_regex(pattern, replacement_text, label=f"{labels[i]}")
+            else:
+                self.scrub_regex(pattern, replacement_text, label=f"custom_regex_{i + 1}")
 
         return self.cleaned_texts
 
-    def email_addresses(self, replacement_text="[EMAIL_ADDRESS]") -> list[str]:
+    def email_addresses(self, replacement_text: str = "[EMAIL_ADDRESS]", label: str = "email_address") -> list[str]:
         """
         Remove email addresses using regex.
         e.g. `johnsmith@gmail.com` scrubbed
 
         Args:
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with email addresses replaced.
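The hunk above gives the custom-pattern scrubber a per-pattern `labels` argument, falling back to `custom_regex_{i + 1}` when none is supplied. A minimal sketch of how it might be called; `custom_regex` is a hypothetical method name (the method's real name is not visible in this diff), and the import path and constructor call are assumptions:

```python
# Hedged sketch: `custom_regex` is a hypothetical method name; the keyword
# arguments are the ones added in this hunk.
from idscrub import IDScrub  # assumed import path; possibly idscrub.scrub

scrub = IDScrub(texts=["Case ref ABC-1234 raised by staff ID 99887"])

scrub.custom_regex(
    custom_regex_patterns=[r"ABC-\d{4}", r"staff ID \d{5}"],
    custom_replacement_texts=["[CASE_REF]", "[STAFF_ID]"],
    labels=["case_ref", "staff_id"],  # one label per pattern, else AssertionError
)
```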
@@ -262,15 +268,16 @@ class IDScrub:
         self.logger.info("Scrubbing email addresses using regex...")
         pattern = r"\b\S+@\S+\.\S+\b"
 
-        return self.scrub_regex(pattern, replacement_text,
+        return self.scrub_regex(pattern, replacement_text, label=label)
 
-    def handles(self, replacement_text: str = "[HANDLE]") -> list[str]:
+    def handles(self, replacement_text: str = "[HANDLE]", label: str = "handle") -> list[str]:
         """
         Remove `@` user handles using regex
         e.g. `@username` scrubbed
 
         Args:
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with handles replaced.
@@ -279,9 +286,11 @@ class IDScrub:
         self.logger.info("Scrubbing @user handles using regex...")
         pattern = r"@[\w.-]+(?=\b)"
 
-        return self.scrub_regex(pattern, replacement_text,
+        return self.scrub_regex(pattern, replacement_text, label=label)
 
-    def google_phone_numbers(
+    def google_phone_numbers(
+        self, region: str = "GB", replacement_text: str = "[PHONENO]", label: str = "phone_number"
+    ) -> list[str]:
         """
         Remove phone numbers using Google's `phonenumbers`.
         e.g. `+441234567891` scrubbed
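The same pattern repeats through the rest of the file: every built-in scrubber gains a `label` keyword with a sensible default, and that label is what `log_message()` reports and what keys the scrubbed data. A hedged sketch (method names and defaults come from this diff; the import path and constructor call are assumptions):

```python
# Hedged sketch of the new per-method `label` keyword.
from idscrub import IDScrub  # assumed import path

scrub = IDScrub(texts=["Email jane.doe@example.com or message @janedoe"])

scrub.email_addresses(label="work_email")   # logged as "... work_email scrubbed."
cleaned = scrub.handles(label="social_handle")

print(cleaned)
print(scrub.get_scrubbed_data())  # scrubbed values keyed by the chosen labels
```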
@@ -289,6 +298,7 @@ class IDScrub:
         Args:
             region (str): The region to find phone numbers for. See `phonenumbers` regions.
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with phone numbers replaced.
@@ -308,7 +318,7 @@ class IDScrub:
             phone_nos = [match.raw_string for match in matches]
 
             for phone_no in phone_nos:
-                self.scrubbed_data.append({self.text_id_name: i,
+                self.scrubbed_data.append({self.text_id_name: i, label: phone_no})
 
             cleaned = text
             for match in reversed(matches):
@@ -318,17 +328,18 @@ class IDScrub:
 
         self.cleaned_texts = cleaned_texts
 
-        self.log_message(
+        self.log_message(label)
 
         return cleaned_texts
 
-    def uk_phone_numbers(self, replacement_text: str = "[PHONENO]") -> list[str]:
+    def uk_phone_numbers(self, replacement_text: str = "[PHONENO]", label: str = "uk_phone_number") -> list[str]:
         """
         Remove phone numbers using regex.
         e.g. `+441234567891` scrubbed
 
         Args:
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with phone numbers replaced.
@@ -337,9 +348,9 @@ class IDScrub:
         self.logger.info("Scrubbing phone numbers using regex...")
         pattern = r"(\+?\d[\d\s]{7,}\d)"
 
-        return self.scrub_regex(pattern, replacement_text,
+        return self.scrub_regex(pattern, replacement_text, label=label)
 
-    def titles(self, strict: bool = False, replacement_text: str = "[TITLE]") -> list[str]:
+    def titles(self, strict: bool = False, replacement_text: str = "[TITLE]", label: str = "title") -> list[str]:
         """
         Remove titles using regex.
 
@@ -347,6 +358,7 @@ class IDScrub:
             strict (bool): Whether to use all of the titles or only essential titles.
                 If strict, you may find scrubbing of common words, such as general.
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with names after titles replaced.
@@ -394,7 +406,7 @@ class IDScrub:
         ]
 
         if not strict:
-            titles_to_remove = ["General", "Major", "Judge", "Master", "Father", "Sister"]
+            titles_to_remove = ["General", "Major", "Judge", "Master", "Father", "Sister", "Miss"]
             titles = [title for title in titles if title not in titles_to_remove]
 
         # Add dotted versions
@@ -404,9 +416,9 @@ class IDScrub:
         self.logger.info("Scrubbing titles using regex...")
         pattern = r"\b(?:{})\b".format("|".join(re.escape(t) for t in titles))
 
-        return self.scrub_regex(pattern, replacement_text,
+        return self.scrub_regex(pattern, replacement_text, label=label)
 
-    def ip_addresses(self, replacement_text: str = "[IPADDRESS]") -> list[str]:
+    def ip_addresses(self, replacement_text: str = "[IPADDRESS]", label: str = "ip_address") -> list[str]:
         """
         Removes IP addresses.
         e.g. `192.168.1.1` scrubbed
@@ -421,26 +433,27 @@ class IDScrub:
         self.logger.info("Scrubbing IP addresses using regex...")
         pattern = r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
 
-        return self.scrub_regex(pattern, replacement_text,
+        return self.scrub_regex(pattern, replacement_text, label=label)
 
-    def uk_postcodes(self, replacement_text: str = "[POSTCODE]") -> list[str]:
+    def uk_postcodes(self, replacement_text: str = "[POSTCODE]", label: str = "uk_postcode") -> list[str]:
         """
-        Removes
+        Removes postcodes.
         e.g. `A11 1AA` scrubbed
 
         Args:
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with postcodes replaced.
         """
 
-        self.logger.info("Scrubbing
+        self.logger.info("Scrubbing postcodes using regex...")
         pattern = r"\b(?:(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)[ \t]*[0-9][A-Z]{2}|GIR[ \t]*0A{2}|SAN[ \t]*TA1|ASCN[ \t]*1ZZ|STHL[ \t]*1ZZ|TDCU[ \t]*1ZZ|BBND[ \t]*1ZZ|[BFS]IQ{2}[ \t]*1ZZ|GX11[ \t]*1AA|PCRN[ \t]*1ZZ|TKCA[ \t]*1ZZ|AI-?[0-9]{4}|BFPO[ \t-]?[0-9]{2,4}|MSR[ \t-]?1(?:1[12]|[23][135])0|VG[ \t-]?11[1-6]0|KY[1-3][ \t-]?[0-2][0-9]{3})\b"
 
-        return self.scrub_regex(pattern, replacement_text,
+        return self.scrub_regex(pattern, replacement_text, label=label)
 
-    def claimants(self, replacement_text="[CLAIMANT]") -> list[str]:
+    def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
         """
         Removes claimant names from employment tribunal texts.
         e.g. `Claimant: Jim Smith` scrubbed
@@ -472,9 +485,7 @@ class IDScrub:
 
             if claimant_name:
                 cleaned = re.sub(re.escape(claimant_name), replacement_text, cleaned)
-                self.scrubbed_data.append({self.text_id_name: i,
-                # self.scrubbed_data[self.text_id_name].append(i)
-                # self.scrubbed_data['scrubbed_claimant'].append(claimant_name)
+                self.scrubbed_data.append({self.text_id_name: i, label: claimant_name})
 
             cleaned_texts.append(cleaned)
 
@@ -523,6 +534,7 @@ class IDScrub:
         n_process: int = 1,
         batch_size: int = 1000,
         replacement_text: str = "[PERSON]",
+        label: str = "person",
     ) -> list[str]:
         """
         Remove PERSON entities using a Spacy model.
@@ -533,6 +545,7 @@ class IDScrub:
             n_process (int): Number of parallel processes.
             batch_size (int): The number of texts in each batch.
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
 
         Returns:
             list[str]: The input list of text with PERSON entities scrubbed.
@@ -561,9 +574,7 @@ class IDScrub:
             person_entities = [
                 ent for ent in doc.ents if ent.label_ == "PERSON" and ent.text not in {"PERSON", "HANDLE"}
             ]
-            self.scrubbed_data.extend(
-                {self.text_id_name: ids, "scrubbed_spacy_person": ent.text} for ent in person_entities
-            )
+            self.scrubbed_data.extend({self.text_id_name: ids, label: ent.text} for ent in person_entities)
 
             # Remove person entities
             cleaned = stripped_text
@@ -574,7 +585,7 @@ class IDScrub:
 
         self.cleaned_texts = cleaned_texts
 
-        self.log_message(
+        self.log_message(label)
 
         return cleaned_texts
 
@@ -618,6 +629,7 @@ class IDScrub:
         hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
         download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
         replacement_text: str = "[PERSON]",
+        label: str = "person",
         batch_size: int = 8,
     ) -> list[str]:
         """
@@ -630,6 +642,7 @@ class IDScrub:
             download_directory (str): Directory in which to save the model.
                 Default is current working directory.
             replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
             batch_size (int): Number of texts passed to the model in each batch.
                 Memory (instance size) dependent.
 
@@ -668,9 +681,7 @@ class IDScrub:
             person_entities = [
                 ent for ent in entities if ent["entity_group"] == "PER" and ent["word"] not in {"HANDLE", "PERSON"}
             ]
-            self.scrubbed_data.extend(
-                {self.text_id_name: ids, "scrubbed_hf_person": ent["word"]} for ent in person_entities
-            )
+            self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
 
             cleaned = stripped_text
             for ent in sorted(person_entities, key=lambda x: x["start"], reverse=True):
@@ -680,14 +691,14 @@ class IDScrub:
 
         self.cleaned_texts = cleaned_texts
 
-        self.log_message(
+        self.log_message(label)
 
         return cleaned_texts
 
     def presidio(
         self,
-        model_name="en_core_web_trf",
-        entities_to_scrub=[
+        model_name: str = "en_core_web_trf",
+        entities_to_scrub: list[str] = [
            "PERSON",
            "UK_NINO",
            "UK_NHS",
@@ -697,7 +708,8 @@ class IDScrub:
            "URL",
            "IBAN_CODE",
         ],
-        replacement_map=None,
+        replacement_map: str = None,
+        label_prefix: str = None,
     ) -> list[str]:
         """
         Scrub specified entities from texts using Presidio.
@@ -708,6 +720,7 @@ class IDScrub:
             model_name (str): spaCy model to use
             entities_to_scrub (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
             replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
+            label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
 
         Returns:
             list[str]: The input list of text with entities replaced.
@@ -743,7 +756,11 @@ class IDScrub:
             results = analyzer.analyze(text=stripped_text, language="en")
             results = [r for r in results if r.entity_type in entities_to_scrub]
 
-
+            if label_prefix:
+                labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
+            else:
+                labels = [f"{res.entity_type.lower()}" for res in results]
+
             unique_labels.append(list(set(labels)))
 
             self.scrubbed_data.extend(
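For Presidio the label is now derived from each detected entity type, optionally namespaced with the new `label_prefix` argument. A hedged sketch (entity names and the replacement map follow the docstring above; the import and constructor call are assumptions):

```python
# Hedged sketch of the new `label_prefix` argument.
from idscrub import IDScrub  # assumed import path

scrub = IDScrub(texts=["Jane Doe logged in from 192.168.1.1"])

scrub.presidio(
    entities_to_scrub=["PERSON", "IP_ADDRESS"],
    replacement_map={"PERSON": "[PERSON]", "IP_ADDRESS": "[IPADDRESS]"},
    label_prefix="presidio",  # scrubbed data recorded as e.g. "presidio_person"
)
```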
@@ -847,38 +864,6 @@ class IDScrub:
 
         return self.cleaned_texts
 
-    def call_scrub_method(self, scrub_method: str) -> list[str]:
-        """
-        Calls a given scrub method based on its matching string name.
-        Uses default values for the given scrub method.
-
-        Example:
-            "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
-
-        Args:
-            scrub_method (str): string name of scrub method.
-
-        Returns:
-            list[str]: The input list of text with personal information replaced.
-
-        """
-
-        scrub_methods = {
-            "all": self.all,
-            "spacy_persons": self.spacy_persons,
-            "huggingface_persons": self.huggingface_persons,
-            "email_addresses": self.email_addresses,
-            "handles": self.handles,
-            "ip_addresses": self.ip_addresses,
-            "uk_phone_numbers": self.uk_phone_numbers,
-            "google_phone_numbers": self.google_phone_numbers,
-            "uk_postcodes": self.uk_postcodes,
-            "titles": self.titles,
-            "presidio": self.presidio,
-        }
-
-        return scrub_methods.get(scrub_method, lambda: "Unknown method.")()
-
     def scrub(self, scrub_methods: list[str] = ["all"]) -> list[str]:
         """
         Scrubs text using given methods (in order).
@@ -907,7 +892,11 @@ class IDScrub:
         """
 
         for scrub_method in scrub_methods:
-
+            try:
+                method = getattr(self, scrub_method)
+                method()
+            except AttributeError:
+                self.logger.warning("Not a scrub method.")
 
         return self.cleaned_texts
 
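With `call_scrub_method()` gone, `scrub()` resolves each requested method name with `getattr` and logs a warning for names it cannot find, instead of returning the old "Unknown method." string. A hedged sketch of the resulting behaviour (method names appear elsewhere in this diff; the import and constructor call are assumptions):

```python
# Hedged sketch of the getattr-based dispatch in scrub().
from idscrub import IDScrub  # assumed import path

scrub = IDScrub(texts=["Dr Jane Doe, jane@example.com, SW1A 1AA"])

scrub.scrub(["titles", "email_addresses", "uk_postcodes"])  # run in order
scrub.scrub(["not_a_method"])  # logs a warning: "Not a scrub method."
```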
@@ -915,7 +904,7 @@ class IDScrub:
     def dataframe(
         df: pd.DataFrame = None,
         id_col: str = None,
-        exclude_cols: list
+        exclude_cols: list = None,
         scrub_methods: list[str] = ["all"],
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         """
@@ -960,13 +949,9 @@ class IDScrub:
             scrubbed_df[col] = scrubbed_df[col].astype(str)
 
             scrub = IDScrub(texts=scrubbed_df[col].to_list(), text_id_name=id_col, text_ids=ids)
+            scrub.logger.info(f"Scrubbing column `{col}`...")
 
-
-                if i == len(scrub_methods) - 1:
-                    scrubbed_texts = scrub.call_scrub_method(scrub_method)
-                else:
-                    scrub.call_scrub_method(scrub_method)
-
+            scrubbed_texts = scrub.scrub(scrub_methods)
             scrubbed_df[col] = scrubbed_texts
 
             scrubbed_data = scrub.get_scrubbed_data()
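`IDScrub.dataframe()` now hands the whole `scrub_methods` list to `scrub()` rather than dispatching each name through the removed `call_scrub_method()`, and logs which column it is scrubbing. A hedged usage sketch (keyword arguments match the signature above; calling it on the class and the order of the returned frames are assumptions):

```python
# Hedged sketch of dataframe scrubbing; return order is an assumption.
import pandas as pd

from idscrub import IDScrub  # assumed import path

df = pd.DataFrame({
    "id": [1, 2],
    "comment": ["Contact jane@example.com", "Call 0123 456 7891"],
})

scrubbed_df, scrubbed_data = IDScrub.dataframe(
    df=df,
    id_col="id",
    scrub_methods=["email_addresses", "uk_phone_numbers"],
)
```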
{idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: idscrub
-Version: 0.2.2
+Version: 1.0.0
 Author: Department for Business and Trade
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -21,7 +21,7 @@ Dynamic: license-file
 
 # idscrub 🧽✨
 
-* Names and other personally identifying information are often present in text.
+* Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
 * This information may need to be removed prior to further analysis in many cases.
 * `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
 
{idscrub-0.2.2 → idscrub-1.0.0}/idscrub.egg-info/SOURCES.txt

@@ -4,7 +4,6 @@ CODEOWNERS
 LICENSE
 Makefile
 README.md
-SECURITY.md
 SECURITY_CHECKLIST.md
 pyproject.toml
 uv.lock
@@ -26,6 +25,7 @@ test/test_chain.py
 test/test_dataframe.py
 test/test_huggingface.py
 test/test_id.py
+test/test_label.py
 test/test_log.py
 test/test_persidio.py
 test/test_phonenumbers.py