idscrub 1.0.1__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {idscrub-1.0.1 → idscrub-1.1.1}/.pre-commit-config.yaml +1 -1
- {idscrub-1.0.1 → idscrub-1.1.1}/PKG-INFO +20 -14
- {idscrub-1.0.1 → idscrub-1.1.1}/README.md +18 -12
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub/scrub.py +85 -39
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/PKG-INFO +20 -14
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/SOURCES.txt +1 -1
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/requires.txt +1 -1
- {idscrub-1.0.1 → idscrub-1.1.1}/notebooks/basic_usage.ipynb +138 -74
- {idscrub-1.0.1 → idscrub-1.1.1}/pyproject.toml +1 -1
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_chain.py +2 -2
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_huggingface.py +3 -3
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_label.py +1 -1
- idscrub-1.0.1/test/test_persidio.py → idscrub-1.1.1/test/test_presidio.py +7 -7
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_regex.py +46 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_scrub.py +3 -3
- idscrub-1.1.1/test/test_spacy.py +53 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/uv.lock +356 -320
- idscrub-1.0.1/test/test_spacy.py +0 -26
- {idscrub-1.0.1 → idscrub-1.1.1}/.github/pull_request_template.md +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/.github/workflows/cd.yml +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/.github/workflows/ci.yml +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/.gitignore +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/CODEOWNERS +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/LICENSE +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/Makefile +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/SECURITY_CHECKLIST.md +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub/__init__.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub/locations.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/dependency_links.txt +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/idscrub.egg-info/top_level.txt +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/setup.cfg +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/conftest.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_all.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_dataframe.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_id.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_log.py +0 -0
- {idscrub-1.0.1 → idscrub-1.1.1}/test/test_phonenumbers.py +0 -0
|
@@ -12,7 +12,7 @@ repos:
|
|
|
12
12
|
|
|
13
13
|
# Mandatory internal hooks
|
|
14
14
|
- repo: https://github.com/uktrade/github-standards
|
|
15
|
-
rev: v1.
|
|
15
|
+
rev: v1.3.1 # update periodically with pre-commit autoupdate
|
|
16
16
|
hooks:
|
|
17
17
|
- id: run-security-scan
|
|
18
18
|
verbose: false
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: idscrub
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Author: Department for Business and Trade
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -8,7 +8,7 @@ License-File: LICENSE
|
|
|
8
8
|
Requires-Dist: ipykernel>=7.1.0
|
|
9
9
|
Requires-Dist: ipywidgets
|
|
10
10
|
Requires-Dist: numpy>=2.3.4
|
|
11
|
-
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: pandas<3.0
|
|
12
12
|
Requires-Dist: phonenumbers>=9.0.18
|
|
13
13
|
Requires-Dist: pip>=25.3
|
|
14
14
|
Requires-Dist: spacy-transformers>=1.3.9
|
|
@@ -19,12 +19,17 @@ Provides-Extra: trf
|
|
|
19
19
|
Requires-Dist: en_core_web_trf; extra == "trf"
|
|
20
20
|
Dynamic: license-file
|
|
21
21
|
|
|
22
|
+

|
|
23
|
+
|
|
22
24
|
# idscrub 🧽✨
|
|
23
25
|
|
|
24
26
|
* Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
|
|
25
27
|
* This information may need to be removed prior to further analysis in many cases.
|
|
26
28
|
* `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
27
29
|
|
|
30
|
+
> [!IMPORTANT]
|
|
31
|
+
> * This package is undergoing frequent internal development. Major updates will be made public periodically.
|
|
32
|
+
|
|
28
33
|
## Installation
|
|
29
34
|
|
|
30
35
|
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
@@ -45,7 +50,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
|
|
|
45
50
|
from idscrub import IDScrub
|
|
46
51
|
|
|
47
52
|
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
48
|
-
scrubbed_texts = scrub.scrub(scrub_methods=['
|
|
53
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
|
|
49
54
|
|
|
50
55
|
print(scrubbed_texts)
|
|
51
56
|
|
|
@@ -57,17 +62,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
|
|
|
57
62
|
|
|
58
63
|
| Argument | Scrubs |
|
|
59
64
|
|-------------------------|------------------------------------------------------------------------|
|
|
60
|
-
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation)
|
|
61
|
-
| `
|
|
62
|
-
| `
|
|
63
|
-
| `
|
|
64
|
-
| `
|
|
65
|
-
| `
|
|
66
|
-
| `
|
|
67
|
-
| `
|
|
68
|
-
| `
|
|
69
|
-
| `
|
|
70
|
-
| `
|
|
65
|
+
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
|
|
66
|
+
| `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
|
|
67
|
+
| `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
|
|
68
|
+
| `huggingface_entities` | Entities detected by user-selected HuggingFace models |
|
|
69
|
+
| `email_addresses` | Email addresses (e.g. john@email.com) |
|
|
70
|
+
| `titles` | Titles (e.g. Mr., Mrs., Dr.) |
|
|
71
|
+
| `handles` | Social media handles (e.g. @username) |
|
|
72
|
+
| `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
|
|
73
|
+
| `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
|
|
74
|
+
| `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
|
|
75
|
+
| `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
|
|
76
|
+
| `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
|
|
71
77
|
|
|
72
78
|
## Considerations before use
|
|
73
79
|
|
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
1
3
|
# idscrub 🧽✨
|
|
2
4
|
|
|
3
5
|
* Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
|
|
4
6
|
* This information may need to be removed prior to further analysis in many cases.
|
|
5
7
|
* `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
6
8
|
|
|
9
|
+
> [!IMPORTANT]
|
|
10
|
+
> * This package is undergoing frequent internal development. Major updates will be made public periodically.
|
|
11
|
+
|
|
7
12
|
## Installation
|
|
8
13
|
|
|
9
14
|
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
@@ -24,7 +29,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
|
|
|
24
29
|
from idscrub import IDScrub
|
|
25
30
|
|
|
26
31
|
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
27
|
-
scrubbed_texts = scrub.scrub(scrub_methods=['
|
|
32
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
|
|
28
33
|
|
|
29
34
|
print(scrubbed_texts)
|
|
30
35
|
|
|
@@ -36,17 +41,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
|
|
|
36
41
|
|
|
37
42
|
| Argument | Scrubs |
|
|
38
43
|
|-------------------------|------------------------------------------------------------------------|
|
|
39
|
-
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation)
|
|
40
|
-
| `
|
|
41
|
-
| `
|
|
42
|
-
| `
|
|
43
|
-
| `
|
|
44
|
-
| `
|
|
45
|
-
| `
|
|
46
|
-
| `
|
|
47
|
-
| `
|
|
48
|
-
| `
|
|
49
|
-
| `
|
|
44
|
+
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
|
|
45
|
+
| `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
|
|
46
|
+
| `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
|
|
47
|
+
| `huggingface_entities` | Entities detected by user-selected HuggingFace models |
|
|
48
|
+
| `email_addresses` | Email addresses (e.g. john@email.com) |
|
|
49
|
+
| `titles` | Titles (e.g. Mr., Mrs., Dr.) |
|
|
50
|
+
| `handles` | Social media handles (e.g. @username) |
|
|
51
|
+
| `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
|
|
52
|
+
| `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
|
|
53
|
+
| `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
|
|
54
|
+
| `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
|
|
55
|
+
| `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
|
|
50
56
|
|
|
51
57
|
## Considerations before use
|
|
52
58
|
|
|
@@ -453,6 +453,24 @@ class IDScrub:
|
|
|
453
453
|
|
|
454
454
|
return self.scrub_regex(pattern, replacement_text, label=label)
|
|
455
455
|
|
|
456
|
+
def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
|
|
457
|
+
"""
|
|
458
|
+
Removes addresses.
|
|
459
|
+
e.g. `10 Downing Street` scrubbed
|
|
460
|
+
|
|
461
|
+
Args:
|
|
462
|
+
replacement_text (str): The replacement text for the removed text.
|
|
463
|
+
label (str): Label for the personal data removed.
|
|
464
|
+
|
|
465
|
+
Returns:
|
|
466
|
+
list[str]: The input list of text with addresses replaced.
|
|
467
|
+
"""
|
|
468
|
+
|
|
469
|
+
self.logger.info("Scrubbing addresses using regex...")
|
|
470
|
+
pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
|
|
471
|
+
|
|
472
|
+
return self.scrub_regex(pattern, replacement_text, label)
|
|
473
|
+
|
|
456
474
|
def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
|
|
457
475
|
"""
|
|
458
476
|
Removes claimant names from employment tribunal texts.
|
|
@@ -528,64 +546,86 @@ class IDScrub:
|
|
|
528
546
|
|
|
529
547
|
return model
|
|
530
548
|
|
|
531
|
-
def
|
|
549
|
+
def spacy_entities(
|
|
532
550
|
self,
|
|
533
551
|
model_name: str = "en_core_web_trf",
|
|
552
|
+
entities: list[str] = ["PERSON", "ORG", "NORP"],
|
|
553
|
+
replacement_map: str = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
|
|
554
|
+
label_prefix: str = None,
|
|
534
555
|
n_process: int = 1,
|
|
535
556
|
batch_size: int = 1000,
|
|
536
|
-
replacement_text: str = "[PERSON]",
|
|
537
|
-
label: str = "person",
|
|
538
557
|
) -> list[str]:
|
|
539
558
|
"""
|
|
540
|
-
Remove
|
|
559
|
+
Remove SpaCy entities using a given SpaCy model.
|
|
560
|
+
Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
|
|
541
561
|
Note: only "en_core_web_trf" has been evaluated.
|
|
542
562
|
|
|
543
563
|
Args:
|
|
544
564
|
model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
|
|
565
|
+
entities (list[str]): Which SpaCy entities to scrub (based on SpaCy entity keys).
|
|
566
|
+
replacement_map (dict): Mapping of entity type to replacement text, keyed by the names in `entities` (e.g. {'PERSON': '[PERSON]'}).
|
|
567
|
+
label_prefix (str): Prefix for the Spacy entity removed, e.g. `{label}_person`.
|
|
545
568
|
n_process (int): Number of parallel processes.
|
|
546
569
|
batch_size (int): The number of texts in each batch.
|
|
547
|
-
replacement_text (str): The replacement text for the removed text.
|
|
548
|
-
label (str): Label for the personal data removed.
|
|
549
570
|
|
|
550
571
|
Returns:
|
|
551
572
|
list[str]: The input list of text with the selected entities scrubbed.
|
|
552
573
|
"""
|
|
553
|
-
self.logger.info(f"Scrubbing names using SpaCy model `{model_name}`...")
|
|
554
574
|
|
|
555
|
-
|
|
575
|
+
self.logger.info(
|
|
576
|
+
f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
|
|
577
|
+
)
|
|
556
578
|
|
|
557
|
-
|
|
558
|
-
replacement_text = self.replacement_text
|
|
579
|
+
texts = self.get_texts()
|
|
559
580
|
|
|
560
581
|
cleaned_texts = []
|
|
582
|
+
labels = []
|
|
561
583
|
|
|
562
584
|
nlp = self.get_spacy_model(model_name)
|
|
563
585
|
stripped_texts = [s.strip() if s.isspace() else s for s in texts]
|
|
564
586
|
documents = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
|
|
565
587
|
|
|
566
588
|
for i, (ids, doc, stripped_text) in tqdm(
|
|
567
|
-
enumerate(
|
|
589
|
+
enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
|
|
568
590
|
):
|
|
569
|
-
if stripped_text
|
|
591
|
+
if not stripped_text:
|
|
570
592
|
cleaned_texts.append(texts[i])
|
|
571
593
|
continue
|
|
572
594
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
595
|
+
all_found_entities = []
|
|
596
|
+
|
|
597
|
+
for entity_type in entities:
|
|
598
|
+
found = [
|
|
599
|
+
ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
|
|
600
|
+
]
|
|
601
|
+
|
|
602
|
+
for ent in found:
|
|
603
|
+
label = ent.label_.lower()
|
|
604
|
+
if label_prefix:
|
|
605
|
+
label = f"{label_prefix}_{label}"
|
|
606
|
+
labels.append(label)
|
|
607
|
+
self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
|
|
608
|
+
|
|
609
|
+
if self.replacement_text:
|
|
610
|
+
all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
|
|
611
|
+
elif replacement_map:
|
|
612
|
+
all_found_entities.extend(
|
|
613
|
+
(ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
|
|
614
|
+
)
|
|
615
|
+
else:
|
|
616
|
+
all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
|
|
578
617
|
|
|
579
|
-
# Remove person entities
|
|
580
618
|
cleaned = stripped_text
|
|
581
|
-
|
|
582
|
-
|
|
619
|
+
|
|
620
|
+
for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
|
|
621
|
+
cleaned = cleaned[:start] + repl + cleaned[end:]
|
|
583
622
|
|
|
584
623
|
cleaned_texts.append(cleaned)
|
|
585
624
|
|
|
586
625
|
self.cleaned_texts = cleaned_texts
|
|
587
626
|
|
|
588
|
-
|
|
627
|
+
for label in set(labels):
|
|
628
|
+
self.log_message(label)
|
|
589
629
|
|
|
590
630
|
return cleaned_texts
|
|
591
631
|
|
|
@@ -600,7 +640,7 @@ class IDScrub:
|
|
|
600
640
|
Note: No Hugging Face models have been evaluated for performance.
|
|
601
641
|
|
|
602
642
|
Args:
|
|
603
|
-
hf_model_path (str): Path to the Hugging Face model
|
|
643
|
+
hf_model_path (str): Path to the Hugging Face model.
|
|
604
644
|
Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been evaluated.
|
|
605
645
|
download_directory (str): Directory in which to save the model.
|
|
606
646
|
Default is current working directory.
|
|
@@ -624,20 +664,21 @@ class IDScrub:
|
|
|
624
664
|
|
|
625
665
|
return tokenizer
|
|
626
666
|
|
|
627
|
-
def
|
|
667
|
+
def huggingface_entities(
|
|
628
668
|
self,
|
|
629
669
|
hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
|
|
630
670
|
download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
|
|
671
|
+
entity="PER",
|
|
631
672
|
replacement_text: str = "[PERSON]",
|
|
632
673
|
label: str = "person",
|
|
633
674
|
batch_size: int = 8,
|
|
634
675
|
) -> list[str]:
|
|
635
676
|
"""
|
|
636
|
-
Remove
|
|
677
|
+
Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
|
|
637
678
|
Note: No Hugging Face models have been evaluated for performance.
|
|
638
679
|
|
|
639
680
|
Args:
|
|
640
|
-
hf_model_path (str): Path to the Hugging Face model
|
|
681
|
+
hf_model_path (str): Path to the Hugging Face model.
|
|
641
682
|
Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
|
|
642
683
|
download_directory (str): Directory in which to save the model.
|
|
643
684
|
Default is current working directory.
|
|
@@ -679,7 +720,7 @@ class IDScrub:
|
|
|
679
720
|
continue
|
|
680
721
|
|
|
681
722
|
person_entities = [
|
|
682
|
-
ent for ent in entities if ent["entity_group"] ==
|
|
723
|
+
ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
|
|
683
724
|
]
|
|
684
725
|
self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
|
|
685
726
|
|
|
@@ -695,10 +736,10 @@ class IDScrub:
|
|
|
695
736
|
|
|
696
737
|
return cleaned_texts
|
|
697
738
|
|
|
698
|
-
def
|
|
739
|
+
def presidio_entities(
|
|
699
740
|
self,
|
|
700
741
|
model_name: str = "en_core_web_trf",
|
|
701
|
-
|
|
742
|
+
entities: list[str] = [
|
|
702
743
|
"PERSON",
|
|
703
744
|
"UK_NINO",
|
|
704
745
|
"UK_NHS",
|
|
@@ -718,15 +759,18 @@ class IDScrub:
|
|
|
718
759
|
|
|
719
760
|
Args:
|
|
720
761
|
model_name (str): spaCy model to use
|
|
721
|
-
|
|
762
|
+
entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
|
|
722
763
|
replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
|
|
723
764
|
label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
|
|
765
|
+
Useful if you wish to identify this as having been scrubbed by Presidio.
|
|
724
766
|
|
|
725
767
|
Returns:
|
|
726
768
|
list[str]: The input list of text with entities replaced.
|
|
727
769
|
"""
|
|
728
770
|
|
|
729
|
-
self.logger.info(
|
|
771
|
+
self.logger.info(
|
|
772
|
+
f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
|
|
773
|
+
)
|
|
730
774
|
|
|
731
775
|
texts = self.get_texts()
|
|
732
776
|
|
|
@@ -744,7 +788,7 @@ class IDScrub:
|
|
|
744
788
|
anonymizer = AnonymizerEngine()
|
|
745
789
|
|
|
746
790
|
cleaned_texts = []
|
|
747
|
-
|
|
791
|
+
all_labels = []
|
|
748
792
|
|
|
749
793
|
stripped_texts = [s.strip() if s.isspace() else s for s in texts]
|
|
750
794
|
|
|
@@ -754,14 +798,15 @@ class IDScrub:
|
|
|
754
798
|
continue
|
|
755
799
|
|
|
756
800
|
results = analyzer.analyze(text=stripped_text, language="en")
|
|
757
|
-
results = [r for r in results if r.entity_type in
|
|
801
|
+
results = [r for r in results if r.entity_type in entities]
|
|
758
802
|
|
|
759
803
|
if label_prefix:
|
|
760
804
|
labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
|
|
761
805
|
else:
|
|
762
806
|
labels = [f"{res.entity_type.lower()}" for res in results]
|
|
763
807
|
|
|
764
|
-
|
|
808
|
+
for label in labels:
|
|
809
|
+
all_labels.append(label)
|
|
765
810
|
|
|
766
811
|
self.scrubbed_data.extend(
|
|
767
812
|
{self.text_id_name: ids, label: stripped_text[res.start : res.end]}
|
|
@@ -788,9 +833,8 @@ class IDScrub:
|
|
|
788
833
|
|
|
789
834
|
self.cleaned_texts = cleaned_texts
|
|
790
835
|
|
|
791
|
-
for label in
|
|
792
|
-
|
|
793
|
-
self.log_message(label[0])
|
|
836
|
+
for label in set(all_labels):
|
|
837
|
+
self.log_message(label)
|
|
794
838
|
|
|
795
839
|
return cleaned_texts
|
|
796
840
|
|
|
@@ -810,6 +854,7 @@ class IDScrub:
|
|
|
810
854
|
self.handles()
|
|
811
855
|
self.ip_addresses()
|
|
812
856
|
self.uk_phone_numbers()
|
|
857
|
+
self.uk_addresses()
|
|
813
858
|
self.uk_postcodes()
|
|
814
859
|
self.titles()
|
|
815
860
|
|
|
@@ -820,7 +865,8 @@ class IDScrub:
|
|
|
820
865
|
custom_regex_patterns: list = None,
|
|
821
866
|
custom_replacement_texts: list[str] = None,
|
|
822
867
|
model_name: str = "en_core_web_trf",
|
|
823
|
-
|
|
868
|
+
spacy_entities: list[str] = ["PERSON", "ORG", "NORP"],
|
|
869
|
+
presidio_entities: list[str] = [
|
|
824
870
|
"PERSON",
|
|
825
871
|
"EMAIL_ADDRESS",
|
|
826
872
|
"UK_NINO",
|
|
@@ -857,8 +903,8 @@ class IDScrub:
|
|
|
857
903
|
custom_replacement_texts=custom_replacement_texts,
|
|
858
904
|
)
|
|
859
905
|
|
|
860
|
-
self.
|
|
861
|
-
self.
|
|
906
|
+
self.presidio_entities(model_name=model_name, entities=presidio_entities)
|
|
907
|
+
self.spacy_entities(model_name=model_name, entities=spacy_entities, n_process=n_process, batch_size=batch_size)
|
|
862
908
|
self.google_phone_numbers()
|
|
863
909
|
self.all_regex()
|
|
864
910
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: idscrub
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Author: Department for Business and Trade
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -8,7 +8,7 @@ License-File: LICENSE
|
|
|
8
8
|
Requires-Dist: ipykernel>=7.1.0
|
|
9
9
|
Requires-Dist: ipywidgets
|
|
10
10
|
Requires-Dist: numpy>=2.3.4
|
|
11
|
-
Requires-Dist: pandas
|
|
11
|
+
Requires-Dist: pandas<3.0
|
|
12
12
|
Requires-Dist: phonenumbers>=9.0.18
|
|
13
13
|
Requires-Dist: pip>=25.3
|
|
14
14
|
Requires-Dist: spacy-transformers>=1.3.9
|
|
@@ -19,12 +19,17 @@ Provides-Extra: trf
|
|
|
19
19
|
Requires-Dist: en_core_web_trf; extra == "trf"
|
|
20
20
|
Dynamic: license-file
|
|
21
21
|
|
|
22
|
+

|
|
23
|
+
|
|
22
24
|
# idscrub 🧽✨
|
|
23
25
|
|
|
24
26
|
* Names and other personally identifying information are often present in text, even if they are not clearly visible or requested.
|
|
25
27
|
* This information may need to be removed prior to further analysis in many cases.
|
|
26
28
|
* `idscrub` identifies and removes (*✨scrubs✨*) personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
27
29
|
|
|
30
|
+
> [!IMPORTANT]
|
|
31
|
+
> * This package is undergoing frequent internal development. Major updates will be made public periodically.
|
|
32
|
+
|
|
28
33
|
## Installation
|
|
29
34
|
|
|
30
35
|
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
@@ -45,7 +50,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
|
|
|
45
50
|
from idscrub import IDScrub
|
|
46
51
|
|
|
47
52
|
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
48
|
-
scrubbed_texts = scrub.scrub(scrub_methods=['
|
|
53
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
|
|
49
54
|
|
|
50
55
|
print(scrubbed_texts)
|
|
51
56
|
|
|
@@ -57,17 +62,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
|
|
|
57
62
|
|
|
58
63
|
| Argument | Scrubs |
|
|
59
64
|
|-------------------------|------------------------------------------------------------------------|
|
|
60
|
-
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation)
|
|
61
|
-
| `
|
|
62
|
-
| `
|
|
63
|
-
| `
|
|
64
|
-
| `
|
|
65
|
-
| `
|
|
66
|
-
| `
|
|
67
|
-
| `
|
|
68
|
-
| `
|
|
69
|
-
| `
|
|
70
|
-
| `
|
|
65
|
+
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
|
|
66
|
+
| `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
|
|
67
|
+
| `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
|
|
68
|
+
| `huggingface_entities` | Entities detected by user-selected HuggingFace models |
|
|
69
|
+
| `email_addresses` | Email addresses (e.g. john@email.com) |
|
|
70
|
+
| `titles` | Titles (e.g. Mr., Mrs., Dr.) |
|
|
71
|
+
| `handles` | Social media handles (e.g. @username) |
|
|
72
|
+
| `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
|
|
73
|
+
| `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
|
|
74
|
+
| `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
|
|
75
|
+
| `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
|
|
76
|
+
| `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
|
|
71
77
|
|
|
72
78
|
## Considerations before use
|
|
73
79
|
|