idscrub 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +85 -39
- {idscrub-1.0.1.dist-info → idscrub-1.1.0.dist-info}/METADATA +14 -13
- idscrub-1.1.0.dist-info/RECORD +22 -0
- notebooks/basic_usage.ipynb +138 -74
- test/test_chain.py +2 -2
- test/test_huggingface.py +3 -3
- test/test_label.py +1 -1
- test/{test_persidio.py → test_presidio.py} +7 -7
- test/test_regex.py +46 -0
- test/test_scrub.py +3 -3
- test/test_spacy.py +30 -3
- idscrub-1.0.1.dist-info/RECORD +0 -22
- {idscrub-1.0.1.dist-info → idscrub-1.1.0.dist-info}/WHEEL +0 -0
- {idscrub-1.0.1.dist-info → idscrub-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {idscrub-1.0.1.dist-info → idscrub-1.1.0.dist-info}/top_level.txt +0 -0
idscrub/scrub.py
CHANGED
|
@@ -453,6 +453,24 @@ class IDScrub:
|
|
|
453
453
|
|
|
454
454
|
return self.scrub_regex(pattern, replacement_text, label=label)
|
|
455
455
|
|
|
456
|
+
def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
|
|
457
|
+
"""
|
|
458
|
+
Removes addresses.
|
|
459
|
+
e.g. `10 Downing Street` scrubbed
|
|
460
|
+
|
|
461
|
+
Args:
|
|
462
|
+
replacement_text (str): The replacement text for the removed text.
|
|
463
|
+
label (str): Label for the personal data removed.
|
|
464
|
+
|
|
465
|
+
Returns:
|
|
466
|
+
list[str]: The input list of text with addresses replaced.
|
|
467
|
+
"""
|
|
468
|
+
|
|
469
|
+
self.logger.info("Scrubbing addresses using regex...")
|
|
470
|
+
pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
|
|
471
|
+
|
|
472
|
+
return self.scrub_regex(pattern, replacement_text, label)
|
|
473
|
+
|
|
456
474
|
def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
|
|
457
475
|
"""
|
|
458
476
|
Removes claimant names from employment tribunal texts.
|
|
@@ -528,64 +546,86 @@ class IDScrub:
|
|
|
528
546
|
|
|
529
547
|
return model
|
|
530
548
|
|
|
531
|
-
def
|
|
549
|
+
def spacy_entities(
|
|
532
550
|
self,
|
|
533
551
|
model_name: str = "en_core_web_trf",
|
|
552
|
+
entities: list[str] = ["PERSON", "ORG", "NORP"],
|
|
553
|
+
replacement_map: str = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
|
|
554
|
+
label_prefix: str = None,
|
|
534
555
|
n_process: int = 1,
|
|
535
556
|
batch_size: int = 1000,
|
|
536
|
-
replacement_text: str = "[PERSON]",
|
|
537
|
-
label: str = "person",
|
|
538
557
|
) -> list[str]:
|
|
539
558
|
"""
|
|
540
|
-
Remove
|
|
559
|
+
Remove SpaCy entities using a given SpaCy model.
|
|
560
|
+
Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
|
|
541
561
|
Note: only "en_core_web_trf" has been evaluated.
|
|
542
562
|
|
|
543
563
|
Args:
|
|
544
564
|
model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
|
|
565
|
+
entities (list[str]): Which SpaCy entities to scrub (based on SpaCy entity keys).
|
|
566
|
+
replacement_map (dict): Mapping of SpaCy entity label to replacement text (e.g. {'PERSON': '[PERSON]'}).
|
|
567
|
+
label_prefix (str): Prefix for the Spacy entity removed, e.g. `{label}_person`.
|
|
545
568
|
n_process (int): Number of parallel processes.
|
|
546
569
|
batch_size (int): The number of texts in each batch.
|
|
547
|
-
replacement_text (str): The replacement text for the removed text.
|
|
548
|
-
label (str): Label for the personal data removed.
|
|
549
570
|
|
|
550
571
|
Returns:
|
|
551
572
|
list[str]: The input list of text with the selected SpaCy entities scrubbed.
|
|
552
573
|
"""
|
|
553
|
-
self.logger.info(f"Scrubbing names using SpaCy model `{model_name}`...")
|
|
554
574
|
|
|
555
|
-
|
|
575
|
+
self.logger.info(
|
|
576
|
+
f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
|
|
577
|
+
)
|
|
556
578
|
|
|
557
|
-
|
|
558
|
-
replacement_text = self.replacement_text
|
|
579
|
+
texts = self.get_texts()
|
|
559
580
|
|
|
560
581
|
cleaned_texts = []
|
|
582
|
+
labels = []
|
|
561
583
|
|
|
562
584
|
nlp = self.get_spacy_model(model_name)
|
|
563
585
|
stripped_texts = [s.strip() if s.isspace() else s for s in texts]
|
|
564
586
|
documents = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
|
|
565
587
|
|
|
566
588
|
for i, (ids, doc, stripped_text) in tqdm(
|
|
567
|
-
enumerate(
|
|
589
|
+
enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
|
|
568
590
|
):
|
|
569
|
-
if stripped_text
|
|
591
|
+
if not stripped_text:
|
|
570
592
|
cleaned_texts.append(texts[i])
|
|
571
593
|
continue
|
|
572
594
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
595
|
+
all_found_entities = []
|
|
596
|
+
|
|
597
|
+
for entity_type in entities:
|
|
598
|
+
found = [
|
|
599
|
+
ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
|
|
600
|
+
]
|
|
601
|
+
|
|
602
|
+
for ent in found:
|
|
603
|
+
label = ent.label_.lower()
|
|
604
|
+
if label_prefix:
|
|
605
|
+
label = f"{label_prefix}_{label}"
|
|
606
|
+
labels.append(label)
|
|
607
|
+
self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
|
|
608
|
+
|
|
609
|
+
if self.replacement_text:
|
|
610
|
+
all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
|
|
611
|
+
elif replacement_map:
|
|
612
|
+
all_found_entities.extend(
|
|
613
|
+
(ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
|
|
614
|
+
)
|
|
615
|
+
else:
|
|
616
|
+
all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
|
|
578
617
|
|
|
579
|
-
# Remove person entities
|
|
580
618
|
cleaned = stripped_text
|
|
581
|
-
|
|
582
|
-
|
|
619
|
+
|
|
620
|
+
for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
|
|
621
|
+
cleaned = cleaned[:start] + repl + cleaned[end:]
|
|
583
622
|
|
|
584
623
|
cleaned_texts.append(cleaned)
|
|
585
624
|
|
|
586
625
|
self.cleaned_texts = cleaned_texts
|
|
587
626
|
|
|
588
|
-
|
|
627
|
+
for label in set(labels):
|
|
628
|
+
self.log_message(label)
|
|
589
629
|
|
|
590
630
|
return cleaned_texts
|
|
591
631
|
|
|
@@ -600,7 +640,7 @@ class IDScrub:
|
|
|
600
640
|
Note: No Hugging Face models have been evaluated for performance.
|
|
601
641
|
|
|
602
642
|
Args:
|
|
603
|
-
hf_model_path (str): Path to the Hugging Face model
|
|
643
|
+
hf_model_path (str): Path to the Hugging Face model.
|
|
604
644
|
Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been evaluated.
|
|
605
645
|
download_directory (str): Directory in which to save the model.
|
|
606
646
|
Default is current working directory.
|
|
@@ -624,20 +664,21 @@ class IDScrub:
|
|
|
624
664
|
|
|
625
665
|
return tokenizer
|
|
626
666
|
|
|
627
|
-
def
|
|
667
|
+
def huggingface_entities(
|
|
628
668
|
self,
|
|
629
669
|
hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
|
|
630
670
|
download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
|
|
671
|
+
entity="PER",
|
|
631
672
|
replacement_text: str = "[PERSON]",
|
|
632
673
|
label: str = "person",
|
|
633
674
|
batch_size: int = 8,
|
|
634
675
|
) -> list[str]:
|
|
635
676
|
"""
|
|
636
|
-
Remove
|
|
677
|
+
Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
|
|
637
678
|
Note: No Hugging Face models have been evaluated for performance.
|
|
638
679
|
|
|
639
680
|
Args:
|
|
640
|
-
hf_model_path (str): Path to the Hugging Face model
|
|
681
|
+
hf_model_path (str): Path to the Hugging Face model.
|
|
641
682
|
Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
|
|
642
683
|
download_directory (str): Directory in which to save the model.
|
|
643
684
|
Default is current working directory.
|
|
@@ -679,7 +720,7 @@ class IDScrub:
|
|
|
679
720
|
continue
|
|
680
721
|
|
|
681
722
|
person_entities = [
|
|
682
|
-
ent for ent in entities if ent["entity_group"] ==
|
|
723
|
+
ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
|
|
683
724
|
]
|
|
684
725
|
self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
|
|
685
726
|
|
|
@@ -695,10 +736,10 @@ class IDScrub:
|
|
|
695
736
|
|
|
696
737
|
return cleaned_texts
|
|
697
738
|
|
|
698
|
-
def
|
|
739
|
+
def presidio_entities(
|
|
699
740
|
self,
|
|
700
741
|
model_name: str = "en_core_web_trf",
|
|
701
|
-
|
|
742
|
+
entities: list[str] = [
|
|
702
743
|
"PERSON",
|
|
703
744
|
"UK_NINO",
|
|
704
745
|
"UK_NHS",
|
|
@@ -718,15 +759,18 @@ class IDScrub:
|
|
|
718
759
|
|
|
719
760
|
Args:
|
|
720
761
|
model_name (str): spaCy model to use
|
|
721
|
-
|
|
762
|
+
entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
|
|
722
763
|
replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
|
|
723
764
|
label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
|
|
765
|
+
Useful if you wish to identify this as having been scrubbed by Presidio.
|
|
724
766
|
|
|
725
767
|
Returns:
|
|
726
768
|
list[str]: The input list of text with entities replaced.
|
|
727
769
|
"""
|
|
728
770
|
|
|
729
|
-
self.logger.info(
|
|
771
|
+
self.logger.info(
|
|
772
|
+
f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
|
|
773
|
+
)
|
|
730
774
|
|
|
731
775
|
texts = self.get_texts()
|
|
732
776
|
|
|
@@ -744,7 +788,7 @@ class IDScrub:
|
|
|
744
788
|
anonymizer = AnonymizerEngine()
|
|
745
789
|
|
|
746
790
|
cleaned_texts = []
|
|
747
|
-
|
|
791
|
+
all_labels = []
|
|
748
792
|
|
|
749
793
|
stripped_texts = [s.strip() if s.isspace() else s for s in texts]
|
|
750
794
|
|
|
@@ -754,14 +798,15 @@ class IDScrub:
|
|
|
754
798
|
continue
|
|
755
799
|
|
|
756
800
|
results = analyzer.analyze(text=stripped_text, language="en")
|
|
757
|
-
results = [r for r in results if r.entity_type in
|
|
801
|
+
results = [r for r in results if r.entity_type in entities]
|
|
758
802
|
|
|
759
803
|
if label_prefix:
|
|
760
804
|
labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
|
|
761
805
|
else:
|
|
762
806
|
labels = [f"{res.entity_type.lower()}" for res in results]
|
|
763
807
|
|
|
764
|
-
|
|
808
|
+
for label in labels:
|
|
809
|
+
all_labels.append(label)
|
|
765
810
|
|
|
766
811
|
self.scrubbed_data.extend(
|
|
767
812
|
{self.text_id_name: ids, label: stripped_text[res.start : res.end]}
|
|
@@ -788,9 +833,8 @@ class IDScrub:
|
|
|
788
833
|
|
|
789
834
|
self.cleaned_texts = cleaned_texts
|
|
790
835
|
|
|
791
|
-
for label in
|
|
792
|
-
|
|
793
|
-
self.log_message(label[0])
|
|
836
|
+
for label in set(all_labels):
|
|
837
|
+
self.log_message(label)
|
|
794
838
|
|
|
795
839
|
return cleaned_texts
|
|
796
840
|
|
|
@@ -810,6 +854,7 @@ class IDScrub:
|
|
|
810
854
|
self.handles()
|
|
811
855
|
self.ip_addresses()
|
|
812
856
|
self.uk_phone_numbers()
|
|
857
|
+
self.uk_addresses()
|
|
813
858
|
self.uk_postcodes()
|
|
814
859
|
self.titles()
|
|
815
860
|
|
|
@@ -820,7 +865,8 @@ class IDScrub:
|
|
|
820
865
|
custom_regex_patterns: list = None,
|
|
821
866
|
custom_replacement_texts: list[str] = None,
|
|
822
867
|
model_name: str = "en_core_web_trf",
|
|
823
|
-
|
|
868
|
+
spacy_entities: list[str] = ["PERSON", "ORG", "NORP"],
|
|
869
|
+
presidio_entities: list[str] = [
|
|
824
870
|
"PERSON",
|
|
825
871
|
"EMAIL_ADDRESS",
|
|
826
872
|
"UK_NINO",
|
|
@@ -857,8 +903,8 @@ class IDScrub:
|
|
|
857
903
|
custom_replacement_texts=custom_replacement_texts,
|
|
858
904
|
)
|
|
859
905
|
|
|
860
|
-
self.
|
|
861
|
-
self.
|
|
906
|
+
self.presidio_entities(model_name=model_name, entities=presidio_entities)
|
|
907
|
+
self.spacy_entities(model_name=model_name, entities=spacy_entities, n_process=n_process, batch_size=batch_size)
|
|
862
908
|
self.google_phone_numbers()
|
|
863
909
|
self.all_regex()
|
|
864
910
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: idscrub
|
|
3
|
-
Version: 1.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Author: Department for Business and Trade
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -45,7 +45,7 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
|
|
|
45
45
|
from idscrub import IDScrub
|
|
46
46
|
|
|
47
47
|
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
48
|
-
scrubbed_texts = scrub.scrub(scrub_methods=['
|
|
48
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
|
|
49
49
|
|
|
50
50
|
print(scrubbed_texts)
|
|
51
51
|
|
|
@@ -57,17 +57,18 @@ Personal data can either be scrubbed as methods with arguments for extra customi
|
|
|
57
57
|
|
|
58
58
|
| Argument | Scrubs |
|
|
59
59
|
|-------------------------|------------------------------------------------------------------------|
|
|
60
|
-
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation)
|
|
61
|
-
| `
|
|
62
|
-
| `
|
|
63
|
-
| `
|
|
64
|
-
| `
|
|
65
|
-
| `
|
|
66
|
-
| `
|
|
67
|
-
| `
|
|
68
|
-
| `
|
|
69
|
-
| `
|
|
70
|
-
| `
|
|
60
|
+
| `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
|
|
61
|
+
| `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
|
|
62
|
+
| `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
|
|
63
|
+
| `huggingface_entities` | Entities detected by user-selected HuggingFace models |
|
|
64
|
+
| `email_addresses` | Email addresses (e.g. john@email.com) |
|
|
65
|
+
| `titles` | Titles (e.g. Mr., Mrs., Dr.) |
|
|
66
|
+
| `handles` | Social media handles (e.g. @username) |
|
|
67
|
+
| `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
|
|
68
|
+
| `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
|
|
69
|
+
| `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
|
|
70
|
+
| `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
|
|
71
|
+
| `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
|
|
71
72
|
|
|
72
73
|
## Considerations before use
|
|
73
74
|
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
|
|
2
|
+
idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
|
|
3
|
+
idscrub/scrub.py,sha256=PPTKWW-RQxZ5NixRow8nrnX9KjfyZa3tPAP9Jgwnn_M,36631
|
|
4
|
+
idscrub-1.1.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
|
|
5
|
+
notebooks/basic_usage.ipynb,sha256=V62Bz88a9Zo3LO_VxXF4sLw8-MP51ZdVRRNS-zjtNqw,42664
|
|
6
|
+
test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
|
|
7
|
+
test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
|
|
8
|
+
test/test_chain.py,sha256=YbJeA11EBjDNcq5ZZjG4lIIyngrRQZknNsX3Oo0jPMc,1810
|
|
9
|
+
test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
|
|
10
|
+
test/test_huggingface.py,sha256=RTkp8Xsy4w9WoXq2IQ2YOJof41snbOQkM7CVtiVVD0U,839
|
|
11
|
+
test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
|
|
12
|
+
test/test_label.py,sha256=aNkIxJ-_YkBnW8QrBfRxjSsRZWeh5hn_iM7Rk1wrfPU,652
|
|
13
|
+
test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
|
|
14
|
+
test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
|
|
15
|
+
test/test_presidio.py,sha256=BOGghcTWLSQPBhQxO014rO3RG-IL5XEbAaKuGN677pU,1558
|
|
16
|
+
test/test_regex.py,sha256=foc2N4UCi7mGL0EIfp1t-ivgujkXMrmbsnsU77sbWZ0,5424
|
|
17
|
+
test/test_scrub.py,sha256=tMYrIhbyXXKqt24tS1U_kAJT_vZfhOD4DAsf5ZFbEvU,1380
|
|
18
|
+
test/test_spacy.py,sha256=gxJrNpV5B3HydUfoMsbmzRUoiKNs3_zwdSXqbPeW0qA,1846
|
|
19
|
+
idscrub-1.1.0.dist-info/METADATA,sha256=HyPSfPJuFUPOib2fNr3eUtQIcvgJHr3uVNZaZQcXmS8,7003
|
|
20
|
+
idscrub-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
+
idscrub-1.1.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
|
|
22
|
+
idscrub-1.1.0.dist-info/RECORD,,
|
notebooks/basic_usage.ipynb
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
"cell_type": "markdown",
|
|
5
5
|
"metadata": {},
|
|
6
6
|
"source": [
|
|
7
|
-
"### `idscrub` basic usage
|
|
7
|
+
"### `idscrub` basic usage examples"
|
|
8
8
|
]
|
|
9
9
|
},
|
|
10
10
|
{
|
|
@@ -17,11 +17,14 @@
|
|
|
17
17
|
"output_type": "stream",
|
|
18
18
|
"text": [
|
|
19
19
|
"INFO: Texts loaded.\n",
|
|
20
|
-
"INFO: Scrubbing
|
|
21
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
20
|
+
"INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
|
|
21
|
+
"100%|██████████| 2/2 [00:00<00:00, 33.83it/s]\n",
|
|
22
|
+
"INFO: 1 org scrubbed.\n",
|
|
22
23
|
"INFO: 3 person scrubbed.\n",
|
|
23
24
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
24
25
|
"INFO: 1 uk_phone_number scrubbed.\n",
|
|
26
|
+
"INFO: Scrubbing addresses using regex...\n",
|
|
27
|
+
"INFO: 1 uk_address scrubbed.\n",
|
|
25
28
|
"INFO: Scrubbing postcodes using regex...\n",
|
|
26
29
|
"INFO: 1 uk_postcode scrubbed.\n"
|
|
27
30
|
]
|
|
@@ -30,7 +33,7 @@
|
|
|
30
33
|
"name": "stdout",
|
|
31
34
|
"output_type": "stream",
|
|
32
35
|
"text": [
|
|
33
|
-
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I
|
|
36
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
|
|
34
37
|
]
|
|
35
38
|
}
|
|
36
39
|
],
|
|
@@ -40,11 +43,11 @@
|
|
|
40
43
|
"scrub = IDScrub(\n",
|
|
41
44
|
" [\n",
|
|
42
45
|
" \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
|
|
43
|
-
" \"My number is +441111111111 and I
|
|
46
|
+
" \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
|
|
44
47
|
" ]\n",
|
|
45
48
|
")\n",
|
|
46
49
|
"\n",
|
|
47
|
-
"scrubbed_texts = scrub.scrub(scrub_methods=[\"
|
|
50
|
+
"scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_entities\", \"uk_phone_numbers\", \"uk_addresses\", \"uk_postcodes\"])\n",
|
|
48
51
|
"\n",
|
|
49
52
|
"print(scrubbed_texts)"
|
|
50
53
|
]
|
|
@@ -77,7 +80,9 @@
|
|
|
77
80
|
" <th></th>\n",
|
|
78
81
|
" <th>text_id</th>\n",
|
|
79
82
|
" <th>person</th>\n",
|
|
83
|
+
" <th>org</th>\n",
|
|
80
84
|
" <th>uk_phone_number</th>\n",
|
|
85
|
+
" <th>uk_address</th>\n",
|
|
81
86
|
" <th>uk_postcode</th>\n",
|
|
82
87
|
" </tr>\n",
|
|
83
88
|
" </thead>\n",
|
|
@@ -88,12 +93,16 @@
|
|
|
88
93
|
" <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
|
|
89
94
|
" <td>None</td>\n",
|
|
90
95
|
" <td>None</td>\n",
|
|
96
|
+
" <td>None</td>\n",
|
|
97
|
+
" <td>None</td>\n",
|
|
91
98
|
" </tr>\n",
|
|
92
99
|
" <tr>\n",
|
|
93
100
|
" <th>1</th>\n",
|
|
94
101
|
" <td>2</td>\n",
|
|
95
102
|
" <td>None</td>\n",
|
|
103
|
+
" <td>[the Department for Business and Trade]</td>\n",
|
|
96
104
|
" <td>[+441111111111]</td>\n",
|
|
105
|
+
" <td>[15 Elf Road]</td>\n",
|
|
97
106
|
" <td>[AA11 1AA]</td>\n",
|
|
98
107
|
" </tr>\n",
|
|
99
108
|
" </tbody>\n",
|
|
@@ -101,9 +110,13 @@
|
|
|
101
110
|
"</div>"
|
|
102
111
|
],
|
|
103
112
|
"text/plain": [
|
|
104
|
-
" text_id person
|
|
105
|
-
"0 1 [Hamish McDonald, L. Salah, Elena Suárez]
|
|
106
|
-
"1 2 None
|
|
113
|
+
" text_id person \\\n",
|
|
114
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
115
|
+
"1 2 None \n",
|
|
116
|
+
"\n",
|
|
117
|
+
" org uk_phone_number uk_address \\\n",
|
|
118
|
+
"0 None None None \n",
|
|
119
|
+
"1 [the Department for Business and Trade] [+441111111111] [15 Elf Road] \n",
|
|
107
120
|
"\n",
|
|
108
121
|
" uk_postcode \n",
|
|
109
122
|
"0 None \n",
|
|
@@ -136,13 +149,13 @@
|
|
|
136
149
|
"output_type": "stream",
|
|
137
150
|
"text": [
|
|
138
151
|
"INFO: Texts loaded.\n",
|
|
139
|
-
"INFO: Scrubbing using
|
|
140
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
152
|
+
"INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
|
|
153
|
+
"100%|██████████| 2/2 [00:00<00:00, 9.14it/s]\n",
|
|
141
154
|
"INFO: 3 person scrubbed.\n",
|
|
142
155
|
"INFO: 1 location scrubbed.\n",
|
|
143
|
-
"INFO: Scrubbing
|
|
144
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
145
|
-
"INFO:
|
|
156
|
+
"INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
|
|
157
|
+
"100%|██████████| 2/2 [00:00<00:00, 42.62it/s]\n",
|
|
158
|
+
"INFO: 1 org scrubbed.\n",
|
|
146
159
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
147
160
|
"INFO: 0 phone_number scrubbed.\n",
|
|
148
161
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
@@ -153,6 +166,8 @@
|
|
|
153
166
|
"INFO: 0 ip_address scrubbed.\n",
|
|
154
167
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
155
168
|
"INFO: 1 uk_phone_number scrubbed.\n",
|
|
169
|
+
"INFO: Scrubbing addresses using regex...\n",
|
|
170
|
+
"INFO: 1 uk_address scrubbed.\n",
|
|
156
171
|
"INFO: Scrubbing postcodes using regex...\n",
|
|
157
172
|
"INFO: 1 uk_postcode scrubbed.\n",
|
|
158
173
|
"INFO: Scrubbing titles using regex...\n",
|
|
@@ -163,7 +178,7 @@
|
|
|
163
178
|
"name": "stdout",
|
|
164
179
|
"output_type": "stream",
|
|
165
180
|
"text": [
|
|
166
|
-
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I
|
|
181
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
|
|
167
182
|
]
|
|
168
183
|
}
|
|
169
184
|
],
|
|
@@ -173,7 +188,7 @@
|
|
|
173
188
|
"scrub = IDScrub(\n",
|
|
174
189
|
" [\n",
|
|
175
190
|
" \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
|
|
176
|
-
" \"My number is +441111111111 and I
|
|
191
|
+
" \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
|
|
177
192
|
" ]\n",
|
|
178
193
|
")\n",
|
|
179
194
|
"\n",
|
|
@@ -211,7 +226,9 @@
|
|
|
211
226
|
" <th>text_id</th>\n",
|
|
212
227
|
" <th>person</th>\n",
|
|
213
228
|
" <th>location</th>\n",
|
|
229
|
+
" <th>org</th>\n",
|
|
214
230
|
" <th>uk_phone_number</th>\n",
|
|
231
|
+
" <th>uk_address</th>\n",
|
|
215
232
|
" <th>uk_postcode</th>\n",
|
|
216
233
|
" </tr>\n",
|
|
217
234
|
" </thead>\n",
|
|
@@ -223,13 +240,17 @@
|
|
|
223
240
|
" <td>None</td>\n",
|
|
224
241
|
" <td>None</td>\n",
|
|
225
242
|
" <td>None</td>\n",
|
|
243
|
+
" <td>None</td>\n",
|
|
244
|
+
" <td>None</td>\n",
|
|
226
245
|
" </tr>\n",
|
|
227
246
|
" <tr>\n",
|
|
228
247
|
" <th>1</th>\n",
|
|
229
248
|
" <td>2</td>\n",
|
|
230
249
|
" <td>None</td>\n",
|
|
231
250
|
" <td>[Lapland]</td>\n",
|
|
251
|
+
" <td>[Department for Business and Trade]</td>\n",
|
|
232
252
|
" <td>[+441111111111]</td>\n",
|
|
253
|
+
" <td>[15 Elf Road]</td>\n",
|
|
233
254
|
" <td>[AA11 1AA]</td>\n",
|
|
234
255
|
" </tr>\n",
|
|
235
256
|
" </tbody>\n",
|
|
@@ -241,9 +262,13 @@
|
|
|
241
262
|
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
|
|
242
263
|
"1 2 None [Lapland] \n",
|
|
243
264
|
"\n",
|
|
244
|
-
"
|
|
245
|
-
"0 None
|
|
246
|
-
"1 [+441111111111] [
|
|
265
|
+
" org uk_phone_number uk_address \\\n",
|
|
266
|
+
"0 None None None \n",
|
|
267
|
+
"1 [Department for Business and Trade] [+441111111111] [15 Elf Road] \n",
|
|
268
|
+
"\n",
|
|
269
|
+
" uk_postcode \n",
|
|
270
|
+
"0 None \n",
|
|
271
|
+
"1 [AA11 1AA] "
|
|
247
272
|
]
|
|
248
273
|
},
|
|
249
274
|
"execution_count": 4,
|
|
@@ -272,14 +297,15 @@
|
|
|
272
297
|
"output_type": "stream",
|
|
273
298
|
"text": [
|
|
274
299
|
"INFO: Texts loaded.\n",
|
|
275
|
-
"INFO: Scrubbing using
|
|
276
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
300
|
+
"INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
|
|
301
|
+
"100%|██████████| 2/2 [00:00<00:00, 42.58it/s]\n",
|
|
302
|
+
"INFO: 1 org scrubbed.\n",
|
|
277
303
|
"INFO: 3 person scrubbed.\n",
|
|
278
304
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
279
305
|
"INFO: 0 phone_number scrubbed.\n",
|
|
280
306
|
"INFO: Scrubbing custom regex...\n",
|
|
281
307
|
"INFO: 1 custom_regex_1 scrubbed.\n",
|
|
282
|
-
"INFO:
|
|
308
|
+
"INFO: 0 custom_regex_2 scrubbed.\n",
|
|
283
309
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
284
310
|
"INFO: 0 email_address scrubbed.\n",
|
|
285
311
|
"INFO: Scrubbing @user handles using regex...\n",
|
|
@@ -288,6 +314,8 @@
|
|
|
288
314
|
"INFO: 0 ip_address scrubbed.\n",
|
|
289
315
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
290
316
|
"INFO: 1 uk_phone_number scrubbed.\n",
|
|
317
|
+
"INFO: Scrubbing addresses using regex...\n",
|
|
318
|
+
"INFO: 1 uk_address scrubbed.\n",
|
|
291
319
|
"INFO: Scrubbing postcodes using regex...\n",
|
|
292
320
|
"INFO: 1 uk_postcode scrubbed.\n",
|
|
293
321
|
"INFO: Scrubbing titles using regex...\n",
|
|
@@ -298,7 +326,7 @@
|
|
|
298
326
|
"name": "stdout",
|
|
299
327
|
"output_type": "stream",
|
|
300
328
|
"text": [
|
|
301
|
-
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I
|
|
329
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [UNIVERSITY]']\n"
|
|
302
330
|
]
|
|
303
331
|
}
|
|
304
332
|
],
|
|
@@ -308,15 +336,18 @@
|
|
|
308
336
|
"scrub = IDScrub(\n",
|
|
309
337
|
" [\n",
|
|
310
338
|
" \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
|
|
311
|
-
" \"My number is +441111111111 and I
|
|
339
|
+
" \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
|
|
312
340
|
" ]\n",
|
|
313
341
|
")\n",
|
|
314
342
|
"\n",
|
|
315
|
-
"scrub.
|
|
343
|
+
"scrub.spacy_entities()\n",
|
|
316
344
|
"scrub.google_phone_numbers(region=\"GB\")\n",
|
|
345
|
+
"\n",
|
|
346
|
+
"# Remove specific regex pattern(s). This can also be passed to all().\n",
|
|
317
347
|
"scrub.custom_regex(\n",
|
|
318
348
|
" custom_regex_patterns=[r\"Lapland\", r\"ACHILLES\"], custom_replacement_texts=[\"[UNIVERSITY]\", \"[REDACTED]\"]\n",
|
|
319
|
-
")
|
|
349
|
+
")\n",
|
|
350
|
+
"\n",
|
|
320
351
|
"scrubbed_texts = scrub.all_regex()\n",
|
|
321
352
|
"\n",
|
|
322
353
|
"print(scrubbed_texts)"
|
|
@@ -350,9 +381,10 @@
|
|
|
350
381
|
" <th></th>\n",
|
|
351
382
|
" <th>text_id</th>\n",
|
|
352
383
|
" <th>person</th>\n",
|
|
384
|
+
" <th>org</th>\n",
|
|
353
385
|
" <th>custom_regex_1</th>\n",
|
|
354
|
-
" <th>custom_regex_2</th>\n",
|
|
355
386
|
" <th>uk_phone_number</th>\n",
|
|
387
|
+
" <th>uk_address</th>\n",
|
|
356
388
|
" <th>uk_postcode</th>\n",
|
|
357
389
|
" </tr>\n",
|
|
358
390
|
" </thead>\n",
|
|
@@ -365,14 +397,16 @@
|
|
|
365
397
|
" <td>None</td>\n",
|
|
366
398
|
" <td>None</td>\n",
|
|
367
399
|
" <td>None</td>\n",
|
|
400
|
+
" <td>None</td>\n",
|
|
368
401
|
" </tr>\n",
|
|
369
402
|
" <tr>\n",
|
|
370
403
|
" <th>1</th>\n",
|
|
371
404
|
" <td>2</td>\n",
|
|
372
405
|
" <td>None</td>\n",
|
|
406
|
+
" <td>[Department for Business and Trade]</td>\n",
|
|
373
407
|
" <td>[Lapland]</td>\n",
|
|
374
|
-
" <td>[ACHILLES]</td>\n",
|
|
375
408
|
" <td>[+441111111111]</td>\n",
|
|
409
|
+
" <td>[15 Elf Road]</td>\n",
|
|
376
410
|
" <td>[AA11 1AA]</td>\n",
|
|
377
411
|
" </tr>\n",
|
|
378
412
|
" </tbody>\n",
|
|
@@ -380,13 +414,17 @@
|
|
|
380
414
|
"</div>"
|
|
381
415
|
],
|
|
382
416
|
"text/plain": [
|
|
383
|
-
" text_id person
|
|
384
|
-
"0 1 [Hamish McDonald, L. Salah, Elena Suárez]
|
|
385
|
-
"1 2 None
|
|
417
|
+
" text_id person \\\n",
|
|
418
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
419
|
+
"1 2 None \n",
|
|
386
420
|
"\n",
|
|
387
|
-
"
|
|
388
|
-
"0 None None
|
|
389
|
-
"1
|
|
421
|
+
" org custom_regex_1 uk_phone_number \\\n",
|
|
422
|
+
"0 None None None \n",
|
|
423
|
+
"1 [Department for Business and Trade] [Lapland] [+441111111111] \n",
|
|
424
|
+
"\n",
|
|
425
|
+
" uk_address uk_postcode \n",
|
|
426
|
+
"0 None None \n",
|
|
427
|
+
"1 [15 Elf Road] [AA11 1AA] "
|
|
390
428
|
]
|
|
391
429
|
},
|
|
392
430
|
"execution_count": 6,
|
|
@@ -402,7 +440,7 @@
|
|
|
402
440
|
"cell_type": "markdown",
|
|
403
441
|
"metadata": {},
|
|
404
442
|
"source": [
|
|
405
|
-
"
|
|
443
|
+
"### `idscrub` example - using Presidio\n",
|
|
406
444
|
"We can also leverage the power of [Presidio](https://microsoft.github.io/presidio/) and use their entity recognition methods"
|
|
407
445
|
]
|
|
408
446
|
},
|
|
@@ -416,10 +454,10 @@
|
|
|
416
454
|
"output_type": "stream",
|
|
417
455
|
"text": [
|
|
418
456
|
"INFO: Texts loaded.\n",
|
|
419
|
-
"INFO: Scrubbing using
|
|
420
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
421
|
-
"INFO:
|
|
422
|
-
"INFO:
|
|
457
|
+
"INFO: Scrubbing Presidio entities `PERSON, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, IBAN_CODE` using SpaCy model `en_core_web_trf`...\n",
|
|
458
|
+
"100%|██████████| 2/2 [00:00<00:00, 24.36it/s]\n",
|
|
459
|
+
"INFO: 1 iban_code scrubbed.\n",
|
|
460
|
+
"INFO: 3 person scrubbed.\n"
|
|
423
461
|
]
|
|
424
462
|
},
|
|
425
463
|
{
|
|
@@ -436,7 +474,7 @@
|
|
|
436
474
|
"scrub = IDScrub(\n",
|
|
437
475
|
" [\"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\", \"My IBAN code is GB91BKEN10000041610008\"]\n",
|
|
438
476
|
")\n",
|
|
439
|
-
"scrubbed_texts = scrub.
|
|
477
|
+
"scrubbed_texts = scrub.presidio_entities()\n",
|
|
440
478
|
"\n",
|
|
441
479
|
"print(scrubbed_texts)"
|
|
442
480
|
]
|
|
@@ -678,14 +716,11 @@
|
|
|
678
716
|
"text": [
|
|
679
717
|
" 0%| | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
|
|
680
718
|
"INFO: Scrubbing column `Pride and Prejudice`...\n",
|
|
681
|
-
"INFO: Scrubbing using
|
|
682
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
683
|
-
"INFO: 4 person scrubbed.\n",
|
|
684
|
-
"INFO: 4 person scrubbed.\n",
|
|
685
|
-
"INFO: 4 person scrubbed.\n",
|
|
686
|
-
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
687
|
-
"100%|██████████| 5/5 [00:00<00:00, 62.29it/s]\n",
|
|
719
|
+
"INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
|
|
720
|
+
"100%|██████████| 5/5 [00:00<00:00, 23.73it/s]\n",
|
|
688
721
|
"INFO: 4 person scrubbed.\n",
|
|
722
|
+
"INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
|
|
723
|
+
"100%|██████████| 5/5 [00:00<00:00, 77.84it/s]\n",
|
|
689
724
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
690
725
|
"INFO: 0 phone_number scrubbed.\n",
|
|
691
726
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
@@ -696,19 +731,19 @@
|
|
|
696
731
|
"INFO: 0 ip_address scrubbed.\n",
|
|
697
732
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
698
733
|
"INFO: 0 uk_phone_number scrubbed.\n",
|
|
734
|
+
"INFO: Scrubbing addresses using regex...\n",
|
|
735
|
+
"INFO: 0 uk_address scrubbed.\n",
|
|
699
736
|
"INFO: Scrubbing postcodes using regex...\n",
|
|
700
737
|
"INFO: 0 uk_postcode scrubbed.\n",
|
|
701
738
|
"INFO: Scrubbing titles using regex...\n",
|
|
702
739
|
"INFO: 2 title scrubbed.\n",
|
|
703
|
-
" 33%|███▎ | 1/3 [00:02<00:05, 2.
|
|
740
|
+
" 33%|███▎ | 1/3 [00:02<00:05, 2.60s/it]INFO: Texts loaded.\n",
|
|
704
741
|
"INFO: Scrubbing column `The Adventures of Sherlock Holmes`...\n",
|
|
705
|
-
"INFO: Scrubbing using
|
|
706
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
707
|
-
"INFO: 2 person scrubbed.\n",
|
|
708
|
-
"INFO: 2 person scrubbed.\n",
|
|
709
|
-
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
710
|
-
"100%|██████████| 5/5 [00:00<00:00, 82.44it/s]\n",
|
|
742
|
+
"INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
|
|
743
|
+
"100%|██████████| 5/5 [00:00<00:00, 24.22it/s]\n",
|
|
711
744
|
"INFO: 2 person scrubbed.\n",
|
|
745
|
+
"INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
|
|
746
|
+
"100%|██████████| 5/5 [00:00<00:00, 84.78it/s]\n",
|
|
712
747
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
713
748
|
"INFO: 0 phone_number scrubbed.\n",
|
|
714
749
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
@@ -719,21 +754,23 @@
|
|
|
719
754
|
"INFO: 0 ip_address scrubbed.\n",
|
|
720
755
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
721
756
|
"INFO: 0 uk_phone_number scrubbed.\n",
|
|
757
|
+
"INFO: Scrubbing addresses using regex...\n",
|
|
758
|
+
"INFO: 0 uk_address scrubbed.\n",
|
|
722
759
|
"INFO: Scrubbing postcodes using regex...\n",
|
|
723
760
|
"INFO: 0 uk_postcode scrubbed.\n",
|
|
724
761
|
"INFO: Scrubbing titles using regex...\n",
|
|
725
762
|
"INFO: 0 title scrubbed.\n",
|
|
726
|
-
" 67%|██████▋ | 2/3 [00:05<00:02, 2.
|
|
763
|
+
" 67%|██████▋ | 2/3 [00:05<00:02, 2.49s/it]INFO: Texts loaded.\n",
|
|
727
764
|
"INFO: Scrubbing column `Fake book`...\n",
|
|
728
|
-
"INFO: Scrubbing using
|
|
729
|
-
"100%|██████████| 5/5 [00:00<00:00, 13.
|
|
765
|
+
"INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
|
|
766
|
+
"100%|██████████| 5/5 [00:00<00:00, 13.41it/s]\n",
|
|
730
767
|
"INFO: 1 iban_code scrubbed.\n",
|
|
768
|
+
"INFO: 5 url scrubbed.\n",
|
|
731
769
|
"INFO: 2 person scrubbed.\n",
|
|
732
770
|
"INFO: 3 email_address scrubbed.\n",
|
|
733
|
-
"INFO:
|
|
734
|
-
"
|
|
735
|
-
"
|
|
736
|
-
"INFO: 2 person scrubbed.\n",
|
|
771
|
+
"INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
|
|
772
|
+
"100%|██████████| 5/5 [00:00<00:00, 64.57it/s]\n",
|
|
773
|
+
"INFO: 1 org scrubbed.\n",
|
|
737
774
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
738
775
|
"INFO: 0 phone_number scrubbed.\n",
|
|
739
776
|
"INFO: Scrubbing email addresses using regex...\n",
|
|
@@ -744,11 +781,13 @@
|
|
|
744
781
|
"INFO: 0 ip_address scrubbed.\n",
|
|
745
782
|
"INFO: Scrubbing phone numbers using regex...\n",
|
|
746
783
|
"INFO: 0 uk_phone_number scrubbed.\n",
|
|
784
|
+
"INFO: Scrubbing addresses using regex...\n",
|
|
785
|
+
"INFO: 0 uk_address scrubbed.\n",
|
|
747
786
|
"INFO: Scrubbing postcodes using regex...\n",
|
|
748
787
|
"INFO: 4 uk_postcode scrubbed.\n",
|
|
749
788
|
"INFO: Scrubbing titles using regex...\n",
|
|
750
789
|
"INFO: 0 title scrubbed.\n",
|
|
751
|
-
"100%|██████████| 3/3 [00:07<00:00, 2.
|
|
790
|
+
"100%|██████████| 3/3 [00:07<00:00, 2.53s/it]\n"
|
|
752
791
|
]
|
|
753
792
|
},
|
|
754
793
|
{
|
|
@@ -810,7 +849,7 @@
|
|
|
810
849
|
" <td>The business of her life was to get her daught...</td>\n",
|
|
811
850
|
" <td>I am a brain, [PERSON]. The rest of me is a me...</td>\n",
|
|
812
851
|
" <td>Nothing is more painful to the human mind than...</td>\n",
|
|
813
|
-
" <td>A message arrived just as the
|
|
852
|
+
" <td>A message arrived just as the [ORG] clock stru...</td>\n",
|
|
814
853
|
" </tr>\n",
|
|
815
854
|
" <tr>\n",
|
|
816
855
|
" <th>4</th>\n",
|
|
@@ -850,7 +889,7 @@
|
|
|
850
889
|
"0 The letter to [EMAIL_ADDRESS] was stamped with... \n",
|
|
851
890
|
"1 She forwarded the memo from [PERSON] and [PERS... \n",
|
|
852
891
|
"2 The dossier marked confidential came from [EMA... \n",
|
|
853
|
-
"3 A message arrived just as the
|
|
892
|
+
"3 A message arrived just as the [ORG] clock stru... \n",
|
|
854
893
|
"4 They did not expected a reply from [EMAIL_ADDR... "
|
|
855
894
|
]
|
|
856
895
|
},
|
|
@@ -900,6 +939,7 @@
|
|
|
900
939
|
" <th>email_address</th>\n",
|
|
901
940
|
" <th>iban_code</th>\n",
|
|
902
941
|
" <th>url</th>\n",
|
|
942
|
+
" <th>org</th>\n",
|
|
903
943
|
" <th>uk_postcode</th>\n",
|
|
904
944
|
" </tr>\n",
|
|
905
945
|
" </thead>\n",
|
|
@@ -914,6 +954,7 @@
|
|
|
914
954
|
" <td>None</td>\n",
|
|
915
955
|
" <td>None</td>\n",
|
|
916
956
|
" <td>None</td>\n",
|
|
957
|
+
" <td>None</td>\n",
|
|
917
958
|
" </tr>\n",
|
|
918
959
|
" <tr>\n",
|
|
919
960
|
" <th>1</th>\n",
|
|
@@ -925,6 +966,7 @@
|
|
|
925
966
|
" <td>None</td>\n",
|
|
926
967
|
" <td>None</td>\n",
|
|
927
968
|
" <td>None</td>\n",
|
|
969
|
+
" <td>None</td>\n",
|
|
928
970
|
" </tr>\n",
|
|
929
971
|
" <tr>\n",
|
|
930
972
|
" <th>2</th>\n",
|
|
@@ -936,6 +978,7 @@
|
|
|
936
978
|
" <td>None</td>\n",
|
|
937
979
|
" <td>None</td>\n",
|
|
938
980
|
" <td>None</td>\n",
|
|
981
|
+
" <td>None</td>\n",
|
|
939
982
|
" </tr>\n",
|
|
940
983
|
" <tr>\n",
|
|
941
984
|
" <th>3</th>\n",
|
|
@@ -947,6 +990,7 @@
|
|
|
947
990
|
" <td>None</td>\n",
|
|
948
991
|
" <td>None</td>\n",
|
|
949
992
|
" <td>None</td>\n",
|
|
993
|
+
" <td>None</td>\n",
|
|
950
994
|
" </tr>\n",
|
|
951
995
|
" <tr>\n",
|
|
952
996
|
" <th>4</th>\n",
|
|
@@ -958,6 +1002,7 @@
|
|
|
958
1002
|
" <td>None</td>\n",
|
|
959
1003
|
" <td>None</td>\n",
|
|
960
1004
|
" <td>None</td>\n",
|
|
1005
|
+
" <td>None</td>\n",
|
|
961
1006
|
" </tr>\n",
|
|
962
1007
|
" <tr>\n",
|
|
963
1008
|
" <th>5</th>\n",
|
|
@@ -968,6 +1013,7 @@
|
|
|
968
1013
|
" <td>[freddie.mercury@queen.com]</td>\n",
|
|
969
1014
|
" <td>[GB91BKEN10000041610008]</td>\n",
|
|
970
1015
|
" <td>[freddie.me, queen.com]</td>\n",
|
|
1016
|
+
" <td>None</td>\n",
|
|
971
1017
|
" <td>[SW1A 2AA]</td>\n",
|
|
972
1018
|
" </tr>\n",
|
|
973
1019
|
" <tr>\n",
|
|
@@ -979,6 +1025,7 @@
|
|
|
979
1025
|
" <td>None</td>\n",
|
|
980
1026
|
" <td>None</td>\n",
|
|
981
1027
|
" <td>None</td>\n",
|
|
1028
|
+
" <td>None</td>\n",
|
|
982
1029
|
" <td>[SW1A 2WH]</td>\n",
|
|
983
1030
|
" </tr>\n",
|
|
984
1031
|
" <tr>\n",
|
|
@@ -990,6 +1037,7 @@
|
|
|
990
1037
|
" <td>[serena.williams@tennis.com]</td>\n",
|
|
991
1038
|
" <td>None</td>\n",
|
|
992
1039
|
" <td>[tennis.com]</td>\n",
|
|
1040
|
+
" <td>None</td>\n",
|
|
993
1041
|
" <td>[SW19 5AE]</td>\n",
|
|
994
1042
|
" </tr>\n",
|
|
995
1043
|
" <tr>\n",
|
|
@@ -1001,8 +1049,21 @@
|
|
|
1001
1049
|
" <td>[otis.redding@dockofthebay.org]</td>\n",
|
|
1002
1050
|
" <td>None</td>\n",
|
|
1003
1051
|
" <td>[otis.red, dockofthebay.org]</td>\n",
|
|
1052
|
+
" <td>None</td>\n",
|
|
1004
1053
|
" <td>[EH8 8DX]</td>\n",
|
|
1005
1054
|
" </tr>\n",
|
|
1055
|
+
" <tr>\n",
|
|
1056
|
+
" <th>9</th>\n",
|
|
1057
|
+
" <td>D</td>\n",
|
|
1058
|
+
" <td>Fake book</td>\n",
|
|
1059
|
+
" <td>None</td>\n",
|
|
1060
|
+
" <td>None</td>\n",
|
|
1061
|
+
" <td>None</td>\n",
|
|
1062
|
+
" <td>None</td>\n",
|
|
1063
|
+
" <td>None</td>\n",
|
|
1064
|
+
" <td>[Downing Street]</td>\n",
|
|
1065
|
+
" <td>None</td>\n",
|
|
1066
|
+
" </tr>\n",
|
|
1006
1067
|
" </tbody>\n",
|
|
1007
1068
|
"</table>\n",
|
|
1008
1069
|
"</div>"
|
|
@@ -1018,6 +1079,7 @@
|
|
|
1018
1079
|
"6 B Fake book [Mick Jagger, David Bowie] None \n",
|
|
1019
1080
|
"7 C Fake book None None \n",
|
|
1020
1081
|
"8 E Fake book None None \n",
|
|
1082
|
+
"9 D Fake book None None \n",
|
|
1021
1083
|
"\n",
|
|
1022
1084
|
" email_address iban_code \\\n",
|
|
1023
1085
|
"0 None None \n",
|
|
@@ -1029,17 +1091,19 @@
|
|
|
1029
1091
|
"6 None None \n",
|
|
1030
1092
|
"7 [serena.williams@tennis.com] None \n",
|
|
1031
1093
|
"8 [otis.redding@dockofthebay.org] None \n",
|
|
1094
|
+
"9 None None \n",
|
|
1032
1095
|
"\n",
|
|
1033
|
-
" url uk_postcode \n",
|
|
1034
|
-
"0 None None \n",
|
|
1035
|
-
"1 None None \n",
|
|
1036
|
-
"2 None None \n",
|
|
1037
|
-
"3 None None \n",
|
|
1038
|
-
"4 None None \n",
|
|
1039
|
-
"5 [freddie.me, queen.com] [SW1A 2AA] \n",
|
|
1040
|
-
"6 None [SW1A 2WH] \n",
|
|
1041
|
-
"7 [tennis.com] [SW19 5AE] \n",
|
|
1042
|
-
"8 [otis.red, dockofthebay.org] [EH8 8DX] "
|
|
1096
|
+
" url org uk_postcode \n",
|
|
1097
|
+
"0 None None None \n",
|
|
1098
|
+
"1 None None None \n",
|
|
1099
|
+
"2 None None None \n",
|
|
1100
|
+
"3 None None None \n",
|
|
1101
|
+
"4 None None None \n",
|
|
1102
|
+
"5 [freddie.me, queen.com] None [SW1A 2AA] \n",
|
|
1103
|
+
"6 None None [SW1A 2WH] \n",
|
|
1104
|
+
"7 [tennis.com] None [SW19 5AE] \n",
|
|
1105
|
+
"8 [otis.red, dockofthebay.org] None [EH8 8DX] \n",
|
|
1106
|
+
"9 None [Downing Street] None "
|
|
1043
1107
|
]
|
|
1044
1108
|
},
|
|
1045
1109
|
"execution_count": 11,
|
test/test_chain.py
CHANGED
|
@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
|
|
|
6
6
|
def test_chain(scrub_object):
|
|
7
7
|
scrub_object.uk_phone_numbers()
|
|
8
8
|
scrub_object.uk_postcodes()
|
|
9
|
-
scrubbed = scrub_object.
|
|
9
|
+
scrubbed = scrub_object.spacy_entities()
|
|
10
10
|
|
|
11
11
|
assert scrubbed == [
|
|
12
12
|
"Our names are [PERSON], [PERSON], and [PERSON].",
|
|
@@ -38,7 +38,7 @@ def test_chain_order(scrub_object):
|
|
|
38
38
|
def test_get_scrubbed_data_chain(scrub_object):
|
|
39
39
|
scrub_object.uk_phone_numbers()
|
|
40
40
|
scrub_object.uk_postcodes()
|
|
41
|
-
scrub_object.
|
|
41
|
+
scrub_object.spacy_entities()
|
|
42
42
|
|
|
43
43
|
df = scrub_object.get_scrubbed_data()
|
|
44
44
|
|
test/test_huggingface.py
CHANGED
|
@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
|
|
|
6
6
|
|
|
7
7
|
def test_huggingface():
|
|
8
8
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
9
|
-
scrubbed = scrub.
|
|
9
|
+
scrubbed = scrub.huggingface_entities()
|
|
10
10
|
assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
|
|
11
11
|
|
|
12
12
|
|
|
@@ -14,12 +14,12 @@ def test_huggingface_error():
|
|
|
14
14
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
15
15
|
|
|
16
16
|
with pytest.raises(OSError):
|
|
17
|
-
scrub.
|
|
17
|
+
scrub.huggingface_entities(hf_model_path="not_a_path")
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def test_huggingface_empty():
|
|
21
21
|
scrub = IDScrub([" ", "John Smith", ""])
|
|
22
|
-
scrubbed = scrub.
|
|
22
|
+
scrubbed = scrub.huggingface_entities()
|
|
23
23
|
|
|
24
24
|
assert scrubbed == [" ", "[PERSON]", ""]
|
|
25
25
|
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
test/test_label.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
def test_label(scrub_object_all):
|
|
2
2
|
for i, scrub_method in enumerate(
|
|
3
|
-
["
|
|
3
|
+
["uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
|
|
4
4
|
):
|
|
5
5
|
method = getattr(scrub_object_all, scrub_method)
|
|
6
6
|
method(label="test")
|
|
@@ -4,32 +4,32 @@ from pandas.testing import assert_frame_equal
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
7
|
-
def
|
|
7
|
+
def test_presidio():
|
|
8
8
|
scrub = IDScrub(
|
|
9
9
|
["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
|
|
10
10
|
)
|
|
11
|
-
scrubbed_texts = scrub.
|
|
11
|
+
scrubbed_texts = scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
|
|
12
12
|
|
|
13
13
|
assert scrubbed_texts == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
def
|
|
16
|
+
def test_presidio_map():
|
|
17
17
|
scrub = IDScrub(
|
|
18
18
|
["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
|
|
19
19
|
)
|
|
20
|
-
scrubbed_texts = scrub.
|
|
21
|
-
|
|
20
|
+
scrubbed_texts = scrub.presidio_entities(
|
|
21
|
+
entities=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
|
|
22
22
|
)
|
|
23
23
|
|
|
24
24
|
assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
def
|
|
27
|
+
def test_presidio_get_data():
|
|
28
28
|
scrub = IDScrub(
|
|
29
29
|
["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
|
|
30
30
|
)
|
|
31
31
|
|
|
32
|
-
scrub.
|
|
32
|
+
scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
|
|
33
33
|
|
|
34
34
|
df = scrub.get_scrubbed_data()
|
|
35
35
|
|
test/test_regex.py
CHANGED
|
@@ -63,6 +63,52 @@ def test_handles():
|
|
|
63
63
|
assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
|
|
64
64
|
|
|
65
65
|
|
|
66
|
+
def test_uk_addresses():
|
|
67
|
+
scrub = IDScrub(
|
|
68
|
+
[
|
|
69
|
+
"221B Baker Street",
|
|
70
|
+
"12 high road",
|
|
71
|
+
"Flat 3B, 47 King's Court",
|
|
72
|
+
"12–14 High Street",
|
|
73
|
+
"5a-7a Church Lane",
|
|
74
|
+
"1/2 Main Street",
|
|
75
|
+
"10 St John’s Rd",
|
|
76
|
+
"33 Queen-Anne Walk",
|
|
77
|
+
"8 Deansgate Ct",
|
|
78
|
+
]
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
scrubbed = scrub.uk_addresses()
|
|
82
|
+
assert scrubbed == [
|
|
83
|
+
"[ADDRESS]",
|
|
84
|
+
"[ADDRESS]",
|
|
85
|
+
"[ADDRESS]",
|
|
86
|
+
"[ADDRESS]",
|
|
87
|
+
"[ADDRESS]",
|
|
88
|
+
"[ADDRESS]",
|
|
89
|
+
"[ADDRESS]",
|
|
90
|
+
"[ADDRESS]",
|
|
91
|
+
"[ADDRESS]",
|
|
92
|
+
]
|
|
93
|
+
|
|
94
|
+
negative_tests = [
|
|
95
|
+
"12 High",
|
|
96
|
+
"Baker Street",
|
|
97
|
+
"High Road 12",
|
|
98
|
+
"Go to the high road now",
|
|
99
|
+
"500 the big building near river",
|
|
100
|
+
"I walked the long road home",
|
|
101
|
+
"12b misspelledstreet",
|
|
102
|
+
"London SW1A 1AA",
|
|
103
|
+
"12,,, High?",
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
scrub = IDScrub(negative_tests)
|
|
107
|
+
|
|
108
|
+
scrubbed = scrub.uk_addresses()
|
|
109
|
+
assert scrubbed == negative_tests
|
|
110
|
+
|
|
111
|
+
|
|
66
112
|
def test_claimants():
|
|
67
113
|
scrub = IDScrub(
|
|
68
114
|
texts=[
|
test/test_scrub.py
CHANGED
|
@@ -5,7 +5,7 @@ from pandas.testing import assert_frame_equal
|
|
|
5
5
|
|
|
6
6
|
# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
7
7
|
def test_scrub(scrub_object):
|
|
8
|
-
scrubbed = scrub_object.scrub(scrub_methods=["
|
|
8
|
+
scrubbed = scrub_object.scrub(scrub_methods=["spacy_entities", "uk_phone_numbers", "uk_postcodes"])
|
|
9
9
|
assert scrubbed == [
|
|
10
10
|
"Our names are [PERSON], [PERSON], and [PERSON].",
|
|
11
11
|
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
@@ -15,7 +15,7 @@ def test_scrub(scrub_object):
|
|
|
15
15
|
def test_scrub_text_id():
|
|
16
16
|
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
|
|
17
17
|
|
|
18
|
-
scrub.scrub(scrub_methods=["
|
|
18
|
+
scrub.scrub(scrub_methods=["spacy_entities"])
|
|
19
19
|
|
|
20
20
|
df = scrub.get_scrubbed_data()
|
|
21
21
|
|
|
@@ -38,7 +38,7 @@ def test_scrub_get_scrubbed_data(scrub_object):
|
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def test_scrub_order(scrub_object):
|
|
41
|
-
scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "
|
|
41
|
+
scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_entities"])
|
|
42
42
|
|
|
43
43
|
assert scrub_object.get_scrubbed_data().columns.to_list() == [
|
|
44
44
|
"text_id",
|
test/test_spacy.py
CHANGED
|
@@ -7,7 +7,7 @@ from pandas.testing import assert_frame_equal
|
|
|
7
7
|
# Note: This test will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
8
8
|
def test_spacy():
|
|
9
9
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
10
|
-
scrubbed = scrub.
|
|
10
|
+
scrubbed = scrub.spacy_entities(entities=["PERSON"], model_name="en_core_web_trf")
|
|
11
11
|
assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
|
|
12
12
|
|
|
13
13
|
|
|
@@ -15,12 +15,39 @@ def test_spacy_error():
|
|
|
15
15
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
16
16
|
|
|
17
17
|
with pytest.raises(ValueError):
|
|
18
|
-
scrub.
|
|
18
|
+
scrub.spacy_entities(model_name="not_a_model")
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def test_spacy_empty():
|
|
22
22
|
scrub = IDScrub([" ", "John Smith", ""])
|
|
23
|
-
scrubbed = scrub.
|
|
23
|
+
scrubbed = scrub.spacy_entities()
|
|
24
24
|
|
|
25
25
|
assert scrubbed == [" ", "[PERSON]", ""]
|
|
26
26
|
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_spacy_map():
|
|
30
|
+
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
|
|
31
|
+
scrubbed_texts = scrub.spacy_entities(
|
|
32
|
+
entities=["PERSON", "ORG"], replacement_map={"PERSON": "[PHELLO]", "ORG": "[SPACE]"}
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My company code is [SPACE]."]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_spacy_get_data():
|
|
39
|
+
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
|
|
40
|
+
|
|
41
|
+
scrub.spacy_entities(entities=["PERSON", "ORG"])
|
|
42
|
+
|
|
43
|
+
df = scrub.get_scrubbed_data()
|
|
44
|
+
|
|
45
|
+
expected_df = pd.DataFrame(
|
|
46
|
+
{
|
|
47
|
+
"text_id": {0: 1, 1: 2},
|
|
48
|
+
"person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
|
|
49
|
+
"org": {0: None, 1: ["NASA"]},
|
|
50
|
+
}
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
assert_frame_equal(df, expected_df)
|
idscrub-1.0.1.dist-info/RECORD
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
|
|
2
|
-
idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
|
|
3
|
-
idscrub/scrub.py,sha256=VqVqcChbbxMEKJR6Aci971dqG-RmD48otrp9sG2dX0o,34443
|
|
4
|
-
idscrub-1.0.1.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
|
|
5
|
-
notebooks/basic_usage.ipynb,sha256=XTBxdtu2F0S99V2lntUEeFj6SN4GRVm4qKvqOhs7nec,38777
|
|
6
|
-
test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
|
|
7
|
-
test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
|
|
8
|
-
test/test_chain.py,sha256=tGxcG5zRMcX22RfcrimqX6Le2iFPH9NqfZy7Idhelps,1808
|
|
9
|
-
test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
|
|
10
|
-
test/test_huggingface.py,sha256=OGwWSz_tzcynuRFXOdV4H4ProKnekYMdtZJviXEejiA,836
|
|
11
|
-
test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
|
|
12
|
-
test/test_label.py,sha256=aTGmtAWSLHrgoVBbCFUCqj52LmlCEKN6owycOyfVNpQ,669
|
|
13
|
-
test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
|
|
14
|
-
test/test_persidio.py,sha256=rkqiUr-vYnfCf7Xt0gNo2VQK2gi5JKP7ThSlT803swc,1558
|
|
15
|
-
test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
|
|
16
|
-
test/test_regex.py,sha256=zuq8g_8F_P5oCA2ChU5wUIFEWjT9LSYB0S_U1rBpTn4,4388
|
|
17
|
-
test/test_scrub.py,sha256=MWpan5cWIGeNPJCvTwtYe-iZeoIjS_fZMIg46ZVrkJo,1377
|
|
18
|
-
test/test_spacy.py,sha256=KHalx16GYHmCaQUU1O5bLMP95SLTu1007fJK1oq__v4,932
|
|
19
|
-
idscrub-1.0.1.dist-info/METADATA,sha256=mRpiv1ew3UV0ch6-ldQoLC744RVQ-wVS--KgBg2OpmI,7201
|
|
20
|
-
idscrub-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
-
idscrub-1.0.1.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
|
|
22
|
-
idscrub-1.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|