idscrub 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
idscrub/scrub.py CHANGED
@@ -453,6 +453,24 @@ class IDScrub:
453
453
 
454
454
  return self.scrub_regex(pattern, replacement_text, label=label)
455
455
 
456
+ def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
457
+ """
458
+ Removes addresses.
459
+ e.g. `10 Downing Street` scrubbed
460
+
461
+ Args:
462
+ replacement_text (str): The replacement text for the removed text.
463
+ label (str): Label for the personal data removed.
464
+
465
+ Returns:
466
+ list[str]: The input list of text with addresses replaced.
467
+ """
468
+
469
+ self.logger.info("Scrubbing addresses using regex...")
470
+ pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
471
+
472
+ return self.scrub_regex(pattern, replacement_text, label)
473
+
456
474
  def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
457
475
  """
458
476
  Removes claimant names from employment tribunal texts.
@@ -528,64 +546,86 @@ class IDScrub:
528
546
 
529
547
  return model
530
548
 
531
- def spacy_persons(
549
+ def spacy_entities(
532
550
  self,
533
551
  model_name: str = "en_core_web_trf",
552
+ entities: list[str] = ["PERSON", "ORG", "NORP"],
553
+ replacement_map: str = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
554
+ label_prefix: str = None,
534
555
  n_process: int = 1,
535
556
  batch_size: int = 1000,
536
- replacement_text: str = "[PERSON]",
537
- label: str = "person",
538
557
  ) -> list[str]:
539
558
  """
540
- Remove PERSON entities using a Spacy model.
559
+ Remove SpaCy entities using a given SpaCy model.
560
+ Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
541
561
  Note: only "en_core_web_trf" has been evaluated.
542
562
 
543
563
  Args:
544
564
  model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
565
+ entities (list[str]): Which SpaCy entities to scrub (based on SpaCy entity keys).
566
+ replacement_map (dict): Mapping of entity type to replacement text (e.g. {'PERSON': '[PERSON]'}). Keys should match `entities`.
567
+ label_prefix (str): Prefix for the label of the SpaCy entity removed, e.g. `{label_prefix}_person`.
545
568
  n_process (int): Number of parallel processes.
546
569
  batch_size (int): The number of texts in each batch.
547
- replacement_text (str): The replacement text for the removed text.
548
- label (str): Label for the personal data removed.
549
570
 
550
571
  Returns:
551
572
  list[str]: The input list of text with the selected entities scrubbed.
552
573
  """
553
- self.logger.info(f"Scrubbing names using SpaCy model `{model_name}`...")
554
574
 
555
- texts = self.get_texts()
575
+ self.logger.info(
576
+ f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
577
+ )
556
578
 
557
- if self.replacement_text:
558
- replacement_text = self.replacement_text
579
+ texts = self.get_texts()
559
580
 
560
581
  cleaned_texts = []
582
+ labels = []
561
583
 
562
584
  nlp = self.get_spacy_model(model_name)
563
585
  stripped_texts = [s.strip() if s.isspace() else s for s in texts]
564
586
  documents = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
565
587
 
566
588
  for i, (ids, doc, stripped_text) in tqdm(
567
- enumerate((zip(self.text_ids, documents, stripped_texts))), total=len(texts)
589
+ enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
568
590
  ):
569
- if stripped_text == "":
591
+ if not stripped_text:
570
592
  cleaned_texts.append(texts[i])
571
593
  continue
572
594
 
573
- # Collect person entities
574
- person_entities = [
575
- ent for ent in doc.ents if ent.label_ == "PERSON" and ent.text not in {"PERSON", "HANDLE"}
576
- ]
577
- self.scrubbed_data.extend({self.text_id_name: ids, label: ent.text} for ent in person_entities)
595
+ all_found_entities = []
596
+
597
+ for entity_type in entities:
598
+ found = [
599
+ ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
600
+ ]
601
+
602
+ for ent in found:
603
+ label = ent.label_.lower()
604
+ if label_prefix:
605
+ label = f"{label_prefix}_{label}"
606
+ labels.append(label)
607
+ self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
608
+
609
+ if self.replacement_text:
610
+ all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
611
+ elif replacement_map:
612
+ all_found_entities.extend(
613
+ (ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
614
+ )
615
+ else:
616
+ all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
578
617
 
579
- # Remove person entities
580
618
  cleaned = stripped_text
581
- for ent in sorted(person_entities, key=lambda x: [x.start_char], reverse=True):
582
- cleaned = cleaned[: ent.start_char] + replacement_text + cleaned[ent.end_char :]
619
+
620
+ for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
621
+ cleaned = cleaned[:start] + repl + cleaned[end:]
583
622
 
584
623
  cleaned_texts.append(cleaned)
585
624
 
586
625
  self.cleaned_texts = cleaned_texts
587
626
 
588
- self.log_message(label)
627
+ for label in set(labels):
628
+ self.log_message(label)
589
629
 
590
630
  return cleaned_texts
591
631
 
@@ -600,7 +640,7 @@ class IDScrub:
600
640
  Note: No Hugging Face models have been evaluated for performance.
601
641
 
602
642
  Args:
603
- hf_model_path (str): Path to the Hugging Face model on the DBT mirror.
643
+ hf_model_path (str): Path to the Hugging Face model.
604
644
  Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been evaluated.
605
645
  download_directory (str): Directory in which to save the model.
606
646
  Default is current working directory.
@@ -624,20 +664,21 @@ class IDScrub:
624
664
 
625
665
  return tokenizer
626
666
 
627
- def huggingface_persons(
667
+ def huggingface_entities(
628
668
  self,
629
669
  hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
630
670
  download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
671
+ entity="PER",
631
672
  replacement_text: str = "[PERSON]",
632
673
  label: str = "person",
633
674
  batch_size: int = 8,
634
675
  ) -> list[str]:
635
676
  """
636
- Remove PERSON entities using a Hugging Face model.
677
+ Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
637
678
  Note: No Hugging Face models have been evaluated for performance.
638
679
 
639
680
  Args:
640
- hf_model_path (str): Path to the Hugging Face model on the DBT mirror.
681
+ hf_model_path (str): Path to the Hugging Face model.
641
682
  Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
642
683
  download_directory (str): Directory in which to save the model.
643
684
  Default is current working directory.
@@ -679,7 +720,7 @@ class IDScrub:
679
720
  continue
680
721
 
681
722
  person_entities = [
682
- ent for ent in entities if ent["entity_group"] == "PER" and ent["word"] not in {"HANDLE", "PERSON"}
723
+ ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
683
724
  ]
684
725
  self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
685
726
 
@@ -695,10 +736,10 @@ class IDScrub:
695
736
 
696
737
  return cleaned_texts
697
738
 
698
- def presidio(
739
+ def presidio_entities(
699
740
  self,
700
741
  model_name: str = "en_core_web_trf",
701
- entities_to_scrub: list[str] = [
742
+ entities: list[str] = [
702
743
  "PERSON",
703
744
  "UK_NINO",
704
745
  "UK_NHS",
@@ -718,15 +759,18 @@ class IDScrub:
718
759
 
719
760
  Args:
720
761
  model_name (str): spaCy model to use
721
- entities_to_scrub (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
762
+ entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
722
763
  replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
723
764
  label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
765
+ Useful if you wish to identify data as having been scrubbed by Presidio.
724
766
 
725
767
  Returns:
726
768
  list[str]: The input list of text with entities replaced.
727
769
  """
728
770
 
729
- self.logger.info("Scrubbing using Presidio...")
771
+ self.logger.info(
772
+ f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
773
+ )
730
774
 
731
775
  texts = self.get_texts()
732
776
 
@@ -744,7 +788,7 @@ class IDScrub:
744
788
  anonymizer = AnonymizerEngine()
745
789
 
746
790
  cleaned_texts = []
747
- unique_labels = []
791
+ all_labels = []
748
792
 
749
793
  stripped_texts = [s.strip() if s.isspace() else s for s in texts]
750
794
 
@@ -754,14 +798,15 @@ class IDScrub:
754
798
  continue
755
799
 
756
800
  results = analyzer.analyze(text=stripped_text, language="en")
757
- results = [r for r in results if r.entity_type in entities_to_scrub]
801
+ results = [r for r in results if r.entity_type in entities]
758
802
 
759
803
  if label_prefix:
760
804
  labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
761
805
  else:
762
806
  labels = [f"{res.entity_type.lower()}" for res in results]
763
807
 
764
- unique_labels.append(list(set(labels)))
808
+ for label in labels:
809
+ all_labels.append(label)
765
810
 
766
811
  self.scrubbed_data.extend(
767
812
  {self.text_id_name: ids, label: stripped_text[res.start : res.end]}
@@ -788,9 +833,8 @@ class IDScrub:
788
833
 
789
834
  self.cleaned_texts = cleaned_texts
790
835
 
791
- for label in unique_labels:
792
- if label:
793
- self.log_message(label[0])
836
+ for label in set(all_labels):
837
+ self.log_message(label)
794
838
 
795
839
  return cleaned_texts
796
840
 
@@ -810,6 +854,7 @@ class IDScrub:
810
854
  self.handles()
811
855
  self.ip_addresses()
812
856
  self.uk_phone_numbers()
857
+ self.uk_addresses()
813
858
  self.uk_postcodes()
814
859
  self.titles()
815
860
 
@@ -820,7 +865,8 @@ class IDScrub:
820
865
  custom_regex_patterns: list = None,
821
866
  custom_replacement_texts: list[str] = None,
822
867
  model_name: str = "en_core_web_trf",
823
- presidio_entities_to_scrub: list[str] = [
868
+ spacy_entities: list[str] = ["PERSON", "ORG", "NORP"],
869
+ presidio_entities: list[str] = [
824
870
  "PERSON",
825
871
  "EMAIL_ADDRESS",
826
872
  "UK_NINO",
@@ -857,8 +903,8 @@ class IDScrub:
857
903
  custom_replacement_texts=custom_replacement_texts,
858
904
  )
859
905
 
860
- self.presidio(model_name=model_name, entities_to_scrub=presidio_entities_to_scrub)
861
- self.spacy_persons(model_name=model_name, n_process=n_process, batch_size=batch_size)
906
+ self.presidio_entities(model_name=model_name, entities=presidio_entities)
907
+ self.spacy_entities(model_name=model_name, entities=spacy_entities, n_process=n_process, batch_size=batch_size)
862
908
  self.google_phone_numbers()
863
909
  self.all_regex()
864
910
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Author: Department for Business and Trade
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -45,12 +45,30 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
45
45
  from idscrub import IDScrub
46
46
 
47
47
  scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
48
- scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
48
+ scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
49
49
 
50
50
  print(scrubbed_texts)
51
51
 
52
52
  # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
53
53
  ```
54
+ ## Personal data types supported
55
+
56
+ Personal data can be scrubbed either by calling methods with arguments for extra customisation, e.g. `IDScrub.google_phone_numbers(region="GB")`, or by passing method names as string arguments with default configurations (see above). The method name and its string representation are the same.
57
+
58
+ | Argument | Scrubs |
59
+ |-------------------------|------------------------------------------------------------------------|
60
+ | `all` | All supported personal data types (see `IDScrub.all()` for further customisation) |
61
+ | `spacy_entities` | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
62
+ | `presidio_entities` | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
63
+ | `huggingface_entities` | Entities detected by user-selected HuggingFace models |
64
+ | `email_addresses` | Email addresses (e.g. john@email.com) |
65
+ | `titles` | Titles (e.g. Mr., Mrs., Dr.) |
66
+ | `handles` | Social media handles (e.g. @username) |
67
+ | `ip_addresses` | IP addresses (e.g. 8.8.8.8) |
68
+ | `uk_postcodes` | UK postal codes (e.g. SW1A 2AA) |
69
+ | `uk_addresses` | UK addresses (e.g. 10 Downing Street) |
70
+ | `uk_phone_numbers` | UK phone numbers (e.g. +441111111111) |
71
+ | `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
54
72
 
55
73
  ## Considerations before use
56
74
 
@@ -0,0 +1,22 @@
1
+ idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
2
+ idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
3
+ idscrub/scrub.py,sha256=PPTKWW-RQxZ5NixRow8nrnX9KjfyZa3tPAP9Jgwnn_M,36631
4
+ idscrub-1.1.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
5
+ notebooks/basic_usage.ipynb,sha256=V62Bz88a9Zo3LO_VxXF4sLw8-MP51ZdVRRNS-zjtNqw,42664
6
+ test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
7
+ test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
8
+ test/test_chain.py,sha256=YbJeA11EBjDNcq5ZZjG4lIIyngrRQZknNsX3Oo0jPMc,1810
9
+ test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
10
+ test/test_huggingface.py,sha256=RTkp8Xsy4w9WoXq2IQ2YOJof41snbOQkM7CVtiVVD0U,839
11
+ test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
12
+ test/test_label.py,sha256=aNkIxJ-_YkBnW8QrBfRxjSsRZWeh5hn_iM7Rk1wrfPU,652
13
+ test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
14
+ test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
15
+ test/test_presidio.py,sha256=BOGghcTWLSQPBhQxO014rO3RG-IL5XEbAaKuGN677pU,1558
16
+ test/test_regex.py,sha256=foc2N4UCi7mGL0EIfp1t-ivgujkXMrmbsnsU77sbWZ0,5424
17
+ test/test_scrub.py,sha256=tMYrIhbyXXKqt24tS1U_kAJT_vZfhOD4DAsf5ZFbEvU,1380
18
+ test/test_spacy.py,sha256=gxJrNpV5B3HydUfoMsbmzRUoiKNs3_zwdSXqbPeW0qA,1846
19
+ idscrub-1.1.0.dist-info/METADATA,sha256=HyPSfPJuFUPOib2fNr3eUtQIcvgJHr3uVNZaZQcXmS8,7003
20
+ idscrub-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
+ idscrub-1.1.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
22
+ idscrub-1.1.0.dist-info/RECORD,,
@@ -4,7 +4,7 @@
4
4
  "cell_type": "markdown",
5
5
  "metadata": {},
6
6
  "source": [
7
- "### `idscrub` basic usage example"
7
+ "### `idscrub` basic usage examples"
8
8
  ]
9
9
  },
10
10
  {
@@ -17,11 +17,14 @@
17
17
  "output_type": "stream",
18
18
  "text": [
19
19
  "INFO: Texts loaded.\n",
20
- "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
21
- "100%|██████████| 2/2 [00:00<00:00, 42.63it/s]\n",
20
+ "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
21
+ "100%|██████████| 2/2 [00:00<00:00, 33.83it/s]\n",
22
+ "INFO: 1 org scrubbed.\n",
22
23
  "INFO: 3 person scrubbed.\n",
23
24
  "INFO: Scrubbing phone numbers using regex...\n",
24
25
  "INFO: 1 uk_phone_number scrubbed.\n",
26
+ "INFO: Scrubbing addresses using regex...\n",
27
+ "INFO: 1 uk_address scrubbed.\n",
25
28
  "INFO: Scrubbing postcodes using regex...\n",
26
29
  "INFO: 1 uk_postcode scrubbed.\n"
27
30
  ]
@@ -30,7 +33,7 @@
30
33
  "name": "stdout",
31
34
  "output_type": "stream",
32
35
  "text": [
33
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], Lapland.']\n"
36
+ "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
34
37
  ]
35
38
  }
36
39
  ],
@@ -40,11 +43,11 @@
40
43
  "scrub = IDScrub(\n",
41
44
  " [\n",
42
45
  " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
43
- " \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
46
+ " \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
44
47
  " ]\n",
45
48
  ")\n",
46
49
  "\n",
47
- "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_persons\", \"uk_phone_numbers\", \"uk_postcodes\"])\n",
50
+ "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_entities\", \"uk_phone_numbers\", \"uk_addresses\", \"uk_postcodes\"])\n",
48
51
  "\n",
49
52
  "print(scrubbed_texts)"
50
53
  ]
@@ -77,7 +80,9 @@
77
80
  " <th></th>\n",
78
81
  " <th>text_id</th>\n",
79
82
  " <th>person</th>\n",
83
+ " <th>org</th>\n",
80
84
  " <th>uk_phone_number</th>\n",
85
+ " <th>uk_address</th>\n",
81
86
  " <th>uk_postcode</th>\n",
82
87
  " </tr>\n",
83
88
  " </thead>\n",
@@ -88,12 +93,16 @@
88
93
  " <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
89
94
  " <td>None</td>\n",
90
95
  " <td>None</td>\n",
96
+ " <td>None</td>\n",
97
+ " <td>None</td>\n",
91
98
  " </tr>\n",
92
99
  " <tr>\n",
93
100
  " <th>1</th>\n",
94
101
  " <td>2</td>\n",
95
102
  " <td>None</td>\n",
103
+ " <td>[the Department for Business and Trade]</td>\n",
96
104
  " <td>[+441111111111]</td>\n",
105
+ " <td>[15 Elf Road]</td>\n",
97
106
  " <td>[AA11 1AA]</td>\n",
98
107
  " </tr>\n",
99
108
  " </tbody>\n",
@@ -101,9 +110,13 @@
101
110
  "</div>"
102
111
  ],
103
112
  "text/plain": [
104
- " text_id person uk_phone_number \\\n",
105
- "0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
106
- "1 2 None [+441111111111] \n",
113
+ " text_id person \\\n",
114
+ "0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
115
+ "1 2 None \n",
116
+ "\n",
117
+ " org uk_phone_number uk_address \\\n",
118
+ "0 None None None \n",
119
+ "1 [the Department for Business and Trade] [+441111111111] [15 Elf Road] \n",
107
120
  "\n",
108
121
  " uk_postcode \n",
109
122
  "0 None \n",
@@ -136,13 +149,13 @@
136
149
  "output_type": "stream",
137
150
  "text": [
138
151
  "INFO: Texts loaded.\n",
139
- "INFO: Scrubbing using Presidio...\n",
140
- "100%|██████████| 2/2 [00:00<00:00, 14.67it/s]\n",
152
+ "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
153
+ "100%|██████████| 2/2 [00:00<00:00, 9.14it/s]\n",
141
154
  "INFO: 3 person scrubbed.\n",
142
155
  "INFO: 1 location scrubbed.\n",
143
- "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
144
- "100%|██████████| 2/2 [00:00<00:00, 48.96it/s]\n",
145
- "INFO: 3 person scrubbed.\n",
156
+ "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
157
+ "100%|██████████| 2/2 [00:00<00:00, 42.62it/s]\n",
158
+ "INFO: 1 org scrubbed.\n",
146
159
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
147
160
  "INFO: 0 phone_number scrubbed.\n",
148
161
  "INFO: Scrubbing email addresses using regex...\n",
@@ -153,6 +166,8 @@
153
166
  "INFO: 0 ip_address scrubbed.\n",
154
167
  "INFO: Scrubbing phone numbers using regex...\n",
155
168
  "INFO: 1 uk_phone_number scrubbed.\n",
169
+ "INFO: Scrubbing addresses using regex...\n",
170
+ "INFO: 1 uk_address scrubbed.\n",
156
171
  "INFO: Scrubbing postcodes using regex...\n",
157
172
  "INFO: 1 uk_postcode scrubbed.\n",
158
173
  "INFO: Scrubbing titles using regex...\n",
@@ -163,7 +178,7 @@
163
178
  "name": "stdout",
164
179
  "output_type": "stream",
165
180
  "text": [
166
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']\n"
181
+ "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
167
182
  ]
168
183
  }
169
184
  ],
@@ -173,7 +188,7 @@
173
188
  "scrub = IDScrub(\n",
174
189
  " [\n",
175
190
  " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
176
- " \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
191
+ " \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
177
192
  " ]\n",
178
193
  ")\n",
179
194
  "\n",
@@ -211,7 +226,9 @@
211
226
  " <th>text_id</th>\n",
212
227
  " <th>person</th>\n",
213
228
  " <th>location</th>\n",
229
+ " <th>org</th>\n",
214
230
  " <th>uk_phone_number</th>\n",
231
+ " <th>uk_address</th>\n",
215
232
  " <th>uk_postcode</th>\n",
216
233
  " </tr>\n",
217
234
  " </thead>\n",
@@ -223,13 +240,17 @@
223
240
  " <td>None</td>\n",
224
241
  " <td>None</td>\n",
225
242
  " <td>None</td>\n",
243
+ " <td>None</td>\n",
244
+ " <td>None</td>\n",
226
245
  " </tr>\n",
227
246
  " <tr>\n",
228
247
  " <th>1</th>\n",
229
248
  " <td>2</td>\n",
230
249
  " <td>None</td>\n",
231
250
  " <td>[Lapland]</td>\n",
251
+ " <td>[Department for Business and Trade]</td>\n",
232
252
  " <td>[+441111111111]</td>\n",
253
+ " <td>[15 Elf Road]</td>\n",
233
254
  " <td>[AA11 1AA]</td>\n",
234
255
  " </tr>\n",
235
256
  " </tbody>\n",
@@ -241,9 +262,13 @@
241
262
  "0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
242
263
  "1 2 None [Lapland] \n",
243
264
  "\n",
244
- " uk_phone_number uk_postcode \n",
245
- "0 None None \n",
246
- "1 [+441111111111] [AA11 1AA] "
265
+ " org uk_phone_number uk_address \\\n",
266
+ "0 None None None \n",
267
+ "1 [Department for Business and Trade] [+441111111111] [15 Elf Road] \n",
268
+ "\n",
269
+ " uk_postcode \n",
270
+ "0 None \n",
271
+ "1 [AA11 1AA] "
247
272
  ]
248
273
  },
249
274
  "execution_count": 4,
@@ -272,14 +297,15 @@
272
297
  "output_type": "stream",
273
298
  "text": [
274
299
  "INFO: Texts loaded.\n",
275
- "INFO: Scrubbing using Presidio...\n",
276
- "100%|██████████| 2/2 [00:00<00:00, 30.26it/s]\n",
300
+ "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
301
+ "100%|██████████| 2/2 [00:00<00:00, 42.58it/s]\n",
302
+ "INFO: 1 org scrubbed.\n",
277
303
  "INFO: 3 person scrubbed.\n",
278
304
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
279
305
  "INFO: 0 phone_number scrubbed.\n",
280
306
  "INFO: Scrubbing custom regex...\n",
281
307
  "INFO: 1 custom_regex_1 scrubbed.\n",
282
- "INFO: 1 custom_regex_2 scrubbed.\n",
308
+ "INFO: 0 custom_regex_2 scrubbed.\n",
283
309
  "INFO: Scrubbing email addresses using regex...\n",
284
310
  "INFO: 0 email_address scrubbed.\n",
285
311
  "INFO: Scrubbing @user handles using regex...\n",
@@ -288,6 +314,8 @@
288
314
  "INFO: 0 ip_address scrubbed.\n",
289
315
  "INFO: Scrubbing phone numbers using regex...\n",
290
316
  "INFO: 1 uk_phone_number scrubbed.\n",
317
+ "INFO: Scrubbing addresses using regex...\n",
318
+ "INFO: 1 uk_address scrubbed.\n",
291
319
  "INFO: Scrubbing postcodes using regex...\n",
292
320
  "INFO: 1 uk_postcode scrubbed.\n",
293
321
  "INFO: Scrubbing titles using regex...\n",
@@ -298,7 +326,7 @@
298
326
  "name": "stdout",
299
327
  "output_type": "stream",
300
328
  "text": [
301
- "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], University of [UNIVERSITY] where I am on secret mission [REDACTED].']\n"
329
+ "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [UNIVERSITY]']\n"
302
330
  ]
303
331
  }
304
332
  ],
@@ -308,15 +336,18 @@
308
336
  "scrub = IDScrub(\n",
309
337
  " [\n",
310
338
  " \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
311
- " \"My number is +441111111111 and I live at AA11 1AA, University of Lapland where I am on secret mission ACHILLES.\",\n",
339
+ " \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
312
340
  " ]\n",
313
341
  ")\n",
314
342
  "\n",
315
- "scrub.presidio()\n",
343
+ "scrub.spacy_entities()\n",
316
344
  "scrub.google_phone_numbers(region=\"GB\")\n",
345
+ "\n",
346
+ "# Remove specific regex pattern(s). This can also be passed to all().\n",
317
347
  "scrub.custom_regex(\n",
318
348
  " custom_regex_patterns=[r\"Lapland\", r\"ACHILLES\"], custom_replacement_texts=[\"[UNIVERSITY]\", \"[REDACTED]\"]\n",
319
- ") # Remove specific regex pattern(s). This can also be passed to all().\n",
349
+ ")\n",
350
+ "\n",
320
351
  "scrubbed_texts = scrub.all_regex()\n",
321
352
  "\n",
322
353
  "print(scrubbed_texts)"
@@ -350,9 +381,10 @@
350
381
  " <th></th>\n",
351
382
  " <th>text_id</th>\n",
352
383
  " <th>person</th>\n",
384
+ " <th>org</th>\n",
353
385
  " <th>custom_regex_1</th>\n",
354
- " <th>custom_regex_2</th>\n",
355
386
  " <th>uk_phone_number</th>\n",
387
+ " <th>uk_address</th>\n",
356
388
  " <th>uk_postcode</th>\n",
357
389
  " </tr>\n",
358
390
  " </thead>\n",
@@ -365,14 +397,16 @@
365
397
  " <td>None</td>\n",
366
398
  " <td>None</td>\n",
367
399
  " <td>None</td>\n",
400
+ " <td>None</td>\n",
368
401
  " </tr>\n",
369
402
  " <tr>\n",
370
403
  " <th>1</th>\n",
371
404
  " <td>2</td>\n",
372
405
  " <td>None</td>\n",
406
+ " <td>[Department for Business and Trade]</td>\n",
373
407
  " <td>[Lapland]</td>\n",
374
- " <td>[ACHILLES]</td>\n",
375
408
  " <td>[+441111111111]</td>\n",
409
+ " <td>[15 Elf Road]</td>\n",
376
410
  " <td>[AA11 1AA]</td>\n",
377
411
  " </tr>\n",
378
412
  " </tbody>\n",
@@ -380,13 +414,17 @@
380
414
  "</div>"
381
415
  ],
382
416
  "text/plain": [
383
- " text_id person custom_regex_1 \\\n",
384
- "0 1 [Hamish McDonald, L. Salah, Elena Suárez] None \n",
385
- "1 2 None [Lapland] \n",
417
+ " text_id person \\\n",
418
+ "0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
419
+ "1 2 None \n",
386
420
  "\n",
387
- " custom_regex_2 uk_phone_number uk_postcode \n",
388
- "0 None None None \n",
389
- "1 [ACHILLES] [+441111111111] [AA11 1AA] "
421
+ " org custom_regex_1 uk_phone_number \\\n",
422
+ "0 None None None \n",
423
+ "1 [Department for Business and Trade] [Lapland] [+441111111111] \n",
424
+ "\n",
425
+ " uk_address uk_postcode \n",
426
+ "0 None None \n",
427
+ "1 [15 Elf Road] [AA11 1AA] "
390
428
  ]
391
429
  },
392
430
  "execution_count": 6,
@@ -402,7 +440,7 @@
402
440
  "cell_type": "markdown",
403
441
  "metadata": {},
404
442
  "source": [
405
- "## `idscrub` example - using Presidio\n",
443
+ "### `idscrub` example - using Presidio\n",
406
444
  "We can also leverage the power of [Presidio](https://microsoft.github.io/presidio/) and use their entity recognition methods"
407
445
  ]
408
446
  },
@@ -416,10 +454,10 @@
416
454
  "output_type": "stream",
417
455
  "text": [
418
456
  "INFO: Texts loaded.\n",
419
- "INFO: Scrubbing using Presidio...\n",
420
- "100%|██████████| 2/2 [00:00<00:00, 28.29it/s]\n",
421
- "INFO: 3 person scrubbed.\n",
422
- "INFO: 1 iban_code scrubbed.\n"
457
+ "INFO: Scrubbing Presidio entities `PERSON, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, IBAN_CODE` using SpaCy model `en_core_web_trf`...\n",
458
+ "100%|██████████| 2/2 [00:00<00:00, 24.36it/s]\n",
459
+ "INFO: 1 iban_code scrubbed.\n",
460
+ "INFO: 3 person scrubbed.\n"
423
461
  ]
424
462
  },
425
463
  {
@@ -436,7 +474,7 @@
436
474
  "scrub = IDScrub(\n",
437
475
  " [\"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\", \"My IBAN code is GB91BKEN10000041610008\"]\n",
438
476
  ")\n",
439
- "scrubbed_texts = scrub.presidio()\n",
477
+ "scrubbed_texts = scrub.presidio_entities()\n",
440
478
  "\n",
441
479
  "print(scrubbed_texts)"
442
480
  ]
@@ -678,14 +716,11 @@
678
716
  "text": [
679
717
  " 0%| | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
680
718
  "INFO: Scrubbing column `Pride and Prejudice`...\n",
681
- "INFO: Scrubbing using Presidio...\n",
682
- "100%|██████████| 5/5 [00:00<00:00, 27.93it/s]\n",
683
- "INFO: 4 person scrubbed.\n",
684
- "INFO: 4 person scrubbed.\n",
685
- "INFO: 4 person scrubbed.\n",
686
- "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
687
- "100%|██████████| 5/5 [00:00<00:00, 62.29it/s]\n",
719
+ "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
720
+ "100%|██████████| 5/5 [00:00<00:00, 23.73it/s]\n",
688
721
  "INFO: 4 person scrubbed.\n",
722
+ "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
723
+ "100%|██████████| 5/5 [00:00<00:00, 77.84it/s]\n",
689
724
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
690
725
  "INFO: 0 phone_number scrubbed.\n",
691
726
  "INFO: Scrubbing email addresses using regex...\n",
@@ -696,19 +731,19 @@
696
731
  "INFO: 0 ip_address scrubbed.\n",
697
732
  "INFO: Scrubbing phone numbers using regex...\n",
698
733
  "INFO: 0 uk_phone_number scrubbed.\n",
734
+ "INFO: Scrubbing addresses using regex...\n",
735
+ "INFO: 0 uk_address scrubbed.\n",
699
736
  "INFO: Scrubbing postcodes using regex...\n",
700
737
  "INFO: 0 uk_postcode scrubbed.\n",
701
738
  "INFO: Scrubbing titles using regex...\n",
702
739
  "INFO: 2 title scrubbed.\n",
703
- " 33%|███▎ | 1/3 [00:02<00:05, 2.62s/it]INFO: Texts loaded.\n",
740
+ " 33%|███▎ | 1/3 [00:02<00:05, 2.60s/it]INFO: Texts loaded.\n",
704
741
  "INFO: Scrubbing column `The Adventures of Sherlock Holmes`...\n",
705
- "INFO: Scrubbing using Presidio...\n",
706
- "100%|██████████| 5/5 [00:00<00:00, 28.25it/s]\n",
707
- "INFO: 2 person scrubbed.\n",
708
- "INFO: 2 person scrubbed.\n",
709
- "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
710
- "100%|██████████| 5/5 [00:00<00:00, 82.44it/s]\n",
742
+ "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
743
+ "100%|██████████| 5/5 [00:00<00:00, 24.22it/s]\n",
711
744
  "INFO: 2 person scrubbed.\n",
745
+ "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
746
+ "100%|██████████| 5/5 [00:00<00:00, 84.78it/s]\n",
712
747
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
713
748
  "INFO: 0 phone_number scrubbed.\n",
714
749
  "INFO: Scrubbing email addresses using regex...\n",
@@ -719,21 +754,23 @@
719
754
  "INFO: 0 ip_address scrubbed.\n",
720
755
  "INFO: Scrubbing phone numbers using regex...\n",
721
756
  "INFO: 0 uk_phone_number scrubbed.\n",
757
+ "INFO: Scrubbing addresses using regex...\n",
758
+ "INFO: 0 uk_address scrubbed.\n",
722
759
  "INFO: Scrubbing postcodes using regex...\n",
723
760
  "INFO: 0 uk_postcode scrubbed.\n",
724
761
  "INFO: Scrubbing titles using regex...\n",
725
762
  "INFO: 0 title scrubbed.\n",
726
- " 67%|██████▋ | 2/3 [00:05<00:02, 2.50s/it]INFO: Texts loaded.\n",
763
+ " 67%|██████▋ | 2/3 [00:05<00:02, 2.49s/it]INFO: Texts loaded.\n",
727
764
  "INFO: Scrubbing column `Fake book`...\n",
728
- "INFO: Scrubbing using Presidio...\n",
729
- "100%|██████████| 5/5 [00:00<00:00, 13.15it/s]\n",
765
+ "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
766
+ "100%|██████████| 5/5 [00:00<00:00, 13.41it/s]\n",
730
767
  "INFO: 1 iban_code scrubbed.\n",
768
+ "INFO: 5 url scrubbed.\n",
731
769
  "INFO: 2 person scrubbed.\n",
732
770
  "INFO: 3 email_address scrubbed.\n",
733
- "INFO: 3 email_address scrubbed.\n",
734
- "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
735
- "100%|██████████| 5/5 [00:00<00:00, 54.15it/s]\n",
736
- "INFO: 2 person scrubbed.\n",
771
+ "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
772
+ "100%|██████████| 5/5 [00:00<00:00, 64.57it/s]\n",
773
+ "INFO: 1 org scrubbed.\n",
737
774
  "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
738
775
  "INFO: 0 phone_number scrubbed.\n",
739
776
  "INFO: Scrubbing email addresses using regex...\n",
@@ -744,11 +781,13 @@
744
781
  "INFO: 0 ip_address scrubbed.\n",
745
782
  "INFO: Scrubbing phone numbers using regex...\n",
746
783
  "INFO: 0 uk_phone_number scrubbed.\n",
784
+ "INFO: Scrubbing addresses using regex...\n",
785
+ "INFO: 0 uk_address scrubbed.\n",
747
786
  "INFO: Scrubbing postcodes using regex...\n",
748
787
  "INFO: 4 uk_postcode scrubbed.\n",
749
788
  "INFO: Scrubbing titles using regex...\n",
750
789
  "INFO: 0 title scrubbed.\n",
751
- "100%|██████████| 3/3 [00:07<00:00, 2.56s/it]\n"
790
+ "100%|██████████| 3/3 [00:07<00:00, 2.53s/it]\n"
752
791
  ]
753
792
  },
754
793
  {
@@ -810,7 +849,7 @@
810
849
  " <td>The business of her life was to get her daught...</td>\n",
811
850
  " <td>I am a brain, [PERSON]. The rest of me is a me...</td>\n",
812
851
  " <td>Nothing is more painful to the human mind than...</td>\n",
813
- " <td>A message arrived just as the Downing Street c...</td>\n",
852
+ " <td>A message arrived just as the [ORG] clock stru...</td>\n",
814
853
  " </tr>\n",
815
854
  " <tr>\n",
816
855
  " <th>4</th>\n",
@@ -850,7 +889,7 @@
850
889
  "0 The letter to [EMAIL_ADDRESS] was stamped with... \n",
851
890
  "1 She forwarded the memo from [PERSON] and [PERS... \n",
852
891
  "2 The dossier marked confidential came from [EMA... \n",
853
- "3 A message arrived just as the Downing Street c... \n",
892
+ "3 A message arrived just as the [ORG] clock stru... \n",
854
893
  "4 They did not expected a reply from [EMAIL_ADDR... "
855
894
  ]
856
895
  },
@@ -900,6 +939,7 @@
900
939
  " <th>email_address</th>\n",
901
940
  " <th>iban_code</th>\n",
902
941
  " <th>url</th>\n",
942
+ " <th>org</th>\n",
903
943
  " <th>uk_postcode</th>\n",
904
944
  " </tr>\n",
905
945
  " </thead>\n",
@@ -914,6 +954,7 @@
914
954
  " <td>None</td>\n",
915
955
  " <td>None</td>\n",
916
956
  " <td>None</td>\n",
957
+ " <td>None</td>\n",
917
958
  " </tr>\n",
918
959
  " <tr>\n",
919
960
  " <th>1</th>\n",
@@ -925,6 +966,7 @@
925
966
  " <td>None</td>\n",
926
967
  " <td>None</td>\n",
927
968
  " <td>None</td>\n",
969
+ " <td>None</td>\n",
928
970
  " </tr>\n",
929
971
  " <tr>\n",
930
972
  " <th>2</th>\n",
@@ -936,6 +978,7 @@
936
978
  " <td>None</td>\n",
937
979
  " <td>None</td>\n",
938
980
  " <td>None</td>\n",
981
+ " <td>None</td>\n",
939
982
  " </tr>\n",
940
983
  " <tr>\n",
941
984
  " <th>3</th>\n",
@@ -947,6 +990,7 @@
947
990
  " <td>None</td>\n",
948
991
  " <td>None</td>\n",
949
992
  " <td>None</td>\n",
993
+ " <td>None</td>\n",
950
994
  " </tr>\n",
951
995
  " <tr>\n",
952
996
  " <th>4</th>\n",
@@ -958,6 +1002,7 @@
958
1002
  " <td>None</td>\n",
959
1003
  " <td>None</td>\n",
960
1004
  " <td>None</td>\n",
1005
+ " <td>None</td>\n",
961
1006
  " </tr>\n",
962
1007
  " <tr>\n",
963
1008
  " <th>5</th>\n",
@@ -968,6 +1013,7 @@
968
1013
  " <td>[freddie.mercury@queen.com]</td>\n",
969
1014
  " <td>[GB91BKEN10000041610008]</td>\n",
970
1015
  " <td>[freddie.me, queen.com]</td>\n",
1016
+ " <td>None</td>\n",
971
1017
  " <td>[SW1A 2AA]</td>\n",
972
1018
  " </tr>\n",
973
1019
  " <tr>\n",
@@ -979,6 +1025,7 @@
979
1025
  " <td>None</td>\n",
980
1026
  " <td>None</td>\n",
981
1027
  " <td>None</td>\n",
1028
+ " <td>None</td>\n",
982
1029
  " <td>[SW1A 2WH]</td>\n",
983
1030
  " </tr>\n",
984
1031
  " <tr>\n",
@@ -990,6 +1037,7 @@
990
1037
  " <td>[serena.williams@tennis.com]</td>\n",
991
1038
  " <td>None</td>\n",
992
1039
  " <td>[tennis.com]</td>\n",
1040
+ " <td>None</td>\n",
993
1041
  " <td>[SW19 5AE]</td>\n",
994
1042
  " </tr>\n",
995
1043
  " <tr>\n",
@@ -1001,8 +1049,21 @@
1001
1049
  " <td>[otis.redding@dockofthebay.org]</td>\n",
1002
1050
  " <td>None</td>\n",
1003
1051
  " <td>[otis.red, dockofthebay.org]</td>\n",
1052
+ " <td>None</td>\n",
1004
1053
  " <td>[EH8 8DX]</td>\n",
1005
1054
  " </tr>\n",
1055
+ " <tr>\n",
1056
+ " <th>9</th>\n",
1057
+ " <td>D</td>\n",
1058
+ " <td>Fake book</td>\n",
1059
+ " <td>None</td>\n",
1060
+ " <td>None</td>\n",
1061
+ " <td>None</td>\n",
1062
+ " <td>None</td>\n",
1063
+ " <td>None</td>\n",
1064
+ " <td>[Downing Street]</td>\n",
1065
+ " <td>None</td>\n",
1066
+ " </tr>\n",
1006
1067
  " </tbody>\n",
1007
1068
  "</table>\n",
1008
1069
  "</div>"
@@ -1018,6 +1079,7 @@
1018
1079
  "6 B Fake book [Mick Jagger, David Bowie] None \n",
1019
1080
  "7 C Fake book None None \n",
1020
1081
  "8 E Fake book None None \n",
1082
+ "9 D Fake book None None \n",
1021
1083
  "\n",
1022
1084
  " email_address iban_code \\\n",
1023
1085
  "0 None None \n",
@@ -1029,17 +1091,19 @@
1029
1091
  "6 None None \n",
1030
1092
  "7 [serena.williams@tennis.com] None \n",
1031
1093
  "8 [otis.redding@dockofthebay.org] None \n",
1094
+ "9 None None \n",
1032
1095
  "\n",
1033
- " url uk_postcode \n",
1034
- "0 None None \n",
1035
- "1 None None \n",
1036
- "2 None None \n",
1037
- "3 None None \n",
1038
- "4 None None \n",
1039
- "5 [freddie.me, queen.com] [SW1A 2AA] \n",
1040
- "6 None [SW1A 2WH] \n",
1041
- "7 [tennis.com] [SW19 5AE] \n",
1042
- "8 [otis.red, dockofthebay.org] [EH8 8DX] "
1096
+ " url org uk_postcode \n",
1097
+ "0 None None None \n",
1098
+ "1 None None None \n",
1099
+ "2 None None None \n",
1100
+ "3 None None None \n",
1101
+ "4 None None None \n",
1102
+ "5 [freddie.me, queen.com] None [SW1A 2AA] \n",
1103
+ "6 None None [SW1A 2WH] \n",
1104
+ "7 [tennis.com] None [SW19 5AE] \n",
1105
+ "8 [otis.red, dockofthebay.org] None [EH8 8DX] \n",
1106
+ "9 None [Downing Street] None "
1043
1107
  ]
1044
1108
  },
1045
1109
  "execution_count": 11,
test/test_chain.py CHANGED
@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
6
6
  def test_chain(scrub_object):
7
7
  scrub_object.uk_phone_numbers()
8
8
  scrub_object.uk_postcodes()
9
- scrubbed = scrub_object.spacy_persons()
9
+ scrubbed = scrub_object.spacy_entities()
10
10
 
11
11
  assert scrubbed == [
12
12
  "Our names are [PERSON], [PERSON], and [PERSON].",
@@ -38,7 +38,7 @@ def test_chain_order(scrub_object):
38
38
  def test_get_scrubbed_data_chain(scrub_object):
39
39
  scrub_object.uk_phone_numbers()
40
40
  scrub_object.uk_postcodes()
41
- scrub_object.spacy_persons()
41
+ scrub_object.spacy_entities()
42
42
 
43
43
  df = scrub_object.get_scrubbed_data()
44
44
 
test/test_huggingface.py CHANGED
@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
6
6
 
7
7
  def test_huggingface():
8
8
  scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
9
- scrubbed = scrub.huggingface_persons()
9
+ scrubbed = scrub.huggingface_entities()
10
10
  assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
11
11
 
12
12
 
@@ -14,12 +14,12 @@ def test_huggingface_error():
14
14
  scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
15
15
 
16
16
  with pytest.raises(OSError):
17
- scrub.huggingface_persons(hf_model_path="not_a_path")
17
+ scrub.huggingface_entities(hf_model_path="not_a_path")
18
18
 
19
19
 
20
20
  def test_huggingface_empty():
21
21
  scrub = IDScrub([" ", "John Smith", ""])
22
- scrubbed = scrub.huggingface_persons()
22
+ scrubbed = scrub.huggingface_entities()
23
23
 
24
24
  assert scrubbed == [" ", "[PERSON]", ""]
25
25
  assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
test/test_label.py CHANGED
@@ -1,6 +1,6 @@
1
1
  def test_label(scrub_object_all):
2
2
  for i, scrub_method in enumerate(
3
- ["spacy_persons", "uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
3
+ ["uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
4
4
  ):
5
5
  method = getattr(scrub_object_all, scrub_method)
6
6
  method(label="test")
@@ -4,32 +4,32 @@ from pandas.testing import assert_frame_equal
4
4
 
5
5
 
6
6
  # Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
7
- def test_persidio():
7
+ def test_presidio():
8
8
  scrub = IDScrub(
9
9
  ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
10
10
  )
11
- scrubbed_texts = scrub.presidio(entities_to_scrub=["PERSON", "IBAN_CODE"])
11
+ scrubbed_texts = scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
12
12
 
13
13
  assert scrubbed_texts == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
14
14
 
15
15
 
16
- def test_persidio_map():
16
+ def test_presidio_map():
17
17
  scrub = IDScrub(
18
18
  ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
19
19
  )
20
- scrubbed_texts = scrub.presidio(
21
- entities_to_scrub=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
20
+ scrubbed_texts = scrub.presidio_entities(
21
+ entities=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
22
22
  )
23
23
 
24
24
  assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
25
25
 
26
26
 
27
- def test_persidio_get_data():
27
+ def test_presidio_get_data():
28
28
  scrub = IDScrub(
29
29
  ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
30
30
  )
31
31
 
32
- scrub.presidio(entities_to_scrub=["PERSON", "IBAN_CODE"])
32
+ scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
33
33
 
34
34
  df = scrub.get_scrubbed_data()
35
35
 
test/test_regex.py CHANGED
@@ -63,6 +63,52 @@ def test_handles():
63
63
  assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
64
64
 
65
65
 
66
+ def test_uk_addresses():
67
+ scrub = IDScrub(
68
+ [
69
+ "221B Baker Street",
70
+ "12 high road",
71
+ "Flat 3B, 47 King's Court",
72
+ "12–14 High Street",
73
+ "5a-7a Church Lane",
74
+ "1/2 Main Street",
75
+ "10 St John’s Rd",
76
+ "33 Queen-Anne Walk",
77
+ "8 Deansgate Ct",
78
+ ]
79
+ )
80
+
81
+ scrubbed = scrub.uk_addresses()
82
+ assert scrubbed == [
83
+ "[ADDRESS]",
84
+ "[ADDRESS]",
85
+ "[ADDRESS]",
86
+ "[ADDRESS]",
87
+ "[ADDRESS]",
88
+ "[ADDRESS]",
89
+ "[ADDRESS]",
90
+ "[ADDRESS]",
91
+ "[ADDRESS]",
92
+ ]
93
+
94
+ negative_tests = [
95
+ "12 High",
96
+ "Baker Street",
97
+ "High Road 12",
98
+ "Go to the high road now",
99
+ "500 the big building near river",
100
+ "I walked the long road home",
101
+ "12b misspelledstreet",
102
+ "London SW1A 1AA",
103
+ "12,,, High?",
104
+ ]
105
+
106
+ scrub = IDScrub(negative_tests)
107
+
108
+ scrubbed = scrub.uk_addresses()
109
+ assert scrubbed == negative_tests
110
+
111
+
66
112
  def test_claimants():
67
113
  scrub = IDScrub(
68
114
  texts=[
test/test_scrub.py CHANGED
@@ -5,7 +5,7 @@ from pandas.testing import assert_frame_equal
5
5
 
6
6
  # Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
7
7
  def test_scrub(scrub_object):
8
- scrubbed = scrub_object.scrub(scrub_methods=["spacy_persons", "uk_phone_numbers", "uk_postcodes"])
8
+ scrubbed = scrub_object.scrub(scrub_methods=["spacy_entities", "uk_phone_numbers", "uk_postcodes"])
9
9
  assert scrubbed == [
10
10
  "Our names are [PERSON], [PERSON], and [PERSON].",
11
11
  "My number is [PHONENO] and I live at [POSTCODE].",
@@ -15,7 +15,7 @@ def test_scrub(scrub_object):
15
15
  def test_scrub_text_id():
16
16
  scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
17
17
 
18
- scrub.scrub(scrub_methods=["spacy_persons"])
18
+ scrub.scrub(scrub_methods=["spacy_entities"])
19
19
 
20
20
  df = scrub.get_scrubbed_data()
21
21
 
@@ -38,7 +38,7 @@ def test_scrub_get_scrubbed_data(scrub_object):
38
38
 
39
39
 
40
40
  def test_scrub_order(scrub_object):
41
- scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_persons"])
41
+ scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_entities"])
42
42
 
43
43
  assert scrub_object.get_scrubbed_data().columns.to_list() == [
44
44
  "text_id",
test/test_spacy.py CHANGED
@@ -7,7 +7,7 @@ from pandas.testing import assert_frame_equal
7
7
  # Note: This test will fail if the kernel has not been restarted since the SpaCy model was downloaded.
8
8
  def test_spacy():
9
9
  scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
10
- scrubbed = scrub.spacy_persons(model_name="en_core_web_trf")
10
+ scrubbed = scrub.spacy_entities(entities=["PERSON"], model_name="en_core_web_trf")
11
11
  assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
12
12
 
13
13
 
@@ -15,12 +15,39 @@ def test_spacy_error():
15
15
  scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
16
16
 
17
17
  with pytest.raises(ValueError):
18
- scrub.spacy_persons(model_name="not_a_model")
18
+ scrub.spacy_entities(model_name="not_a_model")
19
19
 
20
20
 
21
21
  def test_spacy_empty():
22
22
  scrub = IDScrub([" ", "John Smith", ""])
23
- scrubbed = scrub.spacy_persons()
23
+ scrubbed = scrub.spacy_entities()
24
24
 
25
25
  assert scrubbed == [" ", "[PERSON]", ""]
26
26
  assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
27
+
28
+
29
+ def test_spacy_map():
30
+ scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
31
+ scrubbed_texts = scrub.spacy_entities(
32
+ entities=["PERSON", "ORG"], replacement_map={"PERSON": "[PHELLO]", "ORG": "[SPACE]"}
33
+ )
34
+
35
+ assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My company code is [SPACE]."]
36
+
37
+
38
+ def test_spacy_get_data():
39
+ scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
40
+
41
+ scrub.spacy_entities(entities=["PERSON", "ORG"])
42
+
43
+ df = scrub.get_scrubbed_data()
44
+
45
+ expected_df = pd.DataFrame(
46
+ {
47
+ "text_id": {0: 1, 1: 2},
48
+ "person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
49
+ "org": {0: None, 1: ["NASA"]},
50
+ }
51
+ )
52
+
53
+ assert_frame_equal(df, expected_df)
@@ -1,22 +0,0 @@
1
- idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
2
- idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
3
- idscrub/scrub.py,sha256=VqVqcChbbxMEKJR6Aci971dqG-RmD48otrp9sG2dX0o,34443
4
- idscrub-1.0.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
5
- notebooks/basic_usage.ipynb,sha256=XTBxdtu2F0S99V2lntUEeFj6SN4GRVm4qKvqOhs7nec,38777
6
- test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
7
- test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
8
- test/test_chain.py,sha256=tGxcG5zRMcX22RfcrimqX6Le2iFPH9NqfZy7Idhelps,1808
9
- test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
10
- test/test_huggingface.py,sha256=OGwWSz_tzcynuRFXOdV4H4ProKnekYMdtZJviXEejiA,836
11
- test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
12
- test/test_label.py,sha256=aTGmtAWSLHrgoVBbCFUCqj52LmlCEKN6owycOyfVNpQ,669
13
- test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
14
- test/test_persidio.py,sha256=rkqiUr-vYnfCf7Xt0gNo2VQK2gi5JKP7ThSlT803swc,1558
15
- test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
16
- test/test_regex.py,sha256=zuq8g_8F_P5oCA2ChU5wUIFEWjT9LSYB0S_U1rBpTn4,4388
17
- test/test_scrub.py,sha256=MWpan5cWIGeNPJCvTwtYe-iZeoIjS_fZMIg46ZVrkJo,1377
18
- test/test_spacy.py,sha256=KHalx16GYHmCaQUU1O5bLMP95SLTu1007fJK1oq__v4,932
19
- idscrub-1.0.0.dist-info/METADATA,sha256=fo7FUBAHDei63EWPRUrfNS05p3bnZWSY2GPVrho0vjo,5403
20
- idscrub-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
- idscrub-1.0.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
22
- idscrub-1.0.0.dist-info/RECORD,,