PyPI - idscrub - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

idscrub 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

idscrub/scrub.py +85 -39
{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/METADATA +20 -2
idscrub-1.1.0.dist-info/RECORD +22 -0
notebooks/basic_usage.ipynb +138 -74
test/test_chain.py +2 -2
test/test_huggingface.py +3 -3
test/test_label.py +1 -1
test/{test_persidio.py → test_presidio.py} +7 -7
test/test_regex.py +46 -0
test/test_scrub.py +3 -3
test/test_spacy.py +30 -3
idscrub-1.0.0.dist-info/RECORD +0 -22
{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/WHEEL +0 -0
{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/licenses/LICENSE +0 -0
{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/top_level.txt +0 -0

idscrub/scrub.py CHANGED Viewed

@@ -453,6 +453,24 @@ class IDScrub:
         return self.scrub_regex(pattern, replacement_text, label=label)
+    def uk_addresses(self, replacement_text: str = "[ADDRESS]", label: str = "uk_address") -> list[str]:
+        """
+        Removes addresses.
+        e.g. `10 Downing Street` scrubbed
+        Args:
+            replacement_text (str): The replacement text for the removed text.
+            label (str): Label for the personal data removed.
+        Returns:
+            list[str]: The input list of text with addresses replaced.
+        """
+        self.logger.info("Scrubbing addresses using regex...")
+        pattern = r"(?i)\b(?:flat\s+\w+,\s*)?\d+[a-z]?(?:[-–/]\d+[a-z]?)?\s+[a-z][a-z'’\- ]+\s+(street|st|road|rd|avenue|ave|lane|ln|close|cl|drive|dr|way|walk|gardens|gdns|place|pl|mews|court|ct|crescent|cres|terrace|ter)\b"
+        return self.scrub_regex(pattern, replacement_text, label)
     def claimants(self, replacement_text="[CLAIMANT]", label: str = "claimant") -> list[str]:
         """
         Removes claimant names from employment tribunal texts.
@@ -528,64 +546,86 @@ class IDScrub:
         return model
-    def spacy_persons(
+    def spacy_entities(
         self,
         model_name: str = "en_core_web_trf",
+        entities: list[str] = ["PERSON", "ORG", "NORP"],
+        replacement_map: str = {"PERSON": "[PERSON]", "ORG": "[ORG]", "NORP": "[NORP]"},
+        label_prefix: str = None,
         n_process: int = 1,
         batch_size: int = 1000,
-        replacement_text: str = "[PERSON]",
-        label: str = "person",
     ) -> list[str]:
         """
-        Remove PERSON entities using a Spacy model.
+        Remove SpaCy entities using a given SpaCy model.
+        Documentation for entity labels: https://spacy.io/models/en#en_core_web_trf
         Note: only "en_core_web_trf" has been evaluated.
         Args:
             model_name (str): Name of Spacy model. Only `en_core_web_trf` has been evaluated.
+            entities (list[str]): Which SpaCy entities to scrub (based on SpaCy entity keys).
+            replacement_map (dict): Mapping of SpaCy entity label to replacement text (e.g. {"PERSON": "[PERSON]"}).
+            label_prefix (str): Prefix for the Spacy entity removed, e.g. `{label}_person`.
             n_process (int): Number of parallel processes.
             batch_size (int): The number of texts in each batch.
-            replacement_text (str): The replacement text for the removed text.
-            label (str): Label for the personal data removed.
         Returns:
             list[str]: The input list of text with the selected entities scrubbed.
         """
-        self.logger.info(f"Scrubbing names using SpaCy model `{model_name}`...")
-        texts = self.get_texts()
+        self.logger.info(
+            f"Scrubbing SpaCy entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
+        )
-        if self.replacement_text:
-            replacement_text = self.replacement_text
+        texts = self.get_texts()
         cleaned_texts = []
+        labels = []
         nlp = self.get_spacy_model(model_name)
         stripped_texts = [s.strip() if s.isspace() else s for s in texts]
         documents = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
         for i, (ids, doc, stripped_text) in tqdm(
-            enumerate((zip(self.text_ids, documents, stripped_texts))), total=len(texts)
+            enumerate(zip(self.text_ids, documents, stripped_texts)), total=len(texts)
         ):
-            if stripped_text == "":
+            if not stripped_text:
                 cleaned_texts.append(texts[i])
                 continue
-            # Collect person entities
-            person_entities = [
-                ent for ent in doc.ents if ent.label_ == "PERSON" and ent.text not in {"PERSON", "HANDLE"}
-            ]
-            self.scrubbed_data.extend({self.text_id_name: ids, label: ent.text} for ent in person_entities)
+            all_found_entities = []
+            for entity_type in entities:
+                found = [
+                    ent for ent in doc.ents if ent.label_ == entity_type and ent.text not in {entity_type, "HANDLE"}
+                ]
+                for ent in found:
+                    label = ent.label_.lower()
+                    if label_prefix:
+                        label = f"{label_prefix}_{label}"
+                    labels.append(label)
+                    self.scrubbed_data.append({self.text_id_name: ids, label: ent.text})
+                if self.replacement_text:
+                    all_found_entities.extend((ent.start_char, ent.end_char, self.replacement_text) for ent in found)
+                elif replacement_map:
+                    all_found_entities.extend(
+                        (ent.start_char, ent.end_char, replacement_map.get(entity_type)) for ent in found
+                    )
+                else:
+                    all_found_entities.extend((ent.start_char, ent.end_char, f"[{entity_type}]") for ent in found)
-            # Remove person entities
             cleaned = stripped_text
-            for ent in sorted(person_entities, key=lambda x: [x.start_char], reverse=True):
-                cleaned = cleaned[: ent.start_char] + replacement_text + cleaned[ent.end_char :]
+            for start, end, repl in sorted(all_found_entities, key=lambda x: x[0], reverse=True):
+                cleaned = cleaned[:start] + repl + cleaned[end:]
             cleaned_texts.append(cleaned)
         self.cleaned_texts = cleaned_texts
-        self.log_message(label)
+        for label in set(labels):
+            self.log_message(label)
         return cleaned_texts
@@ -600,7 +640,7 @@ class IDScrub:
         Note: No Hugging Face models have been evaluated for performance.
         Args:
-            hf_model_path (str): Path to the Hugging Face model on the DBT mirror.
+            hf_model_path (str): Path to the Hugging Face model.
             Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been evaluated.
             download_directory (str): Directory in which to save the model.
             Default is current working directory.
@@ -624,20 +664,21 @@ class IDScrub:
         return tokenizer
-    def huggingface_persons(
+    def huggingface_entities(
         self,
         hf_model_path: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
         download_directory: str = f"{DOWNLOAD_DIR}/huggingface/",
+        entity="PER",
         replacement_text: str = "[PERSON]",
         label: str = "person",
         batch_size: int = 8,
     ) -> list[str]:
         """
-        Remove PERSON entities using a Hugging Face model.
+        Remove entities using a Hugging Face model. Default is a PERSON entity identifier.
         Note: No Hugging Face models have been evaluated for performance.
         Args:
-            hf_model_path (str): Path to the Hugging Face model on the DBT mirror.
+            hf_model_path (str): Path to the Hugging Face model.
             Only `dbmdz/bert-large-cased-finetuned-conll03-english` has been tested.
             download_directory (str): Directory in which to save the model.
             Default is current working directory.
@@ -679,7 +720,7 @@ class IDScrub:
                 continue
             person_entities = [
-                ent for ent in entities if ent["entity_group"] == "PER" and ent["word"] not in {"HANDLE", "PERSON"}
+                ent for ent in entities if ent["entity_group"] == entity and ent["word"] not in {"HANDLE", entity}
             ]
             self.scrubbed_data.extend({self.text_id_name: ids, label: ent["word"]} for ent in person_entities)
@@ -695,10 +736,10 @@ class IDScrub:
         return cleaned_texts
-    def presidio(
+    def presidio_entities(
         self,
         model_name: str = "en_core_web_trf",
-        entities_to_scrub: list[str] = [
+        entities: list[str] = [
             "PERSON",
             "UK_NINO",
             "UK_NHS",
@@ -718,15 +759,18 @@ class IDScrub:
         Args:
             model_name (str): spaCy model to use
-            entities_to_scrub (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
+            entities (list[str]): Entity types to scrub (e.g. ["PERSON", "IP_ADDRESS"])
             replacement_map (dict): Mapping of entity_type to replacement string (e.g. {'PERSON': '[PERSON]'})
             label_prefix (str): Prefix for the Presidio personal data type removed, e.g. `{label}_person`.
+            Useful if you wish to identify data as having been scrubbed by Presidio.
         Returns:
             list[str]: The input list of text with entities replaced.
         """
-        self.logger.info("Scrubbing using Presidio...")
+        self.logger.info(
+            f"Scrubbing Presidio entities `{', '.join(str(entitity) for entitity in entities)}` using SpaCy model `{model_name}`..."
+        )
         texts = self.get_texts()
@@ -744,7 +788,7 @@ class IDScrub:
         anonymizer = AnonymizerEngine()
         cleaned_texts = []
-        unique_labels = []
+        all_labels = []
         stripped_texts = [s.strip() if s.isspace() else s for s in texts]
@@ -754,14 +798,15 @@ class IDScrub:
                 continue
             results = analyzer.analyze(text=stripped_text, language="en")
-            results = [r for r in results if r.entity_type in entities_to_scrub]
+            results = [r for r in results if r.entity_type in entities]
             if label_prefix:
                 labels = [f"{label_prefix}_{res.entity_type.lower()}" for res in results]
             else:
                 labels = [f"{res.entity_type.lower()}" for res in results]
-            unique_labels.append(list(set(labels)))
+            for label in labels:
+                all_labels.append(label)
             self.scrubbed_data.extend(
                 {self.text_id_name: ids, label: stripped_text[res.start : res.end]}
@@ -788,9 +833,8 @@ class IDScrub:
         self.cleaned_texts = cleaned_texts
-        for label in unique_labels:
-            if label:
-                self.log_message(label[0])
+        for label in set(all_labels):
+            self.log_message(label)
         return cleaned_texts
@@ -810,6 +854,7 @@ class IDScrub:
         self.handles()
         self.ip_addresses()
         self.uk_phone_numbers()
+        self.uk_addresses()
         self.uk_postcodes()
         self.titles()
@@ -820,7 +865,8 @@ class IDScrub:
         custom_regex_patterns: list = None,
         custom_replacement_texts: list[str] = None,
         model_name: str = "en_core_web_trf",
-        presidio_entities_to_scrub: list[str] = [
+        spacy_entities: list[str] = ["PERSON", "ORG", "NORP"],
+        presidio_entities: list[str] = [
             "PERSON",
             "EMAIL_ADDRESS",
             "UK_NINO",
@@ -857,8 +903,8 @@ class IDScrub:
                 custom_replacement_texts=custom_replacement_texts,
             )
-        self.presidio(model_name=model_name, entities_to_scrub=presidio_entities_to_scrub)
-        self.spacy_persons(model_name=model_name, n_process=n_process, batch_size=batch_size)
+        self.presidio_entities(model_name=model_name, entities=presidio_entities)
+        self.spacy_entities(model_name=model_name, entities=spacy_entities, n_process=n_process, batch_size=batch_size)
         self.google_phone_numbers()
         self.all_regex()

{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: idscrub
-Version: 1.0.0
+Version: 1.1.0
 Author: Department for Business and Trade
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -45,12 +45,30 @@ Basic usage example (see [basic_usage.ipynb](https://github.com/uktrade/idscrub/
 from idscrub import IDScrub
 scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
-scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
+scrubbed_texts = scrub.scrub(scrub_methods=['spacy_entities', 'uk_phone_numbers', 'uk_postcodes'])
 print(scrubbed_texts)
 # Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
 ```
+## Personal data types supported
+Personal data can be scrubbed either by calling methods with arguments for extra customisation, e.g. `IDScrub.google_phone_numbers(region="GB")`, or by passing string arguments that use the default configurations (see above). The method name and its string representation are the same.
+| Argument                | Scrubs                                                                 |
+|-------------------------|------------------------------------------------------------------------|
+| `all`                  | All supported personal data types (see `IDScrub.all()` for further customisation) |
+| `spacy_entities`        | Entities detected by spaCy's `en_core_web_trf` or other user-selected spaCy models (e.g. persons (names), organisations) |
+| `presidio_entities`     | Entities supported by [Microsoft Presidio](https://microsoft.github.io/presidio/) (e.g. persons (names), URLs, NHS numbers, IBAN codes) |
+| `huggingface_entities`  | Entities detected by user-selected HuggingFace models |
+| `email_addresses`      | Email addresses (e.g. john@email.com)   |
+| `titles`               | Titles (e.g. Mr., Mrs., Dr.)    |
+| `handles`              | Social media handles (e.g. @username)  |
+| `ip_addresses`         | IP addresses (e.g. 8.8.8.8)  |
+| `uk_postcodes`         | UK postal codes (e.g. SW1A 2AA) |
+| `uk_addresses`         | UK addresses (e.g. 10 Downing Street)  |
+| `uk_phone_numbers`     | UK phone numbers (e.g. +441111111111) |
+| `google_phone_numbers` | Phone numbers detected by Google's [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers) |
 ## Considerations before use

idscrub-1.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,22 @@
+idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
+idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
+idscrub/scrub.py,sha256=PPTKWW-RQxZ5NixRow8nrnX9KjfyZa3tPAP9Jgwnn_M,36631
+idscrub-1.1.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
+notebooks/basic_usage.ipynb,sha256=V62Bz88a9Zo3LO_VxXF4sLw8-MP51ZdVRRNS-zjtNqw,42664
+test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
+test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
+test/test_chain.py,sha256=YbJeA11EBjDNcq5ZZjG4lIIyngrRQZknNsX3Oo0jPMc,1810
+test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
+test/test_huggingface.py,sha256=RTkp8Xsy4w9WoXq2IQ2YOJof41snbOQkM7CVtiVVD0U,839
+test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
+test/test_label.py,sha256=aNkIxJ-_YkBnW8QrBfRxjSsRZWeh5hn_iM7Rk1wrfPU,652
+test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
+test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
+test/test_presidio.py,sha256=BOGghcTWLSQPBhQxO014rO3RG-IL5XEbAaKuGN677pU,1558
+test/test_regex.py,sha256=foc2N4UCi7mGL0EIfp1t-ivgujkXMrmbsnsU77sbWZ0,5424
+test/test_scrub.py,sha256=tMYrIhbyXXKqt24tS1U_kAJT_vZfhOD4DAsf5ZFbEvU,1380
+test/test_spacy.py,sha256=gxJrNpV5B3HydUfoMsbmzRUoiKNs3_zwdSXqbPeW0qA,1846
+idscrub-1.1.0.dist-info/METADATA,sha256=HyPSfPJuFUPOib2fNr3eUtQIcvgJHr3uVNZaZQcXmS8,7003
+idscrub-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+idscrub-1.1.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
+idscrub-1.1.0.dist-info/RECORD,,

notebooks/basic_usage.ipynb CHANGED Viewed

@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### `idscrub` basic usage example"
+    "### `idscrub` basic usage examples"
    ]
   },
   {
@@ -17,11 +17,14 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 42.63it/s]\n",
+      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 2/2 [00:00<00:00, 33.83it/s]\n",
+      "INFO: 1 org scrubbed.\n",
       "INFO: 3 person scrubbed.\n",
       "INFO: Scrubbing phone numbers using regex...\n",
       "INFO: 1 uk_phone_number scrubbed.\n",
+      "INFO: Scrubbing addresses using regex...\n",
+      "INFO: 1 uk_address scrubbed.\n",
       "INFO: Scrubbing postcodes using regex...\n",
       "INFO: 1 uk_postcode scrubbed.\n"
      ]
@@ -30,7 +33,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], Lapland.']\n"
+      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
      ]
     }
    ],
@@ -40,11 +43,11 @@
     "scrub = IDScrub(\n",
     "    [\n",
     "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
-    "        \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
+    "        \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
     "    ]\n",
     ")\n",
     "\n",
-    "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_persons\", \"uk_phone_numbers\", \"uk_postcodes\"])\n",
+    "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_entities\", \"uk_phone_numbers\", \"uk_addresses\", \"uk_postcodes\"])\n",
     "\n",
     "print(scrubbed_texts)"
    ]
@@ -77,7 +80,9 @@
        "      <th></th>\n",
        "      <th>text_id</th>\n",
        "      <th>person</th>\n",
+       "      <th>org</th>\n",
        "      <th>uk_phone_number</th>\n",
+       "      <th>uk_address</th>\n",
        "      <th>uk_postcode</th>\n",
        "    </tr>\n",
        "  </thead>\n",
@@ -88,12 +93,16 @@
        "      <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>2</td>\n",
        "      <td>None</td>\n",
+       "      <td>[the Department for Business and Trade]</td>\n",
        "      <td>[+441111111111]</td>\n",
+       "      <td>[15 Elf Road]</td>\n",
        "      <td>[AA11 1AA]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -101,9 +110,13 @@
        "</div>"
       ],
       "text/plain": [
-       "   text_id                                     person  uk_phone_number  \\\n",
-       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]             None   \n",
-       "1        2                                       None  [+441111111111]   \n",
+       "   text_id                                     person  \\\n",
+       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]   \n",
+       "1        2                                       None   \n",
+       "\n",
+       "                                       org  uk_phone_number     uk_address  \\\n",
+       "0                                     None             None           None   \n",
+       "1  [the Department for Business and Trade]  [+441111111111]  [15 Elf Road]   \n",
        "\n",
        "  uk_postcode  \n",
        "0        None  \n",
@@ -136,13 +149,13 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 14.67it/s]\n",
+      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 2/2 [00:00<00:00,  9.14it/s]\n",
       "INFO: 3 person scrubbed.\n",
       "INFO: 1 location scrubbed.\n",
-      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 48.96it/s]\n",
-      "INFO: 3 person scrubbed.\n",
+      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 2/2 [00:00<00:00, 42.62it/s]\n",
+      "INFO: 1 org scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 phone_number scrubbed.\n",
       "INFO: Scrubbing email addresses using regex...\n",
@@ -153,6 +166,8 @@
       "INFO: 0 ip_address scrubbed.\n",
       "INFO: Scrubbing phone numbers using regex...\n",
       "INFO: 1 uk_phone_number scrubbed.\n",
+      "INFO: Scrubbing addresses using regex...\n",
+      "INFO: 1 uk_address scrubbed.\n",
       "INFO: Scrubbing postcodes using regex...\n",
       "INFO: 1 uk_postcode scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
@@ -163,7 +178,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']\n"
+      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
      ]
     }
    ],
@@ -173,7 +188,7 @@
     "scrub = IDScrub(\n",
     "    [\n",
     "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
-    "        \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
+    "        \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
     "    ]\n",
     ")\n",
     "\n",
@@ -211,7 +226,9 @@
        "      <th>text_id</th>\n",
        "      <th>person</th>\n",
        "      <th>location</th>\n",
+       "      <th>org</th>\n",
        "      <th>uk_phone_number</th>\n",
+       "      <th>uk_address</th>\n",
        "      <th>uk_postcode</th>\n",
        "    </tr>\n",
        "  </thead>\n",
@@ -223,13 +240,17 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>2</td>\n",
        "      <td>None</td>\n",
        "      <td>[Lapland]</td>\n",
+       "      <td>[Department for Business and Trade]</td>\n",
        "      <td>[+441111111111]</td>\n",
+       "      <td>[15 Elf Road]</td>\n",
        "      <td>[AA11 1AA]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -241,9 +262,13 @@
        "0        1  [Hamish McDonald, L. Salah, Elena Suárez]       None   \n",
        "1        2                                       None  [Lapland]   \n",
        "\n",
-       "   uk_phone_number uk_postcode  \n",
-       "0             None        None  \n",
-       "1  [+441111111111]  [AA11 1AA]  "
+       "                                   org  uk_phone_number     uk_address  \\\n",
+       "0                                 None             None           None   \n",
+       "1  [Department for Business and Trade]  [+441111111111]  [15 Elf Road]   \n",
+       "\n",
+       "  uk_postcode  \n",
+       "0        None  \n",
+       "1  [AA11 1AA]  "
       ]
      },
      "execution_count": 4,
@@ -272,14 +297,15 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 30.26it/s]\n",
+      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 2/2 [00:00<00:00, 42.58it/s]\n",
+      "INFO: 1 org scrubbed.\n",
       "INFO: 3 person scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 phone_number scrubbed.\n",
       "INFO: Scrubbing custom regex...\n",
       "INFO: 1 custom_regex_1 scrubbed.\n",
-      "INFO: 1 custom_regex_2 scrubbed.\n",
+      "INFO: 0 custom_regex_2 scrubbed.\n",
       "INFO: Scrubbing email addresses using regex...\n",
       "INFO: 0 email_address scrubbed.\n",
       "INFO: Scrubbing @user handles using regex...\n",
@@ -288,6 +314,8 @@
       "INFO: 0 ip_address scrubbed.\n",
       "INFO: Scrubbing phone numbers using regex...\n",
       "INFO: 1 uk_phone_number scrubbed.\n",
+      "INFO: Scrubbing addresses using regex...\n",
+      "INFO: 1 uk_address scrubbed.\n",
       "INFO: Scrubbing postcodes using regex...\n",
       "INFO: 1 uk_postcode scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
@@ -298,7 +326,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], University of [UNIVERSITY] where I am on secret mission [REDACTED].']\n"
+      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [UNIVERSITY]']\n"
      ]
     }
    ],
@@ -308,15 +336,18 @@
     "scrub = IDScrub(\n",
     "    [\n",
     "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
-    "        \"My number is +441111111111 and I live at AA11 1AA, University of Lapland where I am on secret mission ACHILLES.\",\n",
+    "        \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
     "    ]\n",
     ")\n",
     "\n",
-    "scrub.presidio()\n",
+    "scrub.spacy_entities()\n",
     "scrub.google_phone_numbers(region=\"GB\")\n",
+    "\n",
+    "# Remove specific regex pattern(s). This can also be passed to all().\n",
     "scrub.custom_regex(\n",
     "    custom_regex_patterns=[r\"Lapland\", r\"ACHILLES\"], custom_replacement_texts=[\"[UNIVERSITY]\", \"[REDACTED]\"]\n",
-    ")  # Remove specific regex pattern(s). This can also be passed to all().\n",
+    ")\n",
+    "\n",
     "scrubbed_texts = scrub.all_regex()\n",
     "\n",
     "print(scrubbed_texts)"
@@ -350,9 +381,10 @@
        "      <th></th>\n",
        "      <th>text_id</th>\n",
        "      <th>person</th>\n",
+       "      <th>org</th>\n",
        "      <th>custom_regex_1</th>\n",
-       "      <th>custom_regex_2</th>\n",
        "      <th>uk_phone_number</th>\n",
+       "      <th>uk_address</th>\n",
        "      <th>uk_postcode</th>\n",
        "    </tr>\n",
        "  </thead>\n",
@@ -365,14 +397,16 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>2</td>\n",
        "      <td>None</td>\n",
+       "      <td>[Department for Business and Trade]</td>\n",
        "      <td>[Lapland]</td>\n",
-       "      <td>[ACHILLES]</td>\n",
        "      <td>[+441111111111]</td>\n",
+       "      <td>[15 Elf Road]</td>\n",
        "      <td>[AA11 1AA]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -380,13 +414,17 @@
        "</div>"
       ],
       "text/plain": [
-       "   text_id                                     person custom_regex_1  \\\n",
-       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]           None   \n",
-       "1        2                                       None      [Lapland]   \n",
+       "   text_id                                     person  \\\n",
+       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]   \n",
+       "1        2                                       None   \n",
        "\n",
-       "  custom_regex_2  uk_phone_number uk_postcode  \n",
-       "0           None             None        None  \n",
-       "1     [ACHILLES]  [+441111111111]  [AA11 1AA]  "
+       "                                   org custom_regex_1  uk_phone_number  \\\n",
+       "0                                 None           None             None   \n",
+       "1  [Department for Business and Trade]      [Lapland]  [+441111111111]   \n",
+       "\n",
+       "      uk_address uk_postcode  \n",
+       "0           None        None  \n",
+       "1  [15 Elf Road]  [AA11 1AA]  "
       ]
      },
      "execution_count": 6,
@@ -402,7 +440,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## `idscrub` example - using Presidio\n",
+    "### `idscrub` example - using Presidio\n",
     "We can also leverage the power of [Presidio](https://microsoft.github.io/presidio/) and use their entity recognition methods"
    ]
   },
@@ -416,10 +454,10 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 28.29it/s]\n",
-      "INFO: 3 person scrubbed.\n",
-      "INFO: 1 iban_code scrubbed.\n"
+      "INFO: Scrubbing Presidio entities `PERSON, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, IBAN_CODE` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 2/2 [00:00<00:00, 24.36it/s]\n",
+      "INFO: 1 iban_code scrubbed.\n",
+      "INFO: 3 person scrubbed.\n"
      ]
     },
     {
@@ -436,7 +474,7 @@
     "scrub = IDScrub(\n",
     "    [\"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\", \"My IBAN code is GB91BKEN10000041610008\"]\n",
     ")\n",
-    "scrubbed_texts = scrub.presidio()\n",
+    "scrubbed_texts = scrub.presidio_entities()\n",
     "\n",
     "print(scrubbed_texts)"
    ]
@@ -678,14 +716,11 @@
      "text": [
       "  0%|          | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
       "INFO: Scrubbing column `Pride and Prejudice`...\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 27.93it/s]\n",
-      "INFO: 4 person scrubbed.\n",
-      "INFO: 4 person scrubbed.\n",
-      "INFO: 4 person scrubbed.\n",
-      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 62.29it/s]\n",
+      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 5/5 [00:00<00:00, 23.73it/s]\n",
       "INFO: 4 person scrubbed.\n",
+      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 5/5 [00:00<00:00, 77.84it/s]\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 phone_number scrubbed.\n",
       "INFO: Scrubbing email addresses using regex...\n",
@@ -696,19 +731,19 @@
       "INFO: 0 ip_address scrubbed.\n",
       "INFO: Scrubbing phone numbers using regex...\n",
       "INFO: 0 uk_phone_number scrubbed.\n",
+      "INFO: Scrubbing addresses using regex...\n",
+      "INFO: 0 uk_address scrubbed.\n",
       "INFO: Scrubbing postcodes using regex...\n",
       "INFO: 0 uk_postcode scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
       "INFO: 2 title scrubbed.\n",
-      " 33%|███▎      | 1/3 [00:02<00:05,  2.62s/it]INFO: Texts loaded.\n",
+      " 33%|███▎      | 1/3 [00:02<00:05,  2.60s/it]INFO: Texts loaded.\n",
       "INFO: Scrubbing column `The Adventures of Sherlock Holmes`...\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 28.25it/s]\n",
-      "INFO: 2 person scrubbed.\n",
-      "INFO: 2 person scrubbed.\n",
-      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 82.44it/s]\n",
+      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 5/5 [00:00<00:00, 24.22it/s]\n",
       "INFO: 2 person scrubbed.\n",
+      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 5/5 [00:00<00:00, 84.78it/s]\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 phone_number scrubbed.\n",
       "INFO: Scrubbing email addresses using regex...\n",
@@ -719,21 +754,23 @@
       "INFO: 0 ip_address scrubbed.\n",
       "INFO: Scrubbing phone numbers using regex...\n",
       "INFO: 0 uk_phone_number scrubbed.\n",
+      "INFO: Scrubbing addresses using regex...\n",
+      "INFO: 0 uk_address scrubbed.\n",
       "INFO: Scrubbing postcodes using regex...\n",
       "INFO: 0 uk_postcode scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
       "INFO: 0 title scrubbed.\n",
-      " 67%|██████▋   | 2/3 [00:05<00:02,  2.50s/it]INFO: Texts loaded.\n",
+      " 67%|██████▋   | 2/3 [00:05<00:02,  2.49s/it]INFO: Texts loaded.\n",
       "INFO: Scrubbing column `Fake book`...\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 13.15it/s]\n",
+      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 5/5 [00:00<00:00, 13.41it/s]\n",
       "INFO: 1 iban_code scrubbed.\n",
+      "INFO: 5 url scrubbed.\n",
       "INFO: 2 person scrubbed.\n",
       "INFO: 3 email_address scrubbed.\n",
-      "INFO: 3 email_address scrubbed.\n",
-      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 54.15it/s]\n",
-      "INFO: 2 person scrubbed.\n",
+      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 5/5 [00:00<00:00, 64.57it/s]\n",
+      "INFO: 1 org scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 phone_number scrubbed.\n",
       "INFO: Scrubbing email addresses using regex...\n",
@@ -744,11 +781,13 @@
       "INFO: 0 ip_address scrubbed.\n",
       "INFO: Scrubbing phone numbers using regex...\n",
       "INFO: 0 uk_phone_number scrubbed.\n",
+      "INFO: Scrubbing addresses using regex...\n",
+      "INFO: 0 uk_address scrubbed.\n",
       "INFO: Scrubbing postcodes using regex...\n",
       "INFO: 4 uk_postcode scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
       "INFO: 0 title scrubbed.\n",
-      "100%|██████████| 3/3 [00:07<00:00,  2.56s/it]\n"
+      "100%|██████████| 3/3 [00:07<00:00,  2.53s/it]\n"
      ]
     },
     {
@@ -810,7 +849,7 @@
        "      <td>The business of her life was to get her daught...</td>\n",
        "      <td>I am a brain, [PERSON]. The rest of me is a me...</td>\n",
        "      <td>Nothing is more painful to the human mind than...</td>\n",
-       "      <td>A message arrived just as the Downing Street c...</td>\n",
+       "      <td>A message arrived just as the [ORG] clock stru...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -850,7 +889,7 @@
        "0  The letter to [EMAIL_ADDRESS] was stamped with...  \n",
        "1  She forwarded the memo from [PERSON] and [PERS...  \n",
        "2  The dossier marked confidential came from [EMA...  \n",
-       "3  A message arrived just as the Downing Street c...  \n",
+       "3  A message arrived just as the [ORG] clock stru...  \n",
        "4  They did not expected a reply from [EMAIL_ADDR...  "
       ]
      },
@@ -900,6 +939,7 @@
        "      <th>email_address</th>\n",
        "      <th>iban_code</th>\n",
        "      <th>url</th>\n",
+       "      <th>org</th>\n",
        "      <th>uk_postcode</th>\n",
        "    </tr>\n",
        "  </thead>\n",
@@ -914,6 +954,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -925,6 +966,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -936,6 +978,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -947,6 +990,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -958,6 +1002,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
@@ -968,6 +1013,7 @@
        "      <td>[freddie.mercury@queen.com]</td>\n",
        "      <td>[GB91BKEN10000041610008]</td>\n",
        "      <td>[freddie.me, queen.com]</td>\n",
+       "      <td>None</td>\n",
        "      <td>[SW1A 2AA]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -979,6 +1025,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "      <td>[SW1A 2WH]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -990,6 +1037,7 @@
        "      <td>[serena.williams@tennis.com]</td>\n",
        "      <td>None</td>\n",
        "      <td>[tennis.com]</td>\n",
+       "      <td>None</td>\n",
        "      <td>[SW19 5AE]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1001,8 +1049,21 @@
        "      <td>[otis.redding@dockofthebay.org]</td>\n",
        "      <td>None</td>\n",
        "      <td>[otis.red, dockofthebay.org]</td>\n",
+       "      <td>None</td>\n",
        "      <td>[EH8 8DX]</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>D</td>\n",
+       "      <td>Fake book</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>[Downing Street]</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
@@ -1018,6 +1079,7 @@
        "6  B                          Fake book  [Mick Jagger, David Bowie]  None   \n",
        "7  C                          Fake book                        None  None   \n",
        "8  E                          Fake book                        None  None   \n",
+       "9  D                          Fake book                        None  None   \n",
        "\n",
        "                     email_address                 iban_code  \\\n",
        "0                             None                      None   \n",
@@ -1029,17 +1091,19 @@
        "6                             None                      None   \n",
        "7     [serena.williams@tennis.com]                      None   \n",
        "8  [otis.redding@dockofthebay.org]                      None   \n",
+       "9                             None                      None   \n",
        "\n",
-       "                            url uk_postcode  \n",
-       "0                          None        None  \n",
-       "1                          None        None  \n",
-       "2                          None        None  \n",
-       "3                          None        None  \n",
-       "4                          None        None  \n",
-       "5       [freddie.me, queen.com]  [SW1A 2AA]  \n",
-       "6                          None  [SW1A 2WH]  \n",
-       "7                  [tennis.com]  [SW19 5AE]  \n",
-       "8  [otis.red, dockofthebay.org]   [EH8 8DX]  "
+       "                            url               org uk_postcode  \n",
+       "0                          None              None        None  \n",
+       "1                          None              None        None  \n",
+       "2                          None              None        None  \n",
+       "3                          None              None        None  \n",
+       "4                          None              None        None  \n",
+       "5       [freddie.me, queen.com]              None  [SW1A 2AA]  \n",
+       "6                          None              None  [SW1A 2WH]  \n",
+       "7                  [tennis.com]              None  [SW19 5AE]  \n",
+       "8  [otis.red, dockofthebay.org]              None   [EH8 8DX]  \n",
+       "9                          None  [Downing Street]        None  "
       ]
      },
      "execution_count": 11,

test/test_chain.py CHANGED Viewed

@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
 def test_chain(scrub_object):
     scrub_object.uk_phone_numbers()
     scrub_object.uk_postcodes()
-    scrubbed = scrub_object.spacy_persons()
+    scrubbed = scrub_object.spacy_entities()
     assert scrubbed == [
         "Our names are [PERSON], [PERSON], and [PERSON].",
@@ -38,7 +38,7 @@ def test_chain_order(scrub_object):
 def test_get_scrubbed_data_chain(scrub_object):
     scrub_object.uk_phone_numbers()
     scrub_object.uk_postcodes()
-    scrub_object.spacy_persons()
+    scrub_object.spacy_entities()
     df = scrub_object.get_scrubbed_data()

test/test_huggingface.py CHANGED Viewed

@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
 def test_huggingface():
     scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
-    scrubbed = scrub.huggingface_persons()
+    scrubbed = scrub.huggingface_entities()
     assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
@@ -14,12 +14,12 @@ def test_huggingface_error():
     scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
     with pytest.raises(OSError):
-        scrub.huggingface_persons(hf_model_path="not_a_path")
+        scrub.huggingface_entities(hf_model_path="not_a_path")
 def test_huggingface_empty():
     scrub = IDScrub([" ", "John Smith", ""])
-    scrubbed = scrub.huggingface_persons()
+    scrubbed = scrub.huggingface_entities()
     assert scrubbed == [" ", "[PERSON]", ""]
     assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))

test/test_label.py CHANGED Viewed

@@ -1,6 +1,6 @@
 def test_label(scrub_object_all):
     for i, scrub_method in enumerate(
-        ["spacy_persons", "uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
+        ["uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
     ):
         method = getattr(scrub_object_all, scrub_method)
         method(label="test")

test/{test_persidio.py → test_presidio.py} RENAMED Viewed

@@ -4,32 +4,32 @@ from pandas.testing import assert_frame_equal
 # Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
-def test_persidio():
+def test_presidio():
     scrub = IDScrub(
         ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
     )
-    scrubbed_texts = scrub.presidio(entities_to_scrub=["PERSON", "IBAN_CODE"])
+    scrubbed_texts = scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
     assert scrubbed_texts == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
-def test_persidio_map():
+def test_presidio_map():
     scrub = IDScrub(
         ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
     )
-    scrubbed_texts = scrub.presidio(
-        entities_to_scrub=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
+    scrubbed_texts = scrub.presidio_entities(
+        entities=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
     )
     assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
-def test_persidio_get_data():
+def test_presidio_get_data():
     scrub = IDScrub(
         ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
     )
-    scrub.presidio(entities_to_scrub=["PERSON", "IBAN_CODE"])
+    scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
     df = scrub.get_scrubbed_data()

test/test_regex.py CHANGED Viewed

@@ -63,6 +63,52 @@ def test_handles():
     assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
+def test_uk_addresses():
+    scrub = IDScrub(
+        [
+            "221B Baker Street",
+            "12 high road",
+            "Flat 3B, 47 King's Court",
+            "12–14 High Street",
+            "5a-7a Church Lane",
+            "1/2 Main Street",
+            "10 St John’s Rd",
+            "33 Queen-Anne Walk",
+            "8 Deansgate Ct",
+        ]
+    )
+    scrubbed = scrub.uk_addresses()
+    assert scrubbed == [
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
+    ]
+    negative_tests = [
+        "12 High",
+        "Baker Street",
+        "High Road 12",
+        "Go to the high road now",
+        "500 the big building near river",
+        "I walked the long road home",
+        "12b misspelledstreet",
+        "London SW1A 1AA",
+        "12,,, High?",
+    ]
+    scrub = IDScrub(negative_tests)
+    scrubbed = scrub.uk_addresses()
+    assert scrubbed == negative_tests
 def test_claimants():
     scrub = IDScrub(
         texts=[

test/test_scrub.py CHANGED Viewed

@@ -5,7 +5,7 @@ from pandas.testing import assert_frame_equal
 # Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
 def test_scrub(scrub_object):
-    scrubbed = scrub_object.scrub(scrub_methods=["spacy_persons", "uk_phone_numbers", "uk_postcodes"])
+    scrubbed = scrub_object.scrub(scrub_methods=["spacy_entities", "uk_phone_numbers", "uk_postcodes"])
     assert scrubbed == [
         "Our names are [PERSON], [PERSON], and [PERSON].",
         "My number is [PHONENO] and I live at [POSTCODE].",
@@ -15,7 +15,7 @@ def test_scrub(scrub_object):
 def test_scrub_text_id():
     scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
-    scrub.scrub(scrub_methods=["spacy_persons"])
+    scrub.scrub(scrub_methods=["spacy_entities"])
     df = scrub.get_scrubbed_data()
@@ -38,7 +38,7 @@ def test_scrub_get_scrubbed_data(scrub_object):
 def test_scrub_order(scrub_object):
-    scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_persons"])
+    scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_entities"])
     assert scrub_object.get_scrubbed_data().columns.to_list() == [
         "text_id",

test/test_spacy.py CHANGED Viewed

@@ -7,7 +7,7 @@ from pandas.testing import assert_frame_equal
 # Note: This test will fail if the kernel has not been restarted since the SpaCy model was downloaded.
 def test_spacy():
     scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
-    scrubbed = scrub.spacy_persons(model_name="en_core_web_trf")
+    scrubbed = scrub.spacy_entities(entities=["PERSON"], model_name="en_core_web_trf")
     assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
@@ -15,12 +15,39 @@ def test_spacy_error():
     scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
     with pytest.raises(ValueError):
-        scrub.spacy_persons(model_name="not_a_model")
+        scrub.spacy_entities(model_name="not_a_model")
 def test_spacy_empty():
     scrub = IDScrub([" ", "John Smith", ""])
-    scrubbed = scrub.spacy_persons()
+    scrubbed = scrub.spacy_entities()
     assert scrubbed == [" ", "[PERSON]", ""]
     assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
+def test_spacy_map():
+    scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
+    scrubbed_texts = scrub.spacy_entities(
+        entities=["PERSON", "ORG"], replacement_map={"PERSON": "[PHELLO]", "ORG": "[SPACE]"}
+    )
+    assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My company code is [SPACE]."]
+def test_spacy_get_data():
+    scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
+    scrub.spacy_entities(entities=["PERSON", "ORG"])
+    df = scrub.get_scrubbed_data()
+    expected_df = pd.DataFrame(
+        {
+            "text_id": {0: 1, 1: 2},
+            "person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
+            "org": {0: None, 1: ["NASA"]},
+        }
+    )
+    assert_frame_equal(df, expected_df)

idscrub-1.0.0.dist-info/RECORD DELETED Viewed

@@ -1,22 +0,0 @@
-idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
-idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
-idscrub/scrub.py,sha256=VqVqcChbbxMEKJR6Aci971dqG-RmD48otrp9sG2dX0o,34443
-idscrub-1.0.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
-notebooks/basic_usage.ipynb,sha256=XTBxdtu2F0S99V2lntUEeFj6SN4GRVm4qKvqOhs7nec,38777
-test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
-test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
-test/test_chain.py,sha256=tGxcG5zRMcX22RfcrimqX6Le2iFPH9NqfZy7Idhelps,1808
-test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
-test/test_huggingface.py,sha256=OGwWSz_tzcynuRFXOdV4H4ProKnekYMdtZJviXEejiA,836
-test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
-test/test_label.py,sha256=aTGmtAWSLHrgoVBbCFUCqj52LmlCEKN6owycOyfVNpQ,669
-test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
-test/test_persidio.py,sha256=rkqiUr-vYnfCf7Xt0gNo2VQK2gi5JKP7ThSlT803swc,1558
-test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
-test/test_regex.py,sha256=zuq8g_8F_P5oCA2ChU5wUIFEWjT9LSYB0S_U1rBpTn4,4388
-test/test_scrub.py,sha256=MWpan5cWIGeNPJCvTwtYe-iZeoIjS_fZMIg46ZVrkJo,1377
-test/test_spacy.py,sha256=KHalx16GYHmCaQUU1O5bLMP95SLTu1007fJK1oq__v4,932
-idscrub-1.0.0.dist-info/METADATA,sha256=fo7FUBAHDei63EWPRUrfNS05p3bnZWSY2GPVrho0vjo,5403
-idscrub-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-idscrub-1.0.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
-idscrub-1.0.0.dist-info/RECORD,,

{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{idscrub-1.0.0.dist-info → idscrub-1.1.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

idscrub 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

idscrub 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl