PyPI - idscrub - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

idscrub 0.1.1py3-none-any.whl → 0.2.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

idscrub/scrub.py +50 -5
{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/METADATA +9 -9
{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/RECORD +8 -7
notebooks/basic_usage.ipynb +173 -91
test/test_scrub.py +48 -0
{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/WHEEL +0 -0
{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/licenses/LICENSE +0 -0
{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/top_level.txt +0 -0

idscrub/scrub.py CHANGED Viewed

@@ -19,7 +19,7 @@ from tqdm import tqdm
 from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
 from transformers.utils import logging as trf_logging
-from idscrub.locations import DOWNLOAD_DIR, PROJECT_DIR
+from idscrub.locations import DOWNLOAD_DIR
 # Suppress Torch FutureWarning
 # TODO: Find better way
@@ -879,10 +879,46 @@ class IDScrub:
         return scrub_methods.get(scrub_method, lambda: "Unknown method.")()
+    def scrub(self, scrub_methods: list[str] = ["all"]) -> list[str]:
+        """
+        Scrubs text using given methods (in order).
+        Uses default values for the given scrub method.
+        Methods available (see associated method docstring for further information):
+        "all", "spacy_persons", "huggingface_persons", "email_addresses", "handles",
+        "ip_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes",
+        "titles", "presidio"
+        Example:
+        "email_addresses" = scrub.email_addresses()
+        Therefore we can call:
+        IDScrub.scrub(scrub_methods = ["email_addresses"])
+        Args:
+            scrub_methods (list[str]): String names of the scrub methods to apply, in order.
+        Returns:
+            list[str]: The input list of text with personal information replaced.
+        """
+        for i, scrub_method in enumerate(scrub_methods):
+            if i == len(scrub_methods) - 1:
+                self.call_scrub_method(scrub_method)
+            else:
+                self.call_scrub_method(scrub_method)
+        return self.cleaned_texts
     @staticmethod
     def dataframe(
         df: pd.DataFrame = None,
         id_col: str = None,
+        exclude_cols: list[str] = None,
         scrub_methods: list[str] = ["all"],
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         """
@@ -891,6 +927,7 @@ class IDScrub:
         Args:
             df (pd.DataFrame): A Pandas dataframe to scrub.
             id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `id` is applied.
+            exclude_cols (list): Columns to exclude from scrubbing. If None, all columns are scrubbed.
             scrub_methods (list[str]): Which scrub methods to apply to the DataFrame (in order).
             These are string versions of the existing methods e.g. "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
@@ -899,6 +936,8 @@ class IDScrub:
         """
+        assert id_col in df.columns, "`id_col` is not a column in `df`. Please check."
         if id_col:
             ids = df[id_col].to_list()
         if not id_col:
@@ -908,14 +947,18 @@ class IDScrub:
         assert isinstance(df, pd.DataFrame), "`df` must be a Pandas DataFrame."
         assert len(df) == len(ids), "Length of dataframe is different to the length of IDs."
+        if exclude_cols is None:
+            cols_to_scrub = df.columns.to_list()
+        else:
+            cols_to_scrub = [col for col in df.columns if col not in exclude_cols]
+        cols_to_scrub.remove(id_col)
         scrubbed_df = df.copy()
         all_scrubbed_data = []
-        for col in tqdm(scrubbed_df.columns):
-            if col == id_col:
-                continue
+        for col in tqdm(cols_to_scrub):
             original_dtype = scrubbed_df[col].dtype
             scrubbed_df[col] = scrubbed_df[col].astype(str)
@@ -944,4 +987,6 @@ class IDScrub:
         all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
         all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)
+        assert df.shape == scrubbed_df.shape, "Original and scrubbed dataframe not the same shape. Check."
         return scrubbed_df, all_scrubbed_data

{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: idscrub
-Version: 0.1.1
+Version: 0.2.0
 Author: Department for Business and Trade
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -20,7 +20,7 @@ Dynamic: license-file
 # idscrub 🧽✨
-## Project Info
+## Project Information
 * This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
@@ -84,15 +84,15 @@ Dynamic: license-file
 ## Installation
-`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example (with spaCy model installed):
+`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
 ```console
-pip install 'git+ssh://git@github.com/uktrade/idscrub.git#egg=idscrub[trf]'
+pip install idscrub
 ```
-or without spaCy installed (it will be installed automatically if name cleaning methods are called):
+or with the spaCy transformer model (`en_core_web_trf`) already installed:
 ```console
-pip install 'git+ssh://git@github.com/uktrade/idscrub.git'
+pip install idscrub[trf]
 ```
 ## How to use the code
@@ -102,12 +102,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
 ```python
 from idscrub import IDScrub
-scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA, Lapland.'])
-scrubbed_texts = scrub.all()
+scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
+scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
 print(scrubbed_texts)
-# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], [LOCATION].']
+# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
 ```
 ## AI Declaration

{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
 idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
-idscrub/scrub.py,sha256=WgVC2vch7zQ159jktOFIhjYEs-SiA_xW_dz_A1litaI,33504
-idscrub-0.1.1.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
-notebooks/basic_usage.ipynb,sha256=iRSvufgORrzzlVxOFehhJtPnpT1-ZMgMdIcIYm7kvK8,38339
+idscrub/scrub.py,sha256=cYA76efkbR6rjHvl9yejtwmJ6MV8qx7_V4Azk4sWhjA,35073
+idscrub-0.2.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
+notebooks/basic_usage.ipynb,sha256=2fQdapXAFb79ZTcMfveqSC4TMNrsvqDpvF15rw3LUvM,39798
 test/conftest.py,sha256=ph1S3LMvzlzvOsb3l2YhpyHSdmg4uV7p61ge_JVCGv0,267
 test/test_all.py,sha256=z6v9O2Ts9dWITlhvZwRMyKUZsO7ncaT3znqqBCKJ6Wc,1141
 test/test_chain.py,sha256=YFGqO0xUzZ69x-iNCdKEiH-OWWZfyYYFgmEq0urELEs,1883
@@ -13,8 +13,9 @@ test/test_log.py,sha256=qKVZAzcaVllKepM-vgCWqqY9f8GyNxO7V0sa1WD0tsA,673
 test/test_persidio.py,sha256=NSX5gzhhBX5l9GTXwPK4wjMzcp6wmAfWJYQo45UMVIc,1594
 test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
 test/test_regex.py,sha256=EQGx3PHwJJzIdy6xwR8gEsSRDtlWHR-U81EPI811eZA,4474
+test/test_scrub.py,sha256=pohmw3frtlkmZDMvOEbmvVJgtcVdFlEDL3TxR5-y-0Q,1422
 test/test_spacy.py,sha256=mrUGUulvzDGgQRttdG0tgL2sGBRmYfg1fDNp7SFq8as,961
-idscrub-0.1.1.dist-info/METADATA,sha256=WndwUbgKC_pOT_RonGvWWglHkjE3yhWGrjBw7d6deOs,6192
-idscrub-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-idscrub-0.1.1.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
-idscrub-0.1.1.dist-info/RECORD,,
+idscrub-0.2.0.dist-info/METADATA,sha256=2NERZcMHsGbnotclunZ-0ZgZaCMAN39j9s_zswp1bXQ,6101
+idscrub-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+idscrub-0.2.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
+idscrub-0.2.0.dist-info/RECORD,,

notebooks/basic_usage.ipynb CHANGED Viewed

@@ -9,22 +9,139 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO: Texts loaded.\n",
+      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
+      "100%|██████████| 2/2 [00:00<00:00, 44.29it/s]\n",
+      "INFO: 3 spacy person scrubbed.\n",
+      "INFO: Scrubbing phone numbers using regex...\n",
+      "INFO: 1 uk phone numbers scrubbed.\n",
+      "INFO: Scrubbing UK postcodes using regex...\n",
+      "INFO: 1 uk postcodes scrubbed.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], Lapland.']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from idscrub import IDScrub\n",
+    "\n",
+    "scrub = IDScrub(\n",
+    "    [\n",
+    "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
+    "        \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_persons\", \"uk_phone_numbers\", \"uk_postcodes\"])\n",
+    "\n",
+    "print(scrubbed_texts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text_id</th>\n",
+       "      <th>scrubbed_spacy_person</th>\n",
+       "      <th>scrubbed_uk_phone_numbers</th>\n",
+       "      <th>scrubbed_uk_postcodes</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>None</td>\n",
+       "      <td>[+441111111111]</td>\n",
+       "      <td>[AA11 1AA]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   text_id                      scrubbed_spacy_person  \\\n",
+       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]   \n",
+       "1        2                                       None   \n",
+       "\n",
+       "  scrubbed_uk_phone_numbers scrubbed_uk_postcodes  \n",
+       "0                      None                  None  \n",
+       "1           [+441111111111]            [AA11 1AA]  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "scrub.get_scrubbed_data()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Or scrub `all`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/euansoutter/Documents/code/idscrub/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n",
       "INFO: Texts loaded.\n",
       "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 2/2 [00:00<00:00,  9.48it/s]\n",
+      "100%|██████████| 2/2 [00:00<00:00, 25.19it/s]\n",
       "INFO: 3 presidio person scrubbed.\n",
       "INFO: 1 presidio location scrubbed.\n",
       "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 55.76it/s]\n",
+      "100%|██████████| 2/2 [00:00<00:00, 48.66it/s]\n",
       "INFO: 0 spacy person scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 gb phone numbers scrubbed.\n",
@@ -59,14 +176,15 @@
     "        \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
     "    ]\n",
     ")\n",
-    "scrubbed_texts = scrub.all()\n",
+    "\n",
+    "scrubbed_texts = scrub.scrub(scrub_methods=[\"all\"])\n",
     "\n",
     "print(scrubbed_texts)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -128,7 +246,7 @@
        "1                  [Lapland]           [+441111111111]            [AA11 1AA]  "
       ]
      },
-     "execution_count": 2,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -146,7 +264,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -155,7 +273,7 @@
      "text": [
       "INFO: Texts loaded.\n",
       "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 25.84it/s]\n",
+      "100%|██████████| 2/2 [00:00<00:00, 23.03it/s]\n",
       "INFO: 3 presidio person scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 gb phone numbers scrubbed.\n",
@@ -206,7 +324,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -271,7 +389,7 @@
        "1              [ACHILLES]           [+441111111111]            [AA11 1AA]  "
       ]
      },
-     "execution_count": 4,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -290,7 +408,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -299,7 +417,7 @@
      "text": [
       "INFO: Texts loaded.\n",
       "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 26.18it/s]\n",
+      "100%|██████████| 2/2 [00:00<00:00, 23.38it/s]\n",
       "INFO: 3 presidio person scrubbed.\n",
       "INFO: 1 presidio iban code scrubbed.\n"
      ]
@@ -325,7 +443,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -381,7 +499,7 @@
        "1    [GB91BKEN10000041610008]  "
       ]
      },
-     "execution_count": 6,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -399,7 +517,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -505,7 +623,7 @@
        "4  They did not expected a reply from otis.reddin...  "
       ]
      },
-     "execution_count": 7,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -551,21 +669,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "  0%|          | 0/5 [00:00<?, ?it/s]INFO: Texts loaded.\n",
+      "  0%|          | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
       "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 24.83it/s]\n",
+      "100%|██████████| 5/5 [00:00<00:00, 18.99it/s]\n",
       "INFO: 4 presidio person scrubbed.\n",
       "INFO: 4 presidio person scrubbed.\n",
       "INFO: 4 presidio person scrubbed.\n",
       "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 71.71it/s]\n",
+      "100%|██████████| 5/5 [00:00<00:00, 67.00it/s]\n",
       "INFO: 0 spacy person scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 gb phone numbers scrubbed.\n",
@@ -581,34 +699,13 @@
       "INFO: 0 uk postcodes scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
       "INFO: 2 titles scrubbed.\n",
-      " 40%|████      | 2/5 [00:02<00:03,  1.25s/it]INFO: Texts loaded.\n",
+      " 33%|███▎      | 1/3 [00:03<00:06,  3.24s/it]INFO: Texts loaded.\n",
       "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 29.98it/s]\n",
+      "100%|██████████| 5/5 [00:00<00:00, 21.83it/s]\n",
       "INFO: 2 presidio person scrubbed.\n",
       "INFO: 2 presidio person scrubbed.\n",
       "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 96.09it/s]\n",
-      "INFO: 0 spacy person scrubbed.\n",
-      "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
-      "INFO: 0 gb phone numbers scrubbed.\n",
-      "INFO: Scrubbing email addresses using regex...\n",
-      "INFO: 0 email addresses scrubbed.\n",
-      "INFO: Scrubbing @user handles using regex...\n",
-      "INFO: 0 handles scrubbed.\n",
-      "INFO: Scrubbing IP addresses using regex...\n",
-      "INFO: 0 ip addresses scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 0 uk phone numbers scrubbed.\n",
-      "INFO: Scrubbing UK postcodes using regex...\n",
-      "INFO: 0 uk postcodes scrubbed.\n",
-      "INFO: Scrubbing titles using regex...\n",
-      "INFO: 0 titles scrubbed.\n",
-      " 60%|██████    | 3/5 [00:04<00:03,  1.66s/it]INFO: Texts loaded.\n",
-      "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 26.73it/s]\n",
-      "INFO: 1 presidio person scrubbed.\n",
-      "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 89.71it/s]\n",
+      "100%|██████████| 5/5 [00:00<00:00, 84.69it/s]\n",
       "INFO: 0 spacy person scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 gb phone numbers scrubbed.\n",
@@ -624,15 +721,15 @@
       "INFO: 0 uk postcodes scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
       "INFO: 0 titles scrubbed.\n",
-      " 80%|████████  | 4/5 [00:07<00:01,  1.91s/it]INFO: Texts loaded.\n",
+      " 67%|██████▋   | 2/3 [00:06<00:03,  3.24s/it]INFO: Texts loaded.\n",
       "INFO: Scrubbing using Presidio...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 21.44it/s]\n",
-      "INFO: 3 presidio email address scrubbed.\n",
+      "100%|██████████| 5/5 [00:00<00:00, 29.32it/s]\n",
+      "INFO: 5 presidio url scrubbed.\n",
       "INFO: 2 presidio person scrubbed.\n",
       "INFO: 3 presidio email address scrubbed.\n",
       "INFO: 3 presidio email address scrubbed.\n",
       "INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 57.46it/s]\n",
+      "100%|██████████| 5/5 [00:00<00:00, 66.37it/s]\n",
       "INFO: 0 spacy person scrubbed.\n",
       "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
       "INFO: 0 gb phone numbers scrubbed.\n",
@@ -648,7 +745,7 @@
       "INFO: 4 uk postcodes scrubbed.\n",
       "INFO: Scrubbing titles using regex...\n",
       "INFO: 0 titles scrubbed.\n",
-      "100%|██████████| 5/5 [00:09<00:00,  1.91s/it]\n"
+      "100%|██████████| 3/3 [00:08<00:00,  2.94s/it]\n"
      ]
     },
     {
@@ -685,7 +782,7 @@
        "      <td>A</td>\n",
        "      <td>[TITLE]. [PERSON] walked off; and [PERSON] rem...</td>\n",
        "      <td>To [PERSON] she is always the woman.</td>\n",
-       "      <td>My dear [PERSON], do not waste your time upon ...</td>\n",
+       "      <td>My dear Victor, do not waste your time upon th...</td>\n",
        "      <td>The letter to [EMAIL_ADDRESS] was stamped with...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -740,7 +837,7 @@
        "4  When you have eliminated the impossible, whate...   \n",
        "\n",
        "                                        Frankenstein  \\\n",
-       "0  My dear [PERSON], do not waste your time upon ...   \n",
+       "0  My dear Victor, do not waste your time upon th...   \n",
        "1  Learn from me, if not by my precepts, at least...   \n",
        "2  I had worked hard for nearly two years, for th...   \n",
        "3  Nothing is more painful to the human mind than...   \n",
@@ -754,7 +851,7 @@
        "4  They did not expected a reply from [EMAIL_ADDR...  "
       ]
      },
-     "execution_count": 8,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -762,14 +859,14 @@
    "source": [
     "from idscrub import IDScrub\n",
     "\n",
-    "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", scrub_methods=[\"all\"])\n",
+    "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], scrub_methods=[\"all\"])\n",
     "\n",
     "scrubbed_df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -862,17 +959,6 @@
        "    <tr>\n",
        "      <th>5</th>\n",
        "      <td>A</td>\n",
-       "      <td>Frankenstein</td>\n",
-       "      <td>[Victor]</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>A</td>\n",
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
@@ -882,7 +968,7 @@
        "      <td>[SW1A 2AA]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7</th>\n",
+       "      <th>6</th>\n",
        "      <td>B</td>\n",
        "      <td>Fake book</td>\n",
        "      <td>[Mick Jagger, David Bowie]</td>\n",
@@ -893,7 +979,7 @@
        "      <td>[SW1A 2WH]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>8</th>\n",
+       "      <th>7</th>\n",
        "      <td>C</td>\n",
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
@@ -904,7 +990,7 @@
        "      <td>[SW19 5AE]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>9</th>\n",
+       "      <th>8</th>\n",
        "      <td>E</td>\n",
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
@@ -925,11 +1011,10 @@
        "2  C                Pride and Prejudice                 [Elizabeth]   \n",
        "3  A  The Adventures of Sherlock Holmes           [Sherlock Holmes]   \n",
        "4  D  The Adventures of Sherlock Holmes                    [Watson]   \n",
-       "5  A                       Frankenstein                    [Victor]   \n",
-       "6  A                          Fake book                        None   \n",
-       "7  B                          Fake book  [Mick Jagger, David Bowie]   \n",
-       "8  C                          Fake book                        None   \n",
-       "9  E                          Fake book                        None   \n",
+       "5  A                          Fake book                        None   \n",
+       "6  B                          Fake book  [Mick Jagger, David Bowie]   \n",
+       "7  C                          Fake book                        None   \n",
+       "8  E                          Fake book                        None   \n",
        "\n",
        "  scrubbed_titles  scrubbed_presidio_email_address  \\\n",
        "0            [Mr]                             None   \n",
@@ -937,11 +1022,10 @@
        "2            None                             None   \n",
        "3            None                             None   \n",
        "4            None                             None   \n",
-       "5            None                             None   \n",
-       "6            None      [freddie.mercury@queen.com]   \n",
-       "7            None                             None   \n",
-       "8            None     [serena.williams@tennis.com]   \n",
-       "9            None  [otis.redding@dockofthebay.org]   \n",
+       "5            None      [freddie.mercury@queen.com]   \n",
+       "6            None                             None   \n",
+       "7            None     [serena.williams@tennis.com]   \n",
+       "8            None  [otis.redding@dockofthebay.org]   \n",
        "\n",
        "  scrubbed_presidio_iban_code         scrubbed_presidio_url  \\\n",
        "0                        None                          None   \n",
@@ -949,11 +1033,10 @@
        "2                        None                          None   \n",
        "3                        None                          None   \n",
        "4                        None                          None   \n",
-       "5                        None                          None   \n",
-       "6    [GB91BKEN10000041610008]       [freddie.me, queen.com]   \n",
-       "7                        None                          None   \n",
-       "8                        None                  [tennis.com]   \n",
-       "9                        None  [otis.red, dockofthebay.org]   \n",
+       "5    [GB91BKEN10000041610008]       [freddie.me, queen.com]   \n",
+       "6                        None                          None   \n",
+       "7                        None                  [tennis.com]   \n",
+       "8                        None  [otis.red, dockofthebay.org]   \n",
        "\n",
        "  scrubbed_uk_postcodes  \n",
        "0                  None  \n",
@@ -961,14 +1044,13 @@
        "2                  None  \n",
        "3                  None  \n",
        "4                  None  \n",
-       "5                  None  \n",
-       "6            [SW1A 2AA]  \n",
-       "7            [SW1A 2WH]  \n",
-       "8            [SW19 5AE]  \n",
-       "9             [EH8 8DX]  "
+       "5            [SW1A 2AA]  \n",
+       "6            [SW1A 2WH]  \n",
+       "7            [SW19 5AE]  \n",
+       "8             [EH8 8DX]  "
       ]
      },
-     "execution_count": 9,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }

test/test_scrub.py ADDED Viewed

@@ -0,0 +1,48 @@
+import pandas as pd
+from idscrub import IDScrub
+from pandas.testing import assert_frame_equal
+# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
+def test_scrub(scrub_object):
+    scrubbed = scrub_object.scrub(scrub_methods=["spacy_persons", "uk_phone_numbers", "uk_postcodes"])
+    assert scrubbed == [
+        "Our names are [PERSON], [PERSON], and [PERSON].",
+        "My number is [PHONENO] and I live at [POSTCODE].",
+    ]
+def test_scrub_text_id():
+    scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
+    scrub.scrub(scrub_methods=["spacy_persons"])
+    df = scrub.get_scrubbed_data()
+    assert df["text_id"].max() == 10
+    assert len(df["text_id"]) == 10
+def test_scrub_get_scrubbed_data(scrub_object):
+    scrub_object.scrub(scrub_methods=["uk_postcodes"])
+    df = scrub_object.get_scrubbed_data()
+    expected_df = pd.DataFrame(
+        {
+            "text_id": {0: 2},
+            "scrubbed_uk_postcodes": {0: ["AA11 1AA"]},
+        }
+    )
+    assert_frame_equal(df, expected_df)
+def test_scrub_order(scrub_object):
+    scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_persons"])
+    assert scrub_object.get_scrubbed_data().columns.to_list() == [
+        "text_id",
+        "scrubbed_uk_postcodes",
+        "scrubbed_uk_phone_numbers",
+        "scrubbed_spacy_person",
+    ]

{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

idscrub 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

idscrub 0.1.1py3-none-any.whl → 0.2.0py3-none-any.whl