PyPI - idscrub - Versions diffs - 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

idscrub 1.1.2py3-none-any.whl → 2.0.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

idscrub/scrub.py +694 -525
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/METADATA +58 -12
idscrub-2.0.1.dist-info/RECORD +24 -0
notebooks/basic_usage.ipynb +294 -351
test/conftest.py +36 -0
test/test_dataframe.py +8 -8
test/test_errors.py +32 -0
test/test_exclude.py +22 -0
test/test_group.py +9 -0
test/test_huggingface.py +3 -3
test/test_id.py +8 -7
test/test_label.py +22 -7
test/test_overlap.py +86 -0
test/test_phonenumbers.py +2 -2
test/test_presidio.py +21 -6
test/test_regex.py +110 -59
test/test_scrub.py +22 -12
test/test_scrub_text.py +22 -0
test/test_spacy.py +16 -12
idscrub-1.1.2.dist-info/RECORD +0 -22
test/test_all.py +0 -39
test/test_chain.py +0 -54
test/test_log.py +0 -17
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/WHEEL +0 -0
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/licenses/LICENSE +0 -0
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/top_level.txt +0 -0

notebooks/basic_usage.ipynb CHANGED Viewed

@@ -7,6 +7,13 @@
     "### `idscrub` basic usage examples"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With a default pipeline:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -17,23 +24,24 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 33.83it/s]\n",
-      "INFO: 1 org scrubbed.\n",
-      "INFO: 3 person scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 1 uk_phone_number scrubbed.\n",
-      "INFO: Scrubbing addresses using regex...\n",
-      "INFO: 1 uk_address scrubbed.\n",
-      "INFO: Scrubbing postcodes using regex...\n",
-      "INFO: 1 uk_postcode scrubbed.\n"
+      "INFO: Scrubbing using presidio_entities with default parameters...\n",
+      "INFO: Scrubbing using spacy_entities with default parameters...\n",
+      "INFO: Scrubbing using email_addresses with default parameters...\n",
+      "INFO: Scrubbing using handles with default parameters...\n",
+      "INFO: Scrubbing using ip_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
+      "INFO: Scrubbing using google_phone_numbers with default parameters...\n",
+      "INFO: Scrubbing using uk_postcodes with default parameters...\n",
+      "INFO: Scrubbing using urls with default parameters...\n",
+      "INFO: Scrubbing using titles with default parameters...\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
+      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
      ]
     }
    ],
@@ -47,15 +55,82 @@
     "    ]\n",
     ")\n",
     "\n",
-    "scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_entities\", \"uk_phone_numbers\", \"uk_addresses\", \"uk_postcodes\"])\n",
+    "scrubbed_texts = scrub.scrub()\n",
     "\n",
     "print(scrubbed_texts)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With a custom pipeline:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO: Texts loaded.\n",
+      "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON']}...\n",
+      "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
+      "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
+      "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
+      "INFO: Scrubbing using email_addresses with default parameters...\n",
+      "INFO: Scrubbing using handles with default parameters...\n",
+      "INFO: Scrubbing using ip_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
+      "INFO: Scrubbing using uk_postcodes with default parameters...\n",
+      "INFO: Scrubbing using urls with default parameters...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], Lapland']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from idscrub import IDScrub\n",
+    "\n",
+    "scrub = IDScrub(\n",
+    "    [\n",
+    "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
+    "        \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "pipeline = [\n",
+    "    {\"method\": \"presidio_entities\", \"entity_types\": [\"PERSON\"]},\n",
+    "    {\"method\": \"spacy_entities\", \"entity_types\": [\"ORG\"]},\n",
+    "    {\"method\": \"google_phone_numbers\", \"region\": \"GB\"},\n",
+    "    {\"method\": \"titles\", \"strict\": False},\n",
+    "    {\"method\": \"email_addresses\"},\n",
+    "    {\"method\": \"handles\"},\n",
+    "    {\"method\": \"ip_addresses\"},\n",
+    "    {\"method\": \"uk_addresses\"},\n",
+    "    {\"method\": \"uk_phone_numbers\"},\n",
+    "    {\"method\": \"uk_postcodes\"},\n",
+    "    {\"method\": \"urls\"},\n",
+    "]\n",
+    "\n",
+    "scrubbed_texts = scrub.scrub(pipeline=pipeline)\n",
+    "\n",
+    "print(scrubbed_texts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -123,7 +198,7 @@
        "1  [AA11 1AA]  "
       ]
      },
-     "execution_count": 2,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -136,12 +211,19 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Or scrub `all`:"
+    "### `idscrub` example - priority scoring"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If multiple different types of personal data have been identified in the same string, such as a handle, an email address and a URL, you can score one higher to ensure it is scrubbed:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -149,57 +231,52 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00,  9.14it/s]\n",
-      "INFO: 3 person scrubbed.\n",
-      "INFO: 1 location scrubbed.\n",
-      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 42.62it/s]\n",
-      "INFO: 1 org scrubbed.\n",
-      "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
-      "INFO: 0 phone_number scrubbed.\n",
-      "INFO: Scrubbing email addresses using regex...\n",
-      "INFO: 0 email_address scrubbed.\n",
-      "INFO: Scrubbing @user handles using regex...\n",
-      "INFO: 0 handle scrubbed.\n",
-      "INFO: Scrubbing IP addresses using regex...\n",
-      "INFO: 0 ip_address scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 1 uk_phone_number scrubbed.\n",
-      "INFO: Scrubbing addresses using regex...\n",
-      "INFO: 1 uk_address scrubbed.\n",
-      "INFO: Scrubbing postcodes using regex...\n",
-      "INFO: 1 uk_postcode scrubbed.\n",
-      "INFO: Scrubbing titles using regex...\n",
-      "INFO: 0 title scrubbed.\n"
+      "INFO: Scrubbing using handles with parameters {'priority': 0.1}...\n",
+      "INFO: Scrubbing using urls with parameters {'priority': 0.1}...\n",
+      "INFO: Scrubbing using email_addresses with parameters {'priority': 0.2}...\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [LOCATION]']\n"
+      "\n",
+      "All personal data identified: [('handle', '@person.com'), ('url', 'www.person@person.com'), ('email_address', 'www.person@person.com')]\n",
+      "\n",
+      "Personal data removed after priority scoring: [('email_address', 'www.person@person.com')]\n",
+      "\n",
+      "['My email is [EMAIL_ADDRESS]']\n"
      ]
     }
    ],
    "source": [
     "from idscrub import IDScrub\n",
     "\n",
-    "scrub = IDScrub(\n",
-    "    [\n",
-    "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
-    "        \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
+    "scrub = IDScrub(texts=[\"My email is www.person@person.com\"])\n",
+    "\n",
+    "scrubbed_texts = scrub.scrub(\n",
+    "    pipeline=[\n",
+    "        {\"method\": \"handles\", \"priority\": 0.1},\n",
+    "        {\"method\": \"urls\", \"priority\": 0.1},\n",
+    "        {\"method\": \"email_addresses\", \"priority\": 0.2},\n",
     "    ]\n",
     ")\n",
     "\n",
-    "scrubbed_texts = scrub.scrub(scrub_methods=[\"all\"])\n",
-    "\n",
+    "print(f\"\\nAll personal data identified: {[(ident.label, ident.text) for ident in scrub.idents_all]}\\n\")\n",
+    "print(f\"Personal data removed after priority scoring: {[(ident.label, ident.text) for ident in scrub.idents]}\\n\")\n",
     "print(scrubbed_texts)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To view all of the identified data:"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -224,72 +301,86 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text_id</th>\n",
-       "      <th>person</th>\n",
-       "      <th>location</th>\n",
-       "      <th>org</th>\n",
-       "      <th>uk_phone_number</th>\n",
-       "      <th>uk_address</th>\n",
-       "      <th>uk_postcode</th>\n",
+       "      <th>text</th>\n",
+       "      <th>start</th>\n",
+       "      <th>end</th>\n",
+       "      <th>label</th>\n",
+       "      <th>replacement</th>\n",
+       "      <th>priority</th>\n",
+       "      <th>source</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>1</td>\n",
-       "      <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
+       "      <td>@person.com</td>\n",
+       "      <td>22</td>\n",
+       "      <td>33</td>\n",
+       "      <td>handle</td>\n",
+       "      <td>[HANDLE]</td>\n",
+       "      <td>0.1</td>\n",
+       "      <td>regex</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[Lapland]</td>\n",
-       "      <td>[Department for Business and Trade]</td>\n",
-       "      <td>[+441111111111]</td>\n",
-       "      <td>[15 Elf Road]</td>\n",
-       "      <td>[AA11 1AA]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>www.person@person.com</td>\n",
+       "      <td>12</td>\n",
+       "      <td>33</td>\n",
+       "      <td>url</td>\n",
+       "      <td>[URL]</td>\n",
+       "      <td>0.1</td>\n",
+       "      <td>regex</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>www.person@person.com</td>\n",
+       "      <td>12</td>\n",
+       "      <td>33</td>\n",
+       "      <td>email_address</td>\n",
+       "      <td>[EMAIL_ADDRESS]</td>\n",
+       "      <td>0.2</td>\n",
+       "      <td>regex</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   text_id                                     person   location  \\\n",
-       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]       None   \n",
-       "1        2                                       None  [Lapland]   \n",
-       "\n",
-       "                                   org  uk_phone_number     uk_address  \\\n",
-       "0                                 None             None           None   \n",
-       "1  [Department for Business and Trade]  [+441111111111]  [15 Elf Road]   \n",
+       "   text_id                   text  start  end          label      replacement  \\\n",
+       "0        1            @person.com     22   33         handle         [HANDLE]   \n",
+       "1        1  www.person@person.com     12   33            url            [URL]   \n",
+       "2        1  www.person@person.com     12   33  email_address  [EMAIL_ADDRESS]   \n",
        "\n",
-       "  uk_postcode  \n",
-       "0        None  \n",
-       "1  [AA11 1AA]  "
+       "   priority source  \n",
+       "0       0.1  regex  \n",
+       "1       0.1  regex  \n",
+       "2       0.2  regex  "
       ]
      },
-     "execution_count": 4,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "scrub.get_scrubbed_data()"
+    "scrub.get_all_identified_data()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### `idscrub` example - chaining methods together"
+    "Note that methods which identify multiple identities, like `spacy_entities` and `presidio_entities`, will have the same priority score applied to each entity type. \n",
+    "\n",
+    "To assign priority scores based on entity types, you can chain methods together. For example, if you wanted to prioritise email addresses over names when using `presidio_entities`:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -297,151 +388,47 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 42.58it/s]\n",
-      "INFO: 1 org scrubbed.\n",
-      "INFO: 3 person scrubbed.\n",
-      "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
-      "INFO: 0 phone_number scrubbed.\n",
-      "INFO: Scrubbing custom regex...\n",
-      "INFO: 1 custom_regex_1 scrubbed.\n",
-      "INFO: 0 custom_regex_2 scrubbed.\n",
-      "INFO: Scrubbing email addresses using regex...\n",
-      "INFO: 0 email_address scrubbed.\n",
-      "INFO: Scrubbing @user handles using regex...\n",
-      "INFO: 0 handle scrubbed.\n",
-      "INFO: Scrubbing IP addresses using regex...\n",
-      "INFO: 0 ip_address scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 1 uk_phone_number scrubbed.\n",
-      "INFO: Scrubbing addresses using regex...\n",
-      "INFO: 1 uk_address scrubbed.\n",
-      "INFO: Scrubbing postcodes using regex...\n",
-      "INFO: 1 uk_postcode scrubbed.\n",
-      "INFO: Scrubbing titles using regex...\n",
-      "INFO: 0 title scrubbed.\n"
+      "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON'], 'priority': 0.1}...\n",
+      "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['EMAIL_ADDRESS'], 'priority': 0.2}...\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I work at [ORG], [ADDRESS], [POSTCODE], [UNIVERSITY]']\n"
+      "   text_id                 text  start  end          label      replacement  \\\n",
+      "0        1  John Smith@mail.com      0   19         person         [PERSON]   \n",
+      "1        1       Smith@mail.com      5   19  email_address  [EMAIL_ADDRESS]   \n",
+      "\n",
+      "   priority    source  \n",
+      "0       0.1  presidio  \n",
+      "1       0.2  presidio  \n",
+      "['John [EMAIL_ADDRESS]']\n"
      ]
     }
    ],
    "source": [
     "from idscrub import IDScrub\n",
     "\n",
-    "scrub = IDScrub(\n",
-    "    [\n",
-    "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
-    "        \"My number is +441111111111 and I work at Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
-    "    ]\n",
-    ")\n",
-    "\n",
-    "scrub.spacy_entities()\n",
-    "scrub.google_phone_numbers(region=\"GB\")\n",
+    "scrub = IDScrub([\"John Smith@mail.com\"])\n",
     "\n",
-    "# Remove specific regex pattern(s). This can also be passed to all().\n",
-    "scrub.custom_regex(\n",
-    "    custom_regex_patterns=[r\"Lapland\", r\"ACHILLES\"], custom_replacement_texts=[\"[UNIVERSITY]\", \"[REDACTED]\"]\n",
+    "scrubbed_texts = scrub.scrub(\n",
+    "    pipeline=[\n",
+    "        {\"method\": \"presidio_entities\", \"entity_types\": [\"PERSON\"], \"priority\": 0.1},\n",
+    "        {\"method\": \"presidio_entities\", \"entity_types\": [\"EMAIL_ADDRESS\"], \"priority\": 0.2},\n",
+    "    ]\n",
     ")\n",
     "\n",
-    "scrubbed_texts = scrub.all_regex()\n",
+    "print(scrub.get_all_identified_data())\n",
     "\n",
     "print(scrubbed_texts)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>text_id</th>\n",
-       "      <th>person</th>\n",
-       "      <th>org</th>\n",
-       "      <th>custom_regex_1</th>\n",
-       "      <th>uk_phone_number</th>\n",
-       "      <th>uk_address</th>\n",
-       "      <th>uk_postcode</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[Department for Business and Trade]</td>\n",
-       "      <td>[Lapland]</td>\n",
-       "      <td>[+441111111111]</td>\n",
-       "      <td>[15 Elf Road]</td>\n",
-       "      <td>[AA11 1AA]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   text_id                                     person  \\\n",
-       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]   \n",
-       "1        2                                       None   \n",
-       "\n",
-       "                                   org custom_regex_1  uk_phone_number  \\\n",
-       "0                                 None           None             None   \n",
-       "1  [Department for Business and Trade]      [Lapland]  [+441111111111]   \n",
-       "\n",
-       "      uk_address uk_postcode  \n",
-       "0           None        None  \n",
-       "1  [15 Elf Road]  [AA11 1AA]  "
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "scrub.get_scrubbed_data()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### `idscrub` example - using Presidio\n",
-    "We can also leverage the power of [Presidio](https://microsoft.github.io/presidio/) and use their entity recognition methods"
+    "### `idscrub` example - scrubbing custom regex patterns"
    ]
   },
   {
@@ -454,29 +441,41 @@
      "output_type": "stream",
      "text": [
       "INFO: Texts loaded.\n",
-      "INFO: Scrubbing Presidio entities `PERSON, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, IBAN_CODE` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 2/2 [00:00<00:00, 24.36it/s]\n",
-      "INFO: 1 iban_code scrubbed.\n",
-      "INFO: 3 person scrubbed.\n"
+      "INFO: Scrubbing using custom_regex with parameters {'patterns': {'university': {'pattern': 'Lapland', 'replacement': '[UNIVERSITY]', 'priority': 1.0}}}...\n"
      ]
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['Our names are [PERSON], [PERSON], and [PERSON].', 'My IBAN code is [IBAN_CODE]']\n"
-     ]
+     "data": {
+      "text/plain": [
+       "['Our names are Hamish McDonald, L. Salah, and Elena Suárez.',\n",
+       " 'My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, [UNIVERSITY]']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
     "from idscrub import IDScrub\n",
     "\n",
     "scrub = IDScrub(\n",
-    "    [\"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\", \"My IBAN code is GB91BKEN10000041610008\"]\n",
+    "    [\n",
+    "        \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
+    "        \"My number is +441111111111 and I work at the Department for Business and Trade, 15 Elf Road, AA11 1AA, Lapland\",\n",
+    "    ]\n",
     ")\n",
-    "scrubbed_texts = scrub.presidio_entities()\n",
     "\n",
-    "print(scrubbed_texts)"
+    "pipeline = [\n",
+    "    {\n",
+    "        \"method\": \"custom_regex\",\n",
+    "        \"patterns\": {\"university\": {\"pattern\": r\"Lapland\", \"replacement\": \"[UNIVERSITY]\", \"priority\": 1.0}},\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "scrubbed_texts = scrub.scrub(pipeline=pipeline)\n",
+    "\n",
+    "scrubbed_texts"
    ]
   },
   {
@@ -506,35 +505,22 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>text_id</th>\n",
-       "      <th>person</th>\n",
-       "      <th>iban_code</th>\n",
+       "      <th>university</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
-       "      <td>None</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
        "      <td>2</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[GB91BKEN10000041610008]</td>\n",
+       "      <td>[Lapland]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   text_id                                     person  \\\n",
-       "0        1  [Hamish McDonald, L. Salah, Elena Suárez]   \n",
-       "1        2                                       None   \n",
-       "\n",
-       "                  iban_code  \n",
-       "0                      None  \n",
-       "1  [GB91BKEN10000041610008]  "
+       "   text_id university\n",
+       "0        2  [Lapland]"
       ]
      },
      "execution_count": 8,
@@ -693,7 +679,7 @@
     "        \"Beware; for I am fearless, and therefore powerful.\",\n",
     "    ],\n",
     "    \"Fake book\": [\n",
-    "        \"The letter to freddie.mercury@queen.com was stamped with SW1A 2AA. His IBAN was GB91BKEN10000041610008.\",\n",
+    "        \"The letter to freddie.mercury@queen.com was stamped with SW1A 2AA. He was British.\",\n",
     "        \"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.\",\n",
     "        \"The dossier marked confidential came from serena.williams@tennis.com, with SW19 5AE etched in bold across the envelope.\",\n",
     "        \"A message arrived just as the Downing Street clock struck midnight.\",\n",
@@ -716,78 +702,44 @@
      "text": [
       "  0%|          | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
       "INFO: Scrubbing column `Pride and Prejudice`...\n",
-      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 23.73it/s]\n",
-      "INFO: 4 person scrubbed.\n",
-      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 77.84it/s]\n",
-      "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
-      "INFO: 0 phone_number scrubbed.\n",
-      "INFO: Scrubbing email addresses using regex...\n",
-      "INFO: 0 email_address scrubbed.\n",
-      "INFO: Scrubbing @user handles using regex...\n",
-      "INFO: 0 handle scrubbed.\n",
-      "INFO: Scrubbing IP addresses using regex...\n",
-      "INFO: 0 ip_address scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 0 uk_phone_number scrubbed.\n",
-      "INFO: Scrubbing addresses using regex...\n",
-      "INFO: 0 uk_address scrubbed.\n",
-      "INFO: Scrubbing postcodes using regex...\n",
-      "INFO: 0 uk_postcode scrubbed.\n",
-      "INFO: Scrubbing titles using regex...\n",
-      "INFO: 2 title scrubbed.\n",
-      " 33%|███▎      | 1/3 [00:02<00:05,  2.60s/it]INFO: Texts loaded.\n",
+      "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON', 'NRP']}...\n",
+      "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
+      "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
+      "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
+      "INFO: Scrubbing using email_addresses with default parameters...\n",
+      "INFO: Scrubbing using handles with default parameters...\n",
+      "INFO: Scrubbing using ip_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
+      "INFO: Scrubbing using uk_postcodes with default parameters...\n",
+      "INFO: Scrubbing using urls with default parameters...\n",
+      " 33%|███▎      | 1/3 [00:02<00:04,  2.44s/it]INFO: Texts loaded.\n",
       "INFO: Scrubbing column `The Adventures of Sherlock Holmes`...\n",
-      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 24.22it/s]\n",
-      "INFO: 2 person scrubbed.\n",
-      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 84.78it/s]\n",
-      "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
-      "INFO: 0 phone_number scrubbed.\n",
-      "INFO: Scrubbing email addresses using regex...\n",
-      "INFO: 0 email_address scrubbed.\n",
-      "INFO: Scrubbing @user handles using regex...\n",
-      "INFO: 0 handle scrubbed.\n",
-      "INFO: Scrubbing IP addresses using regex...\n",
-      "INFO: 0 ip_address scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 0 uk_phone_number scrubbed.\n",
-      "INFO: Scrubbing addresses using regex...\n",
-      "INFO: 0 uk_address scrubbed.\n",
-      "INFO: Scrubbing postcodes using regex...\n",
-      "INFO: 0 uk_postcode scrubbed.\n",
-      "INFO: Scrubbing titles using regex...\n",
-      "INFO: 0 title scrubbed.\n",
-      " 67%|██████▋   | 2/3 [00:05<00:02,  2.49s/it]INFO: Texts loaded.\n",
+      "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON', 'NRP']}...\n",
+      "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
+      "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
+      "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
+      "INFO: Scrubbing using email_addresses with default parameters...\n",
+      "INFO: Scrubbing using handles with default parameters...\n",
+      "INFO: Scrubbing using ip_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
+      "INFO: Scrubbing using uk_postcodes with default parameters...\n",
+      "INFO: Scrubbing using urls with default parameters...\n",
+      " 67%|██████▋   | 2/3 [00:04<00:02,  2.44s/it]INFO: Texts loaded.\n",
       "INFO: Scrubbing column `Fake book`...\n",
-      "INFO: Scrubbing Presidio entities `PERSON, EMAIL_ADDRESS, UK_NINO, UK_NHS, CREDIT_CARD, CRYPTO, MEDICAL_LICENSE, URL, SWIFT_CODE, IBAN_CODE, LOCATION, NRP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 13.41it/s]\n",
-      "INFO: 1 iban_code scrubbed.\n",
-      "INFO: 5 url scrubbed.\n",
-      "INFO: 2 person scrubbed.\n",
-      "INFO: 3 email_address scrubbed.\n",
-      "INFO: Scrubbing SpaCy entities `PERSON, ORG, NORP` using SpaCy model `en_core_web_trf`...\n",
-      "100%|██████████| 5/5 [00:00<00:00, 64.57it/s]\n",
-      "INFO: 1 org scrubbed.\n",
-      "INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
-      "INFO: 0 phone_number scrubbed.\n",
-      "INFO: Scrubbing email addresses using regex...\n",
-      "INFO: 3 email_address scrubbed.\n",
-      "INFO: Scrubbing @user handles using regex...\n",
-      "INFO: 0 handle scrubbed.\n",
-      "INFO: Scrubbing IP addresses using regex...\n",
-      "INFO: 0 ip_address scrubbed.\n",
-      "INFO: Scrubbing phone numbers using regex...\n",
-      "INFO: 0 uk_phone_number scrubbed.\n",
-      "INFO: Scrubbing addresses using regex...\n",
-      "INFO: 0 uk_address scrubbed.\n",
-      "INFO: Scrubbing postcodes using regex...\n",
-      "INFO: 4 uk_postcode scrubbed.\n",
-      "INFO: Scrubbing titles using regex...\n",
-      "INFO: 0 title scrubbed.\n",
-      "100%|██████████| 3/3 [00:07<00:00,  2.53s/it]\n"
+      "INFO: Scrubbing using presidio_entities with parameters {'entity_types': ['PERSON', 'NRP']}...\n",
+      "INFO: Scrubbing using spacy_entities with parameters {'entity_types': ['ORG']}...\n",
+      "INFO: Scrubbing using google_phone_numbers with parameters {'region': 'GB'}...\n",
+      "INFO: Scrubbing using titles with parameters {'strict': False}...\n",
+      "INFO: Scrubbing using email_addresses with default parameters...\n",
+      "INFO: Scrubbing using handles with default parameters...\n",
+      "INFO: Scrubbing using ip_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_addresses with default parameters...\n",
+      "INFO: Scrubbing using uk_phone_numbers with default parameters...\n",
+      "INFO: Scrubbing using uk_postcodes with default parameters...\n",
+      "INFO: Scrubbing using urls with default parameters...\n",
+      "100%|██████████| 3/3 [00:07<00:00,  2.51s/it]\n"
      ]
     },
     {
@@ -901,7 +853,21 @@
    "source": [
     "from idscrub import IDScrub\n",
     "\n",
-    "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], scrub_methods=[\"all\"])\n",
+    "pipeline = [\n",
+    "    {\"method\": \"presidio_entities\", \"entity_types\": [\"PERSON\", \"NRP\"]},\n",
+    "    {\"method\": \"spacy_entities\", \"entity_types\": [\"ORG\"]},\n",
+    "    {\"method\": \"google_phone_numbers\", \"region\": \"GB\"},\n",
+    "    {\"method\": \"titles\", \"strict\": False},\n",
+    "    {\"method\": \"email_addresses\"},\n",
+    "    {\"method\": \"handles\"},\n",
+    "    {\"method\": \"ip_addresses\"},\n",
+    "    {\"method\": \"uk_addresses\"},\n",
+    "    {\"method\": \"uk_phone_numbers\"},\n",
+    "    {\"method\": \"uk_postcodes\"},\n",
+    "    {\"method\": \"urls\"},\n",
+    "]\n",
+    "\n",
+    "scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], pipeline=pipeline)\n",
     "\n",
     "scrubbed_df"
    ]
@@ -936,11 +902,10 @@
        "      <th>column</th>\n",
        "      <th>person</th>\n",
        "      <th>title</th>\n",
+       "      <th>nrp</th>\n",
        "      <th>email_address</th>\n",
-       "      <th>iban_code</th>\n",
-       "      <th>url</th>\n",
-       "      <th>org</th>\n",
        "      <th>uk_postcode</th>\n",
+       "      <th>org</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -954,7 +919,6 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -966,7 +930,6 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -978,7 +941,6 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -990,7 +952,6 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -1002,7 +963,6 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
@@ -1010,11 +970,10 @@
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>[British]</td>\n",
        "      <td>[freddie.mercury@queen.com]</td>\n",
-       "      <td>[GB91BKEN10000041610008]</td>\n",
-       "      <td>[freddie.me, queen.com]</td>\n",
-       "      <td>None</td>\n",
        "      <td>[SW1A 2AA]</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
@@ -1024,9 +983,8 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
        "      <td>[SW1A 2WH]</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
@@ -1034,34 +992,31 @@
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>[serena.williams@tennis.com]</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[tennis.com]</td>\n",
        "      <td>None</td>\n",
+       "      <td>[serena.williams@tennis.com]</td>\n",
        "      <td>[SW19 5AE]</td>\n",
+       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>E</td>\n",
+       "      <td>D</td>\n",
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>[otis.redding@dockofthebay.org]</td>\n",
        "      <td>None</td>\n",
-       "      <td>[otis.red, dockofthebay.org]</td>\n",
        "      <td>None</td>\n",
-       "      <td>[EH8 8DX]</td>\n",
+       "      <td>None</td>\n",
+       "      <td>[Downing Street]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>D</td>\n",
+       "      <td>E</td>\n",
        "      <td>Fake book</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[Downing Street]</td>\n",
+       "      <td>[otis.redding@dockofthebay.org]</td>\n",
+       "      <td>[EH8 8DX]</td>\n",
        "      <td>None</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -1078,32 +1033,20 @@
        "5  A                          Fake book                        None  None   \n",
        "6  B                          Fake book  [Mick Jagger, David Bowie]  None   \n",
        "7  C                          Fake book                        None  None   \n",
-       "8  E                          Fake book                        None  None   \n",
-       "9  D                          Fake book                        None  None   \n",
-       "\n",
-       "                     email_address                 iban_code  \\\n",
-       "0                             None                      None   \n",
-       "1                             None                      None   \n",
-       "2                             None                      None   \n",
-       "3                             None                      None   \n",
-       "4                             None                      None   \n",
-       "5      [freddie.mercury@queen.com]  [GB91BKEN10000041610008]   \n",
-       "6                             None                      None   \n",
-       "7     [serena.williams@tennis.com]                      None   \n",
-       "8  [otis.redding@dockofthebay.org]                      None   \n",
-       "9                             None                      None   \n",
+       "8  D                          Fake book                        None  None   \n",
+       "9  E                          Fake book                        None  None   \n",
        "\n",
-       "                            url               org uk_postcode  \n",
-       "0                          None              None        None  \n",
-       "1                          None              None        None  \n",
-       "2                          None              None        None  \n",
-       "3                          None              None        None  \n",
-       "4                          None              None        None  \n",
-       "5       [freddie.me, queen.com]              None  [SW1A 2AA]  \n",
-       "6                          None              None  [SW1A 2WH]  \n",
-       "7                  [tennis.com]              None  [SW19 5AE]  \n",
-       "8  [otis.red, dockofthebay.org]              None   [EH8 8DX]  \n",
-       "9                          None  [Downing Street]        None  "
+       "         nrp                    email_address uk_postcode               org  \n",
+       "0       None                             None        None              None  \n",
+       "1       None                             None        None              None  \n",
+       "2       None                             None        None              None  \n",
+       "3       None                             None        None              None  \n",
+       "4       None                             None        None              None  \n",
+       "5  [British]      [freddie.mercury@queen.com]  [SW1A 2AA]              None  \n",
+       "6       None                             None  [SW1A 2WH]              None  \n",
+       "7       None     [serena.williams@tennis.com]  [SW19 5AE]              None  \n",
+       "8       None                             None        None  [Downing Street]  \n",
+       "9       None  [otis.redding@dockofthebay.org]   [EH8 8DX]              None  "
       ]
      },
      "execution_count": 11,

idscrub 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

idscrub 1.1.2py3-none-any.whl → 2.0.1py3-none-any.whl