PyPI - idscrub - Versions diffs - 0.1.0__py3-none-any.whl - Mend

idscrub 0.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20)

idscrub/__init__.py +1 -0
idscrub/locations.py +10 -0
idscrub/scrub.py +947 -0
idscrub-0.1.0.dist-info/METADATA +147 -0
idscrub-0.1.0.dist-info/RECORD +20 -0
idscrub-0.1.0.dist-info/WHEEL +5 -0
idscrub-0.1.0.dist-info/licenses/LICENSE +21 -0
idscrub-0.1.0.dist-info/top_level.txt +3 -0
notebooks/basic_usage.ipynb +1002 -0
test/conftest.py +12 -0
test/test_all.py +39 -0
test/test_chain.py +54 -0
test/test_dataframe.py +51 -0
test/test_huggingface.py +25 -0
test/test_id.py +24 -0
test/test_log.py +17 -0
test/test_persidio.py +44 -0
test/test_phonenumbers.py +13 -0
test/test_regex.py +123 -0
test/test_spacy.py +28 -0

test/conftest.py ADDED Viewed

@@ -0,0 +1,12 @@
+import pytest
+from idscrub import IDScrub
+@pytest.fixture
+def scrub_object():
+    return IDScrub(
+        [
+            "Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
+            "My number is +441111111111 and I live at AA11 1AA.",
+        ]
+    )

test/test_all.py ADDED Viewed

@@ -0,0 +1,39 @@
+import pandas as pd
+from idscrub import IDScrub
+from pandas.testing import assert_frame_equal
+# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
+def test_all(scrub_object):
+    scrubbed = scrub_object.all()
+    assert scrubbed == [
+        "Our names are [PERSON], [PERSON], and [PERSON].",
+        "My number is [PHONENO] and I live at [POSTCODE].",
+    ]
+def test_text_id():
+    scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
+    scrub.all()
+    df = scrub.get_scrubbed_data()
+    assert df["text_id"].max() == 10
+    assert len(df["text_id"]) == 10
+def test_get_scrubbed_data(scrub_object):
+    scrub_object.all()
+    df = scrub_object.get_scrubbed_data()
+    expected_df = pd.DataFrame(
+        {
+            "text_id": {0: 1, 1: 2},
+            "scrubbed_presidio_person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
+            "scrubbed_uk_phone_numbers": {0: None, 1: ["+441111111111"]},
+            "scrubbed_uk_postcodes": {0: None, 1: ["AA11 1AA"]},
+        }
+    )
+    assert_frame_equal(df, expected_df)

test/test_chain.py ADDED Viewed

@@ -0,0 +1,54 @@
+import pandas as pd
+from pandas.testing import assert_frame_equal
+# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
+def test_chain(scrub_object):
+    scrub_object.uk_phone_numbers()
+    scrub_object.uk_postcodes()
+    scrubbed = scrub_object.spacy_persons()
+    assert scrubbed == [
+        "Our names are [PERSON], [PERSON], and [PERSON].",
+        "My number is [PHONENO] and I live at [POSTCODE].",
+    ]
+def test_chain_order(scrub_object):
+    scrubbed = scrub_object.uk_phone_numbers()
+    assert scrubbed == [
+        "Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
+        "My number is [PHONENO] and I live at AA11 1AA.",
+    ]
+    assert scrub_object.get_scrubbed_data()["scrubbed_uk_phone_numbers"].to_list() == [["+441111111111"]]
+    assert "scrubbed_uk_postcodes" not in scrub_object.get_scrubbed_data().columns
+    scrubbed = scrub_object.uk_postcodes()
+    assert scrubbed == [
+        "Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
+        "My number is [PHONENO] and I live at [POSTCODE].",
+    ]
+    assert scrub_object.get_scrubbed_data()["scrubbed_uk_phone_numbers"].to_list() == [["+441111111111"]]
+    assert scrub_object.get_scrubbed_data()["scrubbed_uk_postcodes"].to_list() == [["AA11 1AA"]]
+def test_get_scrubbed_data_chain(scrub_object):
+    scrub_object.uk_phone_numbers()
+    scrub_object.uk_postcodes()
+    scrub_object.spacy_persons()
+    df = scrub_object.get_scrubbed_data()
+    expected_df = pd.DataFrame(
+        {
+            "text_id": {0: 1, 1: 2},
+            "scrubbed_uk_phone_numbers": {0: None, 1: ["+441111111111"]},
+            "scrubbed_uk_postcodes": {0: None, 1: ["AA11 1AA"]},
+            "scrubbed_spacy_person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
+        }
+    )
+    assert_frame_equal(df, expected_df)

test/test_dataframe.py ADDED Viewed

@@ -0,0 +1,51 @@
+import pandas as pd
+from idscrub import IDScrub
+from pandas.testing import assert_frame_equal
+# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
+def test_dataframe_outputs():
+    df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
+                "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
+                "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
+            ],
+        }
+    )
+    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["all"])
+    expected_scrubbed_df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "[TITLE]. [PERSON] walked off; and [PERSON] remained with no very cordial feelings toward him.",
+                "[TITLE]. [PERSON] was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to [EMAIL_ADDRESS] was stamped with [POSTCODE].",
+                "She forwarded the memo from [PERSON] and [PERSON] to her chief of staff, noting the postcode [POSTCODE].",
+            ],
+        }
+    )
+    expected_scrubbed_data = pd.DataFrame(
+        {
+            "ID": [1, 2, 1, 2],
+            "column": ["Pride and Prejudice", "Pride and Prejudice", "Fake book", "Fake book"],
+            "scrubbed_presidio_person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
+            "scrubbed_titles": [["Mr"], ["Mr"], None, None],
+            "scrubbed_presidio_email_address": [None, None, ["freddie-mercury@queen.com"], None],
+            "scrubbed_presidio_url": [None, None, ["queen.com"], None],
+            "scrubbed_uk_postcodes": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
+        }
+    )
+    assert_frame_equal(scrubbed_df, expected_scrubbed_df)
+    assert_frame_equal(scrubbed_data, expected_scrubbed_data)

test/test_huggingface.py ADDED Viewed

@@ -0,0 +1,25 @@
+import pandas as pd
+import pytest
+from idscrub import IDScrub
+from pandas.testing import assert_frame_equal
+def test_huggingface():
+    scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
+    scrubbed = scrub.huggingface_persons()
+    assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
+def test_huggingface_error():
+    scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
+    with pytest.raises(OSError):
+        scrub.huggingface_persons(hf_model_path="not_a_path")
+def test_huggingface_empty():
+    scrub = IDScrub([" ", "John Smith", ""])
+    scrubbed = scrub.huggingface_persons()
+    assert scrubbed == [" ", "[PERSON]", ""]
+    assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_hf_person": [["John Smith"]]}))

test/test_id.py ADDED Viewed

@@ -0,0 +1,24 @@
+from idscrub import IDScrub
+def test_id_ints():
+    scrub = IDScrub(texts=["clement_attlee@gmail.com"] * 10, text_ids=range(100, 110), text_id_name="PM")
+    scrub.email_addresses()
+    assert scrub.get_scrubbed_data()["PM"].min() == 100
+    assert scrub.get_scrubbed_data()["PM"].max() == 109
+    assert scrub.get_scrubbed_data()["PM"].to_list() == [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
+def test_id_strs():
+    scrub = IDScrub(texts=["clement_attlee@gmail.com"] * 2, text_ids=["random", "minister"], text_id_name="PM")
+    scrub.email_addresses()
+    assert scrub.get_scrubbed_data()["PM"][0] == "random"
+    assert scrub.get_scrubbed_data()["PM"][1] == "minister"
+def test_multiple():
+    scrub = IDScrub(texts=["clement_attlee@gmail.com", "SW1A 2AA"] * 10, text_ids=range(100, 120), text_id_name="PM")
+    scrub.email_addresses()
+    scrub.uk_postcodes()
+    assert scrub.get_scrubbed_data()["PM"].min() == 100
+    assert scrub.get_scrubbed_data()["PM"].max() == 119

test/test_log.py ADDED Viewed

@@ -0,0 +1,17 @@
+from idscrub import IDScrub
+def test_log_message():
+    scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
+    scrub.titles()
+    count = scrub.log_message("scrubbed_titles")
+    assert count == 3
+def test_log_message_custom_regex():
+    scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
+    scrub.custom_regex([r"Strangelove", r"Oppenheimer"], ["[DR]", "[PROFESSOR]"])
+    count_1 = scrub.log_message("scrubbed_custom_regex_1")
+    count_2 = scrub.log_message("scrubbed_custom_regex_2")
+    assert count_1 == 2
+    assert count_2 == 1

test/test_persidio.py ADDED Viewed

@@ -0,0 +1,44 @@
+import pandas as pd
+from idscrub import IDScrub
+from pandas.testing import assert_frame_equal
+# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
+def test_persidio():
+    scrub = IDScrub(
+        ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
+    )
+    scrubbed_texts = scrub.presidio(entities_to_scrub=["PERSON", "IBAN_CODE"])
+    assert scrubbed_texts == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
+def test_persidio_map():
+    scrub = IDScrub(
+        ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
+    )
+    scrubbed_texts = scrub.presidio(
+        entities_to_scrub=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
+    )
+    assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
+def test_persidio_get_data():
+    scrub = IDScrub(
+        ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
+    )
+    scrub.presidio(entities_to_scrub=["PERSON", "IBAN_CODE"])
+    df = scrub.get_scrubbed_data()
+    expected_df = pd.DataFrame(
+        {
+            "text_id": {0: 1, 1: 2},
+            "scrubbed_presidio_person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
+            "scrubbed_presidio_iban_code": {0: None, 1: ["GB91BKEN10000041610008"]},
+        }
+    )
+    assert_frame_equal(df, expected_df)

test/test_phonenumbers.py ADDED Viewed

@@ -0,0 +1,13 @@
+from idscrub import IDScrub
+def test_google_phone_numbers_gb():
+    scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01475 123456."])
+    scrubbed = scrub.google_phone_numbers(region="GB")
+    assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
+def test_google_phone_numbers_us():
+    scrub = IDScrub(texts=["My US phone number is +1-718-222-2222! My old phone number is 12124567890."])
+    scrubbed = scrub.google_phone_numbers(region="US")
+    assert scrubbed == ["My US phone number is [PHONENO]! My old phone number is [PHONENO]."]

test/test_regex.py ADDED Viewed

@@ -0,0 +1,123 @@
+import re
+from idscrub import IDScrub
+def test_email_addresses():
+    scrub = IDScrub(
+        texts=["Send me an email at jim@gmail.com or at marie-9999@randomemail.co.uk or at hello_world@john-smith.com."]
+    )
+    scrubbed = scrub.email_addresses()
+    assert scrubbed == ["Send me an email at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS]."]
+def test_ip_addresses():
+    scrub = IDScrub(texts=["This has been sent to 8.8.8.8 and requested by 192.0.2.1."])
+    scrubbed = scrub.ip_addresses()
+    assert scrubbed == ["This has been sent to [IPADDRESS] and requested by [IPADDRESS]."]
+def test_uk_postcodes():
+    scrub = IDScrub(texts=["I live at A11 1AA. My friend lives at KA308JB. The Prime Minister lives at SW1A  2AA."])
+    scrubbed = scrub.uk_postcodes()
+    assert scrubbed == ["I live at [POSTCODE]. My friend lives at [POSTCODE]. The Prime Minister lives at [POSTCODE]."]
+def test_titles_not_strict():
+    scrub = IDScrub(
+        texts=[
+            "Hello Dr. Smith! I am Mrs Patel",
+            "I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
+        ]
+    )
+    scrubbed = scrub.titles()
+    assert scrubbed == [
+        "Hello [TITLE]. Smith! I am [TITLE] Patel",
+        "I am here on behalf of [TITLE] Austen, General Eisenhower, and [TITLE] Jack Sparrow.",
+    ]
+def test_titles_strict():
+    scrub = IDScrub(
+        texts=[
+            "Hello Dr. Smith! I am Mrs Patel",
+            "I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
+        ]
+    )
+    scrubbed = scrub.titles(strict=True)
+    assert scrubbed == [
+        "Hello [TITLE]. Smith! I am [TITLE] Patel",
+        "I am here on behalf of [TITLE] Austen, [TITLE] Eisenhower, and [TITLE] Jack Sparrow.",
+    ]
+def test_uk_phone_numbers():
+    scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01111 123456."])
+    scrubbed = scrub.uk_phone_numbers()
+    assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
+def test_handles():
+    scrub = IDScrub(texts=["Our usernames are @HenrikLarsson, @Jimmy_Johnstone, @Nakamura-67 and @Aidan_McGeady_46."])
+    scrubbed = scrub.handles()
+    assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
+def test_claimants():
+    scrub = IDScrub(
+        texts=[
+            "This is legal text. Claimant: John Smith Respondents: Jill Hill.",
+            "Claimant: J Smith Respondents: Jill Hill. J Smith is the respondent.",
+        ]
+    )
+    scrubbed = scrub.claimants()
+    assert scrubbed == [
+        "This is legal text. Claimant: [CLAIMANT] Respondents: Jill Hill.",
+        "Claimant: [CLAIMANT] Respondents: Jill Hill. [CLAIMANT] is the respondent.",
+    ]
+def test_custom_regex():
+    scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
+    scrubbed = scrub.custom_regex(custom_regex_patterns=[r"times"])
+    assert scrubbed == ["It was the best of [REDACTED], it was the worst of [REDACTED]"]
+    scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
+    scrubbed = scrub.custom_regex(
+        custom_regex_patterns=[r"times", "worst"], custom_replacement_texts=["[DICKENS]", "[WORST]"]
+    )
+    assert scrubbed == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
+def test_scrub_and_collect():
+    scrub = IDScrub()
+    text = "Hello Muhammad and Jack."
+    pattern = r"\bMuhammad|Jack\b"
+    replacement = "[NAME]"
+    removed_label = "scrubbed_custom_regex"
+    i = 1
+    def replacer(match):
+        return scrub.scrub_and_collect(match, text, replacement, i, removed_label)
+    scrubbed = re.sub(pattern, replacer, text)
+    assert scrubbed == "Hello [NAME] and [NAME]."
+    assert scrub.scrubbed_data == [
+        {"text_id": 1, "scrubbed_custom_regex": "Muhammad"},
+        {"text_id": 1, "scrubbed_custom_regex": "Jack"},
+    ]
+def test_remove_regex():
+    scrub = IDScrub(texts=["Hi! My name is Clement Atlee!", "I am Harold Wilson."])
+    removed_label = "scrubbed_regex_names"
+    pattern = r"Clement Atlee|Harold Wilson"
+    replacement_text = "[PM]"
+    scrubbed = scrub.scrub_regex(pattern, replacement_text, removed_label)
+    assert scrubbed == ["Hi! My name is [PM]!", "I am [PM]."]
+    assert scrub.scrubbed_data == [
+        {"text_id": 1, "scrubbed_regex_names": "Clement Atlee"},
+        {"text_id": 2, "scrubbed_regex_names": "Harold Wilson"},
+    ]

test/test_spacy.py ADDED Viewed

@@ -0,0 +1,28 @@
+import pandas as pd
+import pytest
+from idscrub import IDScrub
+from pandas.testing import assert_frame_equal
+# Note: This test will fail if the kernel has not been restarted since the SpaCy model was downloaded.
+def test_spacy():
+    scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
+    scrubbed = scrub.spacy_persons(model_name="en_core_web_trf")
+    assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
+def test_spacy_error():
+    scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
+    with pytest.raises(ValueError):
+        scrub.spacy_persons(model_name="not_a_model")
+def test_spacy_empty():
+    scrub = IDScrub([" ", "John Smith", ""])
+    scrubbed = scrub.spacy_persons()
+    assert scrubbed == [" ", "[PERSON]", ""]
+    assert_frame_equal(
+        scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_spacy_person": [["John Smith"]]})
+    )