PyPI - idscrub - Versions diffs - 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

idscrub 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

idscrub/scrub.py +73 -88
{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/METADATA +2 -2
idscrub-1.0.0.dist-info/RECORD +22 -0
notebooks/basic_usage.ipynb +153 -161
test/conftest.py +10 -0
test/test_all.py +3 -3
test/test_chain.py +7 -7
test/test_dataframe.py +114 -5
test/test_huggingface.py +1 -1
test/test_label.py +17 -0
test/test_log.py +3 -3
test/test_persidio.py +2 -2
test/test_regex.py +8 -8
test/test_scrub.py +4 -4
test/test_spacy.py +1 -3
idscrub-0.2.2.dist-info/RECORD +0 -21
{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/WHEEL +0 -0
{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/licenses/LICENSE +0 -0
{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/top_level.txt +0 -0

test/test_dataframe.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 from idscrub import IDScrub
 from pandas.testing import assert_frame_equal
@@ -39,13 +40,121 @@ def test_dataframe_outputs():
         {
             "ID": [1, 2, 1, 2],
             "column": ["Pride and Prejudice", "Pride and Prejudice", "Fake book", "Fake book"],
-            "scrubbed_presidio_person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
-            "scrubbed_titles": [["Mr"], ["Mr"], None, None],
-            "scrubbed_presidio_email_address": [None, None, ["freddie-mercury@queen.com"], None],
-            "scrubbed_presidio_url": [None, None, ["queen.com"], None],
-            "scrubbed_uk_postcodes": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
+            "person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
+            "title": [["Mr"], ["Mr"], None, None],
+            "email_address": [None, None, ["freddie-mercury@queen.com"], None],
+            "url": [None, None, ["queen.com"], None],
+            "uk_postcode": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
         }
     )
     assert_frame_equal(scrubbed_df, expected_scrubbed_df)
     assert_frame_equal(scrubbed_data, expected_scrubbed_data)
+def test_dataframe_exclude():
+    df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
+                "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
+                "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
+            ],
+        }
+    )
+    scrubbed_df, scrubbed_data = IDScrub.dataframe(
+        df=df, id_col="ID", exclude_cols=["Fake book"], scrub_methods=["all"]
+    )
+    expected_scrubbed_df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "[TITLE]. [PERSON] walked off; and [PERSON] remained with no very cordial feelings toward him.",
+                "[TITLE]. [PERSON] was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
+                "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
+            ],
+        }
+    )
+    expected_scrubbed_data = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "column": ["Pride and Prejudice", "Pride and Prejudice"],
+            "person": [["Darcy", "Elizabeth"], ["Bennet"]],
+            "title": [["Mr"], ["Mr"]],
+        }
+    )
+    assert_frame_equal(scrubbed_df, expected_scrubbed_df)
+    assert_frame_equal(scrubbed_data, expected_scrubbed_data)
+def test_dataframe_scrub_methods():
+    df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
+                "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
+                "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
+            ],
+        }
+    )
+    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["titles"])
+    expected_scrubbed_df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "[TITLE]. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
+                "[TITLE]. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
+                "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
+            ],
+        }
+    )
+    expected_scrubbed_data = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "column": ["Pride and Prejudice", "Pride and Prejudice"],
+            "title": [["Mr"], ["Mr"]],
+        }
+    )
+    assert_frame_equal(scrubbed_df, expected_scrubbed_df)
+    assert_frame_equal(scrubbed_data, expected_scrubbed_data)
+def test_dataframe_id_col():
+    df = pd.DataFrame(
+        {
+            "ID": [1, 2],
+            "Pride and Prejudice": [
+                "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
+                "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
+            ],
+            "Fake book": [
+                "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
+                "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
+            ],
+        }
+    )
+    with pytest.raises(AssertionError):
+        IDScrub.dataframe(df=df, id_col="ID_not_present")

test/test_huggingface.py CHANGED Viewed

@@ -22,4 +22,4 @@ def test_huggingface_empty():
     scrubbed = scrub.huggingface_persons()
     assert scrubbed == [" ", "[PERSON]", ""]
-    assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_hf_person": [["John Smith"]]}))
+    assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))

test/test_label.py ADDED Viewed

@@ -0,0 +1,17 @@
+def test_label(scrub_object_all):
+    for i, scrub_method in enumerate(
+        ["spacy_persons", "uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
+    ):
+        method = getattr(scrub_object_all, scrub_method)
+        method(label="test")
+    df = scrub_object_all.get_scrubbed_data()
+    assert df.columns.to_list() == ["text_id", "test"]
+def test_regex_label(scrub_object_all):
+    scrub_object_all.custom_regex(custom_regex_patterns=[r"number", r"live"], labels=["regex_number", "regex_live"])
+    df = scrub_object_all.get_scrubbed_data()
+    assert df.columns.to_list() == ["text_id", "regex_number", "regex_live"]

test/test_log.py CHANGED Viewed

@@ -4,14 +4,14 @@ from idscrub import IDScrub
 def test_log_message():
     scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
     scrub.titles()
-    count = scrub.log_message("scrubbed_titles")
+    count = scrub.log_message("title")
     assert count == 3
 def test_log_message_custom_regex():
     scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
     scrub.custom_regex([r"Strangelove", r"Oppenheimer"], ["[DR]", "[PROFESSOR]"])
-    count_1 = scrub.log_message("scrubbed_custom_regex_1")
-    count_2 = scrub.log_message("scrubbed_custom_regex_2")
+    count_1 = scrub.log_message("custom_regex_1")
+    count_2 = scrub.log_message("custom_regex_2")
     assert count_1 == 2
     assert count_2 == 1

test/test_persidio.py CHANGED Viewed

@@ -36,8 +36,8 @@ def test_persidio_get_data():
     expected_df = pd.DataFrame(
         {
             "text_id": {0: 1, 1: 2},
-            "scrubbed_presidio_person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
-            "scrubbed_presidio_iban_code": {0: None, 1: ["GB91BKEN10000041610008"]},
+            "person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
+            "iban_code": {0: None, 1: ["GB91BKEN10000041610008"]},
         }
     )

test/test_regex.py CHANGED Viewed

@@ -94,30 +94,30 @@ def test_scrub_and_collect():
     text = "Hello Muhammad and Jack."
     pattern = r"\bMuhammad|Jack\b"
     replacement = "[NAME]"
-    removed_label = "scrubbed_custom_regex"
+    label = "custom_regex"
     i = 1
     def replacer(match):
-        return scrub.scrub_and_collect(match, text, replacement, i, removed_label)
+        return scrub.scrub_and_collect(match, text, replacement, i, label)
     scrubbed = re.sub(pattern, replacer, text)
     assert scrubbed == "Hello [NAME] and [NAME]."
     assert scrub.scrubbed_data == [
-        {"text_id": 1, "scrubbed_custom_regex": "Muhammad"},
-        {"text_id": 1, "scrubbed_custom_regex": "Jack"},
+        {"text_id": 1, "custom_regex": "Muhammad"},
+        {"text_id": 1, "custom_regex": "Jack"},
     ]
 def test_remove_regex():
     scrub = IDScrub(texts=["Hi! My name is Clement Atlee!", "I am Harold Wilson."])
-    removed_label = "scrubbed_regex_names"
+    label = "regex_names"
     pattern = r"Clement Atlee|Harold Wilson"
     replacement_text = "[PM]"
-    scrubbed = scrub.scrub_regex(pattern, replacement_text, removed_label)
+    scrubbed = scrub.scrub_regex(pattern, replacement_text, label)
     assert scrubbed == ["Hi! My name is [PM]!", "I am [PM]."]
     assert scrub.scrubbed_data == [
-        {"text_id": 1, "scrubbed_regex_names": "Clement Atlee"},
-        {"text_id": 2, "scrubbed_regex_names": "Harold Wilson"},
+        {"text_id": 1, "regex_names": "Clement Atlee"},
+        {"text_id": 2, "regex_names": "Harold Wilson"},
     ]

test/test_scrub.py CHANGED Viewed

@@ -30,7 +30,7 @@ def test_scrub_get_scrubbed_data(scrub_object):
     expected_df = pd.DataFrame(
         {
             "text_id": {0: 2},
-            "scrubbed_uk_postcodes": {0: ["AA11 1AA"]},
+            "uk_postcode": {0: ["AA11 1AA"]},
         }
     )
@@ -42,7 +42,7 @@ def test_scrub_order(scrub_object):
     assert scrub_object.get_scrubbed_data().columns.to_list() == [
         "text_id",
-        "scrubbed_uk_postcodes",
-        "scrubbed_uk_phone_numbers",
-        "scrubbed_spacy_person",
+        "uk_postcode",
+        "uk_phone_number",
+        "person",
     ]

test/test_spacy.py CHANGED Viewed

@@ -23,6 +23,4 @@ def test_spacy_empty():
     scrubbed = scrub.spacy_persons()
     assert scrubbed == [" ", "[PERSON]", ""]
-    assert_frame_equal(
-        scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_spacy_person": [["John Smith"]]})
-    )
+    assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))

idscrub-0.2.2.dist-info/RECORD DELETED Viewed

@@ -1,21 +0,0 @@
-idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
-idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
-idscrub/scrub.py,sha256=K4Sw4DxKhYJnnu_vpRhUcqj-AbeGr8SwDB0XrDLEciM,34940
-idscrub-0.2.2.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
-notebooks/basic_usage.ipynb,sha256=eQFU3mOyRXbCwFz3jVUKCxWRtIP5Jptny8fj-KYoBwA,39784
-test/conftest.py,sha256=ph1S3LMvzlzvOsb3l2YhpyHSdmg4uV7p61ge_JVCGv0,267
-test/test_all.py,sha256=z6v9O2Ts9dWITlhvZwRMyKUZsO7ncaT3znqqBCKJ6Wc,1141
-test/test_chain.py,sha256=YFGqO0xUzZ69x-iNCdKEiH-OWWZfyYYFgmEq0urELEs,1883
-test/test_dataframe.py,sha256=6k3iu69X9H-pLA2gm3fvwvFTSj_efJe0GomFiR3LPac,2284
-test/test_huggingface.py,sha256=pKL3yD1z8JxNaXwqkww0IrKEq84-J16vi-URC0kC9p8,848
-test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
-test/test_log.py,sha256=qKVZAzcaVllKepM-vgCWqqY9f8GyNxO7V0sa1WD0tsA,673
-test/test_persidio.py,sha256=NSX5gzhhBX5l9GTXwPK4wjMzcp6wmAfWJYQo45UMVIc,1594
-test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
-test/test_regex.py,sha256=EQGx3PHwJJzIdy6xwR8gEsSRDtlWHR-U81EPI811eZA,4474
-test/test_scrub.py,sha256=pohmw3frtlkmZDMvOEbmvVJgtcVdFlEDL3TxR5-y-0Q,1422
-test/test_spacy.py,sha256=mrUGUulvzDGgQRttdG0tgL2sGBRmYfg1fDNp7SFq8as,961
-idscrub-0.2.2.dist-info/METADATA,sha256=IHoFTVY6cJARkeeKoQlpunA7Nboc4y32bpSoS-IgSoM,5352
-idscrub-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-idscrub-0.2.2.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
-idscrub-0.2.2.dist-info/RECORD,,

{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

idscrub 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

idscrub 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl