PyPI - idscrub - Versions diffs - 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

idscrub 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

idscrub/scrub.py +694 -525
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/METADATA +58 -12
idscrub-2.0.1.dist-info/RECORD +24 -0
notebooks/basic_usage.ipynb +294 -351
test/conftest.py +36 -0
test/test_dataframe.py +8 -8
test/test_errors.py +32 -0
test/test_exclude.py +22 -0
test/test_group.py +9 -0
test/test_huggingface.py +3 -3
test/test_id.py +8 -7
test/test_label.py +22 -7
test/test_overlap.py +86 -0
test/test_phonenumbers.py +2 -2
test/test_presidio.py +21 -6
test/test_regex.py +110 -59
test/test_scrub.py +22 -12
test/test_scrub_text.py +22 -0
test/test_spacy.py +16 -12
idscrub-1.1.2.dist-info/RECORD +0 -22
test/test_all.py +0 -39
test/test_chain.py +0 -54
test/test_log.py +0 -17
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/WHEEL +0 -0
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/licenses/LICENSE +0 -0
{idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/top_level.txt +0 -0

test/conftest.py CHANGED Viewed

@@ -20,3 +20,39 @@ def scrub_object_all():
             "My number is +441111111111 and I live at AA11 1AA.",
         ]
     )
+@pytest.fixture
+def idents():
+    return [
+        IDScrub.IDEnt(
+            text_id="A",
+            text="The quick brown fox jumps over the lazy dog.",
+            start=10,
+            end=19,
+            label="animal",
+            replacement="[ANIMAL]",
+            priority=0.92,
+            source="custom_regex",
+        ),
+        IDScrub.IDEnt(
+            text_id="A",
+            text="My phone number is 123-456-7890.",
+            start=19,
+            end=31,
+            label="phone_number",
+            replacement="[PHONE]",
+            priority=0.76,
+            source="google",
+        ),
+        IDScrub.IDEnt(
+            text_id="B",
+            text="Email me at example@example.com.",
+            start=12,
+            end=31,
+            label="email",
+            replacement="[EMAIL]",
+            priority=0.88,
+            source="email",
+        ),
+    ]

test/test_dataframe.py CHANGED Viewed

@@ -20,7 +20,7 @@ def test_dataframe_outputs():
         }
     )
-    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["all"])
+    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID")
     expected_scrubbed_df = pd.DataFrame(
         {
@@ -43,7 +43,6 @@ def test_dataframe_outputs():
             "person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
             "title": [["Mr"], ["Mr"], None, None],
             "email_address": [None, None, ["freddie-mercury@queen.com"], None],
-            "url": [None, None, ["queen.com"], None],
             "uk_postcode": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
         }
     )
@@ -67,9 +66,7 @@ def test_dataframe_exclude():
         }
     )
-    scrubbed_df, scrubbed_data = IDScrub.dataframe(
-        df=df, id_col="ID", exclude_cols=["Fake book"], scrub_methods=["all"]
-    )
+    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", exclude_cols=["Fake book"])
     expected_scrubbed_df = pd.DataFrame(
         {
@@ -113,7 +110,7 @@ def test_dataframe_scrub_methods():
         }
     )
-    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["titles"])
+    scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", pipeline=[{"method": "titles"}])
     expected_scrubbed_df = pd.DataFrame(
         {
@@ -141,7 +138,7 @@ def test_dataframe_scrub_methods():
     assert_frame_equal(scrubbed_data, expected_scrubbed_data)
-def test_dataframe_id_col():
+def test_dataframe_errors():
     df = pd.DataFrame(
         {
             "ID": [1, 2],
@@ -156,5 +153,8 @@ def test_dataframe_id_col():
         }
     )
-    with pytest.raises(AssertionError):
+    with pytest.raises(ValueError):
         IDScrub.dataframe(df=df, id_col="ID_not_present")
+    with pytest.raises(TypeError):
+        IDScrub.dataframe(df=1, id_col="ID_not_present")

test/test_errors.py ADDED Viewed

@@ -0,0 +1,32 @@
+import pytest
+from idscrub import IDScrub
+def test_scrub_input():
+    with pytest.raises(TypeError):
+        IDScrub(texts=[123])
+    with pytest.raises(TypeError):
+        IDScrub(texts=[1, 2, 3])
+    with pytest.raises(TypeError):
+        IDScrub(texts=[1.0, 2.0, 3.0])
+    with pytest.raises(TypeError):
+        IDScrub(texts="not_a_list")
+def test_scrub_input_text_ids():
+    with pytest.raises(ValueError):
+        IDScrub(texts=["Hello"], text_ids=[1, 2])
+def test_replacement_error():
+    with pytest.raises(TypeError):
+        IDScrub(texts=["Hello"], text_ids=[1], replacement=1)
+    with pytest.raises(TypeError):
+        IDScrub(texts=["Hello"], text_ids=[1], replacement=1.0)
+    with pytest.raises(TypeError):
+        IDScrub(texts=["Hello"], text_ids=[1], replacement=["ok"])
+def test_scrub_pipeline_error(scrub_object):
+    with pytest.raises(TypeError):
+        scrub_object.scrub(pipeline={"method": "spacy_entities"})

test/test_exclude.py ADDED Viewed

@@ -0,0 +1,22 @@
+from idscrub import IDScrub
+def test_exclude():
+    scrub = IDScrub(
+        [
+            "Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
+        ],
+        exclude=["Hamish McDonald", "L. Salah"],
+    )
+    scrubbed = scrub.scrub(
+        pipeline=[{"method": "spacy_entities"}],
+    )
+    assert scrubbed == [
+        "Our names are Hamish McDonald, L. Salah, and [PERSON].",
+    ]
+    assert scrub.idents_all[0].text == "Hamish McDonald"
+    assert scrub.idents_all[1].text == "L. Salah"
+    assert [ident.text for ident in scrub.idents] not in ["Hamish McDonald", "L. Salah"]

test/test_group.py ADDED Viewed

@@ -0,0 +1,9 @@
+from idscrub import IDScrub
+def test_group_idents(idents):
+    scrub = IDScrub(texts=[])
+    entities_grouped = scrub.group_idents(idents)
+    assert len(entities_grouped) == 2
+    assert list(entities_grouped.keys()) == ["A", "B"]

test/test_huggingface.py CHANGED Viewed

@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
 def test_huggingface():
     scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
-    scrubbed = scrub.huggingface_entities()
+    scrubbed = scrub.scrub(pipeline=[{"method": "huggingface_entities"}])
     assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
@@ -14,12 +14,12 @@ def test_huggingface_error():
     scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
     with pytest.raises(OSError):
-        scrub.huggingface_entities(hf_model_path="not_a_path")
+        scrub.scrub(pipeline=[{"method": "huggingface_entities", "hf_model_path": "not_a_model"}])
 def test_huggingface_empty():
     scrub = IDScrub([" ", "John Smith", ""])
-    scrubbed = scrub.huggingface_entities()
+    scrubbed = scrub.scrub(pipeline=[{"method": "huggingface_entities"}])
     assert scrubbed == [" ", "[PERSON]", ""]
     assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))

test/test_id.py CHANGED Viewed

@@ -2,23 +2,24 @@ from idscrub import IDScrub
 def test_id_ints():
-    scrub = IDScrub(texts=["clement_attlee@gmail.com"] * 10, text_ids=range(100, 110), text_id_name="PM")
-    scrub.email_addresses()
+    scrub = IDScrub(texts=["clement_attlee@testemail.com"] * 10, text_ids=range(100, 110), text_id_name="PM")
+    scrub.scrub(pipeline=[{"method": "email_addresses"}])
     assert scrub.get_scrubbed_data()["PM"].min() == 100
     assert scrub.get_scrubbed_data()["PM"].max() == 109
     assert scrub.get_scrubbed_data()["PM"].to_list() == [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
 def test_id_strs():
-    scrub = IDScrub(texts=["clement_attlee@gmail.com"] * 2, text_ids=["random", "minister"], text_id_name="PM")
-    scrub.email_addresses()
+    scrub = IDScrub(texts=["clement_attlee@testemail.com"] * 2, text_ids=["random", "minister"], text_id_name="PM")
+    scrub.scrub(pipeline=[{"method": "email_addresses"}])
     assert scrub.get_scrubbed_data()["PM"][0] == "random"
     assert scrub.get_scrubbed_data()["PM"][1] == "minister"
 def test_multiple():
-    scrub = IDScrub(texts=["clement_attlee@gmail.com", "SW1A 2AA"] * 10, text_ids=range(100, 120), text_id_name="PM")
-    scrub.email_addresses()
-    scrub.uk_postcodes()
+    scrub = IDScrub(
+        texts=["clement_attlee@testemail.com", "SW1A 2AA"] * 10, text_ids=range(100, 120), text_id_name="PM"
+    )
+    scrub.scrub(pipeline=[{"method": "email_addresses"}, {"method": "uk_postcodes"}])
     assert scrub.get_scrubbed_data()["PM"].min() == 100
     assert scrub.get_scrubbed_data()["PM"].max() == 119

test/test_label.py CHANGED Viewed

@@ -1,9 +1,14 @@
 def test_label(scrub_object_all):
-    for i, scrub_method in enumerate(
-        ["uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
-    ):
-        method = getattr(scrub_object_all, scrub_method)
-        method(label="test")
+    scrub_object_all.scrub(
+        pipeline=[
+            {"method": "uk_postcodes", "label": "test"},
+            {"method": "email_addresses", "label": "test"},
+            {"method": "ip_addresses", "label": "test"},
+            {"method": "uk_phone_numbers", "label": "test"},
+            {"method": "titles", "label": "test"},
+            {"method": "handles", "label": "test"},
+        ]
+    )
     df = scrub_object_all.get_scrubbed_data()
@@ -11,7 +16,17 @@ def test_label(scrub_object_all):
 def test_regex_label(scrub_object_all):
-    scrub_object_all.custom_regex(custom_regex_patterns=[r"number", r"live"], labels=["regex_number", "regex_live"])
+    scrub_object_all.scrub(
+        pipeline=[
+            {
+                "method": "custom_regex",
+                "patterns": {
+                    "number": {"pattern": r"number", "replacement": "[REDACTED]", "priority": 0.5},
+                    "live": {"pattern": r"live", "replacement": "[REDACTED]"},
+                },
+            }
+        ]
+    )
     df = scrub_object_all.get_scrubbed_data()
-    assert df.columns.to_list() == ["text_id", "regex_number", "regex_live"]
+    assert df.columns.to_list() == ["text_id", "number", "live"]

test/test_overlap.py ADDED Viewed

@@ -0,0 +1,86 @@
+from idscrub import IDScrub
+def test_overlap():
+    scrub = IDScrub(texts=["My email is fakeperson@fakeemail.com"])
+    scrubbed = scrub.scrub(
+        pipeline=[{"method": "handles", "priority": 0.1}, {"method": "email_addresses", "priority": 1.0}]
+    )
+    assert max([ident.priority for ident in scrub.idents_all]) == 1.0
+    assert scrub.idents_all == [
+        IDScrub.IDEnt(
+            text_id=1,
+            text="@fakeemail.com",
+            start=22,
+            end=36,
+            label="handle",
+            replacement="[HANDLE]",
+            priority=0.1,
+            source="regex",
+        ),
+        IDScrub.IDEnt(
+            text_id=1,
+            text="fakeperson@fakeemail.com",
+            start=12,
+            end=36,
+            label="email_address",
+            replacement="[EMAIL_ADDRESS]",
+            priority=1.0,
+            source="regex",
+        ),
+    ]
+    assert scrub.idents == [
+        IDScrub.IDEnt(
+            text_id=1,
+            text="fakeperson@fakeemail.com",
+            start=12,
+            end=36,
+            label="email_address",
+            replacement="[EMAIL_ADDRESS]",
+            priority=1.0,
+            source="regex",
+        )
+    ]
+    assert scrubbed == ["My email is [EMAIL_ADDRESS]"]
+def test_overlap_default():
+    scrub = IDScrub(texts=["I am @John Smith"])
+    scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"]}, {"method": "handles"}])
+    assert max([ident.priority for ident in scrub.idents_all]) == 1.0
+    assert scrub.idents_all == [
+        IDScrub.IDEnt(
+            text_id=1,
+            text="@John Smith",
+            start=5,
+            end=16,
+            label="person",
+            replacement="[PERSON]",
+            priority=1.0,
+            source="spacy",
+        ),
+        IDScrub.IDEnt(
+            text_id=1,
+            text="@John",
+            start=5,
+            end=10,
+            label="handle",
+            replacement="[HANDLE]",
+            priority=0.4,
+            source="regex",
+        ),
+    ]
+    assert scrub.idents == [
+        IDScrub.IDEnt(
+            text_id=1,
+            text="@John Smith",
+            start=5,
+            end=16,
+            label="person",
+            replacement="[PERSON]",
+            priority=1.0,
+            source="spacy",
+        )
+    ]
+    assert scrubbed == ["I am [PERSON]"]

test/test_phonenumbers.py CHANGED Viewed

@@ -3,11 +3,11 @@ from idscrub import IDScrub
 def test_google_phone_numbers_gb():
     scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01475 123456."])
-    scrubbed = scrub.google_phone_numbers(region="GB")
+    scrubbed = scrub.scrub(pipeline=[{"method": "google_phone_numbers"}])
     assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
 def test_google_phone_numbers_us():
     scrub = IDScrub(texts=["My US phone number is +1-718-222-2222! My old phone number is 12124567890."])
-    scrubbed = scrub.google_phone_numbers(region="US")
+    scrubbed = scrub.scrub(pipeline=[{"method": "google_phone_numbers", "region": "US"}])
     assert scrubbed == ["My US phone number is [PHONENO]! My old phone number is [PHONENO]."]

test/test_presidio.py CHANGED Viewed

@@ -8,20 +8,27 @@ def test_presidio():
     scrub = IDScrub(
         ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
     )
-    scrubbed_texts = scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
+    scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities"}])
-    assert scrubbed_texts == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
+    assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
 def test_presidio_map():
     scrub = IDScrub(
         ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
     )
-    scrubbed_texts = scrub.presidio_entities(
-        entities=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
+    scrubbed = scrub.scrub(
+        pipeline=[
+            {
+                "method": "presidio_entities",
+                "entity_types": ["PERSON", "IBAN_CODE"],
+                "replacement_map": {"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"},
+            }
+        ]
     )
-    assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
+    assert scrubbed == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
 def test_presidio_get_data():
@@ -29,7 +36,7 @@ def test_presidio_get_data():
         ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
     )
-    scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
+    scrub.scrub(pipeline=[{"method": "presidio_entities"}])
     df = scrub.get_scrubbed_data()
@@ -42,3 +49,11 @@ def test_presidio_get_data():
     )
     assert_frame_equal(df, expected_df)
+def test_presidio_empty():
+    scrub = IDScrub([" ", "  John Smith", ""])
+    scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities", "entity_types": ["PERSON"]}])
+    assert scrubbed == [" ", "  [PERSON]", ""]
+    assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))

test/test_regex.py CHANGED Viewed

@@ -1,25 +1,25 @@
-import re
 from idscrub import IDScrub
 def test_email_addresses():
     scrub = IDScrub(
-        texts=["Send me an email at jim@gmail.com or at marie-9999@randomemail.co.uk or at hello_world@john-smith.com."]
+        texts=[
+            "Send me an email at jim@testemail.com or at marie-9999@randomemail.co.uk or at hello_world@john-smith.com."
+        ]
     )
-    scrubbed = scrub.email_addresses()
+    scrubbed = scrub.scrub(pipeline=[{"method": "email_addresses"}])
     assert scrubbed == ["Send me an email at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS]."]
 def test_ip_addresses():
     scrub = IDScrub(texts=["This has been sent to 8.8.8.8 and requested by 192.0.2.1."])
-    scrubbed = scrub.ip_addresses()
+    scrubbed = scrub.scrub(pipeline=[{"method": "ip_addresses"}])
     assert scrubbed == ["This has been sent to [IPADDRESS] and requested by [IPADDRESS]."]
 def test_uk_postcodes():
     scrub = IDScrub(texts=["I live at A11 1AA. My friend lives at KA308JB. The Prime Minister lives at SW1A  2AA."])
-    scrubbed = scrub.uk_postcodes()
+    scrubbed = scrub.scrub(pipeline=[{"method": "uk_postcodes"}])
     assert scrubbed == ["I live at [POSTCODE]. My friend lives at [POSTCODE]. The Prime Minister lives at [POSTCODE]."]
@@ -30,7 +30,7 @@ def test_titles_not_strict():
             "I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
         ]
     )
-    scrubbed = scrub.titles()
+    scrubbed = scrub.scrub(pipeline=[{"method": "titles"}])
     assert scrubbed == [
         "Hello [TITLE]. Smith! I am [TITLE] Patel",
         "I am here on behalf of [TITLE] Austen, General Eisenhower, and [TITLE] Jack Sparrow.",
@@ -44,7 +44,7 @@ def test_titles_strict():
             "I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
         ]
     )
-    scrubbed = scrub.titles(strict=True)
+    scrubbed = scrub.scrub(pipeline=[{"method": "titles", "strict": True}])
     assert scrubbed == [
         "Hello [TITLE]. Smith! I am [TITLE] Patel",
         "I am here on behalf of [TITLE] Austen, [TITLE] Eisenhower, and [TITLE] Jack Sparrow.",
@@ -53,32 +53,52 @@ def test_titles_strict():
 def test_uk_phone_numbers():
     scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01111 123456."])
-    scrubbed = scrub.uk_phone_numbers()
+    scrubbed = scrub.scrub(pipeline=[{"method": "uk_phone_numbers"}])
     assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
 def test_handles():
     scrub = IDScrub(texts=["Our usernames are @HenrikLarsson, @Jimmy_Johnstone, @Nakamura-67 and @Aidan_McGeady_46."])
-    scrubbed = scrub.handles()
+    scrubbed = scrub.scrub(pipeline=[{"method": "handles"}])
     assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
+def test_urls():
+    scrub = IDScrub(
+        [
+            "www.example.co.uk",
+            "https://example.com",
+            "http://sub.domain.co.uk/path?query=1&x=2",
+            "www.example.org/page/index.html",
+            "https://example.com:8080/path/to/resource#anchor",
+            "www.test-site123.net/some/path?with=paramsexample.comexample.co.uk/home",
+        ]
+    )
+    scrubbed = scrub.scrub(pipeline=[{"method": "urls"}])
+    assert scrubbed == ["[URL]", "[URL]", "[URL]", "[URL]", "[URL]", "[URL]"]
 def test_uk_addresses():
     scrub = IDScrub(
         [
             "221B Baker Street",
             "12 high road",
             "Flat 3B, 47 King's Court",
-            "12–14 High Street",
+            "12-14 High Street",
             "5a-7a Church Lane",
             "1/2 Main Street",
             "10 St John’s Rd",
             "33 Queen-Anne Walk",
             "8 Deansgate Ct",
-        ]
+            "10 Downing Street",
+            "10, Downing Street",
+        ],
     )
-    scrubbed = scrub.uk_addresses()
+    scrubbed = scrub.scrub(pipeline=[{"method": "uk_addresses"}])
     assert scrubbed == [
         "[ADDRESS]",
         "[ADDRESS]",
@@ -89,6 +109,8 @@ def test_uk_addresses():
         "[ADDRESS]",
         "[ADDRESS]",
         "[ADDRESS]",
+        "[ADDRESS]",
+        "[ADDRESS]",
     ]
     negative_tests = [
@@ -105,65 +127,94 @@ def test_uk_addresses():
     scrub = IDScrub(negative_tests)
-    scrubbed = scrub.uk_addresses()
+    scrubbed = scrub.scrub(pipeline=[{"method": "uk_addresses"}])
     assert scrubbed == negative_tests
-def test_claimants():
+def test_custom_regex():
+    scrub = IDScrub(texts=[])
+    scrubbed_idents = scrub.custom_regex(
+        texts=["It was the best of times, it was the worst of times"],
+        text_ids=["A"],
+        patterns={
+            "times": {"pattern": r"times", "replacement": "[DICKENS]", "priority": 0.5},
+            "worst": {"pattern": r"worst", "replacement": "[WORST]", "priority": 0.8},
+        },
+    )
+    assert scrubbed_idents == [
+        IDScrub.IDEnt(
+            text_id="A",
+            text="times",
+            start=19,
+            end=24,
+            label="times",
+            replacement="[DICKENS]",
+            priority=0.5,
+            source="custom_regex",
+        ),
+        IDScrub.IDEnt(
+            text_id="A",
+            text="times",
+            start=46,
+            end=51,
+            label="times",
+            replacement="[DICKENS]",
+            priority=0.5,
+            source="custom_regex",
+        ),
+        IDScrub.IDEnt(
+            text_id="A",
+            text="worst",
+            start=37,
+            end=42,
+            label="worst",
+            replacement="[WORST]",
+            priority=0.8,
+            source="custom_regex",
+        ),
+    ]
     scrub = IDScrub(
         texts=[
-            "This is legal text. Claimant: John Smith Respondents: Jill Hill.",
-            "Claimant: J Smith Respondents: Jill Hill. J Smith is the respondent.",
+            "It was the best of times, it was the worst of times",
         ]
     )
-    scrubbed = scrub.claimants()
-    assert scrubbed == [
-        "This is legal text. Claimant: [CLAIMANT] Respondents: Jill Hill.",
-        "Claimant: [CLAIMANT] Respondents: Jill Hill. [CLAIMANT] is the respondent.",
-    ]
-def test_custom_regex():
-    scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
-    scrubbed = scrub.custom_regex(custom_regex_patterns=[r"times"])
-    assert scrubbed == ["It was the best of [REDACTED], it was the worst of [REDACTED]"]
-    scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
-    scrubbed = scrub.custom_regex(
-        custom_regex_patterns=[r"times", "worst"], custom_replacement_texts=["[DICKENS]", "[WORST]"]
+    scrubbed_text = scrub.scrub(
+        pipeline=[
+            {
+                "method": "custom_regex",
+                "patterns": {
+                    "times": {"pattern": r"times", "replacement": "[DICKENS]", "priority": 0.5},
+                    "worst": {"pattern": r"worst", "replacement": "[WORST]", "priority": 0.5},
+                },
+            }
+        ]
     )
-    assert scrubbed == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
-def test_scrub_and_collect():
-    scrub = IDScrub()
-    text = "Hello Muhammad and Jack."
-    pattern = r"\bMuhammad|Jack\b"
-    replacement = "[NAME]"
-    label = "custom_regex"
-    i = 1
-    def replacer(match):
-        return scrub.scrub_and_collect(match, text, replacement, i, label)
-    scrubbed = re.sub(pattern, replacer, text)
-    assert scrubbed == "Hello [NAME] and [NAME]."
-    assert scrub.scrubbed_data == [
-        {"text_id": 1, "custom_regex": "Muhammad"},
-        {"text_id": 1, "custom_regex": "Jack"},
-    ]
+    assert scrubbed_text == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
 def test_remove_regex():
-    scrub = IDScrub(texts=["Hi! My name is Clement Atlee!", "I am Harold Wilson."])
+    texts = ["Hi! My name is Clement Atlee!"]
+    text_ids = ["UK"]
+    scrub = IDScrub([])
     label = "regex_names"
     pattern = r"Clement Atlee|Harold Wilson"
-    replacement_text = "[PM]"
-    scrubbed = scrub.scrub_regex(pattern, replacement_text, label)
+    replacement = "[PM]"
+    priority = 0.5
+    idents = scrub.find_regex(
+        texts=texts, text_ids=text_ids, pattern=pattern, replacement=replacement, label=label, priority=priority
+    )
-    assert scrubbed == ["Hi! My name is [PM]!", "I am [PM]."]
-    assert scrub.scrubbed_data == [
-        {"text_id": 1, "regex_names": "Clement Atlee"},
-        {"text_id": 2, "regex_names": "Harold Wilson"},
-    ]
+    assert len(idents) == 1
+    assert idents[0].text_id == "UK"
+    assert idents[0].text == "Clement Atlee"
+    assert idents[0].start == 15
+    assert idents[0].end == 28
+    assert idents[0].label == "regex_names"
+    assert idents[0].replacement == "[PM]"
+    assert idents[0].priority == 0.5
+    assert idents[0].source == "regex"

idscrub 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

idscrub 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl