idscrub 1.1.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +692 -525
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/METADATA +58 -12
- idscrub-2.0.0.dist-info/RECORD +24 -0
- notebooks/basic_usage.ipynb +294 -351
- test/conftest.py +36 -0
- test/test_dataframe.py +8 -8
- test/test_errors.py +32 -0
- test/test_exclude.py +22 -0
- test/test_group.py +9 -0
- test/test_huggingface.py +3 -3
- test/test_id.py +8 -7
- test/test_label.py +22 -7
- test/test_overlap.py +86 -0
- test/test_phonenumbers.py +2 -2
- test/test_presidio.py +13 -6
- test/test_regex.py +110 -59
- test/test_scrub.py +22 -12
- test/test_scrub_text.py +22 -0
- test/test_spacy.py +14 -10
- idscrub-1.1.2.dist-info/RECORD +0 -22
- test/test_all.py +0 -39
- test/test_chain.py +0 -54
- test/test_log.py +0 -17
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/WHEEL +0 -0
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {idscrub-1.1.2.dist-info → idscrub-2.0.0.dist-info}/top_level.txt +0 -0
test/test_scrub.py
CHANGED
|
@@ -3,9 +3,10 @@ from idscrub import IDScrub
|
|
|
3
3
|
from pandas.testing import assert_frame_equal
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
7
6
|
def test_scrub(scrub_object):
|
|
8
|
-
scrubbed = scrub_object.scrub(
|
|
7
|
+
scrubbed = scrub_object.scrub(
|
|
8
|
+
pipeline=[{"method": "spacy_entities"}, {"method": "uk_phone_numbers"}, {"method": "uk_postcodes"}]
|
|
9
|
+
)
|
|
9
10
|
assert scrubbed == [
|
|
10
11
|
"Our names are [PERSON], [PERSON], and [PERSON].",
|
|
11
12
|
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
@@ -15,7 +16,7 @@ def test_scrub(scrub_object):
|
|
|
15
16
|
def test_scrub_text_id():
|
|
16
17
|
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
|
|
17
18
|
|
|
18
|
-
scrub.scrub(
|
|
19
|
+
scrub.scrub(pipeline=[{"method": "spacy_entities"}])
|
|
19
20
|
|
|
20
21
|
df = scrub.get_scrubbed_data()
|
|
21
22
|
|
|
@@ -24,7 +25,7 @@ def test_scrub_text_id():
|
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
def test_scrub_get_scrubbed_data(scrub_object):
|
|
27
|
-
scrub_object.scrub(
|
|
28
|
+
scrub_object.scrub(pipeline=[{"method": "uk_postcodes"}])
|
|
28
29
|
df = scrub_object.get_scrubbed_data()
|
|
29
30
|
|
|
30
31
|
expected_df = pd.DataFrame(
|
|
@@ -37,12 +38,21 @@ def test_scrub_get_scrubbed_data(scrub_object):
|
|
|
37
38
|
assert_frame_equal(df, expected_df)
|
|
38
39
|
|
|
39
40
|
|
|
40
|
-
def
|
|
41
|
-
scrub_object.scrub(
|
|
41
|
+
def test_scrub_get_all_identified_data(scrub_object):
|
|
42
|
+
scrub_object.scrub(pipeline=[{"method": "uk_postcodes"}])
|
|
43
|
+
df = scrub_object.get_all_identified_data()
|
|
42
44
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
expected_df = pd.DataFrame(
|
|
46
|
+
{
|
|
47
|
+
"text_id": {0: 2},
|
|
48
|
+
"text": {0: "AA11 1AA"},
|
|
49
|
+
"start": {0: 41},
|
|
50
|
+
"end": {0: 49},
|
|
51
|
+
"label": {0: "uk_postcode"},
|
|
52
|
+
"replacement": {0: "[POSTCODE]"},
|
|
53
|
+
"priority": {0: 0.5},
|
|
54
|
+
"source": {0: "regex"},
|
|
55
|
+
}
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
assert_frame_equal(df, expected_df)
|
test/test_scrub_text.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from idscrub import IDScrub
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_scrub_text(scrub_object):
|
|
5
|
+
scrub_object.scrub(pipeline=[{"method": "uk_postcodes"}])
|
|
6
|
+
|
|
7
|
+
assert scrub_object.idents == [
|
|
8
|
+
IDScrub.IDEnt(
|
|
9
|
+
text_id=2,
|
|
10
|
+
text="AA11 1AA",
|
|
11
|
+
start=41,
|
|
12
|
+
end=49,
|
|
13
|
+
label="uk_postcode",
|
|
14
|
+
replacement="[POSTCODE]",
|
|
15
|
+
priority=0.5,
|
|
16
|
+
source="regex",
|
|
17
|
+
)
|
|
18
|
+
]
|
|
19
|
+
assert scrub_object.scrub_text() == [
|
|
20
|
+
"Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
|
|
21
|
+
"My number is +441111111111 and I live at [POSTCODE].",
|
|
22
|
+
]
|
test/test_spacy.py
CHANGED
|
@@ -4,10 +4,9 @@ from idscrub import IDScrub
|
|
|
4
4
|
from pandas.testing import assert_frame_equal
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
# Note: This test will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
8
7
|
def test_spacy():
|
|
9
8
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
10
|
-
scrubbed = scrub.
|
|
9
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities"}])
|
|
11
10
|
assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
|
|
12
11
|
|
|
13
12
|
|
|
@@ -15,12 +14,12 @@ def test_spacy_error():
|
|
|
15
14
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
16
15
|
|
|
17
16
|
with pytest.raises(ValueError):
|
|
18
|
-
scrub.
|
|
17
|
+
scrub.scrub(pipeline=[{"method": "spacy_entities", "model_name": "not_a_model"}])
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
def test_spacy_empty():
|
|
22
21
|
scrub = IDScrub([" ", "John Smith", ""])
|
|
23
|
-
scrubbed = scrub.spacy_entities
|
|
22
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities"}])
|
|
24
23
|
|
|
25
24
|
assert scrubbed == [" ", "[PERSON]", ""]
|
|
26
25
|
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
|
@@ -28,17 +27,22 @@ def test_spacy_empty():
|
|
|
28
27
|
|
|
29
28
|
def test_spacy_map():
|
|
30
29
|
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
|
|
31
|
-
|
|
32
|
-
|
|
30
|
+
scrubbed = scrub.scrub(
|
|
31
|
+
pipeline=[
|
|
32
|
+
{
|
|
33
|
+
"method": "spacy_entities",
|
|
34
|
+
"entity_types": ["PERSON", "ORG"],
|
|
35
|
+
"replacement_map": {"PERSON": "[PHELLO]", "ORG": "[SPACE]"},
|
|
36
|
+
}
|
|
37
|
+
]
|
|
33
38
|
)
|
|
39
|
+
assert scrubbed == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My company code is [SPACE]."]
|
|
34
40
|
|
|
35
|
-
assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My company code is [SPACE]."]
|
|
36
41
|
|
|
37
|
-
|
|
38
|
-
def test_spacy_get_data():
|
|
42
|
+
def test_presidio_get_data():
|
|
39
43
|
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My company code is NASA."])
|
|
40
44
|
|
|
41
|
-
scrub.
|
|
45
|
+
scrub.scrub(pipeline=[{"method": "spacy_entities"}])
|
|
42
46
|
|
|
43
47
|
df = scrub.get_scrubbed_data()
|
|
44
48
|
|
idscrub-1.1.2.dist-info/RECORD
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
|
|
2
|
-
idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
|
|
3
|
-
idscrub/scrub.py,sha256=PPTKWW-RQxZ5NixRow8nrnX9KjfyZa3tPAP9Jgwnn_M,36631
|
|
4
|
-
idscrub-1.1.2.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
|
|
5
|
-
notebooks/basic_usage.ipynb,sha256=V62Bz88a9Zo3LO_VxXF4sLw8-MP51ZdVRRNS-zjtNqw,42664
|
|
6
|
-
test/conftest.py,sha256=y-pwGXpdg7bbFc36HtE3wQtZkeI0JM77fcMYjej5veY,557
|
|
7
|
-
test/test_all.py,sha256=ifuXAI0Hq3ETNXzdITjNGCnuFyozhN5TpJC2hOtA2bM,1103
|
|
8
|
-
test/test_chain.py,sha256=YbJeA11EBjDNcq5ZZjG4lIIyngrRQZknNsX3Oo0jPMc,1810
|
|
9
|
-
test/test_dataframe.py,sha256=1LhtkQQpXblQ18ppI1s1nNyse0YCwGHbhtrKGkdppBw,6413
|
|
10
|
-
test/test_huggingface.py,sha256=RTkp8Xsy4w9WoXq2IQ2YOJof41snbOQkM7CVtiVVD0U,839
|
|
11
|
-
test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
|
|
12
|
-
test/test_label.py,sha256=aNkIxJ-_YkBnW8QrBfRxjSsRZWeh5hn_iM7Rk1wrfPU,652
|
|
13
|
-
test/test_log.py,sha256=tGAGOv4aeHT4E_pB9rq_nNA1CDHNoINpkVrCKaP4d3U,645
|
|
14
|
-
test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
|
|
15
|
-
test/test_presidio.py,sha256=BOGghcTWLSQPBhQxO014rO3RG-IL5XEbAaKuGN677pU,1558
|
|
16
|
-
test/test_regex.py,sha256=foc2N4UCi7mGL0EIfp1t-ivgujkXMrmbsnsU77sbWZ0,5424
|
|
17
|
-
test/test_scrub.py,sha256=tMYrIhbyXXKqt24tS1U_kAJT_vZfhOD4DAsf5ZFbEvU,1380
|
|
18
|
-
test/test_spacy.py,sha256=gxJrNpV5B3HydUfoMsbmzRUoiKNs3_zwdSXqbPeW0qA,1846
|
|
19
|
-
idscrub-1.1.2.dist-info/METADATA,sha256=2cgKI6cNWw-6zmTgiRoGt9fStRtGdSTLY7YnYBqHKm0,7242
|
|
20
|
-
idscrub-1.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
21
|
-
idscrub-1.1.2.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
|
|
22
|
-
idscrub-1.1.2.dist-info/RECORD,,
|
test/test_all.py
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from idscrub import IDScrub
|
|
3
|
-
from pandas.testing import assert_frame_equal
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
7
|
-
def test_all(scrub_object):
|
|
8
|
-
scrubbed = scrub_object.all()
|
|
9
|
-
assert scrubbed == [
|
|
10
|
-
"Our names are [PERSON], [PERSON], and [PERSON].",
|
|
11
|
-
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
12
|
-
]
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def test_text_id():
|
|
16
|
-
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
|
|
17
|
-
|
|
18
|
-
scrub.all()
|
|
19
|
-
|
|
20
|
-
df = scrub.get_scrubbed_data()
|
|
21
|
-
|
|
22
|
-
assert df["text_id"].max() == 10
|
|
23
|
-
assert len(df["text_id"]) == 10
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def test_get_scrubbed_data(scrub_object):
|
|
27
|
-
scrub_object.all()
|
|
28
|
-
df = scrub_object.get_scrubbed_data()
|
|
29
|
-
|
|
30
|
-
expected_df = pd.DataFrame(
|
|
31
|
-
{
|
|
32
|
-
"text_id": {0: 1, 1: 2},
|
|
33
|
-
"person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
|
|
34
|
-
"uk_phone_number": {0: None, 1: ["+441111111111"]},
|
|
35
|
-
"uk_postcode": {0: None, 1: ["AA11 1AA"]},
|
|
36
|
-
}
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
assert_frame_equal(df, expected_df)
|
test/test_chain.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from pandas.testing import assert_frame_equal
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
6
|
-
def test_chain(scrub_object):
|
|
7
|
-
scrub_object.uk_phone_numbers()
|
|
8
|
-
scrub_object.uk_postcodes()
|
|
9
|
-
scrubbed = scrub_object.spacy_entities()
|
|
10
|
-
|
|
11
|
-
assert scrubbed == [
|
|
12
|
-
"Our names are [PERSON], [PERSON], and [PERSON].",
|
|
13
|
-
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
14
|
-
]
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def test_chain_order(scrub_object):
|
|
18
|
-
scrubbed = scrub_object.uk_phone_numbers()
|
|
19
|
-
|
|
20
|
-
assert scrubbed == [
|
|
21
|
-
"Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
|
|
22
|
-
"My number is [PHONENO] and I live at AA11 1AA.",
|
|
23
|
-
]
|
|
24
|
-
|
|
25
|
-
assert scrub_object.get_scrubbed_data()["uk_phone_number"].to_list() == [["+441111111111"]]
|
|
26
|
-
assert "uk_postcode" not in scrub_object.get_scrubbed_data().columns
|
|
27
|
-
|
|
28
|
-
scrubbed = scrub_object.uk_postcodes()
|
|
29
|
-
|
|
30
|
-
assert scrubbed == [
|
|
31
|
-
"Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
|
|
32
|
-
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
33
|
-
]
|
|
34
|
-
assert scrub_object.get_scrubbed_data()["uk_phone_number"].to_list() == [["+441111111111"]]
|
|
35
|
-
assert scrub_object.get_scrubbed_data()["uk_postcode"].to_list() == [["AA11 1AA"]]
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def test_get_scrubbed_data_chain(scrub_object):
|
|
39
|
-
scrub_object.uk_phone_numbers()
|
|
40
|
-
scrub_object.uk_postcodes()
|
|
41
|
-
scrub_object.spacy_entities()
|
|
42
|
-
|
|
43
|
-
df = scrub_object.get_scrubbed_data()
|
|
44
|
-
|
|
45
|
-
expected_df = pd.DataFrame(
|
|
46
|
-
{
|
|
47
|
-
"text_id": {0: 1, 1: 2},
|
|
48
|
-
"uk_phone_number": {0: None, 1: ["+441111111111"]},
|
|
49
|
-
"uk_postcode": {0: None, 1: ["AA11 1AA"]},
|
|
50
|
-
"person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
|
|
51
|
-
}
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
assert_frame_equal(df, expected_df)
|
test/test_log.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from idscrub import IDScrub
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_log_message():
|
|
5
|
-
scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
|
|
6
|
-
scrub.titles()
|
|
7
|
-
count = scrub.log_message("title")
|
|
8
|
-
assert count == 3
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def test_log_message_custom_regex():
|
|
12
|
-
scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
|
|
13
|
-
scrub.custom_regex([r"Strangelove", r"Oppenheimer"], ["[DR]", "[PROFESSOR]"])
|
|
14
|
-
count_1 = scrub.log_message("custom_regex_1")
|
|
15
|
-
count_2 = scrub.log_message("custom_regex_2")
|
|
16
|
-
assert count_1 == 2
|
|
17
|
-
assert count_2 == 1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|