idscrub 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +73 -88
- {idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/METADATA +2 -2
- idscrub-1.0.0.dist-info/RECORD +22 -0
- notebooks/basic_usage.ipynb +153 -161
- test/conftest.py +10 -0
- test/test_all.py +3 -3
- test/test_chain.py +7 -7
- test/test_dataframe.py +114 -5
- test/test_huggingface.py +1 -1
- test/test_label.py +17 -0
- test/test_log.py +3 -3
- test/test_persidio.py +2 -2
- test/test_regex.py +8 -8
- test/test_scrub.py +4 -4
- test/test_spacy.py +1 -3
- idscrub-0.2.2.dist-info/RECORD +0 -21
- {idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/WHEEL +0 -0
- {idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {idscrub-0.2.2.dist-info → idscrub-1.0.0.dist-info}/top_level.txt +0 -0
test/test_dataframe.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
+
import pytest
|
|
2
3
|
from idscrub import IDScrub
|
|
3
4
|
from pandas.testing import assert_frame_equal
|
|
4
5
|
|
|
@@ -39,13 +40,121 @@ def test_dataframe_outputs():
|
|
|
39
40
|
{
|
|
40
41
|
"ID": [1, 2, 1, 2],
|
|
41
42
|
"column": ["Pride and Prejudice", "Pride and Prejudice", "Fake book", "Fake book"],
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
43
|
+
"person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
|
|
44
|
+
"title": [["Mr"], ["Mr"], None, None],
|
|
45
|
+
"email_address": [None, None, ["freddie-mercury@queen.com"], None],
|
|
46
|
+
"url": [None, None, ["queen.com"], None],
|
|
47
|
+
"uk_postcode": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
|
|
47
48
|
}
|
|
48
49
|
)
|
|
49
50
|
|
|
50
51
|
assert_frame_equal(scrubbed_df, expected_scrubbed_df)
|
|
51
52
|
assert_frame_equal(scrubbed_data, expected_scrubbed_data)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_dataframe_exclude():
|
|
56
|
+
df = pd.DataFrame(
|
|
57
|
+
{
|
|
58
|
+
"ID": [1, 2],
|
|
59
|
+
"Pride and Prejudice": [
|
|
60
|
+
"Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
|
|
61
|
+
"Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
|
|
62
|
+
],
|
|
63
|
+
"Fake book": [
|
|
64
|
+
"The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
|
|
65
|
+
"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
|
|
66
|
+
],
|
|
67
|
+
}
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
scrubbed_df, scrubbed_data = IDScrub.dataframe(
|
|
71
|
+
df=df, id_col="ID", exclude_cols=["Fake book"], scrub_methods=["all"]
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
expected_scrubbed_df = pd.DataFrame(
|
|
75
|
+
{
|
|
76
|
+
"ID": [1, 2],
|
|
77
|
+
"Pride and Prejudice": [
|
|
78
|
+
"[TITLE]. [PERSON] walked off; and [PERSON] remained with no very cordial feelings toward him.",
|
|
79
|
+
"[TITLE]. [PERSON] was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
|
|
80
|
+
],
|
|
81
|
+
"Fake book": [
|
|
82
|
+
"The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
|
|
83
|
+
"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
|
|
84
|
+
],
|
|
85
|
+
}
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
expected_scrubbed_data = pd.DataFrame(
|
|
89
|
+
{
|
|
90
|
+
"ID": [1, 2],
|
|
91
|
+
"column": ["Pride and Prejudice", "Pride and Prejudice"],
|
|
92
|
+
"person": [["Darcy", "Elizabeth"], ["Bennet"]],
|
|
93
|
+
"title": [["Mr"], ["Mr"]],
|
|
94
|
+
}
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
assert_frame_equal(scrubbed_df, expected_scrubbed_df)
|
|
98
|
+
assert_frame_equal(scrubbed_data, expected_scrubbed_data)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def test_dataframe_scrub_methods():
|
|
102
|
+
df = pd.DataFrame(
|
|
103
|
+
{
|
|
104
|
+
"ID": [1, 2],
|
|
105
|
+
"Pride and Prejudice": [
|
|
106
|
+
"Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
|
|
107
|
+
"Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
|
|
108
|
+
],
|
|
109
|
+
"Fake book": [
|
|
110
|
+
"The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
|
|
111
|
+
"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
|
|
112
|
+
],
|
|
113
|
+
}
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["titles"])
|
|
117
|
+
|
|
118
|
+
expected_scrubbed_df = pd.DataFrame(
|
|
119
|
+
{
|
|
120
|
+
"ID": [1, 2],
|
|
121
|
+
"Pride and Prejudice": [
|
|
122
|
+
"[TITLE]. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
|
|
123
|
+
"[TITLE]. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
|
|
124
|
+
],
|
|
125
|
+
"Fake book": [
|
|
126
|
+
"The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
|
|
127
|
+
"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
|
|
128
|
+
],
|
|
129
|
+
}
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
expected_scrubbed_data = pd.DataFrame(
|
|
133
|
+
{
|
|
134
|
+
"ID": [1, 2],
|
|
135
|
+
"column": ["Pride and Prejudice", "Pride and Prejudice"],
|
|
136
|
+
"title": [["Mr"], ["Mr"]],
|
|
137
|
+
}
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
assert_frame_equal(scrubbed_df, expected_scrubbed_df)
|
|
141
|
+
assert_frame_equal(scrubbed_data, expected_scrubbed_data)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_dataframe_id_col():
|
|
145
|
+
df = pd.DataFrame(
|
|
146
|
+
{
|
|
147
|
+
"ID": [1, 2],
|
|
148
|
+
"Pride and Prejudice": [
|
|
149
|
+
"Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
|
|
150
|
+
"Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
|
|
151
|
+
],
|
|
152
|
+
"Fake book": [
|
|
153
|
+
"The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
|
|
154
|
+
"She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
|
|
155
|
+
],
|
|
156
|
+
}
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
with pytest.raises(AssertionError):
|
|
160
|
+
IDScrub.dataframe(df=df, id_col="ID_not_present")
|
test/test_huggingface.py
CHANGED
|
@@ -22,4 +22,4 @@ def test_huggingface_empty():
|
|
|
22
22
|
scrubbed = scrub.huggingface_persons()
|
|
23
23
|
|
|
24
24
|
assert scrubbed == [" ", "[PERSON]", ""]
|
|
25
|
-
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "
|
|
25
|
+
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
test/test_label.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
def test_label(scrub_object_all):
|
|
2
|
+
for i, scrub_method in enumerate(
|
|
3
|
+
["spacy_persons", "uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
|
|
4
|
+
):
|
|
5
|
+
method = getattr(scrub_object_all, scrub_method)
|
|
6
|
+
method(label="test")
|
|
7
|
+
|
|
8
|
+
df = scrub_object_all.get_scrubbed_data()
|
|
9
|
+
|
|
10
|
+
assert df.columns.to_list() == ["text_id", "test"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_regex_label(scrub_object_all):
|
|
14
|
+
scrub_object_all.custom_regex(custom_regex_patterns=[r"number", r"live"], labels=["regex_number", "regex_live"])
|
|
15
|
+
df = scrub_object_all.get_scrubbed_data()
|
|
16
|
+
|
|
17
|
+
assert df.columns.to_list() == ["text_id", "regex_number", "regex_live"]
|
test/test_log.py
CHANGED
|
@@ -4,14 +4,14 @@ from idscrub import IDScrub
|
|
|
4
4
|
def test_log_message():
|
|
5
5
|
scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
|
|
6
6
|
scrub.titles()
|
|
7
|
-
count = scrub.log_message("
|
|
7
|
+
count = scrub.log_message("title")
|
|
8
8
|
assert count == 3
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def test_log_message_custom_regex():
|
|
12
12
|
scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
|
|
13
13
|
scrub.custom_regex([r"Strangelove", r"Oppenheimer"], ["[DR]", "[PROFESSOR]"])
|
|
14
|
-
count_1 = scrub.log_message("
|
|
15
|
-
count_2 = scrub.log_message("
|
|
14
|
+
count_1 = scrub.log_message("custom_regex_1")
|
|
15
|
+
count_2 = scrub.log_message("custom_regex_2")
|
|
16
16
|
assert count_1 == 2
|
|
17
17
|
assert count_2 == 1
|
test/test_persidio.py
CHANGED
|
@@ -36,8 +36,8 @@ def test_persidio_get_data():
|
|
|
36
36
|
expected_df = pd.DataFrame(
|
|
37
37
|
{
|
|
38
38
|
"text_id": {0: 1, 1: 2},
|
|
39
|
-
"
|
|
40
|
-
"
|
|
39
|
+
"person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
|
|
40
|
+
"iban_code": {0: None, 1: ["GB91BKEN10000041610008"]},
|
|
41
41
|
}
|
|
42
42
|
)
|
|
43
43
|
|
test/test_regex.py
CHANGED
|
@@ -94,30 +94,30 @@ def test_scrub_and_collect():
|
|
|
94
94
|
text = "Hello Muhammad and Jack."
|
|
95
95
|
pattern = r"\bMuhammad|Jack\b"
|
|
96
96
|
replacement = "[NAME]"
|
|
97
|
-
|
|
97
|
+
label = "custom_regex"
|
|
98
98
|
i = 1
|
|
99
99
|
|
|
100
100
|
def replacer(match):
|
|
101
|
-
return scrub.scrub_and_collect(match, text, replacement, i,
|
|
101
|
+
return scrub.scrub_and_collect(match, text, replacement, i, label)
|
|
102
102
|
|
|
103
103
|
scrubbed = re.sub(pattern, replacer, text)
|
|
104
104
|
|
|
105
105
|
assert scrubbed == "Hello [NAME] and [NAME]."
|
|
106
106
|
assert scrub.scrubbed_data == [
|
|
107
|
-
{"text_id": 1, "
|
|
108
|
-
{"text_id": 1, "
|
|
107
|
+
{"text_id": 1, "custom_regex": "Muhammad"},
|
|
108
|
+
{"text_id": 1, "custom_regex": "Jack"},
|
|
109
109
|
]
|
|
110
110
|
|
|
111
111
|
|
|
112
112
|
def test_remove_regex():
|
|
113
113
|
scrub = IDScrub(texts=["Hi! My name is Clement Atlee!", "I am Harold Wilson."])
|
|
114
|
-
|
|
114
|
+
label = "regex_names"
|
|
115
115
|
pattern = r"Clement Atlee|Harold Wilson"
|
|
116
116
|
replacement_text = "[PM]"
|
|
117
|
-
scrubbed = scrub.scrub_regex(pattern, replacement_text,
|
|
117
|
+
scrubbed = scrub.scrub_regex(pattern, replacement_text, label)
|
|
118
118
|
|
|
119
119
|
assert scrubbed == ["Hi! My name is [PM]!", "I am [PM]."]
|
|
120
120
|
assert scrub.scrubbed_data == [
|
|
121
|
-
{"text_id": 1, "
|
|
122
|
-
{"text_id": 2, "
|
|
121
|
+
{"text_id": 1, "regex_names": "Clement Atlee"},
|
|
122
|
+
{"text_id": 2, "regex_names": "Harold Wilson"},
|
|
123
123
|
]
|
test/test_scrub.py
CHANGED
|
@@ -30,7 +30,7 @@ def test_scrub_get_scrubbed_data(scrub_object):
|
|
|
30
30
|
expected_df = pd.DataFrame(
|
|
31
31
|
{
|
|
32
32
|
"text_id": {0: 2},
|
|
33
|
-
"
|
|
33
|
+
"uk_postcode": {0: ["AA11 1AA"]},
|
|
34
34
|
}
|
|
35
35
|
)
|
|
36
36
|
|
|
@@ -42,7 +42,7 @@ def test_scrub_order(scrub_object):
|
|
|
42
42
|
|
|
43
43
|
assert scrub_object.get_scrubbed_data().columns.to_list() == [
|
|
44
44
|
"text_id",
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
45
|
+
"uk_postcode",
|
|
46
|
+
"uk_phone_number",
|
|
47
|
+
"person",
|
|
48
48
|
]
|
test/test_spacy.py
CHANGED
|
@@ -23,6 +23,4 @@ def test_spacy_empty():
|
|
|
23
23
|
scrubbed = scrub.spacy_persons()
|
|
24
24
|
|
|
25
25
|
assert scrubbed == [" ", "[PERSON]", ""]
|
|
26
|
-
assert_frame_equal(
|
|
27
|
-
scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_spacy_person": [["John Smith"]]})
|
|
28
|
-
)
|
|
26
|
+
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
idscrub-0.2.2.dist-info/RECORD
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
|
|
2
|
-
idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
|
|
3
|
-
idscrub/scrub.py,sha256=K4Sw4DxKhYJnnu_vpRhUcqj-AbeGr8SwDB0XrDLEciM,34940
|
|
4
|
-
idscrub-0.2.2.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
|
|
5
|
-
notebooks/basic_usage.ipynb,sha256=eQFU3mOyRXbCwFz3jVUKCxWRtIP5Jptny8fj-KYoBwA,39784
|
|
6
|
-
test/conftest.py,sha256=ph1S3LMvzlzvOsb3l2YhpyHSdmg4uV7p61ge_JVCGv0,267
|
|
7
|
-
test/test_all.py,sha256=z6v9O2Ts9dWITlhvZwRMyKUZsO7ncaT3znqqBCKJ6Wc,1141
|
|
8
|
-
test/test_chain.py,sha256=YFGqO0xUzZ69x-iNCdKEiH-OWWZfyYYFgmEq0urELEs,1883
|
|
9
|
-
test/test_dataframe.py,sha256=6k3iu69X9H-pLA2gm3fvwvFTSj_efJe0GomFiR3LPac,2284
|
|
10
|
-
test/test_huggingface.py,sha256=pKL3yD1z8JxNaXwqkww0IrKEq84-J16vi-URC0kC9p8,848
|
|
11
|
-
test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
|
|
12
|
-
test/test_log.py,sha256=qKVZAzcaVllKepM-vgCWqqY9f8GyNxO7V0sa1WD0tsA,673
|
|
13
|
-
test/test_persidio.py,sha256=NSX5gzhhBX5l9GTXwPK4wjMzcp6wmAfWJYQo45UMVIc,1594
|
|
14
|
-
test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
|
|
15
|
-
test/test_regex.py,sha256=EQGx3PHwJJzIdy6xwR8gEsSRDtlWHR-U81EPI811eZA,4474
|
|
16
|
-
test/test_scrub.py,sha256=pohmw3frtlkmZDMvOEbmvVJgtcVdFlEDL3TxR5-y-0Q,1422
|
|
17
|
-
test/test_spacy.py,sha256=mrUGUulvzDGgQRttdG0tgL2sGBRmYfg1fDNp7SFq8as,961
|
|
18
|
-
idscrub-0.2.2.dist-info/METADATA,sha256=IHoFTVY6cJARkeeKoQlpunA7Nboc4y32bpSoS-IgSoM,5352
|
|
19
|
-
idscrub-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
-
idscrub-0.2.2.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
|
|
21
|
-
idscrub-0.2.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|