idscrub 1.1.2__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +694 -525
- {idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/METADATA +58 -12
- idscrub-2.0.1.dist-info/RECORD +24 -0
- notebooks/basic_usage.ipynb +294 -351
- test/conftest.py +36 -0
- test/test_dataframe.py +8 -8
- test/test_errors.py +32 -0
- test/test_exclude.py +22 -0
- test/test_group.py +9 -0
- test/test_huggingface.py +3 -3
- test/test_id.py +8 -7
- test/test_label.py +22 -7
- test/test_overlap.py +86 -0
- test/test_phonenumbers.py +2 -2
- test/test_presidio.py +21 -6
- test/test_regex.py +110 -59
- test/test_scrub.py +22 -12
- test/test_scrub_text.py +22 -0
- test/test_spacy.py +16 -12
- idscrub-1.1.2.dist-info/RECORD +0 -22
- test/test_all.py +0 -39
- test/test_chain.py +0 -54
- test/test_log.py +0 -17
- {idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/WHEEL +0 -0
- {idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {idscrub-1.1.2.dist-info → idscrub-2.0.1.dist-info}/top_level.txt +0 -0
test/conftest.py
CHANGED
|
@@ -20,3 +20,39 @@ def scrub_object_all():
|
|
|
20
20
|
"My number is +441111111111 and I live at AA11 1AA.",
|
|
21
21
|
]
|
|
22
22
|
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.fixture
|
|
26
|
+
def idents():
|
|
27
|
+
return [
|
|
28
|
+
IDScrub.IDEnt(
|
|
29
|
+
text_id="A",
|
|
30
|
+
text="The quick brown fox jumps over the lazy dog.",
|
|
31
|
+
start=10,
|
|
32
|
+
end=19,
|
|
33
|
+
label="animal",
|
|
34
|
+
replacement="[ANIMAL]",
|
|
35
|
+
priority=0.92,
|
|
36
|
+
source="custom_regex",
|
|
37
|
+
),
|
|
38
|
+
IDScrub.IDEnt(
|
|
39
|
+
text_id="A",
|
|
40
|
+
text="My phone number is 123-456-7890.",
|
|
41
|
+
start=19,
|
|
42
|
+
end=31,
|
|
43
|
+
label="phone_number",
|
|
44
|
+
replacement="[PHONE]",
|
|
45
|
+
priority=0.76,
|
|
46
|
+
source="google",
|
|
47
|
+
),
|
|
48
|
+
IDScrub.IDEnt(
|
|
49
|
+
text_id="B",
|
|
50
|
+
text="Email me at example@example.com.",
|
|
51
|
+
start=12,
|
|
52
|
+
end=31,
|
|
53
|
+
label="email",
|
|
54
|
+
replacement="[EMAIL]",
|
|
55
|
+
priority=0.88,
|
|
56
|
+
source="email",
|
|
57
|
+
),
|
|
58
|
+
]
|
test/test_dataframe.py
CHANGED
|
@@ -20,7 +20,7 @@ def test_dataframe_outputs():
|
|
|
20
20
|
}
|
|
21
21
|
)
|
|
22
22
|
|
|
23
|
-
scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID"
|
|
23
|
+
scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID")
|
|
24
24
|
|
|
25
25
|
expected_scrubbed_df = pd.DataFrame(
|
|
26
26
|
{
|
|
@@ -43,7 +43,6 @@ def test_dataframe_outputs():
|
|
|
43
43
|
"person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
|
|
44
44
|
"title": [["Mr"], ["Mr"], None, None],
|
|
45
45
|
"email_address": [None, None, ["freddie-mercury@queen.com"], None],
|
|
46
|
-
"url": [None, None, ["queen.com"], None],
|
|
47
46
|
"uk_postcode": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
|
|
48
47
|
}
|
|
49
48
|
)
|
|
@@ -67,9 +66,7 @@ def test_dataframe_exclude():
|
|
|
67
66
|
}
|
|
68
67
|
)
|
|
69
68
|
|
|
70
|
-
scrubbed_df, scrubbed_data = IDScrub.dataframe(
|
|
71
|
-
df=df, id_col="ID", exclude_cols=["Fake book"], scrub_methods=["all"]
|
|
72
|
-
)
|
|
69
|
+
scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", exclude_cols=["Fake book"])
|
|
73
70
|
|
|
74
71
|
expected_scrubbed_df = pd.DataFrame(
|
|
75
72
|
{
|
|
@@ -113,7 +110,7 @@ def test_dataframe_scrub_methods():
|
|
|
113
110
|
}
|
|
114
111
|
)
|
|
115
112
|
|
|
116
|
-
scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID",
|
|
113
|
+
scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", pipeline=[{"method": "titles"}])
|
|
117
114
|
|
|
118
115
|
expected_scrubbed_df = pd.DataFrame(
|
|
119
116
|
{
|
|
@@ -141,7 +138,7 @@ def test_dataframe_scrub_methods():
|
|
|
141
138
|
assert_frame_equal(scrubbed_data, expected_scrubbed_data)
|
|
142
139
|
|
|
143
140
|
|
|
144
|
-
def
|
|
141
|
+
def test_dataframe_errors():
|
|
145
142
|
df = pd.DataFrame(
|
|
146
143
|
{
|
|
147
144
|
"ID": [1, 2],
|
|
@@ -156,5 +153,8 @@ def test_dataframe_id_col():
|
|
|
156
153
|
}
|
|
157
154
|
)
|
|
158
155
|
|
|
159
|
-
with pytest.raises(
|
|
156
|
+
with pytest.raises(ValueError):
|
|
160
157
|
IDScrub.dataframe(df=df, id_col="ID_not_present")
|
|
158
|
+
|
|
159
|
+
with pytest.raises(TypeError):
|
|
160
|
+
IDScrub.dataframe(df=1, id_col="ID_not_present")
|
test/test_errors.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from idscrub import IDScrub
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_scrub_input():
|
|
6
|
+
with pytest.raises(TypeError):
|
|
7
|
+
IDScrub(texts=[123])
|
|
8
|
+
with pytest.raises(TypeError):
|
|
9
|
+
IDScrub(texts=[1, 2, 3])
|
|
10
|
+
with pytest.raises(TypeError):
|
|
11
|
+
IDScrub(texts=[1.0, 2.0, 3.0])
|
|
12
|
+
with pytest.raises(TypeError):
|
|
13
|
+
IDScrub(texts="not_a_list")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_scrub_input_text_ids():
|
|
17
|
+
with pytest.raises(ValueError):
|
|
18
|
+
IDScrub(texts=["Hello"], text_ids=[1, 2])
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_replacement_error():
|
|
22
|
+
with pytest.raises(TypeError):
|
|
23
|
+
IDScrub(texts=["Hello"], text_ids=[1], replacement=1)
|
|
24
|
+
with pytest.raises(TypeError):
|
|
25
|
+
IDScrub(texts=["Hello"], text_ids=[1], replacement=1.0)
|
|
26
|
+
with pytest.raises(TypeError):
|
|
27
|
+
IDScrub(texts=["Hello"], text_ids=[1], replacement=["ok"])
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_scrub_pipeline_error(scrub_object):
|
|
31
|
+
with pytest.raises(TypeError):
|
|
32
|
+
scrub_object.scrub(pipeline={"method": "spacy_entities"})
|
test/test_exclude.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from idscrub import IDScrub
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_exclude():
|
|
5
|
+
scrub = IDScrub(
|
|
6
|
+
[
|
|
7
|
+
"Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
|
|
8
|
+
],
|
|
9
|
+
exclude=["Hamish McDonald", "L. Salah"],
|
|
10
|
+
)
|
|
11
|
+
scrubbed = scrub.scrub(
|
|
12
|
+
pipeline=[{"method": "spacy_entities"}],
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
assert scrubbed == [
|
|
16
|
+
"Our names are Hamish McDonald, L. Salah, and [PERSON].",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
assert scrub.idents_all[0].text == "Hamish McDonald"
|
|
20
|
+
assert scrub.idents_all[1].text == "L. Salah"
|
|
21
|
+
|
|
22
|
+
assert [ident.text for ident in scrub.idents] not in ["Hamish McDonald", "L. Salah"]
|
test/test_group.py
ADDED
test/test_huggingface.py
CHANGED
|
@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
|
|
|
6
6
|
|
|
7
7
|
def test_huggingface():
|
|
8
8
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
9
|
-
scrubbed = scrub.huggingface_entities
|
|
9
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "huggingface_entities"}])
|
|
10
10
|
assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
|
|
11
11
|
|
|
12
12
|
|
|
@@ -14,12 +14,12 @@ def test_huggingface_error():
|
|
|
14
14
|
scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
|
|
15
15
|
|
|
16
16
|
with pytest.raises(OSError):
|
|
17
|
-
scrub.
|
|
17
|
+
scrub.scrub(pipeline=[{"method": "huggingface_entities", "hf_model_path": "not_a_model"}])
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def test_huggingface_empty():
|
|
21
21
|
scrub = IDScrub([" ", "John Smith", ""])
|
|
22
|
-
scrubbed = scrub.huggingface_entities
|
|
22
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "huggingface_entities"}])
|
|
23
23
|
|
|
24
24
|
assert scrubbed == [" ", "[PERSON]", ""]
|
|
25
25
|
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
test/test_id.py
CHANGED
|
@@ -2,23 +2,24 @@ from idscrub import IDScrub
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def test_id_ints():
|
|
5
|
-
scrub = IDScrub(texts=["clement_attlee@
|
|
6
|
-
scrub.email_addresses
|
|
5
|
+
scrub = IDScrub(texts=["clement_attlee@testemail.com"] * 10, text_ids=range(100, 110), text_id_name="PM")
|
|
6
|
+
scrub.scrub(pipeline=[{"method": "email_addresses"}])
|
|
7
7
|
assert scrub.get_scrubbed_data()["PM"].min() == 100
|
|
8
8
|
assert scrub.get_scrubbed_data()["PM"].max() == 109
|
|
9
9
|
assert scrub.get_scrubbed_data()["PM"].to_list() == [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
def test_id_strs():
|
|
13
|
-
scrub = IDScrub(texts=["clement_attlee@
|
|
14
|
-
scrub.email_addresses
|
|
13
|
+
scrub = IDScrub(texts=["clement_attlee@testemail.com"] * 2, text_ids=["random", "minister"], text_id_name="PM")
|
|
14
|
+
scrub.scrub(pipeline=[{"method": "email_addresses"}])
|
|
15
15
|
assert scrub.get_scrubbed_data()["PM"][0] == "random"
|
|
16
16
|
assert scrub.get_scrubbed_data()["PM"][1] == "minister"
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def test_multiple():
|
|
20
|
-
scrub = IDScrub(
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
scrub = IDScrub(
|
|
21
|
+
texts=["clement_attlee@testemail.com", "SW1A 2AA"] * 10, text_ids=range(100, 120), text_id_name="PM"
|
|
22
|
+
)
|
|
23
|
+
scrub.scrub(pipeline=[{"method": "email_addresses"}, {"method": "uk_postcodes"}])
|
|
23
24
|
assert scrub.get_scrubbed_data()["PM"].min() == 100
|
|
24
25
|
assert scrub.get_scrubbed_data()["PM"].max() == 119
|
test/test_label.py
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
def test_label(scrub_object_all):
|
|
2
|
-
|
|
3
|
-
[
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
2
|
+
scrub_object_all.scrub(
|
|
3
|
+
pipeline=[
|
|
4
|
+
{"method": "uk_postcodes", "label": "test"},
|
|
5
|
+
{"method": "email_addresses", "label": "test"},
|
|
6
|
+
{"method": "ip_addresses", "label": "test"},
|
|
7
|
+
{"method": "uk_phone_numbers", "label": "test"},
|
|
8
|
+
{"method": "titles", "label": "test"},
|
|
9
|
+
{"method": "handles", "label": "test"},
|
|
10
|
+
]
|
|
11
|
+
)
|
|
7
12
|
|
|
8
13
|
df = scrub_object_all.get_scrubbed_data()
|
|
9
14
|
|
|
@@ -11,7 +16,17 @@ def test_label(scrub_object_all):
|
|
|
11
16
|
|
|
12
17
|
|
|
13
18
|
def test_regex_label(scrub_object_all):
|
|
14
|
-
scrub_object_all.
|
|
19
|
+
scrub_object_all.scrub(
|
|
20
|
+
pipeline=[
|
|
21
|
+
{
|
|
22
|
+
"method": "custom_regex",
|
|
23
|
+
"patterns": {
|
|
24
|
+
"number": {"pattern": r"number", "replacement": "[REDACTED]", "priority": 0.5},
|
|
25
|
+
"live": {"pattern": r"live", "replacement": "[REDACTED]"},
|
|
26
|
+
},
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
)
|
|
15
30
|
df = scrub_object_all.get_scrubbed_data()
|
|
16
31
|
|
|
17
|
-
assert df.columns.to_list() == ["text_id", "
|
|
32
|
+
assert df.columns.to_list() == ["text_id", "number", "live"]
|
test/test_overlap.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from idscrub import IDScrub
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_overlap():
|
|
5
|
+
scrub = IDScrub(texts=["My email is fakeperson@fakeemail.com"])
|
|
6
|
+
scrubbed = scrub.scrub(
|
|
7
|
+
pipeline=[{"method": "handles", "priority": 0.1}, {"method": "email_addresses", "priority": 1.0}]
|
|
8
|
+
)
|
|
9
|
+
assert max([ident.priority for ident in scrub.idents_all]) == 1.0
|
|
10
|
+
assert scrub.idents_all == [
|
|
11
|
+
IDScrub.IDEnt(
|
|
12
|
+
text_id=1,
|
|
13
|
+
text="@fakeemail.com",
|
|
14
|
+
start=22,
|
|
15
|
+
end=36,
|
|
16
|
+
label="handle",
|
|
17
|
+
replacement="[HANDLE]",
|
|
18
|
+
priority=0.1,
|
|
19
|
+
source="regex",
|
|
20
|
+
),
|
|
21
|
+
IDScrub.IDEnt(
|
|
22
|
+
text_id=1,
|
|
23
|
+
text="fakeperson@fakeemail.com",
|
|
24
|
+
start=12,
|
|
25
|
+
end=36,
|
|
26
|
+
label="email_address",
|
|
27
|
+
replacement="[EMAIL_ADDRESS]",
|
|
28
|
+
priority=1.0,
|
|
29
|
+
source="regex",
|
|
30
|
+
),
|
|
31
|
+
]
|
|
32
|
+
assert scrub.idents == [
|
|
33
|
+
IDScrub.IDEnt(
|
|
34
|
+
text_id=1,
|
|
35
|
+
text="fakeperson@fakeemail.com",
|
|
36
|
+
start=12,
|
|
37
|
+
end=36,
|
|
38
|
+
label="email_address",
|
|
39
|
+
replacement="[EMAIL_ADDRESS]",
|
|
40
|
+
priority=1.0,
|
|
41
|
+
source="regex",
|
|
42
|
+
)
|
|
43
|
+
]
|
|
44
|
+
assert scrubbed == ["My email is [EMAIL_ADDRESS]"]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_overlap_default():
|
|
48
|
+
scrub = IDScrub(texts=["I am @John Smith"])
|
|
49
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"]}, {"method": "handles"}])
|
|
50
|
+
assert max([ident.priority for ident in scrub.idents_all]) == 1.0
|
|
51
|
+
assert scrub.idents_all == [
|
|
52
|
+
IDScrub.IDEnt(
|
|
53
|
+
text_id=1,
|
|
54
|
+
text="@John Smith",
|
|
55
|
+
start=5,
|
|
56
|
+
end=16,
|
|
57
|
+
label="person",
|
|
58
|
+
replacement="[PERSON]",
|
|
59
|
+
priority=1.0,
|
|
60
|
+
source="spacy",
|
|
61
|
+
),
|
|
62
|
+
IDScrub.IDEnt(
|
|
63
|
+
text_id=1,
|
|
64
|
+
text="@John",
|
|
65
|
+
start=5,
|
|
66
|
+
end=10,
|
|
67
|
+
label="handle",
|
|
68
|
+
replacement="[HANDLE]",
|
|
69
|
+
priority=0.4,
|
|
70
|
+
source="regex",
|
|
71
|
+
),
|
|
72
|
+
]
|
|
73
|
+
assert scrub.idents == [
|
|
74
|
+
IDScrub.IDEnt(
|
|
75
|
+
text_id=1,
|
|
76
|
+
text="@John Smith",
|
|
77
|
+
start=5,
|
|
78
|
+
end=16,
|
|
79
|
+
label="person",
|
|
80
|
+
replacement="[PERSON]",
|
|
81
|
+
priority=1.0,
|
|
82
|
+
source="spacy",
|
|
83
|
+
)
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
assert scrubbed == ["I am [PERSON]"]
|
test/test_phonenumbers.py
CHANGED
|
@@ -3,11 +3,11 @@ from idscrub import IDScrub
|
|
|
3
3
|
|
|
4
4
|
def test_google_phone_numbers_gb():
|
|
5
5
|
scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01475 123456."])
|
|
6
|
-
scrubbed = scrub.
|
|
6
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "google_phone_numbers"}])
|
|
7
7
|
assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def test_google_phone_numbers_us():
|
|
11
11
|
scrub = IDScrub(texts=["My US phone number is +1-718-222-2222! My old phone number is 12124567890."])
|
|
12
|
-
scrubbed = scrub.
|
|
12
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "google_phone_numbers", "region": "US"}])
|
|
13
13
|
assert scrubbed == ["My US phone number is [PHONENO]! My old phone number is [PHONENO]."]
|
test/test_presidio.py
CHANGED
|
@@ -8,20 +8,27 @@ def test_presidio():
|
|
|
8
8
|
scrub = IDScrub(
|
|
9
9
|
["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
|
|
10
10
|
)
|
|
11
|
-
|
|
11
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities"}])
|
|
12
12
|
|
|
13
|
-
assert
|
|
13
|
+
assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def test_presidio_map():
|
|
17
17
|
scrub = IDScrub(
|
|
18
18
|
["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
|
|
19
19
|
)
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
|
|
21
|
+
scrubbed = scrub.scrub(
|
|
22
|
+
pipeline=[
|
|
23
|
+
{
|
|
24
|
+
"method": "presidio_entities",
|
|
25
|
+
"entity_types": ["PERSON", "IBAN_CODE"],
|
|
26
|
+
"replacement_map": {"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"},
|
|
27
|
+
}
|
|
28
|
+
]
|
|
22
29
|
)
|
|
23
30
|
|
|
24
|
-
assert
|
|
31
|
+
assert scrubbed == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
|
|
25
32
|
|
|
26
33
|
|
|
27
34
|
def test_presidio_get_data():
|
|
@@ -29,7 +36,7 @@ def test_presidio_get_data():
|
|
|
29
36
|
["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
|
|
30
37
|
)
|
|
31
38
|
|
|
32
|
-
scrub.
|
|
39
|
+
scrub.scrub(pipeline=[{"method": "presidio_entities"}])
|
|
33
40
|
|
|
34
41
|
df = scrub.get_scrubbed_data()
|
|
35
42
|
|
|
@@ -42,3 +49,11 @@ def test_presidio_get_data():
|
|
|
42
49
|
)
|
|
43
50
|
|
|
44
51
|
assert_frame_equal(df, expected_df)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_presidio_empty():
|
|
55
|
+
scrub = IDScrub([" ", " John Smith", ""])
|
|
56
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities", "entity_types": ["PERSON"]}])
|
|
57
|
+
|
|
58
|
+
assert scrubbed == [" ", " [PERSON]", ""]
|
|
59
|
+
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
test/test_regex.py
CHANGED
|
@@ -1,25 +1,25 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
1
|
from idscrub import IDScrub
|
|
4
2
|
|
|
5
3
|
|
|
6
4
|
def test_email_addresses():
|
|
7
5
|
scrub = IDScrub(
|
|
8
|
-
texts=[
|
|
6
|
+
texts=[
|
|
7
|
+
"Send me an email at jim@testemail.com or at marie-9999@randomemail.co.uk or at hello_world@john-smith.com."
|
|
8
|
+
]
|
|
9
9
|
)
|
|
10
|
-
scrubbed = scrub.email_addresses
|
|
10
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "email_addresses"}])
|
|
11
11
|
assert scrubbed == ["Send me an email at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS]."]
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def test_ip_addresses():
|
|
15
15
|
scrub = IDScrub(texts=["This has been sent to 8.8.8.8 and requested by 192.0.2.1."])
|
|
16
|
-
scrubbed = scrub.ip_addresses
|
|
16
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "ip_addresses"}])
|
|
17
17
|
assert scrubbed == ["This has been sent to [IPADDRESS] and requested by [IPADDRESS]."]
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def test_uk_postcodes():
|
|
21
21
|
scrub = IDScrub(texts=["I live at A11 1AA. My friend lives at KA308JB. The Prime Minister lives at SW1A 2AA."])
|
|
22
|
-
scrubbed = scrub.uk_postcodes
|
|
22
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "uk_postcodes"}])
|
|
23
23
|
assert scrubbed == ["I live at [POSTCODE]. My friend lives at [POSTCODE]. The Prime Minister lives at [POSTCODE]."]
|
|
24
24
|
|
|
25
25
|
|
|
@@ -30,7 +30,7 @@ def test_titles_not_strict():
|
|
|
30
30
|
"I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
|
|
31
31
|
]
|
|
32
32
|
)
|
|
33
|
-
scrubbed = scrub.titles
|
|
33
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "titles"}])
|
|
34
34
|
assert scrubbed == [
|
|
35
35
|
"Hello [TITLE]. Smith! I am [TITLE] Patel",
|
|
36
36
|
"I am here on behalf of [TITLE] Austen, General Eisenhower, and [TITLE] Jack Sparrow.",
|
|
@@ -44,7 +44,7 @@ def test_titles_strict():
|
|
|
44
44
|
"I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
|
|
45
45
|
]
|
|
46
46
|
)
|
|
47
|
-
scrubbed = scrub.titles
|
|
47
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "titles", "strict": True}])
|
|
48
48
|
assert scrubbed == [
|
|
49
49
|
"Hello [TITLE]. Smith! I am [TITLE] Patel",
|
|
50
50
|
"I am here on behalf of [TITLE] Austen, [TITLE] Eisenhower, and [TITLE] Jack Sparrow.",
|
|
@@ -53,32 +53,52 @@ def test_titles_strict():
|
|
|
53
53
|
|
|
54
54
|
def test_uk_phone_numbers():
|
|
55
55
|
scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01111 123456."])
|
|
56
|
-
scrubbed = scrub.uk_phone_numbers
|
|
56
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "uk_phone_numbers"}])
|
|
57
57
|
assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
def test_handles():
|
|
61
61
|
scrub = IDScrub(texts=["Our usernames are @HenrikLarsson, @Jimmy_Johnstone, @Nakamura-67 and @Aidan_McGeady_46."])
|
|
62
|
-
scrubbed = scrub.handles
|
|
62
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "handles"}])
|
|
63
63
|
assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
|
|
64
64
|
|
|
65
65
|
|
|
66
|
+
def test_urls():
|
|
67
|
+
scrub = IDScrub(
|
|
68
|
+
[
|
|
69
|
+
"www.example.co.uk",
|
|
70
|
+
"https://example.com",
|
|
71
|
+
"http://sub.domain.co.uk/path?query=1&x=2",
|
|
72
|
+
"www.example.org/page/index.html",
|
|
73
|
+
"https://example.com:8080/path/to/resource#anchor",
|
|
74
|
+
"www.test-site123.net/some/path?with=paramsexample.comexample.co.uk/home",
|
|
75
|
+
]
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "urls"}])
|
|
79
|
+
|
|
80
|
+
assert scrubbed == ["[URL]", "[URL]", "[URL]", "[URL]", "[URL]", "[URL]"]
|
|
81
|
+
|
|
82
|
+
|
|
66
83
|
def test_uk_addresses():
|
|
67
84
|
scrub = IDScrub(
|
|
68
85
|
[
|
|
69
86
|
"221B Baker Street",
|
|
70
87
|
"12 high road",
|
|
71
88
|
"Flat 3B, 47 King's Court",
|
|
72
|
-
"12
|
|
89
|
+
"12-14 High Street",
|
|
73
90
|
"5a-7a Church Lane",
|
|
74
91
|
"1/2 Main Street",
|
|
75
92
|
"10 St John’s Rd",
|
|
76
93
|
"33 Queen-Anne Walk",
|
|
77
94
|
"8 Deansgate Ct",
|
|
78
|
-
|
|
95
|
+
"10 Downing Street",
|
|
96
|
+
"10, Downing Street",
|
|
97
|
+
],
|
|
79
98
|
)
|
|
80
99
|
|
|
81
|
-
scrubbed = scrub.uk_addresses
|
|
100
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "uk_addresses"}])
|
|
101
|
+
|
|
82
102
|
assert scrubbed == [
|
|
83
103
|
"[ADDRESS]",
|
|
84
104
|
"[ADDRESS]",
|
|
@@ -89,6 +109,8 @@ def test_uk_addresses():
|
|
|
89
109
|
"[ADDRESS]",
|
|
90
110
|
"[ADDRESS]",
|
|
91
111
|
"[ADDRESS]",
|
|
112
|
+
"[ADDRESS]",
|
|
113
|
+
"[ADDRESS]",
|
|
92
114
|
]
|
|
93
115
|
|
|
94
116
|
negative_tests = [
|
|
@@ -105,65 +127,94 @@ def test_uk_addresses():
|
|
|
105
127
|
|
|
106
128
|
scrub = IDScrub(negative_tests)
|
|
107
129
|
|
|
108
|
-
scrubbed = scrub.uk_addresses
|
|
130
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "uk_addresses"}])
|
|
109
131
|
assert scrubbed == negative_tests
|
|
110
132
|
|
|
111
133
|
|
|
112
|
-
def
|
|
134
|
+
def test_custom_regex():
|
|
135
|
+
scrub = IDScrub(texts=[])
|
|
136
|
+
|
|
137
|
+
scrubbed_idents = scrub.custom_regex(
|
|
138
|
+
texts=["It was the best of times, it was the worst of times"],
|
|
139
|
+
text_ids=["A"],
|
|
140
|
+
patterns={
|
|
141
|
+
"times": {"pattern": r"times", "replacement": "[DICKENS]", "priority": 0.5},
|
|
142
|
+
"worst": {"pattern": r"worst", "replacement": "[WORST]", "priority": 0.8},
|
|
143
|
+
},
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
assert scrubbed_idents == [
|
|
147
|
+
IDScrub.IDEnt(
|
|
148
|
+
text_id="A",
|
|
149
|
+
text="times",
|
|
150
|
+
start=19,
|
|
151
|
+
end=24,
|
|
152
|
+
label="times",
|
|
153
|
+
replacement="[DICKENS]",
|
|
154
|
+
priority=0.5,
|
|
155
|
+
source="custom_regex",
|
|
156
|
+
),
|
|
157
|
+
IDScrub.IDEnt(
|
|
158
|
+
text_id="A",
|
|
159
|
+
text="times",
|
|
160
|
+
start=46,
|
|
161
|
+
end=51,
|
|
162
|
+
label="times",
|
|
163
|
+
replacement="[DICKENS]",
|
|
164
|
+
priority=0.5,
|
|
165
|
+
source="custom_regex",
|
|
166
|
+
),
|
|
167
|
+
IDScrub.IDEnt(
|
|
168
|
+
text_id="A",
|
|
169
|
+
text="worst",
|
|
170
|
+
start=37,
|
|
171
|
+
end=42,
|
|
172
|
+
label="worst",
|
|
173
|
+
replacement="[WORST]",
|
|
174
|
+
priority=0.8,
|
|
175
|
+
source="custom_regex",
|
|
176
|
+
),
|
|
177
|
+
]
|
|
178
|
+
|
|
113
179
|
scrub = IDScrub(
|
|
114
180
|
texts=[
|
|
115
|
-
"
|
|
116
|
-
"Claimant: J Smith Respondents: Jill Hill. J Smith is the respondent.",
|
|
181
|
+
"It was the best of times, it was the worst of times",
|
|
117
182
|
]
|
|
118
183
|
)
|
|
119
|
-
scrubbed = scrub.claimants()
|
|
120
|
-
assert scrubbed == [
|
|
121
|
-
"This is legal text. Claimant: [CLAIMANT] Respondents: Jill Hill.",
|
|
122
|
-
"Claimant: [CLAIMANT] Respondents: Jill Hill. [CLAIMANT] is the respondent.",
|
|
123
|
-
]
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
def test_custom_regex():
|
|
127
|
-
scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
|
|
128
|
-
scrubbed = scrub.custom_regex(custom_regex_patterns=[r"times"])
|
|
129
|
-
assert scrubbed == ["It was the best of [REDACTED], it was the worst of [REDACTED]"]
|
|
130
184
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
185
|
+
scrubbed_text = scrub.scrub(
|
|
186
|
+
pipeline=[
|
|
187
|
+
{
|
|
188
|
+
"method": "custom_regex",
|
|
189
|
+
"patterns": {
|
|
190
|
+
"times": {"pattern": r"times", "replacement": "[DICKENS]", "priority": 0.5},
|
|
191
|
+
"worst": {"pattern": r"worst", "replacement": "[WORST]", "priority": 0.5},
|
|
192
|
+
},
|
|
193
|
+
}
|
|
194
|
+
]
|
|
134
195
|
)
|
|
135
|
-
assert scrubbed == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def test_scrub_and_collect():
|
|
139
|
-
scrub = IDScrub()
|
|
140
|
-
text = "Hello Muhammad and Jack."
|
|
141
|
-
pattern = r"\bMuhammad|Jack\b"
|
|
142
|
-
replacement = "[NAME]"
|
|
143
|
-
label = "custom_regex"
|
|
144
|
-
i = 1
|
|
145
|
-
|
|
146
|
-
def replacer(match):
|
|
147
|
-
return scrub.scrub_and_collect(match, text, replacement, i, label)
|
|
148
196
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
assert scrubbed == "Hello [NAME] and [NAME]."
|
|
152
|
-
assert scrub.scrubbed_data == [
|
|
153
|
-
{"text_id": 1, "custom_regex": "Muhammad"},
|
|
154
|
-
{"text_id": 1, "custom_regex": "Jack"},
|
|
155
|
-
]
|
|
197
|
+
assert scrubbed_text == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
|
|
156
198
|
|
|
157
199
|
|
|
158
200
|
def test_remove_regex():
|
|
159
|
-
|
|
201
|
+
texts = ["Hi! My name is Clement Atlee!"]
|
|
202
|
+
text_ids = ["UK"]
|
|
203
|
+
scrub = IDScrub([])
|
|
160
204
|
label = "regex_names"
|
|
161
205
|
pattern = r"Clement Atlee|Harold Wilson"
|
|
162
|
-
|
|
163
|
-
|
|
206
|
+
replacement = "[PM]"
|
|
207
|
+
priority = 0.5
|
|
208
|
+
idents = scrub.find_regex(
|
|
209
|
+
texts=texts, text_ids=text_ids, pattern=pattern, replacement=replacement, label=label, priority=priority
|
|
210
|
+
)
|
|
164
211
|
|
|
165
|
-
assert
|
|
166
|
-
assert
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
]
|
|
212
|
+
assert len(idents) == 1
|
|
213
|
+
assert idents[0].text_id == "UK"
|
|
214
|
+
assert idents[0].text == "Clement Atlee"
|
|
215
|
+
assert idents[0].start == 15
|
|
216
|
+
assert idents[0].end == 28
|
|
217
|
+
assert idents[0].label == "regex_names"
|
|
218
|
+
assert idents[0].replacement == "[PM]"
|
|
219
|
+
assert idents[0].priority == 0.5
|
|
220
|
+
assert idents[0].source == "regex"
|