idscrub 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
test/test_dataframe.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import pandas as pd
2
+ import pytest
2
3
  from idscrub import IDScrub
3
4
  from pandas.testing import assert_frame_equal
4
5
 
@@ -39,13 +40,121 @@ def test_dataframe_outputs():
39
40
  {
40
41
  "ID": [1, 2, 1, 2],
41
42
  "column": ["Pride and Prejudice", "Pride and Prejudice", "Fake book", "Fake book"],
42
- "scrubbed_presidio_person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
43
- "scrubbed_titles": [["Mr"], ["Mr"], None, None],
44
- "scrubbed_presidio_email_address": [None, None, ["freddie-mercury@queen.com"], None],
45
- "scrubbed_presidio_url": [None, None, ["queen.com"], None],
46
- "scrubbed_uk_postcodes": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
43
+ "person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
44
+ "title": [["Mr"], ["Mr"], None, None],
45
+ "email_address": [None, None, ["freddie-mercury@queen.com"], None],
46
+ "url": [None, None, ["queen.com"], None],
47
+ "uk_postcode": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
47
48
  }
48
49
  )
49
50
 
50
51
  assert_frame_equal(scrubbed_df, expected_scrubbed_df)
51
52
  assert_frame_equal(scrubbed_data, expected_scrubbed_data)
53
+
54
+
55
+ def test_dataframe_exclude():
56
+ df = pd.DataFrame(
57
+ {
58
+ "ID": [1, 2],
59
+ "Pride and Prejudice": [
60
+ "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
61
+ "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
62
+ ],
63
+ "Fake book": [
64
+ "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
65
+ "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
66
+ ],
67
+ }
68
+ )
69
+
70
+ scrubbed_df, scrubbed_data = IDScrub.dataframe(
71
+ df=df, id_col="ID", exclude_cols=["Fake book"], scrub_methods=["all"]
72
+ )
73
+
74
+ expected_scrubbed_df = pd.DataFrame(
75
+ {
76
+ "ID": [1, 2],
77
+ "Pride and Prejudice": [
78
+ "[TITLE]. [PERSON] walked off; and [PERSON] remained with no very cordial feelings toward him.",
79
+ "[TITLE]. [PERSON] was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
80
+ ],
81
+ "Fake book": [
82
+ "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
83
+ "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
84
+ ],
85
+ }
86
+ )
87
+
88
+ expected_scrubbed_data = pd.DataFrame(
89
+ {
90
+ "ID": [1, 2],
91
+ "column": ["Pride and Prejudice", "Pride and Prejudice"],
92
+ "person": [["Darcy", "Elizabeth"], ["Bennet"]],
93
+ "title": [["Mr"], ["Mr"]],
94
+ }
95
+ )
96
+
97
+ assert_frame_equal(scrubbed_df, expected_scrubbed_df)
98
+ assert_frame_equal(scrubbed_data, expected_scrubbed_data)
99
+
100
+
101
+ def test_dataframe_scrub_methods():
102
+ df = pd.DataFrame(
103
+ {
104
+ "ID": [1, 2],
105
+ "Pride and Prejudice": [
106
+ "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
107
+ "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
108
+ ],
109
+ "Fake book": [
110
+ "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
111
+ "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
112
+ ],
113
+ }
114
+ )
115
+
116
+ scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["titles"])
117
+
118
+ expected_scrubbed_df = pd.DataFrame(
119
+ {
120
+ "ID": [1, 2],
121
+ "Pride and Prejudice": [
122
+ "[TITLE]. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
123
+ "[TITLE]. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
124
+ ],
125
+ "Fake book": [
126
+ "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
127
+ "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
128
+ ],
129
+ }
130
+ )
131
+
132
+ expected_scrubbed_data = pd.DataFrame(
133
+ {
134
+ "ID": [1, 2],
135
+ "column": ["Pride and Prejudice", "Pride and Prejudice"],
136
+ "title": [["Mr"], ["Mr"]],
137
+ }
138
+ )
139
+
140
+ assert_frame_equal(scrubbed_df, expected_scrubbed_df)
141
+ assert_frame_equal(scrubbed_data, expected_scrubbed_data)
142
+
143
+
144
+ def test_dataframe_id_col():
145
+ df = pd.DataFrame(
146
+ {
147
+ "ID": [1, 2],
148
+ "Pride and Prejudice": [
149
+ "Mr. Darcy walked off; and Elizabeth remained with no very cordial feelings toward him.",
150
+ "Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, reserve, and caprice.",
151
+ ],
152
+ "Fake book": [
153
+ "The letter to freddie-mercury@queen.com was stamped with SW1A 2AA.",
154
+ "She forwarded the memo from Mick Jagger and David Bowie to her chief of staff, noting the postcode SW1A 2WH.",
155
+ ],
156
+ }
157
+ )
158
+
159
+ with pytest.raises(AssertionError):
160
+ IDScrub.dataframe(df=df, id_col="ID_not_present")
test/test_huggingface.py CHANGED
@@ -22,4 +22,4 @@ def test_huggingface_empty():
22
22
  scrubbed = scrub.huggingface_persons()
23
23
 
24
24
  assert scrubbed == [" ", "[PERSON]", ""]
25
- assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_hf_person": [["John Smith"]]}))
25
+ assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
test/test_label.py ADDED
@@ -0,0 +1,17 @@
1
+ def test_label(scrub_object_all):
2
+ for i, scrub_method in enumerate(
3
+ ["spacy_persons", "uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
4
+ ):
5
+ method = getattr(scrub_object_all, scrub_method)
6
+ method(label="test")
7
+
8
+ df = scrub_object_all.get_scrubbed_data()
9
+
10
+ assert df.columns.to_list() == ["text_id", "test"]
11
+
12
+
13
+ def test_regex_label(scrub_object_all):
14
+ scrub_object_all.custom_regex(custom_regex_patterns=[r"number", r"live"], labels=["regex_number", "regex_live"])
15
+ df = scrub_object_all.get_scrubbed_data()
16
+
17
+ assert df.columns.to_list() == ["text_id", "regex_number", "regex_live"]
test/test_log.py CHANGED
@@ -4,14 +4,14 @@ from idscrub import IDScrub
4
4
  def test_log_message():
5
5
  scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
6
6
  scrub.titles()
7
- count = scrub.log_message("scrubbed_titles")
7
+ count = scrub.log_message("title")
8
8
  assert count == 3
9
9
 
10
10
 
11
11
  def test_log_message_custom_regex():
12
12
  scrub = IDScrub(texts=["My name is Dr Strangelove. Dr. Strangelove is my name", "My name is Professor Oppenheimer"])
13
13
  scrub.custom_regex([r"Strangelove", r"Oppenheimer"], ["[DR]", "[PROFESSOR]"])
14
- count_1 = scrub.log_message("scrubbed_custom_regex_1")
15
- count_2 = scrub.log_message("scrubbed_custom_regex_2")
14
+ count_1 = scrub.log_message("custom_regex_1")
15
+ count_2 = scrub.log_message("custom_regex_2")
16
16
  assert count_1 == 2
17
17
  assert count_2 == 1
test/test_persidio.py CHANGED
@@ -36,8 +36,8 @@ def test_persidio_get_data():
36
36
  expected_df = pd.DataFrame(
37
37
  {
38
38
  "text_id": {0: 1, 1: 2},
39
- "scrubbed_presidio_person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
40
- "scrubbed_presidio_iban_code": {0: None, 1: ["GB91BKEN10000041610008"]},
39
+ "person": {0: ["Hamish McDonald", "L. Salah", "Elena Suárez"], 1: None},
40
+ "iban_code": {0: None, 1: ["GB91BKEN10000041610008"]},
41
41
  }
42
42
  )
43
43
 
test/test_regex.py CHANGED
@@ -94,30 +94,30 @@ def test_scrub_and_collect():
94
94
  text = "Hello Muhammad and Jack."
95
95
  pattern = r"\bMuhammad|Jack\b"
96
96
  replacement = "[NAME]"
97
- removed_label = "scrubbed_custom_regex"
97
+ label = "custom_regex"
98
98
  i = 1
99
99
 
100
100
  def replacer(match):
101
- return scrub.scrub_and_collect(match, text, replacement, i, removed_label)
101
+ return scrub.scrub_and_collect(match, text, replacement, i, label)
102
102
 
103
103
  scrubbed = re.sub(pattern, replacer, text)
104
104
 
105
105
  assert scrubbed == "Hello [NAME] and [NAME]."
106
106
  assert scrub.scrubbed_data == [
107
- {"text_id": 1, "scrubbed_custom_regex": "Muhammad"},
108
- {"text_id": 1, "scrubbed_custom_regex": "Jack"},
107
+ {"text_id": 1, "custom_regex": "Muhammad"},
108
+ {"text_id": 1, "custom_regex": "Jack"},
109
109
  ]
110
110
 
111
111
 
112
112
  def test_remove_regex():
113
113
  scrub = IDScrub(texts=["Hi! My name is Clement Atlee!", "I am Harold Wilson."])
114
- removed_label = "scrubbed_regex_names"
114
+ label = "regex_names"
115
115
  pattern = r"Clement Atlee|Harold Wilson"
116
116
  replacement_text = "[PM]"
117
- scrubbed = scrub.scrub_regex(pattern, replacement_text, removed_label)
117
+ scrubbed = scrub.scrub_regex(pattern, replacement_text, label)
118
118
 
119
119
  assert scrubbed == ["Hi! My name is [PM]!", "I am [PM]."]
120
120
  assert scrub.scrubbed_data == [
121
- {"text_id": 1, "scrubbed_regex_names": "Clement Atlee"},
122
- {"text_id": 2, "scrubbed_regex_names": "Harold Wilson"},
121
+ {"text_id": 1, "regex_names": "Clement Atlee"},
122
+ {"text_id": 2, "regex_names": "Harold Wilson"},
123
123
  ]
test/test_scrub.py CHANGED
@@ -30,7 +30,7 @@ def test_scrub_get_scrubbed_data(scrub_object):
30
30
  expected_df = pd.DataFrame(
31
31
  {
32
32
  "text_id": {0: 2},
33
- "scrubbed_uk_postcodes": {0: ["AA11 1AA"]},
33
+ "uk_postcode": {0: ["AA11 1AA"]},
34
34
  }
35
35
  )
36
36
 
@@ -42,7 +42,7 @@ def test_scrub_order(scrub_object):
42
42
 
43
43
  assert scrub_object.get_scrubbed_data().columns.to_list() == [
44
44
  "text_id",
45
- "scrubbed_uk_postcodes",
46
- "scrubbed_uk_phone_numbers",
47
- "scrubbed_spacy_person",
45
+ "uk_postcode",
46
+ "uk_phone_number",
47
+ "person",
48
48
  ]
test/test_spacy.py CHANGED
@@ -23,6 +23,4 @@ def test_spacy_empty():
23
23
  scrubbed = scrub.spacy_persons()
24
24
 
25
25
  assert scrubbed == [" ", "[PERSON]", ""]
26
- assert_frame_equal(
27
- scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "scrubbed_spacy_person": [["John Smith"]]})
28
- )
26
+ assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
@@ -1,21 +0,0 @@
1
- idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
2
- idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
3
- idscrub/scrub.py,sha256=K4Sw4DxKhYJnnu_vpRhUcqj-AbeGr8SwDB0XrDLEciM,34940
4
- idscrub-0.2.2.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
5
- notebooks/basic_usage.ipynb,sha256=eQFU3mOyRXbCwFz3jVUKCxWRtIP5Jptny8fj-KYoBwA,39784
6
- test/conftest.py,sha256=ph1S3LMvzlzvOsb3l2YhpyHSdmg4uV7p61ge_JVCGv0,267
7
- test/test_all.py,sha256=z6v9O2Ts9dWITlhvZwRMyKUZsO7ncaT3znqqBCKJ6Wc,1141
8
- test/test_chain.py,sha256=YFGqO0xUzZ69x-iNCdKEiH-OWWZfyYYFgmEq0urELEs,1883
9
- test/test_dataframe.py,sha256=6k3iu69X9H-pLA2gm3fvwvFTSj_efJe0GomFiR3LPac,2284
10
- test/test_huggingface.py,sha256=pKL3yD1z8JxNaXwqkww0IrKEq84-J16vi-URC0kC9p8,848
11
- test/test_id.py,sha256=TPsvz4Kw1z_Fiek2BV79Hc2q3N37xU3oQra6Y7Ke11Q,989
12
- test/test_log.py,sha256=qKVZAzcaVllKepM-vgCWqqY9f8GyNxO7V0sa1WD0tsA,673
13
- test/test_persidio.py,sha256=NSX5gzhhBX5l9GTXwPK4wjMzcp6wmAfWJYQo45UMVIc,1594
14
- test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
15
- test/test_regex.py,sha256=EQGx3PHwJJzIdy6xwR8gEsSRDtlWHR-U81EPI811eZA,4474
16
- test/test_scrub.py,sha256=pohmw3frtlkmZDMvOEbmvVJgtcVdFlEDL3TxR5-y-0Q,1422
17
- test/test_spacy.py,sha256=mrUGUulvzDGgQRttdG0tgL2sGBRmYfg1fDNp7SFq8as,961
18
- idscrub-0.2.2.dist-info/METADATA,sha256=IHoFTVY6cJARkeeKoQlpunA7Nboc4y32bpSoS-IgSoM,5352
19
- idscrub-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- idscrub-0.2.2.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
21
- idscrub-0.2.2.dist-info/RECORD,,