idscrub 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idscrub/scrub.py +50 -5
- {idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/METADATA +9 -9
- {idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/RECORD +8 -7
- notebooks/basic_usage.ipynb +173 -91
- test/test_scrub.py +48 -0
- {idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/WHEEL +0 -0
- {idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {idscrub-0.1.1.dist-info → idscrub-0.2.0.dist-info}/top_level.txt +0 -0
idscrub/scrub.py
CHANGED
|
@@ -19,7 +19,7 @@ from tqdm import tqdm
|
|
|
19
19
|
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
|
|
20
20
|
from transformers.utils import logging as trf_logging
|
|
21
21
|
|
|
22
|
-
from idscrub.locations import DOWNLOAD_DIR
|
|
22
|
+
from idscrub.locations import DOWNLOAD_DIR
|
|
23
23
|
|
|
24
24
|
# Suppress Torch FutureWarning
|
|
25
25
|
# TODO: Find better way
|
|
@@ -879,10 +879,46 @@ class IDScrub:
|
|
|
879
879
|
|
|
880
880
|
return scrub_methods.get(scrub_method, lambda: "Unknown method.")()
|
|
881
881
|
|
|
882
|
+
def scrub(self, scrub_methods: list[str] = ["all"]) -> list[str]:
|
|
883
|
+
"""
|
|
884
|
+
Scrubs text using given methods (in order).
|
|
885
|
+
Uses default values for the given scrub method.
|
|
886
|
+
|
|
887
|
+
Methods available (see associated method docstring for further information):
|
|
888
|
+
|
|
889
|
+
"all", "spacy_persons", "huggingface_persons", "email_addresses", "handles",
|
|
890
|
+
"ip_addresses", "uk_phone_numbers", "google_phone_numbers", "uk_postcodes"
|
|
891
|
+
"titles", "presidio"
|
|
892
|
+
|
|
893
|
+
Example:
|
|
894
|
+
|
|
895
|
+
"email_addresses" = scrub.email_addresses()
|
|
896
|
+
|
|
897
|
+
Therefore we can call:
|
|
898
|
+
|
|
899
|
+
IDScrub.scrub(scrub_methods = ["email_addresses"])
|
|
900
|
+
|
|
901
|
+
Args:
|
|
902
|
+
scrub_methods (list[str]): Names of the scrub methods to apply, in order.
|
|
903
|
+
|
|
904
|
+
Returns:
|
|
905
|
+
list[str]: The input list of text with personal information replaced.
|
|
906
|
+
|
|
907
|
+
"""
|
|
908
|
+
|
|
909
|
+
for i, scrub_method in enumerate(scrub_methods):
|
|
910
|
+
if i == len(scrub_methods) - 1:
|
|
911
|
+
self.call_scrub_method(scrub_method)
|
|
912
|
+
else:
|
|
913
|
+
self.call_scrub_method(scrub_method)
|
|
914
|
+
|
|
915
|
+
return self.cleaned_texts
|
|
916
|
+
|
|
882
917
|
@staticmethod
|
|
883
918
|
def dataframe(
|
|
884
919
|
df: pd.DataFrame = None,
|
|
885
920
|
id_col: str = None,
|
|
921
|
+
exclude_cols: list[str] = None,
|
|
886
922
|
scrub_methods: list[str] = ["all"],
|
|
887
923
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
888
924
|
"""
|
|
@@ -891,6 +927,7 @@ class IDScrub:
|
|
|
891
927
|
Args:
|
|
892
928
|
df (pd.DataFrame): A Pandas dataframe to scrub.
|
|
893
929
|
id_col (str): Name of the ID column in `df`. If None, an integer index starting at 1 with the name `id` is applied.
|
|
930
|
+
exclude_cols (list[str]): Columns to exclude from scrubbing. If None, all columns except `id_col` are scrubbed.
|
|
894
931
|
scrub_methods (list[str]): Which scrub methods to apply to the DataFrame (in order).
|
|
895
932
|
These are string versions of the existing methods e.g. "all" == scrub.all() and "email_addresses" == scrub.email_addresses().
|
|
896
933
|
|
|
@@ -899,6 +936,8 @@ class IDScrub:
|
|
|
899
936
|
|
|
900
937
|
"""
|
|
901
938
|
|
|
939
|
+
assert id_col in df.columns, "`id_col` is not a column in `df`. Please check."
|
|
940
|
+
|
|
902
941
|
if id_col:
|
|
903
942
|
ids = df[id_col].to_list()
|
|
904
943
|
if not id_col:
|
|
@@ -908,14 +947,18 @@ class IDScrub:
|
|
|
908
947
|
assert isinstance(df, pd.DataFrame), "`df` must be a Pandas DataFrame."
|
|
909
948
|
assert len(df) == len(ids), "Length of dataframe is different to the length of IDs."
|
|
910
949
|
|
|
950
|
+
if exclude_cols is None:
|
|
951
|
+
cols_to_scrub = df.columns.to_list()
|
|
952
|
+
else:
|
|
953
|
+
cols_to_scrub = [col for col in df.columns if col not in exclude_cols]
|
|
954
|
+
|
|
955
|
+
cols_to_scrub.remove(id_col)
|
|
956
|
+
|
|
911
957
|
scrubbed_df = df.copy()
|
|
912
958
|
|
|
913
959
|
all_scrubbed_data = []
|
|
914
960
|
|
|
915
|
-
for col in tqdm(
|
|
916
|
-
if col == id_col:
|
|
917
|
-
continue
|
|
918
|
-
|
|
961
|
+
for col in tqdm(cols_to_scrub):
|
|
919
962
|
original_dtype = scrubbed_df[col].dtype
|
|
920
963
|
scrubbed_df[col] = scrubbed_df[col].astype(str)
|
|
921
964
|
|
|
@@ -944,4 +987,6 @@ class IDScrub:
|
|
|
944
987
|
all_scrubbed_data = pd.concat(all_scrubbed_data).reset_index(drop=True)
|
|
945
988
|
all_scrubbed_data = all_scrubbed_data.where(pd.notna(all_scrubbed_data), None)
|
|
946
989
|
|
|
990
|
+
assert df.shape == scrubbed_df.shape, "Original and scrubbed dataframe not the same shape. Check."
|
|
991
|
+
|
|
947
992
|
return scrubbed_df, all_scrubbed_data
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: idscrub
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Author: Department for Business and Trade
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -20,7 +20,7 @@ Dynamic: license-file
|
|
|
20
20
|
|
|
21
21
|
# idscrub 🧽✨
|
|
22
22
|
|
|
23
|
-
## Project
|
|
23
|
+
## Project Information
|
|
24
24
|
|
|
25
25
|
* This package removes (*✨scrubs✨*) identifying personal data from text using [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) and [named-entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
|
|
26
26
|
|
|
@@ -84,15 +84,15 @@ Dynamic: license-file
|
|
|
84
84
|
|
|
85
85
|
## Installation
|
|
86
86
|
|
|
87
|
-
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example
|
|
87
|
+
`idscrub` can be installed using `pip` into a Python **>=3.12** environment. Example:
|
|
88
88
|
|
|
89
89
|
```console
|
|
90
|
-
pip install
|
|
90
|
+
pip install idscrub
|
|
91
91
|
```
|
|
92
|
-
or
|
|
92
|
+
or with the spaCy transformer model (`en_core_web_trf`) already installed:
|
|
93
93
|
|
|
94
94
|
```console
|
|
95
|
-
pip
|
|
95
|
+
pip install idscrub[trf]
|
|
96
96
|
```
|
|
97
97
|
|
|
98
98
|
## How to use the code
|
|
@@ -102,12 +102,12 @@ Basic usage example (see `notebooks/basic_usage.ipynb` for further examples):
|
|
|
102
102
|
```python
|
|
103
103
|
from idscrub import IDScrub
|
|
104
104
|
|
|
105
|
-
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA
|
|
106
|
-
scrubbed_texts = scrub.
|
|
105
|
+
scrub = IDScrub(['Our names are Hamish McDonald, L. Salah, and Elena Suárez.', 'My number is +441111111111 and I live at AA11 1AA.'])
|
|
106
|
+
scrubbed_texts = scrub.scrub(scrub_methods=['spacy_persons', 'uk_phone_numbers', 'uk_postcodes'])
|
|
107
107
|
|
|
108
108
|
print(scrubbed_texts)
|
|
109
109
|
|
|
110
|
-
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE]
|
|
110
|
+
# Output: ['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE].']
|
|
111
111
|
```
|
|
112
112
|
|
|
113
113
|
## AI Declaration
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
|
|
2
2
|
idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
|
|
3
|
-
idscrub/scrub.py,sha256=
|
|
4
|
-
idscrub-0.
|
|
5
|
-
notebooks/basic_usage.ipynb,sha256=
|
|
3
|
+
idscrub/scrub.py,sha256=cYA76efkbR6rjHvl9yejtwmJ6MV8qx7_V4Azk4sWhjA,35073
|
|
4
|
+
idscrub-0.2.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
|
|
5
|
+
notebooks/basic_usage.ipynb,sha256=2fQdapXAFb79ZTcMfveqSC4TMNrsvqDpvF15rw3LUvM,39798
|
|
6
6
|
test/conftest.py,sha256=ph1S3LMvzlzvOsb3l2YhpyHSdmg4uV7p61ge_JVCGv0,267
|
|
7
7
|
test/test_all.py,sha256=z6v9O2Ts9dWITlhvZwRMyKUZsO7ncaT3znqqBCKJ6Wc,1141
|
|
8
8
|
test/test_chain.py,sha256=YFGqO0xUzZ69x-iNCdKEiH-OWWZfyYYFgmEq0urELEs,1883
|
|
@@ -13,8 +13,9 @@ test/test_log.py,sha256=qKVZAzcaVllKepM-vgCWqqY9f8GyNxO7V0sa1WD0tsA,673
|
|
|
13
13
|
test/test_persidio.py,sha256=NSX5gzhhBX5l9GTXwPK4wjMzcp6wmAfWJYQo45UMVIc,1594
|
|
14
14
|
test/test_phonenumbers.py,sha256=hZsXgwhn5R-7426TTWwCH9gWQwhyHtjLUstN10jnX6c,607
|
|
15
15
|
test/test_regex.py,sha256=EQGx3PHwJJzIdy6xwR8gEsSRDtlWHR-U81EPI811eZA,4474
|
|
16
|
+
test/test_scrub.py,sha256=pohmw3frtlkmZDMvOEbmvVJgtcVdFlEDL3TxR5-y-0Q,1422
|
|
16
17
|
test/test_spacy.py,sha256=mrUGUulvzDGgQRttdG0tgL2sGBRmYfg1fDNp7SFq8as,961
|
|
17
|
-
idscrub-0.
|
|
18
|
-
idscrub-0.
|
|
19
|
-
idscrub-0.
|
|
20
|
-
idscrub-0.
|
|
18
|
+
idscrub-0.2.0.dist-info/METADATA,sha256=2NERZcMHsGbnotclunZ-0ZgZaCMAN39j9s_zswp1bXQ,6101
|
|
19
|
+
idscrub-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
+
idscrub-0.2.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
|
|
21
|
+
idscrub-0.2.0.dist-info/RECORD,,
|
notebooks/basic_usage.ipynb
CHANGED
|
@@ -9,22 +9,139 @@
|
|
|
9
9
|
},
|
|
10
10
|
{
|
|
11
11
|
"cell_type": "code",
|
|
12
|
-
"execution_count":
|
|
12
|
+
"execution_count": 12,
|
|
13
|
+
"metadata": {},
|
|
14
|
+
"outputs": [
|
|
15
|
+
{
|
|
16
|
+
"name": "stderr",
|
|
17
|
+
"output_type": "stream",
|
|
18
|
+
"text": [
|
|
19
|
+
"INFO: Texts loaded.\n",
|
|
20
|
+
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
21
|
+
"100%|██████████| 2/2 [00:00<00:00, 44.29it/s]\n",
|
|
22
|
+
"INFO: 3 spacy person scrubbed.\n",
|
|
23
|
+
"INFO: Scrubbing phone numbers using regex...\n",
|
|
24
|
+
"INFO: 1 uk phone numbers scrubbed.\n",
|
|
25
|
+
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
26
|
+
"INFO: 1 uk postcodes scrubbed.\n"
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "stdout",
|
|
31
|
+
"output_type": "stream",
|
|
32
|
+
"text": [
|
|
33
|
+
"['Our names are [PERSON], [PERSON], and [PERSON].', 'My number is [PHONENO] and I live at [POSTCODE], Lapland.']\n"
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
],
|
|
37
|
+
"source": [
|
|
38
|
+
"from idscrub import IDScrub\n",
|
|
39
|
+
"\n",
|
|
40
|
+
"scrub = IDScrub(\n",
|
|
41
|
+
" [\n",
|
|
42
|
+
" \"Our names are Hamish McDonald, L. Salah, and Elena Suárez.\",\n",
|
|
43
|
+
" \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
|
|
44
|
+
" ]\n",
|
|
45
|
+
")\n",
|
|
46
|
+
"\n",
|
|
47
|
+
"scrubbed_texts = scrub.scrub(scrub_methods=[\"spacy_persons\", \"uk_phone_numbers\", \"uk_postcodes\"])\n",
|
|
48
|
+
"\n",
|
|
49
|
+
"print(scrubbed_texts)"
|
|
50
|
+
]
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"cell_type": "code",
|
|
54
|
+
"execution_count": 13,
|
|
55
|
+
"metadata": {},
|
|
56
|
+
"outputs": [
|
|
57
|
+
{
|
|
58
|
+
"data": {
|
|
59
|
+
"text/html": [
|
|
60
|
+
"<div>\n",
|
|
61
|
+
"<style scoped>\n",
|
|
62
|
+
" .dataframe tbody tr th:only-of-type {\n",
|
|
63
|
+
" vertical-align: middle;\n",
|
|
64
|
+
" }\n",
|
|
65
|
+
"\n",
|
|
66
|
+
" .dataframe tbody tr th {\n",
|
|
67
|
+
" vertical-align: top;\n",
|
|
68
|
+
" }\n",
|
|
69
|
+
"\n",
|
|
70
|
+
" .dataframe thead th {\n",
|
|
71
|
+
" text-align: right;\n",
|
|
72
|
+
" }\n",
|
|
73
|
+
"</style>\n",
|
|
74
|
+
"<table border=\"1\" class=\"dataframe\">\n",
|
|
75
|
+
" <thead>\n",
|
|
76
|
+
" <tr style=\"text-align: right;\">\n",
|
|
77
|
+
" <th></th>\n",
|
|
78
|
+
" <th>text_id</th>\n",
|
|
79
|
+
" <th>scrubbed_spacy_person</th>\n",
|
|
80
|
+
" <th>scrubbed_uk_phone_numbers</th>\n",
|
|
81
|
+
" <th>scrubbed_uk_postcodes</th>\n",
|
|
82
|
+
" </tr>\n",
|
|
83
|
+
" </thead>\n",
|
|
84
|
+
" <tbody>\n",
|
|
85
|
+
" <tr>\n",
|
|
86
|
+
" <th>0</th>\n",
|
|
87
|
+
" <td>1</td>\n",
|
|
88
|
+
" <td>[Hamish McDonald, L. Salah, Elena Suárez]</td>\n",
|
|
89
|
+
" <td>None</td>\n",
|
|
90
|
+
" <td>None</td>\n",
|
|
91
|
+
" </tr>\n",
|
|
92
|
+
" <tr>\n",
|
|
93
|
+
" <th>1</th>\n",
|
|
94
|
+
" <td>2</td>\n",
|
|
95
|
+
" <td>None</td>\n",
|
|
96
|
+
" <td>[+441111111111]</td>\n",
|
|
97
|
+
" <td>[AA11 1AA]</td>\n",
|
|
98
|
+
" </tr>\n",
|
|
99
|
+
" </tbody>\n",
|
|
100
|
+
"</table>\n",
|
|
101
|
+
"</div>"
|
|
102
|
+
],
|
|
103
|
+
"text/plain": [
|
|
104
|
+
" text_id scrubbed_spacy_person \\\n",
|
|
105
|
+
"0 1 [Hamish McDonald, L. Salah, Elena Suárez] \n",
|
|
106
|
+
"1 2 None \n",
|
|
107
|
+
"\n",
|
|
108
|
+
" scrubbed_uk_phone_numbers scrubbed_uk_postcodes \n",
|
|
109
|
+
"0 None None \n",
|
|
110
|
+
"1 [+441111111111] [AA11 1AA] "
|
|
111
|
+
]
|
|
112
|
+
},
|
|
113
|
+
"execution_count": 13,
|
|
114
|
+
"metadata": {},
|
|
115
|
+
"output_type": "execute_result"
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"source": [
|
|
119
|
+
"scrub.get_scrubbed_data()"
|
|
120
|
+
]
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
"cell_type": "markdown",
|
|
124
|
+
"metadata": {},
|
|
125
|
+
"source": [
|
|
126
|
+
"Or scrub `all`:"
|
|
127
|
+
]
|
|
128
|
+
},
|
|
129
|
+
{
|
|
130
|
+
"cell_type": "code",
|
|
131
|
+
"execution_count": 14,
|
|
13
132
|
"metadata": {},
|
|
14
133
|
"outputs": [
|
|
15
134
|
{
|
|
16
135
|
"name": "stderr",
|
|
17
136
|
"output_type": "stream",
|
|
18
137
|
"text": [
|
|
19
|
-
"/Users/euansoutter/Documents/code/idscrub/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
20
|
-
" from .autonotebook import tqdm as notebook_tqdm\n",
|
|
21
138
|
"INFO: Texts loaded.\n",
|
|
22
139
|
"INFO: Scrubbing using Presidio...\n",
|
|
23
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
140
|
+
"100%|██████████| 2/2 [00:00<00:00, 25.19it/s]\n",
|
|
24
141
|
"INFO: 3 presidio person scrubbed.\n",
|
|
25
142
|
"INFO: 1 presidio location scrubbed.\n",
|
|
26
143
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
27
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
144
|
+
"100%|██████████| 2/2 [00:00<00:00, 48.66it/s]\n",
|
|
28
145
|
"INFO: 0 spacy person scrubbed.\n",
|
|
29
146
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
30
147
|
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
@@ -59,14 +176,15 @@
|
|
|
59
176
|
" \"My number is +441111111111 and I live at AA11 1AA, Lapland.\",\n",
|
|
60
177
|
" ]\n",
|
|
61
178
|
")\n",
|
|
62
|
-
"
|
|
179
|
+
"\n",
|
|
180
|
+
"scrubbed_texts = scrub.scrub(scrub_methods=[\"all\"])\n",
|
|
63
181
|
"\n",
|
|
64
182
|
"print(scrubbed_texts)"
|
|
65
183
|
]
|
|
66
184
|
},
|
|
67
185
|
{
|
|
68
186
|
"cell_type": "code",
|
|
69
|
-
"execution_count":
|
|
187
|
+
"execution_count": 15,
|
|
70
188
|
"metadata": {},
|
|
71
189
|
"outputs": [
|
|
72
190
|
{
|
|
@@ -128,7 +246,7 @@
|
|
|
128
246
|
"1 [Lapland] [+441111111111] [AA11 1AA] "
|
|
129
247
|
]
|
|
130
248
|
},
|
|
131
|
-
"execution_count":
|
|
249
|
+
"execution_count": 15,
|
|
132
250
|
"metadata": {},
|
|
133
251
|
"output_type": "execute_result"
|
|
134
252
|
}
|
|
@@ -146,7 +264,7 @@
|
|
|
146
264
|
},
|
|
147
265
|
{
|
|
148
266
|
"cell_type": "code",
|
|
149
|
-
"execution_count":
|
|
267
|
+
"execution_count": 16,
|
|
150
268
|
"metadata": {},
|
|
151
269
|
"outputs": [
|
|
152
270
|
{
|
|
@@ -155,7 +273,7 @@
|
|
|
155
273
|
"text": [
|
|
156
274
|
"INFO: Texts loaded.\n",
|
|
157
275
|
"INFO: Scrubbing using Presidio...\n",
|
|
158
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
276
|
+
"100%|██████████| 2/2 [00:00<00:00, 23.03it/s]\n",
|
|
159
277
|
"INFO: 3 presidio person scrubbed.\n",
|
|
160
278
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
161
279
|
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
@@ -206,7 +324,7 @@
|
|
|
206
324
|
},
|
|
207
325
|
{
|
|
208
326
|
"cell_type": "code",
|
|
209
|
-
"execution_count":
|
|
327
|
+
"execution_count": 17,
|
|
210
328
|
"metadata": {},
|
|
211
329
|
"outputs": [
|
|
212
330
|
{
|
|
@@ -271,7 +389,7 @@
|
|
|
271
389
|
"1 [ACHILLES] [+441111111111] [AA11 1AA] "
|
|
272
390
|
]
|
|
273
391
|
},
|
|
274
|
-
"execution_count":
|
|
392
|
+
"execution_count": 17,
|
|
275
393
|
"metadata": {},
|
|
276
394
|
"output_type": "execute_result"
|
|
277
395
|
}
|
|
@@ -290,7 +408,7 @@
|
|
|
290
408
|
},
|
|
291
409
|
{
|
|
292
410
|
"cell_type": "code",
|
|
293
|
-
"execution_count":
|
|
411
|
+
"execution_count": 18,
|
|
294
412
|
"metadata": {},
|
|
295
413
|
"outputs": [
|
|
296
414
|
{
|
|
@@ -299,7 +417,7 @@
|
|
|
299
417
|
"text": [
|
|
300
418
|
"INFO: Texts loaded.\n",
|
|
301
419
|
"INFO: Scrubbing using Presidio...\n",
|
|
302
|
-
"100%|██████████| 2/2 [00:00<00:00,
|
|
420
|
+
"100%|██████████| 2/2 [00:00<00:00, 23.38it/s]\n",
|
|
303
421
|
"INFO: 3 presidio person scrubbed.\n",
|
|
304
422
|
"INFO: 1 presidio iban code scrubbed.\n"
|
|
305
423
|
]
|
|
@@ -325,7 +443,7 @@
|
|
|
325
443
|
},
|
|
326
444
|
{
|
|
327
445
|
"cell_type": "code",
|
|
328
|
-
"execution_count":
|
|
446
|
+
"execution_count": 19,
|
|
329
447
|
"metadata": {},
|
|
330
448
|
"outputs": [
|
|
331
449
|
{
|
|
@@ -381,7 +499,7 @@
|
|
|
381
499
|
"1 [GB91BKEN10000041610008] "
|
|
382
500
|
]
|
|
383
501
|
},
|
|
384
|
-
"execution_count":
|
|
502
|
+
"execution_count": 19,
|
|
385
503
|
"metadata": {},
|
|
386
504
|
"output_type": "execute_result"
|
|
387
505
|
}
|
|
@@ -399,7 +517,7 @@
|
|
|
399
517
|
},
|
|
400
518
|
{
|
|
401
519
|
"cell_type": "code",
|
|
402
|
-
"execution_count":
|
|
520
|
+
"execution_count": 20,
|
|
403
521
|
"metadata": {},
|
|
404
522
|
"outputs": [
|
|
405
523
|
{
|
|
@@ -505,7 +623,7 @@
|
|
|
505
623
|
"4 They did not expected a reply from otis.reddin... "
|
|
506
624
|
]
|
|
507
625
|
},
|
|
508
|
-
"execution_count":
|
|
626
|
+
"execution_count": 20,
|
|
509
627
|
"metadata": {},
|
|
510
628
|
"output_type": "execute_result"
|
|
511
629
|
}
|
|
@@ -551,21 +669,21 @@
|
|
|
551
669
|
},
|
|
552
670
|
{
|
|
553
671
|
"cell_type": "code",
|
|
554
|
-
"execution_count":
|
|
672
|
+
"execution_count": 21,
|
|
555
673
|
"metadata": {},
|
|
556
674
|
"outputs": [
|
|
557
675
|
{
|
|
558
676
|
"name": "stderr",
|
|
559
677
|
"output_type": "stream",
|
|
560
678
|
"text": [
|
|
561
|
-
" 0%| | 0/
|
|
679
|
+
" 0%| | 0/3 [00:00<?, ?it/s]INFO: Texts loaded.\n",
|
|
562
680
|
"INFO: Scrubbing using Presidio...\n",
|
|
563
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
681
|
+
"100%|██████████| 5/5 [00:00<00:00, 18.99it/s]\n",
|
|
564
682
|
"INFO: 4 presidio person scrubbed.\n",
|
|
565
683
|
"INFO: 4 presidio person scrubbed.\n",
|
|
566
684
|
"INFO: 4 presidio person scrubbed.\n",
|
|
567
685
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
568
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
686
|
+
"100%|██████████| 5/5 [00:00<00:00, 67.00it/s]\n",
|
|
569
687
|
"INFO: 0 spacy person scrubbed.\n",
|
|
570
688
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
571
689
|
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
@@ -581,34 +699,13 @@
|
|
|
581
699
|
"INFO: 0 uk postcodes scrubbed.\n",
|
|
582
700
|
"INFO: Scrubbing titles using regex...\n",
|
|
583
701
|
"INFO: 2 titles scrubbed.\n",
|
|
584
|
-
"
|
|
702
|
+
" 33%|███▎ | 1/3 [00:03<00:06, 3.24s/it]INFO: Texts loaded.\n",
|
|
585
703
|
"INFO: Scrubbing using Presidio...\n",
|
|
586
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
704
|
+
"100%|██████████| 5/5 [00:00<00:00, 21.83it/s]\n",
|
|
587
705
|
"INFO: 2 presidio person scrubbed.\n",
|
|
588
706
|
"INFO: 2 presidio person scrubbed.\n",
|
|
589
707
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
590
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
591
|
-
"INFO: 0 spacy person scrubbed.\n",
|
|
592
|
-
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
593
|
-
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
594
|
-
"INFO: Scrubbing email addresses using regex...\n",
|
|
595
|
-
"INFO: 0 email addresses scrubbed.\n",
|
|
596
|
-
"INFO: Scrubbing @user handles using regex...\n",
|
|
597
|
-
"INFO: 0 handles scrubbed.\n",
|
|
598
|
-
"INFO: Scrubbing IP addresses using regex...\n",
|
|
599
|
-
"INFO: 0 ip addresses scrubbed.\n",
|
|
600
|
-
"INFO: Scrubbing phone numbers using regex...\n",
|
|
601
|
-
"INFO: 0 uk phone numbers scrubbed.\n",
|
|
602
|
-
"INFO: Scrubbing UK postcodes using regex...\n",
|
|
603
|
-
"INFO: 0 uk postcodes scrubbed.\n",
|
|
604
|
-
"INFO: Scrubbing titles using regex...\n",
|
|
605
|
-
"INFO: 0 titles scrubbed.\n",
|
|
606
|
-
" 60%|██████ | 3/5 [00:04<00:03, 1.66s/it]INFO: Texts loaded.\n",
|
|
607
|
-
"INFO: Scrubbing using Presidio...\n",
|
|
608
|
-
"100%|██████████| 5/5 [00:00<00:00, 26.73it/s]\n",
|
|
609
|
-
"INFO: 1 presidio person scrubbed.\n",
|
|
610
|
-
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
611
|
-
"100%|██████████| 5/5 [00:00<00:00, 89.71it/s]\n",
|
|
708
|
+
"100%|██████████| 5/5 [00:00<00:00, 84.69it/s]\n",
|
|
612
709
|
"INFO: 0 spacy person scrubbed.\n",
|
|
613
710
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
614
711
|
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
@@ -624,15 +721,15 @@
|
|
|
624
721
|
"INFO: 0 uk postcodes scrubbed.\n",
|
|
625
722
|
"INFO: Scrubbing titles using regex...\n",
|
|
626
723
|
"INFO: 0 titles scrubbed.\n",
|
|
627
|
-
"
|
|
724
|
+
" 67%|██████▋ | 2/3 [00:06<00:03, 3.24s/it]INFO: Texts loaded.\n",
|
|
628
725
|
"INFO: Scrubbing using Presidio...\n",
|
|
629
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
630
|
-
"INFO:
|
|
726
|
+
"100%|██████████| 5/5 [00:00<00:00, 29.32it/s]\n",
|
|
727
|
+
"INFO: 5 presidio url scrubbed.\n",
|
|
631
728
|
"INFO: 2 presidio person scrubbed.\n",
|
|
632
729
|
"INFO: 3 presidio email address scrubbed.\n",
|
|
633
730
|
"INFO: 3 presidio email address scrubbed.\n",
|
|
634
731
|
"INFO: Scrubbing names using SpaCy model `en_core_web_trf`...\n",
|
|
635
|
-
"100%|██████████| 5/5 [00:00<00:00,
|
|
732
|
+
"100%|██████████| 5/5 [00:00<00:00, 66.37it/s]\n",
|
|
636
733
|
"INFO: 0 spacy person scrubbed.\n",
|
|
637
734
|
"INFO: Scrubbing GB phone numbers using Google's `phonenumbers`...\n",
|
|
638
735
|
"INFO: 0 gb phone numbers scrubbed.\n",
|
|
@@ -648,7 +745,7 @@
|
|
|
648
745
|
"INFO: 4 uk postcodes scrubbed.\n",
|
|
649
746
|
"INFO: Scrubbing titles using regex...\n",
|
|
650
747
|
"INFO: 0 titles scrubbed.\n",
|
|
651
|
-
"100%|██████████|
|
|
748
|
+
"100%|██████████| 3/3 [00:08<00:00, 2.94s/it]\n"
|
|
652
749
|
]
|
|
653
750
|
},
|
|
654
751
|
{
|
|
@@ -685,7 +782,7 @@
|
|
|
685
782
|
" <td>A</td>\n",
|
|
686
783
|
" <td>[TITLE]. [PERSON] walked off; and [PERSON] rem...</td>\n",
|
|
687
784
|
" <td>To [PERSON] she is always the woman.</td>\n",
|
|
688
|
-
" <td>My dear
|
|
785
|
+
" <td>My dear Victor, do not waste your time upon th...</td>\n",
|
|
689
786
|
" <td>The letter to [EMAIL_ADDRESS] was stamped with...</td>\n",
|
|
690
787
|
" </tr>\n",
|
|
691
788
|
" <tr>\n",
|
|
@@ -740,7 +837,7 @@
|
|
|
740
837
|
"4 When you have eliminated the impossible, whate... \n",
|
|
741
838
|
"\n",
|
|
742
839
|
" Frankenstein \\\n",
|
|
743
|
-
"0 My dear
|
|
840
|
+
"0 My dear Victor, do not waste your time upon th... \n",
|
|
744
841
|
"1 Learn from me, if not by my precepts, at least... \n",
|
|
745
842
|
"2 I had worked hard for nearly two years, for th... \n",
|
|
746
843
|
"3 Nothing is more painful to the human mind than... \n",
|
|
@@ -754,7 +851,7 @@
|
|
|
754
851
|
"4 They did not expected a reply from [EMAIL_ADDR... "
|
|
755
852
|
]
|
|
756
853
|
},
|
|
757
|
-
"execution_count":
|
|
854
|
+
"execution_count": 21,
|
|
758
855
|
"metadata": {},
|
|
759
856
|
"output_type": "execute_result"
|
|
760
857
|
}
|
|
@@ -762,14 +859,14 @@
|
|
|
762
859
|
"source": [
|
|
763
860
|
"from idscrub import IDScrub\n",
|
|
764
861
|
"\n",
|
|
765
|
-
"scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", scrub_methods=[\"all\"])\n",
|
|
862
|
+
"scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col=\"ID\", exclude_cols=[\"Frankenstein\"], scrub_methods=[\"all\"])\n",
|
|
766
863
|
"\n",
|
|
767
864
|
"scrubbed_df"
|
|
768
865
|
]
|
|
769
866
|
},
|
|
770
867
|
{
|
|
771
868
|
"cell_type": "code",
|
|
772
|
-
"execution_count":
|
|
869
|
+
"execution_count": 22,
|
|
773
870
|
"metadata": {},
|
|
774
871
|
"outputs": [
|
|
775
872
|
{
|
|
@@ -862,17 +959,6 @@
|
|
|
862
959
|
" <tr>\n",
|
|
863
960
|
" <th>5</th>\n",
|
|
864
961
|
" <td>A</td>\n",
|
|
865
|
-
" <td>Frankenstein</td>\n",
|
|
866
|
-
" <td>[Victor]</td>\n",
|
|
867
|
-
" <td>None</td>\n",
|
|
868
|
-
" <td>None</td>\n",
|
|
869
|
-
" <td>None</td>\n",
|
|
870
|
-
" <td>None</td>\n",
|
|
871
|
-
" <td>None</td>\n",
|
|
872
|
-
" </tr>\n",
|
|
873
|
-
" <tr>\n",
|
|
874
|
-
" <th>6</th>\n",
|
|
875
|
-
" <td>A</td>\n",
|
|
876
962
|
" <td>Fake book</td>\n",
|
|
877
963
|
" <td>None</td>\n",
|
|
878
964
|
" <td>None</td>\n",
|
|
@@ -882,7 +968,7 @@
|
|
|
882
968
|
" <td>[SW1A 2AA]</td>\n",
|
|
883
969
|
" </tr>\n",
|
|
884
970
|
" <tr>\n",
|
|
885
|
-
" <th>
|
|
971
|
+
" <th>6</th>\n",
|
|
886
972
|
" <td>B</td>\n",
|
|
887
973
|
" <td>Fake book</td>\n",
|
|
888
974
|
" <td>[Mick Jagger, David Bowie]</td>\n",
|
|
@@ -893,7 +979,7 @@
|
|
|
893
979
|
" <td>[SW1A 2WH]</td>\n",
|
|
894
980
|
" </tr>\n",
|
|
895
981
|
" <tr>\n",
|
|
896
|
-
" <th>
|
|
982
|
+
" <th>7</th>\n",
|
|
897
983
|
" <td>C</td>\n",
|
|
898
984
|
" <td>Fake book</td>\n",
|
|
899
985
|
" <td>None</td>\n",
|
|
@@ -904,7 +990,7 @@
|
|
|
904
990
|
" <td>[SW19 5AE]</td>\n",
|
|
905
991
|
" </tr>\n",
|
|
906
992
|
" <tr>\n",
|
|
907
|
-
" <th>
|
|
993
|
+
" <th>8</th>\n",
|
|
908
994
|
" <td>E</td>\n",
|
|
909
995
|
" <td>Fake book</td>\n",
|
|
910
996
|
" <td>None</td>\n",
|
|
@@ -925,11 +1011,10 @@
|
|
|
925
1011
|
"2 C Pride and Prejudice [Elizabeth] \n",
|
|
926
1012
|
"3 A The Adventures of Sherlock Holmes [Sherlock Holmes] \n",
|
|
927
1013
|
"4 D The Adventures of Sherlock Holmes [Watson] \n",
|
|
928
|
-
"5 A
|
|
929
|
-
"6
|
|
930
|
-
"7
|
|
931
|
-
"8
|
|
932
|
-
"9 E Fake book None \n",
|
|
1014
|
+
"5 A Fake book None \n",
|
|
1015
|
+
"6 B Fake book [Mick Jagger, David Bowie] \n",
|
|
1016
|
+
"7 C Fake book None \n",
|
|
1017
|
+
"8 E Fake book None \n",
|
|
933
1018
|
"\n",
|
|
934
1019
|
" scrubbed_titles scrubbed_presidio_email_address \\\n",
|
|
935
1020
|
"0 [Mr] None \n",
|
|
@@ -937,11 +1022,10 @@
|
|
|
937
1022
|
"2 None None \n",
|
|
938
1023
|
"3 None None \n",
|
|
939
1024
|
"4 None None \n",
|
|
940
|
-
"5 None
|
|
941
|
-
"6 None
|
|
942
|
-
"7 None
|
|
943
|
-
"8 None
|
|
944
|
-
"9 None [otis.redding@dockofthebay.org] \n",
|
|
1025
|
+
"5 None [freddie.mercury@queen.com] \n",
|
|
1026
|
+
"6 None None \n",
|
|
1027
|
+
"7 None [serena.williams@tennis.com] \n",
|
|
1028
|
+
"8 None [otis.redding@dockofthebay.org] \n",
|
|
945
1029
|
"\n",
|
|
946
1030
|
" scrubbed_presidio_iban_code scrubbed_presidio_url \\\n",
|
|
947
1031
|
"0 None None \n",
|
|
@@ -949,11 +1033,10 @@
|
|
|
949
1033
|
"2 None None \n",
|
|
950
1034
|
"3 None None \n",
|
|
951
1035
|
"4 None None \n",
|
|
952
|
-
"5
|
|
953
|
-
"6
|
|
954
|
-
"7 None
|
|
955
|
-
"8 None
|
|
956
|
-
"9 None [otis.red, dockofthebay.org] \n",
|
|
1036
|
+
"5 [GB91BKEN10000041610008] [freddie.me, queen.com] \n",
|
|
1037
|
+
"6 None None \n",
|
|
1038
|
+
"7 None [tennis.com] \n",
|
|
1039
|
+
"8 None [otis.red, dockofthebay.org] \n",
|
|
957
1040
|
"\n",
|
|
958
1041
|
" scrubbed_uk_postcodes \n",
|
|
959
1042
|
"0 None \n",
|
|
@@ -961,14 +1044,13 @@
|
|
|
961
1044
|
"2 None \n",
|
|
962
1045
|
"3 None \n",
|
|
963
1046
|
"4 None \n",
|
|
964
|
-
"5
|
|
965
|
-
"6 [SW1A
|
|
966
|
-
"7 [
|
|
967
|
-
"8
|
|
968
|
-
"9 [EH8 8DX] "
|
|
1047
|
+
"5 [SW1A 2AA] \n",
|
|
1048
|
+
"6 [SW1A 2WH] \n",
|
|
1049
|
+
"7 [SW19 5AE] \n",
|
|
1050
|
+
"8 [EH8 8DX] "
|
|
969
1051
|
]
|
|
970
1052
|
},
|
|
971
|
-
"execution_count":
|
|
1053
|
+
"execution_count": 22,
|
|
972
1054
|
"metadata": {},
|
|
973
1055
|
"output_type": "execute_result"
|
|
974
1056
|
}
|
test/test_scrub.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from idscrub import IDScrub
|
|
3
|
+
from pandas.testing import assert_frame_equal
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Note: These tests will fail if the kernel has not been restarted since the SpaCy model was downloaded.
|
|
7
|
+
def test_scrub(scrub_object):
|
|
8
|
+
scrubbed = scrub_object.scrub(scrub_methods=["spacy_persons", "uk_phone_numbers", "uk_postcodes"])
|
|
9
|
+
assert scrubbed == [
|
|
10
|
+
"Our names are [PERSON], [PERSON], and [PERSON].",
|
|
11
|
+
"My number is [PHONENO] and I live at [POSTCODE].",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_scrub_text_id():
|
|
16
|
+
scrub = IDScrub(["Our names are Hamish McDonald, L. Salah, and Elena Suárez."] * 10)
|
|
17
|
+
|
|
18
|
+
scrub.scrub(scrub_methods=["spacy_persons"])
|
|
19
|
+
|
|
20
|
+
df = scrub.get_scrubbed_data()
|
|
21
|
+
|
|
22
|
+
assert df["text_id"].max() == 10
|
|
23
|
+
assert len(df["text_id"]) == 10
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_scrub_get_scrubbed_data(scrub_object):
|
|
27
|
+
scrub_object.scrub(scrub_methods=["uk_postcodes"])
|
|
28
|
+
df = scrub_object.get_scrubbed_data()
|
|
29
|
+
|
|
30
|
+
expected_df = pd.DataFrame(
|
|
31
|
+
{
|
|
32
|
+
"text_id": {0: 2},
|
|
33
|
+
"scrubbed_uk_postcodes": {0: ["AA11 1AA"]},
|
|
34
|
+
}
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
assert_frame_equal(df, expected_df)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_scrub_order(scrub_object):
|
|
41
|
+
scrub_object.scrub(scrub_methods=["uk_postcodes", "uk_phone_numbers", "spacy_persons"])
|
|
42
|
+
|
|
43
|
+
assert scrub_object.get_scrubbed_data().columns.to_list() == [
|
|
44
|
+
"text_id",
|
|
45
|
+
"scrubbed_uk_postcodes",
|
|
46
|
+
"scrubbed_uk_phone_numbers",
|
|
47
|
+
"scrubbed_spacy_person",
|
|
48
|
+
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|